From 7c7e3d31e7856a8260a254f8c71db416f7f9f5a1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 5 Nov 2021 16:23:29 -0700 Subject: bpf: Introduce helper bpf_find_vma In some profiler use cases, it is necessary to map an address to the backing file, e.g., a shared library. bpf_find_vma helper provides a flexible way to achieve this. bpf_find_vma maps an address of a task to the vma (vm_area_struct) for this address, and feed the vma to an callback BPF function. The callback function is necessary here, as we need to ensure mmap_sem is unlocked. It is necessary to lock mmap_sem for find_vma. To lock and unlock mmap_sem safely when irqs are disable, we use the same mechanism as stackmap with build_id. Specifically, when irqs are disabled, the unlocked is postponed in an irq_work. Refactor stackmap.c so that the irq_work is shared among bpf_find_vma and stackmap helpers. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Tested-by: Hengqi Chen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211105232330.1936330-2-songliubraving@fb.com --- kernel/bpf/btf.c | 5 ++- kernel/bpf/mmap_unlock_work.h | 65 +++++++++++++++++++++++++++++++++++ kernel/bpf/stackmap.c | 80 ++++--------------------------------------- kernel/bpf/task_iter.c | 76 ++++++++++++++++++++++++++++++++++++---- kernel/bpf/verifier.c | 34 ++++++++++++++++++ kernel/trace/bpf_trace.c | 2 ++ 6 files changed, 181 insertions(+), 81 deletions(-) create mode 100644 kernel/bpf/mmap_unlock_work.h (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dbc3ad07e21b..cdb0fba65600 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6342,7 +6342,10 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { .arg4_type = ARG_ANYTHING, }; -BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) +BTF_ID_LIST_GLOBAL(btf_task_struct_ids) +BTF_ID(struct, task_struct) +BTF_ID(struct, file) +BTF_ID(struct, vm_area_struct) /* BTF ID set registration API for modules */ diff --git a/kernel/bpf/mmap_unlock_work.h b/kernel/bpf/mmap_unlock_work.h new file mode 100644 index 000000000000..5d18d7d85bef --- /dev/null +++ b/kernel/bpf/mmap_unlock_work.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2021 Facebook + */ + +#ifndef __MMAP_UNLOCK_WORK_H__ +#define __MMAP_UNLOCK_WORK_H__ +#include + +/* irq_work to run mmap_read_unlock() in irq_work */ +struct mmap_unlock_irq_work { + struct irq_work irq_work; + struct mm_struct *mm; +}; + +DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); + +/* + * We cannot do mmap_read_unlock() when the irq is disabled, because of + * risk to deadlock with rq_lock. To look up vma when the irqs are + * disabled, we need to run mmap_read_unlock() in irq_work. We use a + * percpu variable to do the irq_work. If the irq_work is already used + * by another lookup, we fall over. + */ +static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **work_ptr) +{ + struct mmap_unlock_irq_work *work = NULL; + bool irq_work_busy = false; + + if (irqs_disabled()) { + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + work = this_cpu_ptr(&mmap_unlock_work); + if (irq_work_is_busy(&work->irq_work)) { + /* cannot queue more up_read, fallback */ + irq_work_busy = true; + } + } else { + /* + * PREEMPT_RT does not allow to trylock mmap sem in + * interrupt disabled context. Force the fallback code. + */ + irq_work_busy = true; + } + } + + *work_ptr = work; + return irq_work_busy; +} + +static inline void bpf_mmap_unlock_mm(struct mmap_unlock_irq_work *work, struct mm_struct *mm) +{ + if (!work) { + mmap_read_unlock(mm); + } else { + work->mm = mm; + + /* The lock will be released once we're out of interrupt + * context. Tell lockdep that we've released it now so + * it doesn't complain that we forgot to release it. + */ + rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_); + irq_work_queue(&work->irq_work); + } +} + +#endif /* __MMAP_UNLOCK_WORK_H__ */ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 6e75bbee39f0..1de0a1b03636 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -7,10 +7,10 @@ #include #include #include -#include #include #include #include "percpu_freelist.h" +#include "mmap_unlock_work.h" #define STACK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ @@ -31,25 +31,6 @@ struct bpf_stack_map { struct stack_map_bucket *buckets[]; }; -/* irq_work to run up_read() for build_id lookup in nmi context */ -struct stack_map_irq_work { - struct irq_work irq_work; - struct mm_struct *mm; -}; - -static void do_up_read(struct irq_work *entry) -{ - struct stack_map_irq_work *work; - - if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) - return; - - work = container_of(entry, struct stack_map_irq_work, irq_work); - mmap_read_unlock_non_owner(work->mm); -} - -static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); - static inline bool stack_map_use_build_id(struct bpf_map *map) { return (map->map_flags & BPF_F_STACK_BUILD_ID); @@ -149,35 +130,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u64 *ips, u32 trace_nr, bool user) { int i; + struct mmap_unlock_irq_work *work = NULL; + bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); struct vm_area_struct *vma; - bool irq_work_busy = false; - struct stack_map_irq_work *work = NULL; - - if (irqs_disabled()) { - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { - work = this_cpu_ptr(&up_read_work); - if (irq_work_is_busy(&work->irq_work)) { - /* cannot queue more up_read, fallback */ - irq_work_busy = true; - } - } else { - /* - * PREEMPT_RT does not allow to trylock mmap sem in - * interrupt disabled context. Force the fallback code. - */ - irq_work_busy = true; - } - } - /* - * We cannot do up_read() when the irq is disabled, because of - * risk to deadlock with rq_lock. To do build_id lookup when the - * irqs are disabled, we need to run up_read() in irq_work. We use - * a percpu variable to do the irq_work. If the irq_work is - * already used by another lookup, we fall back to report ips. - * - * Same fallback is used for kernel stack (!user) on a stackmap - * with build_id. + /* If the irq_work is in use, fall back to report ips. Same + * fallback is used for kernel stack (!user) on a stackmap with + * build_id. */ if (!user || !current || !current->mm || irq_work_busy || !mmap_read_trylock(current->mm)) { @@ -203,19 +162,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; } - - if (!work) { - mmap_read_unlock(current->mm); - } else { - work->mm = current->mm; - - /* The lock will be released once we're out of interrupt - * context. Tell lockdep that we've released it now so - * it doesn't complain that we forgot to release it. - */ - rwsem_release(¤t->mm->mmap_lock.dep_map, _RET_IP_); - irq_work_queue(&work->irq_work); - } + bpf_mmap_unlock_mm(work, current->mm); } static struct perf_callchain_entry * @@ -719,16 +666,3 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_btf_name = "bpf_stack_map", .map_btf_id = &stack_trace_map_btf_id, }; - -static int __init stack_map_init(void) -{ - int cpu; - struct stack_map_irq_work *work; - - for_each_possible_cpu(cpu) { - work = per_cpu_ptr(&up_read_work, cpu); - init_irq_work(&work->irq_work, do_up_read); - } - return 0; -} -subsys_initcall(stack_map_init); diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index b48750bfba5a..f171479f7dd6 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -8,6 +8,7 @@ #include #include #include +#include "mmap_unlock_work.h" struct bpf_iter_seq_task_common { struct pid_namespace *ns; @@ -524,10 +525,6 @@ static const struct seq_operations task_vma_seq_ops = { .show = task_vma_seq_show, }; -BTF_ID_LIST(btf_task_file_ids) -BTF_ID(struct, file) -BTF_ID(struct, vm_area_struct) - static const struct bpf_iter_seq_info task_seq_info = { .seq_ops = &task_seq_ops, .init_seq_private = init_seq_pidns, @@ -586,9 +583,74 @@ static struct bpf_iter_reg task_vma_reg_info = { .seq_info = &task_vma_seq_info, }; +BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start, + bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags) +{ + struct mmap_unlock_irq_work *work = NULL; + struct vm_area_struct *vma; + bool irq_work_busy = false; + struct mm_struct *mm; + int ret = -ENOENT; + + if (flags) + return -EINVAL; + + if (!task) + return -ENOENT; + + mm = task->mm; + if (!mm) + return -ENOENT; + + irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); + + if (irq_work_busy || !mmap_read_trylock(mm)) + return -EBUSY; + + vma = find_vma(mm, start); + + if (vma && vma->vm_start <= start && vma->vm_end > start) { + callback_fn((u64)(long)task, (u64)(long)vma, + (u64)(long)callback_ctx, 0, 0); + ret = 0; + } + bpf_mmap_unlock_mm(work, mm); + return ret; +} + +const struct bpf_func_proto bpf_find_vma_proto = { + .func = bpf_find_vma, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_task_struct_ids[0], + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_FUNC, + .arg4_type = ARG_PTR_TO_STACK_OR_NULL, + .arg5_type = ARG_ANYTHING, +}; + +DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); + +static void do_mmap_read_unlock(struct irq_work *entry) +{ + struct mmap_unlock_irq_work *work; + + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) + return; + + work = container_of(entry, struct mmap_unlock_irq_work, irq_work); + mmap_read_unlock_non_owner(work->mm); +} + static int __init task_iter_init(void) { - int ret; + struct mmap_unlock_irq_work *work; + int ret, cpu; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&mmap_unlock_work, cpu); + init_irq_work(&work->irq_work, do_mmap_read_unlock); + } task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; ret = bpf_iter_reg_target(&task_reg_info); @@ -596,13 +658,13 @@ static int __init task_iter_init(void) return ret; task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; - task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0]; + task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_struct_ids[1]; ret = bpf_iter_reg_target(&task_file_reg_info); if (ret) return ret; task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; - task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; + task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_struct_ids[2]; return bpf_iter_reg_target(&task_vma_reg_info); } late_initcall(task_iter_init); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f0dca726ebfd..1aafb43f61d1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6132,6 +6132,33 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_find_vma_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* bpf_find_vma(struct task_struct *task, u64 addr, + * void *callback_fn, void *callback_ctx, u64 flags) + * (callback_fn)(struct task_struct *task, + * struct vm_area_struct *vma, void *callback_ctx); + */ + callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1]; + + callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].btf = btf_vmlinux; + callee->regs[BPF_REG_2].btf_id = btf_task_struct_ids[2]; + + /* pointer to stack or null */ + callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -6489,6 +6516,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } + if (func_id == BPF_FUNC_find_vma) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_find_vma_callback_state); + if (err < 0) + return -EINVAL; + } + if (func_id == BPF_FUNC_snprintf) { err = check_bpf_snprintf_call(env, regs); if (err < 0) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7396488793ff..390176a3031a 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1208,6 +1208,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_func_ip_proto_tracing; case BPF_FUNC_get_branch_snapshot: return &bpf_get_branch_snapshot_proto; + case BPF_FUNC_find_vma: + return &bpf_find_vma_proto; case BPF_FUNC_trace_vprintk: return bpf_get_trace_vprintk_proto(); default: -- cgit v1.2.3 From 5d5e4522a7f404d1a96fd6c703989d32a9c9568d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 7 Nov 2021 14:51:16 +1000 Subject: printk: restore flushing of NMI buffers on remote CPUs after NMI backtraces printk from NMI context relies on irq work being raised on the local CPU to print to console. This can be a problem if the NMI was raised by a lockup detector to print lockup stack and regs, because the CPU may not enable irqs (because it is locked up). Introduce printk_trigger_flush() that can be called another CPU to try to get those messages to the console, call that where printk_safe_flush was previously called. Fixes: 93d102f094be ("printk: remove safe buffers") Cc: stable@vger.kernel.org # 5.15 Signed-off-by: Nicholas Piggin Reviewed-by: Petr Mladek Reviewed-by: John Ogness Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211107045116.1754411-1-npiggin@gmail.com --- arch/powerpc/kernel/watchdog.c | 6 ++++++ include/linux/printk.h | 4 ++++ kernel/printk/printk.c | 5 +++++ lib/nmi_backtrace.c | 6 ++++++ 4 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index dc17d8903d4f..6b7a83d5e03e 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -186,6 +186,12 @@ static void watchdog_smp_panic(int cpu, u64 tb) if (sysctl_hardlockup_all_cpu_backtrace) trigger_allbutself_cpu_backtrace(); + /* + * Force flush any remote buffers that might be stuck in IRQ context + * and therefore could not run their irq_work. + */ + printk_trigger_flush(); + if (hardlockup_panic) nmi_panic(NULL, "Hard LOCKUP"); diff --git a/include/linux/printk.h b/include/linux/printk.h index a1379df43251..596ad6fa0336 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -206,6 +206,7 @@ void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold; extern asmlinkage void dump_stack(void) __cold; +void printk_trigger_flush(void); #else static inline __printf(1, 0) int vprintk(const char *s, va_list args) @@ -282,6 +283,9 @@ static inline void dump_stack_lvl(const char *log_lvl) static inline void dump_stack(void) { } +static inline void printk_trigger_flush(void) +{ +} #endif #ifdef CONFIG_SMP diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 65fffa6368c9..eabe23b0a982 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3261,6 +3261,11 @@ void defer_console_output(void) preempt_enable(); } +void printk_trigger_flush(void) +{ + defer_console_output(); +} + int vprintk_deferred(const char *fmt, va_list args) { int r; diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index f9e89001b52e..199ab201d501 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -75,6 +75,12 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, touch_softlockup_watchdog(); } + /* + * Force flush any remote buffers that might be stuck in IRQ context + * and therefore could not run their irq_work. + */ + printk_trigger_flush(); + clear_bit_unlock(0, &backtrace_flag); put_cpu(); } -- cgit v1.2.3 From 8c42d2fa4eeab6c37a0b1b1aa7a2715248ef4f34 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 11 Nov 2021 17:26:09 -0800 Subject: bpf: Support BTF_KIND_TYPE_TAG for btf_type_tag attributes LLVM patches ([1] for clang, [2] and [3] for BPF backend) added support for btf_type_tag attributes. This patch added support for the kernel. The main motivation for btf_type_tag is to bring kernel annotations __user, __rcu etc. to btf. With such information available in btf, bpf verifier can detect mis-usages and reject the program. For example, for __user tagged pointer, developers can then use proper helper like bpf_probe_read_user() etc. to read the data. BTF_KIND_TYPE_TAG may also useful for other tracing facility where instead of to require user to specify kernel/user address type, the kernel can detect it by itself with btf. [1] https://reviews.llvm.org/D111199 [2] https://reviews.llvm.org/D113222 [3] https://reviews.llvm.org/D113496 Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211112012609.1505032-1-yhs@fb.com --- include/uapi/linux/btf.h | 3 ++- kernel/bpf/btf.c | 14 +++++++++++++- tools/include/uapi/linux/btf.h | 3 ++- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index deb12f755f0f..b0d8fea1951d 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -43,7 +43,7 @@ struct btf_type { * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, - * FUNC, FUNC_PROTO, VAR and DECL_TAG. + * FUNC, FUNC_PROTO, VAR, DECL_TAG and TYPE_TAG. * "type" is a type_id referring to another type. */ union { @@ -75,6 +75,7 @@ enum { BTF_KIND_DATASEC = 15, /* Section */ BTF_KIND_FLOAT = 16, /* Floating point */ BTF_KIND_DECL_TAG = 17, /* Decl Tag */ + BTF_KIND_TYPE_TAG = 18, /* Type Tag */ NR_BTF_KINDS, BTF_KIND_MAX = NR_BTF_KINDS - 1, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index cdb0fba65600..1dd9ba82da1e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -282,6 +282,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_DATASEC] = "DATASEC", [BTF_KIND_FLOAT] = "FLOAT", [BTF_KIND_DECL_TAG] = "DECL_TAG", + [BTF_KIND_TYPE_TAG] = "TYPE_TAG", }; const char *btf_type_str(const struct btf_type *t) @@ -418,6 +419,7 @@ static bool btf_type_is_modifier(const struct btf_type *t) case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: return true; } @@ -1737,6 +1739,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: id = type->type; type = btf_type_by_id(btf, type->type); break; @@ -2345,6 +2348,8 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { + const char *value; + if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; @@ -2360,7 +2365,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } - /* typedef type must have a valid name, and other ref types, + /* typedef/type_tag type must have a valid name, and other ref types, * volatile, const, restrict, should have a null name. */ if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) { @@ -2369,6 +2374,12 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } + } else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) { + value = btf_name_by_offset(env->btf, t->name_off); + if (!value || !value[0]) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } } else { if (t->name_off) { btf_verifier_log_type(env, t, "Invalid name"); @@ -4059,6 +4070,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_DATASEC] = &datasec_ops, [BTF_KIND_FLOAT] = &float_ops, [BTF_KIND_DECL_TAG] = &decl_tag_ops, + [BTF_KIND_TYPE_TAG] = &modifier_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index deb12f755f0f..b0d8fea1951d 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -43,7 +43,7 @@ struct btf_type { * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, - * FUNC, FUNC_PROTO, VAR and DECL_TAG. + * FUNC, FUNC_PROTO, VAR, DECL_TAG and TYPE_TAG. * "type" is a type_id referring to another type. */ union { @@ -75,6 +75,7 @@ enum { BTF_KIND_DATASEC = 15, /* Section */ BTF_KIND_FLOAT = 16, /* Floating point */ BTF_KIND_DECL_TAG = 17, /* Decl Tag */ + BTF_KIND_TYPE_TAG = 18, /* Type Tag */ NR_BTF_KINDS, BTF_KIND_MAX = NR_BTF_KINDS - 1, -- cgit v1.2.3 From 34d11a440c6167133201b7374065b59f259730d7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 10 Nov 2021 09:25:56 -0800 Subject: bpf: Fix inner map state pruning regression. Introduction of map_uid made two lookups from outer map to be distinct. That distinction is only necessary when inner map has an embedded timer. Otherwise it will make the verifier state pruning to be conservative which will cause complex programs to hit 1M insn_processed limit. Tighten map_uid logic to apply to inner maps with timers only. Fixes: 3e8ce29850f1 ("bpf: Prevent pointer mismatch in bpf_timer_init.") Reported-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/CACAyw99hVEJFoiBH_ZGyy=+oO-jyydoz6v1DeKPKs2HVsUH28w@mail.gmail.com Link: https://lore.kernel.org/bpf/20211110172556.20754-1-alexei.starovoitov@gmail.com --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 890b3ec375a3..aab7482ed1c3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1151,7 +1151,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. */ - reg->map_uid = reg->id; + if (map_value_has_timer(map->inner_map_meta)) + reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || -- cgit v1.2.3 From 9e2ad638ae3632ef916ceb39f70e3104bf8fdc97 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 12 Nov 2021 07:02:42 -0800 Subject: bpf: Extend BTF_ID_LIST_GLOBAL with parameter for number of IDs syzbot reported the following BUG w/o CONFIG_DEBUG_INFO_BTF BUG: KASAN: global-out-of-bounds in task_iter_init+0x212/0x2e7 kernel/bpf/task_iter.c:661 Read of size 4 at addr ffffffff90297404 by task swapper/0/1 CPU: 1 PID: 1 Comm: swapper/0 Not tainted 5.15.0-syzkaller #0 Hardware name: ... Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0xf/0x309 mm/kasan/report.c:256 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 task_iter_init+0x212/0x2e7 kernel/bpf/task_iter.c:661 do_one_initcall+0x103/0x650 init/main.c:1295 do_initcall_level init/main.c:1368 [inline] do_initcalls init/main.c:1384 [inline] do_basic_setup init/main.c:1403 [inline] kernel_init_freeable+0x6b1/0x73a init/main.c:1606 kernel_init+0x1a/0x1d0 init/main.c:1497 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 This is caused by hard-coded name[1] in BTF_ID_LIST_GLOBAL (w/o CONFIG_DEBUG_INFO_BTF). Fix this by adding a parameter n to BTF_ID_LIST_GLOBAL. This avoids ifdef CONFIG_DEBUG_INFO_BTF in btf.c and filter.c. Fixes: 7c7e3d31e785 ("bpf: Introduce helper bpf_find_vma") Reported-by: syzbot+e0d81ec552a21d9071aa@syzkaller.appspotmail.com Reported-by: Eric Dumazet Suggested-by: Eric Dumazet Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211112150243.1270987-2-songliubraving@fb.com --- include/linux/btf_ids.h | 6 +++--- kernel/bpf/btf.c | 2 +- net/core/filter.c | 6 +----- 3 files changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 47d9abfbdb55..6bb42b785293 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -73,7 +73,7 @@ asm( \ __BTF_ID_LIST(name, local) \ extern u32 name[]; -#define BTF_ID_LIST_GLOBAL(name) \ +#define BTF_ID_LIST_GLOBAL(name, n) \ __BTF_ID_LIST(name, globl) /* The BTF_ID_LIST_SINGLE macro defines a BTF_ID_LIST with @@ -83,7 +83,7 @@ __BTF_ID_LIST(name, globl) BTF_ID_LIST(name) \ BTF_ID(prefix, typename) #define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) \ - BTF_ID_LIST_GLOBAL(name) \ + BTF_ID_LIST_GLOBAL(name, 1) \ BTF_ID(prefix, typename) /* @@ -149,7 +149,7 @@ extern struct btf_id_set name; #define BTF_ID_LIST(name) static u32 name[5]; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED -#define BTF_ID_LIST_GLOBAL(name) u32 name[1]; +#define BTF_ID_LIST_GLOBAL(name, n) u32 name[n]; #define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; #define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 name[1]; #define BTF_SET_START(name) static struct btf_id_set name = { 0 }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 1dd9ba82da1e..2a9d8a1fee1d 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6354,7 +6354,7 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { .arg4_type = ARG_ANYTHING, }; -BTF_ID_LIST_GLOBAL(btf_task_struct_ids) +BTF_ID_LIST_GLOBAL(btf_task_struct_ids, 3) BTF_ID(struct, task_struct) BTF_ID(struct, file) BTF_ID(struct, vm_area_struct) diff --git a/net/core/filter.c b/net/core/filter.c index 315a58466fc9..46f09a8fba20 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10611,14 +10611,10 @@ void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog) bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog); } -#ifdef CONFIG_DEBUG_INFO_BTF -BTF_ID_LIST_GLOBAL(btf_sock_ids) +BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE) #define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type) BTF_SOCK_TYPE_xxx #undef BTF_SOCK_TYPE -#else -u32 btf_sock_ids[MAX_BTF_SOCK_TYPE]; -#endif BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk) { -- cgit v1.2.3 From d19ddb476a539fd78ad1028ae13bb38506286931 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 12 Nov 2021 07:02:43 -0800 Subject: bpf: Introduce btf_tracing_ids Similar to btf_sock_ids, btf_tracing_ids provides btf ID for task_struct, file, and vm_area_struct via easy to understand format like btf_tracing_ids[BTF_TRACING_TYPE_[TASK|file|VMA]]. Suggested-by: Alexei Starovoitov Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211112150243.1270987-3-songliubraving@fb.com --- include/linux/btf_ids.h | 14 +++++++++++++- kernel/bpf/bpf_task_storage.c | 4 ++-- kernel/bpf/btf.c | 8 ++++---- kernel/bpf/stackmap.c | 2 +- kernel/bpf/task_iter.c | 12 ++++++------ kernel/bpf/verifier.c | 2 +- kernel/trace/bpf_trace.c | 4 ++-- 7 files changed, 29 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 6bb42b785293..919c0fde1c51 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -189,6 +189,18 @@ MAX_BTF_SOCK_TYPE, extern u32 btf_sock_ids[]; #endif -extern u32 btf_task_struct_ids[]; +#define BTF_TRACING_TYPE_xxx \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_TASK, task_struct) \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_FILE, file) \ + BTF_TRACING_TYPE(BTF_TRACING_TYPE_VMA, vm_area_struct) + +enum { +#define BTF_TRACING_TYPE(name, type) name, +BTF_TRACING_TYPE_xxx +#undef BTF_TRACING_TYPE +MAX_BTF_TRACING_TYPE, +}; + +extern u32 btf_tracing_ids[]; #endif diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index ebfa8bc90892..bb69aea1a777 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -323,7 +323,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = { .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &btf_task_struct_ids[0], + .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; @@ -334,5 +334,5 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &btf_task_struct_ids[0], + .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2a9d8a1fee1d..6b9d23be1e99 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6354,10 +6354,10 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { .arg4_type = ARG_ANYTHING, }; -BTF_ID_LIST_GLOBAL(btf_task_struct_ids, 3) -BTF_ID(struct, task_struct) -BTF_ID(struct, file) -BTF_ID(struct, vm_area_struct) +BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE) +#define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type) +BTF_TRACING_TYPE_xxx +#undef BTF_TRACING_TYPE /* BTF ID set registration API for modules */ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 1de0a1b03636..49e567209c6b 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -489,7 +489,7 @@ const struct bpf_func_proto bpf_get_task_stack_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, - .arg1_btf_id = &btf_task_struct_ids[0], + .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index f171479f7dd6..d94696198ef8 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -622,7 +622,7 @@ const struct bpf_func_proto bpf_find_vma_proto = { .func = bpf_find_vma, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, - .arg1_btf_id = &btf_task_struct_ids[0], + .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_FUNC, .arg4_type = ARG_PTR_TO_STACK_OR_NULL, @@ -652,19 +652,19 @@ static int __init task_iter_init(void) init_irq_work(&work->irq_work, do_mmap_read_unlock); } - task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; + task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK]; ret = bpf_iter_reg_target(&task_reg_info); if (ret) return ret; - task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; - task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_struct_ids[1]; + task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK]; + task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE]; ret = bpf_iter_reg_target(&task_file_reg_info); if (ret) return ret; - task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; - task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_struct_ids[2]; + task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK]; + task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA]; return bpf_iter_reg_target(&task_vma_reg_info); } late_initcall(task_iter_init); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1aafb43f61d1..d31a031ab377 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6147,7 +6147,7 @@ static int set_find_vma_callback_state(struct bpf_verifier_env *env, callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID; __mark_reg_known_zero(&callee->regs[BPF_REG_2]); callee->regs[BPF_REG_2].btf = btf_vmlinux; - callee->regs[BPF_REG_2].btf_id = btf_task_struct_ids[2]; + callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA], /* pointer to stack or null */ callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4]; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 390176a3031a..25ea521fb8f1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -764,7 +764,7 @@ const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, .ret_type = RET_PTR_TO_BTF_ID, - .ret_btf_id = &btf_task_struct_ids[0], + .ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; BPF_CALL_1(bpf_task_pt_regs, struct task_struct *, task) @@ -779,7 +779,7 @@ const struct bpf_func_proto bpf_task_pt_regs_proto = { .func = bpf_task_pt_regs, .gpl_only = true, .arg1_type = ARG_PTR_TO_BTF_ID, - .arg1_btf_id = &btf_task_struct_ids[0], + .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .ret_type = RET_PTR_TO_BTF_ID, .ret_btf_id = &bpf_task_pt_regs_ids[0], }; -- cgit v1.2.3 From 938aa33f14657c9ed9deea348b7d6f14b6d69cb7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sun, 14 Nov 2021 13:28:34 -0500 Subject: tracing: Add length protection to histogram string copies The string copies to the histogram storage has a max size of 256 bytes (defined by MAX_FILTER_STR_VAL). Only the string size of the event field needs to be copied to the event storage, but no more than what is in the event storage. Although nothing should be bigger than 256 bytes, there's no protection against overwriting of the storage if one day there is. Copy no more than the destination size, and enforce it. Also had to turn MAX_FILTER_STR_VAL into an unsigned int, to keep the min() comparison of the string sizes of comparable types. Link: https://lore.kernel.org/all/CAHk-=wjREUihCGrtRBwfX47y_KrLCGjiq3t6QtoNJpmVrAEb1w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20211114132834.183429a4@rorschach.local.home Cc: Ingo Molnar Cc: Andrew Morton Cc: Tom Zanussi Reported-by: Linus Torvalds Reviewed-by: Masami Hiramatsu Fixes: 63f84ae6b82b ("tracing/histogram: Do not copy the fixed-size char array field over the field size") Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 2 +- kernel/trace/trace_events_hist.c | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 50453b287615..2d167ac3452c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -673,7 +673,7 @@ struct trace_event_file { #define PERF_MAX_TRACE_SIZE 8192 -#define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ +#define MAX_FILTER_STR_VAL 256U /* Should handle KSYM_SYMBOL_LEN */ enum event_trigger_type { ETT_NONE = (0), diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1475d7347fe0..34afcaebd0e5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3026,8 +3026,10 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, if (val->flags & HIST_FIELD_FL_STRING) { char *str = elt_data->field_var_str[j++]; char *val_str = (char *)(uintptr_t)var_val; + unsigned int size; - strscpy(str, val_str, val->size); + size = min(val->size, STR_VAR_LEN_MAX); + strscpy(str, val_str, size); var_val = (u64)(uintptr_t)str; } tracing_map_set_var(elt, var_idx, var_val); @@ -4914,6 +4916,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, if (hist_field->flags & HIST_FIELD_FL_STRING) { unsigned int str_start, var_str_idx, idx; char *str, *val_str; + unsigned int size; str_start = hist_data->n_field_var_str + hist_data->n_save_var_str; @@ -4922,7 +4925,9 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, str = elt_data->field_var_str[idx]; val_str = (char *)(uintptr_t)hist_val; - strscpy(str, val_str, hist_field->size); + + size = min(hist_field->size, STR_VAR_LEN_MAX); + strscpy(str, val_str, size); hist_val = (u64)(uintptr_t)str; } -- cgit v1.2.3 From eda09706b240ca9129ac4e1fbb4eb1e2bc67aadc Mon Sep 17 00:00:00 2001 From: Michal Koutný Date: Wed, 3 Nov 2021 17:58:45 +0100 Subject: cgroup: rstat: Mark benign data race to silence KCSAN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a race between updaters and flushers (flush can possibly miss the latest update(s)). This is expected as explained in cgroup_rstat_updated() comment, add also machine readable annotation so that KCSAN results aren't noisy. Reported-by: Hao Sun Link: https://lore.kernel.org/r/CACkBjsbPVdkub=e-E-p1WBOLxS515ith-53SFdmFHWV_QMo40w@mail.gmail.com Suggested-by: Hao Sun Signed-off-by: Michal Koutný Reviewed-by: Shakeel Butt Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 1486768f2318..1abe74114527 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -35,7 +35,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) * instead of NULL, we can tell whether @cgrp is on the list by * testing the next pointer for NULL. */ - if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) + if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) return; raw_spin_lock_irqsave(cpu_lock, flags); -- cgit v1.2.3 From 5e0bc3082e2e403ac0753e099c2b01446bb35578 Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Sat, 13 Nov 2021 18:22:26 +0400 Subject: bpf: Forbid bpf_ktime_get_coarse_ns and bpf_timer_* in tracing progs Use of bpf_ktime_get_coarse_ns() and bpf_timer_* helpers in tracing progs may result in locking issues. bpf_ktime_get_coarse_ns() uses ktime_get_coarse_ns() time accessor that isn't safe for any context: ====================================================== WARNING: possible circular locking dependency detected 5.15.0-syzkaller #0 Not tainted ------------------------------------------------------ syz-executor.4/14877 is trying to acquire lock: ffffffff8cb30008 (tk_core.seq.seqcount){----}-{0:0}, at: ktime_get_coarse_ts64+0x25/0x110 kernel/time/timekeeping.c:2255 but task is already holding lock: ffffffff90dbf200 (&obj_hash[i].lock){-.-.}-{2:2}, at: debug_object_deactivate+0x61/0x400 lib/debugobjects.c:735 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&obj_hash[i].lock){-.-.}-{2:2}: lock_acquire+0x19f/0x4d0 kernel/locking/lockdep.c:5625 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0xd1/0x120 kernel/locking/spinlock.c:162 __debug_object_init+0xd9/0x1860 lib/debugobjects.c:569 debug_hrtimer_init kernel/time/hrtimer.c:414 [inline] debug_init kernel/time/hrtimer.c:468 [inline] hrtimer_init+0x20/0x40 kernel/time/hrtimer.c:1592 ntp_init_cmos_sync kernel/time/ntp.c:676 [inline] ntp_init+0xa1/0xad kernel/time/ntp.c:1095 timekeeping_init+0x512/0x6bf kernel/time/timekeeping.c:1639 start_kernel+0x267/0x56e init/main.c:1030 secondary_startup_64_no_verify+0xb1/0xbb -> #0 (tk_core.seq.seqcount){----}-{0:0}: check_prev_add kernel/locking/lockdep.c:3051 [inline] check_prevs_add kernel/locking/lockdep.c:3174 [inline] validate_chain+0x1dfb/0x8240 kernel/locking/lockdep.c:3789 __lock_acquire+0x1382/0x2b00 kernel/locking/lockdep.c:5015 lock_acquire+0x19f/0x4d0 kernel/locking/lockdep.c:5625 seqcount_lockdep_reader_access+0xfe/0x230 include/linux/seqlock.h:103 ktime_get_coarse_ts64+0x25/0x110 kernel/time/timekeeping.c:2255 ktime_get_coarse include/linux/timekeeping.h:120 [inline] ktime_get_coarse_ns include/linux/timekeeping.h:126 [inline] ____bpf_ktime_get_coarse_ns kernel/bpf/helpers.c:173 [inline] bpf_ktime_get_coarse_ns+0x7e/0x130 kernel/bpf/helpers.c:171 bpf_prog_a99735ebafdda2f1+0x10/0xb50 bpf_dispatcher_nop_func include/linux/bpf.h:721 [inline] __bpf_prog_run include/linux/filter.h:626 [inline] bpf_prog_run include/linux/filter.h:633 [inline] BPF_PROG_RUN_ARRAY include/linux/bpf.h:1294 [inline] trace_call_bpf+0x2cf/0x5d0 kernel/trace/bpf_trace.c:127 perf_trace_run_bpf_submit+0x7b/0x1d0 kernel/events/core.c:9708 perf_trace_lock+0x37c/0x440 include/trace/events/lock.h:39 trace_lock_release+0x128/0x150 include/trace/events/lock.h:58 lock_release+0x82/0x810 kernel/locking/lockdep.c:5636 __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:149 [inline] _raw_spin_unlock_irqrestore+0x75/0x130 kernel/locking/spinlock.c:194 debug_hrtimer_deactivate kernel/time/hrtimer.c:425 [inline] debug_deactivate kernel/time/hrtimer.c:481 [inline] __run_hrtimer kernel/time/hrtimer.c:1653 [inline] __hrtimer_run_queues+0x2f9/0xa60 kernel/time/hrtimer.c:1749 hrtimer_interrupt+0x3b3/0x1040 kernel/time/hrtimer.c:1811 local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1086 [inline] __sysvec_apic_timer_interrupt+0xf9/0x270 arch/x86/kernel/apic/apic.c:1103 sysvec_apic_timer_interrupt+0x8c/0xb0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 [inline] _raw_spin_unlock_irqrestore+0xd4/0x130 kernel/locking/spinlock.c:194 try_to_wake_up+0x702/0xd20 kernel/sched/core.c:4118 wake_up_process kernel/sched/core.c:4200 [inline] wake_up_q+0x9a/0xf0 kernel/sched/core.c:953 futex_wake+0x50f/0x5b0 kernel/futex/waitwake.c:184 do_futex+0x367/0x560 kernel/futex/syscalls.c:127 __do_sys_futex kernel/futex/syscalls.c:199 [inline] __se_sys_futex+0x401/0x4b0 kernel/futex/syscalls.c:180 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae There is a possible deadlock with bpf_timer_* set of helpers: hrtimer_start() lock_base(); trace_hrtimer...() perf_event() bpf_run() bpf_timer_start() hrtimer_start() lock_base() <- DEADLOCK Forbid use of bpf_ktime_get_coarse_ns() and bpf_timer_* helpers in BPF_PROG_TYPE_KPROBE, BPF_PROG_TYPE_TRACEPOINT, BPF_PROG_TYPE_PERF_EVENT and BPF_PROG_TYPE_RAW_TRACEPOINT prog types. Fixes: d05512618056 ("bpf: Add bpf_ktime_get_coarse_ns helper") Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") Reported-by: syzbot+43fd005b5a1b4d10781e@syzkaller.appspotmail.com Signed-off-by: Dmitrii Banshchikov Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211113142227.566439-2-me@ubique.spb.ru --- kernel/bpf/cgroup.c | 2 ++ kernel/bpf/helpers.c | 2 -- kernel/bpf/verifier.c | 7 +++++++ kernel/trace/bpf_trace.c | 2 -- net/core/filter.c | 6 ++++++ net/ipv4/bpf_tcp_ca.c | 2 ++ 6 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 2ca643af9a54..43eb3501721b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1809,6 +1809,8 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sysctl_get_new_value_proto; case BPF_FUNC_sysctl_set_new_value: return &bpf_sysctl_set_new_value_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return cgroup_base_func_proto(func_id, prog); } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1ffd469c217f..649f07623df6 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1364,8 +1364,6 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; - case BPF_FUNC_ktime_get_coarse_ns: - return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index aab7482ed1c3..65d2f93b7030 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11632,6 +11632,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } + if (map_value_has_timer(map)) { + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_timer yet\n"); + return -EINVAL; + } + } + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7396488793ff..ae9755037b7e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1111,8 +1111,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; - case BPF_FUNC_ktime_get_coarse_ns: - return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: diff --git a/net/core/filter.c b/net/core/filter.c index e471c9b09670..6102f093d59a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7162,6 +7162,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_cg_sock_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id); } @@ -10327,6 +10329,8 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, return &sk_reuseport_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_ptr_cookie_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id); } @@ -10833,6 +10837,8 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skc_to_unix_sock: func = &bpf_skc_to_unix_sock_proto; break; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id); } diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 2cf02b4d77fb..4bb9401b0a3f 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -205,6 +205,8 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, offsetof(struct tcp_congestion_ops, release)) return &bpf_sk_getsockopt_proto; return NULL; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From 353050be4c19e102178ccc05988101887c25ae53 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Nov 2021 18:48:08 +0000 Subject: bpf: Fix toctou on read-only map's constant scalar tracking Commit a23740ec43ba ("bpf: Track contents of read-only maps as scalars") is checking whether maps are read-only both from BPF program side and user space side, and then, given their content is constant, reading out their data via map->ops->map_direct_value_addr() which is then subsequently used as known scalar value for the register, that is, it is marked as __mark_reg_known() with the read value at verification time. Before a23740ec43ba, the register content was marked as an unknown scalar so the verifier could not make any assumptions about the map content. The current implementation however is prone to a TOCTOU race, meaning, the value read as known scalar for the register is not guaranteed to be exactly the same at a later point when the program is executed, and as such, the prior made assumptions of the verifier with regards to the program will be invalid which can cause issues such as OOB access, etc. While the BPF_F_RDONLY_PROG map flag is always fixed and required to be specified at map creation time, the map->frozen property is initially set to false for the map given the map value needs to be populated, e.g. for global data sections. Once complete, the loader "freezes" the map from user space such that no subsequent updates/deletes are possible anymore. For the rest of the lifetime of the map, this freeze one-time trigger cannot be undone anymore after a successful BPF_MAP_FREEZE cmd return. Meaning, any new BPF_* cmd calls which would update/delete map entries will be rejected with -EPERM since map_get_sys_perms() removes the FMODE_CAN_WRITE permission. This also means that pending update/delete map entries must still complete before this guarantee is given. This corner case is not an issue for loaders since they create and prepare such program private map in successive steps. However, a malicious user is able to trigger this TOCTOU race in two different ways: i) via userfaultfd, and ii) via batched updates. For i) userfaultfd is used to expand the competition interval, so that map_update_elem() can modify the contents of the map after map_freeze() and bpf_prog_load() were executed. This works, because userfaultfd halts the parallel thread which triggered a map_update_elem() at the time where we copy key/value from the user buffer and this already passed the FMODE_CAN_WRITE capability test given at that time the map was not "frozen". Then, the main thread performs the map_freeze() and bpf_prog_load(), and once that had completed successfully, the other thread is woken up to complete the pending map_update_elem() which then changes the map content. For ii) the idea of the batched update is similar, meaning, when there are a large number of updates to be processed, it can increase the competition interval between the two. It is therefore possible in practice to modify the contents of the map after executing map_freeze() and bpf_prog_load(). One way to fix both i) and ii) at the same time is to expand the use of the map's map->writecnt. The latter was introduced in fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") and further refined in 1f6cb19be2e2 ("bpf: Prevent re-mmap()'ing BPF map as writable for initially r/o mapping") with the rationale to make a writable mmap()'ing of a map mutually exclusive with read-only freezing. The counter indicates writable mmap() mappings and then prevents/fails the freeze operation. Its semantics can be expanded beyond just mmap() by generally indicating ongoing write phases. This would essentially span any parallel regular and batched flavor of update/delete operation and then also have map_freeze() fail with -EBUSY. For the check_mem_access() in the verifier we expand upon the bpf_map_is_rdonly() check ensuring that all last pending writes have completed via bpf_map_write_active() test. Once the map->frozen is set and bpf_map_write_active() indicates a map->writecnt of 0 only then we are really guaranteed to use the map's data as known constants. For map->frozen being set and pending writes in process of still being completed we fall back to marking that register as unknown scalar so we don't end up making assumptions about it. With this, both TOCTOU reproducers from i) and ii) are fixed. Note that the map->writecnt has been converted into a atomic64 in the fix in order to avoid a double freeze_mutex mutex_{un,}lock() pair when updating map->writecnt in the various map update/delete BPF_* cmd flavors. Spanning the freeze_mutex over entire map update/delete operations in syscall side would not be possible due to then causing everything to be serialized. Similarly, something like synchronize_rcu() after setting map->frozen to wait for update/deletes to complete is not possible either since it would also have to span the user copy which can sleep. On the libbpf side, this won't break d66562fba1ce ("libbpf: Add BPF object skeleton support") as the anonymous mmap()-ed "map initialization image" is remapped as a BPF map-backed mmap()-ed memory where for .rodata it's non-writable. Fixes: a23740ec43ba ("bpf: Track contents of read-only maps as scalars") Reported-by: w1tcher.bupt@gmail.com Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- kernel/bpf/syscall.c | 57 ++++++++++++++++++++++++++++++++------------------- kernel/bpf/verifier.c | 17 ++++++++++++++- 3 files changed, 54 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f715e8863f4d..e7a163a3146b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -193,7 +193,7 @@ struct bpf_map { atomic64_t usercnt; struct work_struct work; struct mutex freeze_mutex; - u64 writecnt; /* writable mmap cnt; protected by freeze_mutex */ + atomic64_t writecnt; }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) @@ -1419,6 +1419,7 @@ void bpf_map_put(struct bpf_map *map); void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); +bool bpf_map_write_active(const struct bpf_map *map); void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 50f96ea4452a..1033ee8c0caf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -132,6 +132,21 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } +static void bpf_map_write_active_inc(struct bpf_map *map) +{ + atomic64_inc(&map->writecnt); +} + +static void bpf_map_write_active_dec(struct bpf_map *map) +{ + atomic64_dec(&map->writecnt); +} + +bool bpf_map_write_active(const struct bpf_map *map) +{ + return atomic64_read(&map->writecnt) != 0; +} + static u32 bpf_map_value_size(const struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || @@ -601,11 +616,8 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - if (vma->vm_flags & VM_MAYWRITE) { - mutex_lock(&map->freeze_mutex); - map->writecnt++; - mutex_unlock(&map->freeze_mutex); - } + if (vma->vm_flags & VM_MAYWRITE) + bpf_map_write_active_inc(map); } /* called for all unmapped memory region (including initial) */ @@ -613,11 +625,8 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - if (vma->vm_flags & VM_MAYWRITE) { - mutex_lock(&map->freeze_mutex); - map->writecnt--; - mutex_unlock(&map->freeze_mutex); - } + if (vma->vm_flags & VM_MAYWRITE) + bpf_map_write_active_dec(map); } static const struct vm_operations_struct bpf_map_default_vmops = { @@ -668,7 +677,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) goto out; if (vma->vm_flags & VM_MAYWRITE) - map->writecnt++; + bpf_map_write_active_inc(map); out: mutex_unlock(&map->freeze_mutex); return err; @@ -1139,6 +1148,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; @@ -1174,6 +1184,7 @@ free_value: free_key: kvfree(key); err_put: + bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1196,6 +1207,7 @@ static int map_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; @@ -1226,6 +1238,7 @@ static int map_delete_elem(union bpf_attr *attr) out: kvfree(key); err_put: + bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1533,6 +1546,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; @@ -1597,6 +1611,7 @@ free_value: free_key: kvfree(key); err_put: + bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1624,8 +1639,7 @@ static int map_freeze(const union bpf_attr *attr) } mutex_lock(&map->freeze_mutex); - - if (map->writecnt) { + if (bpf_map_write_active(map)) { err = -EBUSY; goto err_put; } @@ -4171,6 +4185,9 @@ static int bpf_map_do_batch(const union bpf_attr *attr, union bpf_attr __user *uattr, int cmd) { + bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || + cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; + bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; struct bpf_map *map; int err, ufd; struct fd f; @@ -4183,16 +4200,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr, map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if ((cmd == BPF_MAP_LOOKUP_BATCH || - cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) && - !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { + if (has_write) + bpf_map_write_active_inc(map); + if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } - - if (cmd != BPF_MAP_LOOKUP_BATCH && - !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -4205,8 +4219,9 @@ static int bpf_map_do_batch(const union bpf_attr *attr, BPF_DO_BATCH(map->ops->map_update_batch); else BPF_DO_BATCH(map->ops->map_delete_batch); - err_put: + if (has_write) + bpf_map_write_active_dec(map); fdput(f); return err; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 65d2f93b7030..50efda51515b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4056,7 +4056,22 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) static bool bpf_map_is_rdonly(const struct bpf_map *map) { - return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen; + /* A map is considered read-only if the following condition are true: + * + * 1) BPF program side cannot change any of the map content. The + * BPF_F_RDONLY_PROG flag is throughout the lifetime of a map + * and was set at map creation time. + * 2) The map value(s) have been initialized from user space by a + * loader and then "frozen", such that no new map update/delete + * operations from syscall side are possible for the rest of + * the map's lifetime from that point onwards. + * 3) Any parallel/pending map update/delete operations from syscall + * side have been completed. Only after that point, it's safe to + * assume that map value(s) are immutable. + */ + return (map->map_flags & BPF_F_RDONLY_PROG) && + READ_ONCE(map->frozen) && + !bpf_map_write_active(map); } static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) -- cgit v1.2.3 From ebf7f6f0a6cdcc17a3da52b81e4b3a98c4005028 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 5 Nov 2021 09:30:00 +0800 Subject: bpf: Change value of MAX_TAIL_CALL_CNT from 32 to 33 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the current code, the actual max tail call count is 33 which is greater than MAX_TAIL_CALL_CNT (defined as 32). The actual limit is not consistent with the meaning of MAX_TAIL_CALL_CNT and thus confusing at first glance. We can see the historical evolution from commit 04fd61ab36ec ("bpf: allow bpf programs to tail-call other bpf programs") and commit f9dabe016b63 ("bpf: Undo off-by-one in interpreter tail call count limit"). In order to avoid changing existing behavior, the actual limit is 33 now, this is reasonable. After commit 874be05f525e ("bpf, tests: Add tail call test suite"), we can see there exists failed testcase. On all archs when CONFIG_BPF_JIT_ALWAYS_ON is not set: # echo 0 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf # dmesg | grep -w FAIL Tail call error path, max count reached jited:0 ret 34 != 33 FAIL On some archs: # echo 1 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf # dmesg | grep -w FAIL Tail call error path, max count reached jited:1 ret 34 != 33 FAIL Although the above failed testcase has been fixed in commit 18935a72eb25 ("bpf/tests: Fix error in tail call limit tests"), it would still be good to change the value of MAX_TAIL_CALL_CNT from 32 to 33 to make the code more readable. The 32-bit x86 JIT was using a limit of 32, just fix the wrong comments and limit to 33 tail calls as the constant MAX_TAIL_CALL_CNT updated. For the mips64 JIT, use "ori" instead of "addiu" as suggested by Johan Almbladh. For the riscv JIT, use RV_REG_TCC directly to save one register move as suggested by Björn Töpel. For the other implementations, no function changes, it does not change the current limit 33, the new value of MAX_TAIL_CALL_CNT can reflect the actual max tail call count, the related tail call testcases in test_bpf module and selftests can work well for the interpreter and the JIT. Here are the test results on x86_64: # uname -m x86_64 # echo 0 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf test_suite=test_tail_calls # dmesg | tail -1 test_bpf: test_tail_calls: Summary: 8 PASSED, 0 FAILED, [0/8 JIT'ed] # rmmod test_bpf # echo 1 > /proc/sys/net/core/bpf_jit_enable # modprobe test_bpf test_suite=test_tail_calls # dmesg | tail -1 test_bpf: test_tail_calls: Summary: 8 PASSED, 0 FAILED, [8/8 JIT'ed] # rmmod test_bpf # ./test_progs -t tailcalls #142 tailcalls:OK Summary: 1/11 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Tiezhu Yang Signed-off-by: Daniel Borkmann Tested-by: Johan Almbladh Tested-by: Ilya Leoshkevich Acked-by: Björn Töpel Acked-by: Johan Almbladh Acked-by: Ilya Leoshkevich Link: https://lore.kernel.org/bpf/1636075800-3264-1-git-send-email-yangtiezhu@loongson.cn --- arch/arm/net/bpf_jit_32.c | 5 +++-- arch/arm64/net/bpf_jit_comp.c | 5 +++-- arch/mips/net/bpf_jit_comp32.c | 3 +-- arch/mips/net/bpf_jit_comp64.c | 2 +- arch/powerpc/net/bpf_jit_comp32.c | 4 ++-- arch/powerpc/net/bpf_jit_comp64.c | 4 ++-- arch/riscv/net/bpf_jit_comp32.c | 6 ++---- arch/riscv/net/bpf_jit_comp64.c | 7 +++---- arch/s390/net/bpf_jit_comp.c | 6 +++--- arch/sparc/net/bpf_jit_comp_64.c | 2 +- arch/x86/net/bpf_jit_comp.c | 10 +++++----- arch/x86/net/bpf_jit_comp32.c | 4 ++-- include/linux/bpf.h | 2 +- include/uapi/linux/bpf.h | 2 +- kernel/bpf/core.c | 3 ++- lib/test_bpf.c | 4 ++-- tools/include/uapi/linux/bpf.h | 2 +- 17 files changed, 35 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index eeb6dc0ecf46..e59b41e9ab0c 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1199,7 +1199,8 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) /* tmp2[0] = array, tmp2[1] = index */ - /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) + /* + * if (tail_call_cnt >= MAX_TAIL_CALL_CNT) * goto out; * tail_call_cnt++; */ @@ -1208,7 +1209,7 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) tc = arm_bpf_get_reg64(tcc, tmp, ctx); emit(ARM_CMP_I(tc[0], hi), ctx); _emit(ARM_COND_EQ, ARM_CMP_I(tc[1], lo), ctx); - _emit(ARM_COND_HI, ARM_B(jmp_offset), ctx); + _emit(ARM_COND_CS, ARM_B(jmp_offset), ctx); emit(ARM_ADDS_I(tc[1], tc[1], 1), ctx); emit(ARM_ADC_I(tc[0], tc[0], 0), ctx); arm_bpf_put_reg64(tcc, tmp, ctx); diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 86c9dc0681cc..07c12c42b751 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -287,13 +287,14 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) emit(A64_CMP(0, r3, tmp), ctx); emit(A64_B_(A64_COND_CS, jmp_offset), ctx); - /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) + /* + * if (tail_call_cnt >= MAX_TAIL_CALL_CNT) * goto out; * tail_call_cnt++; */ emit_a64_mov_i64(tmp, MAX_TAIL_CALL_CNT, ctx); emit(A64_CMP(1, tcc, tmp), ctx); - emit(A64_B_(A64_COND_HI, jmp_offset), ctx); + emit(A64_B_(A64_COND_CS, jmp_offset), ctx); emit(A64_ADD_I(1, tcc, tcc, 1), ctx); /* prog = array->ptrs[index]; diff --git a/arch/mips/net/bpf_jit_comp32.c b/arch/mips/net/bpf_jit_comp32.c index bd996ede12f8..044b11b65bca 100644 --- a/arch/mips/net/bpf_jit_comp32.c +++ b/arch/mips/net/bpf_jit_comp32.c @@ -1381,8 +1381,7 @@ void build_prologue(struct jit_context *ctx) * 16-byte area in the parent's stack frame. On a tail call, the * calling function jumps into the prologue after these instructions. */ - emit(ctx, ori, MIPS_R_T9, MIPS_R_ZERO, - min(MAX_TAIL_CALL_CNT + 1, 0xffff)); + emit(ctx, ori, MIPS_R_T9, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT, 0xffff)); emit(ctx, sw, MIPS_R_T9, 0, MIPS_R_SP); /* diff --git a/arch/mips/net/bpf_jit_comp64.c b/arch/mips/net/bpf_jit_comp64.c index 815ade724227..6475828ffb36 100644 --- a/arch/mips/net/bpf_jit_comp64.c +++ b/arch/mips/net/bpf_jit_comp64.c @@ -552,7 +552,7 @@ void build_prologue(struct jit_context *ctx) * On a tail call, the calling function jumps into the prologue * after this instruction. */ - emit(ctx, addiu, tc, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT + 1, 0xffff)); + emit(ctx, ori, tc, MIPS_R_ZERO, min(MAX_TAIL_CALL_CNT, 0xffff)); /* === Entry-point for tail calls === */ diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 0da31d41d413..8a4faa05f9e4 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -221,13 +221,13 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o PPC_BCC(COND_GE, out); /* - * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt >= MAX_TAIL_CALL_CNT) * goto out; */ EMIT(PPC_RAW_CMPLWI(_R0, MAX_TAIL_CALL_CNT)); /* tail_call_cnt++; */ EMIT(PPC_RAW_ADDIC(_R0, _R0, 1)); - PPC_BCC(COND_GT, out); + PPC_BCC(COND_GE, out); /* prog = array->ptrs[index]; */ EMIT(PPC_RAW_RLWINM(_R3, b2p_index, 2, 0, 29)); diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 8b5157ccfeba..8571aafcc9e1 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -228,12 +228,12 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o PPC_BCC(COND_GE, out); /* - * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt >= MAX_TAIL_CALL_CNT) * goto out; */ PPC_BPF_LL(b2p[TMP_REG_1], 1, bpf_jit_stack_tailcallcnt(ctx)); EMIT(PPC_RAW_CMPLWI(b2p[TMP_REG_1], MAX_TAIL_CALL_CNT)); - PPC_BCC(COND_GT, out); + PPC_BCC(COND_GE, out); /* * tail_call_cnt++; diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c index e6497424cbf6..529a83b85c1c 100644 --- a/arch/riscv/net/bpf_jit_comp32.c +++ b/arch/riscv/net/bpf_jit_comp32.c @@ -799,11 +799,10 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) emit_bcc(BPF_JGE, lo(idx_reg), RV_REG_T1, off, ctx); /* - * temp_tcc = tcc - 1; - * if (tcc < 0) + * if (--tcc < 0) * goto out; */ - emit(rv_addi(RV_REG_T1, RV_REG_TCC, -1), ctx); + emit(rv_addi(RV_REG_TCC, RV_REG_TCC, -1), ctx); off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn)); emit_bcc(BPF_JSLT, RV_REG_TCC, RV_REG_ZERO, off, ctx); @@ -829,7 +828,6 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) if (is_12b_check(off, insn)) return -1; emit(rv_lw(RV_REG_T0, off, RV_REG_T0), ctx); - emit(rv_addi(RV_REG_TCC, RV_REG_T1, 0), ctx); /* Epilogue jumps to *(t0 + 4). */ __build_epilogue(true, ctx); return 0; diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index f2a779c7e225..603630b6f3c5 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -327,12 +327,12 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn)); emit_branch(BPF_JGE, RV_REG_A2, RV_REG_T1, off, ctx); - /* if (TCC-- < 0) + /* if (--TCC < 0) * goto out; */ - emit_addi(RV_REG_T1, tcc, -1, ctx); + emit_addi(RV_REG_TCC, tcc, -1, ctx); off = ninsns_rvoff(tc_ninsn - (ctx->ninsns - start_insn)); - emit_branch(BPF_JSLT, tcc, RV_REG_ZERO, off, ctx); + emit_branch(BPF_JSLT, RV_REG_TCC, RV_REG_ZERO, off, ctx); /* prog = array->ptrs[index]; * if (!prog) @@ -352,7 +352,6 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) if (is_12b_check(off, insn)) return -1; emit_ld(RV_REG_T3, off, RV_REG_T2, ctx); - emit_mv(RV_REG_TCC, RV_REG_T1, ctx); __build_epilogue(true, ctx); return 0; } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 233cc9bcd652..9ff2bd83aad7 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1369,7 +1369,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, jit->prg); /* - * if (tail_call_cnt++ > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; */ @@ -1381,9 +1381,9 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, EMIT4_IMM(0xa7080000, REG_W0, 1); /* laal %w1,%w0,off(%r15) */ EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W1, REG_W0, REG_15, off); - /* clij %w1,MAX_TAIL_CALL_CNT,0x2,out */ + /* clij %w1,MAX_TAIL_CALL_CNT-1,0x2,out */ patch_2_clij = jit->prg; - EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT, + EMIT6_PCREL_RIEC(0xec000000, 0x007f, REG_W1, MAX_TAIL_CALL_CNT - 1, 2, jit->prg); /* diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 9a2f20cbd48b..0bfe1c72a0c9 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -867,7 +867,7 @@ static void emit_tail_call(struct jit_ctx *ctx) emit(LD32 | IMMED | RS1(SP) | S13(off) | RD(tmp), ctx); emit_cmpi(tmp, MAX_TAIL_CALL_CNT, ctx); #define OFFSET2 13 - emit_branch(BGU, ctx->idx, ctx->idx + OFFSET2, ctx); + emit_branch(BGEU, ctx->idx, ctx->idx + OFFSET2, ctx); emit_nop(ctx); emit_alu_K(ADD, tmp, 1, ctx); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 726700fabca6..631847907786 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -412,7 +412,7 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... * if (index >= array->map.max_entries) * goto out; - * if (++tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; * prog = array->ptrs[index]; * if (prog == NULL) @@ -446,14 +446,14 @@ static void emit_bpf_tail_call_indirect(u8 **pprog, bool *callee_regs_used, EMIT2(X86_JBE, offset); /* jbe out */ /* - * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; */ EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ offset = ctx->tail_call_indirect_label - (prog + 2 - start); - EMIT2(X86_JA, offset); /* ja out */ + EMIT2(X86_JAE, offset); /* jae out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ @@ -504,14 +504,14 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, int offset; /* - * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; */ EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ offset = ctx->tail_call_direct_label - (prog + 2 - start); - EMIT2(X86_JA, offset); /* ja out */ + EMIT2(X86_JAE, offset); /* jae out */ EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */ diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index da9b7cfa4632..429a89c5468b 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -1323,7 +1323,7 @@ static void emit_bpf_tail_call(u8 **pprog, u8 *ip) EMIT2(IA32_JBE, jmp_label(jmp_label1, 2)); /* - * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT) * goto out; */ lo = (u32)MAX_TAIL_CALL_CNT; @@ -1337,7 +1337,7 @@ static void emit_bpf_tail_call(u8 **pprog, u8 *ip) /* cmp ecx,lo */ EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo); - /* ja out */ + /* jae out */ EMIT2(IA32_JAE, jmp_label(jmp_label1, 2)); /* add eax,0x1 */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 56098c866704..cc7a0c36e7df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1081,7 +1081,7 @@ struct bpf_array { }; #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ -#define MAX_TAIL_CALL_CNT 32 +#define MAX_TAIL_CALL_CNT 33 #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ BPF_F_RDONLY_PROG | \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6297eafdc40f..a69e4b04ffeb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1744,7 +1744,7 @@ union bpf_attr { * if the maximum number of tail calls has been reached for this * chain of programs. This limit is defined in the kernel by the * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), - * which is currently set to 32. + * which is currently set to 33. * Return * 0 on success, or a negative error in case of failure. * diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2405e39d800f..b52dc845ecea 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1574,7 +1574,8 @@ select_insn: if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) + + if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index adae39567264..0c5cb2d6436a 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -14683,7 +14683,7 @@ static struct tail_call_test tail_call_tests[] = { BPF_EXIT_INSN(), }, .flags = FLAG_NEED_STATE | FLAG_RESULT_IN_STATE, - .result = (MAX_TAIL_CALL_CNT + 1 + 1) * MAX_TESTRUNS, + .result = (MAX_TAIL_CALL_CNT + 1) * MAX_TESTRUNS, }, { "Tail call count preserved across function calls", @@ -14705,7 +14705,7 @@ static struct tail_call_test tail_call_tests[] = { }, .stack_depth = 8, .flags = FLAG_NEED_STATE | FLAG_RESULT_IN_STATE, - .result = (MAX_TAIL_CALL_CNT + 1 + 1) * MAX_TESTRUNS, + .result = (MAX_TAIL_CALL_CNT + 1) * MAX_TESTRUNS, }, { "Tail call error path, NULL target", diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6297eafdc40f..a69e4b04ffeb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1744,7 +1744,7 @@ union bpf_attr { * if the maximum number of tail calls has been reached for this * chain of programs. This limit is defined in the kernel by the * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), - * which is currently set to 32. + * which is currently set to 33. * Return * 0 on success, or a negative error in case of failure. * -- cgit v1.2.3 From 2202e15b2b1a946ce760d96748cd7477589701ab Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Nov 2021 13:27:06 +0100 Subject: kernel/locking: Use a pointer in ww_mutex_trylock(). mutex_acquire_nest() expects a pointer, pass the pointer. Fixes: 12235da8c80a1 ("kernel/locking: Add context to ww_mutex_trylock()") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211104122706.frk52zxbjorso2kv@linutronix.de --- kernel/locking/ww_rt_mutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c index 0e00205cf467..d1473c624105 100644 --- a/kernel/locking/ww_rt_mutex.c +++ b/kernel/locking/ww_rt_mutex.c @@ -26,7 +26,7 @@ int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) if (__rt_mutex_trylock(&rtm->rtmutex)) { ww_mutex_set_context_fastpath(lock, ww_ctx); - mutex_acquire_nest(&rtm->dep_map, 0, 1, ww_ctx->dep_map, _RET_IP_); + mutex_acquire_nest(&rtm->dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_); return 1; } -- cgit v1.2.3 From 2d3791f116bb3d5b17571dadb8e085e12ae3a3cf Mon Sep 17 00:00:00 2001 From: Liu Xinpeng Date: Mon, 25 Oct 2021 11:46:25 +0800 Subject: psi: Remove repeated verbose comment Comment in function psi_task_switch,there are two same lines. ... * runtime state, the cgroup that contains both tasks * runtime state, the cgroup that contains both tasks ... Signed-off-by: Liu Xinpeng Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/1635133586-84611-1-git-send-email-liuxp11@chinatelecom.cn --- kernel/sched/psi.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1652f2bb54b7..526af84ab852 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -833,7 +833,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, /* * When switching between tasks that have an identical * runtime state, the cgroup that contains both tasks - * runtime state, the cgroup that contains both tasks * we reach the first common ancestor. Iterate @next's * ancestors only until we encounter @prev's ONCPU. */ -- cgit v1.2.3 From 2fb75e1b642f49253d8848c9e47e8942f5366221 Mon Sep 17 00:00:00 2001 From: Liu Xinpeng Date: Mon, 25 Oct 2021 11:46:26 +0800 Subject: psi: Add a missing SPDX license header Add the missing SPDX license header to include/linux/psi.h include/linux/psi_types.h kernel/sched/psi.c Signed-off-by: Liu Xinpeng Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/1635133586-84611-2-git-send-email-liuxp11@chinatelecom.cn --- include/linux/psi.h | 1 + include/linux/psi_types.h | 1 + kernel/sched/psi.c | 1 + 3 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/include/linux/psi.h b/include/linux/psi.h index 65eb1476ac70..a70ca833c6d7 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PSI_H #define _LINUX_PSI_H diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 0a23300d49af..bf50068d5d4b 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PSI_TYPES_H #define _LINUX_PSI_TYPES_H diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 526af84ab852..3397fa001157 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Pressure stall information for CPU, memory and IO * -- cgit v1.2.3 From 4feee7d12603deca8775f9f9ae5e121093837444 Mon Sep 17 00:00:00 2001 From: Josh Don Date: Mon, 18 Oct 2021 13:34:28 -0700 Subject: sched/core: Forced idle accounting Adds accounting for "forced idle" time, which is time where a cookie'd task forces its SMT sibling to idle, despite the presence of runnable tasks. Forced idle time is one means to measure the cost of enabling core scheduling (ie. the capacity lost due to the need to force idle). Forced idle time is attributed to the thread responsible for causing the forced idle. A few details: - Forced idle time is displayed via /proc/PID/sched. It also requires that schedstats is enabled. - Forced idle is only accounted when a sibling hyperthread is held idle despite the presence of runnable tasks. No time is charged if a sibling is idle but has no runnable tasks. - Tasks with 0 cookie are never charged forced idle. - For SMT > 2, we scale the amount of forced idle charged based on the number of forced idle siblings. Additionally, we split the time up and evenly charge it to all running tasks, as each is equally responsible for the forced idle. Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211018203428.2025792-1-joshdon@google.com --- include/linux/sched.h | 4 +++ kernel/sched/core.c | 82 +++++++++++++++++++++++++++++++++++------------ kernel/sched/core_sched.c | 66 +++++++++++++++++++++++++++++++++++++- kernel/sched/debug.c | 4 +++ kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 32 ++++++++++++++++-- 6 files changed, 166 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 78c351e35fec..d2e261adb8ea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -523,7 +523,11 @@ struct sched_statistics { u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; + +#ifdef CONFIG_SCHED_CORE + u64 core_forceidle_sum; #endif +#endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; struct sched_entity { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3c9b0fda64ac..beaa8be6241e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -144,7 +144,7 @@ static inline bool __sched_core_less(struct task_struct *a, struct task_struct * return false; /* flip prio, so high prio is leftmost */ - if (prio_less(b, a, task_rq(a)->core->core_forceidle)) + if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count)) return true; return false; @@ -181,15 +181,23 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p) rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less); } -void sched_core_dequeue(struct rq *rq, struct task_struct *p) +void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { rq->core->core_task_seq++; - if (!sched_core_enqueued(p)) - return; + if (sched_core_enqueued(p)) { + rb_erase(&p->core_node, &rq->core_tree); + RB_CLEAR_NODE(&p->core_node); + } - rb_erase(&p->core_node, &rq->core_tree); - RB_CLEAR_NODE(&p->core_node); + /* + * Migrating the last task off the cpu, with the cpu in forced idle + * state. Reschedule to create an accounting edge for forced idle, + * and re-examine whether the core is still in forced idle state. + */ + if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 && + rq->core->core_forceidle_count && rq->curr == rq->idle) + resched_curr(rq); } /* @@ -280,6 +288,8 @@ static void __sched_core_flip(bool enabled) for_each_cpu(t, smt_mask) cpu_rq(t)->core_enabled = enabled; + cpu_rq(cpu)->core->core_forceidle_start = 0; + sched_core_unlock(cpu, &flags); cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask); @@ -364,7 +374,8 @@ void sched_core_put(void) #else /* !CONFIG_SCHED_CORE */ static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } -static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { } +static inline void +sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } #endif /* CONFIG_SCHED_CORE */ @@ -2005,7 +2016,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { if (sched_core_enabled(rq)) - sched_core_dequeue(rq, p); + sched_core_dequeue(rq, p, flags); if (!(flags & DEQUEUE_NOCLOCK)) update_rq_clock(rq); @@ -5244,6 +5255,7 @@ void scheduler_tick(void) if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); + sched_core_tick(rq); rq_unlock(rq, &rf); @@ -5656,6 +5668,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) struct task_struct *next, *p, *max = NULL; const struct cpumask *smt_mask; bool fi_before = false; + bool core_clock_updated = (rq == rq->core); unsigned long cookie; int i, cpu, occ = 0; struct rq *rq_i; @@ -5708,10 +5721,18 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* reset state */ rq->core->core_cookie = 0UL; - if (rq->core->core_forceidle) { + if (rq->core->core_forceidle_count) { + if (!core_clock_updated) { + update_rq_clock(rq->core); + core_clock_updated = true; + } + sched_core_account_forceidle(rq); + /* reset after accounting force idle */ + rq->core->core_forceidle_start = 0; + rq->core->core_forceidle_count = 0; + rq->core->core_forceidle_occupation = 0; need_sync = true; fi_before = true; - rq->core->core_forceidle = false; } /* @@ -5753,7 +5774,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) for_each_cpu_wrap(i, smt_mask, cpu) { rq_i = cpu_rq(i); - if (i != cpu) + /* + * Current cpu always has its clock updated on entrance to + * pick_next_task(). If the current cpu is not the core, + * the core may also have been updated above. + */ + if (i != cpu && (rq_i != rq->core || !core_clock_updated)) update_rq_clock(rq_i); p = rq_i->core_pick = pick_task(rq_i); @@ -5783,7 +5809,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (p == rq_i->idle) { if (rq_i->nr_running) { - rq->core->core_forceidle = true; + rq->core->core_forceidle_count++; if (!fi_before) rq->core->core_forceidle_seq++; } @@ -5792,6 +5818,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } } + if (schedstat_enabled() && rq->core->core_forceidle_count) { + if (cookie) + rq->core->core_forceidle_start = rq_clock(rq->core); + rq->core->core_forceidle_occupation = occ; + } + rq->core->core_pick_seq = rq->core->core_task_seq; next = rq->core_pick; rq->core_sched_seq = rq->core->core_pick_seq; @@ -5828,8 +5860,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * 1 0 1 * 1 1 0 */ - if (!(fi_before && rq->core->core_forceidle)) - task_vruntime_update(rq_i, rq_i->core_pick, rq->core->core_forceidle); + if (!(fi_before && rq->core->core_forceidle_count)) + task_vruntime_update(rq_i, rq_i->core_pick, !!rq->core->core_forceidle_count); rq_i->core_pick->core_occupation = occ; @@ -6033,11 +6065,19 @@ static void sched_core_cpu_deactivate(unsigned int cpu) goto unlock; /* copy the shared state to the new leader */ - core_rq->core_task_seq = rq->core_task_seq; - core_rq->core_pick_seq = rq->core_pick_seq; - core_rq->core_cookie = rq->core_cookie; - core_rq->core_forceidle = rq->core_forceidle; - core_rq->core_forceidle_seq = rq->core_forceidle_seq; + core_rq->core_task_seq = rq->core_task_seq; + core_rq->core_pick_seq = rq->core_pick_seq; + core_rq->core_cookie = rq->core_cookie; + core_rq->core_forceidle_count = rq->core_forceidle_count; + core_rq->core_forceidle_seq = rq->core_forceidle_seq; + core_rq->core_forceidle_occupation = rq->core_forceidle_occupation; + + /* + * Accounting edge for forced idle is handled in pick_next_task(). + * Don't need another one here, since the hotplug thread shouldn't + * have a cookie. + */ + core_rq->core_forceidle_start = 0; /* install new leader */ for_each_cpu(t, smt_mask) { @@ -9413,7 +9453,9 @@ void __init sched_init(void) rq->core_pick = NULL; rq->core_enabled = 0; rq->core_tree = RB_ROOT; - rq->core_forceidle = false; + rq->core_forceidle_count = 0; + rq->core_forceidle_occupation = 0; + rq->core_forceidle_start = 0; rq->core_cookie = 0UL; #endif diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 517f72b008f5..1fb45672ec85 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -73,7 +73,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p, enqueued = sched_core_enqueued(p); if (enqueued) - sched_core_dequeue(rq, p); + sched_core_dequeue(rq, p, DEQUEUE_SAVE); old_cookie = p->core_cookie; p->core_cookie = cookie; @@ -85,6 +85,10 @@ static unsigned long sched_core_update_cookie(struct task_struct *p, * If task is currently running, it may not be compatible anymore after * the cookie change, so enter the scheduler on its CPU to schedule it * away. + * + * Note that it is possible that as a result of this cookie change, the + * core has now entered/left forced idle state. Defer accounting to the + * next scheduling edge, rather than always forcing a reschedule here. */ if (task_running(rq, p)) resched_curr(rq); @@ -232,3 +236,63 @@ out: return err; } +#ifdef CONFIG_SCHEDSTATS + +/* REQUIRES: rq->core's clock recently updated. */ +void __sched_core_account_forceidle(struct rq *rq) +{ + const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); + u64 delta, now = rq_clock(rq->core); + struct rq *rq_i; + struct task_struct *p; + int i; + + lockdep_assert_rq_held(rq); + + WARN_ON_ONCE(!rq->core->core_forceidle_count); + + if (rq->core->core_forceidle_start == 0) + return; + + delta = now - rq->core->core_forceidle_start; + if (unlikely((s64)delta <= 0)) + return; + + rq->core->core_forceidle_start = now; + + if (WARN_ON_ONCE(!rq->core->core_forceidle_occupation)) { + /* can't be forced idle without a running task */ + } else if (rq->core->core_forceidle_count > 1 || + rq->core->core_forceidle_occupation > 1) { + /* + * For larger SMT configurations, we need to scale the charged + * forced idle amount since there can be more than one forced + * idle sibling and more than one running cookied task. + */ + delta *= rq->core->core_forceidle_count; + delta = div_u64(delta, rq->core->core_forceidle_occupation); + } + + for_each_cpu(i, smt_mask) { + rq_i = cpu_rq(i); + p = rq_i->core_pick ?: rq_i->curr; + + if (!p->core_cookie) + continue; + + __schedstat_add(p->stats.core_forceidle_sum, delta); + } +} + +void __sched_core_tick(struct rq *rq) +{ + if (!rq->core->core_forceidle_count) + return; + + if (rq != rq->core) + update_rq_clock(rq->core); + + __sched_core_account_forceidle(rq); +} + +#endif /* CONFIG_SCHEDSTATS */ diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 7dcbaa31c5d9..aa29211de1bf 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1023,6 +1023,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PN(avg_atom); __PN(avg_per_cpu); + +#ifdef CONFIG_SCHED_CORE + PN_SCHEDSTAT(core_forceidle_sum); +#endif } __P(nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e476f6d9435..884f29d07963 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11068,7 +11068,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check * if we need to give up the CPU. */ - if (rq->core->core_forceidle && rq->cfs.nr_running == 1 && + if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 && __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE)) resched_curr(rq); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0e66749486e7..eb971151e7e4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1111,8 +1111,10 @@ struct rq { unsigned int core_task_seq; unsigned int core_pick_seq; unsigned long core_cookie; - unsigned char core_forceidle; + unsigned int core_forceidle_count; unsigned int core_forceidle_seq; + unsigned int core_forceidle_occupation; + u64 core_forceidle_start; #endif }; @@ -1253,7 +1255,7 @@ static inline bool sched_core_enqueued(struct task_struct *p) } extern void sched_core_enqueue(struct rq *rq, struct task_struct *p); -extern void sched_core_dequeue(struct rq *rq, struct task_struct *p); +extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); extern void sched_core_get(void); extern void sched_core_put(void); @@ -1854,6 +1856,32 @@ static inline void flush_smp_call_function_from_idle(void) { } #include "stats.h" #include "autogroup.h" +#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) + +extern void __sched_core_account_forceidle(struct rq *rq); + +static inline void sched_core_account_forceidle(struct rq *rq) +{ + if (schedstat_enabled()) + __sched_core_account_forceidle(rq); +} + +extern void __sched_core_tick(struct rq *rq); + +static inline void sched_core_tick(struct rq *rq) +{ + if (sched_core_enabled(rq) && schedstat_enabled()) + __sched_core_tick(rq); +} + +#else + +static inline void sched_core_account_forceidle(struct rq *rq) {} + +static inline void sched_core_tick(struct rq *rq) {} + +#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */ + #ifdef CONFIG_CGROUP_SCHED /* -- cgit v1.2.3 From cb0e52b7748737b2cf6481fdd9b920ce7e1ebbdf Mon Sep 17 00:00:00 2001 From: Brian Chen Date: Wed, 10 Nov 2021 21:33:12 +0000 Subject: psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim We've noticed cases where tasks in a cgroup are stalled on memory but there is little memory FULL pressure since tasks stay on the runqueue in reclaim. A simple example involves a single threaded program that keeps leaking and touching large amounts of memory. It runs in a cgroup with swap enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though there is significant CPU pressure and memory SOME, there is barely any memory FULL since the task enters reclaim and stays on the runqueue. However, this memory-bound task is effectively stalled on memory and we expect memory FULL to match memory SOME in this scenario. The code is confused about memstall && running, thinking there is a stalled task and a productive task when there's only one task: a reclaimer that's counted as both. To fix this, we redefine the condition for PSI_MEM_FULL to check that all running tasks are in an active memstall instead of checking that there are no running tasks. case PSI_MEM_FULL: - return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]); + return unlikely(tasks[NR_MEMSTALL] && + tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); This will capture reclaimers. It will also capture tasks that called psi_memstall_enter() and are about to sleep, but this should be negligible noise. Signed-off-by: Brian Chen Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20211110213312.310243-1-brianchen118@gmail.com --- include/linux/psi_types.h | 13 ++++++++++++- kernel/sched/psi.c | 45 ++++++++++++++++++++++++++++----------------- kernel/sched/stats.h | 5 ++++- 3 files changed, 44 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index bf50068d5d4b..516c0fe836fd 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -22,7 +22,17 @@ enum psi_task_count { * don't have to special case any state tracking for it. */ NR_ONCPU, - NR_PSI_TASK_COUNTS = 4, + /* + * For IO and CPU stalls the presence of running/oncpu tasks + * in the domain means a partial rather than a full stall. + * For memory it's not so simple because of page reclaimers: + * they are running/oncpu while representing a stall. To tell + * whether a domain has productivity left or not, we need to + * distinguish between regular running (i.e. productive) + * threads and memstall ones. + */ + NR_MEMSTALL_RUNNING, + NR_PSI_TASK_COUNTS = 5, }; /* Task state bitmasks */ @@ -30,6 +40,7 @@ enum psi_task_count { #define TSK_MEMSTALL (1 << NR_MEMSTALL) #define TSK_RUNNING (1 << NR_RUNNING) #define TSK_ONCPU (1 << NR_ONCPU) +#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING) /* Resources that workloads could be stalled on */ enum psi_res { diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 3397fa001157..a679613a7cb7 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -35,13 +35,19 @@ * delayed on that resource such that nobody is advancing and the CPU * goes idle. This leaves both workload and CPU unproductive. * - * Naturally, the FULL state doesn't exist for the CPU resource at the - * system level, but exist at the cgroup level, means all non-idle tasks - * in a cgroup are delayed on the CPU resource which used by others outside - * of the cgroup or throttled by the cgroup cpu.max configuration. - * * SOME = nr_delayed_tasks != 0 - * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0 + * + * What it means for a task to be productive is defined differently + * for each resource. For IO, productive means a running task. For + * memory, productive means a running task that isn't a reclaimer. For + * CPU, productive means an oncpu task. + * + * Naturally, the FULL state doesn't exist for the CPU resource at the + * system level, but exist at the cgroup level. At the cgroup level, + * FULL means all non-idle tasks in the cgroup are delayed on the CPU + * resource which is being used by others outside of the cgroup or + * throttled by the cgroup cpu.max configuration. * * The percentage of wallclock time spent in those compound stall * states gives pressure numbers between 0 and 100 for each resource, @@ -82,13 +88,13 @@ * * threads = min(nr_nonidle_tasks, nr_cpus) * SOME = min(nr_delayed_tasks / threads, 1) - * FULL = (threads - min(nr_running_tasks, threads)) / threads + * FULL = (threads - min(nr_productive_tasks, threads)) / threads * * For the 257 number crunchers on 256 CPUs, this yields: * * threads = min(257, 256) * SOME = min(1 / 256, 1) = 0.4% - * FULL = (256 - min(257, 256)) / 256 = 0% + * FULL = (256 - min(256, 256)) / 256 = 0% * * For the 1 out of 4 memory-delayed tasks, this yields: * @@ -113,7 +119,7 @@ * For each runqueue, we track: * * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) - * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu]) * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) * * and then periodically aggregate: @@ -234,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) case PSI_MEM_SOME: return unlikely(tasks[NR_MEMSTALL]); case PSI_MEM_FULL: - return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]); + return unlikely(tasks[NR_MEMSTALL] && + tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); case PSI_CPU_SOME: return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); case PSI_CPU_FULL: @@ -711,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu, if (groupc->tasks[t]) { groupc->tasks[t]--; } else if (!psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - groupc->tasks[3], clear, set); + groupc->tasks[3], groupc->tasks[4], + clear, set); psi_bug = 1; } } @@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, int clear = TSK_ONCPU, set = 0; /* - * When we're going to sleep, psi_dequeue() lets us handle - * TSK_RUNNING and TSK_IOWAIT here, where we can combine it - * with TSK_ONCPU and save walking common ancestors twice. + * When we're going to sleep, psi_dequeue() lets us + * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and + * TSK_IOWAIT here, where we can combine it with + * TSK_ONCPU and save walking common ancestors twice. */ if (sleep) { clear |= TSK_RUNNING; + if (prev->in_memstall) + clear |= TSK_MEMSTALL_RUNNING; if (prev->in_iowait) set |= TSK_IOWAIT; } @@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags) rq = this_rq_lock_irq(&rf); current->in_memstall = 1; - psi_task_change(current, 0, TSK_MEMSTALL); + psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); rq_unlock_irq(rq, &rf); } @@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags) rq = this_rq_lock_irq(&rf); current->in_memstall = 0; - psi_task_change(current, TSK_MEMSTALL, 0); + psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0); rq_unlock_irq(rq, &rf); } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index cfb0893a83d4..3a3c826dd83a 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -118,6 +118,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) if (static_branch_likely(&psi_disabled)) return; + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; + if (!wakeup || p->sched_psi_wake_requeue) { if (p->in_memstall) set |= TSK_MEMSTALL; @@ -148,7 +151,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) return; if (p->in_memstall) - clear |= TSK_MEMSTALL; + clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING); psi_task_change(p, clear, 0); } -- cgit v1.2.3 From ff083a2d972f56bebfd82409ca62e5dfce950961 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:22 +0000 Subject: perf: Protect perf_guest_cbs with RCU Protect perf_guest_cbs with RCU to fix multiple possible errors. Luckily, all paths that read perf_guest_cbs already require RCU protection, e.g. to protect the callback chains, so only the direct perf_guest_cbs touchpoints need to be modified. Bug #1 is a simple lack of WRITE_ONCE/READ_ONCE behavior to ensure perf_guest_cbs isn't reloaded between a !NULL check and a dereference. Fixed via the READ_ONCE() in rcu_dereference(). Bug #2 is that on weakly-ordered architectures, updates to the callbacks themselves are not guaranteed to be visible before the pointer is made visible to readers. Fixed by the smp_store_release() in rcu_assign_pointer() when the new pointer is non-NULL. Bug #3 is that, because the callbacks are global, it's possible for readers to run in parallel with an unregisters, and thus a module implementing the callbacks can be unloaded while readers are in flight, resulting in a use-after-free. Fixed by a synchronize_rcu() call when unregistering callbacks. Bug #1 escaped notice because it's extremely unlikely a compiler will reload perf_guest_cbs in this sequence. perf_guest_cbs does get reloaded for future derefs, e.g. for ->is_user_mode(), but the ->is_in_guest() guard all but guarantees the consumer will win the race, e.g. to nullify perf_guest_cbs, KVM has to completely exit the guest and teardown down all VMs before KVM start its module unload / unregister sequence. This also makes it all but impossible to encounter bug #3. Bug #2 has not been a problem because all architectures that register callbacks are strongly ordered and/or have a static set of callbacks. But with help, unloading kvm_intel can trigger bug #1 e.g. wrapping perf_guest_cbs with READ_ONCE in perf_misc_flags() while spamming kvm_intel module load/unload leads to: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP CPU: 6 PID: 1825 Comm: stress Not tainted 5.14.0-rc2+ #459 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 RIP: 0010:perf_misc_flags+0x1c/0x70 Call Trace: perf_prepare_sample+0x53/0x6b0 perf_event_output_forward+0x67/0x160 __perf_event_overflow+0x52/0xf0 handle_pmi_common+0x207/0x300 intel_pmu_handle_irq+0xcf/0x410 perf_event_nmi_handler+0x28/0x50 nmi_handle+0xc7/0x260 default_do_nmi+0x6b/0x170 exc_nmi+0x103/0x130 asm_exc_nmi+0x76/0xbf Fixes: 39447b386c84 ("perf: Enhance perf to allow for guest statistic collection from host") Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211111020738.2512932-2-seanjc@google.com --- arch/arm/kernel/perf_callchain.c | 17 +++++++++++------ arch/arm64/kernel/perf_callchain.c | 18 ++++++++++++------ arch/csky/kernel/perf_callchain.c | 6 ++++-- arch/nds32/kernel/perf_event_cpu.c | 17 +++++++++++------ arch/riscv/kernel/perf_callchain.c | 7 +++++-- arch/x86/events/core.c | 17 +++++++++++------ arch/x86/events/intel/core.c | 9 ++++++--- include/linux/perf_event.h | 13 ++++++++++++- kernel/events/core.c | 13 ++++++++++--- 9 files changed, 82 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c index 3b69a76d341e..1626dfc6f6ce 100644 --- a/arch/arm/kernel/perf_callchain.c +++ b/arch/arm/kernel/perf_callchain.c @@ -62,9 +62,10 @@ user_backtrace(struct frame_tail __user *tail, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct frame_tail __user *tail; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { /* We don't support guest os callchain now */ return; } @@ -98,9 +99,10 @@ callchain_trace(struct stackframe *fr, void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe fr; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { /* We don't support guest os callchain now */ return; } @@ -111,18 +113,21 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re unsigned long perf_instruction_pointer(struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + if (guest_cbs && guest_cbs->is_in_guest()) + return guest_cbs->get_guest_ip(); return instruction_pointer(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); int misc = 0; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) + if (guest_cbs && guest_cbs->is_in_guest()) { + if (guest_cbs->is_user_mode()) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 4a72c2727309..86d9f2013172 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -102,7 +102,9 @@ compat_user_backtrace(struct compat_frame_tail __user *tail, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + if (guest_cbs && guest_cbs->is_in_guest()) { /* We don't support guest os callchain now */ return; } @@ -147,9 +149,10 @@ static bool callchain_trace(void *data, unsigned long pc) void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe frame; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { /* We don't support guest os callchain now */ return; } @@ -160,18 +163,21 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, unsigned long perf_instruction_pointer(struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + if (guest_cbs && guest_cbs->is_in_guest()) + return guest_cbs->get_guest_ip(); return instruction_pointer(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); int misc = 0; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) + if (guest_cbs && guest_cbs->is_in_guest()) { + if (guest_cbs->is_user_mode()) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/csky/kernel/perf_callchain.c b/arch/csky/kernel/perf_callchain.c index ab55e98ee8f6..35318a635a5f 100644 --- a/arch/csky/kernel/perf_callchain.c +++ b/arch/csky/kernel/perf_callchain.c @@ -86,10 +86,11 @@ static unsigned long user_backtrace(struct perf_callchain_entry_ctx *entry, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); unsigned long fp = 0; /* C-SKY does not support virtualization. */ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) + if (guest_cbs && guest_cbs->is_in_guest()) return; fp = regs->regs[4]; @@ -110,10 +111,11 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe fr; /* C-SKY does not support virtualization. */ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { pr_warn("C-SKY does not support perf in guest mode!"); return; } diff --git a/arch/nds32/kernel/perf_event_cpu.c b/arch/nds32/kernel/perf_event_cpu.c index 0ce6f9f307e6..f38791960781 100644 --- a/arch/nds32/kernel/perf_event_cpu.c +++ b/arch/nds32/kernel/perf_event_cpu.c @@ -1363,6 +1363,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); unsigned long fp = 0; unsigned long gp = 0; unsigned long lp = 0; @@ -1371,7 +1372,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, leaf_fp = 0; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { /* We don't support guest os callchain now */ return; } @@ -1479,9 +1480,10 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stackframe fr; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { /* We don't support guest os callchain now */ return; } @@ -1493,20 +1495,23 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, unsigned long perf_instruction_pointer(struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + /* However, NDS32 does not support virtualization */ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); + if (guest_cbs && guest_cbs->is_in_guest()) + return guest_cbs->get_guest_ip(); return instruction_pointer(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); int misc = 0; /* However, NDS32 does not support virtualization */ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) + if (guest_cbs && guest_cbs->is_in_guest()) { + if (guest_cbs->is_user_mode()) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/riscv/kernel/perf_callchain.c b/arch/riscv/kernel/perf_callchain.c index 0bb1854dce83..8ecfc4c128bc 100644 --- a/arch/riscv/kernel/perf_callchain.c +++ b/arch/riscv/kernel/perf_callchain.c @@ -56,10 +56,11 @@ static unsigned long user_backtrace(struct perf_callchain_entry_ctx *entry, void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); unsigned long fp = 0; /* RISC-V does not support perf in guest mode. */ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) + if (guest_cbs && guest_cbs->is_in_guest()) return; fp = regs->s0; @@ -78,8 +79,10 @@ static bool fill_callchain(void *entry, unsigned long pc) void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + /* RISC-V does not support perf in guest mode. */ - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { pr_warn("RISC-V does not support perf in guest mode!"); return; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 38b2c779146f..32cec290d3ad 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2768,10 +2768,11 @@ static bool perf_hw_regs(struct pt_regs *regs) void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct unwind_state state; unsigned long addr; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ return; } @@ -2871,10 +2872,11 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); struct stack_frame frame; const struct stack_frame __user *fp; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (guest_cbs && guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ return; } @@ -2951,18 +2953,21 @@ static unsigned long code_segment_base(struct pt_regs *regs) unsigned long perf_instruction_pointer(struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); + + if (guest_cbs && guest_cbs->is_in_guest()) + return guest_cbs->get_guest_ip(); return regs->ip + code_segment_base(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { + struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); int misc = 0; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) + if (guest_cbs && guest_cbs->is_in_guest()) { + if (guest_cbs->is_user_mode()) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 42cf01ecdd13..2258e02ca350 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2837,6 +2837,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) { struct perf_sample_data data; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_guest_info_callbacks *guest_cbs; int bit; int handled = 0; u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); @@ -2903,9 +2904,11 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) { handled++; - if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() && - perf_guest_cbs->handle_intel_pt_intr)) - perf_guest_cbs->handle_intel_pt_intr(); + + guest_cbs = perf_get_guest_cbs(); + if (unlikely(guest_cbs && guest_cbs->is_in_guest() && + guest_cbs->handle_intel_pt_intr)) + guest_cbs->handle_intel_pt_intr(); else intel_pt_interrupt(); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0dcfd265beed..318c489b735b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1240,7 +1240,18 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); -extern struct perf_guest_info_callbacks *perf_guest_cbs; +extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; +static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) +{ + /* + * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading + * the callbacks between a !NULL check and dereferences, to ensure + * pending stores/changes to the callback pointers are visible before a + * non-NULL perf_guest_cbs is visible to readers, and to prevent a + * module from unloading callbacks while readers are active. + */ + return rcu_dereference(perf_guest_cbs); +} extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); diff --git a/kernel/events/core.c b/kernel/events/core.c index 523106a506ee..c552e1bfcaea 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6526,18 +6526,25 @@ static void perf_pending_event(struct irq_work *entry) * Later on, we might change it to a list if there is * another virtualization implementation supporting the callbacks. */ -struct perf_guest_info_callbacks *perf_guest_cbs; +struct perf_guest_info_callbacks __rcu *perf_guest_cbs; int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { - perf_guest_cbs = cbs; + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) + return -EBUSY; + + rcu_assign_pointer(perf_guest_cbs, cbs); return 0; } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { - perf_guest_cbs = NULL; + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) + return -EINVAL; + + rcu_assign_pointer(perf_guest_cbs, NULL); + synchronize_rcu(); return 0; } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); -- cgit v1.2.3 From 2934e3d09350c1a7ca2433fbeabfcd831e48a575 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:25 +0000 Subject: perf: Stop pretending that perf can handle multiple guest callbacks Drop the 'int' return value from the perf (un)register callbacks helpers and stop pretending perf can support multiple callbacks. The 'int' returns are not future proofing anything as none of the callers take action on an error. It's also not obvious that there will ever be co-tenant hypervisors, and if there are, that allowing multiple callbacks to be registered is desirable or even correct. Opportunistically rename callbacks=>cbs in the affected declarations to match their definitions. No functional change intended. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-5-seanjc@google.com --- arch/arm64/include/asm/kvm_host.h | 4 ++-- arch/arm64/kvm/perf.c | 8 ++++---- include/linux/perf_event.h | 12 ++++++------ kernel/events/core.c | 15 ++++----------- 4 files changed, 16 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 2a5f7f38006f..f680f303ba7c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -675,8 +675,8 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); int kvm_handle_mmio_return(struct kvm_vcpu *vcpu); int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa); -int kvm_perf_init(void); -int kvm_perf_teardown(void); +void kvm_perf_init(void); +void kvm_perf_teardown(void); long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu); gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index c84fe24b2ea1..a0d660cf889e 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -48,12 +48,12 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .get_guest_ip = kvm_get_guest_ip, }; -int kvm_perf_init(void) +void kvm_perf_init(void) { - return perf_register_guest_info_callbacks(&kvm_guest_cbs); + perf_register_guest_info_callbacks(&kvm_guest_cbs); } -int kvm_perf_teardown(void) +void kvm_perf_teardown(void) { - return perf_unregister_guest_info_callbacks(&kvm_guest_cbs); + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 318c489b735b..98c204488496 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1252,8 +1252,8 @@ static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) */ return rcu_dereference(perf_guest_cbs); } -extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); -extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); +extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); +extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); @@ -1497,10 +1497,10 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void perf_bp_event(struct perf_event *event, void *data) { } -static inline int perf_register_guest_info_callbacks -(struct perf_guest_info_callbacks *callbacks) { return 0; } -static inline int perf_unregister_guest_info_callbacks -(struct perf_guest_info_callbacks *callbacks) { return 0; } +static inline void perf_register_guest_info_callbacks +(struct perf_guest_info_callbacks *cbs) { } +static inline void perf_unregister_guest_info_callbacks +(struct perf_guest_info_callbacks *cbs) { } static inline void perf_event_mmap(struct vm_area_struct *vma) { } diff --git a/kernel/events/core.c b/kernel/events/core.c index c552e1bfcaea..17e5b20762c5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6521,31 +6521,24 @@ static void perf_pending_event(struct irq_work *entry) perf_swevent_put_recursion_context(rctx); } -/* - * We assume there is only KVM supporting the callbacks. - * Later on, we might change it to a list if there is - * another virtualization implementation supporting the callbacks. - */ struct perf_guest_info_callbacks __rcu *perf_guest_cbs; -int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) - return -EBUSY; + return; rcu_assign_pointer(perf_guest_cbs, cbs); - return 0; } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); -int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) - return -EINVAL; + return; rcu_assign_pointer(perf_guest_cbs, NULL); synchronize_rcu(); - return 0; } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); -- cgit v1.2.3 From 2aef6f306b39bbe74e2287d6e2ee07c4867d87d0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:29 +0000 Subject: perf: Force architectures to opt-in to guest callbacks Introduce GUEST_PERF_EVENTS and require architectures to select it to allow registering and using guest callbacks in perf. This will hopefully make it more difficult for new architectures to add useless "support" for guest callbacks, e.g. via copy+paste. Stubbing out the helpers has the happy bonus of avoiding a load of perf_guest_cbs when GUEST_PERF_EVENTS=n on arm64/x86. Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-9-seanjc@google.com --- arch/arm64/kvm/Kconfig | 1 + arch/x86/kvm/Kconfig | 1 + arch/x86/xen/Kconfig | 1 + include/linux/perf_event.h | 6 ++++++ init/Kconfig | 4 ++++ kernel/events/core.c | 2 ++ 6 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 8ffcbe29395e..e9761d84f982 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -39,6 +39,7 @@ menuconfig KVM select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_VCPU_RUN_PID_CHANGE select SCHED_INFO + select GUEST_PERF_EVENTS if PERF_EVENTS help Support hosting virtualized guest machines. diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 619186138176..47bdbe705a76 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -36,6 +36,7 @@ config KVM select KVM_MMIO select SCHED_INFO select PERF_EVENTS + select GUEST_PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_NO_POLL diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 6bcd3d8ca6ac..85246dd9faa1 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -23,6 +23,7 @@ config XEN_PV select PARAVIRT_XXL select XEN_HAVE_PVMMU select XEN_HAVE_VPMU + select GUEST_PERF_EVENTS help Support running as a Xen PV guest. diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 346d5aff5804..ea47ef616ee0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1242,6 +1242,7 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, enum perf_bpf_event_type type, u16 flags); +#ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) { @@ -1280,6 +1281,11 @@ static inline unsigned int perf_guest_handle_intel_pt_intr(void) } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); +#else +static inline unsigned int perf_guest_state(void) { return 0; } +static inline unsigned long perf_guest_get_ip(void) { return 0; } +static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; } +#endif /* CONFIG_GUEST_PERF_EVENTS */ extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); diff --git a/init/Kconfig b/init/Kconfig index 036b750e8d8a..72d40b3b5805 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1804,6 +1804,10 @@ config HAVE_PERF_EVENTS help See tools/perf/design.txt for details. +config GUEST_PERF_EVENTS + bool + depends on HAVE_PERF_EVENTS + config PERF_USE_VMALLOC bool help diff --git a/kernel/events/core.c b/kernel/events/core.c index 17e5b20762c5..5a3502cd5362 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6521,6 +6521,7 @@ static void perf_pending_event(struct irq_work *entry) perf_swevent_put_recursion_context(rctx); } +#ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) @@ -6541,6 +6542,7 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); +#endif static void perf_output_sample_regs(struct perf_output_handle *handle, -- cgit v1.2.3 From 87b940a0675e25261f022ac3e53e0dfff9cdb995 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 Nov 2021 02:07:30 +0000 Subject: perf/core: Use static_call to optimize perf_guest_info_callbacks Use static_call to optimize perf's guest callbacks on arm64 and x86, which are now the only architectures that define the callbacks. Use DEFINE_STATIC_CALL_RET0 as the default/NULL for all guest callbacks, as the callback semantics are that a return value '0' means "not in guest". static_call obviously avoids the overhead of CONFIG_RETPOLINE=y, but is also advantageous versus other solutions, e.g. per-cpu callbacks, in that a per-cpu memory load is not needed to detect the !guest case. Based on code from Peter and Like. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Sean Christopherson Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20211111020738.2512932-10-seanjc@google.com --- include/linux/perf_event.h | 34 ++++++++-------------------------- kernel/events/core.c | 15 +++++++++++++++ 2 files changed, 23 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ea47ef616ee0..0ac7d867ca0c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1244,40 +1244,22 @@ extern void perf_event_bpf_event(struct bpf_prog *prog, #ifdef CONFIG_GUEST_PERF_EVENTS extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; -static inline struct perf_guest_info_callbacks *perf_get_guest_cbs(void) -{ - /* - * Callbacks are RCU-protected and must be READ_ONCE to avoid reloading - * the callbacks between a !NULL check and dereferences, to ensure - * pending stores/changes to the callback pointers are visible before a - * non-NULL perf_guest_cbs is visible to readers, and to prevent a - * module from unloading callbacks while readers are active. - */ - return rcu_dereference(perf_guest_cbs); -} + +DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); +DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); +DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); + static inline unsigned int perf_guest_state(void) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - return guest_cbs ? guest_cbs->state() : 0; + return static_call(__perf_guest_state)(); } static inline unsigned long perf_guest_get_ip(void) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - /* - * Arbitrarily return '0' in the unlikely scenario that the callbacks - * are unregistered between checking guest state and getting the IP. - */ - return guest_cbs ? guest_cbs->get_ip() : 0; + return static_call(__perf_guest_get_ip)(); } static inline unsigned int perf_guest_handle_intel_pt_intr(void) { - struct perf_guest_info_callbacks *guest_cbs = perf_get_guest_cbs(); - - if (guest_cbs && guest_cbs->handle_intel_pt_intr) - return guest_cbs->handle_intel_pt_intr(); - return 0; + return static_call(__perf_guest_handle_intel_pt_intr)(); } extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a3502cd5362..3b3297a57228 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6524,12 +6524,23 @@ static void perf_pending_event(struct irq_work *entry) #ifdef CONFIG_GUEST_PERF_EVENTS struct perf_guest_info_callbacks __rcu *perf_guest_cbs; +DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); +DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); +DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); + void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) return; rcu_assign_pointer(perf_guest_cbs, cbs); + static_call_update(__perf_guest_state, cbs->state); + static_call_update(__perf_guest_get_ip, cbs->get_ip); + + /* Implementing ->handle_intel_pt_intr is optional. */ + if (cbs->handle_intel_pt_intr) + static_call_update(__perf_guest_handle_intel_pt_intr, + cbs->handle_intel_pt_intr); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); @@ -6539,6 +6550,10 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) return; rcu_assign_pointer(perf_guest_cbs, NULL); + static_call_update(__perf_guest_state, (void *)&__static_call_return0); + static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_intel_pt_intr, + (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); -- cgit v1.2.3 From e7f7c99ba911f56bc338845c1cd72954ba591707 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 15 Nov 2021 11:55:57 -0600 Subject: signal: In get_signal test for signal_group_exit every time through the loop Recently while investigating a problem with rr and signals I noticed that siglock is dropped in ptrace_signal and get_signal does not jump to relock. Looking farther to see if the problem is anywhere else I see that do_signal_stop also returns if signal_group_exit is true. I believe that test can now never be true, but it is a bit hard to trace through and be certain. Testing signal_group_exit is not expensive, so move the test for signal_group_exit into the for loop inside of get_signal to ensure the test is never skipped improperly. This has been a potential problem since I added the test for signal_group_exit was added. Fixes: 35634ffa1751 ("signal: Always notice exiting tasks") Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/875yssekcd.fsf_-_@email.froward.int.ebiederm.org Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7c4b7ae714d4..986fa69c15c5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2662,19 +2662,19 @@ relock: goto relock; } - /* Has this task already been marked for death? */ - if (signal_group_exit(signal)) { - ksig->info.si_signo = signr = SIGKILL; - sigdelset(¤t->pending.signal, SIGKILL); - trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, - &sighand->action[SIGKILL - 1]); - recalc_sigpending(); - goto fatal; - } - for (;;) { struct k_sigaction *ka; + /* Has this task already been marked for death? */ + if (signal_group_exit(signal)) { + ksig->info.si_signo = signr = SIGKILL; + sigdelset(¤t->pending.signal, SIGKILL); + trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, + &sighand->action[SIGKILL - 1]); + recalc_sigpending(); + goto fatal; + } + if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) && do_signal_stop(0)) goto relock; -- cgit v1.2.3 From 5768d8906bc23d512b1a736c1e198aa833a6daa4 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 15 Nov 2021 13:47:13 -0600 Subject: signal: Requeue signals in the appropriate queue In the event that a tracer changes which signal needs to be delivered and that signal is currently blocked then the signal needs to be requeued for later delivery. With the advent of CLONE_THREAD the kernel has 2 signal queues per task. The per process queue and the per task queue. Update the code so that if the signal is removed from the per process queue it is requeued on the per process queue. This is necessary to make it appear the signal was never dequeued. The rr debugger reasonably believes that the state of the process from the last ptrace_stop it observed until PTRACE_EVENT_EXIT can be recreated by simply letting a process run. If a SIGKILL interrupts a ptrace_stop this is not true today. So return signals to their original queue in ptrace_signal so that signals that are not delivered appear like they were never dequeued. Fixes: 794aa320b79d ("[PATCH] sigfix-2.5.40-D6") History Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.gi Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/87zgq4d5r4.fsf_-_@email.froward.int.ebiederm.org Signed-off-by: "Eric W. Biederman" --- fs/signalfd.c | 5 +++-- include/linux/sched/signal.h | 7 ++++--- kernel/signal.c | 21 ++++++++++++++------- 3 files changed, 21 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/signalfd.c b/fs/signalfd.c index 040e1cf90528..74f134cd1ff6 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -165,11 +165,12 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info, int nonblock) { + enum pid_type type; ssize_t ret; DECLARE_WAITQUEUE(wait, current); spin_lock_irq(¤t->sighand->siglock); - ret = dequeue_signal(current, &ctx->sigmask, info); + ret = dequeue_signal(current, &ctx->sigmask, info, &type); switch (ret) { case 0: if (!nonblock) @@ -184,7 +185,7 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info add_wait_queue(¤t->sighand->signalfd_wqh, &wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); - ret = dequeue_signal(current, &ctx->sigmask, info); + ret = dequeue_signal(current, &ctx->sigmask, info, &type); if (ret != 0) break; if (signal_pending(current)) { diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 23505394ef70..167995d471da 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -286,17 +286,18 @@ static inline int signal_group_exit(const struct signal_struct *sig) extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); -extern int dequeue_signal(struct task_struct *task, - sigset_t *mask, kernel_siginfo_t *info); +extern int dequeue_signal(struct task_struct *task, sigset_t *mask, + kernel_siginfo_t *info, enum pid_type *type); static inline int kernel_dequeue_signal(void) { struct task_struct *task = current; kernel_siginfo_t __info; + enum pid_type __type; int ret; spin_lock_irq(&task->sighand->siglock); - ret = dequeue_signal(task, &task->blocked, &__info); + ret = dequeue_signal(task, &task->blocked, &__info, &__type); spin_unlock_irq(&task->sighand->siglock); return ret; diff --git a/kernel/signal.c b/kernel/signal.c index 986fa69c15c5..43e8b7e362b0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -626,7 +626,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, * * All callers have to hold the siglock. */ -int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *info) +int dequeue_signal(struct task_struct *tsk, sigset_t *mask, + kernel_siginfo_t *info, enum pid_type *type) { bool resched_timer = false; int signr; @@ -634,8 +635,10 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ + *type = PIDTYPE_PID; signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer); if (!signr) { + *type = PIDTYPE_TGID; signr = __dequeue_signal(&tsk->signal->shared_pending, mask, info, &resched_timer); #ifdef CONFIG_POSIX_TIMERS @@ -2522,7 +2525,7 @@ static void do_freezer_trap(void) freezable_schedule(); } -static int ptrace_signal(int signr, kernel_siginfo_t *info) +static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) { /* * We do not check sig_kernel_stop(signr) but set this marker @@ -2563,7 +2566,7 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info) /* If the (new) signal is now blocked, requeue it. */ if (sigismember(¤t->blocked, signr)) { - send_signal(signr, info, current, PIDTYPE_PID); + send_signal(signr, info, current, type); signr = 0; } @@ -2664,6 +2667,7 @@ relock: for (;;) { struct k_sigaction *ka; + enum pid_type type; /* Has this task already been marked for death? */ if (signal_group_exit(signal)) { @@ -2706,16 +2710,18 @@ relock: * so that the instruction pointer in the signal stack * frame points to the faulting instruction. */ + type = PIDTYPE_PID; signr = dequeue_synchronous_signal(&ksig->info); if (!signr) - signr = dequeue_signal(current, ¤t->blocked, &ksig->info); + signr = dequeue_signal(current, ¤t->blocked, + &ksig->info, &type); if (!signr) break; /* will return 0 */ if (unlikely(current->ptrace) && (signr != SIGKILL) && !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) { - signr = ptrace_signal(signr, &ksig->info); + signr = ptrace_signal(signr, &ksig->info, type); if (!signr) continue; } @@ -3540,6 +3546,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, ktime_t *to = NULL, timeout = KTIME_MAX; struct task_struct *tsk = current; sigset_t mask = *which; + enum pid_type type; int sig, ret = 0; if (ts) { @@ -3556,7 +3563,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, signotset(&mask); spin_lock_irq(&tsk->sighand->siglock); - sig = dequeue_signal(tsk, &mask, info); + sig = dequeue_signal(tsk, &mask, info, &type); if (!sig && timeout) { /* * None ready, temporarily unblock those we're interested @@ -3575,7 +3582,7 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); - sig = dequeue_signal(tsk, &mask, info); + sig = dequeue_signal(tsk, &mask, info, &type); } spin_unlock_irq(&tsk->sighand->siglock); -- cgit v1.2.3 From b171f667f3787946a8ba9644305339e93ae799c9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 15 Nov 2021 13:49:45 -0600 Subject: signal: Requeue ptrace signals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kyle Huey writes: > rr, a userspace record and replay debugger[0], uses the recorded register > state at PTRACE_EVENT_EXIT to find the point in time at which to cease > executing the program during replay. > > If a SIGKILL races with processing another signal in get_signal, it is > possible for the kernel to decline to notify the tracer of the original > signal. But if the original signal had a handler, the kernel proceeds > with setting up a signal handler frame as if the tracer had chosen to > deliver the signal unmodified to the tracee. When the kernel goes to > execute the signal handler that it has now modified the stack and registers > for, it will discover the pending SIGKILL, and terminate the tracee > without executing the handler. When PTRACE_EVENT_EXIT is delivered to > the tracer, however, the effects of handler setup will be visible to > the tracer. > > Because rr (the tracer) was never notified of the signal, it is not aware > that a signal handler frame was set up and expects the state of the program > at PTRACE_EVENT_EXIT to be a state that will be reconstructed naturally > by allowing the program to execute from the last event. When that fails > to happen during replay, rr will assert and die. > > The following patches add an explicit check for a newly pending SIGKILL > after the ptracer has been notified and the siglock has been reacquired. > If this happens, we stop processing the current signal and proceed > immediately to handling the SIGKILL. This makes the state reported at > PTRACE_EVENT_EXIT the unmodified state of the program, and also avoids the > work to set up a signal handler frame that will never be used. > > [0] https://rr-project.org/ The problem is that while the traced process makes it into ptrace_stop, the tracee is killed before the tracer manages to wait for the tracee and discover which signal was about to be delivered. More generally the problem is that while siglock was dropped a signal with process wide effect is short cirucit delivered to the entire process killing it, but the process continues to try and deliver another signal. In general it impossible to avoid all cases where work is performed after the process has been killed. In particular if the process is killed after get_signal returns the code will simply not know it has been killed until after delivering the signal frame to userspace. On the other hand when the code has already discovered the process has been killed and taken user space visible action that shows the kernel knows the process has been killed, it is just silly to then write the signal frame to the user space stack. Instead of being silly detect the process has been killed in ptrace_signal and requeue the signal so the code can pretend it was simply never dequeued for delivery. To test the process has been killed I use fatal_signal_pending rather than signal_group_exit to match the test in signal_pending_state which is used in schedule which is where ptrace_stop detects the process has been killed. Requeuing the signal so the code can pretend it was simply never dequeued improves the user space visible behavior that has been present since ebf5ebe31d2c ("[PATCH] signal-fixes-2.5.59-A4"). Kyle Huey verified that this change in behavior and makes rr happy. Reported-by: Kyle Huey Reported-by: Marko Mäkelä History Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.gi Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/87tugcd5p2.fsf_-_@email.froward.int.ebiederm.org Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 43e8b7e362b0..621401550f0f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2565,7 +2565,8 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) } /* If the (new) signal is now blocked, requeue it. */ - if (sigismember(¤t->blocked, signr)) { + if (sigismember(¤t->blocked, signr) || + fatal_signal_pending(current)) { send_signal(signr, info, current, type); signr = 0; } -- cgit v1.2.3 From f86b0aaad741c45aba5a84a27277dd56a96808ba Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Wed, 17 Nov 2021 17:15:42 -0800 Subject: tracing/histogram: Fix UAF in destroy_hist_field() Calling destroy_hist_field() on an expression will recursively free any operands associated with the expression. If during expression parsing the operands of the expression are already set when an error is encountered, there is no need to explicity free the operands. Doing so will result in destroy_hist_field() being called twice for the operands and lead to a use-after-free (UAF) error. If the operands are associated with the expression, only call destroy_hist_field() on the expression since the operands will be recursively freed. Link: https://lore.kernel.org/all/CAHk-=wgcrEbFgkw9720H3tW-AhHOoEKhYwZinYJw4FpzSaJ6_Q@mail.gmail.com/ Link: https://lkml.kernel.org/r/20211118011542.1420131-1-kaleshsingh@google.com Suggested-by: Linus Torvalds Signed-off-by: Kalesh Singh Fixes: 8b5d46fd7a38 ("tracing/histogram: Optimize division by constants") Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 41 +++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5ea2c9ec54a6..9555b8e1d1e3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2576,28 +2576,27 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, /* Split the expression string at the root operator */ if (!sep) - goto free; + return ERR_PTR(-EINVAL); + *sep = '\0'; operand1_str = str; str = sep+1; /* Binary operator requires both operands */ if (*operand1_str == '\0' || *str == '\0') - goto free; + return ERR_PTR(-EINVAL); operand_flags = 0; /* LHS of string is an expression e.g. a+b in a+b+c */ operand1 = parse_expr(hist_data, file, operand1_str, operand_flags, NULL, n_subexprs); - if (IS_ERR(operand1)) { - ret = PTR_ERR(operand1); - operand1 = NULL; - goto free; - } + if (IS_ERR(operand1)) + return ERR_CAST(operand1); + if (operand1->flags & HIST_FIELD_FL_STRING) { hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(operand1_str)); ret = -EINVAL; - goto free; + goto free_op1; } /* RHS of string is another expression e.g. c in a+b+c */ @@ -2605,13 +2604,12 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs); if (IS_ERR(operand2)) { ret = PTR_ERR(operand2); - operand2 = NULL; - goto free; + goto free_op1; } if (operand2->flags & HIST_FIELD_FL_STRING) { hist_err(file->tr, HIST_ERR_INVALID_STR_OPERAND, errpos(str)); ret = -EINVAL; - goto free; + goto free_operands; } switch (field_op) { @@ -2629,12 +2627,12 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, break; default: ret = -EINVAL; - goto free; + goto free_operands; } ret = check_expr_operands(file->tr, operand1, operand2, &var1, &var2); if (ret) - goto free; + goto free_operands; operand_flags = var1 ? var1->flags : operand1->flags; operand2_flags = var2 ? var2->flags : operand2->flags; @@ -2653,12 +2651,13 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr = create_hist_field(hist_data, NULL, flags, var_name); if (!expr) { ret = -ENOMEM; - goto free; + goto free_operands; } operand1->read_once = true; operand2->read_once = true; + /* The operands are now owned and free'd by 'expr' */ expr->operands[0] = operand1; expr->operands[1] = operand2; @@ -2669,7 +2668,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, if (!divisor) { hist_err(file->tr, HIST_ERR_DIVISION_BY_ZERO, errpos(str)); ret = -EDOM; - goto free; + goto free_expr; } /* @@ -2709,18 +2708,22 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->type = kstrdup_const(operand1->type, GFP_KERNEL); if (!expr->type) { ret = -ENOMEM; - goto free; + goto free_expr; } expr->name = expr_str(expr, 0); } return expr; -free: - destroy_hist_field(operand1, 0); + +free_operands: destroy_hist_field(operand2, 0); - destroy_hist_field(expr, 0); +free_op1: + destroy_hist_field(operand1, 0); + return ERR_PTR(ret); +free_expr: + destroy_hist_field(expr, 0); return ERR_PTR(ret); } -- cgit v1.2.3 From c4c1dbcc09e723295969a62aff401815b7ee15f4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 18 Nov 2021 12:22:17 -0800 Subject: tracing: Use memset_startat() to zero struct trace_iterator In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memset(), avoid intentionally writing across neighboring fields. Use memset_startat() to avoid confusing memset() about writing beyond the target struct member. Link: https://lkml.kernel.org/r/20211118202217.1285588-1-keescook@chromium.org Signed-off-by: Kees Cook Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f9139dc1262c..e3c80cfd4eec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6706,9 +6706,7 @@ waitagain: cnt = PAGE_SIZE - 1; /* reset all but tr, trace, and overruns */ - memset(&iter->seq, 0, - sizeof(struct trace_iterator) - - offsetof(struct trace_iterator, seq)); + memset_startat(iter, 0, seq); cpumask_clear(iter->started); trace_seq_init(&iter->seq); iter->pos = -1; -- cgit v1.2.3 From 2ef75e9bd2c998f1c6f6f23a3744136105ddefd5 Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Thu, 18 Nov 2021 17:55:16 +0300 Subject: tracing: Don't use out-of-sync va_list in event printing If trace_seq becomes full, trace_seq_vprintf() no longer consumes arguments from va_list, making va_list out of sync with format processing by trace_check_vprintf(). This causes va_arg() in trace_check_vprintf() to return wrong positional argument, which results into a WARN_ON_ONCE() hit. ftrace_stress_test from LTP triggers this situation. Fix it by explicitly avoiding further use if va_list at the point when it's consistency can no longer be guaranteed. Link: https://lkml.kernel.org/r/20211118145516.13219-1-nikita.yushchenko@virtuozzo.com Signed-off-by: Nikita Yushchenko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e3c80cfd4eec..88de94da596b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3812,6 +3812,18 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, iter->fmt[i] = '\0'; trace_seq_vprintf(&iter->seq, iter->fmt, ap); + /* + * If iter->seq is full, the above call no longer guarantees + * that ap is in sync with fmt processing, and further calls + * to va_arg() can return wrong positional arguments. + * + * Ensure that ap is no longer used in this case. + */ + if (iter->seq.full) { + p = ""; + break; + } + if (star) len = va_arg(ap, int); -- cgit v1.2.3 From e349d945fac76bddc78ae1cb92a0145b427a87ce Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 18 Nov 2021 11:11:13 -0600 Subject: signal: Don't always set SA_IMMUTABLE for forced signals Recently to prevent issues with SECCOMP_RET_KILL and similar signals being changed before they are delivered SA_IMMUTABLE was added. Unfortunately this broke debuggers[1][2] which reasonably expect to be able to trap synchronous SIGTRAP and SIGSEGV even when the target process is not configured to handle those signals. Update force_sig_to_task to support both the case when we can allow the debugger to intercept and possibly ignore the signal and the case when it is not safe to let userspace know about the signal until the process has exited. Suggested-by: Linus Torvalds Reported-by: Kyle Huey Reported-by: kernel test robot Cc: stable@vger.kernel.org [1] https://lkml.kernel.org/r/CAP045AoMY4xf8aC_4QU_-j7obuEPYgTcnQQP3Yxk=2X90jtpjw@mail.gmail.com [2] https://lkml.kernel.org/r/20211117150258.GB5403@xsang-OptiPlex-9020 Fixes: 00b06da29cf9 ("signal: Add SA_IMMUTABLE to ensure forced siganls do not get changed") Link: https://lkml.kernel.org/r/877dd5qfw5.fsf_-_@email.froward.int.ebiederm.org Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Kyle Huey Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7c4b7ae714d4..7815e1bbeddc 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1298,6 +1298,12 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p return ret; } +enum sig_handler { + HANDLER_CURRENT, /* If reachable use the current handler */ + HANDLER_SIG_DFL, /* Always use SIG_DFL handler semantics */ + HANDLER_EXIT, /* Only visible as the process exit code */ +}; + /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. @@ -1310,7 +1316,8 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p * that is why we also clear SIGNAL_UNKILLABLE. */ static int -force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool sigdfl) +force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, + enum sig_handler handler) { unsigned long int flags; int ret, blocked, ignored; @@ -1321,9 +1328,10 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; blocked = sigismember(&t->blocked, sig); - if (blocked || ignored || sigdfl) { + if (blocked || ignored || (handler != HANDLER_CURRENT)) { action->sa.sa_handler = SIG_DFL; - action->sa.sa_flags |= SA_IMMUTABLE; + if (handler == HANDLER_EXIT) + action->sa.sa_flags |= SA_IMMUTABLE; if (blocked) { sigdelset(&t->blocked, sig); recalc_sigpending_and_wake(t); @@ -1343,7 +1351,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool int force_sig_info(struct kernel_siginfo *info) { - return force_sig_info_to_task(info, current, false); + return force_sig_info_to_task(info, current, HANDLER_CURRENT); } /* @@ -1660,7 +1668,7 @@ void force_fatal_sig(int sig) info.si_code = SI_KERNEL; info.si_pid = 0; info.si_uid = 0; - force_sig_info_to_task(&info, current, true); + force_sig_info_to_task(&info, current, HANDLER_SIG_DFL); } /* @@ -1693,7 +1701,7 @@ int force_sig_fault_to_task(int sig, int code, void __user *addr info.si_flags = flags; info.si_isr = isr; #endif - return force_sig_info_to_task(&info, t, false); + return force_sig_info_to_task(&info, t, HANDLER_CURRENT); } int force_sig_fault(int sig, int code, void __user *addr @@ -1813,7 +1821,8 @@ int force_sig_seccomp(int syscall, int reason, bool force_coredump) info.si_errno = reason; info.si_arch = syscall_get_arch(current); info.si_syscall = syscall; - return force_sig_info_to_task(&info, current, force_coredump); + return force_sig_info_to_task(&info, current, + force_coredump ? HANDLER_EXIT : HANDLER_CURRENT); } /* For the crazy architectures that include trap information in -- cgit v1.2.3 From fcb116bc43c8c37c052530ead79872f8b2615711 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 18 Nov 2021 14:23:21 -0600 Subject: signal: Replace force_fatal_sig with force_exit_sig when in doubt Recently to prevent issues with SECCOMP_RET_KILL and similar signals being changed before they are delivered SA_IMMUTABLE was added. Unfortunately this broke debuggers[1][2] which reasonably expect to be able to trap synchronous SIGTRAP and SIGSEGV even when the target process is not configured to handle those signals. Add force_exit_sig and use it instead of force_fatal_sig where historically the code has directly called do_exit. This has the implementation benefits of going through the signal exit path (including generating core dumps) without the danger of allowing userspace to ignore or change these signals. This avoids userspace regressions as older kernels exited with do_exit which debuggers also can not intercept. In the future is should be possible to improve the quality of implementation of the kernel by changing some of these force_exit_sig calls to force_fatal_sig. That can be done where it matters on a case-by-case basis with careful analysis. Reported-by: Kyle Huey Reported-by: kernel test robot [1] https://lkml.kernel.org/r/CAP045AoMY4xf8aC_4QU_-j7obuEPYgTcnQQP3Yxk=2X90jtpjw@mail.gmail.com [2] https://lkml.kernel.org/r/20211117150258.GB5403@xsang-OptiPlex-9020 Fixes: 00b06da29cf9 ("signal: Add SA_IMMUTABLE to ensure forced siganls do not get changed") Fixes: a3616a3c0272 ("signal/m68k: Use force_sigsegv(SIGSEGV) in fpsp040_die") Fixes: 83a1f27ad773 ("signal/powerpc: On swapcontext failure force SIGSEGV") Fixes: 9bc508cf0791 ("signal/s390: Use force_sigsegv in default_trap_handler") Fixes: 086ec444f866 ("signal/sparc32: In setup_rt_frame and setup_fram use force_fatal_sig") Fixes: c317d306d550 ("signal/sparc32: Exit with a fatal signal when try_to_clear_window_buffer fails") Fixes: 695dd0d634df ("signal/x86: In emulate_vsyscall force a signal instead of calling do_exit") Fixes: 1fbd60df8a85 ("signal/vm86_32: Properly send SIGSEGV when the vm86 state cannot be saved.") Fixes: 941edc5bf174 ("exit/syscall_user_dispatch: Send ordinary signals on failure") Link: https://lkml.kernel.org/r/871r3dqfv8.fsf_-_@email.froward.int.ebiederm.org Reviewed-by: Kees Cook Tested-by: Kees Cook Tested-by: Kyle Huey Signed-off-by: "Eric W. Biederman" --- arch/m68k/kernel/traps.c | 2 +- arch/powerpc/kernel/signal_32.c | 2 +- arch/powerpc/kernel/signal_64.c | 4 ++-- arch/s390/kernel/traps.c | 2 +- arch/sparc/kernel/signal_32.c | 4 ++-- arch/sparc/kernel/windows.c | 2 +- arch/x86/entry/vsyscall/vsyscall_64.c | 2 +- arch/x86/kernel/vm86_32.c | 2 +- include/linux/sched/signal.h | 1 + kernel/entry/syscall_user_dispatch.c | 4 ++-- kernel/signal.c | 13 +++++++++++++ 11 files changed, 26 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index 99058a6da956..34d6458340b0 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -1145,7 +1145,7 @@ asmlinkage void set_esp0(unsigned long ssp) */ asmlinkage void fpsp040_die(void) { - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); } #ifdef CONFIG_M68KFPU_EMU diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 00a9c9cd6d42..3e053e2fd6b6 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -1063,7 +1063,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, * We kill the task with a SIGSEGV in this situation. */ if (do_setcontext(new_ctx, regs, 0)) { - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); return -EFAULT; } diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index ef518535d436..d1e1fc0acbea 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -704,7 +704,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, */ if (__get_user_sigset(&set, &new_ctx->uc_sigmask)) { - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); return -EFAULT; } set_current_blocked(&set); @@ -713,7 +713,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx, return -EFAULT; if (__unsafe_restore_sigcontext(current, NULL, 0, &new_ctx->uc_mcontext)) { user_read_access_end(); - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); return -EFAULT; } user_read_access_end(); diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 035705c9f23e..2b780786fc68 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -84,7 +84,7 @@ static void default_trap_handler(struct pt_regs *regs) { if (user_mode(regs)) { report_user_fault(regs, SIGSEGV, 0); - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); } else die(regs, "Unknown program exception"); } diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index cd677bc564a7..ffab16369bea 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -244,7 +244,7 @@ static int setup_frame(struct ksignal *ksig, struct pt_regs *regs, get_sigframe(ksig, regs, sigframe_size); if (invalid_frame_pointer(sf, sigframe_size)) { - force_fatal_sig(SIGILL); + force_exit_sig(SIGILL); return -EINVAL; } @@ -336,7 +336,7 @@ static int setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs, sf = (struct rt_signal_frame __user *) get_sigframe(ksig, regs, sigframe_size); if (invalid_frame_pointer(sf, sigframe_size)) { - force_fatal_sig(SIGILL); + force_exit_sig(SIGILL); return -EINVAL; } diff --git a/arch/sparc/kernel/windows.c b/arch/sparc/kernel/windows.c index bbbd40cc6b28..8f20862ccc83 100644 --- a/arch/sparc/kernel/windows.c +++ b/arch/sparc/kernel/windows.c @@ -122,7 +122,7 @@ void try_to_clear_window_buffer(struct pt_regs *regs, int who) if ((sp & 7) || copy_to_user((char __user *) sp, &tp->reg_window[window], sizeof(struct reg_window32))) { - force_fatal_sig(SIGILL); + force_exit_sig(SIGILL); return; } } diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 0b6b277ee050..fd2ee9408e91 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -226,7 +226,7 @@ bool emulate_vsyscall(unsigned long error_code, if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { warn_bad_vsyscall(KERN_DEBUG, regs, "seccomp tried to change syscall nr or ip"); - force_fatal_sig(SIGSYS); + force_exit_sig(SIGSYS); return true; } regs->orig_ax = -1; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index cce1c89cb7df..c21bcd668284 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -160,7 +160,7 @@ Efault_end: user_access_end(); Efault: pr_alert("could not access userspace vm86 info\n"); - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); goto exit_vm86; } diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 23505394ef70..33a50642cf41 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -352,6 +352,7 @@ extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); extern void force_fatal_sig(int); +extern void force_exit_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index 4508201847d2..0b6379adff6b 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -48,7 +48,7 @@ bool syscall_user_dispatch(struct pt_regs *regs) * the selector is loaded by userspace. */ if (unlikely(__get_user(state, sd->selector))) { - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); return true; } @@ -56,7 +56,7 @@ bool syscall_user_dispatch(struct pt_regs *regs) return false; if (state != SYSCALL_DISPATCH_FILTER_BLOCK) { - force_fatal_sig(SIGSYS); + force_exit_sig(SIGSYS); return true; } } diff --git a/kernel/signal.c b/kernel/signal.c index 7815e1bbeddc..a629b11bf3e0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1671,6 +1671,19 @@ void force_fatal_sig(int sig) force_sig_info_to_task(&info, current, HANDLER_SIG_DFL); } +void force_exit_sig(int sig) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_KERNEL; + info.si_pid = 0; + info.si_uid = 0; + force_sig_info_to_task(&info, current, HANDLER_EXIT); +} + /* * When things go south during signal handling, we * will force a SIGSEGV. And if the signal that caused -- cgit v1.2.3 From 6326948f940dc3f77066d5cdc44ba6afe67830c0 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 29 Sep 2021 11:01:21 -0400 Subject: lsm: security_task_getsecid_subj() -> security_current_getsecid_subj() The security_task_getsecid_subj() LSM hook invites misuse by allowing callers to specify a task even though the hook is only safe when the current task is referenced. Fix this by removing the task_struct argument to the hook, requiring LSM implementations to use the current task. While we are changing the hook declaration we also rename the function to security_current_getsecid_subj() in an effort to reinforce that the hook captures the subjective credentials of the current task and not an arbitrary task on the system. Reviewed-by: Serge Hallyn Reviewed-by: Casey Schaufler Signed-off-by: Paul Moore --- include/linux/lsm_hook_defs.h | 3 +-- include/linux/lsm_hooks.h | 8 +++----- include/linux/security.h | 4 ++-- kernel/audit.c | 4 ++-- kernel/auditfilter.c | 3 +-- kernel/auditsc.c | 11 ++++++++++- net/netlabel/netlabel_unlabeled.c | 2 +- net/netlabel/netlabel_user.h | 2 +- security/apparmor/lsm.c | 13 ++++++++++--- security/integrity/ima/ima_appraise.c | 2 +- security/integrity/ima/ima_main.c | 14 +++++++------- security/security.c | 6 +++--- security/selinux/hooks.c | 19 +++---------------- security/smack/smack.h | 16 ---------------- security/smack/smack_lsm.c | 9 ++++----- 15 files changed, 49 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index df8de62f4710..ae2228f0711d 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -206,8 +206,7 @@ LSM_HOOK(int, 0, task_fix_setgid, struct cred *new, const struct cred * old, LSM_HOOK(int, 0, task_setpgid, struct task_struct *p, pid_t pgid) LSM_HOOK(int, 0, task_getpgid, struct task_struct *p) LSM_HOOK(int, 0, task_getsid, struct task_struct *p) -LSM_HOOK(void, LSM_RET_VOID, task_getsecid_subj, - struct task_struct *p, u32 *secid) +LSM_HOOK(void, LSM_RET_VOID, current_getsecid_subj, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, task_getsecid_obj, struct task_struct *p, u32 *secid) LSM_HOOK(int, 0, task_setnice, struct task_struct *p, int nice) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d45b6f6e27fd..52c1990644b9 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -719,11 +719,9 @@ * @p. * @p contains the task_struct for the process. * Return 0 if permission is granted. - * @task_getsecid_subj: - * Retrieve the subjective security identifier of the task_struct in @p - * and return it in @secid. Special care must be taken to ensure that @p - * is the either the "current" task, or the caller has exclusive access - * to @p. + * @current_getsecid_subj: + * Retrieve the subjective security identifier of the current task and + * return it in @secid. * In case of failure, @secid will be set to zero. * @task_getsecid_obj: * Retrieve the objective security identifier of the task_struct in @p diff --git a/include/linux/security.h b/include/linux/security.h index bbf44a466832..bb301963e333 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -418,7 +418,7 @@ int security_task_fix_setgid(struct cred *new, const struct cred *old, int security_task_setpgid(struct task_struct *p, pid_t pgid); int security_task_getpgid(struct task_struct *p); int security_task_getsid(struct task_struct *p); -void security_task_getsecid_subj(struct task_struct *p, u32 *secid); +void security_current_getsecid_subj(u32 *secid); void security_task_getsecid_obj(struct task_struct *p, u32 *secid); int security_task_setnice(struct task_struct *p, int nice); int security_task_setioprio(struct task_struct *p, int ioprio); @@ -1119,7 +1119,7 @@ static inline int security_task_getsid(struct task_struct *p) return 0; } -static inline void security_task_getsecid_subj(struct task_struct *p, u32 *secid) +static inline void security_current_getsecid_subj(u32 *secid) { *secid = 0; } diff --git a/kernel/audit.c b/kernel/audit.c index 121d37e700a6..d4084751cfe6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -2132,7 +2132,7 @@ int audit_log_task_context(struct audit_buffer *ab) int error; u32 sid; - security_task_getsecid_subj(current, &sid); + security_current_getsecid_subj(&sid); if (!sid) return 0; @@ -2353,7 +2353,7 @@ int audit_signal_info(int sig, struct task_struct *t) audit_sig_uid = auid; else audit_sig_uid = uid; - security_task_getsecid_subj(current, &audit_sig_sid); + security_current_getsecid_subj(&audit_sig_sid); } return audit_signal_info_syscall(t); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d75acb014ccd..4173e771650c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1368,8 +1368,7 @@ int audit_filter(int msgtype, unsigned int listtype) case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: if (f->lsm_rule) { - security_task_getsecid_subj(current, - &sid); + security_current_getsecid_subj(&sid); result = security_audit_rule_match(sid, f->type, f->op, f->lsm_rule); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b517947bfa48..fce5d43a933f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -666,7 +666,16 @@ static int audit_filter_rules(struct task_struct *tsk, logged upon error */ if (f->lsm_rule) { if (need_sid) { - security_task_getsecid_subj(tsk, &sid); + /* @tsk should always be equal to + * @current with the exception of + * fork()/copy_process() in which case + * the new @tsk creds are still a dup + * of @current's creds so we can still + * use security_current_getsecid_subj() + * here even though it always refs + * @current's creds + */ + security_current_getsecid_subj(&sid); need_sid = 0; } result = security_audit_rule_match(sid, f->type, diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 566ba4397ee4..8490e46359ae 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1537,7 +1537,7 @@ int __init netlbl_unlabel_defconf(void) /* Only the kernel is allowed to call this function and the only time * it is called is at bootup before the audit subsystem is reporting * messages so don't worry to much about these values. */ - security_task_getsecid_subj(current, &audit_info.secid); + security_current_getsecid_subj(&audit_info.secid); audit_info.loginuid = GLOBAL_ROOT_UID; audit_info.sessionid = 0; diff --git a/net/netlabel/netlabel_user.h b/net/netlabel/netlabel_user.h index 6190cbf94bf0..d6c5b31eb4eb 100644 --- a/net/netlabel/netlabel_user.h +++ b/net/netlabel/netlabel_user.h @@ -32,7 +32,7 @@ */ static inline void netlbl_netlink_auditinfo(struct netlbl_audit *audit_info) { - security_task_getsecid_subj(current, &audit_info->secid); + security_current_getsecid_subj(&audit_info->secid); audit_info->loginuid = audit_get_loginuid(current); audit_info->sessionid = audit_get_sessionid(current); } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index 0d6585056f3d..4f0eecb67dde 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -728,7 +728,14 @@ static void apparmor_bprm_committed_creds(struct linux_binprm *bprm) return; } -static void apparmor_task_getsecid(struct task_struct *p, u32 *secid) +static void apparmor_current_getsecid_subj(u32 *secid) +{ + struct aa_label *label = aa_get_current_label(); + *secid = label->secid; + aa_put_label(label); +} + +static void apparmor_task_getsecid_obj(struct task_struct *p, u32 *secid) { struct aa_label *label = aa_get_task_label(p); *secid = label->secid; @@ -1252,8 +1259,8 @@ static struct security_hook_list apparmor_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(task_free, apparmor_task_free), LSM_HOOK_INIT(task_alloc, apparmor_task_alloc), - LSM_HOOK_INIT(task_getsecid_subj, apparmor_task_getsecid), - LSM_HOOK_INIT(task_getsecid_obj, apparmor_task_getsecid), + LSM_HOOK_INIT(current_getsecid_subj, apparmor_current_getsecid_subj), + LSM_HOOK_INIT(task_getsecid_obj, apparmor_task_getsecid_obj), LSM_HOOK_INIT(task_setrlimit, apparmor_task_setrlimit), LSM_HOOK_INIT(task_kill, apparmor_task_kill), diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index dbba51583e7c..17232bbfb9f9 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -76,7 +76,7 @@ int ima_must_appraise(struct user_namespace *mnt_userns, struct inode *inode, if (!ima_appraise) return 0; - security_task_getsecid_subj(current, &secid); + security_current_getsecid_subj(&secid); return ima_match_policy(mnt_userns, inode, current_cred(), secid, func, mask, IMA_APPRAISE | IMA_HASH, NULL, NULL, NULL, NULL); diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 465865412100..8c6e4514d494 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -408,7 +408,7 @@ int ima_file_mmap(struct file *file, unsigned long prot) u32 secid; if (file && (prot & PROT_EXEC)) { - security_task_getsecid_subj(current, &secid); + security_current_getsecid_subj(&secid); return process_measurement(file, current_cred(), secid, NULL, 0, MAY_EXEC, MMAP_CHECK); } @@ -446,7 +446,7 @@ int ima_file_mprotect(struct vm_area_struct *vma, unsigned long prot) !(prot & PROT_EXEC) || (vma->vm_flags & VM_EXEC)) return 0; - security_task_getsecid_subj(current, &secid); + security_current_getsecid_subj(&secid); inode = file_inode(vma->vm_file); action = ima_get_action(file_mnt_user_ns(vma->vm_file), inode, current_cred(), secid, MAY_EXEC, MMAP_CHECK, @@ -487,7 +487,7 @@ int ima_bprm_check(struct linux_binprm *bprm) int ret; u32 secid; - security_task_getsecid_subj(current, &secid); + security_current_getsecid_subj(&secid); ret = process_measurement(bprm->file, current_cred(), secid, NULL, 0, MAY_EXEC, BPRM_CHECK); if (ret) @@ -512,7 +512,7 @@ int ima_file_check(struct file *file, int mask) { u32 secid; - security_task_getsecid_subj(current, &secid); + security_current_getsecid_subj(&secid); return process_measurement(file, current_cred(), secid, NULL, 0, mask & (MAY_READ | MAY_WRITE | MAY_EXEC | MAY_APPEND), FILE_CHECK); @@ -709,7 +709,7 @@ int ima_read_file(struct file *file, enum kernel_read_file_id read_id, /* Read entire file for all partial reads. */ func = read_idmap[read_id] ?: FILE_CHECK; - security_task_getsecid_subj(current, &secid); + security_current_getsecid_subj(&secid); return process_measurement(file, current_cred(), secid, NULL, 0, MAY_READ, func); } @@ -752,7 +752,7 @@ int ima_post_read_file(struct file *file, void *buf, loff_t size, } func = read_idmap[read_id] ?: FILE_CHECK; - security_task_getsecid_subj(current, &secid); + security_current_getsecid_subj(&secid); return process_measurement(file, current_cred(), secid, buf, size, MAY_READ, func); } @@ -905,7 +905,7 @@ int process_buffer_measurement(struct user_namespace *mnt_userns, * buffer measurements. */ if (func) { - security_task_getsecid_subj(current, &secid); + security_current_getsecid_subj(&secid); action = ima_get_action(mnt_userns, inode, current_cred(), secid, 0, func, &pcr, &template, func_data, NULL); diff --git a/security/security.c b/security/security.c index c88167a414b4..edb922b8bf4a 100644 --- a/security/security.c +++ b/security/security.c @@ -1808,12 +1808,12 @@ int security_task_getsid(struct task_struct *p) return call_int_hook(task_getsid, 0, p); } -void security_task_getsecid_subj(struct task_struct *p, u32 *secid) +void security_current_getsecid_subj(u32 *secid) { *secid = 0; - call_void_hook(task_getsecid_subj, p, secid); + call_void_hook(current_getsecid_subj, secid); } -EXPORT_SYMBOL(security_task_getsecid_subj); +EXPORT_SYMBOL(security_current_getsecid_subj); void security_task_getsecid_obj(struct task_struct *p, u32 *secid) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 62d30c0a30c2..726175254f60 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -229,19 +229,6 @@ static inline u32 cred_sid(const struct cred *cred) return tsec->sid; } -/* - * get the subjective security ID of a task - */ -static inline u32 task_sid_subj(const struct task_struct *task) -{ - u32 sid; - - rcu_read_lock(); - sid = cred_sid(rcu_dereference(task->cred)); - rcu_read_unlock(); - return sid; -} - /* * get the objective security ID of a task */ @@ -4205,9 +4192,9 @@ static int selinux_task_getsid(struct task_struct *p) PROCESS__GETSESSION, NULL); } -static void selinux_task_getsecid_subj(struct task_struct *p, u32 *secid) +static void selinux_current_getsecid_subj(u32 *secid) { - *secid = task_sid_subj(p); + *secid = current_sid(); } static void selinux_task_getsecid_obj(struct task_struct *p, u32 *secid) @@ -7159,7 +7146,7 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid), LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid), LSM_HOOK_INIT(task_getsid, selinux_task_getsid), - LSM_HOOK_INIT(task_getsecid_subj, selinux_task_getsecid_subj), + LSM_HOOK_INIT(current_getsecid_subj, selinux_current_getsecid_subj), LSM_HOOK_INIT(task_getsecid_obj, selinux_task_getsecid_obj), LSM_HOOK_INIT(task_setnice, selinux_task_setnice), LSM_HOOK_INIT(task_setioprio, selinux_task_setioprio), diff --git a/security/smack/smack.h b/security/smack/smack.h index 99c3422596ab..fc837dcebf96 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -389,22 +389,6 @@ static inline struct smack_known *smk_of_task(const struct task_smack *tsp) return tsp->smk_task; } -static inline struct smack_known *smk_of_task_struct_subj( - const struct task_struct *t) -{ - struct smack_known *skp; - const struct cred *cred; - - rcu_read_lock(); - - cred = rcu_dereference(t->cred); - skp = smk_of_task(smack_cred(cred)); - - rcu_read_unlock(); - - return skp; -} - static inline struct smack_known *smk_of_task_struct_obj( const struct task_struct *t) { diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index efd35b07c7f8..14b279cc75c9 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -2067,15 +2067,14 @@ static int smack_task_getsid(struct task_struct *p) } /** - * smack_task_getsecid_subj - get the subjective secid of the task - * @p: the task + * smack_current_getsecid_subj - get the subjective secid of the current task * @secid: where to put the result * * Sets the secid to contain a u32 version of the task's subjective smack label. */ -static void smack_task_getsecid_subj(struct task_struct *p, u32 *secid) +static void smack_current_getsecid_subj(u32 *secid) { - struct smack_known *skp = smk_of_task_struct_subj(p); + struct smack_known *skp = smk_of_current(); *secid = skp->smk_secid; } @@ -4807,7 +4806,7 @@ static struct security_hook_list smack_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(task_setpgid, smack_task_setpgid), LSM_HOOK_INIT(task_getpgid, smack_task_getpgid), LSM_HOOK_INIT(task_getsid, smack_task_getsid), - LSM_HOOK_INIT(task_getsecid_subj, smack_task_getsecid_subj), + LSM_HOOK_INIT(current_getsecid_subj, smack_current_getsecid_subj), LSM_HOOK_INIT(task_getsecid_obj, smack_task_getsecid_obj), LSM_HOOK_INIT(task_setnice, smack_task_setnice), LSM_HOOK_INIT(task_setioprio, smack_task_setioprio), -- cgit v1.2.3 From d257cc8cb8d5355ffc43a96bab94db7b5a324803 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 15 Nov 2021 20:29:12 -0500 Subject: locking/rwsem: Make handoff bit handling more consistent There are some inconsistency in the way that the handoff bit is being handled in readers and writers that lead to a race condition. Firstly, when a queue head writer set the handoff bit, it will clear it when the writer is being killed or interrupted on its way out without acquiring the lock. That is not the case for a queue head reader. The handoff bit will simply be inherited by the next waiter. Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both the waiter and handoff bits are cleared if the wait queue becomes empty. For rwsem_down_write_slowpath(), however, the handoff bit is not checked and cleared if the wait queue is empty. This can potentially make the handoff bit set with empty wait queue. Worse, the situation in rwsem_down_write_slowpath() relies on wstate, a variable set outside of the critical section containing the ->count manipulation, this leads to race condition where RWSEM_FLAG_HANDOFF can be double subtracted, corrupting ->count. To make the handoff bit handling more consistent and robust, extract out handoff bit clearing code into the new rwsem_del_waiter() helper function. Also, completely eradicate wstate; always evaluate everything inside the same critical section. The common function will only use atomic_long_andnot() to clear bits when the wait queue is empty to avoid possible race condition. If the first waiter with handoff bit set is killed or interrupted to exit the slowpath without acquiring the lock, the next waiter will inherit the handoff bit. While at it, simplify the trylock for loop in rwsem_down_write_slowpath() to make it easier to read. Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation") Reported-by: Zhenhua Ma Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211116012912.723980-1-longman@redhat.com --- kernel/locking/rwsem.c | 171 ++++++++++++++++++++++++------------------------- 1 file changed, 85 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index c51387a43265..e039cf1605af 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -105,9 +105,9 @@ * atomic_long_cmpxchg() will be used to obtain writer lock. * * There are three places where the lock handoff bit may be set or cleared. - * 1) rwsem_mark_wake() for readers. - * 2) rwsem_try_write_lock() for writers. - * 3) Error path of rwsem_down_write_slowpath(). + * 1) rwsem_mark_wake() for readers -- set, clear + * 2) rwsem_try_write_lock() for writers -- set, clear + * 3) rwsem_del_waiter() -- clear * * For all the above cases, wait_lock will be held. A writer must also * be the first one in the wait_list to be eligible for setting the handoff @@ -334,6 +334,9 @@ struct rwsem_waiter { struct task_struct *task; enum rwsem_waiter_type type; unsigned long timeout; + + /* Writer only, not initialized in reader */ + bool handoff_set; }; #define rwsem_first_waiter(sem) \ list_first_entry(&sem->wait_list, struct rwsem_waiter, list) @@ -344,12 +347,6 @@ enum rwsem_wake_type { RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ }; -enum writer_wait_state { - WRITER_NOT_FIRST, /* Writer is not first in wait list */ - WRITER_FIRST, /* Writer is first in wait list */ - WRITER_HANDOFF /* Writer is first & handoff needed */ -}; - /* * The typical HZ value is either 250 or 1000. So set the minimum waiting * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait @@ -365,6 +362,31 @@ enum writer_wait_state { */ #define MAX_READERS_WAKEUP 0x100 +static inline void +rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_add_tail(&waiter->list, &sem->wait_list); + /* caller will set RWSEM_FLAG_WAITERS */ +} + +/* + * Remove a waiter from the wait_list and clear flags. + * + * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of + * this function. Modify with care. + */ +static inline void +rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_del(&waiter->list); + if (likely(!list_empty(&sem->wait_list))) + return; + + atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); +} + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -376,6 +398,8 @@ enum writer_wait_state { * preferably when the wait_lock is released * - woken process blocks are discarded from the list after having task zeroed * - writers are only marked woken if downgrading is false + * + * Implies rwsem_del_waiter() for all woken readers. */ static void rwsem_mark_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type, @@ -490,18 +514,25 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); + + oldcount = atomic_long_read(&sem->count); if (list_empty(&sem->wait_list)) { - /* hit end of list above */ + /* + * Combined with list_move_tail() above, this implies + * rwsem_del_waiter(). + */ adjustment -= RWSEM_FLAG_WAITERS; + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; + } else if (woken) { + /* + * When we've woken a reader, we no longer need to force + * writers to give up the lock and we can clear HANDOFF. + */ + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; } - /* - * When we've woken a reader, we no longer need to force writers - * to give up the lock and we can clear HANDOFF. - */ - if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) - adjustment -= RWSEM_FLAG_HANDOFF; - if (adjustment) atomic_long_add(adjustment, &sem->count); @@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * race conditions between checking the rwsem wait list and setting the * sem->count accordingly. * - * If wstate is WRITER_HANDOFF, it will make sure that either the handoff - * bit is set or the lock is acquired with handoff bit cleared. + * Implies rwsem_del_waiter() on success. */ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, - enum writer_wait_state wstate) + struct rwsem_waiter *waiter) { + bool first = rwsem_first_waiter(sem) == waiter; long count, new; lockdep_assert_held(&sem->wait_lock); @@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, do { bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); - if (has_handoff && wstate == WRITER_NOT_FIRST) - return false; + if (has_handoff) { + if (!first) + return false; + + /* First waiter inherits a previously set handoff bit */ + waiter->handoff_set = true; + } new = count; if (count & RWSEM_LOCK_MASK) { - if (has_handoff || (wstate != WRITER_HANDOFF)) + if (has_handoff || (!rt_task(waiter->task) && + !time_after(jiffies, waiter->timeout))) return false; new |= RWSEM_FLAG_HANDOFF; @@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, * We have either acquired the lock with handoff bit cleared or * set the handoff bit. */ - if (new & RWSEM_FLAG_HANDOFF) + if (new & RWSEM_FLAG_HANDOFF) { + waiter->handoff_set = true; + lockevent_inc(rwsem_wlock_handoff); return false; + } + /* + * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on + * success. + */ + list_del(&waiter->list); rwsem_set_owner(sem); return true; } @@ -956,7 +1001,7 @@ queue: } adjustment += RWSEM_FLAG_WAITERS; } - list_add_tail(&waiter.list, &sem->wait_list); + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); @@ -1002,11 +1047,7 @@ queue: return sem; out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) { - atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, - &sem->count); - } + rwsem_del_waiter(sem, &waiter); raw_spin_unlock_irq(&sem->wait_lock); __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock_fail); @@ -1020,9 +1061,7 @@ static struct rw_semaphore * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; - enum writer_wait_state wstate; struct rwsem_waiter waiter; - struct rw_semaphore *ret = sem; DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ @@ -1038,16 +1077,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - - /* account for this before adding a new element to the list */ - wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST; - - list_add_tail(&waiter.list, &sem->wait_list); + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock */ - if (wstate == WRITER_NOT_FIRST) { + if (rwsem_first_waiter(sem) != &waiter) { count = atomic_long_read(&sem->count); /* @@ -1083,13 +1119,16 @@ wait: /* wait until we successfully acquire the lock */ set_current_state(state); for (;;) { - if (rwsem_try_write_lock(sem, wstate)) { + if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ break; } raw_spin_unlock_irq(&sem->wait_lock); + if (signal_pending_state(state, current)) + goto out_nolock; + /* * After setting the handoff bit and failing to acquire * the lock, attempt to spin on owner to accelerate lock @@ -1098,7 +1137,7 @@ wait: * In this case, we attempt to acquire the lock again * without sleeping. */ - if (wstate == WRITER_HANDOFF) { + if (waiter.handoff_set) { enum owner_state owner_state; preempt_disable(); @@ -1109,66 +1148,26 @@ wait: goto trylock_again; } - /* Block until there are no active lockers. */ - for (;;) { - if (signal_pending_state(state, current)) - goto out_nolock; - - schedule(); - lockevent_inc(rwsem_sleep_writer); - set_current_state(state); - /* - * If HANDOFF bit is set, unconditionally do - * a trylock. - */ - if (wstate == WRITER_HANDOFF) - break; - - if ((wstate == WRITER_NOT_FIRST) && - (rwsem_first_waiter(sem) == &waiter)) - wstate = WRITER_FIRST; - - count = atomic_long_read(&sem->count); - if (!(count & RWSEM_LOCK_MASK)) - break; - - /* - * The setting of the handoff bit is deferred - * until rwsem_try_write_lock() is called. - */ - if ((wstate == WRITER_FIRST) && (rt_task(current) || - time_after(jiffies, waiter.timeout))) { - wstate = WRITER_HANDOFF; - lockevent_inc(rwsem_wlock_handoff); - break; - } - } + schedule(); + lockevent_inc(rwsem_sleep_writer); + set_current_state(state); trylock_again: raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); - list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); - - return ret; + return sem; out_nolock: __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&sem->wait_lock); - list_del(&waiter.list); - - if (unlikely(wstate == WRITER_HANDOFF)) - atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); - - if (list_empty(&sem->wait_list)) - atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); - else + rwsem_del_waiter(sem, &waiter); + if (!list_empty(&sem->wait_list)) rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); lockevent_inc(rwsem_wlock_fail); - return ERR_PTR(-EINTR); } -- cgit v1.2.3 From 14c24048841151548a3f4d9e218510c844c1b737 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 18 Nov 2021 17:44:55 +0800 Subject: locking/rwsem: Optimize down_read_trylock() under highly contended case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We found that a process with 10 thousnads threads has been encountered a regression problem from Linux-v4.14 to Linux-v5.4. It is a kind of workload which will concurrently allocate lots of memory in different threads sometimes. In this case, we will see the down_read_trylock() with a high hotspot. Therefore, we suppose that rwsem has a regression at least since Linux-v5.4. In order to easily debug this problem, we write a simply benchmark to create the similar situation lile the following. ```c++ #include #include #include #include #include #include #include #include #include volatile int mutex; void trigger(int cpu, char* ptr, std::size_t sz) { cpu_set_t set; CPU_ZERO(&set); CPU_SET(cpu, &set); assert(pthread_setaffinity_np(pthread_self(), sizeof(set), &set) == 0); while (mutex); for (std::size_t i = 0; i < sz; i += 4096) { *ptr = '\0'; ptr += 4096; } } int main(int argc, char* argv[]) { std::size_t sz = 100; if (argc > 1) sz = atoi(argv[1]); auto nproc = std::thread::hardware_concurrency(); std::vector thr; sz <<= 30; auto* ptr = mmap(nullptr, sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); assert(ptr != MAP_FAILED); char* cptr = static_cast(ptr); auto run = sz / nproc; run = (run >> 12) << 12; mutex = 1; for (auto i = 0U; i < nproc; ++i) { thr.emplace_back(std::thread([i, cptr, run]() { trigger(i, cptr, run); })); cptr += run; } rusage usage_start; getrusage(RUSAGE_SELF, &usage_start); auto start = std::chrono::system_clock::now(); mutex = 0; for (auto& t : thr) t.join(); rusage usage_end; getrusage(RUSAGE_SELF, &usage_end); auto end = std::chrono::system_clock::now(); timeval utime; timeval stime; timersub(&usage_end.ru_utime, &usage_start.ru_utime, &utime); timersub(&usage_end.ru_stime, &usage_start.ru_stime, &stime); printf("usr: %ld.%06ld\n", utime.tv_sec, utime.tv_usec); printf("sys: %ld.%06ld\n", stime.tv_sec, stime.tv_usec); printf("real: %lu\n", std::chrono::duration_cast(end - start).count()); return 0; } ``` The functionality of above program is simply which creates `nproc` threads and each of them are trying to touch memory (trigger page fault) on different CPU. Then we will see the similar profile by `perf top`. 25.55% [kernel] [k] down_read_trylock 14.78% [kernel] [k] handle_mm_fault 13.45% [kernel] [k] up_read 8.61% [kernel] [k] clear_page_erms 3.89% [kernel] [k] __do_page_fault The highest hot instruction, which accounts for about 92%, in down_read_trylock() is cmpxchg like the following. 91.89 │ lock cmpxchg %rdx,(%rdi) Sice the problem is found by migrating from Linux-v4.14 to Linux-v5.4, so we easily found that the commit ddb20d1d3aed ("locking/rwsem: Optimize down_read_trylock()") caused the regression. The reason is that the commit assumes the rwsem is not contended at all. But it is not always true for mmap lock which could be contended with thousands threads. So most threads almost need to run at least 2 times of "cmpxchg" to acquire the lock. The overhead of atomic operation is higher than non-atomic instructions, which caused the regression. By using the above benchmark, the real executing time on a x86-64 system before and after the patch were: Before Patch After Patch # of Threads real real reduced by ------------ ------ ------ ---------- 1 65,373 65,206 ~0.0% 4 15,467 15,378 ~0.5% 40 6,214 5,528 ~11.0% For the uncontended case, the new down_read_trylock() is the same as before. For the contended cases, the new down_read_trylock() is faster than before. The more contended, the more fast. Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lore.kernel.org/r/20211118094455.9068-1-songmuchun@bytedance.com --- kernel/locking/rwsem.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e039cf1605af..04a74d040a6d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1248,17 +1248,14 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); - /* - * Optimize for the case when the rwsem is not locked at all. - */ - tmp = RWSEM_UNLOCKED_VALUE; - do { + tmp = atomic_long_read(&sem->count); + while (!(tmp & RWSEM_READ_FAILED_MASK)) { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - tmp + RWSEM_READER_BIAS)) { + tmp + RWSEM_READER_BIAS)) { rwsem_set_reader_owned(sem); return 1; } - } while (!(tmp & RWSEM_READ_FAILED_MASK)); + } return 0; } -- cgit v1.2.3 From 73743c3b092277febbf69b250ce8ebbca0525aa2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 9 Nov 2021 13:22:32 +0100 Subject: perf: Ignore sigtrap for tracepoints destined for other tasks syzbot reported that the warning in perf_sigtrap() fires, saying that the event's task does not match current: | WARNING: CPU: 0 PID: 9090 at kernel/events/core.c:6446 perf_pending_event+0x40d/0x4b0 kernel/events/core.c:6513 | Modules linked in: | CPU: 0 PID: 9090 Comm: syz-executor.1 Not tainted 5.15.0-syzkaller #0 | Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 | RIP: 0010:perf_sigtrap kernel/events/core.c:6446 [inline] | RIP: 0010:perf_pending_event_disable kernel/events/core.c:6470 [inline] | RIP: 0010:perf_pending_event+0x40d/0x4b0 kernel/events/core.c:6513 | ... | Call Trace: | | irq_work_single+0x106/0x220 kernel/irq_work.c:211 | irq_work_run_list+0x6a/0x90 kernel/irq_work.c:242 | irq_work_run+0x4f/0xd0 kernel/irq_work.c:251 | __sysvec_irq_work+0x95/0x3d0 arch/x86/kernel/irq_work.c:22 | sysvec_irq_work+0x8e/0xc0 arch/x86/kernel/irq_work.c:17 | | | asm_sysvec_irq_work+0x12/0x20 arch/x86/include/asm/idtentry.h:664 | RIP: 0010:__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 [inline] | RIP: 0010:_raw_spin_unlock_irqrestore+0x38/0x70 kernel/locking/spinlock.c:194 | ... | coredump_task_exit kernel/exit.c:371 [inline] | do_exit+0x1865/0x25c0 kernel/exit.c:771 | do_group_exit+0xe7/0x290 kernel/exit.c:929 | get_signal+0x3b0/0x1ce0 kernel/signal.c:2820 | arch_do_signal_or_restart+0x2a9/0x1c40 arch/x86/kernel/signal.c:868 | handle_signal_work kernel/entry/common.c:148 [inline] | exit_to_user_mode_loop kernel/entry/common.c:172 [inline] | exit_to_user_mode_prepare+0x17d/0x290 kernel/entry/common.c:207 | __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] | syscall_exit_to_user_mode+0x19/0x60 kernel/entry/common.c:300 | do_syscall_64+0x42/0xb0 arch/x86/entry/common.c:86 | entry_SYSCALL_64_after_hwframe+0x44/0xae On x86 this shouldn't happen, which has arch_irq_work_raise(). The test program sets up a perf event with sigtrap set to fire on the 'sched_wakeup' tracepoint, which fired in ttwu_do_wakeup(). This happened because the 'sched_wakeup' tracepoint also takes a task argument passed on to perf_tp_event(), which is used to deliver the event to that other task. Since we cannot deliver synchronous signals to other tasks, skip an event if perf_tp_event() is targeted at another task and perf_event_attr::sigtrap is set, which will avoid ever entering perf_sigtrap() for such events. Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") Reported-by: syzbot+663359e32ce6f1a305ad@syzkaller.appspotmail.com Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YYpoCOBmC/kJWfmI@elver.google.com --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 523106a506ee..30d94f68c5bd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9759,6 +9759,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, continue; if (event->attr.config != entry->type) continue; + /* Cannot deliver synchronous signal to other task. */ + if (event->attr.sigtrap) + continue; if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); } -- cgit v1.2.3 From 9731698ecb9c851f353ce2496292ff9fcea39dff Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 15 Nov 2021 19:46:04 +0300 Subject: cputime, cpuacct: Include guest time in user time in cpuacct.stat cpuacct.stat in no-root cgroups shows user time without guest time included int it. This doesn't match with user time shown in root cpuacct.stat and /proc//stat. This also affects cgroup2's cpu.stat in the same way. Make account_guest_time() to add user time to cgroup's cpustat to fix this. Fixes: ef12fefabf94 ("cpuacct: add per-cgroup utime/stime statistics") Signed-off-by: Andrey Ryabinin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Jordan Acked-by: Tejun Heo Cc: Link: https://lore.kernel.org/r/20211115164607.23784-1-arbn@yandex-team.com --- kernel/sched/cputime.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 872e481d5098..042a6dbce8f3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -148,10 +148,10 @@ void account_guest_time(struct task_struct *p, u64 cputime) /* Add guest time to cpustat. */ if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += cputime; + task_group_account_field(p, CPUTIME_NICE, cputime); cpustat[CPUTIME_GUEST_NICE] += cputime; } else { - cpustat[CPUTIME_USER] += cputime; + task_group_account_field(p, CPUTIME_USER, cputime); cpustat[CPUTIME_GUEST] += cputime; } } -- cgit v1.2.3 From c7ccbf4b6174e32c130892570db06d0f496cfef0 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 15 Nov 2021 19:46:05 +0300 Subject: cpuacct: Convert BUG_ON() to WARN_ON_ONCE() Replace fatal BUG_ON() with more safe WARN_ON_ONCE() in cpuacct_cpuusage_read(). Signed-off-by: Andrey Ryabinin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Jordan Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20211115164607.23784-2-arbn@yandex-team.com --- kernel/sched/cpuacct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 893eece65bfd..f347cf9e4634 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -106,7 +106,8 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, * We allow index == CPUACCT_STAT_NSTATS here to read * the sum of usages. */ - BUG_ON(index > CPUACCT_STAT_NSTATS); + if (WARN_ON_ONCE(index > CPUACCT_STAT_NSTATS)) + return 0; #ifndef CONFIG_64BIT /* -- cgit v1.2.3 From dd02d4234c9a2214a81c57a16484304a1a51872a Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 15 Nov 2021 19:46:06 +0300 Subject: sched/cpuacct: Fix user/system in shown cpuacct.usage* cpuacct has 2 different ways of accounting and showing user and system times. The first one uses cpuacct_account_field() to account times and cpuacct.stat file to expose them. And this one seems to work ok. The second one is uses cpuacct_charge() function for accounting and set of cpuacct.usage* files to show times. Despite some attempts to fix it in the past it still doesn't work. Sometimes while running KVM guest the cpuacct_charge() accounts most of the guest time as system time. This doesn't match with user&system times shown in cpuacct.stat or proc//stat. Demonstration: # git clone https://github.com/aryabinin/kvmsample # make # mkdir /sys/fs/cgroup/cpuacct/test # echo $$ > /sys/fs/cgroup/cpuacct/test/tasks # ./kvmsample & # for i in {1..5}; do cat /sys/fs/cgroup/cpuacct/test/cpuacct.usage_sys; sleep 1; done 1976535645 2979839428 3979832704 4983603153 5983604157 Use cpustats accounted in cpuacct_account_field() as the source of user/sys times for cpuacct.usage* files. Make cpuacct_charge() to account only summary execution time. Fixes: d740037fac70 ("sched/cpuacct: Split usage accounting into user_usage and sys_usage") Signed-off-by: Andrey Ryabinin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Jordan Acked-by: Tejun Heo Cc: Link: https://lore.kernel.org/r/20211115164607.23784-3-arbn@yandex-team.com --- kernel/sched/cpuacct.c | 79 ++++++++++++++++++++------------------------------ 1 file changed, 32 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f347cf9e4634..9de7dd51beb0 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -21,15 +21,11 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; -struct cpuacct_usage { - u64 usages[CPUACCT_STAT_NSTATS]; -}; - /* track CPU usage of a group of tasks and its child groups */ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every CPU */ - struct cpuacct_usage __percpu *cpuusage; + u64 __percpu *cpuusage; struct kernel_cpustat __percpu *cpustat; }; @@ -49,7 +45,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return css_ca(ca->css.parent); } -static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage); +static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); static struct cpuacct root_cpuacct = { .cpustat = &kernel_cpustat, .cpuusage = &root_cpuacct_cpuusage, @@ -68,7 +64,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) if (!ca) goto out; - ca->cpuusage = alloc_percpu(struct cpuacct_usage); + ca->cpuusage = alloc_percpu(u64); if (!ca->cpuusage) goto out_free_ca; @@ -99,7 +95,8 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, enum cpuacct_stat_index index) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; u64 data; /* @@ -116,14 +113,17 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, raw_spin_rq_lock_irq(cpu_rq(cpu)); #endif - if (index == CPUACCT_STAT_NSTATS) { - int i = 0; - - data = 0; - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) - data += cpuusage->usages[i]; - } else { - data = cpuusage->usages[index]; + switch (index) { + case CPUACCT_STAT_USER: + data = cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE]; + break; + case CPUACCT_STAT_SYSTEM: + data = cpustat[CPUTIME_SYSTEM] + cpustat[CPUTIME_IRQ] + + cpustat[CPUTIME_SOFTIRQ]; + break; + case CPUACCT_STAT_NSTATS: + data = *cpuusage; + break; } #ifndef CONFIG_64BIT @@ -133,10 +133,14 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, return data; } -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - int i; + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; + + /* Don't allow to reset global kernel_cpustat */ + if (ca == &root_cpuacct) + return; #ifndef CONFIG_64BIT /* @@ -144,9 +148,10 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) */ raw_spin_rq_lock_irq(cpu_rq(cpu)); #endif - - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) - cpuusage->usages[i] = val; + *cpuusage = 0; + cpustat[CPUTIME_USER] = cpustat[CPUTIME_NICE] = 0; + cpustat[CPUTIME_SYSTEM] = cpustat[CPUTIME_IRQ] = 0; + cpustat[CPUTIME_SOFTIRQ] = 0; #ifndef CONFIG_64BIT raw_spin_rq_unlock_irq(cpu_rq(cpu)); @@ -197,7 +202,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, return -EINVAL; for_each_possible_cpu(cpu) - cpuacct_cpuusage_write(ca, cpu, 0); + cpuacct_cpuusage_write(ca, cpu); return 0; } @@ -244,25 +249,10 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) seq_puts(m, "\n"); for_each_possible_cpu(cpu) { - struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - seq_printf(m, "%d", cpu); - - for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit - * platforms. - */ - raw_spin_rq_lock_irq(cpu_rq(cpu)); -#endif - - seq_printf(m, " %llu", cpuusage->usages[index]); - -#ifndef CONFIG_64BIT - raw_spin_rq_unlock_irq(cpu_rq(cpu)); -#endif - } + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) + seq_printf(m, " %llu", + cpuacct_cpuusage_read(ca, cpu, index)); seq_puts(m, "\n"); } return 0; @@ -340,16 +330,11 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int index = CPUACCT_STAT_SYSTEM; - struct pt_regs *regs = get_irq_regs() ? : task_pt_regs(tsk); - - if (regs && user_mode(regs)) - index = CPUACCT_STAT_USER; rcu_read_lock(); for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) - __this_cpu_add(ca->cpuusage->usages[index], cputime); + __this_cpu_add(*ca->cpuusage, cputime); rcu_read_unlock(); } -- cgit v1.2.3 From 8c92606ab81086db00cbb73347d124b4eb169b7e Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 15 Nov 2021 19:46:07 +0300 Subject: sched/cpuacct: Make user/system times in cpuacct.stat more precise cpuacct.stat shows user time based on raw random precision tick based counters. Use cputime_addjust() to scale these values against the total runtime accounted by the scheduler, like we already do for user/system times in /proc//stat. Signed-off-by: Andrey Ryabinin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Jordan Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20211115164607.23784-4-arbn@yandex-team.com --- kernel/sched/cpuacct.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9de7dd51beb0..3d06c5e4220d 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -261,25 +261,30 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V) static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); - s64 val[CPUACCT_STAT_NSTATS]; + struct task_cputime cputime; + u64 val[CPUACCT_STAT_NSTATS]; int cpu; int stat; - memset(val, 0, sizeof(val)); + memset(&cputime, 0, sizeof(cputime)); for_each_possible_cpu(cpu) { u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; - val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; - val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; - val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; + cputime.utime += cpustat[CPUTIME_USER]; + cputime.utime += cpustat[CPUTIME_NICE]; + cputime.stime += cpustat[CPUTIME_SYSTEM]; + cputime.stime += cpustat[CPUTIME_IRQ]; + cputime.stime += cpustat[CPUTIME_SOFTIRQ]; + + cputime.sum_exec_runtime += *per_cpu_ptr(ca->cpuusage, cpu); } + cputime_adjust(&cputime, &seq_css(sf)->cgroup->prev_cputime, + &val[CPUACCT_STAT_USER], &val[CPUACCT_STAT_SYSTEM]); + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { - seq_printf(sf, "%s %lld\n", - cpuacct_stat_desc[stat], - (long long)nsec_to_clock_t(val[stat])); + seq_printf(sf, "%s %llu\n", cpuacct_stat_desc[stat], + nsec_to_clock_t(val[stat])); } return 0; -- cgit v1.2.3 From 1880ed71ce863318c1ce93bf324876fb5f92854f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 23 Nov 2021 15:28:01 +0100 Subject: tracing/uprobe: Fix uprobe_perf_open probes iteration Add missing 'tu' variable initialization in the probes loop, otherwise the head 'tu' is used instead of added probes. Link: https://lkml.kernel.org/r/20211123142801.182530-1-jolsa@kernel.org Cc: stable@vger.kernel.org Fixes: 99c9a923e97a ("tracing/uprobe: Fix double perf_event linking on multiprobe uprobe") Acked-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 0a5c0db3137e..f5f0039d31e5 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1313,6 +1313,7 @@ static int uprobe_perf_open(struct trace_event_call *call, return 0; list_for_each_entry(pos, trace_probe_probe_list(tp), list) { + tu = container_of(pos, struct trace_uprobe, tp); err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); if (err) { uprobe_perf_close(call, event); -- cgit v1.2.3 From dce1ca0525bfdc8a69a9343bc714fbc19a2f04b3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 23 Nov 2021 11:40:47 +0000 Subject: sched/scs: Reset task stack state in bringup_cpu() To hot unplug a CPU, the idle task on that CPU calls a few layers of C code before finally leaving the kernel. When KASAN is in use, poisoned shadow is left around for each of the active stack frames, and when shadow call stacks are in use. When shadow call stacks (SCS) are in use the task's saved SCS SP is left pointing at an arbitrary point within the task's shadow call stack. When a CPU is offlined than onlined back into the kernel, this stale state can adversely affect execution. Stale KASAN shadow can alias new stackframes and result in bogus KASAN warnings. A stale SCS SP is effectively a memory leak, and prevents a portion of the shadow call stack being used. Across a number of hotplug cycles the idle task's entire shadow call stack can become unusable. We previously fixed the KASAN issue in commit: e1b77c92981a5222 ("sched/kasan: remove stale KASAN poison after hotplug") ... by removing any stale KASAN stack poison immediately prior to onlining a CPU. Subsequently in commit: f1a0a376ca0c4ef1 ("sched/core: Initialize the idle task with preemption disabled") ... the refactoring left the KASAN and SCS cleanup in one-time idle thread initialization code rather than something invoked prior to each CPU being onlined, breaking both as above. We fixed SCS (but not KASAN) in commit: 63acd42c0d4942f7 ("sched/scs: Reset the shadow stack when idle_task_exit") ... but as this runs in the context of the idle task being offlined it's potentially fragile. To fix these consistently and more robustly, reset the SCS SP and KASAN shadow of a CPU's idle task immediately before we online that CPU in bringup_cpu(). This ensures the idle task always has a consistent state when it is running, and removes the need to so so when exiting an idle task. Whenever any thread is created, dup_task_struct() will give the task a stack which is free of KASAN shadow, and initialize the task's SCS SP, so there's no need to specially initialize either for idle thread within init_idle(), as this was only necessary to handle hotplug cycles. I've tested this on arm64 with: * gcc 11.1.0, defconfig +KASAN_INLINE, KASAN_STACK * clang 12.0.0, defconfig +KASAN_INLINE, KASAN_STACK, SHADOW_CALL_STACK ... offlining and onlining CPUS with: | while true; do | for C in /sys/devices/system/cpu/cpu*/online; do | echo 0 > $C; | echo 1 > $C; | done | done Fixes: f1a0a376ca0c4ef1 ("sched/core: Initialize the idle task with preemption disabled") Reported-by: Qian Cai Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Qian Cai Link: https://lore.kernel.org/lkml/20211115113310.35693-1-mark.rutland@arm.com/ --- kernel/cpu.c | 7 +++++++ kernel/sched/core.c | 4 ---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 192e43a87407..407a2568f35e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -587,6 +588,12 @@ static int bringup_cpu(unsigned int cpu) struct task_struct *idle = idle_thread_get(cpu); int ret; + /* + * Reset stale stack state from the last time this CPU was online. + */ + scs_task_reset(idle); + kasan_unpoison_task_stack(idle); + /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3c9b0fda64ac..76f9deeaa942 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8619,9 +8619,6 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); - scs_task_reset(idle); - kasan_unpoison_task_stack(idle); - #ifdef CONFIG_SMP /* * It's possible that init_idle() gets called multiple times on a task, @@ -8777,7 +8774,6 @@ void idle_task_exit(void) finish_arch_post_lock_switch(); } - scs_task_reset(current); /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } -- cgit v1.2.3 From cefcf24b4d351daf70ecd945324e200d3736821e Mon Sep 17 00:00:00 2001 From: Thomas Zeitlhofer Date: Tue, 23 Nov 2021 20:18:43 +0100 Subject: PM: hibernate: use correct mode for swsusp_close() Commit 39fbef4b0f77 ("PM: hibernate: Get block device exclusively in swsusp_check()") changed the opening mode of the block device to (FMODE_READ | FMODE_EXCL). In the corresponding calls to swsusp_close(), the mode is still just FMODE_READ which triggers the warning in blkdev_flush_mapping() on resume from hibernate. So, use the mode (FMODE_READ | FMODE_EXCL) also when closing the device. Fixes: 39fbef4b0f77 ("PM: hibernate: Get block device exclusively in swsusp_check()") Signed-off-by: Thomas Zeitlhofer Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 9ed9b744876c..e6af502c2fd7 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -693,7 +693,7 @@ static int load_image_and_restore(void) goto Unlock; error = swsusp_read(&flags); - swsusp_close(FMODE_READ); + swsusp_close(FMODE_READ | FMODE_EXCL); if (!error) error = hibernation_restore(flags & SF_PLATFORM_MODE); @@ -983,7 +983,7 @@ static int software_resume(void) /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; - swsusp_close(FMODE_READ); + swsusp_close(FMODE_READ | FMODE_EXCL); goto Unlock; } @@ -1018,7 +1018,7 @@ static int software_resume(void) pm_pr_dbg("Hibernation image not present or could not be loaded.\n"); return error; Close_Finish: - swsusp_close(FMODE_READ); + swsusp_close(FMODE_READ | FMODE_EXCL); goto Finish; } -- cgit v1.2.3 From 88a5045f176b78c33a269a30a7b146e99c550bd9 Mon Sep 17 00:00:00 2001 From: Evan Green Date: Fri, 29 Oct 2021 12:24:22 -0700 Subject: PM: hibernate: Fix snapshot partial write lengths snapshot_write() is inappropriately limiting the amount of data that can be written in cases where a partial page has already been written. For example, one would expect to be able to write 1 byte, then 4095 bytes to the snapshot device, and have both of those complete fully (since now we're aligned to a page again). But what ends up happening is we write 1 byte, then 4094/4095 bytes complete successfully. The reason is that simple_write_to_buffer()'s second argument is the total size of the buffer, not the size of the buffer minus the offset. Since simple_write_to_buffer() accounts for the offset in its implementation, snapshot_write() can just pass the full page size directly down. Signed-off-by: Evan Green Signed-off-by: Rafael J. Wysocki --- kernel/power/user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index 740723bb3885..ad241b4ff64c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -177,7 +177,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, if (res <= 0) goto unlock; } else { - res = PAGE_SIZE - pg_offp; + res = PAGE_SIZE; } if (!data_of(data->handle)) { -- cgit v1.2.3 From 3297481d688a5cc2973ea58bd78e66b8639748b1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 26 Oct 2021 12:03:48 +0200 Subject: futex: Remove futex_cmpxchg detection Now that all architectures have a working futex implementation in any configuration, remove the runtime detection code. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Reviewed-by: Russell King (Oracle) Acked-by: Vineet Gupta Acked-by: Max Filippov Acked-by: Christian Borntraeger Link: https://lore.kernel.org/r/20211026100432.1730393-2-arnd@kernel.org --- arch/arc/Kconfig | 1 - arch/arm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/csky/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/riscv/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/um/Kconfig | 1 - arch/um/kernel/skas/uaccess.c | 1 - arch/xtensa/Kconfig | 1 - init/Kconfig | 8 -------- kernel/futex/core.c | 35 ----------------------------------- kernel/futex/futex.h | 6 ------ kernel/futex/syscalls.c | 22 ---------------------- 15 files changed, 82 deletions(-) (limited to 'kernel') diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index b4ae6058902a..f74d9860a442 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -32,7 +32,6 @@ config ARC select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARC_MMU_V4 select HAVE_DEBUG_STACKOVERFLOW select HAVE_DEBUG_KMEMLEAK - select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_IOREMAP_PROT select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZMA diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index f0f9e8bec83a..2948487346dc 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -92,7 +92,6 @@ config ARM select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !(THUMB2_KERNEL && CC_IS_CLANG) - select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7) select HAVE_IRQ_TIME_ACCOUNTING diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c4207cf9bb17..5e2dfef78956 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -194,7 +194,6 @@ config ARM64 select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_FUNCTION_ARG_ACCESS_API - select HAVE_FUTEX_CMPXCHG if FUTEX select MMU_GATHER_RCU_TABLE_FREE select HAVE_RSEQ select HAVE_STACKPROTECTOR diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index aed2b3e734ee..132f43f12dd8 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -52,7 +52,6 @@ config CSKY select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_ERROR_INJECTION - select HAVE_FUTEX_CMPXCHG if FUTEX && SMP select HAVE_FTRACE_MCOUNT_RECORD select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZO diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 0b50da08a9c5..15a793c5b2dc 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -20,7 +20,6 @@ config M68K select HAVE_ASM_MODVERSIONS select HAVE_DEBUG_BUGVERBOSE select HAVE_EFFICIENT_UNALIGNED_ACCESS if !CPU_HAS_NO_UNALIGNED - select HAVE_FUTEX_CMPXCHG if MMU && FUTEX select HAVE_MOD_ARCH_SPECIFIC select HAVE_UID16 select MMU_GATHER_NO_RANGE if MMU diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 821252b65f89..09abf62ae0ad 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -83,7 +83,6 @@ config RISCV select HAVE_DMA_CONTIGUOUS if MMU select HAVE_EBPF_JIT if MMU select HAVE_FUNCTION_ERROR_INJECTION - select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_GCC_PLUGINS select HAVE_GENERIC_VDSO if MMU && 64BIT select HAVE_IRQ_TIME_ACCOUNTING diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 8857ec3b97eb..f614562d74f0 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -165,7 +165,6 @@ config S390 select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER - select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_GCC_PLUGINS select HAVE_GENERIC_VDSO select HAVE_IOREMAP_PROT if PCI diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 70afb30e0b32..2474a04ceac4 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -34,7 +34,6 @@ config SUPERH select HAVE_FAST_GUP if MMU select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER - select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_FTRACE_MCOUNT_RECORD select HAVE_HW_BREAKPOINT select HAVE_IOREMAP_PROT if MMU && !X2TLB diff --git a/arch/um/Kconfig b/arch/um/Kconfig index c18b45f75d41..c906250d4970 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -14,7 +14,6 @@ config UML select HAVE_ARCH_SECCOMP_FILTER select HAVE_ASM_MODVERSIONS select HAVE_UID16 - select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_BUGVERBOSE select NO_DMA if !UML_DMA_EMULATION diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c index a509be911026..9e37a7c05990 100644 --- a/arch/um/kernel/skas/uaccess.c +++ b/arch/um/kernel/skas/uaccess.c @@ -348,7 +348,6 @@ EXPORT_SYMBOL(arch_futex_atomic_op_inuser); * 0 - On success * -EFAULT - User access resulted in a page fault * -EAGAIN - Atomic operation was unable to complete due to contention - * -ENOSYS - Function not implemented (only if !HAVE_FUTEX_CMPXCHG) */ int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 0e56bad058fa..8ac599aa6d99 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -31,7 +31,6 @@ config XTENSA select HAVE_DMA_CONTIGUOUS select HAVE_EXIT_THREAD select HAVE_FUNCTION_TRACER - select HAVE_FUTEX_CMPXCHG if !MMU && FUTEX select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_IRQ_TIME_ACCOUNTING select HAVE_PCI diff --git a/init/Kconfig b/init/Kconfig index 3f5aa5063f55..76d89db5657b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1592,14 +1592,6 @@ config FUTEX_PI depends on FUTEX && RT_MUTEXES default y -config HAVE_FUTEX_CMPXCHG - bool - depends on FUTEX - help - Architectures should select this if futex_atomic_cmpxchg_inatomic() - is implemented and always working. This removes a couple of runtime - checks. - config EPOLL bool "Enable eventpoll support" if EXPERT default y diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 25d8a88b32e5..926c2bb752bc 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -41,11 +41,6 @@ #include "futex.h" #include "../locking/rtmutex_common.h" -#ifndef CONFIG_HAVE_FUTEX_CMPXCHG -int __read_mostly futex_cmpxchg_enabled; -#endif - - /* * The base of the bucket array and its size are always used together * (after initialization only in futex_hash()), so ensure that they @@ -776,9 +771,6 @@ static void exit_robust_list(struct task_struct *curr) unsigned long futex_offset; int rc; - if (!futex_cmpxchg_enabled) - return; - /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -874,9 +866,6 @@ static void compat_exit_robust_list(struct task_struct *curr) compat_long_t futex_offset; int rc; - if (!futex_cmpxchg_enabled) - return; - /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -950,8 +939,6 @@ static void exit_pi_state_list(struct task_struct *curr) struct futex_hash_bucket *hb; union futex_key key = FUTEX_KEY_INIT; - if (!futex_cmpxchg_enabled) - return; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful @@ -1125,26 +1112,6 @@ void futex_exit_release(struct task_struct *tsk) futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } -static void __init futex_detect_cmpxchg(void) -{ -#ifndef CONFIG_HAVE_FUTEX_CMPXCHG - u32 curval; - - /* - * This will fail and we want it. Some arch implementations do - * runtime detection of the futex_atomic_cmpxchg_inatomic() - * functionality. We want to know that before we call in any - * of the complex code paths. Also we want to prevent - * registration of robust lists in that case. NULL is - * guaranteed to fault and we get -EFAULT on functional - * implementation, the non-functional ones will return - * -ENOSYS. - */ - if (futex_cmpxchg_value_locked(&curval, NULL, 0, 0) == -EFAULT) - futex_cmpxchg_enabled = 1; -#endif -} - static int __init futex_init(void) { unsigned int futex_shift; @@ -1163,8 +1130,6 @@ static int __init futex_init(void) futex_hashsize, futex_hashsize); futex_hashsize = 1UL << futex_shift; - futex_detect_cmpxchg(); - for (i = 0; i < futex_hashsize; i++) { atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 040ae4277cb0..c264cbeab71c 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -27,12 +27,6 @@ #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 -#ifdef CONFIG_HAVE_FUTEX_CMPXCHG -#define futex_cmpxchg_enabled 1 -#else -extern int __read_mostly futex_cmpxchg_enabled; -#endif - #ifdef CONFIG_FAIL_FUTEX extern bool should_fail_futex(bool fshared); #else diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 6f91a07a6a83..086a22d1adb7 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -29,8 +29,6 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len) { - if (!futex_cmpxchg_enabled) - return -ENOSYS; /* * The kernel knows only one size for now: */ @@ -56,9 +54,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, unsigned long ret; struct task_struct *p; - if (!futex_cmpxchg_enabled) - return -ENOSYS; - rcu_read_lock(); ret = -ESRCH; @@ -103,17 +98,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, return -ENOSYS; } - switch (cmd) { - case FUTEX_LOCK_PI: - case FUTEX_LOCK_PI2: - case FUTEX_UNLOCK_PI: - case FUTEX_TRYLOCK_PI: - case FUTEX_WAIT_REQUEUE_PI: - case FUTEX_CMP_REQUEUE_PI: - if (!futex_cmpxchg_enabled) - return -ENOSYS; - } - switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; @@ -323,9 +307,6 @@ COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head, compat_size_t, len) { - if (!futex_cmpxchg_enabled) - return -ENOSYS; - if (unlikely(len != sizeof(*head))) return -EINVAL; @@ -342,9 +323,6 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, unsigned long ret; struct task_struct *p; - if (!futex_cmpxchg_enabled) - return -ENOSYS; - rcu_read_lock(); ret = -ESRCH; -- cgit v1.2.3 From 6cb206508b621a9a0a2c35b60540e399225c8243 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 26 Nov 2021 13:35:26 -0500 Subject: tracing: Check pid filtering when creating events When pid filtering is activated in an instance, all of the events trace files for that instance has the PID_FILTER flag set. This determines whether or not pid filtering needs to be done on the event, otherwise the event is executed as normal. If pid filtering is enabled when an event is created (via a dynamic event or modules), its flag is not updated to reflect the current state, and the events are not filtered properly. Cc: stable@vger.kernel.org Fixes: 3fdaf80f4a836 ("tracing: Implement event pid filtering") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4021b9a79f93..f8965fd50d3b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2678,12 +2678,24 @@ static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) { + struct trace_pid_list *no_pid_list; + struct trace_pid_list *pid_list; struct trace_event_file *file; + unsigned int first; file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) return NULL; + pid_list = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, + lockdep_is_held(&event_mutex)); + + if (!trace_pid_list_first(pid_list, &first) || + !trace_pid_list_first(pid_list, &first)) + file->flags |= EVENT_FILE_FL_PID_FILTER; + file->event_call = call; file->tr = tr; atomic_set(&file->sm_ref, 0); -- cgit v1.2.3 From a55f224ff5f238013de8762c4287117e47b86e22 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 26 Nov 2021 17:34:42 -0500 Subject: tracing: Fix pid filtering when triggers are attached If a event is filtered by pid and a trigger that requires processing of the event to happen is a attached to the event, the discard portion does not take the pid filtering into account, and the event will then be recorded when it should not have been. Cc: stable@vger.kernel.org Fixes: 3fdaf80f4a836 ("tracing: Implement event pid filtering") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6b60ab9475ed..38715aa6cfdf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1366,14 +1366,26 @@ __event_trigger_test_discard(struct trace_event_file *file, if (eflags & EVENT_FILE_FL_TRIGGER_COND) *tt = event_triggers_call(file, buffer, entry, event); - if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || - (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && - !filter_match_preds(file->filter, entry))) { - __trace_event_discard_commit(buffer, event); - return true; - } + if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED | + EVENT_FILE_FL_FILTERED | + EVENT_FILE_FL_PID_FILTER)))) + return false; + + if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) + goto discard; + + if (file->flags & EVENT_FILE_FL_FILTERED && + !filter_match_preds(file->filter, entry)) + goto discard; + + if ((file->flags & EVENT_FILE_FL_PID_FILTER) && + trace_event_ignore_this_pid(file)) + goto discard; return false; + discard: + __trace_event_discard_commit(buffer, event); + return true; } /** -- cgit v1.2.3 From 27ff768fa21ca3286fcc87c3f38ac67d1a2cbe2d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sat, 27 Nov 2021 16:45:26 -0500 Subject: tracing: Test the 'Do not trace this pid' case in create event When creating a new event (via a module, kprobe, eprobe, etc), the descriptors that are created must add flags for pid filtering if an instance has pid filtering enabled, as the flags are used at the time the event is executed to know if pid filtering should be done or not. The "Only trace this pid" case was added, but a cut and paste error made that case checked twice, instead of checking the "Trace all but this pid" case. Link: https://lore.kernel.org/all/202111280401.qC0z99JB-lkp@intel.com/ Fixes: 6cb206508b62 ("tracing: Check pid filtering when creating events") Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f8965fd50d3b..92be9cb1d7d4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2693,7 +2693,7 @@ trace_create_new_event(struct trace_event_call *call, lockdep_is_held(&event_mutex)); if (!trace_pid_list_first(pid_list, &first) || - !trace_pid_list_first(pid_list, &first)) + !trace_pid_list_first(no_pid_list, &first)) file->flags |= EVENT_FILE_FL_PID_FILTER; file->event_call = call; -- cgit v1.2.3 From 88c9a2ce520ba381bb70658c80ec704f4d60f728 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 12:58:05 +0100 Subject: fork: move copy_io to block/blk-ioc.c Move the copying of the I/O context to the block layer as that is where we can use the proper low-level interfaces. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211126115817.2087431-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-ioc.c | 27 +++++++++++++++++++++++++++ include/linux/iocontext.h | 23 +++++++++++++---------- kernel/fork.c | 26 -------------------------- 3 files changed, 40 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 70c99e85aee5..3b31cfad4b75 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -322,6 +322,33 @@ struct io_context *get_task_io_context(struct task_struct *task, return NULL; } +int __copy_io(unsigned long clone_flags, struct task_struct *tsk) +{ + struct io_context *ioc = current->io_context; + struct io_context *new_ioc; + + /* + * Share io context with parent, if CLONE_IO is set + */ + if (clone_flags & CLONE_IO) { + get_io_context_active(ioc); + + WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0); + atomic_inc(&ioc->nr_tasks); + + tsk->io_context = ioc; + } else if (ioprio_valid(ioc->ioprio)) { + new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); + if (unlikely(!new_ioc)) + return -ENOMEM; + + new_ioc->ioprio = ioc->ioprio; + put_io_context(new_ioc); + } + + return 0; +} + /** * ioc_lookup_icq - lookup io_cq from ioc * @ioc: the associated io_context diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 0a9dc40b7be8..bcd47d104d8e 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -129,14 +129,6 @@ static inline void get_io_context_active(struct io_context *ioc) atomic_inc(&ioc->active_ref); } -static inline void ioc_task_link(struct io_context *ioc) -{ - get_io_context_active(ioc); - - WARN_ON_ONCE(atomic_read(&ioc->nr_tasks) <= 0); - atomic_inc(&ioc->nr_tasks); -} - struct task_struct; #ifdef CONFIG_BLOCK void put_io_context(struct io_context *ioc); @@ -144,10 +136,21 @@ void put_io_context_active(struct io_context *ioc); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); +int __copy_io(unsigned long clone_flags, struct task_struct *tsk); +static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +{ + if (!current->io_context) + return 0; + return __copy_io(clone_flags, tsk); +} #else struct io_context; static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } -#endif +static inline int copy_io(unsigned long clone_flags, struct task_struct *tsk) +{ + return 0; +} +#endif /* CONFIG_BLOCK */ -#endif +#endif /* IOCONTEXT_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 3244cc56b697..3161d7980155 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1556,32 +1556,6 @@ out: return error; } -static int copy_io(unsigned long clone_flags, struct task_struct *tsk) -{ -#ifdef CONFIG_BLOCK - struct io_context *ioc = current->io_context; - struct io_context *new_ioc; - - if (!ioc) - return 0; - /* - * Share io context with parent, if CLONE_IO is set - */ - if (clone_flags & CLONE_IO) { - ioc_task_link(ioc); - tsk->io_context = ioc; - } else if (ioprio_valid(ioc->ioprio)) { - new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); - if (unlikely(!new_ioc)) - return -ENOMEM; - - new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc); - } -#endif - return 0; -} - static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; -- cgit v1.2.3 From f3fa33acca9f0058157214800f68b10d8e71ab7a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 26 Nov 2021 13:18:00 +0100 Subject: block: remove the ->rq_disk field in struct request Just use the disk attached to the request_queue instead. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20211126121802.2090656-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-flush.c | 3 +-- block/blk-mq.c | 14 ++++++-------- block/blk.h | 2 +- drivers/block/amiflop.c | 2 +- drivers/block/ataflop.c | 6 +++--- drivers/block/floppy.c | 6 +++--- drivers/block/null_blk/trace.h | 2 +- drivers/block/paride/pcd.c | 2 +- drivers/block/paride/pd.c | 4 ++-- drivers/block/paride/pf.c | 4 ++-- drivers/block/rnbd/rnbd-clt.c | 4 ++-- drivers/block/sunvdc.c | 2 +- drivers/md/dm-mpath.c | 1 - drivers/mmc/core/block.c | 2 +- drivers/nvme/host/fault_inject.c | 2 +- drivers/nvme/host/trace.h | 6 +++--- drivers/scsi/scsi_lib.c | 3 ++- drivers/scsi/scsi_logging.c | 4 +++- drivers/scsi/sd.c | 24 ++++++++++++------------ drivers/scsi/sd_zbc.c | 8 ++++---- drivers/scsi/sr.c | 4 ++-- drivers/scsi/virtio_scsi.c | 2 +- drivers/usb/storage/transport.c | 2 +- include/linux/blk-mq.h | 4 ---- include/scsi/scsi_cmnd.h | 2 +- include/scsi/scsi_device.h | 4 ++-- include/trace/events/block.h | 8 ++++---- kernel/trace/blktrace.c | 2 +- 28 files changed, 62 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/block/blk-flush.c b/block/blk-flush.c index 902e80e48e4a..fd5187a0898d 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -145,7 +145,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) static void blk_account_io_flush(struct request *rq) { - struct block_device *part = rq->rq_disk->part0; + struct block_device *part = rq->q->disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); @@ -339,7 +339,6 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); flush_rq->rq_flags |= RQF_FLUSH_SEQ; - flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one diff --git a/block/blk-mq.c b/block/blk-mq.c index 3e67662f7801..f1abfd2e24f7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -377,7 +377,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->start_time_ns = ktime_get_ns(); else rq->start_time_ns = 0; - rq->rq_disk = NULL; rq->part = NULL; #ifdef CONFIG_BLK_RQ_ALLOC_TIME rq->alloc_time_ns = alloc_time_ns; @@ -659,7 +658,7 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug) void blk_dump_rq_flags(struct request *rq, char *msg) { printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, - rq->rq_disk ? rq->rq_disk->disk_name : "?", + rq->q->disk ? rq->q->disk->disk_name : "?", (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", @@ -712,7 +711,7 @@ static void blk_print_req_error(struct request *req, blk_status_t status) "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " "phys_seg %u prio class %u\n", blk_status_to_str(status), - req->rq_disk ? req->rq_disk->disk_name : "?", + req->q->disk ? req->q->disk->disk_name : "?", blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), req->cmd_flags & ~REQ_OP_MASK, req->nr_phys_segments, @@ -853,8 +852,8 @@ static void __blk_account_io_start(struct request *rq) /* passthrough requests can hold bios that do not have ->bi_bdev set */ if (rq->bio && rq->bio->bi_bdev) rq->part = rq->bio->bi_bdev; - else - rq->part = rq->rq_disk->part0; + else if (rq->q->disk) + rq->part = rq->q->disk->part0; part_stat_lock(); update_io_ticks(rq->part, jiffies, false); @@ -1172,7 +1171,6 @@ void blk_execute_rq_nowait(struct gendisk *bd_disk, struct request *rq, WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); - rq->rq_disk = bd_disk; rq->end_io = done; blk_account_io_start(rq); @@ -2902,8 +2900,8 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * if (ret != BLK_STS_OK) return ret; - if (rq->rq_disk && - should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq))) + if (rq->q->disk && + should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) diff --git a/block/blk.h b/block/blk.h index 3be0fdf76c9a..a55d82c3d1c2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -324,7 +324,7 @@ int blk_dev_init(void); */ static inline bool blk_do_io_stat(struct request *rq) { - return (rq->rq_flags & RQF_IO_STAT) && rq->rq_disk; + return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk; } void update_io_ticks(struct block_device *part, unsigned long now, bool end); diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 1eec5113d0b5..5a566f2fd533 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1505,7 +1505,7 @@ static blk_status_t amiflop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; - struct amiga_floppy_struct *floppy = rq->rq_disk->private_data; + struct amiga_floppy_struct *floppy = rq->q->disk->private_data; blk_status_t err; if (!spin_trylock_irq(&amiflop_lock)) diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index f3ff9babdb5c..5d819a466e2f 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1502,7 +1502,7 @@ static void setup_req_params( int drive ) static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { - struct atari_floppy_struct *floppy = bd->rq->rq_disk->private_data; + struct atari_floppy_struct *floppy = bd->rq->q->disk->private_data; int drive = floppy - unit; int type = floppy->type; @@ -1538,7 +1538,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, if (!UDT) { Probing = 1; UDT = atari_disk_type + StartDiskType[DriveType]; - set_capacity(bd->rq->rq_disk, UDT->blocks); + set_capacity(bd->rq->q->disk, UDT->blocks); UD.autoprobe = 1; } } @@ -1558,7 +1558,7 @@ static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx, } type = minor2disktype[type].index; UDT = &atari_disk_type[type]; - set_capacity(bd->rq->rq_disk, UDT->blocks); + set_capacity(bd->rq->q->disk, UDT->blocks); UD.autoprobe = 0; } diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 7f0a60c4079f..0c638de25023 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2259,7 +2259,7 @@ static int do_format(int drive, struct format_descr *tmp_format_req) static void floppy_end_request(struct request *req, blk_status_t error) { unsigned int nr_sectors = current_count_sectors; - unsigned int drive = (unsigned long)req->rq_disk->private_data; + unsigned int drive = (unsigned long)req->q->disk->private_data; /* current_count_sectors can be zero if transfer failed */ if (error) @@ -2550,7 +2550,7 @@ static int make_raw_rw_request(void) if (WARN(max_buffer_sectors == 0, "VFS: Block I/O scheduled on unopened device\n")) return 0; - set_fdc((long)current_req->rq_disk->private_data); + set_fdc((long)current_req->q->disk->private_data); raw_cmd = &default_raw_cmd; raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK; @@ -2792,7 +2792,7 @@ do_request: return; } } - drive = (long)current_req->rq_disk->private_data; + drive = (long)current_req->q->disk->private_data; set_fdc(drive); reschedule_timeout(current_drive, "redo fd request"); diff --git a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h index ce3b430e88c5..86d6c12c603c 100644 --- a/drivers/block/null_blk/trace.h +++ b/drivers/block/null_blk/trace.h @@ -44,7 +44,7 @@ TRACE_EVENT(nullb_zone_op, __entry->op = req_op(cmd->rq); __entry->zone_no = zone_no; __entry->zone_cond = zone_cond; - __assign_disk_name(__entry->disk, cmd->rq->rq_disk); + __assign_disk_name(__entry->disk, cmd->rq->q->disk); ), TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", __print_disk_name(__entry->disk), diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 255fd3d4b8a8..f462ad67931a 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -690,7 +690,7 @@ static void pcd_request(void) if (!pcd_req && !set_next_request()) return; - cd = pcd_req->rq_disk->private_data; + cd = pcd_req->q->disk->private_data; if (cd != pcd_current) pcd_bufblk = -1; pcd_current = cd; diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index fba865058a17..4f8cce510562 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -430,7 +430,7 @@ static void run_fsm(void) int stop = 0; if (!phase) { - pd_current = pd_req->rq_disk->private_data; + pd_current = pd_req->q->disk->private_data; pi_current = pd_current->pi; phase = do_pd_io_start; } @@ -492,7 +492,7 @@ static enum action do_pd_io_start(void) case REQ_OP_WRITE: pd_block = blk_rq_pos(pd_req); pd_count = blk_rq_cur_sectors(pd_req); - if (pd_block + pd_count > get_capacity(pd_req->rq_disk)) + if (pd_block + pd_count > get_capacity(pd_req->q->disk)) return Fail; pd_run = blk_rq_sectors(pd_req); pd_buf = bio_data(pd_req->bio); diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index b84a6448a4f7..292e9a4ce1b9 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -746,12 +746,12 @@ repeat: if (!pf_req && !set_next_request()) return; - pf_current = pf_req->rq_disk->private_data; + pf_current = pf_req->q->disk->private_data; pf_block = blk_rq_pos(pf_req); pf_run = blk_rq_sectors(pf_req); pf_count = blk_rq_cur_sectors(pf_req); - if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) { + if (pf_block + pf_count > get_capacity(pf_req->q->disk)) { pf_end_request(BLK_STS_IOERR); goto repeat; } diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 2df0657cdf00..67a8edbaa1fd 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -393,7 +393,7 @@ static void rnbd_put_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu) static void rnbd_softirq_done_fn(struct request *rq) { - struct rnbd_clt_dev *dev = rq->rq_disk->private_data; + struct rnbd_clt_dev *dev = rq->q->disk->private_data; struct rnbd_clt_session *sess = dev->sess; struct rnbd_iu *iu; @@ -1133,7 +1133,7 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; - struct rnbd_clt_dev *dev = rq->rq_disk->private_data; + struct rnbd_clt_dev *dev = rq->q->disk->private_data; struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq); int err; blk_status_t ret = BLK_STS_IOERR; diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 2157936de623..146d85d80e0e 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -462,7 +462,7 @@ static int __vdc_tx_trigger(struct vdc_port *port) static int __send_request(struct request *req) { - struct vdc_port *port = req->rq_disk->private_data; + struct vdc_port *port = req->q->disk->private_data; struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; struct scatterlist sg[MAX_RING_COOKIES]; struct vdc_req_entry *rqe; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 90dc9cc48881..f4719b65e5e3 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -550,7 +550,6 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq, return DM_MAPIO_REQUEUE; } clone->bio = clone->biotail = NULL; - clone->rq_disk = bdev->bd_disk; clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; *__clone = clone; diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 635b79899b9f..dc094f73d335 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1837,7 +1837,7 @@ static void mmc_blk_mq_rw_recovery(struct mmc_queue *mq, struct request *req) /* Reset if the card is in a bad state */ if (!mmc_host_is_spi(mq->card->host) && err && mmc_blk_reset(md, card->host, type)) { - pr_err("%s: recovery failed!\n", req->rq_disk->disk_name); + pr_err("%s: recovery failed!\n", req->q->disk->disk_name); mqrq->retries = MMC_NO_RETRIES; return; } diff --git a/drivers/nvme/host/fault_inject.c b/drivers/nvme/host/fault_inject.c index 1352159733b0..83d2e6860d38 100644 --- a/drivers/nvme/host/fault_inject.c +++ b/drivers/nvme/host/fault_inject.c @@ -56,7 +56,7 @@ void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inject) void nvme_should_fail(struct request *req) { - struct gendisk *disk = req->rq_disk; + struct gendisk *disk = req->q->disk; struct nvme_fault_inject *fault_inject = NULL; u16 status; diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index 35bac7a25422..b5f85259461a 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -68,7 +68,7 @@ TRACE_EVENT(nvme_setup_cmd, __entry->nsid = le32_to_cpu(cmd->common.nsid); __entry->metadata = !!blk_integrity_rq(req); __entry->fctype = cmd->fabrics.fctype; - __assign_disk_name(__entry->disk, req->rq_disk); + __assign_disk_name(__entry->disk, req->q->disk); memcpy(__entry->cdw10, &cmd->common.cdw10, sizeof(__entry->cdw10)); ), @@ -103,7 +103,7 @@ TRACE_EVENT(nvme_complete_rq, __entry->retries = nvme_req(req)->retries; __entry->flags = nvme_req(req)->flags; __entry->status = nvme_req(req)->status; - __assign_disk_name(__entry->disk, req->rq_disk); + __assign_disk_name(__entry->disk, req->q->disk); ), TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%#llx, retries=%u, flags=0x%x, status=%#x", __entry->ctrl_id, __print_disk_name(__entry->disk), @@ -153,7 +153,7 @@ TRACE_EVENT(nvme_sq, ), TP_fast_assign( __entry->ctrl_id = nvme_req(req)->ctrl->instance; - __assign_disk_name(__entry->disk, req->rq_disk); + __assign_disk_name(__entry->disk, req->q->disk); __entry->qid = nvme_req_qid(req); __entry->sq_head = le16_to_cpu(sq_head); __entry->sq_tail = sq_tail; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 5e8b5ecb3245..c23cf8e7b3c3 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -543,8 +543,9 @@ static bool scsi_end_request(struct request *req, blk_status_t error, if (blk_update_request(req, error, bytes)) return true; + // XXX: if (blk_queue_add_random(q)) - add_disk_randomness(req->rq_disk); + add_disk_randomness(req->q->disk); if (!blk_rq_is_passthrough(req)) { WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED)); diff --git a/drivers/scsi/scsi_logging.c b/drivers/scsi/scsi_logging.c index ed9572252a42..1f8f80b2dbfc 100644 --- a/drivers/scsi/scsi_logging.c +++ b/drivers/scsi/scsi_logging.c @@ -30,7 +30,9 @@ static inline const char *scmd_name(const struct scsi_cmnd *scmd) { struct request *rq = scsi_cmd_to_rq((struct scsi_cmnd *)scmd); - return rq->rq_disk ? rq->rq_disk->disk_name : NULL; + if (!rq->q->disk) + return NULL; + return rq->q->disk->disk_name; } static size_t sdev_format_header(char *logbuf, size_t logbuf_len, diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index bba1f5dafd38..8181857ddf53 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -872,7 +872,7 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); unsigned int data_len = 24; @@ -908,7 +908,7 @@ static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; @@ -940,7 +940,7 @@ static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; @@ -971,7 +971,7 @@ static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); @@ -1068,7 +1068,7 @@ static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); struct bio *bio = rq->bio; u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); @@ -1116,7 +1116,7 @@ static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); /* flush requests don't perform I/O, zero the S/G table */ memset(&cmd->sdb, 0, sizeof(cmd->sdb)); @@ -1215,7 +1215,7 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_device *sdp = cmd->device; - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t lba = sectors_to_logical(sdp, blk_rq_pos(rq)); sector_t threshold; unsigned int nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); @@ -1236,7 +1236,7 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) goto fail; } - if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->rq_disk)) { + if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->q->disk)) { scmd_printk(KERN_ERR, cmd, "access beyond end of device\n"); goto fail; } @@ -1331,7 +1331,7 @@ static blk_status_t sd_init_command(struct scsi_cmnd *cmd) switch (req_op(rq)) { case REQ_OP_DISCARD: - switch (scsi_disk(rq->rq_disk)->provisioning_mode) { + switch (scsi_disk(rq->q->disk)->provisioning_mode) { case SD_LBP_UNMAP: return sd_setup_unmap_cmnd(cmd); case SD_LBP_WS16: @@ -1917,7 +1917,7 @@ static const struct block_device_operations sd_fops = { **/ static void sd_eh_reset(struct scsi_cmnd *scmd) { - struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->rq_disk); + struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); /* New SCSI EH run, reset gate variable */ sdkp->ignore_medium_access_errors = false; @@ -1937,7 +1937,7 @@ static void sd_eh_reset(struct scsi_cmnd *scmd) **/ static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp) { - struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->rq_disk); + struct scsi_disk *sdkp = scsi_disk(scsi_cmd_to_rq(scmd)->q->disk); struct scsi_device *sdev = scmd->device; if (!scsi_device_online(sdev) || @@ -2034,7 +2034,7 @@ static int sd_done(struct scsi_cmnd *SCpnt) unsigned int resid; struct scsi_sense_hdr sshdr; struct request *req = scsi_cmd_to_rq(SCpnt); - struct scsi_disk *sdkp = scsi_disk(req->rq_disk); + struct scsi_disk *sdkp = scsi_disk(req->q->disk); int sense_valid = 0; int sense_deferred = 0; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index ed06798983f8..65bfd1e170da 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -244,7 +244,7 @@ out: static blk_status_t sd_zbc_cmnd_checks(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t sector = blk_rq_pos(rq); if (!sd_is_zoned(sdkp)) @@ -322,7 +322,7 @@ blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, unsigned int nr_blocks) { struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); unsigned int wp_offset, zno = blk_rq_zone_no(rq); unsigned long flags; blk_status_t ret; @@ -388,7 +388,7 @@ blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, { struct request *rq = scsi_cmd_to_rq(cmd); sector_t sector = blk_rq_pos(rq); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); sector_t block = sectors_to_logical(sdkp->device, sector); blk_status_t ret; @@ -443,7 +443,7 @@ static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd, { int result = cmd->result; struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); + struct scsi_disk *sdkp = scsi_disk(rq->q->disk); unsigned int zno = blk_rq_zone_no(rq); enum req_opf op = req_op(rq); unsigned long flags; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 411e2b01966e..7db595c08b20 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -335,7 +335,7 @@ static int sr_done(struct scsi_cmnd *SCpnt) int block_sectors = 0; long error_sector; struct request *rq = scsi_cmd_to_rq(SCpnt); - struct scsi_cd *cd = scsi_cd(rq->rq_disk); + struct scsi_cd *cd = scsi_cd(rq->q->disk); #ifdef DEBUG scmd_printk(KERN_INFO, SCpnt, "done: %x\n", result); @@ -402,7 +402,7 @@ static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt) ret = scsi_alloc_sgtables(SCpnt); if (ret != BLK_STS_OK) return ret; - cd = scsi_cd(rq->rq_disk); + cd = scsi_cd(rq->q->disk); SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt, "Doing sr request, block = %d\n", block)); diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 28e1d98ae102..65c642b24ecf 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -528,7 +528,7 @@ static void virtio_scsi_init_hdr_pi(struct virtio_device *vdev, if (!rq || !scsi_prot_sg_count(sc)) return; - bi = blk_get_integrity(rq->rq_disk); + bi = blk_get_integrity(rq->q->disk); if (sc->sc_data_direction == DMA_TO_DEVICE) cmd_pi->pi_bytesout = cpu_to_virtio32(vdev, diff --git a/drivers/usb/storage/transport.c b/drivers/usb/storage/transport.c index 4c5a0a49035f..1928b3918242 100644 --- a/drivers/usb/storage/transport.c +++ b/drivers/usb/storage/transport.c @@ -551,7 +551,7 @@ static void last_sector_hacks(struct us_data *us, struct scsi_cmnd *srb) /* Did this command access the last sector? */ sector = (srb->cmnd[2] << 24) | (srb->cmnd[3] << 16) | (srb->cmnd[4] << 8) | (srb->cmnd[5]); - disk = scsi_cmd_to_rq(srb)->rq_disk; + disk = scsi_cmd_to_rq(srb)->q->disk; if (!disk) goto done; sdkp = scsi_disk(disk); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d952c3442261..ede7bef8880a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -100,7 +100,6 @@ struct request { struct request *rq_next; }; - struct gendisk *rq_disk; struct block_device *part; #ifdef CONFIG_BLK_RQ_ALLOC_TIME /* Time that the first bio started allocating this request. */ @@ -890,9 +889,6 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; rq->ioprio = bio_prio(bio); - - if (bio->bi_bdev) - rq->rq_disk = bio->bi_bdev->bd_disk; } void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 477a800a9543..6794d7322cbd 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -164,7 +164,7 @@ static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); - return *(struct scsi_driver **)rq->rq_disk->private_data; + return *(struct scsi_driver **)rq->q->disk->private_data; } void scsi_done(struct scsi_cmnd *cmd); diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index d1c6fc83b1e3..ab7557d84f75 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -275,9 +275,9 @@ scmd_printk(const char *, const struct scsi_cmnd *, const char *, ...); do { \ struct request *__rq = scsi_cmd_to_rq((scmd)); \ \ - if (__rq->rq_disk) \ + if (__rq->q->disk) \ sdev_dbg((scmd)->device, "[%s] " fmt, \ - __rq->rq_disk->disk_name, ##a); \ + __rq->q->disk->disk_name, ##a); \ else \ sdev_dbg((scmd)->device, fmt, ##a); \ } while (0) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index a95daa4d4caa..27170e40e8c9 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -85,7 +85,7 @@ TRACE_EVENT(block_rq_requeue, ), TP_fast_assign( - __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); @@ -128,7 +128,7 @@ TRACE_EVENT(block_rq_complete, ), TP_fast_assign( - __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; __entry->error = blk_status_to_errno(error); @@ -161,7 +161,7 @@ DECLARE_EVENT_CLASS(block_rq, ), TP_fast_assign( - __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->dev = rq->q->disk ? disk_devt(rq->q->disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); @@ -512,7 +512,7 @@ TRACE_EVENT(block_rq_remap, ), TP_fast_assign( - __entry->dev = disk_devt(rq->rq_disk); + __entry->dev = disk_devt(rq->q->disk); __entry->sector = blk_rq_pos(rq); __entry->nr_sector = blk_rq_sectors(rq); __entry->old_dev = dev; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 1183c88634aa..431e41bc4c23 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1045,7 +1045,7 @@ static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, } r.device_from = cpu_to_be32(dev); - r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); + r.device_to = cpu_to_be32(disk_devt(rq->q->disk)); r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), -- cgit v1.2.3 From 8291471ea5f1b2e6782cbb9c6ed785f12435245f Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 27 Nov 2021 14:59:19 +0000 Subject: cgroup: get the wrong css for css_alloc() during cgroup_init_subsys() css_alloc() needs the parent css, while cgroup_css() gets current cgropu's css. So we are getting the wrong css during cgroup_init_subsys(). Fortunately, cgrp_dfl_root.cgrp's css is not set yet, so the value we pass to css_alloc() is NULL anyway. Let's pass NULL directly during init, since we know there is no parent yet. Signed-off-by: Wei Yang Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 919194de39c8..f522cee8e650 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5711,7 +5711,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) /* Create the root cgroup state for this subsystem */ ss->root = &cgrp_dfl_root; - css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); + css = ss->css_alloc(NULL); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); -- cgit v1.2.3 From ccb00292eb2dbb58a55850639356d07630cd3c46 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Nov 2021 17:32:12 +0100 Subject: bpf: Remove a redundant comment on bpf_prog_free The comment telling that the prog_free helper is freeing the program is not exactly useful, so just remove it. Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211119163215.971383-3-hch@lst.de --- kernel/bpf/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b52dc845ecea..189d85d64bf1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2301,7 +2301,6 @@ static void bpf_prog_free_deferred(struct work_struct *work) } } -/* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; -- cgit v1.2.3 From 06edc59c1fd7aababc8361655b20f4cc9870aef2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 19 Nov 2021 17:32:13 +0100 Subject: bpf, docs: Prune all references to "internal BPF" The eBPF name has completely taken over from eBPF in general usage for the actual eBPF representation, or BPF for any general in-kernel use. Prune all remaining references to "internal BPF". Signed-off-by: Christoph Hellwig Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211119163215.971383-4-hch@lst.de --- Documentation/networking/filter.rst | 22 +++++++++++----------- arch/arm/net/bpf_jit_32.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp_64.c | 2 +- kernel/bpf/core.c | 2 +- net/core/filter.c | 11 +++++------ 6 files changed, 20 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index ce2b8e8bb9ab..83ffcaa5b91a 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -617,7 +617,7 @@ format with similar underlying principles from BPF described in previous paragraphs is being used. However, the instruction set format is modelled closer to the underlying architecture to mimic native instruction sets, so that a better performance can be achieved (more details later). This new -ISA is called 'eBPF' or 'internal BPF' interchangeably. (Note: eBPF which +ISA is called 'eBPF'. (Note: eBPF which originates from [e]xtended BPF is not the same as BPF extensions! While eBPF is an ISA, BPF extensions date back to classic BPF's 'overloading' of BPF_LD | BPF_{B,H,W} | BPF_ABS instruction.) @@ -690,7 +690,7 @@ Some core changes of the new internal format: That behavior maps directly to x86_64 and arm64 subregister definition, but makes other JITs more difficult. - 32-bit architectures run 64-bit internal BPF programs via interpreter. + 32-bit architectures run 64-bit eBPF programs via interpreter. Their JITs may convert BPF programs that only use 32-bit subregisters into native instruction set and let the rest being interpreted. @@ -711,7 +711,7 @@ Some core changes of the new internal format: - Introduces bpf_call insn and register passing convention for zero overhead calls from/to other kernel functions: - Before an in-kernel function call, the internal BPF program needs to + Before an in-kernel function call, the eBPF program needs to place function arguments into R1 to R5 registers to satisfy calling convention, then the interpreter will take them from registers and pass to in-kernel function. If R1 - R5 registers are mapped to CPU registers @@ -780,7 +780,7 @@ Some core changes of the new internal format: ... since x86_64 ABI mandates rdi, rsi, rdx, rcx, r8, r9 for argument passing and rbx, r12 - r15 are callee saved. - Then the following internal BPF pseudo-program:: + Then the following eBPF pseudo-program:: bpf_mov R6, R1 /* save ctx */ bpf_mov R2, 2 @@ -846,7 +846,7 @@ Some core changes of the new internal format: bpf_exit After the call the registers R1-R5 contain junk values and cannot be read. - An in-kernel eBPF verifier is used to validate internal BPF programs. + An in-kernel eBPF verifier is used to validate eBPF programs. Also in the new design, eBPF is limited to 4096 insns, which means that any program will terminate quickly and will only call a fixed number of kernel @@ -861,23 +861,23 @@ A program, that is translated internally consists of the following elements:: op:16, jt:8, jf:8, k:32 ==> op:8, dst_reg:4, src_reg:4, off:16, imm:32 -So far 87 internal BPF instructions were implemented. 8-bit 'op' opcode field +So far 87 eBPF instructions were implemented. 8-bit 'op' opcode field has room for new instructions. Some of them may use 16/24/32 byte encoding. New instructions must be multiple of 8 bytes to preserve backward compatibility. -Internal BPF is a general purpose RISC instruction set. Not every register and +eBPF is a general purpose RISC instruction set. Not every register and every instruction are used during translation from original BPF to new format. For example, socket filters are not using ``exclusive add`` instruction, but tracing filters may do to maintain counters of events, for example. Register R9 is not used by socket filters either, but more complex filters may be running out of registers and would have to resort to spill/fill to stack. -Internal BPF can be used as a generic assembler for last step performance +eBPF can be used as a generic assembler for last step performance optimizations, socket filters and seccomp are using it as assembler. Tracing filters may use it as assembler to generate code from kernel. In kernel usage -may not be bounded by security considerations, since generated internal BPF code +may not be bounded by security considerations, since generated eBPF code may be optimizing internal code path and not being exposed to the user space. -Safety of internal BPF can come from a verifier (TBD). In such use cases as +Safety of eBPF can come from a verifier (TBD). In such use cases as described, it may be used as safe instruction set. Just like the original BPF, the new format runs within a controlled environment, @@ -1675,7 +1675,7 @@ Testing ------- Next to the BPF toolchain, the kernel also ships a test module that contains -various test cases for classic and internal BPF that can be executed against +various test cases for classic and eBPF that can be executed against the BPF interpreter and JIT compiler. It can be found in lib/test_bpf.c and enabled via Kconfig:: diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index e59b41e9ab0c..10ceebb7530b 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -163,7 +163,7 @@ static const s8 bpf2a32[][2] = { [BPF_REG_9] = {STACK_OFFSET(BPF_R9_HI), STACK_OFFSET(BPF_R9_LO)}, /* Read only Frame Pointer to access Stack */ [BPF_REG_FP] = {STACK_OFFSET(BPF_FP_HI), STACK_OFFSET(BPF_FP_LO)}, - /* Temporary Register for internal BPF JIT, can be used + /* Temporary Register for BPF JIT, can be used * for constant blindings and others. */ [TMP_REG_1] = {ARM_R7, ARM_R6}, diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 07c12c42b751..07aad85848fa 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -44,7 +44,7 @@ static const int bpf2a64[] = { [BPF_REG_9] = A64_R(22), /* read-only frame pointer to access stack */ [BPF_REG_FP] = A64_R(25), - /* temporary registers for internal BPF JIT */ + /* temporary registers for BPF JIT */ [TMP_REG_1] = A64_R(10), [TMP_REG_2] = A64_R(11), [TMP_REG_3] = A64_R(12), diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 0bfe1c72a0c9..b1e38784eb23 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -227,7 +227,7 @@ static const int bpf2sparc[] = { [BPF_REG_AX] = G7, - /* temporary register for internal BPF JIT */ + /* temporary register for BPF JIT */ [TMP_REG_1] = G1, [TMP_REG_2] = G2, [TMP_REG_3] = G3, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 189d85d64bf1..de3e5bc6781f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1892,7 +1892,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp) /** * bpf_prog_select_runtime - select exec runtime for BPF program - * @fp: bpf_prog populated with internal BPF program + * @fp: bpf_prog populated with BPF program * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. diff --git a/net/core/filter.c b/net/core/filter.c index 26e0276aa00d..fe27c91e3758 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1242,10 +1242,9 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) int err, new_len, old_len = fp->len; bool seen_ld_abs = false; - /* We are free to overwrite insns et al right here as it - * won't be used at this point in time anymore internally - * after the migration to the internal BPF instruction - * representation. + /* We are free to overwrite insns et al right here as it won't be used at + * this point in time anymore internally after the migration to the eBPF + * instruction representation. */ BUILD_BUG_ON(sizeof(struct sock_filter) != sizeof(struct bpf_insn)); @@ -1336,8 +1335,8 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, */ bpf_jit_compile(fp); - /* JIT compiler couldn't process this filter, so do the - * internal BPF translation for the optimized interpreter. + /* JIT compiler couldn't process this filter, so do the eBPF translation + * for the optimized interpreter. */ if (!fp->jited) fp = bpf_migrate_filter(fp); -- cgit v1.2.3 From e6f2dd0f80674e9d5960337b3e9c2a242441b326 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 29 Nov 2021 19:06:19 -0800 Subject: bpf: Add bpf_loop helper This patch adds the kernel-side and API changes for a new helper function, bpf_loop: long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags); where long (*callback_fn)(u32 index, void *ctx); bpf_loop invokes the "callback_fn" **nr_loops** times or until the callback_fn returns 1. The callback_fn can only return 0 or 1, and this is enforced by the verifier. The callback_fn index is zero-indexed. A few things to please note: ~ The "u64 flags" parameter is currently unused but is included in case a future use case for it arises. ~ In the kernel-side implementation of bpf_loop (kernel/bpf/bpf_iter.c), bpf_callback_t is used as the callback function cast. ~ A program can have nested bpf_loop calls but the program must still adhere to the verifier constraint of its stack depth (the stack depth cannot exceed MAX_BPF_STACK)) ~ Recursive callback_fns do not pass the verifier, due to the call stack for these being too deep. ~ The next patch will include the tests and benchmark Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211130030622.4131246-2-joannekoong@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 25 ++++++++++++ kernel/bpf/bpf_iter.c | 35 +++++++++++++++++ kernel/bpf/helpers.c | 2 + kernel/bpf/verifier.c | 88 ++++++++++++++++++++++++++---------------- tools/include/uapi/linux/bpf.h | 25 ++++++++++++ 6 files changed, 142 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cc7a0c36e7df..cad0829710be 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2164,6 +2164,7 @@ extern const struct bpf_func_proto bpf_sk_setsockopt_proto; extern const struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; extern const struct bpf_func_proto bpf_find_vma_proto; +extern const struct bpf_func_proto bpf_loop_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a69e4b04ffeb..211b43afd0fb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4957,6 +4957,30 @@ union bpf_attr { * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. * **-EBUSY** if failed to try lock mmap_lock. * **-EINVAL** for invalid **flags**. + * + * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For **nr_loops**, call **callback_fn** function + * with **callback_ctx** as the context parameter. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. Currently, nr_loops is + * limited to 1 << 23 (~8 million) loops. + * + * long (\*callback_fn)(u32 index, void \*ctx); + * + * where **index** is the current index in the loop. The index + * is zero-indexed. + * + * If **callback_fn** returns 0, the helper will continue to the next + * loop. If return value is 1, the helper will skip the rest of + * the loops and return. Other return values are not used now, + * and will be rejected by the verifier. + * + * Return + * The number of loops performed, **-EINVAL** for invalid **flags**, + * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5140,6 +5164,7 @@ union bpf_attr { FN(skc_to_unix_sock), \ FN(kallsyms_lookup_name), \ FN(find_vma), \ + FN(loop), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index b2ee45064e06..b7aef5b3416d 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -714,3 +714,38 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = { .arg3_type = ARG_PTR_TO_STACK_OR_NULL, .arg4_type = ARG_ANYTHING, }; + +/* maximum number of loops */ +#define MAX_LOOPS BIT(23) + +BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, + u64, flags) +{ + bpf_callback_t callback = (bpf_callback_t)callback_fn; + u64 ret; + u32 i; + + if (flags) + return -EINVAL; + if (nr_loops > MAX_LOOPS) + return -E2BIG; + + for (i = 0; i < nr_loops; i++) { + ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0); + /* return value: 0 - continue, 1 - stop and return */ + if (ret) + return i + 1; + } + + return i; +} + +const struct bpf_func_proto bpf_loop_proto = { + .func = bpf_loop, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1ffd469c217f..52188004a9c3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1378,6 +1378,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_query_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; + case BPF_FUNC_loop: + return &bpf_loop_proto; default: break; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0763cca139a7..d7678d8a925c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6085,6 +6085,27 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_loop_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, + * u64 flags); + * callback_fn(u32 index, void *callback_ctx); + */ + callee->regs[BPF_REG_1].type = SCALAR_VALUE; + callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + + callee->in_callback_fn = true; + return 0; +} + static int set_timer_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, @@ -6458,13 +6479,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } - if (func_id == BPF_FUNC_tail_call) { - err = check_reference_leak(env); - if (err) { - verbose(env, "tail_call would lead to reference leak\n"); - return err; - } - } else if (is_release_function(func_id)) { + if (is_release_function(func_id)) { err = release_reference(env, meta.ref_obj_id); if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", @@ -6475,42 +6490,47 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); - /* check that flags argument in get_local_storage(map, flags) is 0, - * this is required because get_local_storage() can't return an error. - */ - if (func_id == BPF_FUNC_get_local_storage && - !register_is_null(®s[BPF_REG_2])) { - verbose(env, "get_local_storage() doesn't support non-zero flags\n"); - return -EINVAL; - } - - if (func_id == BPF_FUNC_for_each_map_elem) { + switch (func_id) { + case BPF_FUNC_tail_call: + err = check_reference_leak(env); + if (err) { + verbose(env, "tail_call would lead to reference leak\n"); + return err; + } + break; + case BPF_FUNC_get_local_storage: + /* check that flags argument in get_local_storage(map, flags) is 0, + * this is required because get_local_storage() can't return an error. + */ + if (!register_is_null(®s[BPF_REG_2])) { + verbose(env, "get_local_storage() doesn't support non-zero flags\n"); + return -EINVAL; + } + break; + case BPF_FUNC_for_each_map_elem: err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_map_elem_callback_state); - if (err < 0) - return -EINVAL; - } - - if (func_id == BPF_FUNC_timer_set_callback) { + break; + case BPF_FUNC_timer_set_callback: err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_timer_callback_state); - if (err < 0) - return -EINVAL; - } - - if (func_id == BPF_FUNC_find_vma) { + break; + case BPF_FUNC_find_vma: err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_find_vma_callback_state); - if (err < 0) - return -EINVAL; - } - - if (func_id == BPF_FUNC_snprintf) { + break; + case BPF_FUNC_snprintf: err = check_bpf_snprintf_call(env, regs); - if (err < 0) - return err; + break; + case BPF_FUNC_loop: + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_loop_callback_state); + break; } + if (err) + return err; + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a69e4b04ffeb..211b43afd0fb 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4957,6 +4957,30 @@ union bpf_attr { * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. * **-EBUSY** if failed to try lock mmap_lock. * **-EINVAL** for invalid **flags**. + * + * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For **nr_loops**, call **callback_fn** function + * with **callback_ctx** as the context parameter. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. Currently, nr_loops is + * limited to 1 << 23 (~8 million) loops. + * + * long (\*callback_fn)(u32 index, void \*ctx); + * + * where **index** is the current index in the loop. The index + * is zero-indexed. + * + * If **callback_fn** returns 0, the helper will continue to the next + * loop. If return value is 1, the helper will skip the rest of + * the loops and return. Other return values are not used now, + * and will be rejected by the verifier. + * + * Return + * The number of loops performed, **-EINVAL** for invalid **flags**, + * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5140,6 +5164,7 @@ union bpf_attr { FN(skc_to_unix_sock), \ FN(kallsyms_lookup_name), \ FN(find_vma), \ + FN(loop), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 6ce895128b3bff738fe8d9dd74747a03e319e466 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 29 Nov 2021 13:06:44 +0000 Subject: entry: Snapshot thread flags Some thread flags can be set remotely, and so even when IRQs are disabled, the flags can change under our feet. Generally this is unlikely to cause a problem in practice, but it is somewhat unsound, and KCSAN will legitimately warn that there is a data race. To avoid such issues, a snapshot of the flags has to be taken prior to using them. Some places already use READ_ONCE() for that, others do not. Convert them all to the new flag accessor helpers. Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Acked-by: Paul E. McKenney Link: https://lore.kernel.org/r/20211129130653.2037928-3-mark.rutland@arm.com --- include/linux/entry-kvm.h | 2 +- kernel/entry/common.c | 4 ++-- kernel/entry/kvm.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 0d7865a0731c..07c878d6e323 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -75,7 +75,7 @@ static inline void xfer_to_guest_mode_prepare(void) */ static inline bool __xfer_to_guest_mode_work_pending(void) { - unsigned long ti_work = READ_ONCE(current_thread_info()->flags); + unsigned long ti_work = read_thread_flags(); return !!(ti_work & XFER_TO_GUEST_MODE_WORK); } diff --git a/kernel/entry/common.c b/kernel/entry/common.c index d5a61d565ad5..bad713684c2e 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -187,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, /* Check if any of the above work has queued a deferred wakeup */ tick_nohz_user_enter_prepare(); - ti_work = READ_ONCE(current_thread_info()->flags); + ti_work = read_thread_flags(); } /* Return the latest work state for arch_exit_to_user_mode() */ @@ -196,7 +196,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, static void exit_to_user_mode_prepare(struct pt_regs *regs) { - unsigned long ti_work = READ_ONCE(current_thread_info()->flags); + unsigned long ti_work = read_thread_flags(); lockdep_assert_irqs_disabled(); diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 49972ee99aff..96d476e06c77 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -26,7 +26,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) if (ret) return ret; - ti_work = READ_ONCE(current_thread_info()->flags); + ti_work = read_thread_flags(); } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); return 0; } @@ -43,7 +43,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) * disabled in the inner loop before going into guest mode. No need * to disable interrupts here. */ - ti_work = READ_ONCE(current_thread_info()->flags); + ti_work = read_thread_flags(); if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) return 0; -- cgit v1.2.3 From 0569b245132c40015281610353935a50e282eb94 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 29 Nov 2021 13:06:45 +0000 Subject: sched: Snapshot thread flags Some thread flags can be set remotely, and so even when IRQs are disabled, the flags can change under our feet. Generally this is unlikely to cause a problem in practice, but it is somewhat unsound, and KCSAN will legitimately warn that there is a data race. To avoid such issues, a snapshot of the flags has to be taken prior to using them. Some places already use READ_ONCE() for that, others do not. Convert them all to the new flag accessor helpers. The READ_ONCE(ti->flags) .. cmpxchg(ti->flags) loop in set_nr_if_polling() is left as-is for clarity. Signed-off-by: Mark Rutland Signed-off-by: Thomas Gleixner Acked-by: Paul E. McKenney Cc: Juri Lelli Cc: Vincent Guittot Link: https://lore.kernel.org/r/20211129130653.2037928-4-mark.rutland@arm.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 76f9deeaa942..704262798fb7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8520,7 +8520,7 @@ void sched_show_task(struct task_struct *p) rcu_read_unlock(); pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", free, task_pid_nr(p), ppid, - (unsigned long)task_thread_info(p)->flags); + read_task_thread_flags(p)); print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); -- cgit v1.2.3 From 4946f15e8c334840bf277a0bf924371eae120fcd Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Tue, 30 Nov 2021 22:40:43 +0100 Subject: genirq/generic_chip: Constify irq_generic_chip_ops The only usage of irq_generic_chip_ops is to pass its address to irq_domain_add_linear() which takes a pointer to const struct irq_domain_ops. Make it const to allow the compiler to put it in read-only memory. [ tglx: Fixed subject prefix ] Signed-off-by: Rikard Falkeborn Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20211130214043.1257585-1-rikard.falkeborn@gmail.com --- include/linux/irqdomain.h | 2 +- kernel/irq/generic-chip.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 553da4899f55..d476405802e9 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -131,7 +131,7 @@ struct irq_domain_ops { #endif }; -extern struct irq_domain_ops irq_generic_chip_ops; +extern const struct irq_domain_ops irq_generic_chip_ops; struct irq_domain_chip_generic; diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 6f29bf4c8515..f0862eb6b506 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -451,7 +451,7 @@ static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) } -struct irq_domain_ops irq_generic_chip_ops = { +const struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, .unmap = irq_unmap_generic_chip, .xlate = irq_domain_xlate_onetwocell, -- cgit v1.2.3 From d4efb170861827290f7f571020001a60d001faaf Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Nov 2021 05:27:31 +0530 Subject: bpf: Change bpf_kallsyms_lookup_name size type to ARG_CONST_SIZE_OR_ZERO Andrii mentioned in [0] that switching to ARG_CONST_SIZE_OR_ZERO lets user avoid having to prove that string size at runtime is not zero and helps with not having to supress clang optimizations. [0]: https://lore.kernel.org/bpf/CAEf4BzZa_vhXB3c8atNcTS6=krQvC25H7K7c3WWZhM=27ro=Wg@mail.gmail.com Suggested-by: Andrii Nakryiko Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211122235733.634914-2-memxor@gmail.com --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 50f96ea4452a..47089d1d67a4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4804,7 +4804,7 @@ const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM, - .arg2_type = ARG_CONST_SIZE, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, }; -- cgit v1.2.3 From c86ff8c55b8ae68837b2fa59dc0c203907e9a15f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 18 Nov 2021 14:14:36 -0500 Subject: clocksource: Avoid accidental unstable marking of clocksources Since commit db3a34e17433 ("clocksource: Retry clock read if long delays detected") and commit 2e27e793e280 ("clocksource: Reduce clocksource-skew threshold"), it is found that tsc clocksource fallback to hpet can sometimes happen on both Intel and AMD systems especially when they are running stressful benchmarking workloads. Of the 23 systems tested with a v5.14 kernel, 10 of them have switched to hpet clock source during the test run. The result of falling back to hpet is a drastic reduction of performance when running benchmarks. For example, the fio performance tests can drop up to 70% whereas the iperf3 performance can drop up to 80%. 4 hpet fallbacks happened during bootup. They were: [ 8.749399] clocksource: timekeeping watchdog on CPU13: hpet read-back delay of 263750ns, attempt 4, marking unstable [ 12.044610] clocksource: timekeeping watchdog on CPU19: hpet read-back delay of 186166ns, attempt 4, marking unstable [ 17.336941] clocksource: timekeeping watchdog on CPU28: hpet read-back delay of 182291ns, attempt 4, marking unstable [ 17.518565] clocksource: timekeeping watchdog on CPU34: hpet read-back delay of 252196ns, attempt 4, marking unstable Other fallbacks happen when the systems were running stressful benchmarks. For example: [ 2685.867873] clocksource: timekeeping watchdog on CPU117: hpet read-back delay of 57269ns, attempt 4, marking unstable [46215.471228] clocksource: timekeeping watchdog on CPU8: hpet read-back delay of 61460ns, attempt 4, marking unstable Commit 2e27e793e280 ("clocksource: Reduce clocksource-skew threshold"), changed the skew margin from 100us to 50us. I think this is too small and can easily be exceeded when running some stressful workloads on a thermally stressed system. So it is switched back to 100us. Even a maximum skew margin of 100us may be too small in for some systems when booting up especially if those systems are under thermal stress. To eliminate the case that the large skew is due to the system being too busy slowing down the reading of both the watchdog and the clocksource, an extra consecutive read of watchdog clock is being done to check this. The consecutive watchdog read delay is compared against WATCHDOG_MAX_SKEW/2. If the delay exceeds the limit, we assume that the system is just too busy. A warning will be printed to the console and the clock skew check is skipped for this round. Fixes: db3a34e17433 ("clocksource: Retry clock read if long delays detected") Fixes: 2e27e793e280 ("clocksource: Reduce clocksource-skew threshold") Signed-off-by: Waiman Long Signed-off-by: Paul E. McKenney --- kernel/time/clocksource.c | 50 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b8a14d2fb5ba..bcad1a1e5dcf 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -107,7 +107,7 @@ static u64 suspend_start; * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as * a lower bound for cs->uncertainty_margin values when registering clocks. */ -#define WATCHDOG_MAX_SKEW (50 * NSEC_PER_USEC) +#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC) #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); @@ -205,17 +205,24 @@ EXPORT_SYMBOL_GPL(max_cswd_read_retries); static int verify_n_cpus = 8; module_param(verify_n_cpus, int, 0644); -static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) +enum wd_read_status { + WD_READ_SUCCESS, + WD_READ_UNSTABLE, + WD_READ_SKIP +}; + +static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) { unsigned int nretries; - u64 wd_end, wd_delta; - int64_t wd_delay; + u64 wd_end, wd_end2, wd_delta; + int64_t wd_delay, wd_seq_delay; for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) { local_irq_disable(); *wdnow = watchdog->read(watchdog); *csnow = cs->read(cs); wd_end = watchdog->read(watchdog); + wd_end2 = watchdog->read(watchdog); local_irq_enable(); wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask); @@ -226,13 +233,34 @@ static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", smp_processor_id(), watchdog->name, nretries); } - return true; + return WD_READ_SUCCESS; } + + /* + * Now compute delay in consecutive watchdog read to see if + * there is too much external interferences that cause + * significant delay in reading both clocksource and watchdog. + * + * If consecutive WD read-back delay > WATCHDOG_MAX_SKEW/2, + * report system busy, reinit the watchdog and skip the current + * watchdog test. + */ + wd_delta = clocksource_delta(wd_end2, wd_end, watchdog->mask); + wd_seq_delay = clocksource_cyc2ns(wd_delta, watchdog->mult, watchdog->shift); + if (wd_seq_delay > WATCHDOG_MAX_SKEW/2) + goto skip_test; } pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n", smp_processor_id(), watchdog->name, wd_delay, nretries); - return false; + return WD_READ_UNSTABLE; + +skip_test: + pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n", + smp_processor_id(), watchdog->name, wd_seq_delay); + pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n", + cs->name, wd_delay); + return WD_READ_SKIP; } static u64 csnow_mid; @@ -356,6 +384,7 @@ static void clocksource_watchdog(struct timer_list *unused) int next_cpu, reset_pending; int64_t wd_nsec, cs_nsec; struct clocksource *cs; + enum wd_read_status read_ret; u32 md; spin_lock(&watchdog_lock); @@ -373,9 +402,12 @@ static void clocksource_watchdog(struct timer_list *unused) continue; } - if (!cs_watchdog_read(cs, &csnow, &wdnow)) { - /* Clock readout unreliable, so give it up. */ - __clocksource_unstable(cs); + read_ret = cs_watchdog_read(cs, &csnow, &wdnow); + + if (read_ret != WD_READ_SUCCESS) { + if (read_ret == WD_READ_UNSTABLE) + /* Clock readout unreliable, so give it up. */ + __clocksource_unstable(cs); continue; } -- cgit v1.2.3 From 1a5620671a1b6fd9cc08761677d050f1702f910c Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 18 Nov 2021 14:14:37 -0500 Subject: clocksource: Reduce the default clocksource_watchdog() retries to 2 With the previous patch, there is an extra watchdog read in each retry. Now the total number of clocksource reads is increased to 4 per iteration. In order to avoid increasing the clock skew check overhead, the default maximum number of retries is reduced from 3 to 2 to maintain the same 12 clocksource reads in the worst case. Suggested-by: Paul E. McKenney Signed-off-by: Waiman Long Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 4 ++-- kernel/time/clocksource.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9725c546a0d4..3ea934b034f7 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -603,8 +603,8 @@ clocksource.max_cswd_read_retries= [KNL] Number of clocksource_watchdog() retries due to external delays before the clock will be marked - unstable. Defaults to three retries, that is, - four attempts to read the clock under test. + unstable. Defaults to two retries, that is, + three attempts to read the clock under test. clocksource.verify_n_cpus= [KNL] Limit the number of CPUs checked for clocksources diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index bcad1a1e5dcf..b7e52a642948 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -199,7 +199,7 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } -ulong max_cswd_read_retries = 3; +ulong max_cswd_read_retries = 2; module_param(max_cswd_read_retries, ulong, 0644); EXPORT_SYMBOL_GPL(max_cswd_read_retries); static int verify_n_cpus = 8; -- cgit v1.2.3 From e2c73a6860bdf54f2c6bf8cddc34ddc91a1343e1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Sep 2021 14:18:51 -0700 Subject: rcu: Remove the RCU_FAST_NO_HZ Kconfig option All of the uses of CONFIG_RCU_FAST_NO_HZ=y that I have seen involve systems with RCU callbacks offloaded. In this situation, all that this Kconfig option does is slow down idle entry/exit with an additional allways-taken early exit. If this is the only use case, then this Kconfig option nothing but an attractive nuisance that needs to go away. This commit therefore removes the RCU_FAST_NO_HZ Kconfig option. Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.rst | 11 -- Documentation/admin-guide/kernel-parameters.txt | 4 - Documentation/timers/no_hz.rst | 10 +- kernel/rcu/Kconfig | 18 -- kernel/rcu/tree.c | 11 -- kernel/rcu/tree.h | 7 - kernel/rcu/tree_plugin.h | 185 +-------------------- kernel/rcu/tree_stall.h | 27 +-- .../selftests/rcutorture/configs/rcu/TREE01 | 1 - .../selftests/rcutorture/configs/rcu/TREE04 | 1 - .../selftests/rcutorture/doc/TREE_RCU-kconfig.txt | 1 - 11 files changed, 7 insertions(+), 269 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst index 28f8ad16db25..78404625bad2 100644 --- a/Documentation/RCU/stallwarn.rst +++ b/Documentation/RCU/stallwarn.rst @@ -254,17 +254,6 @@ period (in this case 2603), the grace-period sequence number (7075), and an estimate of the total number of RCU callbacks queued across all CPUs (625 in this case). -In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed -for each CPU:: - - 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1 - -The "last_accelerate:" prints the low-order 16 bits (in hex) of the -jiffies counter when this CPU last invoked rcu_try_advance_all_cbs() -from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from -rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle -processing is enabled. - If the grace period ends just as the stall warning starts printing, there will be a spurious stall-warning message, which will include the following:: diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9725c546a0d4..36b83a47e54a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4489,10 +4489,6 @@ on rcutree.qhimark at boot time and to zero to disable more aggressive help enlistment. - rcutree.rcu_idle_gp_delay= [KNL] - Set wakeup interval for idle CPUs that have - RCU callbacks (RCU_FAST_NO_HZ=y). - rcutree.rcu_kick_kthreads= [KNL] Cause the grace-period kthread to get an extra wake_up() if it sleeps three times longer than diff --git a/Documentation/timers/no_hz.rst b/Documentation/timers/no_hz.rst index 20ad23a6c618..f8786be15183 100644 --- a/Documentation/timers/no_hz.rst +++ b/Documentation/timers/no_hz.rst @@ -184,16 +184,12 @@ There are situations in which idle CPUs cannot be permitted to enter either dyntick-idle mode or adaptive-tick mode, the most common being when that CPU has RCU callbacks pending. -The CONFIG_RCU_FAST_NO_HZ=y Kconfig option may be used to cause such CPUs -to enter dyntick-idle mode or adaptive-tick mode anyway. In this case, -a timer will awaken these CPUs every four jiffies in order to ensure -that the RCU callbacks are processed in a timely fashion. - -Another approach is to offload RCU callback processing to "rcuo" kthreads +Avoid this by offloading RCU callback processing to "rcuo" kthreads using the CONFIG_RCU_NOCB_CPU=y Kconfig option. The specific CPUs to offload may be selected using The "rcu_nocbs=" kernel boot parameter, which takes a comma-separated list of CPUs and CPU ranges, for example, -"1,3-5" selects CPUs 1, 3, 4, and 5. +"1,3-5" selects CPUs 1, 3, 4, and 5. Note that CPUs specified by +the "nohz_full" kernel boot parameter are also offloaded. The offloaded CPUs will never queue RCU callbacks, and therefore RCU never prevents offloaded CPUs from entering either dyntick-idle mode diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 3128b7cf8e1f..4005f2728f1a 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -169,24 +169,6 @@ config RCU_FANOUT_LEAF Take the default if unsure. -config RCU_FAST_NO_HZ - bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ_COMMON && SMP && RCU_EXPERT - default n - help - This option permits CPUs to enter dynticks-idle state even if - they have RCU callbacks queued, and prevents RCU from waking - these CPUs up more than roughly once every four jiffies (by - default, you can adjust this using the rcutree.rcu_idle_gp_delay - parameter), thus improving energy efficiency. On the other - hand, this option increases the duration of RCU grace periods, - for example, slowing down synchronize_rcu(). - - Say Y if energy efficiency is critically important, and you - don't care about increased grace-period durations. - - Say N if you are unsure. - config RCU_BOOST bool "Enable RCU priority boosting" depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ef8d36f580fc..5e98257c2910 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -624,7 +624,6 @@ static noinstr void rcu_eqs_enter(bool user) instrumentation_begin(); trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); // instrumentation for the noinstr rcu_dynticks_eqs_enter() @@ -768,9 +767,6 @@ noinstr void rcu_nmi_exit(void) trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - if (!in_nmi()) - rcu_prepare_for_idle(); - // instrumentation for the noinstr rcu_dynticks_eqs_enter() instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); instrumentation_end(); @@ -872,7 +868,6 @@ static void noinstr rcu_eqs_exit(bool user) // instrumentation for the noinstr rcu_dynticks_eqs_exit() instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); - rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); @@ -1014,12 +1009,6 @@ noinstr void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); // ... but is watching here. - if (!in_nmi()) { - instrumentation_begin(); - rcu_cleanup_after_idle(); - instrumentation_end(); - } - instrumentation_begin(); // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks)); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 305cf6aeb408..7ca1aa46083c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -189,11 +189,6 @@ struct rcu_data { bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */ -#ifdef CONFIG_RCU_FAST_NO_HZ - unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ - unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ - int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ -#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ /* 4) rcu_barrier(), OOM callbacks, and expediting. */ struct rcu_head barrier_head; @@ -419,8 +414,6 @@ static bool rcu_is_callbacks_kthread(void); static void rcu_cpu_kthread_setup(unsigned int cpu); static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); static void __init rcu_spawn_boost_kthreads(void); -static void rcu_cleanup_after_idle(void); -static void rcu_prepare_for_idle(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5199559fbbf0..19f7d578cedb 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -51,8 +51,6 @@ static void __init rcu_bootup_announce_oddness(void) RCU_FANOUT); if (rcu_fanout_exact) pr_info("\tHierarchical RCU autobalancing is disabled.\n"); - if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) - pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) @@ -1253,16 +1251,14 @@ static void __init rcu_spawn_boost_kthreads(void) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -#if !defined(CONFIG_RCU_FAST_NO_HZ) - /* * Check to see if any future non-offloaded RCU-related work will need * to be done by the current CPU, even if none need be done immediately, * returning 1 if so. This function is part of the RCU implementation; * it is -not- an exported member of the RCU API. * - * Because we not have RCU_FAST_NO_HZ, just check whether or not this - * CPU has RCU callbacks queued. + * Just check whether or not this CPU has non-offloaded RCU callbacks + * queued. */ int rcu_needs_cpu(u64 basemono, u64 *nextevt) { @@ -1271,183 +1267,6 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); } -/* - * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up - * after it. - */ -static void rcu_cleanup_after_idle(void) -{ -} - -/* - * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, - * is nothing. - */ -static void rcu_prepare_for_idle(void) -{ -} - -#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ - -/* - * This code is invoked when a CPU goes idle, at which point we want - * to have the CPU do everything required for RCU so that it can enter - * the energy-efficient dyntick-idle mode. - * - * The following preprocessor symbol controls this: - * - * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted - * to sleep in dyntick-idle mode with RCU callbacks pending. This - * is sized to be roughly one RCU grace period. Those energy-efficiency - * benchmarkers who might otherwise be tempted to set this to a large - * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your - * system. And if you are -that- concerned about energy efficiency, - * just power the system down and be done with it! - * - * The value below works well in practice. If future workloads require - * adjustment, they can be converted into kernel config parameters, though - * making the state machine smarter might be a better option. - */ -#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ - -static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; -module_param(rcu_idle_gp_delay, int, 0644); - -/* - * Try to advance callbacks on the current CPU, but only if it has been - * awhile since the last time we did so. Afterwards, if there are any - * callbacks ready for immediate invocation, return true. - */ -static bool __maybe_unused rcu_try_advance_all_cbs(void) -{ - bool cbs_ready = false; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp; - - /* Exit early if we advanced recently. */ - if (jiffies == rdp->last_advance_all) - return false; - rdp->last_advance_all = jiffies; - - rnp = rdp->mynode; - - /* - * Don't bother checking unless a grace period has - * completed since we last checked and there are - * callbacks not yet ready to invoke. - */ - if ((rcu_seq_completed_gp(rdp->gp_seq, - rcu_seq_current(&rnp->gp_seq)) || - unlikely(READ_ONCE(rdp->gpwrap))) && - rcu_segcblist_pend_cbs(&rdp->cblist)) - note_gp_changes(rdp); - - if (rcu_segcblist_ready_cbs(&rdp->cblist)) - cbs_ready = true; - return cbs_ready; -} - -/* - * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready - * to invoke. If the CPU has callbacks, try to advance them. Tell the - * caller about what to set the timeout. - * - * The caller must have disabled interrupts. - */ -int rcu_needs_cpu(u64 basemono, u64 *nextevt) -{ - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - unsigned long dj; - - lockdep_assert_irqs_disabled(); - - /* If no non-offloaded callbacks, RCU doesn't need the CPU. */ - if (rcu_segcblist_empty(&rdp->cblist) || - rcu_rdp_is_offloaded(rdp)) { - *nextevt = KTIME_MAX; - return 0; - } - - /* Attempt to advance callbacks. */ - if (rcu_try_advance_all_cbs()) { - /* Some ready to invoke, so initiate later invocation. */ - invoke_rcu_core(); - return 1; - } - rdp->last_accelerate = jiffies; - - /* Request timer and round. */ - dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; - - *nextevt = basemono + dj * TICK_NSEC; - return 0; -} - -/* - * Prepare a CPU for idle from an RCU perspective. The first major task is to - * sense whether nohz mode has been enabled or disabled via sysfs. The second - * major task is to accelerate (that is, assign grace-period numbers to) any - * recently arrived callbacks. - * - * The caller must have disabled interrupts. - */ -static void rcu_prepare_for_idle(void) -{ - bool needwake; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp; - int tne; - - lockdep_assert_irqs_disabled(); - if (rcu_rdp_is_offloaded(rdp)) - return; - - /* Handle nohz enablement switches conservatively. */ - tne = READ_ONCE(tick_nohz_active); - if (tne != rdp->tick_nohz_enabled_snap) { - if (!rcu_segcblist_empty(&rdp->cblist)) - invoke_rcu_core(); /* force nohz to see update. */ - rdp->tick_nohz_enabled_snap = tne; - return; - } - if (!tne) - return; - - /* - * If we have not yet accelerated this jiffy, accelerate all - * callbacks on this CPU. - */ - if (rdp->last_accelerate == jiffies) - return; - rdp->last_accelerate = jiffies; - if (rcu_segcblist_pend_cbs(&rdp->cblist)) { - rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - needwake = rcu_accelerate_cbs(rnp, rdp); - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ - if (needwake) - rcu_gp_kthread_wake(); - } -} - -/* - * Clean up for exit from idle. Attempt to advance callbacks based on - * any grace periods that elapsed while the CPU was idle, and if any - * callbacks are now ready to invoke, initiate invocation. - */ -static void rcu_cleanup_after_idle(void) -{ - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - lockdep_assert_irqs_disabled(); - if (rcu_rdp_is_offloaded(rdp)) - return; - if (rcu_try_advance_all_cbs()) - invoke_rcu_core(); -} - -#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 5e2fa6fd97f1..21bebf7c9030 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -347,26 +347,6 @@ static void rcu_dump_cpu_stacks(void) } } -#ifdef CONFIG_RCU_FAST_NO_HZ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - - sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d", - rdp->last_accelerate & 0xffff, jiffies & 0xffff, - !!rdp->tick_nohz_enabled_snap); -} - -#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - *cp = '\0'; -} - -#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ - static const char * const gp_state_names[] = { [RCU_GP_IDLE] = "RCU_GP_IDLE", [RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS", @@ -408,13 +388,12 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp) * of RCU grace periods that this CPU is ignorant of, for example, "1" * if the CPU was aware of the previous grace period. * - * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + * Also print out idle info. */ static void print_cpu_stall_info(int cpu) { unsigned long delta; bool falsepositive; - char fast_no_hz[72]; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); char *ticks_title; unsigned long ticks_value; @@ -432,11 +411,10 @@ static void print_cpu_stall_info(int cpu) ticks_title = "ticks this GP"; ticks_value = rdp->ticks_this_gp; } - print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n", + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -449,7 +427,6 @@ static void print_cpu_stall_info(int cpu) rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz, falsepositive ? " (false positive?)" : ""); } diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 index b5b53973c01e..8ae41d5f81a3 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01 @@ -6,7 +6,6 @@ CONFIG_PREEMPT=y CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=y CONFIG_NO_HZ_FULL=n -CONFIG_RCU_FAST_NO_HZ=y CONFIG_RCU_TRACE=y CONFIG_HOTPLUG_CPU=y CONFIG_MAXSMP=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 index f6d6a40c0576..22ad0261728d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04 @@ -7,7 +7,6 @@ CONFIG_PREEMPT=n CONFIG_HZ_PERIODIC=n CONFIG_NO_HZ_IDLE=n CONFIG_NO_HZ_FULL=y -CONFIG_RCU_FAST_NO_HZ=y CONFIG_RCU_TRACE=y CONFIG_RCU_FANOUT=4 CONFIG_RCU_FANOUT_LEAF=3 diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt index 1b96d68473b8..42acb1a64ce1 100644 --- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt +++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt @@ -15,7 +15,6 @@ CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING. CONFIG_RCU_BOOST -- one of PREEMPT_RCU. CONFIG_RCU_FANOUT -- Cover hierarchy, but overlap with others. CONFIG_RCU_FANOUT_LEAF -- Do one non-default. -CONFIG_RCU_FAST_NO_HZ -- Do one, but not with all nohz_full CPUs. CONFIG_RCU_NOCB_CPU -- Do three, one with no rcu_nocbs CPUs, one with rcu_nocbs=0, and one with all rcu_nocbs CPUs. CONFIG_RCU_TRACE -- Do half. -- cgit v1.2.3 From bc849e9192c75833a85f2e9376a265ab31f8eec7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Sep 2021 14:30:20 -0700 Subject: rcu: Move rcu_needs_cpu() to tree.c Now that RCU_FAST_NO_HZ is no more, there is but one implementation of the rcu_needs_cpu() function. This commit therefore moves this function from kernel/rcu/tree_plugin.c to kernel/rcu/tree.c. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 ++++++++++++++++++ kernel/rcu/tree_plugin.h | 16 ---------------- 2 files changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5e98257c2910..4ac019e9b25f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1075,6 +1075,24 @@ void rcu_irq_enter_irqson(void) local_irq_restore(flags); } +/* + * Check to see if any future non-offloaded RCU-related work will need + * to be done by the current CPU, even if none need be done immediately, + * returning 1 if so. This function is part of the RCU implementation; + * it is -not- an exported member of the RCU API. This is used by + * the idle-entry code to figure out whether it is safe to disable the + * scheduler-clock interrupt. + * + * Just check whether or not this CPU has non-offloaded RCU callbacks + * queued. + */ +int rcu_needs_cpu(u64 basemono, u64 *nextevt) +{ + *nextevt = KTIME_MAX; + return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && + !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); +} + /* * If any sort of urgency was applied to the current CPU (for example, * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 19f7d578cedb..0575757a0f8f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1251,22 +1251,6 @@ static void __init rcu_spawn_boost_kthreads(void) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -/* - * Check to see if any future non-offloaded RCU-related work will need - * to be done by the current CPU, even if none need be done immediately, - * returning 1 if so. This function is part of the RCU implementation; - * it is -not- an exported member of the RCU API. - * - * Just check whether or not this CPU has non-offloaded RCU callbacks - * queued. - */ -int rcu_needs_cpu(u64 basemono, u64 *nextevt) -{ - *nextevt = KTIME_MAX; - return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && - !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); -} - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? -- cgit v1.2.3 From 2407a64f8045552203ee5cb9904ce75ce2fceef4 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 28 Sep 2021 08:21:28 +0800 Subject: rcu: in_irq() cleanup This commit replaces the obsolete and ambiguous macro in_irq() with its shiny new in_hardirq() equivalent. Signed-off-by: Changbin Du Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 2 +- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 9be015305f9f..858f4d429946 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -85,7 +85,7 @@ static inline void rcu_irq_enter_irqson(void) { } static inline void rcu_irq_exit(void) { } static inline void rcu_irq_exit_check_preempt(void) { } #define rcu_is_idle_cpu(cpu) \ - (is_idle_task(current) && !in_nmi() && !in_irq() && !in_serving_softirq()) + (is_idle_task(current) && !in_nmi() && !in_hardirq() && !in_serving_softirq()) static inline void exit_rcu(void) { } static inline bool rcu_preempt_need_deferred_qs(struct task_struct *t) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ef8d36f580fc..f0f19dc7f19e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1467,7 +1467,7 @@ static void rcu_gp_kthread_wake(void) { struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); - if ((current == t && !in_irq() && !in_serving_softirq()) || + if ((current == t && !in_hardirq() && !in_serving_softirq()) || !READ_ONCE(rcu_state.gp_flags) || !t) return; WRITE_ONCE(rcu_state.gp_wake_time, jiffies); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5199559fbbf0..599084c4c21f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -642,7 +642,7 @@ static void rcu_read_unlock_special(struct task_struct *t) (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node); // Need to defer quiescent state until everything is enabled. - if (use_softirq && (in_irq() || (expboost && !irqs_were_disabled))) { + if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) { // Using softirq, safe to awaken, and either the // wakeup is free or there is either an expedited // GP in flight or a potential need to deboost. -- cgit v1.2.3 From 17ea3718824912e773b0fd78579694b2e75ee597 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Sun, 24 Oct 2021 08:36:34 +0800 Subject: rcu: Improve tree_plugin.h comments and add code cleanups This commit cleans up some comments and code in kernel/rcu/tree_plugin.h. Signed-off-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 599084c4c21f..f44580276fbd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -16,7 +16,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) { /* - * In order to read the offloaded state of an rdp is a safe + * In order to read the offloaded state of an rdp in a safe * and stable way and prevent from its value to be changed * under us, we must either hold the barrier mutex, the cpu * hotplug lock (read or write) or the nocb lock. Local @@ -56,7 +56,7 @@ static void __init rcu_bootup_announce_oddness(void) if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) - pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n"); + pr_info("\tRCU strict (and thus non-scalable) grace periods are enabled.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) @@ -88,13 +88,13 @@ static void __init rcu_bootup_announce_oddness(void) if (rcu_kick_kthreads) pr_info("\tKick kthreads if too-long grace period.\n"); if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) - pr_info("\tRCU callback double-/use-after-free debug enabled.\n"); + pr_info("\tRCU callback double-/use-after-free debug is enabled.\n"); if (gp_preinit_delay) pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay); if (gp_init_delay) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); if (gp_cleanup_delay) - pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay); if (!use_softirq) pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) @@ -1153,7 +1153,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) /* * Create an RCU-boost kthread for the specified node if one does not * already exist. We only create this kthread for preemptible RCU. - * Returns zero if all is well, a negated errno otherwise. */ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { @@ -1455,7 +1454,7 @@ static void rcu_cleanup_after_idle(void) * CPU unless the grace period has extended for too long. * * This code relies on the fact that all NO_HZ_FULL CPUs are also - * CONFIG_RCU_NOCB_CPU CPUs. + * RCU_NOCB_CPU CPUs. */ static bool rcu_nohz_full_cpu(void) { -- cgit v1.2.3 From c2cf0767e98eb4487444e5c7ebba491a866811ce Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 15 Nov 2021 13:15:46 +0800 Subject: rcu: Avoid running boost kthreads on isolated CPUs When the boost kthreads are created on systems with nohz_full CPUs, the cpus_allowed_ptr is set to housekeeping_cpumask(HK_FLAG_KTHREAD). However, when the rcu_boost_kthread_setaffinity() is called, the original affinity will be changed and these kthreads can subsequently run on nohz_full CPUs. This commit makes rcu_boost_kthread_setaffinity() restrict these boost kthreads to housekeeping CPUs. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f44580276fbd..87fc4609b756 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1203,8 +1203,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) if ((mask & leaf_node_cpu_bit(rnp, cpu)) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); + cpumask_and(cm, cm, housekeeping_cpumask(HK_FLAG_RCU)); if (cpumask_weight(cm) == 0) - cpumask_setall(cm); + cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU)); set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } -- cgit v1.2.3 From 300c0c5e721834f484b03fa3062602dd8ff48413 Mon Sep 17 00:00:00 2001 From: Jun Miao Date: Tue, 16 Nov 2021 07:23:02 +0800 Subject: rcu: Avoid alloc_pages() when recording stack The default kasan_record_aux_stack() calls stack_depot_save() with GFP_NOWAIT, which in turn can then call alloc_pages(GFP_NOWAIT, ...). In general, however, it is not even possible to use either GFP_ATOMIC nor GFP_NOWAIT in certain non-preemptive contexts/RT kernel including raw_spin_locks (see gfp.h and ab00db216c9c7). Fix it by instructing stackdepot to not expand stack storage via alloc_pages() in case it runs out by using kasan_record_aux_stack_noalloc(). Jianwei Hu reported: BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:969 in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 15319, name: python3 INFO: lockdep is turned off. irq event stamp: 0 hardirqs last enabled at (0): [<0000000000000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0xaf3/0x2590 softirqs last enabled at (0): [] copy_process+0xaf3/0x2590 softirqs last disabled at (0): [<0000000000000000>] 0x0 CPU: 6 PID: 15319 Comm: python3 Tainted: G W O 5.15-rc7-preempt-rt #1 Hardware name: Supermicro SYS-E300-9A-8C/A2SDi-8C-HLN4F, BIOS 1.1b 12/17/2018 Call Trace: show_stack+0x52/0x58 dump_stack+0xa1/0xd6 ___might_sleep.cold+0x11c/0x12d rt_spin_lock+0x3f/0xc0 rmqueue+0x100/0x1460 rmqueue+0x100/0x1460 mark_usage+0x1a0/0x1a0 ftrace_graph_ret_addr+0x2a/0xb0 rmqueue_pcplist.constprop.0+0x6a0/0x6a0 __kasan_check_read+0x11/0x20 __zone_watermark_ok+0x114/0x270 get_page_from_freelist+0x148/0x630 is_module_text_address+0x32/0xa0 __alloc_pages_nodemask+0x2f6/0x790 __alloc_pages_slowpath.constprop.0+0x12d0/0x12d0 create_prof_cpu_mask+0x30/0x30 alloc_pages_current+0xb1/0x150 stack_depot_save+0x39f/0x490 kasan_save_stack+0x42/0x50 kasan_save_stack+0x23/0x50 kasan_record_aux_stack+0xa9/0xc0 __call_rcu+0xff/0x9c0 call_rcu+0xe/0x10 put_object+0x53/0x70 __delete_object+0x7b/0x90 kmemleak_free+0x46/0x70 slab_free_freelist_hook+0xb4/0x160 kfree+0xe5/0x420 kfree_const+0x17/0x30 kobject_cleanup+0xaa/0x230 kobject_put+0x76/0x90 netdev_queue_update_kobjects+0x17d/0x1f0 ... ... ksys_write+0xd9/0x180 __x64_sys_write+0x42/0x50 do_syscall_64+0x38/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Links: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/include/linux/kasan.h?id=7cb3007ce2da27ec02a1a3211941e7fe6875b642 Fixes: 84109ab58590 ("rcu: Record kvfree_call_rcu() call stack for KASAN") Fixes: 26e760c9a7c8 ("rcu: kasan: record and print call_rcu() call stack") Reported-by: Jianwei Hu Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Marco Elver Tested-by: Juri Lelli Signed-off-by: Jun Miao Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f0f19dc7f19e..c8beea8dd3a0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2982,7 +2982,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) head->func = func; head->next = NULL; local_irq_save(flags); - kasan_record_aux_stack(head); + kasan_record_aux_stack_noalloc(head); rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ @@ -3547,7 +3547,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) return; } - kasan_record_aux_stack(ptr); + kasan_record_aux_stack_noalloc(ptr); success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); if (!success) { run_page_cache_worker(krcp); -- cgit v1.2.3 From 1f8da406a964dcc01121385a54fafd28e1c6a796 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 23 Sep 2021 10:07:14 -0700 Subject: srcu: Prevent redundant __srcu_read_unlock() wakeup Tiny SRCU readers can appear at task level, but also in interrupt and softirq handlers. Because Tiny SRCU is selected only in kernels built with CONFIG_SMP=n and CONFIG_PREEMPTION=n, it is not possible for a grace period to start while there is a non-task-level SRCU reader executing. This means that it does not make sense for __srcu_read_unlock() to awaken the Tiny SRCU grace period, because that can only happen when the grace period is waiting for one value of ->srcu_idx and __srcu_read_unlock() is ending the last reader for some other value of ->srcu_idx. After all, any such wakeup will be redundant. Worse yet, in some cases, such wakeups generate lockdep splats: ====================================================== WARNING: possible circular locking dependency detected 5.15.0-rc1+ #3758 Not tainted ------------------------------------------------------ rcu_torture_rea/53 is trying to acquire lock: ffffffff9514e6a8 (srcu_ctl.srcu_wq.lock){..-.}-{2:2}, at: xa/0x30 but task is already holding lock: ffff95c642479d80 (&p->pi_lock){-.-.}-{2:2}, at: _extend+0x370/0x400 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&p->pi_lock){-.-.}-{2:2}: _raw_spin_lock_irqsave+0x2f/0x50 try_to_wake_up+0x50/0x580 swake_up_locked.part.7+0xe/0x30 swake_up_one+0x22/0x30 rcutorture_one_extend+0x1b6/0x400 rcu_torture_one_read+0x290/0x5d0 rcu_torture_timer+0x1a/0x70 call_timer_fn+0xa6/0x230 run_timer_softirq+0x493/0x4c0 __do_softirq+0xc0/0x371 irq_exit+0x73/0x90 sysvec_apic_timer_interrupt+0x63/0x80 asm_sysvec_apic_timer_interrupt+0x12/0x20 default_idle+0xb/0x10 default_idle_call+0x5e/0x170 do_idle+0x18a/0x1f0 cpu_startup_entry+0xa/0x10 start_kernel+0x678/0x69f secondary_startup_64_no_verify+0xc2/0xcb -> #0 (srcu_ctl.srcu_wq.lock){..-.}-{2:2}: __lock_acquire+0x130c/0x2440 lock_acquire+0xc2/0x270 _raw_spin_lock_irqsave+0x2f/0x50 swake_up_one+0xa/0x30 rcutorture_one_extend+0x387/0x400 rcu_torture_one_read+0x290/0x5d0 rcu_torture_reader+0xac/0x200 kthread+0x12d/0x150 ret_from_fork+0x22/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&p->pi_lock); lock(srcu_ctl.srcu_wq.lock); lock(&p->pi_lock); lock(srcu_ctl.srcu_wq.lock); *** DEADLOCK *** 1 lock held by rcu_torture_rea/53: #0: ffff95c642479d80 (&p->pi_lock){-.-.}-{2:2}, at: _extend+0x370/0x400 stack backtrace: CPU: 0 PID: 53 Comm: rcu_torture_rea Not tainted 5.15.0-rc1+ Hardware name: Red Hat KVM/RHEL-AV, BIOS e_el8.5.0+746+bbd5d70c 04/01/2014 Call Trace: check_noncircular+0xfe/0x110 ? find_held_lock+0x2d/0x90 __lock_acquire+0x130c/0x2440 lock_acquire+0xc2/0x270 ? swake_up_one+0xa/0x30 ? find_held_lock+0x72/0x90 _raw_spin_lock_irqsave+0x2f/0x50 ? swake_up_one+0xa/0x30 swake_up_one+0xa/0x30 rcutorture_one_extend+0x387/0x400 rcu_torture_one_read+0x290/0x5d0 rcu_torture_reader+0xac/0x200 ? rcutorture_oom_notify+0xf0/0xf0 ? __kthread_parkme+0x61/0x90 ? rcu_torture_one_read+0x5d0/0x5d0 kthread+0x12d/0x150 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 This is a false positive because there is only one CPU, and both locks are raw (non-preemptible) spinlocks. However, it is worthwhile getting rid of the redundant wakeup, which has the side effect of breaking the theoretical deadlock cycle. This commit therefore eliminates the redundant wakeups. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutiny.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index a0ba2ed49bc6..92c002d65482 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -99,7 +99,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); - if (!newval && READ_ONCE(ssp->srcu_gp_waiting)) + if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task()) swake_up_one(&ssp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); -- cgit v1.2.3 From f5dbc594b5bac1fa694174032b8d3d0249945fd3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 18 Sep 2021 20:40:48 -0700 Subject: rcu-tasks: Don't remove tasks with pending IPIs from holdout list Currently, the check_all_holdout_tasks_trace() function removes all tasks marked with ->trc_reader_checked from the holdout list, including those with IPIs pending. This means that the IPI handler might arrive at a task that has already been removed from the list, which is at best an accident waiting to happen. This commit therefore avoids removing tasks with IPIs pending from the holdout list. This in turn means that the "if" condition in the for_each_online_cpu() loop in rcu_tasks_trace_postgp() should always evaluate to false, so a WARN_ON_ONCE() is added to check that. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 7da3c81c3f59..bd44cd4794d3 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1121,7 +1121,8 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, trc_wait_for_one_reader(t, hop); // If check succeeded, remove this task from the list. - if (READ_ONCE(t->trc_reader_checked)) + if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 && + READ_ONCE(t->trc_reader_checked)) trc_del_holdout(t); else if (needreport) show_stalled_task_trace(t, firstreport); @@ -1156,7 +1157,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) // Yes, this assumes that CPUs process IPIs in order. If that ever // changes, there will need to be a recheck and/or timed wait. for_each_online_cpu(cpu) - if (smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))) + if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu)))) smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1); // Remove the safety count. -- cgit v1.2.3 From 902d82e6299631ec6b6c759ac4b45e44b45cc832 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 Sep 2021 20:31:44 -0700 Subject: rcutorture: Sanitize RCUTORTURE_RDR_MASK RCUTORTURE_RDR_MASK is currently not the bit indicated by RCUTORTURE_RDR_SHIFT, but is instead all the bits less significant than that one. This is an accident waiting to happen, so this commit makes RCUTORTURE_RDR_MASK be that one bit and adjusts uses accordingly. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8b410d982990..1e03fe681f02 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -54,7 +54,7 @@ MODULE_AUTHOR("Paul E. McKenney and Josh Triplett extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) +#define RCUTORTURE_RDR_MASK (1 << RCUTORTURE_RDR_SHIFT) #define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ #define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ #define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ @@ -1466,6 +1466,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, if (lockit) raw_spin_lock_irqsave(¤t->pi_lock, flags); cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); + idxold = 0; if (lockit) raw_spin_unlock_irqrestore(¤t->pi_lock, flags); } @@ -1476,7 +1477,7 @@ static void rcutorture_one_extend(int *readstate, int newstate, /* Update the reader state. */ if (idxnew == -1) - idxnew = idxold & ~RCUTORTURE_RDR_MASK; + idxnew = idxold & RCUTORTURE_RDR_MASK; WARN_ON_ONCE(idxnew < 0); WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1); *readstate = idxnew | newstate; @@ -1626,7 +1627,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) rcu_torture_writer_state, cookie, cur_ops->get_gp_state()); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); - WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + WARN_ON_ONCE(readstate); // This next splat is expected behavior if leakpointer, especially // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); -- cgit v1.2.3 From 1c3d53986f744c469f0e0b136b8922c83363bfde Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 22 Sep 2021 20:49:12 -0700 Subject: rcutorture: More thoroughly test nested readers Currently, nested readers occur only when a timer handler interrupts a reader. This is rare, and is thus insufficient testing of the transition between nesting levels. This commit therefore causes rcutorture nested readers to be the rule rather than the exception. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 73 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1e03fe681f02..d295da380fb4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -53,15 +53,18 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); /* Bits for ->extendables field, extendables param, and related definitions. */ -#define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ -#define RCUTORTURE_RDR_MASK (1 << RCUTORTURE_RDR_SHIFT) +#define RCUTORTURE_RDR_SHIFT_1 8 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK_1 (1 << RCUTORTURE_RDR_SHIFT_1) +#define RCUTORTURE_RDR_SHIFT_2 9 /* Put SRCU index in upper bits. */ +#define RCUTORTURE_RDR_MASK_2 (1 << RCUTORTURE_RDR_SHIFT_2) #define RCUTORTURE_RDR_BH 0x01 /* Extend readers by disabling bh. */ #define RCUTORTURE_RDR_IRQ 0x02 /* ... disabling interrupts. */ #define RCUTORTURE_RDR_PREEMPT 0x04 /* ... disabling preemption. */ #define RCUTORTURE_RDR_RBH 0x08 /* ... rcu_read_lock_bh(). */ #define RCUTORTURE_RDR_SCHED 0x10 /* ... rcu_read_lock_sched(). */ -#define RCUTORTURE_RDR_RCU 0x20 /* ... entering another RCU reader. */ -#define RCUTORTURE_RDR_NBITS 6 /* Number of bits defined above. */ +#define RCUTORTURE_RDR_RCU_1 0x20 /* ... entering another RCU reader. */ +#define RCUTORTURE_RDR_RCU_2 0x40 /* ... entering another RCU reader. */ +#define RCUTORTURE_RDR_NBITS 7 /* Number of bits defined above. */ #define RCUTORTURE_MAX_EXTEND \ (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_IRQ | RCUTORTURE_RDR_PREEMPT | \ RCUTORTURE_RDR_RBH | RCUTORTURE_RDR_SCHED) @@ -1420,13 +1423,15 @@ static void rcutorture_one_extend(int *readstate, int newstate, struct rt_read_seg *rtrsp) { unsigned long flags; - int idxnew = -1; - int idxold = *readstate; + int idxnew1 = -1; + int idxnew2 = -1; + int idxold1 = *readstate; + int idxold2 = idxold1; int statesnew = ~*readstate & newstate; int statesold = *readstate & ~newstate; - WARN_ON_ONCE(idxold < 0); - WARN_ON_ONCE((idxold >> RCUTORTURE_RDR_SHIFT) > 1); + WARN_ON_ONCE(idxold2 < 0); + WARN_ON_ONCE((idxold2 >> RCUTORTURE_RDR_SHIFT_2) > 1); rtrsp->rt_readstate = newstate; /* First, put new protection in place to avoid critical-section gap. */ @@ -1440,8 +1445,10 @@ static void rcutorture_one_extend(int *readstate, int newstate, preempt_disable(); if (statesnew & RCUTORTURE_RDR_SCHED) rcu_read_lock_sched(); - if (statesnew & RCUTORTURE_RDR_RCU) - idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT; + if (statesnew & RCUTORTURE_RDR_RCU_1) + idxnew1 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_1; + if (statesnew & RCUTORTURE_RDR_RCU_2) + idxnew2 = (cur_ops->readlock() & 0x1) << RCUTORTURE_RDR_SHIFT_2; /* * Next, remove old protection, in decreasing order of strength @@ -1460,13 +1467,19 @@ static void rcutorture_one_extend(int *readstate, int newstate, local_bh_enable(); if (statesold & RCUTORTURE_RDR_RBH) rcu_read_unlock_bh(); - if (statesold & RCUTORTURE_RDR_RCU) { + if (statesold & RCUTORTURE_RDR_RCU_2) { + cur_ops->readunlock((idxold2 >> RCUTORTURE_RDR_SHIFT_2) & 0x1); + WARN_ON_ONCE(idxnew2 != -1); + idxold2 = 0; + } + if (statesold & RCUTORTURE_RDR_RCU_1) { bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); if (lockit) raw_spin_lock_irqsave(¤t->pi_lock, flags); - cur_ops->readunlock(idxold >> RCUTORTURE_RDR_SHIFT); - idxold = 0; + cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1); + WARN_ON_ONCE(idxnew1 != -1); + idxold1 = 0; if (lockit) raw_spin_unlock_irqrestore(¤t->pi_lock, flags); } @@ -1476,13 +1489,19 @@ static void rcutorture_one_extend(int *readstate, int newstate, cur_ops->read_delay(trsp, rtrsp); /* Update the reader state. */ - if (idxnew == -1) - idxnew = idxold & RCUTORTURE_RDR_MASK; - WARN_ON_ONCE(idxnew < 0); - WARN_ON_ONCE((idxnew >> RCUTORTURE_RDR_SHIFT) > 1); - *readstate = idxnew | newstate; - WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) < 0); - WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT) > 1); + if (idxnew1 == -1) + idxnew1 = idxold1 & RCUTORTURE_RDR_MASK_1; + WARN_ON_ONCE(idxnew1 < 0); + if (WARN_ON_ONCE((idxnew1 >> RCUTORTURE_RDR_SHIFT_1) > 1)) + pr_info("Unexpected idxnew1 value of %#x\n", idxnew1); + if (idxnew2 == -1) + idxnew2 = idxold2 & RCUTORTURE_RDR_MASK_2; + WARN_ON_ONCE(idxnew2 < 0); + WARN_ON_ONCE((idxnew2 >> RCUTORTURE_RDR_SHIFT_2) > 1); + *readstate = idxnew1 | idxnew2 | newstate; + WARN_ON_ONCE(*readstate < 0); + if (WARN_ON_ONCE((*readstate >> RCUTORTURE_RDR_SHIFT_2) > 1)) + pr_info("Unexpected idxnew2 value of %#x\n", idxnew2); } /* Return the biggest extendables mask given current RCU and boot parameters. */ @@ -1492,7 +1511,7 @@ static int rcutorture_extend_mask_max(void) WARN_ON_ONCE(extendables & ~RCUTORTURE_MAX_EXTEND); mask = extendables & RCUTORTURE_MAX_EXTEND & cur_ops->extendables; - mask = mask | RCUTORTURE_RDR_RCU; + mask = mask | RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2; return mask; } @@ -1507,13 +1526,21 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; - WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); + WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT_1); /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; else mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS)); + // Can't have nested RCU reader without outer RCU reader. + if (!(mask & RCUTORTURE_RDR_RCU_1) && (mask & RCUTORTURE_RDR_RCU_2)) { + if (oldmask & RCUTORTURE_RDR_RCU_1) + mask &= ~RCUTORTURE_RDR_RCU_2; + else + mask |= RCUTORTURE_RDR_RCU_1; + } + /* * Can't enable bh w/irq disabled. */ @@ -1533,7 +1560,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) mask |= oldmask & bhs; } - return mask ?: RCUTORTURE_RDR_RCU; + return mask ?: RCUTORTURE_RDR_RCU_1; } /* -- cgit v1.2.3 From 340170fef01b18631128006de03522b671e6e2b5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Sep 2021 21:30:26 -0700 Subject: rcutorture: Suppress pi-lock-across read-unlock testing for Tiny SRCU Because Tiny srcu_read_unlock() directly calls swake_up_one(), lockdep complains when a pi lock is held across that srcu_read_unlock(). Although this is a lockdep false positive (there is no other CPU to complete the deadlock cycle), lockdep is what it is at the moment. This commit therefore prevents rcutorture from holding pi lock across a Tiny srcu_read_unlock(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d295da380fb4..503e14e62e8f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -349,6 +349,7 @@ struct rcu_torture_ops { int can_boost; int extendables; int slow_gps; + int no_pi_lock; const char *name; }; @@ -670,6 +671,7 @@ static struct rcu_torture_ops srcu_ops = { .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcu" }; @@ -703,6 +705,7 @@ static struct rcu_torture_ops srcud_ops = { .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .name = "srcud" }; @@ -723,6 +726,7 @@ static struct rcu_torture_ops busted_srcud_ops = { .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .irq_capable = 1, + .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), .extendables = RCUTORTURE_MAX_EXTEND, .name = "busted_srcud" }; @@ -1473,8 +1477,9 @@ static void rcutorture_one_extend(int *readstate, int newstate, idxold2 = 0; } if (statesold & RCUTORTURE_RDR_RCU_1) { - bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); + bool lockit; + lockit = !cur_ops->no_pi_lock && !statesnew && !(torture_random(trsp) & 0xffff); if (lockit) raw_spin_lock_irqsave(¤t->pi_lock, flags); cur_ops->readunlock((idxold1 >> RCUTORTURE_RDR_SHIFT_1) & 0x1); -- cgit v1.2.3 From c30c876312f66b0422a1d632b8c45b6416deeec0 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 25 Oct 2021 11:26:56 +0800 Subject: refscale: Simplify the errexit checkpoint There is only the one OOM error case in main_func(), so this commit eliminates the errexit local variable in favor of a branch to cleanup code. Signed-off-by: Li Zhijian Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 1631ef8a138d..b59457cef88d 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -637,7 +637,6 @@ static u64 process_durations(int n) // point all the timestamps are printed. static int main_func(void *arg) { - bool errexit = false; int exp, r; char buf1[64]; char *buf; @@ -651,7 +650,7 @@ static int main_func(void *arg) buf = kzalloc(64 + nruns * 32, GFP_KERNEL); if (!result_avg || !buf) { VERBOSE_SCALEOUT_ERRSTRING("out of memory"); - errexit = true; + goto oom_exit; } if (holdoff) schedule_timeout_interruptible(holdoff * HZ); @@ -663,8 +662,6 @@ static int main_func(void *arg) // Start exp readers up per experiment for (exp = 0; exp < nruns && !torture_must_stop(); exp++) { - if (errexit) - break; if (torture_must_stop()) goto end; @@ -698,26 +695,22 @@ static int main_func(void *arg) // Print the average of all experiments SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); - if (!errexit) { - buf[0] = 0; - strcat(buf, "\n"); - strcat(buf, "Runs\tTime(ns)\n"); - } + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Runs\tTime(ns)\n"); for (exp = 0; exp < nruns; exp++) { u64 avg; u32 rem; - if (errexit) - break; avg = div_u64_rem(result_avg[exp], 1000, &rem); sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); strcat(buf, buf1); } - if (!errexit) - SCALEOUT("%s", buf); + SCALEOUT("%s", buf); +oom_exit: // This will shutdown everything including us. if (shutdown) { shutdown_start = 1; -- cgit v1.2.3 From 9880eb878c315b241002d43d7ff89df23713a51b Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 25 Oct 2021 11:26:57 +0800 Subject: refscale: Prevent buffer to pr_alert() being too long 0Day/LKP observed that the refscale results fail to complete when larger values of nrun (such as 300) are specified. The problem is that printk() can accept at most a 1024-byte buffer. This commit therefore prints the buffer whenever its length exceeds 800 bytes. CC: Philip Li Reported-by: kernel test robot Signed-off-by: Li Zhijian Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index b59457cef88d..d8ed1ca3adc0 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -604,7 +604,7 @@ static u64 process_durations(int n) char *buf; u64 sum = 0; - buf = kmalloc(128 + nreaders * 32, GFP_KERNEL); + buf = kmalloc(800 + 64, GFP_KERNEL); if (!buf) return 0; buf[0] = 0; @@ -617,13 +617,15 @@ static u64 process_durations(int n) if (i % 5 == 0) strcat(buf, "\n"); + if (strlen(buf) >= 800) { + pr_alert("%s", buf); + buf[0] = 0; + } strcat(buf, buf1); sum += rt->last_duration_ns; } - strcat(buf, "\n"); - - SCALEOUT("%s\n", buf); + pr_alert("%s\n", buf); kfree(buf); return sum; @@ -647,7 +649,7 @@ static int main_func(void *arg) VERBOSE_SCALEOUT("main_func task started"); result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); - buf = kzalloc(64 + nruns * 32, GFP_KERNEL); + buf = kzalloc(800 + 64, GFP_KERNEL); if (!result_avg || !buf) { VERBOSE_SCALEOUT_ERRSTRING("out of memory"); goto oom_exit; @@ -695,10 +697,7 @@ static int main_func(void *arg) // Print the average of all experiments SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); - buf[0] = 0; - strcat(buf, "\n"); - strcat(buf, "Runs\tTime(ns)\n"); - + pr_alert("Runs\tTime(ns)\n"); for (exp = 0; exp < nruns; exp++) { u64 avg; u32 rem; @@ -706,9 +705,13 @@ static int main_func(void *arg) avg = div_u64_rem(result_avg[exp], 1000, &rem); sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem); strcat(buf, buf1); + if (strlen(buf) >= 800) { + pr_alert("%s", buf); + buf[0] = 0; + } } - SCALEOUT("%s", buf); + pr_alert("%s", buf); oom_exit: // This will shutdown everything including us. -- cgit v1.2.3 From 443378f0664a78756c3e3aeaab92750fe1e05735 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Nov 2021 17:00:30 -0800 Subject: workqueue: Upgrade queue_work_on() comment The current queue_work_on() docbook comment says that the caller must ensure that the specified CPU can't go away, but does not spell out the consequences, which turn out to be quite mild. Therefore expand this comment to explicitly say that the penalty for failing to nail down the specified CPU is that the workqueue handler might find itself executing on some other CPU. Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Paul E. McKenney Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 613917bbc4e7..332361cf215f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1531,7 +1531,8 @@ out: * @work: work to queue * * We queue the work to a specific CPU, the caller must ensure it - * can't go away. + * can't go away. Callers that fail to ensure that the specified + * CPU cannot go away will execute on a randomly chosen CPU. * * Return: %false if @work was already on a queue, %true otherwise. */ -- cgit v1.2.3 From 436d404cc8ff573a417cb3b6a5c76655121aceac Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 1 Dec 2021 15:34:57 +0800 Subject: bpf: Clean-up bpf_verifier_vlog() for BPF_LOG_KERNEL log level An extra newline will output for bpf_log() with BPF_LOG_KERNEL level as shown below: [ 52.095704] BPF:The function test_3 has 12 arguments. Too many. [ 52.095704] [ 52.096896] Error in parsing func ptr test_3 in struct bpf_dummy_ops Now all bpf_log() are ended by newline, but not all btf_verifier_log() are ended by newline, so checking whether or not the log message has the trailing newline and adding a newline if not. Also there is no need to calculate the left userspace buffer size for kernel log output and to truncate the output by '\0' which has already been done by vscnprintf(), so only do these for userspace log output. Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20211201073458.2731595-2-houtao1@huawei.com --- kernel/bpf/verifier.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d7678d8a925c..6c9c0d9a04a0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -293,13 +293,15 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, "verifier log line truncated - local buffer too short\n"); - n = min(log->len_total - log->len_used - 1, n); - log->kbuf[n] = '\0'; - if (log->level == BPF_LOG_KERNEL) { - pr_err("BPF:%s\n", log->kbuf); + bool newline = n > 0 && log->kbuf[n - 1] == '\n'; + + pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n"); return; } + + n = min(log->len_total - log->len_used - 1, n); + log->kbuf[n] = '\0'; if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) log->len_used += n; else -- cgit v1.2.3 From 450fec13d9170127678f991698ac1a5b05c02e2f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 30 Nov 2021 12:31:23 -0500 Subject: tracing/histograms: String compares should not care about signed values When comparing two strings for the "onmatch" histogram trigger, fields that are strings use string comparisons, which do not care about being signed or not. Do not fail to match two string fields if one is unsigned char array and the other is a signed char array. Link: https://lore.kernel.org/all/20211129123043.5cfd687a@gandalf.local.home/ Cc: stable@vgerk.kernel.org Cc: Tom Zanussi Cc: Yafang Shao Fixes: b05e89ae7cf3b ("tracing: Accept different type for synthetic event fields") Reviewed-by: Masami Hiramatsu Reported-by: Sven Schnelle Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9555b8e1d1e3..319f9c8ca7e7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3757,7 +3757,7 @@ static int check_synth_field(struct synth_event *event, if (strcmp(field->type, hist_field->type) != 0) { if (field->size != hist_field->size || - field->is_signed != hist_field->is_signed) + (!field->is_string && field->is_signed != hist_field->is_signed)) return -EINVAL; } -- cgit v1.2.3 From f25667e5980a4333729cac3101e5de1bb851f71a Mon Sep 17 00:00:00 2001 From: Chen Jun Date: Wed, 24 Nov 2021 14:08:01 +0000 Subject: tracing: Fix a kmemleak false positive in tracing_map Doing the command: echo 'hist:key=common_pid.execname,common_timestamp' > /sys/kernel/debug/tracing/events/xxx/trigger Triggers many kmemleak reports: unreferenced object 0xffff0000c7ea4980 (size 128): comm "bash", pid 338, jiffies 4294912626 (age 9339.324s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000f3469921>] kmem_cache_alloc_trace+0x4c0/0x6f0 [<0000000054ca40c3>] hist_trigger_elt_data_alloc+0x140/0x178 [<00000000633bd154>] tracing_map_init+0x1f8/0x268 [<000000007e814ab9>] event_hist_trigger_func+0xca0/0x1ad0 [<00000000bf8520ed>] trigger_process_regex+0xd4/0x128 [<00000000f549355a>] event_trigger_write+0x7c/0x120 [<00000000b80f898d>] vfs_write+0xc4/0x380 [<00000000823e1055>] ksys_write+0x74/0xf8 [<000000008a9374aa>] __arm64_sys_write+0x24/0x30 [<0000000087124017>] do_el0_svc+0x88/0x1c0 [<00000000efd0dcd1>] el0_svc+0x1c/0x28 [<00000000dbfba9b3>] el0_sync_handler+0x88/0xc0 [<00000000e7399680>] el0_sync+0x148/0x180 unreferenced object 0xffff0000c7ea4980 (size 128): comm "bash", pid 338, jiffies 4294912626 (age 9339.324s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000f3469921>] kmem_cache_alloc_trace+0x4c0/0x6f0 [<0000000054ca40c3>] hist_trigger_elt_data_alloc+0x140/0x178 [<00000000633bd154>] tracing_map_init+0x1f8/0x268 [<000000007e814ab9>] event_hist_trigger_func+0xca0/0x1ad0 [<00000000bf8520ed>] trigger_process_regex+0xd4/0x128 [<00000000f549355a>] event_trigger_write+0x7c/0x120 [<00000000b80f898d>] vfs_write+0xc4/0x380 [<00000000823e1055>] ksys_write+0x74/0xf8 [<000000008a9374aa>] __arm64_sys_write+0x24/0x30 [<0000000087124017>] do_el0_svc+0x88/0x1c0 [<00000000efd0dcd1>] el0_svc+0x1c/0x28 [<00000000dbfba9b3>] el0_sync_handler+0x88/0xc0 [<00000000e7399680>] el0_sync+0x148/0x180 The reason is elts->pages[i] is alloced by get_zeroed_page. and kmemleak will not scan the area alloced by get_zeroed_page. The address stored in elts->pages will be regarded as leaked. That is, the elts->pages[i] will have pointers loaded onto it as well, and without telling kmemleak about it, those pointers will look like memory without a reference. To fix this, call kmemleak_alloc to tell kmemleak to scan elts->pages[i] Link: https://lkml.kernel.org/r/20211124140801.87121-1-chenjun102@huawei.com Signed-off-by: Chen Jun Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 39bb56d2dcbe..9628b5571846 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "tracing_map.h" #include "trace.h" @@ -307,6 +308,7 @@ static void tracing_map_array_free(struct tracing_map_array *a) for (i = 0; i < a->n_pages; i++) { if (!a->pages[i]) break; + kmemleak_free(a->pages[i]); free_page((unsigned long)a->pages[i]); } @@ -342,6 +344,7 @@ static struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL); if (!a->pages[i]) goto free; + kmemleak_alloc(a->pages[i], PAGE_SIZE, 1, GFP_KERNEL); } out: return a; -- cgit v1.2.3 From 6bbfa44116689469267f1a6e3d233b52114139d2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 1 Dec 2021 23:45:50 +0900 Subject: kprobes: Limit max data_size of the kretprobe instances The 'kprobe::data_size' is unsigned, thus it can not be negative. But if user sets it enough big number (e.g. (size_t)-8), the result of 'data_size + sizeof(struct kretprobe_instance)' becomes smaller than sizeof(struct kretprobe_instance) or zero. In result, the kretprobe_instance are allocated without enough memory, and kretprobe accesses outside of allocated memory. To avoid this issue, introduce a max limitation of the kretprobe::data_size. 4KB per instance should be OK. Link: https://lkml.kernel.org/r/163836995040.432120.10322772773821182925.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: f47cd9b553aa ("kprobes: kretprobe user entry-handler") Reported-by: zhangyue Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 2 ++ kernel/kprobes.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index e974caf39d3e..8c8f7a4d93af 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -153,6 +153,8 @@ struct kretprobe { struct kretprobe_holder *rph; }; +#define KRETPROBE_MAX_DATA_SIZE 4096 + struct kretprobe_instance { union { struct freelist_node freelist; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e9db0c810554..21eccc961bba 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2086,6 +2086,9 @@ int register_kretprobe(struct kretprobe *rp) } } + if (rp->data_size > KRETPROBE_MAX_DATA_SIZE) + return -E2BIG; + rp->kp.pre_handler = pre_handler_kretprobe; rp->kp.post_handler = NULL; -- cgit v1.2.3 From 0766bffcae0706baddea6aa3f85b43031ede0e0d Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 29 Nov 2021 09:58:02 -0700 Subject: gcov: Remove compiler version check The minimum supported version of LLVM has been raised to 11.0.0, meaning this check is always true, so it can be dropped. Signed-off-by: Nathan Chancellor Reviewed-by: Miguel Ojeda Reviewed-by: Mark Brown Reviewed-by: Nick Desaulniers Reviewed-by: Kees Cook Signed-off-by: Masahiro Yamada --- kernel/gcov/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 053447183ac5..04f4ebdc3cf5 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,7 +4,6 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - depends on !CC_IS_CLANG || CLANG_VERSION >= 110000 depends on !ARCH_WANTS_NO_INSTR || CC_HAS_NO_PROFILE_FN_ATTR select CONSTRUCTORS default n -- cgit v1.2.3 From 53e87e3cdc155f20c3417b689df8d2ac88d79576 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Oct 2021 16:10:54 +0200 Subject: timers/nohz: Last resort update jiffies on nohz_full IRQ entry When at least one CPU runs in nohz_full mode, a dedicated timekeeper CPU is guaranteed to stay online and to never stop its tick. Meanwhile on some rare case, the dedicated timekeeper may be running with interrupts disabled for a while, such as in stop_machine. If jiffies stop being updated, a nohz_full CPU may end up endlessly programming the next tick in the past, taking the last jiffies update monotonic timestamp as a stale base, resulting in an tick storm. Here is a scenario where it matters: 0) CPU 0 is the timekeeper and CPU 1 a nohz_full CPU. 1) A stop machine callback is queued to execute somewhere. 2) CPU 0 reaches MULTI_STOP_DISABLE_IRQ while CPU 1 is still in MULTI_STOP_PREPARE. Hence CPU 0 can't do its timekeeping duty. CPU 1 can still take IRQs. 3) CPU 1 receives an IRQ which queues a timer callback one jiffy forward. 4) On IRQ exit, CPU 1 schedules the tick one jiffy forward, taking last_jiffies_update as a base. But last_jiffies_update hasn't been updated for 2 jiffies since the timekeeper has interrupts disabled. 5) clockevents_program_event(), which relies on ktime_get(), observes that the expiration is in the past and therefore programs the min delta event on the clock. 6) The tick fires immediately, goto 3) 7) Tick storm, the nohz_full CPU is drown and takes ages to reach MULTI_STOP_DISABLE_IRQ, which is the only way out of this situation. Solve this with unconditionally updating jiffies if the value is stale on nohz_full IRQ entry. IRQs and other disturbances are expected to be rare enough on nohz_full for the unconditional call to ktime_get() to actually matter. Reported-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Tested-by: Paul E. McKenney Link: https://lore.kernel.org/r/20211026141055.57358-2-frederic@kernel.org --- kernel/softirq.c | 3 ++- kernel/time/tick-sched.c | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 322b65d45676..41f470929e99 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -595,7 +595,8 @@ void irq_enter_rcu(void) { __irq_enter_raw(); - if (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET)) + if (tick_nohz_full_cpu(smp_processor_id()) || + (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET))) tick_irq_enter(); account_hardirq_enter(current); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6bffe5af8cb1..17a283ce2b20 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1375,6 +1375,13 @@ static inline void tick_nohz_irq_enter(void) now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(ts, now); + /* + * If all CPUs are idle. We may need to update a stale jiffies value. + * Note nohz_full is a special case: a timekeeper is guaranteed to stay + * alive but it might be busy looping with interrupts disabled in some + * rare case (typically stop machine). So we must make sure we have a + * last resort. + */ if (ts->tick_stopped) tick_nohz_update_jiffies(now); } -- cgit v1.2.3 From e7f2be115f0746b969c0df14c0d182f65f005ca5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 26 Oct 2021 16:10:55 +0200 Subject: sched/cputime: Fix getrusage(RUSAGE_THREAD) with nohz_full getrusage(RUSAGE_THREAD) with nohz_full may return shorter utime/stime than the actual time. task_cputime_adjusted() snapshots utime and stime and then adjust their sum to match the scheduler maintained cputime.sum_exec_runtime. Unfortunately in nohz_full, sum_exec_runtime is only updated once per second in the worst case, causing a discrepancy against utime and stime that can be updated anytime by the reader using vtime. To fix this situation, perform an update of cputime.sum_exec_runtime when the cputime snapshot reports the task as actually running while the tick is disabled. The related overhead is then contained within the relevant situations. Reported-by: Hasegawa Hitomi Signed-off-by: Frederic Weisbecker Signed-off-by: Hasegawa Hitomi Signed-off-by: Thomas Gleixner Tested-by: Masayoshi Mizuma Acked-by: Phil Auld Link: https://lore.kernel.org/r/20211026141055.57358-3-frederic@kernel.org --- include/linux/sched/cputime.h | 5 +++-- kernel/sched/cputime.c | 12 +++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h index 6c9f19a33865..ce3c58286062 100644 --- a/include/linux/sched/cputime.h +++ b/include/linux/sched/cputime.h @@ -18,15 +18,16 @@ #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -extern void task_cputime(struct task_struct *t, +extern bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime); extern u64 task_gtime(struct task_struct *t); #else -static inline void task_cputime(struct task_struct *t, +static inline bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { *utime = t->utime; *stime = t->stime; + return false; } static inline u64 task_gtime(struct task_struct *t) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 872e481d5098..9392aea1804e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -615,7 +615,8 @@ void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) .sum_exec_runtime = p->se.sum_exec_runtime, }; - task_cputime(p, &cputime.utime, &cputime.stime); + if (task_cputime(p, &cputime.utime, &cputime.stime)) + cputime.sum_exec_runtime = task_sched_runtime(p); cputime_adjust(&cputime, &p->prev_cputime, ut, st); } EXPORT_SYMBOL_GPL(task_cputime_adjusted); @@ -828,19 +829,21 @@ u64 task_gtime(struct task_struct *t) * add up the pending nohz execution time since the last * cputime snapshot. */ -void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) +bool task_cputime(struct task_struct *t, u64 *utime, u64 *stime) { struct vtime *vtime = &t->vtime; unsigned int seq; u64 delta; + int ret; if (!vtime_accounting_enabled()) { *utime = t->utime; *stime = t->stime; - return; + return false; } do { + ret = false; seq = read_seqcount_begin(&vtime->seqcount); *utime = t->utime; @@ -850,6 +853,7 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) if (vtime->state < VTIME_SYS) continue; + ret = true; delta = vtime_delta(vtime); /* @@ -861,6 +865,8 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) else *utime += vtime->utime + delta; } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return ret; } static int vtime_state_fetch(struct vtime *vtime, int cpu) -- cgit v1.2.3 From 8293eb995f349aed28006792cad4cb48091919dd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:25 -0800 Subject: bpf: Rename btf_member accessors. Rename btf_member_bit_offset() and btf_member_bitfield_size() to avoid conflicts with similarly named helpers in libbpf's btf.h. Rename the kernel helpers, since libbpf helpers are part of uapi. Suggested-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-3-alexei.starovoitov@gmail.com --- include/linux/btf.h | 8 ++++---- kernel/bpf/bpf_struct_ops.c | 6 +++--- kernel/bpf/btf.c | 18 +++++++++--------- net/ipv4/bpf_tcp_ca.c | 6 +++--- 4 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index 203eef993d76..956f70388f69 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -194,15 +194,15 @@ static inline bool btf_type_kflag(const struct btf_type *t) return BTF_INFO_KFLAG(t->info); } -static inline u32 btf_member_bit_offset(const struct btf_type *struct_type, - const struct btf_member *member) +static inline u32 __btf_member_bit_offset(const struct btf_type *struct_type, + const struct btf_member *member) { return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) : member->offset; } -static inline u32 btf_member_bitfield_size(const struct btf_type *struct_type, - const struct btf_member *member) +static inline u32 __btf_member_bitfield_size(const struct btf_type *struct_type, + const struct btf_member *member) { return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset) : 0; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 8ecfe4752769..21069dbe9138 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -165,7 +165,7 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log) break; } - if (btf_member_bitfield_size(t, member)) { + if (__btf_member_bitfield_size(t, member)) { pr_warn("bit field member %s in struct %s is not supported\n", mname, st_ops->name); break; @@ -296,7 +296,7 @@ static int check_zero_holes(const struct btf_type *t, void *data) const struct btf_type *mtype; for_each_member(i, t, member) { - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; if (moff > prev_mend && memchr_inv(data + prev_mend, 0, moff - prev_mend)) return -EINVAL; @@ -387,7 +387,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_prog *prog; u32 moff; - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); if (ptype == module_type) { if (*(void **)(udata + moff)) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6b9d23be1e99..f4119a99da7b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -2969,7 +2969,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } - offset = btf_member_bit_offset(t, member); + offset = __btf_member_bit_offset(t, member); if (is_union && offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); @@ -3094,7 +3094,7 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t if (off != -ENOENT) /* only one such field is allowed */ return -E2BIG; - off = btf_member_bit_offset(t, member); + off = __btf_member_bit_offset(t, member); if (off % 8) /* valid C code cannot generate such BTF */ return -EINVAL; @@ -3184,8 +3184,8 @@ static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, btf_show_start_member(show, member); - member_offset = btf_member_bit_offset(t, member); - bitfield_size = btf_member_bitfield_size(t, member); + member_offset = __btf_member_bit_offset(t, member); + bitfield_size = __btf_member_bitfield_size(t, member); bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); bits8_offset = BITS_PER_BYTE_MASKED(member_offset); if (bitfield_size) { @@ -5060,7 +5060,7 @@ again: if (array_elem->nelems != 0) goto error; - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; if (off < moff) goto error; @@ -5083,14 +5083,14 @@ error: for_each_member(i, t, member) { /* offset of the field in bytes */ - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; if (off + size <= moff) /* won't find anything, field is already too far */ break; - if (btf_member_bitfield_size(t, member)) { - u32 end_bit = btf_member_bit_offset(t, member) + - btf_member_bitfield_size(t, member); + if (__btf_member_bitfield_size(t, member)) { + u32 end_bit = __btf_member_bit_offset(t, member) + + __btf_member_bitfield_size(t, member); /* off <= moff instead of off == moff because clang * does not generate a BTF member for anonymous diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 2cf02b4d77fb..67466dbff152 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -169,7 +169,7 @@ static u32 prog_ops_moff(const struct bpf_prog *prog) t = bpf_tcp_congestion_ops.type; m = &btf_type_member(t)[midx]; - return btf_member_bit_offset(t, m) / 8; + return __btf_member_bit_offset(t, m) / 8; } static const struct bpf_func_proto * @@ -244,7 +244,7 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, utcp_ca = (const struct tcp_congestion_ops *)udata; tcp_ca = (struct tcp_congestion_ops *)kdata; - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; switch (moff) { case offsetof(struct tcp_congestion_ops, flags): if (utcp_ca->flags & ~TCP_CONG_MASK) @@ -274,7 +274,7 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t, static int bpf_tcp_ca_check_member(const struct btf_type *t, const struct btf_member *member) { - if (is_unsupported(btf_member_bit_offset(t, member) / 8)) + if (is_unsupported(__btf_member_bit_offset(t, member) / 8)) return -ENOTSUPP; return 0; } -- cgit v1.2.3 From 29db4bea1d10b73749d7992c1fc9ac13499e8871 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:26 -0800 Subject: bpf: Prepare relo_core.c for kernel duty. Make relo_core.c to be compiled for the kernel and for user space libbpf. Note the patch is reducing BPF_CORE_SPEC_MAX_LEN from 64 to 32. This is the maximum number of nested structs and arrays. For example: struct sample { int a; struct { int b[10]; }; }; struct sample *s = ...; int *y = &s->b[5]; This field access is encoded as "0:1:0:5" and spec len is 4. The follow up patch might bump it back to 64. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-4-alexei.starovoitov@gmail.com --- include/linux/btf.h | 81 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/Makefile | 4 +++ kernel/bpf/btf.c | 26 +++++++++++++++ tools/lib/bpf/relo_core.c | 76 +++++++++++++++++++++++++++++++++++++------- 4 files changed, 176 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index 956f70388f69..acef6ef28768 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -144,6 +144,53 @@ static inline bool btf_type_is_enum(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_ENUM; } +static inline bool str_is_empty(const char *s) +{ + return !s || !s[0]; +} + +static inline u16 btf_kind(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info); +} + +static inline bool btf_is_enum(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ENUM; +} + +static inline bool btf_is_composite(const struct btf_type *t) +{ + u16 kind = btf_kind(t); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + +static inline bool btf_is_array(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ARRAY; +} + +static inline bool btf_is_int(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_INT; +} + +static inline bool btf_is_ptr(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_PTR; +} + +static inline u8 btf_int_offset(const struct btf_type *t) +{ + return BTF_INT_OFFSET(*(u32 *)(t + 1)); +} + +static inline u8 btf_int_encoding(const struct btf_type *t) +{ + return BTF_INT_ENCODING(*(u32 *)(t + 1)); +} + static inline bool btf_type_is_scalar(const struct btf_type *t) { return btf_type_is_int(t) || btf_type_is_enum(t); @@ -184,6 +231,11 @@ static inline u16 btf_type_vlen(const struct btf_type *t) return BTF_INFO_VLEN(t->info); } +static inline u16 btf_vlen(const struct btf_type *t) +{ + return btf_type_vlen(t); +} + static inline u16 btf_func_linkage(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); @@ -208,11 +260,40 @@ static inline u32 __btf_member_bitfield_size(const struct btf_type *struct_type, : 0; } +static inline struct btf_member *btf_members(const struct btf_type *t) +{ + return (struct btf_member *)(t + 1); +} + +static inline u32 btf_member_bit_offset(const struct btf_type *t, u32 member_idx) +{ + const struct btf_member *m = btf_members(t) + member_idx; + + return __btf_member_bit_offset(t, m); +} + +static inline u32 btf_member_bitfield_size(const struct btf_type *t, u32 member_idx) +{ + const struct btf_member *m = btf_members(t) + member_idx; + + return __btf_member_bitfield_size(t, m); +} + static inline const struct btf_member *btf_type_member(const struct btf_type *t) { return (const struct btf_member *)(t + 1); } +static inline struct btf_array *btf_array(const struct btf_type *t) +{ + return (struct btf_array *)(t + 1); +} + +static inline struct btf_enum *btf_enum(const struct btf_type *t) +{ + return (struct btf_enum *)(t + 1); +} + static inline const struct btf_var_secinfo *btf_type_var_secinfo( const struct btf_type *t) { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index cf6ca339f3cd..c1a9be6a4b9f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -36,3 +36,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif obj-$(CONFIG_BPF_PRELOAD) += preload/ + +obj-$(CONFIG_BPF_SYSCALL) += relo_core.o +$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE + $(call if_changed_rule,cc_o_c) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f4119a99da7b..c79595aad55b 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6413,3 +6413,29 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list); DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list); + +int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id) +{ + return -EOPNOTSUPP; +} + +static bool bpf_core_is_flavor_sep(const char *s) +{ + /* check X___Y name pattern, where X and Y are not underscores */ + return s[0] != '_' && /* X */ + s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */ + s[4] != '_'; /* Y */ +} + +size_t bpf_core_essential_name_len(const char *name) +{ + size_t n = strlen(name); + int i; + + for (i = n - 5; i >= 0; i--) { + if (bpf_core_is_flavor_sep(name + i)) + return i + 1; + } + return n; +} diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index c0904f4cb514..56dbe6d16664 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -1,6 +1,60 @@ // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* Copyright (c) 2019 Facebook */ +#ifdef __KERNEL__ +#include +#include +#include +#include +#include "relo_core.h" + +static const char *btf_kind_str(const struct btf_type *t) +{ + return btf_type_str(t); +} + +static bool is_ldimm64_insn(struct bpf_insn *insn) +{ + return insn->code == (BPF_LD | BPF_IMM | BPF_DW); +} + +static const struct btf_type * +skip_mods_and_typedefs(const struct btf *btf, u32 id, u32 *res_id) +{ + return btf_type_skip_modifiers(btf, id, res_id); +} + +static const char *btf__name_by_offset(const struct btf *btf, u32 offset) +{ + return btf_name_by_offset(btf, offset); +} + +static s64 btf__resolve_size(const struct btf *btf, u32 type_id) +{ + const struct btf_type *t; + int size; + + t = btf_type_by_id(btf, type_id); + t = btf_resolve_size(btf, t, &size); + if (IS_ERR(t)) + return PTR_ERR(t); + return size; +} + +enum libbpf_print_level { + LIBBPF_WARN, + LIBBPF_INFO, + LIBBPF_DEBUG, +}; + +#undef pr_warn +#undef pr_info +#undef pr_debug +#define pr_warn(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define pr_info(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define pr_debug(fmt, log, ...) bpf_log((void *)log, fmt, "", ##__VA_ARGS__) +#define libbpf_print(level, fmt, ...) bpf_log((void *)prog_name, fmt, ##__VA_ARGS__) +#else #include #include #include @@ -12,8 +66,9 @@ #include "btf.h" #include "str_error.h" #include "libbpf_internal.h" +#endif -#define BPF_CORE_SPEC_MAX_LEN 64 +#define BPF_CORE_SPEC_MAX_LEN 32 /* represents BPF CO-RE field or array element accessor */ struct bpf_core_accessor { @@ -150,7 +205,7 @@ static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind) * Enum value-based relocations (ENUMVAL_EXISTS/ENUMVAL_VALUE) use access * string to specify enumerator's value index that need to be relocated. */ -static int bpf_core_parse_spec(const struct btf *btf, +static int bpf_core_parse_spec(const char *prog_name, const struct btf *btf, __u32 type_id, const char *spec_str, enum bpf_core_relo_kind relo_kind, @@ -272,8 +327,8 @@ static int bpf_core_parse_spec(const struct btf *btf, return sz; spec->bit_offset += access_idx * sz * 8; } else { - pr_warn("relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %s\n", - type_id, spec_str, i, id, btf_kind_str(t)); + pr_warn("prog '%s': relo for [%u] %s (at idx %d) captures type [%d] of unexpected kind %s\n", + prog_name, type_id, spec_str, i, id, btf_kind_str(t)); return -EINVAL; } } @@ -346,8 +401,6 @@ recur: targ_id = btf_array(targ_type)->type; goto recur; default: - pr_warn("unexpected kind %d relocated, local [%d], target [%d]\n", - btf_kind(local_type), local_id, targ_id); return 0; } } @@ -1045,7 +1098,7 @@ poison: * [] () + => @, * where is a C-syntax view of recorded field access, e.g.: x.a[3].b */ -static void bpf_core_dump_spec(int level, const struct bpf_core_spec *spec) +static void bpf_core_dump_spec(const char *prog_name, int level, const struct bpf_core_spec *spec) { const struct btf_type *t; const struct btf_enum *e; @@ -1167,7 +1220,8 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, if (str_is_empty(spec_str)) return -EINVAL; - err = bpf_core_parse_spec(local_btf, local_id, spec_str, relo->kind, &local_spec); + err = bpf_core_parse_spec(prog_name, local_btf, local_id, spec_str, + relo->kind, &local_spec); if (err) { pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", prog_name, relo_idx, local_id, btf_kind_str(local_type), @@ -1178,7 +1232,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); - bpf_core_dump_spec(LIBBPF_DEBUG, &local_spec); + bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, &local_spec); libbpf_print(LIBBPF_DEBUG, "\n"); /* TYPE_ID_LOCAL relo is special and doesn't need candidate search */ @@ -1204,14 +1258,14 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, if (err < 0) { pr_warn("prog '%s': relo #%d: error matching candidate #%d ", prog_name, relo_idx, i); - bpf_core_dump_spec(LIBBPF_WARN, &cand_spec); + bpf_core_dump_spec(prog_name, LIBBPF_WARN, &cand_spec); libbpf_print(LIBBPF_WARN, ": %d\n", err); return err; } pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog_name, relo_idx, err == 0 ? "non-matching" : "matching", i); - bpf_core_dump_spec(LIBBPF_DEBUG, &cand_spec); + bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, &cand_spec); libbpf_print(LIBBPF_DEBUG, "\n"); if (err == 0) -- cgit v1.2.3 From fbd94c7afcf99c9f3b1ba1168657ecc428eb2c8d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:28 -0800 Subject: bpf: Pass a set of bpf_core_relo-s to prog_load command. struct bpf_core_relo is generated by llvm and processed by libbpf. It's a de-facto uapi. With CO-RE in the kernel the struct bpf_core_relo becomes uapi de-jure. Add an ability to pass a set of 'struct bpf_core_relo' to prog_load command and let the kernel perform CO-RE relocations. Note the struct bpf_line_info and struct bpf_func_info have the same layout when passed from LLVM to libbpf and from libbpf to the kernel except "insn_off" fields means "byte offset" when LLVM generates it. Then libbpf converts it to "insn index" to pass to the kernel. The struct bpf_core_relo's "insn_off" field is always "byte offset". Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-6-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 8 +++++ include/uapi/linux/bpf.h | 59 +++++++++++++++++++++++++++++++- kernel/bpf/btf.c | 6 ++++ kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 76 ++++++++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 59 +++++++++++++++++++++++++++++++- tools/lib/bpf/relo_core.h | 53 ----------------------------- 7 files changed, 207 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cad0829710be..8bbf08fbab66 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1732,6 +1732,14 @@ bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn); +struct bpf_core_ctx { + struct bpf_verifier_log *log; + const struct btf *btf; +}; + +int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, + int relo_idx, void *insn); + #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9e66b1880020..c26871263f1f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1342,8 +1342,10 @@ union bpf_attr { /* or valid module BTF object fd or 0 to attach to vmlinux */ __u32 attach_btf_obj_fd; }; - __u32 :32; /* pad */ + __u32 core_relo_cnt; /* number of bpf_core_relo */ __aligned_u64 fd_array; /* array of FDs */ + __aligned_u64 core_relos; + __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -6393,4 +6395,59 @@ enum bpf_core_relo_kind { BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ }; +/* + * "struct bpf_core_relo" is used to pass relocation data form LLVM to libbpf + * and from libbpf to the kernel. + * + * CO-RE relocation captures the following data: + * - insn_off - instruction offset (in bytes) within a BPF program that needs + * its insn->imm field to be relocated with actual field info; + * - type_id - BTF type ID of the "root" (containing) entity of a relocatable + * type or field; + * - access_str_off - offset into corresponding .BTF string section. String + * interpretation depends on specific relocation kind: + * - for field-based relocations, string encodes an accessed field using + * a sequence of field and array indices, separated by colon (:). It's + * conceptually very close to LLVM's getelementptr ([0]) instruction's + * arguments for identifying offset to a field. + * - for type-based relocations, strings is expected to be just "0"; + * - for enum value-based relocations, string contains an index of enum + * value within its enum type; + * - kind - one of enum bpf_core_relo_kind; + * + * Example: + * struct sample { + * int a; + * struct { + * int b[10]; + * }; + * }; + * + * struct sample *s = ...; + * int *x = &s->a; // encoded as "0:0" (a is field #0) + * int *y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, + * // b is field #0 inside anon struct, accessing elem #5) + * int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) + * + * type_id for all relocs in this example will capture BTF type id of + * `struct sample`. + * + * Such relocation is emitted when using __builtin_preserve_access_index() + * Clang built-in, passing expression that captures field address, e.g.: + * + * bpf_probe_read(&dst, sizeof(dst), + * __builtin_preserve_access_index(&src->a.b.c)); + * + * In this case Clang will emit field relocation recording necessary data to + * be able to find offset of embedded `a.b.c` field within `src` struct. + * + * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction + */ +struct bpf_core_relo { + __u32 insn_off; + __u32 type_id; + __u32 access_str_off; + enum bpf_core_relo_kind kind; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c79595aad55b..0d070461e2b8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6439,3 +6439,9 @@ size_t bpf_core_essential_name_len(const char *name) } return n; } + +int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, + int relo_idx, void *insn) +{ + return -EOPNOTSUPP; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 47089d1d67a4..b3ada4085f85 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2184,7 +2184,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD fd_array +#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6c9c0d9a04a0..6522ffdea487 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10273,6 +10273,78 @@ err_free: return err; } +#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo) +#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE + +static int check_core_relo(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + u32 i, nr_core_relo, ncopy, expected_size, rec_size; + struct bpf_core_relo core_relo = {}; + struct bpf_prog *prog = env->prog; + const struct btf *btf = prog->aux->btf; + struct bpf_core_ctx ctx = { + .log = &env->log, + .btf = btf, + }; + bpfptr_t u_core_relo; + int err; + + nr_core_relo = attr->core_relo_cnt; + if (!nr_core_relo) + return 0; + if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo)) + return -EINVAL; + + rec_size = attr->core_relo_rec_size; + if (rec_size < MIN_CORE_RELO_SIZE || + rec_size > MAX_CORE_RELO_SIZE || + rec_size % sizeof(u32)) + return -EINVAL; + + u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel); + expected_size = sizeof(struct bpf_core_relo); + ncopy = min_t(u32, expected_size, rec_size); + + /* Unlike func_info and line_info, copy and apply each CO-RE + * relocation record one at a time. + */ + for (i = 0; i < nr_core_relo; i++) { + /* future proofing when sizeof(bpf_core_relo) changes */ + err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size); + if (err) { + if (err == -E2BIG) { + verbose(env, "nonzero tailing record in core_relo"); + if (copy_to_bpfptr_offset(uattr, + offsetof(union bpf_attr, core_relo_rec_size), + &expected_size, sizeof(expected_size))) + err = -EFAULT; + } + break; + } + + if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) { + err = -EFAULT; + break; + } + + if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) { + verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n", + i, core_relo.insn_off, prog->len); + err = -EINVAL; + break; + } + + err = bpf_core_apply(&ctx, &core_relo, i, + &prog->insnsi[core_relo.insn_off / 8]); + if (err) + break; + bpfptr_add(&u_core_relo, rec_size); + } + return err; +} + static int check_btf_info(struct bpf_verifier_env *env, const union bpf_attr *attr, bpfptr_t uattr) @@ -10303,6 +10375,10 @@ static int check_btf_info(struct bpf_verifier_env *env, if (err) return err; + err = check_core_relo(env, attr, uattr); + if (err) + return err; + return 0; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9e66b1880020..c26871263f1f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1342,8 +1342,10 @@ union bpf_attr { /* or valid module BTF object fd or 0 to attach to vmlinux */ __u32 attach_btf_obj_fd; }; - __u32 :32; /* pad */ + __u32 core_relo_cnt; /* number of bpf_core_relo */ __aligned_u64 fd_array; /* array of FDs */ + __aligned_u64 core_relos; + __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -6393,4 +6395,59 @@ enum bpf_core_relo_kind { BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ }; +/* + * "struct bpf_core_relo" is used to pass relocation data form LLVM to libbpf + * and from libbpf to the kernel. + * + * CO-RE relocation captures the following data: + * - insn_off - instruction offset (in bytes) within a BPF program that needs + * its insn->imm field to be relocated with actual field info; + * - type_id - BTF type ID of the "root" (containing) entity of a relocatable + * type or field; + * - access_str_off - offset into corresponding .BTF string section. String + * interpretation depends on specific relocation kind: + * - for field-based relocations, string encodes an accessed field using + * a sequence of field and array indices, separated by colon (:). It's + * conceptually very close to LLVM's getelementptr ([0]) instruction's + * arguments for identifying offset to a field. + * - for type-based relocations, strings is expected to be just "0"; + * - for enum value-based relocations, string contains an index of enum + * value within its enum type; + * - kind - one of enum bpf_core_relo_kind; + * + * Example: + * struct sample { + * int a; + * struct { + * int b[10]; + * }; + * }; + * + * struct sample *s = ...; + * int *x = &s->a; // encoded as "0:0" (a is field #0) + * int *y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, + * // b is field #0 inside anon struct, accessing elem #5) + * int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) + * + * type_id for all relocs in this example will capture BTF type id of + * `struct sample`. + * + * Such relocation is emitted when using __builtin_preserve_access_index() + * Clang built-in, passing expression that captures field address, e.g.: + * + * bpf_probe_read(&dst, sizeof(dst), + * __builtin_preserve_access_index(&src->a.b.c)); + * + * In this case Clang will emit field relocation recording necessary data to + * be able to find offset of embedded `a.b.c` field within `src` struct. + * + * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction + */ +struct bpf_core_relo { + __u32 insn_off; + __u32 type_id; + __u32 access_str_off; + enum bpf_core_relo_kind kind; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h index 3d0b86e7f439..f410691cc4e5 100644 --- a/tools/lib/bpf/relo_core.h +++ b/tools/lib/bpf/relo_core.h @@ -6,59 +6,6 @@ #include -/* The minimum bpf_core_relo checked by the loader - * - * CO-RE relocation captures the following data: - * - insn_off - instruction offset (in bytes) within a BPF program that needs - * its insn->imm field to be relocated with actual field info; - * - type_id - BTF type ID of the "root" (containing) entity of a relocatable - * type or field; - * - access_str_off - offset into corresponding .BTF string section. String - * interpretation depends on specific relocation kind: - * - for field-based relocations, string encodes an accessed field using - * a sequence of field and array indices, separated by colon (:). It's - * conceptually very close to LLVM's getelementptr ([0]) instruction's - * arguments for identifying offset to a field. - * - for type-based relocations, strings is expected to be just "0"; - * - for enum value-based relocations, string contains an index of enum - * value within its enum type; - * - * Example to provide a better feel. - * - * struct sample { - * int a; - * struct { - * int b[10]; - * }; - * }; - * - * struct sample *s = ...; - * int x = &s->a; // encoded as "0:0" (a is field #0) - * int y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, - * // b is field #0 inside anon struct, accessing elem #5) - * int z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) - * - * type_id for all relocs in this example will capture BTF type id of - * `struct sample`. - * - * Such relocation is emitted when using __builtin_preserve_access_index() - * Clang built-in, passing expression that captures field address, e.g.: - * - * bpf_probe_read(&dst, sizeof(dst), - * __builtin_preserve_access_index(&src->a.b.c)); - * - * In this case Clang will emit field relocation recording necessary data to - * be able to find offset of embedded `a.b.c` field within `src` struct. - * - * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction - */ -struct bpf_core_relo { - __u32 insn_off; - __u32 type_id; - __u32 access_str_off; - enum bpf_core_relo_kind kind; -}; - struct bpf_core_cand { const struct btf *btf; const struct btf_type *t; -- cgit v1.2.3 From c5a2d43e998a821701029f23e25b62f9188e93ff Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:29 -0800 Subject: bpf: Adjust BTF log size limit. Make BTF log size limit to be the same as the verifier log size limit. Otherwise tools that progressively increase log size and use the same log for BTF loading and program loading will be hitting hard to debug EINVAL. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-7-alexei.starovoitov@gmail.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0d070461e2b8..dbf1f389b1d3 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4472,7 +4472,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, log->len_total = log_size; /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 || !log->level || !log->ubuf) { err = -EINVAL; goto errout; -- cgit v1.2.3 From 1e89106da25390826608ad6ac0edfb7c9952eff3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 1 Dec 2021 10:10:31 -0800 Subject: bpf: Add bpf_core_add_cands() and wire it into bpf_core_apply_relo_insn(). Given BPF program's BTF root type name perform the following steps: . search in vmlinux candidate cache. . if (present in cache and candidate list >= 1) return candidate list. . do a linear search through kernel BTFs for possible candidates. . regardless of number of candidates found populate vmlinux cache. . if (candidate list >= 1) return candidate list. . search in module candidate cache. . if (present in cache) return candidate list (even if list is empty). . do a linear search through BTFs of all kernel modules collecting candidates from all of them. . regardless of number of candidates found populate module cache. . return candidate list. Then wire the result into bpf_core_apply_relo_insn(). When BPF program is trying to CO-RE relocate a type that doesn't exist in either vmlinux BTF or in modules BTFs these steps will perform 2 cache lookups when cache is hit. Note the cache doesn't prevent the abuse by the program that might have lots of relocations that cannot be resolved. Hence cond_resched(). CO-RE in the kernel requires CAP_BPF, since BTF loading requires it. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211201181040.23337-9-alexei.starovoitov@gmail.com --- kernel/bpf/btf.c | 346 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 345 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dbf1f389b1d3..ed4258cb0832 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -25,6 +25,7 @@ #include #include #include +#include "../tools/lib/bpf/relo_core.h" /* BTF (BPF Type Format) is the meta data format which describes * the data types of BPF program/map. Hence, it basically focus @@ -6169,6 +6170,8 @@ btf_module_read(struct file *file, struct kobject *kobj, return len; } +static void purge_cand_cache(struct btf *btf); + static int btf_module_notify(struct notifier_block *nb, unsigned long op, void *module) { @@ -6203,6 +6206,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, goto out; } + purge_cand_cache(NULL); mutex_lock(&btf_module_mutex); btf_mod->module = module; btf_mod->btf = btf; @@ -6245,6 +6249,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, list_del(&btf_mod->list); if (btf_mod->sysfs_attr) sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr); + purge_cand_cache(btf_mod->btf); btf_put(btf_mod->btf); kfree(btf_mod->sysfs_attr); kfree(btf_mod); @@ -6440,8 +6445,347 @@ size_t bpf_core_essential_name_len(const char *name) return n; } +struct bpf_cand_cache { + const char *name; + u32 name_len; + u16 kind; + u16 cnt; + struct { + const struct btf *btf; + u32 id; + } cands[]; +}; + +static void bpf_free_cands(struct bpf_cand_cache *cands) +{ + if (!cands->cnt) + /* empty candidate array was allocated on stack */ + return; + kfree(cands); +} + +static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands) +{ + kfree(cands->name); + kfree(cands); +} + +#define VMLINUX_CAND_CACHE_SIZE 31 +static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE]; + +#define MODULE_CAND_CACHE_SIZE 31 +static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE]; + +static DEFINE_MUTEX(cand_cache_mutex); + +static void __print_cand_cache(struct bpf_verifier_log *log, + struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache *cc; + int i, j; + + for (i = 0; i < cache_size; i++) { + cc = cache[i]; + if (!cc) + continue; + bpf_log(log, "[%d]%s(", i, cc->name); + for (j = 0; j < cc->cnt; j++) { + bpf_log(log, "%d", cc->cands[j].id); + if (j < cc->cnt - 1) + bpf_log(log, " "); + } + bpf_log(log, "), "); + } +} + +static void print_cand_cache(struct bpf_verifier_log *log) +{ + mutex_lock(&cand_cache_mutex); + bpf_log(log, "vmlinux_cand_cache:"); + __print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); + bpf_log(log, "\nmodule_cand_cache:"); + __print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE); + bpf_log(log, "\n"); + mutex_unlock(&cand_cache_mutex); +} + +static u32 hash_cands(struct bpf_cand_cache *cands) +{ + return jhash(cands->name, cands->name_len, 0); +} + +static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands, + struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size]; + + if (cc && cc->name_len == cands->name_len && + !strncmp(cc->name, cands->name, cands->name_len)) + return cc; + return NULL; +} + +static size_t sizeof_cands(int cnt) +{ + return offsetof(struct bpf_cand_cache, cands[cnt]); +} + +static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, + struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands; + + if (*cc) { + bpf_free_cands_from_cache(*cc); + *cc = NULL; + } + new_cands = kmalloc(sizeof_cands(cands->cnt), GFP_KERNEL); + if (!new_cands) { + bpf_free_cands(cands); + return ERR_PTR(-ENOMEM); + } + memcpy(new_cands, cands, sizeof_cands(cands->cnt)); + /* strdup the name, since it will stay in cache. + * the cands->name points to strings in prog's BTF and the prog can be unloaded. + */ + new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL); + bpf_free_cands(cands); + if (!new_cands->name) { + kfree(new_cands); + return ERR_PTR(-ENOMEM); + } + *cc = new_cands; + return new_cands; +} + +static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache *cc; + int i, j; + + for (i = 0; i < cache_size; i++) { + cc = cache[i]; + if (!cc) + continue; + if (!btf) { + /* when new module is loaded purge all of module_cand_cache, + * since new module might have candidates with the name + * that matches cached cands. + */ + bpf_free_cands_from_cache(cc); + cache[i] = NULL; + continue; + } + /* when module is unloaded purge cache entries + * that match module's btf + */ + for (j = 0; j < cc->cnt; j++) + if (cc->cands[j].btf == btf) { + bpf_free_cands_from_cache(cc); + cache[i] = NULL; + break; + } + } + +} + +static void purge_cand_cache(struct btf *btf) +{ + mutex_lock(&cand_cache_mutex); + __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE); + mutex_unlock(&cand_cache_mutex); +} + +static struct bpf_cand_cache * +bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf, + int targ_start_id) +{ + struct bpf_cand_cache *new_cands; + const struct btf_type *t; + const char *targ_name; + size_t targ_essent_len; + int n, i; + + n = btf_nr_types(targ_btf); + for (i = targ_start_id; i < n; i++) { + t = btf_type_by_id(targ_btf, i); + if (btf_kind(t) != cands->kind) + continue; + + targ_name = btf_name_by_offset(targ_btf, t->name_off); + if (!targ_name) + continue; + + /* the resched point is before strncmp to make sure that search + * for non-existing name will have a chance to schedule(). + */ + cond_resched(); + + if (strncmp(cands->name, targ_name, cands->name_len) != 0) + continue; + + targ_essent_len = bpf_core_essential_name_len(targ_name); + if (targ_essent_len != cands->name_len) + continue; + + /* most of the time there is only one candidate for a given kind+name pair */ + new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL); + if (!new_cands) { + bpf_free_cands(cands); + return ERR_PTR(-ENOMEM); + } + + memcpy(new_cands, cands, sizeof_cands(cands->cnt)); + bpf_free_cands(cands); + cands = new_cands; + cands->cands[cands->cnt].btf = targ_btf; + cands->cands[cands->cnt].id = i; + cands->cnt++; + } + return cands; +} + +static struct bpf_cand_cache * +bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) +{ + struct bpf_cand_cache *cands, *cc, local_cand = {}; + const struct btf *local_btf = ctx->btf; + const struct btf_type *local_type; + const struct btf *main_btf; + size_t local_essent_len; + struct btf *mod_btf; + const char *name; + int id; + + main_btf = bpf_get_btf_vmlinux(); + if (IS_ERR(main_btf)) + return (void *)main_btf; + + local_type = btf_type_by_id(local_btf, local_type_id); + if (!local_type) + return ERR_PTR(-EINVAL); + + name = btf_name_by_offset(local_btf, local_type->name_off); + if (str_is_empty(name)) + return ERR_PTR(-EINVAL); + local_essent_len = bpf_core_essential_name_len(name); + + cands = &local_cand; + cands->name = name; + cands->kind = btf_kind(local_type); + cands->name_len = local_essent_len; + + cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); + /* cands is a pointer to stack here */ + if (cc) { + if (cc->cnt) + return cc; + goto check_modules; + } + + /* Attempt to find target candidates in vmlinux BTF first */ + cands = bpf_core_add_cands(cands, main_btf, 1); + if (IS_ERR(cands)) + return cands; + + /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */ + + /* populate cache even when cands->cnt == 0 */ + cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); + if (IS_ERR(cc)) + return cc; + + /* if vmlinux BTF has any candidate, don't go for module BTFs */ + if (cc->cnt) + return cc; + +check_modules: + /* cands is a pointer to stack here and cands->cnt == 0 */ + cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE); + if (cc) + /* if cache has it return it even if cc->cnt == 0 */ + return cc; + + /* If candidate is not found in vmlinux's BTF then search in module's BTFs */ + spin_lock_bh(&btf_idr_lock); + idr_for_each_entry(&btf_idr, mod_btf, id) { + if (!btf_is_module(mod_btf)) + continue; + /* linear search could be slow hence unlock/lock + * the IDR to avoiding holding it for too long + */ + btf_get(mod_btf); + spin_unlock_bh(&btf_idr_lock); + cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf)); + if (IS_ERR(cands)) { + btf_put(mod_btf); + return cands; + } + spin_lock_bh(&btf_idr_lock); + btf_put(mod_btf); + } + spin_unlock_bh(&btf_idr_lock); + /* cands is a pointer to kmalloced memory here if cands->cnt > 0 + * or pointer to stack if cands->cnd == 0. + * Copy it into the cache even when cands->cnt == 0 and + * return the result. + */ + return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE); +} + int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, int relo_idx, void *insn) { - return -EOPNOTSUPP; + bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL; + struct bpf_core_cand_list cands = {}; + int err; + + if (need_cands) { + struct bpf_cand_cache *cc; + int i; + + mutex_lock(&cand_cache_mutex); + cc = bpf_core_find_cands(ctx, relo->type_id); + if (IS_ERR(cc)) { + bpf_log(ctx->log, "target candidate search failed for %d\n", + relo->type_id); + err = PTR_ERR(cc); + goto out; + } + if (cc->cnt) { + cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL); + if (!cands.cands) { + err = -ENOMEM; + goto out; + } + } + for (i = 0; i < cc->cnt; i++) { + bpf_log(ctx->log, + "CO-RE relocating %s %s: found target candidate [%d]\n", + btf_kind_str[cc->kind], cc->name, cc->cands[i].id); + cands.cands[i].btf = cc->cands[i].btf; + cands.cands[i].id = cc->cands[i].id; + } + cands.len = cc->cnt; + /* cand_cache_mutex needs to span the cache lookup and + * copy of btf pointer into bpf_core_cand_list, + * since module can be unloaded while bpf_core_apply_relo_insn + * is working with module's btf. + */ + } + + err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8, + relo, relo_idx, ctx->btf, &cands); +out: + if (need_cands) { + kfree(cands.cands); + mutex_unlock(&cand_cache_mutex); + if (ctx->log->level & BPF_LOG_LEVEL2) + print_cand_cache(ctx->log); + } + return err; } -- cgit v1.2.3 From d9847eb8be3d895b2b5f514fdf3885d47a0b92a2 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 22 Nov 2021 20:17:40 +0530 Subject: bpf: Make CONFIG_DEBUG_INFO_BTF depend upon CONFIG_BPF_SYSCALL Vinicius Costa Gomes reported [0] that build fails when CONFIG_DEBUG_INFO_BTF is enabled and CONFIG_BPF_SYSCALL is disabled. This leads to btf.c not being compiled, and then no symbol being present in vmlinux for the declarations in btf.h. Since BTF is not useful without enabling BPF subsystem, disallow this combination. However, theoretically disabling both now could still fail, as the symbol for kfunc_btf_id_list variables is not available. This isn't a problem as the compiler usually optimizes the whole register/unregister call, but at lower optimization levels it can fail the build in linking stage. Fix that by adding dummy variables so that modules taking address of them still work, but the whole thing is a noop. [0]: https://lore.kernel.org/bpf/20211110205418.332403-1-vinicius.gomes@intel.com Fixes: 14f267d95fe4 ("bpf: btf: Introduce helpers for dynamic BTF set registration") Reported-by: Vinicius Costa Gomes Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211122144742.477787-2-memxor@gmail.com --- include/linux/btf.h | 14 ++++++++++---- kernel/bpf/btf.c | 9 ++------- lib/Kconfig.debug | 1 + 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index 203eef993d76..0e1b6281fd8f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -245,7 +245,10 @@ struct kfunc_btf_id_set { struct module *owner; }; -struct kfunc_btf_id_list; +struct kfunc_btf_id_list { + struct list_head list; + struct mutex mutex; +}; #ifdef CONFIG_DEBUG_INFO_BTF_MODULES void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, @@ -254,6 +257,9 @@ void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, struct kfunc_btf_id_set *s); bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, struct module *owner); + +extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; +extern struct kfunc_btf_id_list prog_test_kfunc_list; #else static inline void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, struct kfunc_btf_id_set *s) @@ -268,13 +274,13 @@ static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, { return false; } + +static struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list __maybe_unused; +static struct kfunc_btf_id_list prog_test_kfunc_list __maybe_unused; #endif #define DEFINE_KFUNC_BTF_ID_SET(set, name) \ struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set), \ THIS_MODULE } -extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; -extern struct kfunc_btf_id_list prog_test_kfunc_list; - #endif diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dbc3ad07e21b..ea3df9867cec 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6346,11 +6346,6 @@ BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) /* BTF ID set registration API for modules */ -struct kfunc_btf_id_list { - struct list_head list; - struct mutex mutex; -}; - #ifdef CONFIG_DEBUG_INFO_BTF_MODULES void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, @@ -6389,8 +6384,6 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, return false; } -#endif - #define DEFINE_KFUNC_BTF_ID_LIST(name) \ struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \ __MUTEX_INITIALIZER(name.mutex) }; \ @@ -6398,3 +6391,5 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list); DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list); + +#endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9ef7ce18b4f5..596bb5e4790c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -316,6 +316,7 @@ config DEBUG_INFO_BTF bool "Generate BTF typeinfo" depends on !DEBUG_INFO_SPLIT && !DEBUG_INFO_REDUCED depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST + depends on BPF_SYSCALL help Generate deduplicated BTF type information from DWARF debug info. Turning this on expects presence of pahole tool, which will convert -- cgit v1.2.3 From b12f031043247b80999bf5e03b8cded3b0b40f8d Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 22 Nov 2021 20:17:41 +0530 Subject: bpf: Fix bpf_check_mod_kfunc_call for built-in modules When module registering its set is built-in, THIS_MODULE will be NULL, hence we cannot return early in case owner is NULL. Fixes: 14f267d95fe4 ("bpf: btf: Introduce helpers for dynamic BTF set registration") Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211122144742.477787-3-memxor@gmail.com --- kernel/bpf/btf.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ea3df9867cec..9bdb03767db5 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6371,8 +6371,6 @@ bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, { struct kfunc_btf_id_set *s; - if (!owner) - return false; mutex_lock(&klist->mutex); list_for_each_entry(s, &klist->list, list) { if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) { -- cgit v1.2.3 From 07edfece8bcb0580a1828d939e6f8d91a8603eb2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 1 Dec 2021 16:19:44 +0100 Subject: workqueue: Fix unbind_workers() VS wq_worker_running() race At CPU-hotplug time, unbind_worker() may preempt a worker while it is waking up. In that case the following scenario can happen: unbind_workers() wq_worker_running() -------------- ------------------- if (!(worker->flags & WORKER_NOT_RUNNING)) //PREEMPTED by unbind_workers worker->flags |= WORKER_UNBOUND; [...] atomic_set(&pool->nr_running, 0); //resume to worker atomic_inc(&worker->pool->nr_running); After unbind_worker() resets pool->nr_running, the value is expected to remain 0 until the pool ever gets rebound in case cpu_up() is called on the target CPU in the future. But here the race leaves pool->nr_running with a value of 1, triggering the following warning when the worker goes idle: WARNING: CPU: 3 PID: 34 at kernel/workqueue.c:1823 worker_enter_idle+0x95/0xc0 Modules linked in: CPU: 3 PID: 34 Comm: kworker/3:0 Not tainted 5.16.0-rc1+ #34 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 Workqueue: 0x0 (rcu_par_gp) RIP: 0010:worker_enter_idle+0x95/0xc0 Code: 04 85 f8 ff ff ff 39 c1 7f 09 48 8b 43 50 48 85 c0 74 1b 83 e2 04 75 99 8b 43 34 39 43 30 75 91 8b 83 00 03 00 00 85 c0 74 87 <0f> 0b 5b c3 48 8b 35 70 f1 37 01 48 8d 7b 48 48 81 c6 e0 93 0 RSP: 0000:ffff9b7680277ed0 EFLAGS: 00010086 RAX: 00000000ffffffff RBX: ffff93465eae9c00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff9346418a0000 RDI: ffff934641057140 RBP: ffff934641057170 R08: 0000000000000001 R09: ffff9346418a0080 R10: ffff9b768027fdf0 R11: 0000000000002400 R12: ffff93465eae9c20 R13: ffff93465eae9c20 R14: ffff93465eae9c70 R15: ffff934641057140 FS: 0000000000000000(0000) GS:ffff93465eac0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000001cc0c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: worker_thread+0x89/0x3d0 ? process_one_work+0x400/0x400 kthread+0x162/0x190 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 Also due to this incorrect "nr_running == 1", further queued work may end up not being served, because no worker is awaken at work insert time. This raises rcutorture writer stalls for example. Fix this with disabling preemption in the right place in wq_worker_running(). It's worth noting that if the worker migrates and runs concurrently with unbind_workers(), it is guaranteed to see the WORKER_UNBOUND flag update due to set_cpus_allowed_ptr() acquiring/releasing rq->lock. Fixes: 6d25be5782e4 ("sched/core, workqueues: Distangle worker accounting from rq lock") Reviewed-by: Lai Jiangshan Tested-by: Paul E. McKenney Acked-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Signed-off-by: Tejun Heo --- kernel/workqueue.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 332361cf215f..5094573e8b45 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -868,8 +868,17 @@ void wq_worker_running(struct task_struct *task) if (!worker->sleeping) return; + + /* + * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check + * and the nr_running increment below, we may ruin the nr_running reset + * and leave with an unexpected pool->nr_running == 1 on the newly unbound + * pool. Protect against such race. + */ + preempt_disable(); if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(&worker->pool->nr_running); + preempt_enable(); worker->sleeping = 0; } -- cgit v1.2.3 From 45c753f5f24d2d4717acb38ce35e604ff9abcb50 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 1 Dec 2021 16:19:45 +0100 Subject: workqueue: Fix unbind_workers() VS wq_worker_sleeping() race At CPU-hotplug time, unbind_workers() may preempt a worker while it is going to sleep. In that case the following scenario can happen: unbind_workers() wq_worker_sleeping() -------------- ------------------- if (worker->flags & WORKER_NOT_RUNNING) return; //PREEMPTED by unbind_workers worker->flags |= WORKER_UNBOUND; [...] atomic_set(&pool->nr_running, 0); //resume to worker atomic_dec_and_test(&pool->nr_running); After unbind_worker() resets pool->nr_running, the value is expected to remain 0 until the pool ever gets rebound in case cpu_up() is called on the target CPU in the future. But here the race leaves pool->nr_running with a value of -1, triggering the following warning when the worker goes idle: WARNING: CPU: 3 PID: 34 at kernel/workqueue.c:1823 worker_enter_idle+0x95/0xc0 Modules linked in: CPU: 3 PID: 34 Comm: kworker/3:0 Not tainted 5.16.0-rc1+ #34 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 Workqueue: 0x0 (rcu_par_gp) RIP: 0010:worker_enter_idle+0x95/0xc0 Code: 04 85 f8 ff ff ff 39 c1 7f 09 48 8b 43 50 48 85 c0 74 1b 83 e2 04 75 99 8b 43 34 39 43 30 75 91 8b 83 00 03 00 00 85 c0 74 87 <0f> 0b 5b c3 48 8b 35 70 f1 37 01 48 8d 7b 48 48 81 c6 e0 93 0 RSP: 0000:ffff9b7680277ed0 EFLAGS: 00010086 RAX: 00000000ffffffff RBX: ffff93465eae9c00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff9346418a0000 RDI: ffff934641057140 RBP: ffff934641057170 R08: 0000000000000001 R09: ffff9346418a0080 R10: ffff9b768027fdf0 R11: 0000000000002400 R12: ffff93465eae9c20 R13: ffff93465eae9c20 R14: ffff93465eae9c70 R15: ffff934641057140 FS: 0000000000000000(0000) GS:ffff93465eac0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000001cc0c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: worker_thread+0x89/0x3d0 ? process_one_work+0x400/0x400 kthread+0x162/0x190 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 Also due to this incorrect "nr_running == -1", all sorts of hazards can happen, starting with queued works being ignored because no workers are awaken at insert_work() time. Fix this with checking again the worker flags while pool->lock is locked. Fixes: b945efcdd07d ("sched: Remove pointless preemption disable in sched_submit_work()") Reviewed-by: Lai Jiangshan Tested-by: Paul E. McKenney Acked-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Paul E. McKenney Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5094573e8b45..5557d19ea81c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -912,6 +912,16 @@ void wq_worker_sleeping(struct task_struct *task) worker->sleeping = 1; raw_spin_lock_irq(&pool->lock); + /* + * Recheck in case unbind_workers() preempted us. We don't + * want to decrement nr_running after the worker is unbound + * and nr_running has been reset. + */ + if (worker->flags & WORKER_NOT_RUNNING) { + raw_spin_unlock_irq(&pool->lock); + return; + } + /* * The counterpart of the following dec_and_test, implied mb, * worklist not empty test sequence is in insert_work(). -- cgit v1.2.3 From 2fa7d94afc1afbb4d702760c058dc2d7ed30f226 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 30 Nov 2021 20:16:07 +0200 Subject: bpf: Fix the off-by-two error in range markings The first commit cited below attempts to fix the off-by-one error that appeared in some comparisons with an open range. Due to this error, arithmetically equivalent pieces of code could get different verdicts from the verifier, for example (pseudocode): // 1. Passes the verifier: if (data + 8 > data_end) return early read *(u64 *)data, i.e. [data; data+7] // 2. Rejected by the verifier (should still pass): if (data + 7 >= data_end) return early read *(u64 *)data, i.e. [data; data+7] The attempted fix, however, shifts the range by one in a wrong direction, so the bug not only remains, but also such piece of code starts failing in the verifier: // 3. Rejected by the verifier, but the check is stricter than in #1. if (data + 8 >= data_end) return early read *(u64 *)data, i.e. [data; data+7] The change performed by that fix converted an off-by-one bug into off-by-two. The second commit cited below added the BPF selftests written to ensure than code chunks like #3 are rejected, however, they should be accepted. This commit fixes the off-by-two error by adjusting new_range in the right direction and fixes the tests by changing the range into the one that should actually fail. Fixes: fb2a311a31d3 ("bpf: fix off by one for range markings with L{T, E} patterns") Fixes: b37242c773b2 ("bpf: add test cases to bpf selftests to cover all access tests") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211130181607.593149-1-maximmi@nvidia.com --- kernel/bpf/verifier.c | 2 +- .../bpf/verifier/xdp_direct_packet_access.c | 32 +++++++++++----------- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 50efda51515b..f3001937bbb9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8422,7 +8422,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, new_range = dst_reg->off; if (range_right_open) - new_range--; + new_range++; /* Examples for register markings: * diff --git a/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c b/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c index bfb97383e6b5..de172a5b8754 100644 --- a/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c +++ b/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c @@ -112,10 +112,10 @@ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data_end)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1), BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -167,10 +167,10 @@ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data_end)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1), BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -274,9 +274,9 @@ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data_end)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -437,9 +437,9 @@ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data_end)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -544,10 +544,10 @@ offsetof(struct xdp_md, data_meta)), BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1), BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -599,10 +599,10 @@ offsetof(struct xdp_md, data_meta)), BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1), BPF_JMP_IMM(BPF_JA, 0, 0, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -706,9 +706,9 @@ offsetof(struct xdp_md, data_meta)), BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, @@ -869,9 +869,9 @@ offsetof(struct xdp_md, data_meta)), BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), BPF_MOV64_REG(BPF_REG_1, BPF_REG_2), - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1), - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), + BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, -- cgit v1.2.3 From 78c1f8d0634cc35da613d844eda7c849fc50f643 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 3 Dec 2021 10:28:36 -0800 Subject: libbpf: Reduce bpf_core_apply_relo_insn() stack usage. Reduce bpf_core_apply_relo_insn() stack usage and bump BPF_CORE_SPEC_MAX_LEN limit back to 64. Fixes: 29db4bea1d10 ("bpf: Prepare relo_core.c for kernel duty.") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211203182836.16646-1-alexei.starovoitov@gmail.com --- kernel/bpf/btf.c | 11 ++++++++- tools/lib/bpf/libbpf.c | 4 ++- tools/lib/bpf/relo_core.c | 62 +++++++++++++++-------------------------------- tools/lib/bpf/relo_core.h | 30 ++++++++++++++++++++++- 4 files changed, 61 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ed4258cb0832..2a902a946f70 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6742,8 +6742,16 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, { bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL; struct bpf_core_cand_list cands = {}; + struct bpf_core_spec *specs; int err; + /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5" + * into arrays of btf_ids of struct fields and array indices. + */ + specs = kcalloc(3, sizeof(*specs), GFP_KERNEL); + if (!specs) + return -ENOMEM; + if (need_cands) { struct bpf_cand_cache *cc; int i; @@ -6779,8 +6787,9 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, } err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8, - relo, relo_idx, ctx->btf, &cands); + relo, relo_idx, ctx->btf, &cands, specs); out: + kfree(specs); if (need_cands) { kfree(cands.cands); mutex_unlock(&cand_cache_mutex); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index de260c94e418..6db0b5e8540e 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -5515,6 +5515,7 @@ static int bpf_core_apply_relo(struct bpf_program *prog, const struct btf *local_btf, struct hashmap *cand_cache) { + struct bpf_core_spec specs_scratch[3] = {}; const void *type_key = u32_as_hash_key(relo->type_id); struct bpf_core_cand_list *cands = NULL; const char *prog_name = prog->name; @@ -5569,7 +5570,8 @@ static int bpf_core_apply_relo(struct bpf_program *prog, } } - return bpf_core_apply_relo_insn(prog_name, insn, insn_idx, relo, relo_idx, local_btf, cands); + return bpf_core_apply_relo_insn(prog_name, insn, insn_idx, relo, + relo_idx, local_btf, cands, specs_scratch); } static int diff --git a/tools/lib/bpf/relo_core.c b/tools/lib/bpf/relo_core.c index d194fb9306ed..32464f0ab4b1 100644 --- a/tools/lib/bpf/relo_core.c +++ b/tools/lib/bpf/relo_core.c @@ -68,33 +68,6 @@ enum libbpf_print_level { #include "libbpf_internal.h" #endif -#define BPF_CORE_SPEC_MAX_LEN 32 - -/* represents BPF CO-RE field or array element accessor */ -struct bpf_core_accessor { - __u32 type_id; /* struct/union type or array element type */ - __u32 idx; /* field index or array index */ - const char *name; /* field name or NULL for array accessor */ -}; - -struct bpf_core_spec { - const struct btf *btf; - /* high-level spec: named fields and array indices only */ - struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN]; - /* original unresolved (no skip_mods_or_typedefs) root type ID */ - __u32 root_type_id; - /* CO-RE relocation kind */ - enum bpf_core_relo_kind relo_kind; - /* high-level spec length */ - int len; - /* raw, low-level spec: 1-to-1 with accessor spec string */ - int raw_spec[BPF_CORE_SPEC_MAX_LEN]; - /* raw spec length */ - int raw_len; - /* field bit offset represented by spec */ - __u32 bit_offset; -}; - static bool is_flex_arr(const struct btf *btf, const struct bpf_core_accessor *acc, const struct btf_array *arr) @@ -1200,9 +1173,12 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, const struct bpf_core_relo *relo, int relo_idx, const struct btf *local_btf, - struct bpf_core_cand_list *cands) + struct bpf_core_cand_list *cands, + struct bpf_core_spec *specs_scratch) { - struct bpf_core_spec local_spec, cand_spec, targ_spec = {}; + struct bpf_core_spec *local_spec = &specs_scratch[0]; + struct bpf_core_spec *cand_spec = &specs_scratch[1]; + struct bpf_core_spec *targ_spec = &specs_scratch[2]; struct bpf_core_relo_res cand_res, targ_res; const struct btf_type *local_type; const char *local_name; @@ -1221,7 +1197,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, return -EINVAL; err = bpf_core_parse_spec(prog_name, local_btf, local_id, spec_str, - relo->kind, &local_spec); + relo->kind, local_spec); if (err) { pr_warn("prog '%s': relo #%d: parsing [%d] %s %s + %s failed: %d\n", prog_name, relo_idx, local_id, btf_kind_str(local_type), @@ -1232,15 +1208,15 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, pr_debug("prog '%s': relo #%d: kind <%s> (%d), spec is ", prog_name, relo_idx, core_relo_kind_str(relo->kind), relo->kind); - bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, &local_spec); + bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, local_spec); libbpf_print(LIBBPF_DEBUG, "\n"); /* TYPE_ID_LOCAL relo is special and doesn't need candidate search */ if (relo->kind == BPF_CORE_TYPE_ID_LOCAL) { targ_res.validate = true; targ_res.poison = false; - targ_res.orig_val = local_spec.root_type_id; - targ_res.new_val = local_spec.root_type_id; + targ_res.orig_val = local_spec->root_type_id; + targ_res.new_val = local_spec->root_type_id; goto patch_insn; } @@ -1253,38 +1229,38 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, for (i = 0, j = 0; i < cands->len; i++) { - err = bpf_core_spec_match(&local_spec, cands->cands[i].btf, - cands->cands[i].id, &cand_spec); + err = bpf_core_spec_match(local_spec, cands->cands[i].btf, + cands->cands[i].id, cand_spec); if (err < 0) { pr_warn("prog '%s': relo #%d: error matching candidate #%d ", prog_name, relo_idx, i); - bpf_core_dump_spec(prog_name, LIBBPF_WARN, &cand_spec); + bpf_core_dump_spec(prog_name, LIBBPF_WARN, cand_spec); libbpf_print(LIBBPF_WARN, ": %d\n", err); return err; } pr_debug("prog '%s': relo #%d: %s candidate #%d ", prog_name, relo_idx, err == 0 ? "non-matching" : "matching", i); - bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, &cand_spec); + bpf_core_dump_spec(prog_name, LIBBPF_DEBUG, cand_spec); libbpf_print(LIBBPF_DEBUG, "\n"); if (err == 0) continue; - err = bpf_core_calc_relo(prog_name, relo, relo_idx, &local_spec, &cand_spec, &cand_res); + err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, cand_spec, &cand_res); if (err) return err; if (j == 0) { targ_res = cand_res; - targ_spec = cand_spec; - } else if (cand_spec.bit_offset != targ_spec.bit_offset) { + *targ_spec = *cand_spec; + } else if (cand_spec->bit_offset != targ_spec->bit_offset) { /* if there are many field relo candidates, they * should all resolve to the same bit offset */ pr_warn("prog '%s': relo #%d: field offset ambiguity: %u != %u\n", - prog_name, relo_idx, cand_spec.bit_offset, - targ_spec.bit_offset); + prog_name, relo_idx, cand_spec->bit_offset, + targ_spec->bit_offset); return -EINVAL; } else if (cand_res.poison != targ_res.poison || cand_res.new_val != targ_res.new_val) { /* all candidates should result in the same relocation @@ -1328,7 +1304,7 @@ int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, prog_name, relo_idx); /* calculate single target relo result explicitly */ - err = bpf_core_calc_relo(prog_name, relo, relo_idx, &local_spec, NULL, &targ_res); + err = bpf_core_calc_relo(prog_name, relo, relo_idx, local_spec, NULL, &targ_res); if (err) return err; } diff --git a/tools/lib/bpf/relo_core.h b/tools/lib/bpf/relo_core.h index 4f864b8e33b7..17799819ad7c 100644 --- a/tools/lib/bpf/relo_core.h +++ b/tools/lib/bpf/relo_core.h @@ -17,11 +17,39 @@ struct bpf_core_cand_list { int len; }; +#define BPF_CORE_SPEC_MAX_LEN 64 + +/* represents BPF CO-RE field or array element accessor */ +struct bpf_core_accessor { + __u32 type_id; /* struct/union type or array element type */ + __u32 idx; /* field index or array index */ + const char *name; /* field name or NULL for array accessor */ +}; + +struct bpf_core_spec { + const struct btf *btf; + /* high-level spec: named fields and array indices only */ + struct bpf_core_accessor spec[BPF_CORE_SPEC_MAX_LEN]; + /* original unresolved (no skip_mods_or_typedefs) root type ID */ + __u32 root_type_id; + /* CO-RE relocation kind */ + enum bpf_core_relo_kind relo_kind; + /* high-level spec length */ + int len; + /* raw, low-level spec: 1-to-1 with accessor spec string */ + int raw_spec[BPF_CORE_SPEC_MAX_LEN]; + /* raw spec length */ + int raw_len; + /* field bit offset represented by spec */ + __u32 bit_offset; +}; + int bpf_core_apply_relo_insn(const char *prog_name, struct bpf_insn *insn, int insn_idx, const struct bpf_core_relo *relo, int relo_idx, const struct btf *local_btf, - struct bpf_core_cand_list *cands); + struct bpf_core_cand_list *cands, + struct bpf_core_spec *specs_scratch); int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id); -- cgit v1.2.3 From 9ed20bafc85806ca6c97c9128cec46c3ef80ae86 Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Fri, 3 Dec 2021 17:32:03 -0600 Subject: preempt/dynamic: Fix setup_preempt_mode() return value __setup() callbacks expect 1 for success and 0 for failure. Correct the usage here to reflect that. Fixes: 826bfeb37bb4 ("preempt/dynamic: Support dynamic preempt with preempt= boot option") Reported-by: Mark Rutland Signed-off-by: Andrew Halaney Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211203233203.133581-1-ahalaney@redhat.com --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 76f9deeaa942..814c52d90c0f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6617,11 +6617,11 @@ static int __init setup_preempt_mode(char *str) int mode = sched_dynamic_mode(str); if (mode < 0) { pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); - return 1; + return 0; } sched_dynamic_update(mode); - return 0; + return 1; } __setup("preempt=", setup_preempt_mode); -- cgit v1.2.3 From 315c4f884800c45cb6bd8c90422fad554a8b9588 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 2 Dec 2021 11:20:33 +0000 Subject: sched/uclamp: Fix rq->uclamp_max not set on first enqueue Commit d81ae8aac85c ("sched/uclamp: Fix initialization of struct uclamp_rq") introduced a bug where uclamp_max of the rq is not reset to match the woken up task's uclamp_max when the rq is idle. The code was relying on rq->uclamp_max initialized to zero, so on first enqueue static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, enum uclamp_id clamp_id) { ... if (uc_se->value > READ_ONCE(uc_rq->value)) WRITE_ONCE(uc_rq->value, uc_se->value); } was actually resetting it. But since commit d81ae8aac85c changed the default to 1024, this no longer works. And since rq->uclamp_flags is also initialized to 0, neither above code path nor uclamp_idle_reset() update the rq->uclamp_max on first wake up from idle. This is only visible from first wake up(s) until the first dequeue to idle after enabling the static key. And it only matters if the uclamp_max of this task is < 1024 since only then its uclamp_max will be effectively ignored. Fix it by properly initializing rq->uclamp_flags = UCLAMP_FLAG_IDLE to ensure uclamp_idle_reset() is called which then will update the rq uclamp_max value as expected. Fixes: d81ae8aac85c ("sched/uclamp: Fix initialization of struct uclamp_rq") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20211202112033.1705279-1-qais.yousef@arm.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 814c52d90c0f..77563109c0ea 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1918,7 +1918,7 @@ static void __init init_uclamp_rq(struct rq *rq) }; } - rq->uclamp_flags = 0; + rq->uclamp_flags = UCLAMP_FLAG_IDLE; } static void __init init_uclamp(void) -- cgit v1.2.3 From 8b4e74ccb582797f6f0b0a50372ebd9fd2372a27 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Wed, 1 Dec 2021 14:34:50 +0000 Subject: sched/fair: Fix detection of per-CPU kthreads waking a task select_idle_sibling() has a special case for tasks woken up by a per-CPU kthread, where the selected CPU is the previous one. However, the current condition for this exit path is incomplete. A task can wake up from an interrupt context (e.g. hrtimer), while a per-CPU kthread is running. A such scenario would spuriously trigger the special case described above. Also, a recent change made the idle task like a regular per-CPU kthread, hence making that situation more likely to happen (is_per_cpu_kthread(swapper) being true now). Checking for task context makes sure select_idle_sibling() will not interpret a wake up from any other context as a wake up by a per-CPU kthread. Fixes: 52262ee567ad ("sched/fair: Allow a per-CPU kthread waking a task to stack on the same CPU, to fix XFS performance regression") Signed-off-by: Vincent Donnefort Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lore.kernel.org/r/20211201143450.479472-1-vincent.donnefort@arm.com --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 884f29d07963..5cd27986b43e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6398,6 +6398,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * pattern is IO completions. */ if (is_per_cpu_kthread(current) && + in_task() && prev == smp_processor_id() && this_rq()->nr_running <= 1) { return prev; -- cgit v1.2.3 From 014ba44e8184e1acf93e0cbb7089ee847802f8f0 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Mon, 29 Nov 2021 17:31:15 +0000 Subject: sched/fair: Fix per-CPU kthread and wakee stacking for asym CPU capacity select_idle_sibling() has a special case for tasks woken up by a per-CPU kthread where the selected CPU is the previous one. For asymmetric CPU capacity systems, the assumption was that the wakee couldn't have a bigger utilization during task placement than it used to have during the last activation. That was not considering uclamp.min which can completely change between two task activations and as a consequence mandates the fitness criterion asym_fits_capacity(), even for the exit path described above. Fixes: b4c9c9f15649 ("sched/fair: Prefer prev cpu in asymmetric wakeup path") Signed-off-by: Vincent Donnefort Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/20211129173115.4006346-1-vincent.donnefort@arm.com --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5cd27986b43e..06722188df49 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6400,7 +6400,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (is_per_cpu_kthread(current) && in_task() && prev == smp_processor_id() && - this_rq()->nr_running <= 1) { + this_rq()->nr_running <= 1 && + asym_fits_capacity(task_util, prev)) { return prev; } -- cgit v1.2.3 From 9d0df37797453f168afdb2e6fd0353c73718ae9a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 29 Nov 2021 18:46:44 +0100 Subject: sched: Trigger warning if ->migration_disabled counter underflows. If migrate_enable() is used more often than its counter part then it remains undetected and rq::nr_pinned will underflow, too. Add a warning if migrate_enable() is attempted if without a matching a migrate_disable(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211129174654.668506-2-bigeasy@linutronix.de --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3c9b0fda64ac..300218ad98a2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2173,6 +2173,9 @@ void migrate_enable(void) return; } + if (WARN_ON_ONCE(!p->migration_disabled)) + return; + /* * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). -- cgit v1.2.3 From e08f343be00c3fe8f9f6ac58085c81bcdd231fab Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 29 Nov 2021 18:46:45 +0100 Subject: locking: Remove rt_rwlock_is_contended(). rt_rwlock_is_contended() has no users. It makes no sense to use it as rwlock_is_contended() because it is a sleeping lock on RT and preemption is possible. It reports always != 0 if used by a writer and even if there is a waiter then the lock might not be handed over if the current owner has the highest priority. Remove rt_rwlock_is_contended(). Reported-by: kernel test robot Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211129174654.668506-3-bigeasy@linutronix.de --- kernel/locking/spinlock_rt.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index b2e553f9255b..9e396a09fe0f 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -257,12 +257,6 @@ void __sched rt_write_unlock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_write_unlock); -int __sched rt_rwlock_is_contended(rwlock_t *rwlock) -{ - return rw_base_is_contended(&rwlock->rwbase); -} -EXPORT_SYMBOL(rt_rwlock_is_contended); - #ifdef CONFIG_DEBUG_LOCK_ALLOC void __rt_rwlock_init(rwlock_t *rwlock, const char *name, struct lock_class_key *key) -- cgit v1.2.3 From 02ea9fc96fe976e7f7e067f38b12202f126e3f2f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 29 Nov 2021 18:46:46 +0100 Subject: locking/rtmutex: Squash self-deadlock check for ww_rt_mutex. Similar to the issues in commits: 6467822b8cc9 ("locking/rtmutex: Prevent spurious EDEADLK return caused by ww_mutexes") a055fcc132d4 ("locking/rtmutex: Return success on deadlock for ww_mutex waiters") ww_rt_mutex_lock() should not return EDEADLK without first going through the __ww_mutex logic to set the required state. In fact, the chain-walk can deal with the spurious cycles (per the above commits) this check warns about and is trying to avoid. Therefore ignore this test for ww_rt_mutex and simply let things fall in place. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211129174654.668506-4-bigeasy@linutronix.de --- kernel/locking/rtmutex.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 0c6a48dfcecb..f89620852774 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1103,8 +1103,11 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, * the other will detect the deadlock and return -EDEADLOCK, * which is wrong, as the other waiter is not in a deadlock * situation. + * + * Except for ww_mutex, in that case the chain walk must already deal + * with spurious cycles, see the comments at [3] and [6]. */ - if (owner == task) + if (owner == task && !(build_ww_mutex() && ww_ctx)) return -EDEADLK; raw_spin_lock(&task->pi_lock); -- cgit v1.2.3 From a3642021923b26d86bb27d88c826494827612c06 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 29 Nov 2021 18:46:47 +0100 Subject: locking/rtmutex: Add rt_mutex_lock_nest_lock() and rt_mutex_lock_killable(). The locking selftest for ww-mutex expects to operate directly on the base-mutex which becomes a rtmutex on PREEMPT_RT. Add a rtmutex based implementation of mutex_lock_nest_lock() and mutex_lock_killable() named rt_mutex_lock_nest_lock() abd rt_mutex_lock_killable(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211129174654.668506-5-bigeasy@linutronix.de --- include/linux/rtmutex.h | 9 +++++++++ kernel/locking/rtmutex_api.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 35 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 9deedfeec2b1..7d049883a08a 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -99,13 +99,22 @@ extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass); +extern void _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock); #define rt_mutex_lock(lock) rt_mutex_lock_nested(lock, 0) +#define rt_mutex_lock_nest_lock(lock, nest_lock) \ + do { \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ + _rt_mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ + } while (0) + #else extern void rt_mutex_lock(struct rt_mutex *lock); #define rt_mutex_lock_nested(lock, subclass) rt_mutex_lock(lock) +#define rt_mutex_lock_nest_lock(lock, nest_lock) rt_mutex_lock(lock) #endif extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); +extern int rt_mutex_lock_killable(struct rt_mutex *lock); extern int rt_mutex_trylock(struct rt_mutex *lock); extern void rt_mutex_unlock(struct rt_mutex *lock); diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 5c9299aaabae..900220941caa 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -21,12 +21,13 @@ int max_lock_depth = 1024; */ static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock, unsigned int state, + struct lockdep_map *nest_lock, unsigned int subclass) { int ret; might_sleep(); - mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_); ret = __rt_mutex_lock(&lock->rtmutex, state); if (ret) mutex_release(&lock->dep_map, _RET_IP_); @@ -48,10 +49,16 @@ EXPORT_SYMBOL(rt_mutex_base_init); */ void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) { - __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass); } EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); +void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock) +{ + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0); +} +EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock); + #else /* !CONFIG_DEBUG_LOCK_ALLOC */ /** @@ -61,7 +68,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); */ void __sched rt_mutex_lock(struct rt_mutex *lock) { - __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0); + __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0); } EXPORT_SYMBOL_GPL(rt_mutex_lock); #endif @@ -77,10 +84,25 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); */ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) { - return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); + return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0); } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); +/** + * rt_mutex_lock_killable - lock a rt_mutex killable + * + * @lock: the rt_mutex to be locked + * + * Returns: + * 0 on success + * -EINTR when interrupted by a signal + */ +int __sched rt_mutex_lock_killable(struct rt_mutex *lock) +{ + return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0); +} +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); + /** * rt_mutex_trylock - try to lock a rt_mutex * -- cgit v1.2.3 From 0c1d7a2c2d32fac7ff4a644724b2d52a64184645 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Nov 2021 18:46:48 +0100 Subject: lockdep: Remove softirq accounting on PREEMPT_RT. There is not really a softirq context on PREEMPT_RT. Softirqs on PREEMPT_RT are always invoked within the context of a threaded interrupt handler or within ksoftirqd. The "in-softirq" context is preemptible and is protected by a per-CPU lock to ensure mutual exclusion. There is no difference on PREEMPT_RT between spin_lock_irq() and spin_lock() because the former does not disable interrupts. Therefore if a lock is used in_softirq() and locked once with spin_lock_irq() then lockdep will report this with "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage". Teach lockdep that we don't really do softirqs on -RT. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211129174654.668506-6-bigeasy@linutronix.de --- include/linux/irqflags.h | 23 +++++++++++++++-------- kernel/locking/lockdep.c | 2 ++ 2 files changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 600c10da321a..4b140938b03e 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -71,14 +71,6 @@ do { \ do { \ __this_cpu_dec(hardirq_context); \ } while (0) -# define lockdep_softirq_enter() \ -do { \ - current->softirq_context++; \ -} while (0) -# define lockdep_softirq_exit() \ -do { \ - current->softirq_context--; \ -} while (0) # define lockdep_hrtimer_enter(__hrtimer) \ ({ \ @@ -140,6 +132,21 @@ do { \ # define lockdep_irq_work_exit(__work) do { } while (0) #endif +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT) +# define lockdep_softirq_enter() \ +do { \ + current->softirq_context++; \ +} while (0) +# define lockdep_softirq_exit() \ +do { \ + current->softirq_context--; \ +} while (0) + +#else +# define lockdep_softirq_enter() do { } while (0) +# define lockdep_softirq_exit() do { } while (0) +#endif + #if defined(CONFIG_IRQSOFF_TRACER) || \ defined(CONFIG_PREEMPT_TRACER) extern void stop_critical_timings(void); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 2270ec68f10a..4a882f83aeb9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5485,6 +5485,7 @@ static noinstr void check_flags(unsigned long flags) } } +#ifndef CONFIG_PREEMPT_RT /* * We dont accurately track softirq state in e.g. * hardirq contexts (such as on 4KSTACKS), so only @@ -5499,6 +5500,7 @@ static noinstr void check_flags(unsigned long flags) DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } } +#endif if (!debug_locks) print_irqtrace_events(current); -- cgit v1.2.3 From c0bed69daf4b67809b58cc7cd81a8fa4f45bc161 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 3 Dec 2021 15:59:34 +0800 Subject: locking: Make owner_on_cpu() into Move the owner_on_cpu() from kernel/locking/rwsem.c into include/linux/sched.h with under CONFIG_SMP, then use it in the mutex/rwsem/rtmutex to simplify the code. Signed-off-by: Kefeng Wang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20211203075935.136808-2-wangkefeng.wang@huawei.com --- include/linux/sched.h | 9 +++++++++ kernel/locking/mutex.c | 11 ++--------- kernel/locking/rtmutex.c | 5 ++--- kernel/locking/rwsem.c | 9 --------- 4 files changed, 13 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 78c351e35fec..ff609d9c2f21 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2171,6 +2171,15 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #endif #ifdef CONFIG_SMP +static inline bool owner_on_cpu(struct task_struct *owner) +{ + /* + * As lock holder preemption issue, we both skip spinning if + * task is not on cpu or its cpu is preempted + */ + return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); +} + /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu, unsigned long max); #endif /* CONFIG_SMP */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index db1913611192..5e3585950ec8 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -367,8 +367,7 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, /* * Use vcpu_is_preempted to detect lock holder preemption issue. */ - if (!owner->on_cpu || need_resched() || - vcpu_is_preempted(task_cpu(owner))) { + if (!owner_on_cpu(owner) || need_resched()) { ret = false; break; } @@ -403,14 +402,8 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) * structure won't go away during the spinning period. */ owner = __mutex_owner(lock); - - /* - * As lock holder preemption issue, we both skip spinning if task is not - * on cpu or its cpu is preempted - */ - if (owner) - retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); + retval = owner_on_cpu(owner); /* * If lock->owner is not set, the mutex has been released. Return true diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index f89620852774..0c1f2e3f019a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1382,9 +1382,8 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, * for CONFIG_PREEMPT_RCU=y) * - the VCPU on which owner runs is preempted */ - if (!owner->on_cpu || need_resched() || - rt_mutex_waiter_is_top_waiter(lock, waiter) || - vcpu_is_preempted(task_cpu(owner))) { + if (!owner_on_cpu(owner) || need_resched() || + rt_mutex_waiter_is_top_waiter(lock, waiter)) { res = false; break; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index c51387a43265..b92d0a830568 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -613,15 +613,6 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) return false; } -static inline bool owner_on_cpu(struct task_struct *owner) -{ - /* - * As lock holder preemption issue, we both skip spinning if - * task is not on cpu or its cpu is preempted - */ - return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); -} - static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; -- cgit v1.2.3 From 866de407444398bc8140ea70de1dba5f91cc34ac Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 3 Dec 2021 13:30:01 +0800 Subject: bpf: Disallow BPF_LOG_KERNEL log level for bpf(BPF_BTF_LOAD) BPF_LOG_KERNEL is only used internally, so disallow bpf_btf_load() to set log level as BPF_LOG_KERNEL. The same checking has already been done in bpf_check(), so factor out a helper to check the validity of log attributes and use it in both places. Fixes: 8580ac9404f6 ("bpf: Process in-kernel BTF") Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20211203053001.740945-1-houtao1@huawei.com --- include/linux/bpf_verifier.h | 7 +++++++ kernel/bpf/btf.c | 3 +-- kernel/bpf/verifier.c | 6 +++--- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c8a78e830fca..182b16a91084 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -396,6 +396,13 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) log->level == BPF_LOG_KERNEL); } +static inline bool +bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) +{ + return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 && + log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK); +} + #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_info { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 2a902a946f70..36a5cc0f53c6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4473,8 +4473,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, log->len_total = log_size; /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 || - !log->level || !log->ubuf) { + if (!bpf_verifier_log_attr_valid(log)) { err = -EINVAL; goto errout; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6522ffdea487..1126b75fe650 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14050,11 +14050,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) log->ubuf = (char __user *) (unsigned long) attr->log_buf; log->len_total = attr->log_size; - ret = -EINVAL; /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 || - !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) + if (!bpf_verifier_log_attr_valid(log)) { + ret = -EINVAL; goto err_unlock; + } } if (IS_ERR(btf_vmlinux)) { -- cgit v1.2.3 From ed758b30d541e9bf713cd58612a4414e57dc6d73 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 22 Nov 2021 14:26:45 +0100 Subject: printk/console: Split out code that enables default console Put the code enabling a console by default into a separate function called try_enable_default_console(). Rename try_enable_new_console() to try_enable_preferred_console() to make the purpose of the different variants more clear. It is a code refactoring without any functional change. Signed-off-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Link: https://lore.kernel.org/r/20211122132649.12737-2-pmladek@suse.com --- kernel/printk/printk.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 57b132b658e1..1acbe39dd47c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2861,7 +2861,8 @@ early_param("keep_bootcon", keep_bootcon_setup); * Care need to be taken with consoles that are statically * enabled such as netconsole */ -static int try_enable_new_console(struct console *newcon, bool user_specified) +static int try_enable_preferred_console(struct console *newcon, + bool user_specified) { struct console_cmdline *c; int i, err; @@ -2909,6 +2910,23 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) return -ENOENT; } +/* Try to enable the console unconditionally */ +static void try_enable_default_console(struct console *newcon) +{ + if (newcon->index < 0) + newcon->index = 0; + + if (newcon->setup && newcon->setup(newcon, NULL) != 0) + return; + + newcon->flags |= CON_ENABLED; + + if (newcon->device) { + newcon->flags |= CON_CONSDEV; + has_preferred_console = true; + } +} + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -2964,25 +2982,15 @@ void register_console(struct console *newcon) * didn't select a console we take the first one * that registers here. */ - if (!has_preferred_console) { - if (newcon->index < 0) - newcon->index = 0; - if (newcon->setup == NULL || - newcon->setup(newcon, NULL) == 0) { - newcon->flags |= CON_ENABLED; - if (newcon->device) { - newcon->flags |= CON_CONSDEV; - has_preferred_console = true; - } - } - } + if (!has_preferred_console) + try_enable_default_console(newcon); /* See if this console matches one we selected on the command line */ - err = try_enable_new_console(newcon, true); + err = try_enable_preferred_console(newcon, true); /* If not, try to match against the platform default(s) */ if (err == -ENOENT) - err = try_enable_new_console(newcon, false); + err = try_enable_preferred_console(newcon, false); /* printk() messages are not printed to the Braille console. */ if (err || newcon->flags & CON_BRL) -- cgit v1.2.3 From a6953370d2fcf8c3878f1588771d20d3d972fcf3 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 22 Nov 2021 14:26:46 +0100 Subject: printk/console: Rename has_preferred_console to need_default_console The logic around the variable @has_preferred_console made my head spin many times. Part of the problem is the ambiguous name. There is the variable @preferred_console. It points to the last non-braille console in @console_cmdline array. This array contains consoles preferred via the command line, device tree, or SPCR. Then there is the variable @has_preferred_console. It is set to "true" when @preferred_console is enabled or when a console with tty binding gets enabled by default. It might get reset back by the magic condition: if (!has_preferred_console || bcon || !console_drivers) has_preferred_console = preferred_console >= 0; It is a puzzle. Dumb explanation is that it gets re-evaluated when: + it was not set before (see above when it gets set) + there is still an early console enabled (bcon) + there is no console enabled (!console_drivers) This is still a puzzle. It gets more clear when we see where the value is checked. The only meaning of the variable is to decide whether we should try to enable the new console by default. Rename the variable according to the single situation where the value is checked. The rename requires an inverted logic. Otherwise, it is a simple search & replace. It does not change the functionality. Signed-off-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Link: https://lore.kernel.org/r/20211122132649.12737-3-pmladek@suse.com --- kernel/printk/printk.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1acbe39dd47c..4c5f496877b0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -280,7 +280,7 @@ static struct console *exclusive_console; static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; -static bool has_preferred_console; +static bool need_default_console = true; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); @@ -2894,7 +2894,7 @@ static int try_enable_preferred_console(struct console *newcon, newcon->flags |= CON_ENABLED; if (i == preferred_console) { newcon->flags |= CON_CONSDEV; - has_preferred_console = true; + need_default_console = false; } return 0; } @@ -2923,7 +2923,7 @@ static void try_enable_default_console(struct console *newcon) if (newcon->device) { newcon->flags |= CON_CONSDEV; - has_preferred_console = true; + need_default_console = false; } } @@ -2974,15 +2974,15 @@ void register_console(struct console *newcon) if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; - if (!has_preferred_console || bcon || !console_drivers) - has_preferred_console = preferred_console >= 0; + if (need_default_console || bcon || !console_drivers) + need_default_console = preferred_console < 0; /* * See if we want to use this console driver. If we * didn't select a console we take the first one * that registers here. */ - if (!has_preferred_console) + if (need_default_console) try_enable_default_console(newcon); /* See if this console matches one we selected on the command line */ -- cgit v1.2.3 From f873efe841f813303e8a4af0d4cc48ff1f43bbe2 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 22 Nov 2021 14:26:47 +0100 Subject: printk/console: Remove unnecessary need_default_console manipulation There is no need to clear @need_default_console when a console preferred by the command line, device tree, or SPCR, gets enabled. The code is called only when some non-braille console matched a console in @console_cmdline array. It means that a non-braille console was added in __add_preferred_console() and the variable preferred_console is set to a number >= 0. As a result, @need_default_console is always set to "false" in the magic condition: if (need_default_console || bcon || !console_drivers) need_default_console = preferred_console < 0; This is one small step in removing the above magic condition that is hard to follow. The patch removes one superfluous assignment and should not change the functionality. Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211122132649.12737-4-pmladek@suse.com --- kernel/printk/printk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4c5f496877b0..3f845daa3a4a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2892,10 +2892,8 @@ static int try_enable_preferred_console(struct console *newcon, return err; } newcon->flags |= CON_ENABLED; - if (i == preferred_console) { + if (i == preferred_console) newcon->flags |= CON_CONSDEV; - need_default_console = false; - } return 0; } -- cgit v1.2.3 From 4f546939259f8e1130b60553433892774a42ea68 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 22 Nov 2021 14:26:48 +0100 Subject: printk/console: Remove need_default_console variable The variable @need_default_console is used to decide whether a newly registered console should get enabled by default. The logic is complicated. It can be modified in a register_console() call. But it is always re-evaluated in the next call by the following condition: if (need_default_console || bcon || !console_drivers) need_default_console = preferred_console < 0; In short, the value is updated when either of the condition is valid: + the value is still, or again, "true" + boot/early console is still the first in @console_driver list + @console_driver list is empty The value is updated according to @preferred_console. In particular, it is set to "false" when a @preferred_console was set by __add_preferred_console(). This happens when a non-braille console was added via the command line, device tree, or SPCR. It far from clear what this all means together. Let's look at @need_default_console from another angle: 1. The value is "true" by default. It means that it is always set according to @preferred_console during the first register_console() call. By other words, the first register_console() call will register the console by default only when none non-braille console was defined via the command line, device tree, or SPCR. 2. The value will always stay "false" when @preferred_console is set. By other words, try_enable_default_console() will never get called when a non-braille console is explicitly required. 4. The value might be set to "false" in try_enable_default_console() when a console with tty binding (driver) gets enabled. In this case CON_CONSDEV is set as well. It causes that the console will be inserted as first into the list @console_driver. It might be either real or boot/early console. 5. The value will be set _back_ to "true" in the next register_console() call when: + The console added by the previous register_console() had been a boot/early one. + The last console has been unregistered in the meantime and a boot/early console became first in @console_drivers list again. Or the list became empty. By other words, the value will stay "false" only when the last registered console was real, had tty binding, and was not removed in the mean time. The main logic looks clear: + Consoles are enabled by default only when no one is preferred via the command line, device tree, or SPCR. + By default, any console is enabled until a real console with tty binding gets registered. The behavior when the real console with tty binding is later removed is a bit unclear: + By default, any new console is registered again only when there is no console or the first console in the list is a boot one. The question is why the code is suddenly happy when a real console without tty binding is the first in the list. It looks like an overlook and bug. Conclusion: The state of @preferred_console and the first console in @console_driver list should be enough to decide whether we need to enable the given console by default. The rules are simple. New consoles are _not_ enabled by default when either of the following conditions is true: + @preferred_console is set. It means that a non-braille console is explicitly configured via the command line, device tree, or SPCR. + A real console with tty binding is registered. Such a console will have CON_CONSDEV flag set and will always be the first in @console_drivers list. Note: The new code does not use @bcon variable. The meaning of the variable is far from clear. The direct check of the first console in the list makes it more clear that only real console fulfills requirements of the default console. Behavior change: As already discussed above. There was one situation where the original code worked a strange way. Let's have: + console A: real console without tty binding + console B: real console with tty binding and do: register_console(A); /* 1st step */ register_console(B); /* 2nd step */ unregister_console(B); /* 3rd step */ register_console(B); /* 4th step */ The original code will not register the console B in the 4th step. @need_default_console is set to "false" in 2nd step. The real console with tty binding (driver) is then removed in the 3rd step. But @need_default_console will stay "false" in the 4th step because there is no boot/early console and @registered_consoles list is not empty. The new code will register the console B in the 4th step because it checks whether the first console has tty binding (->driver) This behavior change should acceptable: 1. The scenario requires manual intervention (console removal). The system should boot with the same consoles as before. 2. Console B is registered again probably because the user wants to use it. The most likely scenario is that the related module is reloaded. 3. It makes the behavior more consistent and predictable. Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211122132649.12737-5-pmladek@suse.com --- kernel/printk/printk.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 3f845daa3a4a..6591da285a83 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -280,7 +280,6 @@ static struct console *exclusive_console; static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; -static bool need_default_console = true; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); @@ -2919,10 +2918,8 @@ static void try_enable_default_console(struct console *newcon) newcon->flags |= CON_ENABLED; - if (newcon->device) { + if (newcon->device) newcon->flags |= CON_CONSDEV; - need_default_console = false; - } } /* @@ -2972,16 +2969,24 @@ void register_console(struct console *newcon) if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; - if (need_default_console || bcon || !console_drivers) - need_default_console = preferred_console < 0; - /* - * See if we want to use this console driver. If we - * didn't select a console we take the first one - * that registers here. + * See if we want to enable this console driver by default. + * + * Nope when a console is preferred by the command line, device + * tree, or SPCR. + * + * The first real console with tty binding (driver) wins. More + * consoles might get enabled before the right one is found. + * + * Note that a console with tty binding will have CON_CONSDEV + * flag set and will be first in the list. */ - if (need_default_console) - try_enable_default_console(newcon); + if (preferred_console < 0) { + if (!console_drivers || !console_drivers->device || + console_drivers->flags & CON_BOOT) { + try_enable_default_console(newcon); + } + } /* See if this console matches one we selected on the command line */ err = try_enable_preferred_console(newcon, true); -- cgit v1.2.3 From 5e8ba485b2522808ab2d65208839e1c915e113dd Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 22 Nov 2021 14:26:49 +0100 Subject: printk/console: Clean up boot console handling in register_console() The variable @bcon has two meanings. It is used several times for iterating the list of registered consoles. In the meantime, it holds the information whether a boot console is first in @console_drivers list. The information about the 1st console driver used to be important for the decision whether to install the new console by default or not. It allowed to re-evaluate the variable @need_default_console when a real console with tty binding has been unregistered in the meantime. The decision about the default console is not longer affected by @bcon variable. The current code checks whether the first driver is real and has tty binding directly. The information about the first console is still used for two more decisions: 1. It prevents duplicate output on non-boot consoles with CON_CONSDEV flag set. 2. Early/boot consoles are unregistered when a real console with CON_CONSDEV is registered and @keep_bootcon is not set. The behavior in the real life is far from obvious. @bcon is set according to the first console @console_drivers list. But the first position in the list is special: 1. Consoles with CON_CONSDEV flag are put at the beginning of the list. It is either the preferred console or any console with tty binding registered by default. 2. Another console might become the first in the list when the first console in the list is unregistered. It might happen either explicitly or automatically when boot consoles are unregistered. There is one more important rule: + Boot consoles can't be registered when any real console is already registered. It is a puzzle. The main complication is the dependency on the first position is the list and the complicated rules around it. Let's try to make it easier: 1. Add variable @bootcon_enabled and set it by iterating all registered consoles. The variable has obvious meaning and more predictable behavior. Any speed optimization and other tricks are not worth it. 2. Use a generic name for the variable that is used to iterate the list on registered console drivers. Behavior change: No, maybe surprisingly, there is _no_ behavior change! Let's provide the proof by contradiction. Both operations, duplicate output prevention and boot consoles removal, are done only when the newly added console has CON_CONSDEV flag set. The behavior would change when the new @bootcon_enabled has different value than the original @bcon. By other words, the behavior would change when the following conditions are true: + a console with CON_CONSDEV flag is added + a real (non-boot) console is the first in the list + a boot console is later in the list Now, a real console might be first in the list only when: + It was the first registered console. In this case, there can't be any boot console because any later ones were rejected. + It was put at the first position because it had CON_CONSDEV flag set. It was either the preferred console or it was a console with tty binding registered by default. We are interested only in a real consoles here. And real console with tty binding fulfills conditions of the default console. Now, there is always only one console that is either preferred or fulfills conditions of the default console. It can't be already in the list and being registered at the same time. As a result, the above three conditions could newer be "true" at the same time. Therefore the behavior can't change. Final dilemma: OK, the new code has the same behavior. But is the change in the right direction? What if the handling of @console_drivers is updated in the future? OK, let's look at it from another angle: 1. The ordering of @console_drivers list is important only in console_device() function. The first console driver with tty binding gets associated with /dev/console. 2. CON_CONSDEV flag is shown in /proc/consoles. And it should be set for the driver that is returned by console_device(). 3. A boot console is removed and the duplicated output is prevented when the real console with CON_CONSDEV flag is registered. Now, in the ideal world: + The driver associated with /dev/console should be either a console preferred via the command line, device tree, or SPCR. Or it should be the first real console with tty binding registered by default. + The code should match the related boot and real console drivers. It should unregister only the obsolete boot driver. And the duplicated output should be prevented only on the related real driver. It is clear that it is not guaranteed by the current code. Instead, the current code looks like a maze of heuristics that try to achieve the above. It is result of adding several features over last few decades. For example, a possibility to register more consoles, unregister consoles, boot consoles, consoles without tty binding, device tree, SPCR, braille consoles. Anyway, there is no reason why the decision, about removing boot consoles and preventing duplicated output, should depend on the first console in the list. The current code does the decisions primary by CON_CONSDEV flag that is used for the preferred console. It looks like a good compromise. And the change seems to be in the right direction. Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211122132649.12737-6-pmladek@suse.com --- kernel/printk/printk.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 6591da285a83..155229f0cf0f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2943,31 +2943,30 @@ static void try_enable_default_console(struct console *newcon) */ void register_console(struct console *newcon) { - struct console *bcon = NULL; + struct console *con; + bool bootcon_enabled = false; + bool realcon_enabled = false; int err; - for_each_console(bcon) { - if (WARN(bcon == newcon, "console '%s%d' already registered\n", - bcon->name, bcon->index)) + for_each_console(con) { + if (WARN(con == newcon, "console '%s%d' already registered\n", + con->name, con->index)) return; } - /* - * before we register a new CON_BOOT console, make sure we don't - * already have a valid console - */ - if (newcon->flags & CON_BOOT) { - for_each_console(bcon) { - if (!(bcon->flags & CON_BOOT)) { - pr_info("Too late to register bootconsole %s%d\n", - newcon->name, newcon->index); - return; - } - } + for_each_console(con) { + if (con->flags & CON_BOOT) + bootcon_enabled = true; + else + realcon_enabled = true; } - if (console_drivers && console_drivers->flags & CON_BOOT) - bcon = console_drivers; + /* Do not register boot consoles when there already is a real one. */ + if (newcon->flags & CON_BOOT && realcon_enabled) { + pr_info("Too late to register bootconsole %s%d\n", + newcon->name, newcon->index); + return; + } /* * See if we want to enable this console driver by default. @@ -3005,8 +3004,10 @@ void register_console(struct console *newcon) * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) + if (bootcon_enabled && + ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { newcon->flags &= ~CON_PRINTBUFFER; + } /* * Put this console in the list - keep the @@ -3062,15 +3063,15 @@ void register_console(struct console *newcon) pr_info("%sconsole [%s%d] enabled\n", (newcon->flags & CON_BOOT) ? "boot" : "" , newcon->name, newcon->index); - if (bcon && + if (bootcon_enabled && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { /* We need to iterate through all boot consoles, to make * sure we print everything out, before we unregister them. */ - for_each_console(bcon) - if (bcon->flags & CON_BOOT) - unregister_console(bcon); + for_each_console(con) + if (con->flags & CON_BOOT) + unregister_console(con); } } EXPORT_SYMBOL(register_console); -- cgit v1.2.3 From db52f57211b4e45f0ebb274e2c877b211dc18591 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Mon, 6 Dec 2021 13:03:15 +0530 Subject: bpf: Remove config check to enable bpf support for branch records Branch data available to BPF programs can be very useful to get stack traces out of userspace application. Commit fff7b64355ea ("bpf: Add bpf_read_branch_records() helper") added BPF support to capture branch records in x86. Enable this feature also for other architectures as well by removing checks specific to x86. If an architecture doesn't support branch records, bpf_read_branch_records() still has appropriate checks and it will return an -EINVAL in that scenario. Based on UAPI helper doc in include/uapi/linux/bpf.h, unsupported architectures should return -ENOENT in such case. Hence, update the appropriate check to return -ENOENT instead. Selftest 'perf_branches' result on power9 machine which has the branch stacks support: - Before this patch: [command]# ./test_progs -t perf_branches #88/1 perf_branches/perf_branches_hw:FAIL #88/2 perf_branches/perf_branches_no_hw:OK #88 perf_branches:FAIL Summary: 0/1 PASSED, 0 SKIPPED, 1 FAILED - After this patch: [command]# ./test_progs -t perf_branches #88/1 perf_branches/perf_branches_hw:OK #88/2 perf_branches/perf_branches_no_hw:OK #88 perf_branches:OK Summary: 1/2 PASSED, 0 SKIPPED, 0 FAILED Selftest 'perf_branches' result on power9 machine which doesn't have branch stack report: - After this patch: [command]# ./test_progs -t perf_branches #88/1 perf_branches/perf_branches_hw:SKIP #88/2 perf_branches/perf_branches_no_hw:OK #88 perf_branches:OK Summary: 1/1 PASSED, 1 SKIPPED, 0 FAILED Fixes: fff7b64355eac ("bpf: Add bpf_read_branch_records() helper") Suggested-by: Peter Zijlstra Signed-off-by: Kajol Jain Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20211206073315.77432-1-kjain@linux.ibm.com --- kernel/trace/bpf_trace.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 25ea521fb8f1..77f13de6f9f9 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1404,9 +1404,6 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { -#ifndef CONFIG_X86 - return -ENOENT; -#else static const u32 br_entry_size = sizeof(struct perf_branch_entry); struct perf_branch_stack *br_stack = ctx->data->br_stack; u32 to_copy; @@ -1415,7 +1412,7 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, return -EINVAL; if (unlikely(!br_stack)) - return -EINVAL; + return -ENOENT; if (flags & BPF_F_GET_BRANCH_RECORDS_SIZE) return br_stack->nr * br_entry_size; @@ -1427,7 +1424,6 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, memcpy(buf, br_stack->entries, to_copy); return to_copy; -#endif } static const struct bpf_func_proto bpf_read_branch_records_proto = { -- cgit v1.2.3 From f2b20c66274dafd57f1a9221aae84640319685a4 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 8 Nov 2021 20:15:13 +0000 Subject: tracing: Fix spelling mistake "aritmethic" -> "arithmetic" There is a spelling mistake in the tracing mini-HOWTO text. Fix it. Link: https://lkml.kernel.org/r/20211108201513.42876-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 88de94da596b..4821fe6a40a5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5635,7 +5635,7 @@ static const char readme_msg[] = "\t - a numeric literal: e.g. ms_per_sec=1000,\n" "\t - an arithmetic expression: e.g. time_secs=current_timestamp/1000\n" "\n" - "\t hist trigger aritmethic expressions support addition(+), subtraction(-),\n" + "\t hist trigger arithmetic expressions support addition(+), subtraction(-),\n" "\t multiplication(*) and division(/) operators. An operand can be either a\n" "\t variable reference, field or numeric literal.\n" "\n" -- cgit v1.2.3 From 05770dd0ad110854c7157d95700d7c89979cdb3e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 22 Nov 2021 18:30:12 +0900 Subject: tracing: Support __rel_loc relative dynamic data location attribute Add '__rel_loc' new dynamic data location attribute which encodes the data location from the next to the field itself. The '__data_loc' is used for encoding the dynamic data location on the trace event record. But '__data_loc' is not useful if the writer doesn't know the event header (e.g. user event), because it records the dynamic data offset from the entry of the record, not the field itself. This new '__rel_loc' attribute encodes the data location relatively from the next of the field. For example, when there is a record like below (the number in the parentheses is the size of fields) |header(N)|common(M)|fields(K)|__data_loc(4)|fields(L)|data(G)| In this case, '__data_loc' field will be __data_loc = (G << 16) | (N+M+K+4+L) If '__rel_loc' is used, this will be |header(N)|common(M)|fields(K)|__rel_loc(4)|fields(L)|data(G)| where __rel_loc = (G << 16) | (L) This case shows L bytes after the '__rel_loc' attribute field, if there is no fields after the __rel_loc field, L must be 0. This is relatively easy (and no need to consider the kernel header change) when the event data fields are composed by user who doesn't know header and common fields. Link: https://lkml.kernel.org/r/163757341258.510314.4214431827833229956.stgit@devnote2 Cc: Beau Belgrave Cc: Namhyung Kim Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_events.h | 1 + kernel/trace/trace.h | 1 + kernel/trace/trace_events_filter.c | 32 ++++++++++++++++++++++++++++++-- kernel/trace/trace_events_hist.c | 21 +++++++++++++++++++-- kernel/trace/trace_events_inject.c | 11 +++++++++-- 5 files changed, 60 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2d167ac3452c..3900404aa063 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -782,6 +782,7 @@ enum { FILTER_OTHER = 0, FILTER_STATIC_STRING, FILTER_DYN_STRING, + FILTER_RDYN_STRING, FILTER_PTR_STRING, FILTER_TRACE_FN, FILTER_COMM, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 38715aa6cfdf..5db2bec8ca7e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1465,6 +1465,7 @@ struct filter_pred { static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING || field->filter_type == FILTER_STATIC_STRING || field->filter_type == FILTER_PTR_STRING || field->filter_type == FILTER_COMM; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index c9124038b140..996920ed1812 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -706,6 +706,29 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event) return match; } +/* + * Filter predicate for relative dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry as same as dynamic string. + * The difference is that the relative one records the location offset + * from the field itself, not the event entry. + */ +static int filter_pred_strrelloc(struct filter_pred *pred, void *event) +{ + u32 *item = (u32 *)(event + pred->offset); + u32 str_item = *item; + int str_loc = str_item & 0xffff; + int str_len = str_item >> 16; + char *addr = (char *)(&item[1]) + str_loc; + int cmp, match; + + cmp = pred->regex.match(addr, &pred->regex, str_len); + + match = cmp ^ pred->not; + + return match; +} + /* Filter predicate for CPUs. */ static int filter_pred_cpu(struct filter_pred *pred, void *event) { @@ -756,7 +779,7 @@ static int filter_pred_none(struct filter_pred *pred, void *event) * * Note: * - @str might not be NULL-terminated if it's of type DYN_STRING - * or STATIC_STRING, unless @len is zero. + * RDYN_STRING, or STATIC_STRING, unless @len is zero. */ static int regex_match_full(char *str, struct regex *r, int len) @@ -1083,6 +1106,9 @@ int filter_assign_type(const char *type) if (strstr(type, "__data_loc") && strstr(type, "char")) return FILTER_DYN_STRING; + if (strstr(type, "__rel_loc") && strstr(type, "char")) + return FILTER_RDYN_STRING; + if (strchr(type, '[') && strstr(type, "char")) return FILTER_STATIC_STRING; @@ -1318,8 +1344,10 @@ static int parse_pred(const char *str, void *data, pred->fn = filter_pred_string; pred->regex.field_len = field->size; - } else if (field->filter_type == FILTER_DYN_STRING) + } else if (field->filter_type == FILTER_DYN_STRING) { pred->fn = filter_pred_strloc; + } else if (field->filter_type == FILTER_RDYN_STRING) + pred->fn = filter_pred_strrelloc; else pred->fn = filter_pred_pchar; /* go past the last quote */ diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 319f9c8ca7e7..9b8da439149c 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -217,6 +217,20 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, return (u64)(unsigned long)addr; } +static u64 hist_field_reldynstring(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + u32 *item = event + hist_field->field->offset; + u32 str_item = *item; + int str_loc = str_item & 0xffff; + char *addr = (char *)&item[1] + str_loc; + + return (u64)(unsigned long)addr; +} + static u64 hist_field_pstring(struct hist_field *hist_field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -1956,8 +1970,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (field->filter_type == FILTER_STATIC_STRING) { hist_field->fn = hist_field_string; hist_field->size = field->size; - } else if (field->filter_type == FILTER_DYN_STRING) + } else if (field->filter_type == FILTER_DYN_STRING) { hist_field->fn = hist_field_dynstring; + } else if (field->filter_type == FILTER_RDYN_STRING) + hist_field->fn = hist_field_reldynstring; else hist_field->fn = hist_field_pstring; } else { @@ -4961,7 +4977,8 @@ static inline void add_to_key(char *compound_key, void *key, struct ftrace_event_field *field; field = key_field->field; - if (field->filter_type == FILTER_DYN_STRING) + if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) size = *(u32 *)(rec + field->offset) >> 16; else if (field->filter_type == FILTER_STATIC_STRING) size = field->size; diff --git a/kernel/trace/trace_events_inject.c b/kernel/trace/trace_events_inject.c index c188045c5f97..d6b4935a78c0 100644 --- a/kernel/trace/trace_events_inject.c +++ b/kernel/trace/trace_events_inject.c @@ -168,10 +168,14 @@ static void *trace_alloc_entry(struct trace_event_call *call, int *size) continue; if (field->filter_type == FILTER_STATIC_STRING) continue; - if (field->filter_type == FILTER_DYN_STRING) { + if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) { u32 *str_item; int str_loc = entry_size & 0xffff; + if (field->filter_type == FILTER_RDYN_STRING) + str_loc -= field->offset + field->size; + str_item = (u32 *)(entry + field->offset); *str_item = str_loc; /* string length is 0. */ } else { @@ -214,7 +218,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) if (field->filter_type == FILTER_STATIC_STRING) { strlcpy(entry + field->offset, addr, field->size); - } else if (field->filter_type == FILTER_DYN_STRING) { + } else if (field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_RDYN_STRING) { int str_len = strlen(addr) + 1; int str_loc = entry_size & 0xffff; u32 *str_item; @@ -229,6 +234,8 @@ static int parse_entry(char *str, struct trace_event_call *call, void **pentry) strlcpy(entry + (entry_size - str_len), addr, str_len); str_item = (u32 *)(entry + field->offset); + if (field->filter_type == FILTER_RDYN_STRING) + str_loc -= field->offset + field->size; *str_item = (str_len << 16) | str_loc; } else { char **paddr; -- cgit v1.2.3 From 55de2c0b5610cba5a5a93c0788031133c457e689 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 22 Nov 2021 18:30:21 +0900 Subject: tracing: Add '__rel_loc' using trace event macros Add '__rel_loc' using trace event macros. These macros are usually not used in the kernel, except for testing purpose. This also add "rel_" variant of macros for dynamic_array string, and bitmask. Link: https://lkml.kernel.org/r/163757342119.510314.816029622439099016.stgit@devnote2 Cc: Beau Belgrave Cc: Namhyung Kim Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/trace/bpf_probe.h | 16 ++++++ include/trace/perf.h | 16 ++++++ include/trace/trace_events.h | 120 ++++++++++++++++++++++++++++++++++++++++++- kernel/trace/trace.h | 3 ++ 4 files changed, 153 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index a8e97f84b652..7660a7846586 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -21,6 +21,22 @@ #undef __get_bitmask #define __get_bitmask(field) (char *)__get_dynamic_array(field) +#undef __get_rel_dynamic_array +#define __get_rel_dynamic_array(field) \ + ((void *)(&__entry->__rel_loc_##field) + \ + sizeof(__entry->__rel_loc_##field) + \ + (__entry->__rel_loc_##field & 0xffff)) + +#undef __get_rel_dynamic_array_len +#define __get_rel_dynamic_array_len(field) \ + ((__entry->__rel_loc_##field >> 16) & 0xffff) + +#undef __get_rel_str +#define __get_rel_str(field) ((char *)__get_rel_dynamic_array(field)) + +#undef __get_rel_bitmask +#define __get_rel_bitmask(field) (char *)__get_rel_dynamic_array(field) + #undef __perf_count #define __perf_count(c) (c) diff --git a/include/trace/perf.h b/include/trace/perf.h index dbc6c74defc3..ea4405de175a 100644 --- a/include/trace/perf.h +++ b/include/trace/perf.h @@ -21,6 +21,22 @@ #undef __get_bitmask #define __get_bitmask(field) (char *)__get_dynamic_array(field) +#undef __get_rel_dynamic_array +#define __get_rel_dynamic_array(field) \ + ((void *)(&__entry->__rel_loc_##field) + \ + sizeof(__entry->__rel_loc_##field) + \ + (__entry->__rel_loc_##field & 0xffff)) + +#undef __get_rel_dynamic_array_len +#define __get_rel_dynamic_array_len(field) \ + ((__entry->__rel_loc_##field >> 16) & 0xffff) + +#undef __get_rel_str +#define __get_rel_str(field) ((char *)__get_rel_dynamic_array(field)) + +#undef __get_rel_bitmask +#define __get_rel_bitmask(field) (char *)__get_rel_dynamic_array(field) + #undef __perf_count #define __perf_count(c) (__count = (c)) diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 08810a463880..8c6f7c433518 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -108,6 +108,18 @@ TRACE_MAKE_SYSTEM_STR(); #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(char, item, -1) +#undef __rel_dynamic_array +#define __rel_dynamic_array(type, item, len) u32 __rel_loc_##item; + +#undef __rel_string +#define __rel_string(item, src) __rel_dynamic_array(char, item, -1) + +#undef __rel_string_len +#define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, -1) + +#undef __rel_bitmask +#define __rel_bitmask(item, nr_bits) __rel_dynamic_array(char, item, -1) + #undef TP_STRUCT__entry #define TP_STRUCT__entry(args...) args @@ -200,11 +212,23 @@ TRACE_MAKE_SYSTEM_STR(); #undef __string #define __string(item, src) __dynamic_array(char, item, -1) +#undef __bitmask +#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) + #undef __string_len #define __string_len(item, src, len) __dynamic_array(char, item, -1) -#undef __bitmask -#define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) +#undef __rel_dynamic_array +#define __rel_dynamic_array(type, item, len) u32 item; + +#undef __rel_string +#define __rel_string(item, src) __rel_dynamic_array(char, item, -1) + +#undef __rel_string_len +#define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, -1) + +#undef __rel_bitmask +#define __rel_bitmask(item, nr_bits) __rel_dynamic_array(unsigned long, item, -1) #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ @@ -293,6 +317,19 @@ TRACE_MAKE_SYSTEM_STR(); #undef __get_str #define __get_str(field) ((char *)__get_dynamic_array(field)) +#undef __get_rel_dynamic_array +#define __get_rel_dynamic_array(field) \ + ((void *)(&__entry->__rel_loc_##field) + \ + sizeof(__entry->__rel_loc_##field) + \ + (__entry->__rel_loc_##field & 0xffff)) + +#undef __get_rel_dynamic_array_len +#define __get_rel_dynamic_array_len(field) \ + ((__entry->__rel_loc_##field >> 16) & 0xffff) + +#undef __get_rel_str +#define __get_rel_str(field) ((char *)__get_rel_dynamic_array(field)) + #undef __get_bitmask #define __get_bitmask(field) \ ({ \ @@ -302,6 +339,15 @@ TRACE_MAKE_SYSTEM_STR(); trace_print_bitmask_seq(p, __bitmask, __bitmask_size); \ }) +#undef __get_rel_bitmask +#define __get_rel_bitmask(field) \ + ({ \ + void *__bitmask = __get_rel_dynamic_array(field); \ + unsigned int __bitmask_size; \ + __bitmask_size = __get_rel_dynamic_array_len(field); \ + trace_print_bitmask_seq(p, __bitmask, __bitmask_size); \ + }) + #undef __print_flags #define __print_flags(flag, delim, flag_array...) \ ({ \ @@ -471,6 +517,21 @@ static struct trace_event_functions trace_event_type_funcs_##call = { \ #undef __bitmask #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, -1) +#undef __rel_dynamic_array +#define __rel_dynamic_array(_type, _item, _len) { \ + .type = "__rel_loc " #_type "[]", .name = #_item, \ + .size = 4, .align = 4, \ + .is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }, + +#undef __rel_string +#define __rel_string(item, src) __rel_dynamic_array(char, item, -1) + +#undef __rel_string_len +#define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, -1) + +#undef __rel_bitmask +#define __rel_bitmask(item, nr_bits) __rel_dynamic_array(unsigned long, item, -1) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print) \ static struct trace_event_fields trace_event_fields_##call[] = { \ @@ -519,6 +580,22 @@ static struct trace_event_fields trace_event_fields_##call[] = { \ #undef __string_len #define __string_len(item, src, len) __dynamic_array(char, item, (len) + 1) +#undef __rel_dynamic_array +#define __rel_dynamic_array(type, item, len) \ + __item_length = (len) * sizeof(type); \ + __data_offsets->item = __data_size + \ + offsetof(typeof(*entry), __data) - \ + offsetof(typeof(*entry), __rel_loc_##item) - \ + sizeof(u32); \ + __data_offsets->item |= __item_length << 16; \ + __data_size += __item_length; + +#undef __rel_string +#define __rel_string(item, src) __rel_dynamic_array(char, item, \ + strlen((src) ? (const char *)(src) : "(null)") + 1) + +#undef __rel_string_len +#define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, (len) + 1) /* * __bitmask_size_in_bytes_raw is the number of bytes needed to hold * num_possible_cpus(). @@ -542,6 +619,10 @@ static struct trace_event_fields trace_event_fields_##call[] = { \ #define __bitmask(item, nr_bits) __dynamic_array(unsigned long, item, \ __bitmask_size_in_longs(nr_bits)) +#undef __rel_bitmask +#define __rel_bitmask(item, nr_bits) __rel_dynamic_array(unsigned long, item, \ + __bitmask_size_in_longs(nr_bits)) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static inline notrace int trace_event_get_offsets_##call( \ @@ -706,6 +787,37 @@ static inline notrace int trace_event_get_offsets_##call( \ #define __assign_bitmask(dst, src, nr_bits) \ memcpy(__get_bitmask(dst), (src), __bitmask_size_in_bytes(nr_bits)) +#undef __rel_dynamic_array +#define __rel_dynamic_array(type, item, len) \ + __entry->__rel_loc_##item = __data_offsets.item; + +#undef __rel_string +#define __rel_string(item, src) __rel_dynamic_array(char, item, -1) + +#undef __rel_string_len +#define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, -1) + +#undef __assign_rel_str +#define __assign_rel_str(dst, src) \ + strcpy(__get_rel_str(dst), (src) ? (const char *)(src) : "(null)"); + +#undef __assign_rel_str_len +#define __assign_rel_str_len(dst, src, len) \ + do { \ + memcpy(__get_rel_str(dst), (src), (len)); \ + __get_rel_str(dst)[len] = '\0'; \ + } while (0) + +#undef __rel_bitmask +#define __rel_bitmask(item, nr_bits) __rel_dynamic_array(unsigned long, item, -1) + +#undef __get_rel_bitmask +#define __get_rel_bitmask(field) (char *)__get_rel_dynamic_array(field) + +#undef __assign_rel_bitmask +#define __assign_rel_bitmask(dst, src, nr_bits) \ + memcpy(__get_rel_bitmask(dst), (src), __bitmask_size_in_bytes(nr_bits)) + #undef TP_fast_assign #define TP_fast_assign(args...) args @@ -770,6 +882,10 @@ static inline void ftrace_test_probe_##call(void) \ #undef __get_dynamic_array_len #undef __get_str #undef __get_bitmask +#undef __get_rel_dynamic_array +#undef __get_rel_dynamic_array_len +#undef __get_rel_str +#undef __get_rel_bitmask #undef __print_array #undef __print_hex_dump diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5db2bec8ca7e..7162157b970b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -83,6 +83,9 @@ enum trace_type { #undef __dynamic_array #define __dynamic_array(type, item) type item[]; +#undef __rel_dynamic_array +#define __rel_dynamic_array(type, item) type item[]; + #undef F_STRUCT #define F_STRUCT(args...) args -- cgit v1.2.3 From e07a1d576239cf836070e740d4bd7c5e8a64868f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 29 Nov 2021 21:39:46 -0500 Subject: tracing: Use __this_cpu_read() in trace_event_buffer_lock_reserver() The value read by this_cpu_read() is used later and its use is expected to stay on the same CPU as being read. But this_cpu_read() does not warn if it is called without preemption disabled, where as __this_cpu_read() will check if preemption is disabled on CONFIG_DEBUG_PREEMPT Currently all callers have preemption disabled, but there may be new callers in the future that may not. Link: https://lkml.kernel.org/r/20211130024318.698165354@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4821fe6a40a5..2e87b7bf2ba7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2746,7 +2746,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, if (!tr->no_filter_buffering_ref && (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && - (entry = this_cpu_read(trace_buffered_event))) { + (entry = __this_cpu_read(trace_buffered_event))) { /* * Filtering is on, so try to use the per cpu buffer first. * This buffer will simulate a ring_buffer_event, -- cgit v1.2.3 From 6c536d76cfe63b79e9e468ef0876315420a19074 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 29 Nov 2021 21:39:47 -0500 Subject: tracing: Disable preemption when using the filter buffer In case trace_event_buffer_lock_reserve() is called with preemption enabled, the algorithm that defines the usage of the per cpu filter buffer may fail if the task schedules to another CPU after determining which buffer it will use. Disable preemption when using the filter buffer. And because that same buffer must be used throughout the call, keep preemption disabled until the filter buffer is released. This will also keep the semantics between the use case of when the filter buffer is used, and when the ring buffer itself is used, as that case also disables preemption until the ring buffer is released. Link: https://lkml.kernel.org/r/20211130024318.880190623@goodmis.org [ Fixed warning of assignment in if statement Reported-by: kernel test robot ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 59 +++++++++++++++++++++++++++++----------------------- kernel/trace/trace.h | 4 +++- 2 files changed, 36 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2e87b7bf2ba7..e3b8c906b7b4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -980,6 +980,8 @@ __buffer_unlock_commit(struct trace_buffer *buffer, struct ring_buffer_event *ev ring_buffer_write(buffer, event->array[0], &event->array[1]); /* Release the temp buffer */ this_cpu_dec(trace_buffered_event_cnt); + /* ring_buffer_unlock_commit() enables preemption */ + preempt_enable_notrace(); } else ring_buffer_unlock_commit(buffer, event); } @@ -2745,8 +2747,8 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, *current_rb = tr->array_buffer.buffer; if (!tr->no_filter_buffering_ref && - (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && - (entry = __this_cpu_read(trace_buffered_event))) { + (trace_file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED))) { + preempt_disable_notrace(); /* * Filtering is on, so try to use the per cpu buffer first. * This buffer will simulate a ring_buffer_event, @@ -2764,33 +2766,38 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, * is still quicker than no copy on match, but having * to discard out of the ring buffer on a failed match. */ - int max_len = PAGE_SIZE - struct_size(entry, array, 1); + if ((entry = __this_cpu_read(trace_buffered_event))) { + int max_len = PAGE_SIZE - struct_size(entry, array, 1); - val = this_cpu_inc_return(trace_buffered_event_cnt); + val = this_cpu_inc_return(trace_buffered_event_cnt); - /* - * Preemption is disabled, but interrupts and NMIs - * can still come in now. If that happens after - * the above increment, then it will have to go - * back to the old method of allocating the event - * on the ring buffer, and if the filter fails, it - * will have to call ring_buffer_discard_commit() - * to remove it. - * - * Need to also check the unlikely case that the - * length is bigger than the temp buffer size. - * If that happens, then the reserve is pretty much - * guaranteed to fail, as the ring buffer currently - * only allows events less than a page. But that may - * change in the future, so let the ring buffer reserve - * handle the failure in that case. - */ - if (val == 1 && likely(len <= max_len)) { - trace_event_setup(entry, type, trace_ctx); - entry->array[0] = len; - return entry; + /* + * Preemption is disabled, but interrupts and NMIs + * can still come in now. If that happens after + * the above increment, then it will have to go + * back to the old method of allocating the event + * on the ring buffer, and if the filter fails, it + * will have to call ring_buffer_discard_commit() + * to remove it. + * + * Need to also check the unlikely case that the + * length is bigger than the temp buffer size. + * If that happens, then the reserve is pretty much + * guaranteed to fail, as the ring buffer currently + * only allows events less than a page. But that may + * change in the future, so let the ring buffer reserve + * handle the failure in that case. + */ + if (val == 1 && likely(len <= max_len)) { + trace_event_setup(entry, type, trace_ctx); + entry->array[0] = len; + /* Return with preemption disabled */ + return entry; + } + this_cpu_dec(trace_buffered_event_cnt); } - this_cpu_dec(trace_buffered_event_cnt); + /* __trace_buffer_lock_reserve() disables preemption */ + preempt_enable_notrace(); } entry = __trace_buffer_lock_reserve(*current_rb, type, len, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7162157b970b..8bd1a815ce90 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1337,10 +1337,12 @@ __trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { if (this_cpu_read(trace_buffered_event) == event) { - /* Simply release the temp buffer */ + /* Simply release the temp buffer and enable preemption */ this_cpu_dec(trace_buffered_event_cnt); + preempt_enable_notrace(); return; } + /* ring_buffer_discard_commit() enables preemption */ ring_buffer_discard_commit(buffer, event); } -- cgit v1.2.3 From 3e8b1a29a0e8d300466cf2a23d2f6d41971c5a0c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 29 Nov 2021 21:39:48 -0500 Subject: tracing: Have eprobes use filtering logic of trace events The eprobes open code the reserving of the event on the ring buffer for ftrace instead of using the ftrace event wrappers, which means that it doesn't get affected by the filters, breaking the filtering logic on user space. Link: https://lkml.kernel.org/r/20211130024319.068451680@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_eprobe.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 928867f527e7..88487752d307 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -489,18 +489,12 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec) if (trace_trigger_soft_disabled(edata->file)) return; - fbuffer.trace_ctx = tracing_gen_ctx(); - fbuffer.trace_file = edata->file; - dsize = get_eprobe_size(&edata->ep->tp, rec); - fbuffer.regs = NULL; - - fbuffer.event = - trace_event_buffer_lock_reserve(&fbuffer.buffer, edata->file, - call->event.type, - sizeof(*entry) + edata->ep->tp.size + dsize, - fbuffer.trace_ctx); - if (!fbuffer.event) + + entry = trace_event_buffer_reserve(&fbuffer, edata->file, + sizeof(*entry) + edata->ep->tp.size + dsize); + + if (!entry) return; entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event); -- cgit v1.2.3 From 5e6cd84e2f8bd3619b5d8f3dd4b44c0086a6ce1d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 29 Nov 2021 21:39:49 -0500 Subject: tracing/kprobes: Do not open code event reserve logic As kprobe events use trace_event_buffer_commit() to commit the event to the ftrace ring buffer, for consistency, it should use trace_event_buffer_reserve() to allocate it, as the two functions are related. Link: https://lkml.kernel.org/r/20211130024319.257430762@goodmis.org Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 33272a7b6912..d10c01948e68 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1383,17 +1383,11 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs, if (trace_trigger_soft_disabled(trace_file)) return; - fbuffer.trace_ctx = tracing_gen_ctx(); - fbuffer.trace_file = trace_file; - dsize = __get_data_size(&tk->tp, regs); - fbuffer.event = - trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, - call->event.type, - sizeof(*entry) + tk->tp.size + dsize, - fbuffer.trace_ctx); - if (!fbuffer.event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tk->tp.size + dsize); + if (!entry) return; fbuffer.regs = regs; @@ -1430,16 +1424,11 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, if (trace_trigger_soft_disabled(trace_file)) return; - fbuffer.trace_ctx = tracing_gen_ctx(); - fbuffer.trace_file = trace_file; - dsize = __get_data_size(&tk->tp, regs); - fbuffer.event = - trace_event_buffer_lock_reserve(&fbuffer.buffer, trace_file, - call->event.type, - sizeof(*entry) + tk->tp.size + dsize, - fbuffer.trace_ctx); - if (!fbuffer.event) + + entry = trace_event_buffer_reserve(&fbuffer, trace_file, + sizeof(*entry) + tk->tp.size + dsize); + if (!entry) return; fbuffer.regs = regs; -- cgit v1.2.3 From b7d5eb267f8c234d6eda40e21c0105a1f6231d14 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 6 Dec 2021 16:24:40 -0500 Subject: tracing/uprobes: Use trace_event_buffer_reserve() helper To be consistent with kprobes and eprobes, use trace_event_buffer_reserver() and trace_event_buffer_commit(). This will ensure that any updates to trace events will also be implemented on uprobe events. Link: https://lkml.kernel.org/r/20211206162440.69fbf96c@gandalf.local.home Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f5f0039d31e5..a4d5c624fe79 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -949,8 +949,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, struct trace_event_file *trace_file) { struct uprobe_trace_entry_head *entry; - struct trace_buffer *buffer; - struct ring_buffer_event *event; + struct trace_event_buffer fbuffer; void *data; int size, esize; struct trace_event_call *call = trace_probe_event_call(&tu->tp); @@ -965,12 +964,10 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; - event = trace_event_buffer_lock_reserve(&buffer, trace_file, - call->event.type, size, 0); - if (!event) + entry = trace_event_buffer_reserve(&fbuffer, trace_file, size); + if (!entry) return; - entry = ring_buffer_event_data(event); if (is_ret_probe(tu)) { entry->vaddr[0] = func; entry->vaddr[1] = instruction_pointer(regs); @@ -982,7 +979,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, memcpy(data, ucb->buf, tu->tp.size + dsize); - event_trigger_unlock_commit(trace_file, buffer, event, entry, 0); + trace_event_buffer_commit(&fbuffer); } /* uprobe handler */ -- cgit v1.2.3 From 1d83c3a20b0c5708b51c16a021ab76305dbb9943 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Sun, 16 May 2021 02:24:10 +0000 Subject: tracing: Fix synth_event_add_val() kernel-doc comment It's named field here. Link: https://lkml.kernel.org/r/20210516022410.64271-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 22db3ce95e74..98e002648994 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1978,7 +1978,7 @@ EXPORT_SYMBOL_GPL(synth_event_add_next_val); /** * synth_event_add_val - Add a named field's value to an open synth trace * @field_name: The name of the synthetic event field value to set - * @val: The value to set the next field to + * @val: The value to set the named field to * @trace_state: A pointer to object tracking the piecewise trace state * * Set the value of the named field in an event that's been opened by -- cgit v1.2.3 From a6ed2aee54644cfa2d04ca86308767f5c3a087e8 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 24 Nov 2021 12:03:08 +0100 Subject: tracing: Switch to kvfree_rcu() API Instead of invoking a synchronize_rcu() to free a pointer after a grace period we can directly make use of new API that does the same but in more efficient way. Link: https://lkml.kernel.org/r/20211124110308.2053-10-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 3 +-- kernel/trace/trace_probe.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 7520d43aed55..4719a848bf17 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -138,8 +138,7 @@ static void osnoise_unregister_instance(struct trace_array *tr) if (!found) return; - synchronize_rcu(); - kfree(inst); + kvfree_rcu(inst); } /* diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 3ed2a3f37297..8a3822818bf8 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -1138,8 +1138,7 @@ int trace_probe_remove_file(struct trace_probe *tp, return -ENOENT; list_del_rcu(&link->list); - synchronize_rcu(); - kfree(link); + kvfree_rcu(link); if (list_empty(&tp->event->files)) trace_probe_clear_flag(tp, TP_FLAG_TRACE); -- cgit v1.2.3 From 29f2e5bd9439445fe14ba8570b1c9a7ad682df84 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 6 Dec 2021 17:48:39 -0800 Subject: bpf: Silence purge_cand_cache build warning. When CONFIG_DEBUG_INFO_BTF_MODULES is not set the following warning can be seen: kernel/bpf/btf.c:6588:13: warning: 'purge_cand_cache' defined but not used [-Wunused-function] Fix it. Fixes: 1e89106da253 ("bpf: Add bpf_core_add_cands() and wire it into bpf_core_apply_relo_insn().") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211207014839.6976-1-alexei.starovoitov@gmail.com --- kernel/bpf/btf.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 36a5cc0f53c6..01b47d4df3ab 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6560,6 +6560,7 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, return new_cands; } +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache, int cache_size) { @@ -6598,6 +6599,7 @@ static void purge_cand_cache(struct btf *btf) __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE); mutex_unlock(&cand_cache_mutex); } +#endif static struct bpf_cand_cache * bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf, -- cgit v1.2.3 From 4d0564785bb03841e4b5c5b31aa4ecd1eb0d01bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Oct 2021 13:18:34 +0200 Subject: dma-direct: factor out dma_set_{de,en}crypted helpers Factor out helpers the make dealing with memory encryption a little less cumbersome. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 56 ++++++++++++++++++++++++----------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 4c6c5e0635e3..d4d54af31a34 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -75,6 +75,20 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } +static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size) +{ + if (!force_dma_unencrypted(dev)) + return 0; + return set_memory_decrypted((unsigned long)vaddr, 1 << get_order(size)); +} + +static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size) +{ + if (!force_dma_unencrypted(dev)) + return 0; + return set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size)); +} + static void __dma_direct_free_pages(struct device *dev, struct page *page, size_t size) { @@ -154,7 +168,6 @@ void *dma_direct_alloc(struct device *dev, size_t size, { struct page *page; void *ret; - int err; size = PAGE_ALIGN(size); if (attrs & DMA_ATTR_NO_WARN) @@ -216,12 +229,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, __builtin_return_address(0)); if (!ret) goto out_free_pages; - if (force_dma_unencrypted(dev)) { - err = set_memory_decrypted((unsigned long)ret, - 1 << get_order(size)); - if (err) - goto out_free_pages; - } + if (dma_set_decrypted(dev, ret, size)) + goto out_free_pages; memset(ret, 0, size); goto done; } @@ -238,13 +247,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted(dev)) { - err = set_memory_decrypted((unsigned long)ret, - 1 << get_order(size)); - if (err) - goto out_free_pages; - } - + if (dma_set_decrypted(dev, ret, size)) + goto out_free_pages; memset(ret, 0, size); if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && @@ -259,13 +263,9 @@ done: return ret; out_encrypt_pages: - if (force_dma_unencrypted(dev)) { - err = set_memory_encrypted((unsigned long)page_address(page), - 1 << get_order(size)); - /* If memory cannot be re-encrypted, it must be leaked */ - if (err) - return NULL; - } + /* If memory cannot be re-encrypted, it must be leaked */ + if (dma_set_encrypted(dev, page_address(page), size)) + return NULL; out_free_pages: __dma_direct_free_pages(dev, page, size); return NULL; @@ -304,8 +304,7 @@ void dma_direct_free(struct device *dev, size_t size, dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) return; - if (force_dma_unencrypted(dev)) - set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); + dma_set_encrypted(dev, cpu_addr, 1 << page_order); if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) vunmap(cpu_addr); @@ -341,11 +340,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted(dev)) { - if (set_memory_decrypted((unsigned long)ret, - 1 << get_order(size))) - goto out_free_pages; - } + if (dma_set_decrypted(dev, ret, size)) + goto out_free_pages; memset(ret, 0, size); *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return page; @@ -366,9 +362,7 @@ void dma_direct_free_pages(struct device *dev, size_t size, dma_free_from_pool(dev, vaddr, size)) return; - if (force_dma_unencrypted(dev)) - set_memory_encrypted((unsigned long)vaddr, 1 << page_order); - + dma_set_encrypted(dev, vaddr, 1 << page_order); __dma_direct_free_pages(dev, page, size); } -- cgit v1.2.3 From 5570449b6876f215d49ac4db9ccce6ff7aa1e20a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Oct 2021 09:20:39 +0200 Subject: dma-direct: don't call dma_set_decrypted for remapped allocations Remapped allocations handle the encrypted bit through the pgprot passed to vmap, so there is no call dma_set_decrypted. Note that this case is currently entirely theoretical as no valid kernel configuration supports remapped allocations and memory encryption currently. Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index d4d54af31a34..996ba4edb2fa 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -229,8 +229,6 @@ void *dma_direct_alloc(struct device *dev, size_t size, __builtin_return_address(0)); if (!ret) goto out_free_pages; - if (dma_set_decrypted(dev, ret, size)) - goto out_free_pages; memset(ret, 0, size); goto done; } @@ -304,12 +302,13 @@ void dma_direct_free(struct device *dev, size_t size, dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) return; - dma_set_encrypted(dev, cpu_addr, 1 << page_order); - - if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) + if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) { vunmap(cpu_addr); - else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) - arch_dma_clear_uncached(cpu_addr, size); + } else { + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) + arch_dma_clear_uncached(cpu_addr, size); + dma_set_encrypted(dev, cpu_addr, 1 << page_order); + } __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size); } -- cgit v1.2.3 From a90cf30437489343b8386ae87b4827b6d6c3ed50 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 9 Nov 2021 15:41:01 +0100 Subject: dma-direct: always leak memory that can't be re-encrypted We must never let unencrypted memory go back into the general page pool. So if we fail to set it back to encrypted when freeing DMA memory, leak the memory instead and warn the user. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 996ba4edb2fa..d7a489be4847 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -84,9 +84,14 @@ static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size) static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size) { + int ret; + if (!force_dma_unencrypted(dev)) return 0; - return set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size)); + ret = set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size)); + if (ret) + pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n"); + return ret; } static void __dma_direct_free_pages(struct device *dev, struct page *page, @@ -261,7 +266,6 @@ done: return ret; out_encrypt_pages: - /* If memory cannot be re-encrypted, it must be leaked */ if (dma_set_encrypted(dev, page_address(page), size)) return NULL; out_free_pages: @@ -307,7 +311,8 @@ void dma_direct_free(struct device *dev, size_t size, } else { if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) arch_dma_clear_uncached(cpu_addr, size); - dma_set_encrypted(dev, cpu_addr, 1 << page_order); + if (dma_set_encrypted(dev, cpu_addr, 1 << page_order)) + return; } __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size); @@ -361,7 +366,8 @@ void dma_direct_free_pages(struct device *dev, size_t size, dma_free_from_pool(dev, vaddr, size)) return; - dma_set_encrypted(dev, vaddr, 1 << page_order); + if (dma_set_encrypted(dev, vaddr, 1 << page_order)) + return; __dma_direct_free_pages(dev, page, size); } -- cgit v1.2.3 From f3c962226dbec7a611ddd4eb7af7f4e19f4790ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 9 Nov 2021 15:20:40 +0100 Subject: dma-direct: clean up the remapping checks in dma_direct_alloc Add two local variables to track if we want to remap the returned address using vmap or call dma_set_uncached and use that to simplify the code flow. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index d7a489be4847..3d1718dc077e 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -171,6 +171,7 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { + bool remap = false, set_uncached = false; struct page *page; void *ret; @@ -222,9 +223,25 @@ void *dma_direct_alloc(struct device *dev, size_t size, if (!page) return NULL; - if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !dev_is_dma_coherent(dev)) || - (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { + if (!dev_is_dma_coherent(dev) && IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) { + remap = true; + } else if (PageHighMem(page)) { + /* + * Depending on the cma= arguments and per-arch setup, + * dma_alloc_contiguous could return highmem pages. + * Without remapping there is no way to return them here, so + * log an error and fail. + */ + if (!IS_ENABLED(CONFIG_DMA_REMAP)) { + dev_info(dev, "Rejecting highmem page from CMA.\n"); + goto out_free_pages; + } + remap = true; + } else if (!dev_is_dma_coherent(dev) && + IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED)) + set_uncached = true; + + if (remap) { /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); @@ -234,34 +251,21 @@ void *dma_direct_alloc(struct device *dev, size_t size, __builtin_return_address(0)); if (!ret) goto out_free_pages; - memset(ret, 0, size); - goto done; - } - - if (PageHighMem(page)) { - /* - * Depending on the cma= arguments and per-arch setup - * dma_alloc_contiguous could return highmem pages. - * Without remapping there is no way to return them here, - * so log an error and fail. - */ - dev_info(dev, "Rejecting highmem page from CMA.\n"); - goto out_free_pages; + } else { + ret = page_address(page); + if (dma_set_decrypted(dev, ret, size)) + goto out_free_pages; } - ret = page_address(page); - if (dma_set_decrypted(dev, ret, size)) - goto out_free_pages; memset(ret, 0, size); - if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !dev_is_dma_coherent(dev)) { + if (set_uncached) { arch_dma_prep_coherent(page, size); ret = arch_dma_set_uncached(ret, size); if (IS_ERR(ret)) goto out_encrypt_pages; } -done: + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return ret; -- cgit v1.2.3 From d541ae55d538265861ef729a64d2d816d34ef1e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Oct 2021 13:08:07 +0200 Subject: dma-direct: factor out a helper for DMA_ATTR_NO_KERNEL_MAPPING allocations Split the code for DMA_ATTR_NO_KERNEL_MAPPING allocations into a separate helper to make dma_direct_alloc a little more readable. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy Acked-by: David Rientjes --- kernel/dma/direct.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 3d1718dc077e..01104660ec43 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -168,6 +168,24 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, return ret; } +static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + struct page *page; + + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + if (!page) + return NULL; + + /* remove any dirty cache lines on the kernel alias */ + if (!PageHighMem(page)) + arch_dma_prep_coherent(page, size); + + /* return the page pointer as the opaque cookie */ + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); + return page; +} + void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { @@ -180,17 +198,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, gfp |= __GFP_NOWARN; if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) { - page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); - if (!page) - return NULL; - /* remove any dirty cache lines on the kernel alias */ - if (!PageHighMem(page)) - arch_dma_prep_coherent(page, size); - *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); - /* return the page pointer as the opaque cookie */ - return page; - } + !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) + return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp); if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && -- cgit v1.2.3 From a86d10942db2e0099a369b367fe62898f95987a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Oct 2021 09:47:31 +0200 Subject: dma-direct: refactor the !coherent checks in dma_direct_alloc Add a big central !dev_is_dma_coherent(dev) block to deal with as much as of the uncached allocation schemes and document the schemes a bit better. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 66 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 01104660ec43..f9658fe18498 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -201,29 +201,49 @@ void *dma_direct_alloc(struct device *dev, size_t size, !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp); - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && - !dev_is_dma_coherent(dev) && - !is_swiotlb_for_alloc(dev)) - return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); + if (!dev_is_dma_coherent(dev)) { + /* + * Fallback to the arch handler if it exists. This should + * eventually go away. + */ + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && + !is_swiotlb_for_alloc(dev)) + return arch_dma_alloc(dev, size, dma_handle, gfp, + attrs); - if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && - !dev_is_dma_coherent(dev)) - return dma_alloc_from_global_coherent(dev, size, dma_handle); + /* + * If there is a global pool, always allocate from it for + * non-coherent devices. + */ + if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL)) + return dma_alloc_from_global_coherent(dev, size, + dma_handle); + + /* + * Otherwise remap if the architecture is asking for it. But + * given that remapping memory is a blocking operation we'll + * instead have to dip into the atomic pools. + */ + remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP); + if (remap) { + if (!gfpflags_allow_blocking(gfp) && + !is_swiotlb_for_alloc(dev)) + return dma_direct_alloc_from_pool(dev, size, + dma_handle, gfp); + } else { + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED)) + set_uncached = true; + } + } /* - * Remapping or decrypting memory may block. If either is required and - * we can't block, allocate the memory from the atomic pools. - * If restricted DMA (i.e., is_swiotlb_for_alloc) is required, one must - * set up another device coherent pool by shared-dma-pool and use - * dma_alloc_from_dev_coherent instead. + * Decrypting memory may block, so allocate the memory from the atomic + * pools if we can't block. */ if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && - !gfpflags_allow_blocking(gfp) && - (force_dma_unencrypted(dev) || - (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !dev_is_dma_coherent(dev))) && + force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); @@ -231,10 +251,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); if (!page) return NULL; - - if (!dev_is_dma_coherent(dev) && IS_ENABLED(CONFIG_DMA_DIRECT_REMAP)) { - remap = true; - } else if (PageHighMem(page)) { + if (PageHighMem(page)) { /* * Depending on the cma= arguments and per-arch setup, * dma_alloc_contiguous could return highmem pages. @@ -246,9 +263,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, goto out_free_pages; } remap = true; - } else if (!dev_is_dma_coherent(dev) && - IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED)) - set_uncached = true; + set_uncached = false; + } if (remap) { /* remove any dirty cache lines on the kernel alias */ -- cgit v1.2.3 From 955f58f7406ad912825fc344c7825fd904b124a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 9 Nov 2021 15:47:56 +0100 Subject: dma-direct: fail allocations that can't be made coherent If the architecture can't remap or set an address uncached there is no way to fullfill a request for a coherent allocation. Return NULL in that case. Note that this case currently does not happen, so this is a theoretical fixup and/or a preparation for eventually supporting platforms that can't support coherent allocations with the generic code. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index f9658fe18498..a13017656eca 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -233,8 +233,9 @@ void *dma_direct_alloc(struct device *dev, size_t size, return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); } else { - if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED)) - set_uncached = true; + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED)) + return NULL; + set_uncached = true; } } -- cgit v1.2.3 From 78bc72787ab9e638173aeb1f589578105d3a43c9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Oct 2021 10:00:55 +0200 Subject: dma-direct: warn if there is no pool for force unencrypted allocations Instead of blindly running into a blocking operation for a non-blocking gfp, return NULL and spew an error. Note that Kconfig prevents this for all currently relevant platforms, and this is just a debug check. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index a13017656eca..84226a764471 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -159,6 +159,9 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, u64 phys_mask; void *ret; + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))) + return NULL; + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_mask); page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok); @@ -243,8 +246,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, * Decrypting memory may block, so allocate the memory from the atomic * pools if we can't block. */ - if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && - force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) && + if (force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); @@ -354,8 +356,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && - force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) && + if (force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); -- cgit v1.2.3 From f5d3939a5916c0a8a0b47dcbc33963dbffe74f90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Oct 2021 09:34:59 +0200 Subject: dma-direct: drop two CONFIG_DMA_RESTRICTED_POOL conditionals swiotlb_alloc and swiotlb_free are properly stubbed out if CONFIG_DMA_RESTRICTED_POOL is not set, so skip the extra checks. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 84226a764471..cf75bfb2f499 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -97,8 +97,7 @@ static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size) static void __dma_direct_free_pages(struct device *dev, struct page *page, size_t size) { - if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) && - swiotlb_free(dev, page, size)) + if (swiotlb_free(dev, page, size)) return; dma_free_contiguous(dev, page, size); } @@ -114,8 +113,7 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); - if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) && - is_swiotlb_for_alloc(dev)) { + if (is_swiotlb_for_alloc(dev)) { page = swiotlb_alloc(dev, size); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { __dma_direct_free_pages(dev, page, size); -- cgit v1.2.3 From aea7e2a86a94b2583e1e812c596140034398a169 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 21 Oct 2021 09:39:12 +0200 Subject: dma-direct: factor the swiotlb code out of __dma_direct_alloc_pages Add a new helper to deal with the swiotlb case. This keeps the code nicely boundled and removes the not required call to dma_direct_optimal_gfp_mask for the swiotlb case. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index cf75bfb2f499..924937c54e8a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -102,6 +102,18 @@ static void __dma_direct_free_pages(struct device *dev, struct page *page, dma_free_contiguous(dev, page, size); } +static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size) +{ + struct page *page = swiotlb_alloc(dev, size); + + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + swiotlb_free(dev, page, size); + return NULL; + } + + return page; +} + static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp) { @@ -111,17 +123,11 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, WARN_ON_ONCE(!PAGE_ALIGNED(size)); + if (is_swiotlb_for_alloc(dev)) + return dma_direct_alloc_swiotlb(dev, size); + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); - if (is_swiotlb_for_alloc(dev)) { - page = swiotlb_alloc(dev, size); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - __dma_direct_free_pages(dev, page, size); - return NULL; - } - return page; - } - page = dma_alloc_contiguous(dev, size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); -- cgit v1.2.3 From 2917406c352757642c3c1a13a4c99c96e6d22fde Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sat, 16 Oct 2021 19:11:09 +0800 Subject: sched/fair: Document the slow path and fast path in select_task_rq_fair All People I know including myself took a long time to figure out that typical wakeup will always go to fast path and never go to slow path except WF_FORK and WF_EXEC. Vincent reminded me once in a linaro meeting and made me understand slow path won't happen for WF_TTWU. But my other friends repeatedly wasted a lot of time on testing this path like me before I reminded them. So obviously the code needs some document. Signed-off-by: Barry Song Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211016111109.5559-1-21cnbao@gmail.com --- kernel/sched/fair.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 06722188df49..f34f2f344fe9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6917,6 +6917,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) break; } + /* + * Usually only true for WF_EXEC and WF_FORK, as sched_domains + * usually do not have SD_BALANCE_WAKE set. That means wakeup + * will usually go to the fast path. + */ if (tmp->flags & sd_flag) sd = tmp; else if (!want_affine) -- cgit v1.2.3 From 9b58e976b3b391c0cf02e038d53dd0478ed3013c Mon Sep 17 00:00:00 2001 From: Li Hua Date: Fri, 3 Dec 2021 03:36:18 +0000 Subject: sched/rt: Try to restart rt period timer when rt runtime exceeded When rt_runtime is modified from -1 to a valid control value, it may cause the task to be throttled all the time. Operations like the following will trigger the bug. E.g: 1. echo -1 > /proc/sys/kernel/sched_rt_runtime_us 2. Run a FIFO task named A that executes while(1) 3. echo 950000 > /proc/sys/kernel/sched_rt_runtime_us When rt_runtime is -1, The rt period timer will not be activated when task A enqueued. And then the task will be throttled after setting rt_runtime to 950,000. The task will always be throttled because the rt period timer is not activated. Fixes: d0b27fa77854 ("sched: rt-group: synchonised bandwidth period") Reported-by: Hulk Robot Signed-off-by: Li Hua Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211203033618.11895-1-hucool.lihua@huawei.com --- kernel/sched/rt.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index b48baaba2fc2..7b4f4fbbb404 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -52,11 +52,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.function = sched_rt_period_timer; } -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b) { - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return; - raw_spin_lock(&rt_b->rt_runtime_lock); if (!rt_b->rt_period_active) { rt_b->rt_period_active = 1; @@ -75,6 +72,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return; + + do_start_rt_bandwidth(rt_b); +} + void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; @@ -1031,13 +1036,17 @@ static void update_curr_rt(struct rq *rq) for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + int exceeded; if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { raw_spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) + exceeded = sched_rt_runtime_exceeded(rt_rq); + if (exceeded) resched_curr(rq); raw_spin_unlock(&rt_rq->rt_runtime_lock); + if (exceeded) + do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } } @@ -2911,8 +2920,12 @@ static int sched_rt_global_validate(void) static void sched_rt_do_global(void) { + unsigned long flags; + + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); def_rt_bandwidth.rt_runtime = global_rt_runtime(); def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); } int sched_rt_handler(struct ctl_table *table, int write, void *buffer, -- cgit v1.2.3 From a4382659487f84c00b5fbb61df25a9ad59396789 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Sep 2021 14:10:45 +0200 Subject: rcu: Ignore rdp.cpu_no_qs.b.exp on preemptible RCU's rcu_qs() Preemptible RCU does not use the rcu_data structure's ->cpu_no_qs.b.exp, instead using a separate ->exp_deferred_qs field to record the need for an expedited quiescent state. In fact ->cpu_no_qs.b.exp should never be set in preemptible RCU because preemptible RCU's expedited grace periods use other mechanisms to record quiescent states. This commit therefore removes the implicit rcu_qs() reference to ->cpu_no_qs.b.exp in favor of a direct reference to ->cpu_no_qs.b.norm. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5199559fbbf0..113c19b086ac 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -277,12 +277,16 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * current task, there might be any number of other tasks blocked while * in an RCU read-side critical section. * + * Unlike non-preemptible-RCU, quiescent state reports for expedited + * grace periods are handled separately via deferred quiescent states + * and context switch events. + * * Callers to this function must disable preemption. */ static void rcu_qs(void) { RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n"); - if (__this_cpu_read(rcu_data.cpu_no_qs.s)) { + if (__this_cpu_read(rcu_data.cpu_no_qs.b.norm)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); -- cgit v1.2.3 From 6e16b0f7bae3817ea67f4bef4f84298e880fbf66 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Sep 2021 14:10:47 +0200 Subject: rcu: Move rcu_data.cpu_no_qs.b.exp reset to rcu_export_exp_rdp() On non-preemptible RCU, move clearing of the rcu_data structure's ->cpu_no_qs.b.exp filed to the actual expedited quiescent state report function, matching hw preemptible RCU handles the ->exp_deferred_qs field. This prepares for removing ->exp_deferred_qs in favor of ->cpu_no_qs.b.exp for both preemptible and non-preemptible RCU. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 1 + kernel/rcu/tree_plugin.h | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f3947c49eee7..6c6eb3220385 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -256,6 +256,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, static void rcu_report_exp_rdp(struct rcu_data *rdp) { WRITE_ONCE(rdp->exp_deferred_qs, false); + WRITE_ONCE(rdp->cpu_no_qs.b.exp, false); rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 113c19b086ac..6d58b75d2782 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -849,10 +849,8 @@ static void rcu_qs(void) trace_rcu_grace_period(TPS("rcu_sched"), __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) - return; - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); + if (__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) + rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); } /* -- cgit v1.2.3 From 6120b72e25e195b6fa15b0a674479a38166c392a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Sep 2021 14:10:48 +0200 Subject: rcu: Remove rcu_data.exp_deferred_qs and convert to rcu_data.cpu no_qs.b.exp Having two fields for the same purpose with subtle differences on different RCU flavours is confusing, especially when both fields always exist on both RCU flavours. Fortunately, it is now safe for preemptible RCU to rely on the rcu_data structure's ->cpu_no_qs.b.exp field, just like non-preemptible RCU. This commit therefore removes the ad-hoc ->exp_deferred_qs field. Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 - kernel/rcu/tree_exp.h | 5 ++--- kernel/rcu/tree_plugin.h | 12 ++++++------ 3 files changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 305cf6aeb408..ea46ed40f6bc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -157,7 +157,6 @@ struct rcu_data { bool core_needs_qs; /* Core waits for quiescent state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ - bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ bool cpu_started; /* RCU watching this onlining CPU. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6c6eb3220385..fc2ee326a6f7 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -255,7 +255,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, */ static void rcu_report_exp_rdp(struct rcu_data *rdp) { - WRITE_ONCE(rdp->exp_deferred_qs, false); WRITE_ONCE(rdp->cpu_no_qs.b.exp, false); rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); } @@ -656,7 +655,7 @@ static void rcu_exp_handler(void *unused) rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); } else { - rdp->exp_deferred_qs = true; + WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); set_tsk_need_resched(t); set_preempt_need_resched(); } @@ -678,7 +677,7 @@ static void rcu_exp_handler(void *unused) if (depth > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { - rdp->exp_deferred_qs = true; + WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); t->rcu_read_unlock_special.b.exp_hint = true; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6d58b75d2782..e1a9fb96e0b9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -260,10 +260,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) */ - if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs) + if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); else - WARN_ON_ONCE(rdp->exp_deferred_qs); + WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); } /* @@ -354,7 +354,7 @@ void rcu_note_context_switch(bool preempt) * means that we continue to block the current grace period. */ rcu_qs(); - if (rdp->exp_deferred_qs) + if (rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); @@ -481,7 +481,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ special = t->rcu_read_unlock_special; rdp = this_cpu_ptr(&rcu_data); - if (!special.s && !rdp->exp_deferred_qs) { + if (!special.s && !rdp->cpu_no_qs.b.exp) { local_irq_restore(flags); return; } @@ -501,7 +501,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * tasks are handled when removing the task from the * blocked-tasks list below. */ - if (rdp->exp_deferred_qs) + if (rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); /* Clean up if blocked during RCU read-side critical section. */ @@ -584,7 +584,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { - return (__this_cpu_read(rcu_data.exp_deferred_qs) || + return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) || READ_ONCE(t->rcu_read_unlock_special.s)) && rcu_preempt_depth() == 0; } -- cgit v1.2.3 From 5401cc5264fff75aea10cf44b380b265d18d17ba Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Sep 2021 11:06:35 -0700 Subject: rcu: Mark sync_sched_exp_online_cleanup() ->cpu_no_qs.b.exp load The sync_sched_exp_online_cleanup() is called from rcutree_online_cpu(), which can be invoked with interrupts enabled. This means that the ->cpu_no_qs.b.exp field is subject to data races from the rcu_exp_handler() IPI handler, so this commit marks the load from that field. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index fc2ee326a6f7..12f1a18d7b6d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -759,7 +759,7 @@ static void sync_sched_exp_online_cleanup(int cpu) my_cpu = get_cpu(); /* Quiescent state either not needed or already requested, leave. */ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - rdp->cpu_no_qs.b.exp) { + READ_ONCE(rdp->cpu_no_qs.b.exp)) { put_cpu(); return; } -- cgit v1.2.3 From 147f04b14adde831eb4a0a1e378667429732f9e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Sep 2021 09:21:34 -0700 Subject: rcu: Prevent expedited GP from enabling tick on offline CPU If an RCU expedited grace period starts just when a CPU is in the process of going offline, so that the outgoing CPU has completed its pass through stop-machine but has not yet completed its final dive into the idle loop, RCU will attempt to enable that CPU's scheduling-clock tick via a call to tick_dep_set_cpu(). For this to happen, that CPU has to have been online when the expedited grace period completed its CPU-selection phase. This is pointless: The outgoing CPU has interrupts disabled, so it cannot take a scheduling-clock tick anyway. In addition, the tick_dep_set_cpu() function's eventual call to irq_work_queue_on() will splat as follows: smpboot: CPU 1 is now offline WARNING: CPU: 6 PID: 124 at kernel/irq_work.c:95 +irq_work_queue_on+0x57/0x60 Modules linked in: CPU: 6 PID: 124 Comm: kworker/6:2 Not tainted 5.15.0-rc1+ #3 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS +rel-1.14.0-0-g155821a-rebuilt.opensuse.org 04/01/2014 Workqueue: rcu_gp wait_rcu_exp_gp RIP: 0010:irq_work_queue_on+0x57/0x60 Code: 8b 05 1d c7 ea 62 a9 00 00 f0 00 75 21 4c 89 ce 44 89 c7 e8 +9b 37 fa ff ba 01 00 00 00 89 d0 c3 4c 89 cf e8 3b ff ff ff eb ee <0f> 0b eb b7 +0f 0b eb db 90 48 c7 c0 98 2a 02 00 65 48 03 05 91 6f RSP: 0000:ffffb12cc038fe48 EFLAGS: 00010282 RAX: 0000000000000001 RBX: 0000000000005208 RCX: 0000000000000020 RDX: 0000000000000001 RSI: 0000000000000001 RDI: ffff9ad01f45a680 RBP: 000000000004c990 R08: 0000000000000001 R09: ffff9ad01f45a680 R10: ffffb12cc0317db0 R11: 0000000000000001 R12: 00000000fffecee8 R13: 0000000000000001 R14: 0000000000026980 R15: ffffffff9e53ae00 FS: 0000000000000000(0000) GS:ffff9ad01f580000(0000) +knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000000de0c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tick_nohz_dep_set_cpu+0x59/0x70 rcu_exp_wait_wake+0x54e/0x870 ? sync_rcu_exp_select_cpus+0x1fc/0x390 process_one_work+0x1ef/0x3c0 ? process_one_work+0x3c0/0x3c0 worker_thread+0x28/0x3c0 ? process_one_work+0x3c0/0x3c0 kthread+0x115/0x140 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 ---[ end trace c5bf75eb6aa80bc6 ]--- This commit therefore avoids invoking tick_dep_set_cpu() on offlined CPUs to limit both futility and false-positive splats. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 12f1a18d7b6d..a96d17206d87 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -506,7 +506,10 @@ static void synchronize_rcu_expedited_wait(void) if (rdp->rcu_forced_tick_exp) continue; rdp->rcu_forced_tick_exp = true; - tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + preempt_disable(); + if (cpu_online(cpu)) + tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + preempt_enable(); } } j = READ_ONCE(jiffies_till_first_fqs); -- cgit v1.2.3 From 790da248978a0722d92d1471630c881704f7eb0d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Sep 2021 11:09:34 -0700 Subject: rcu: Make idle entry report expedited quiescent states In non-preemptible kernels, an unfortunately timed expedited grace period can result in the rcu_exp_handler() IPI handler setting the rcu_data structure's cpu_no_qs.b.exp field just as the target CPU enters idle. There are situations in which this field will not be checked until after that CPU exits idle. The resulting grace-period latency does not qualify as "expedited". This commit therefore checks this field upon non-preemptible idle entry in the rcu_preempt_deferred_qs() function. It also qualifies the rcu_core() preempt_count() check with IS_ENABLED(CONFIG_PREEMPT_COUNT) to prevent false-positive quiescent states from count-free kernels. Reported-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 13 ++++++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ef8d36f580fc..a8d1fe35f482 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2714,7 +2714,7 @@ static __latent_entropy void rcu_core(void) WARN_ON_ONCE(!rdp->beenonline); /* Report any deferred quiescent states if preemption enabled. */ - if (!(preempt_count() & PREEMPT_MASK)) { + if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) { rcu_preempt_deferred_qs(current); } else if (rcu_preempt_need_deferred_qs(current)) { set_tsk_need_resched(current); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index e1a9fb96e0b9..8fb1612021a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -927,7 +927,18 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return false; } -static void rcu_preempt_deferred_qs(struct task_struct *t) { } + +// Except that we do need to respond to a request by an expedited grace +// period for a quiescent state from this CPU. Note that requests from +// tasks are handled when removing the task from the blocked-tasks list +// below. +static void rcu_preempt_deferred_qs(struct task_struct *t) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + + if (rdp->cpu_no_qs.b.exp) + rcu_report_exp_rdp(rdp); +} /* * Because there is no preemptible RCU, there can be no readers blocked, -- cgit v1.2.3 From 81f6d49cce2d2fe507e3fddcc4a6db021d9c2e7b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 30 Nov 2021 17:21:08 +0100 Subject: rcu/exp: Mark current CPU as exp-QS in IPI loop second pass Expedited RCU grace periods invoke sync_rcu_exp_select_node_cpus(), which takes two passes over the leaf rcu_node structure's CPUs. The first pass gathers up the current CPU and CPUs that are in dynticks idle mode. The workqueue will report a quiescent state on their behalf later. The second pass sends IPIs to the rest of the CPUs, but excludes the current CPU, incorrectly assuming it has been included in the first pass's list of CPUs. Unfortunately the current CPU may have changed between the first and second pass, due to the fact that the various rcu_node structures' ->lock fields have been dropped, thus momentarily enabling preemption. This means that if the second pass's CPU was not on the first pass's list, it will be ignored completely. There will be no IPI sent to it, and there will be no reporting of quiescent states on its behalf. Unfortunately, the expedited grace period will nevertheless be waiting for that CPU to report a quiescent state, but with that CPU having no reason to believe that such a report is needed. The result will be an expedited grace period stall. Fix this by no longer excluding the current CPU from consideration during the second pass. Fixes: b9ad4d6ed18e ("rcu: Avoid self-IPI in sync_rcu_exp_select_node_cpus()") Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker Cc: Uladzislau Rezki Cc: Neeraj Upadhyay Cc: Boqun Feng Cc: Josh Triplett Cc: Joel Fernandes Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index a96d17206d87..237a79989aba 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -387,6 +387,7 @@ retry_ipi: continue; } if (get_cpu() == cpu) { + mask_ofl_test |= mask; put_cpu(); continue; } -- cgit v1.2.3 From 614ddad17f22a22e035e2ea37a04815f50362017 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 17 Sep 2021 15:04:48 -0700 Subject: rcu: Tighten rcu_advance_cbs_nowake() checks Currently, rcu_advance_cbs_nowake() checks that a grace period is in progress, however, that grace period could end just after the check. This commit rechecks that a grace period is still in progress while holding the rcu_node structure's lock. The grace period cannot end while the current CPU's rcu_node structure's ->lock is held, thus avoiding false positives from the WARN_ON_ONCE(). As Daniel Vacek noted, it is not necessary for the rcu_node structure to have a CPU that has not yet passed through its quiescent state. Tested-by: Guillaume Morin Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ef8d36f580fc..8706b30c2ac8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1590,10 +1590,11 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, struct rcu_data *rdp) { rcu_lockdep_assert_cblist_protected(rdp); - if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || - !raw_spin_trylock_rcu_node(rnp)) + if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp)) return; - WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp)); + // The grace period cannot end while we hold the rcu_node lock. + if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) + WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp)); raw_spin_unlock_rcu_node(rnp); } -- cgit v1.2.3 From 118e0d4a1bc85d4ecea0427e440a72d21ffbfa6a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 11 Oct 2021 16:51:30 +0200 Subject: rcu/nocb: Make local rcu_nocb_lock_irqsave() safe against concurrent deoffloading rcu_nocb_lock_irqsave() can be preempted between the call to rcu_segcblist_is_offloaded() and the actual locking. This matters now that rcu_core() is preemptible on PREEMPT_RT and the (de-)offloading process can interrupt the softirq or the rcuc kthread. As a result we may locklessly call into code that requires nocb locking. In practice this is a problem while we accelerate callbacks on rcu_core(). Simply disabling interrupts before (instead of after) checking the NOCB offload state fixes the issue. Reported-and-tested-by: Valentin Schneider Tested-by: Sebastian Andrzej Siewior Signed-off-by: Frederic Weisbecker Cc: Valentin Schneider Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Josh Triplett Cc: Joel Fernandes Cc: Boqun Feng Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 305cf6aeb408..4f6c67b3ccd5 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -447,12 +447,16 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(void); -#define rcu_nocb_lock_irqsave(rdp, flags) \ -do { \ - if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \ - local_irq_save(flags); \ - else \ - raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \ + +/* + * Disable IRQs before checking offloaded state so that local + * locking is safe against concurrent de-offloading. + */ +#define rcu_nocb_lock_irqsave(rdp, flags) \ +do { \ + local_irq_save(flags); \ + if (rcu_segcblist_is_offloaded(&(rdp)->cblist)) \ + raw_spin_lock(&(rdp)->nocb_lock); \ } while (0) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ #define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags) -- cgit v1.2.3 From 213d56bf33bdda835bac04046f09256a75c5ca8e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Oct 2021 02:08:07 +0200 Subject: rcu/nocb: Prepare state machine for a new step Currently SEGCBLIST_SOFTIRQ_ONLY is a bit of an exception among the segcblist flags because it is an exclusive state that doesn't mix up with the other flags. Remove it in favour of: _ A flag specifying that rcu_core() needs to perform callbacks execution and acceleration and _ A flag specifying we want the nocb lock to be held in any needed circumstances This clarifies the code and is more flexible: It allows to have a state where rcu_core() runs with locking while offloading hasn't started yet. This is a necessary step to prepare for triggering rcu_core() at the very beginning of the de-offloading process so that rcu_core() won't dismiss work while being preempted by the de-offloading process, at least not without a pending subsequent rcu_core() that will quickly catch up. Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Tested-by: Sebastian Andrzej Siewior Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Josh Triplett Cc: Joel Fernandes Cc: Boqun Feng Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 37 +++++++++++++++++++++++-------------- kernel/rcu/rcu_segcblist.c | 6 +++--- kernel/rcu/rcu_segcblist.h | 12 +++++++----- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_nocb.h | 24 ++++++++++++++++-------- 5 files changed, 50 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 3db96c4f45fd..812961b1d064 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -69,7 +69,7 @@ struct rcu_cblist { * * * ---------------------------------------------------------------------------- - * | SEGCBLIST_SOFTIRQ_ONLY | + * | SEGCBLIST_RCU_CORE | * | | * | Callbacks processed by rcu_core() from softirqs or local | * | rcuc kthread, without holding nocb_lock. | @@ -77,7 +77,7 @@ struct rcu_cblist { * | * v * ---------------------------------------------------------------------------- - * | SEGCBLIST_OFFLOADED | + * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED | * | | * | Callbacks processed by rcu_core() from softirqs or local | * | rcuc kthread, while holding nocb_lock. Waking up CB and GP kthreads, | @@ -89,7 +89,9 @@ struct rcu_cblist { * | | * v v * --------------------------------------- ----------------------------------| - * | SEGCBLIST_OFFLOADED | | | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_RCU_CORE | | | SEGCBLIST_RCU_CORE | | + * | SEGCBLIST_LOCKING | | | SEGCBLIST_LOCKING | | + * | SEGCBLIST_OFFLOADED | | | SEGCBLIST_OFFLOADED | | * | SEGCBLIST_KTHREAD_CB | | SEGCBLIST_KTHREAD_GP | * | | | | * | | | | @@ -104,9 +106,10 @@ struct rcu_cblist { * | * v * |--------------------------------------------------------------------------| - * | SEGCBLIST_OFFLOADED | | - * | SEGCBLIST_KTHREAD_CB | | - * | SEGCBLIST_KTHREAD_GP | + * | SEGCBLIST_LOCKING | | + * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_KTHREAD_GP | | + * | SEGCBLIST_KTHREAD_CB | * | | * | Kthreads handle callbacks holding nocb_lock, local rcu_core() stops | * | handling callbacks. Enable bypass queueing. | @@ -120,7 +123,8 @@ struct rcu_cblist { * * * |--------------------------------------------------------------------------| - * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_LOCKING | | + * | SEGCBLIST_OFFLOADED | | * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | * | | @@ -130,6 +134,8 @@ struct rcu_cblist { * | * v * |--------------------------------------------------------------------------| + * | SEGCBLIST_RCU_CORE | | + * | SEGCBLIST_LOCKING | | * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | * | | @@ -143,7 +149,9 @@ struct rcu_cblist { * | | * v v * ---------------------------------------------------------------------------| - * | | + * | | | + * | SEGCBLIST_RCU_CORE | | SEGCBLIST_RCU_CORE | | + * | SEGCBLIST_LOCKING | | SEGCBLIST_LOCKING | | * | SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | * | | | * | GP kthread woke up and | CB kthread woke up and | @@ -159,7 +167,7 @@ struct rcu_cblist { * | * v * ---------------------------------------------------------------------------- - * | 0 | + * | SEGCBLIST_RCU_CORE | SEGCBLIST_LOCKING | * | | * | Callbacks processed by rcu_core() from softirqs or local | * | rcuc kthread, while holding nocb_lock. Forbid nocb_timer to be armed. | @@ -168,17 +176,18 @@ struct rcu_cblist { * | * v * ---------------------------------------------------------------------------- - * | SEGCBLIST_SOFTIRQ_ONLY | + * | SEGCBLIST_RCU_CORE | * | | * | Callbacks processed by rcu_core() from softirqs or local | * | rcuc kthread, without holding nocb_lock. | * ---------------------------------------------------------------------------- */ #define SEGCBLIST_ENABLED BIT(0) -#define SEGCBLIST_SOFTIRQ_ONLY BIT(1) -#define SEGCBLIST_KTHREAD_CB BIT(2) -#define SEGCBLIST_KTHREAD_GP BIT(3) -#define SEGCBLIST_OFFLOADED BIT(4) +#define SEGCBLIST_RCU_CORE BIT(1) +#define SEGCBLIST_LOCKING BIT(2) +#define SEGCBLIST_KTHREAD_CB BIT(3) +#define SEGCBLIST_KTHREAD_GP BIT(4) +#define SEGCBLIST_OFFLOADED BIT(5) struct rcu_segcblist { struct rcu_head *head; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index aaa111237b60..c07aab6e39ef 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -261,14 +261,14 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) } /* - * Mark the specified rcu_segcblist structure as offloaded. + * Mark the specified rcu_segcblist structure as offloaded (or not) */ void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) { if (offload) { - rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY); - rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); + rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED); } else { + rcu_segcblist_set_flags(rsclp, SEGCBLIST_RCU_CORE); rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED); } } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 9a19328ff251..e373fbe44da5 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -80,11 +80,14 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED); } -/* Is the specified rcu_segcblist offloaded, or is SEGCBLIST_SOFTIRQ_ONLY set? */ +/* + * Is the specified rcu_segcblist NOCB offloaded (or in the middle of the + * [de]offloading process)? + */ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - !rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY)) + rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING)) return true; return false; @@ -92,9 +95,8 @@ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp) { - int flags = SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | SEGCBLIST_OFFLOADED; - - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && (rsclp->flags & flags) == flags) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE)) return true; return false; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8706b30c2ac8..e905d7e4ddb9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -79,7 +79,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(1), #ifdef CONFIG_RCU_NOCB_CPU - .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY, + .cblist.flags = SEGCBLIST_RCU_CORE, #endif }; static struct rcu_state rcu_state = { diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 368ef7b9af4f..b3e07d0bfbbf 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1000,12 +1000,12 @@ static long rcu_nocb_rdp_deoffload(void *arg) */ rcu_nocb_lock_irqsave(rdp, flags); /* - * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb + * Theoretically we could clear SEGCBLIST_LOCKING after the nocb * lock is released but how about being paranoid for once? */ - rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING); /* - * With SEGCBLIST_SOFTIRQ_ONLY, we can't use + * Without SEGCBLIST_LOCKING, we can't use * rcu_nocb_unlock_irqrestore() anymore. */ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); @@ -1058,14 +1058,14 @@ static long rcu_nocb_rdp_offload(void *arg) pr_info("Offloading %d\n", rdp->cpu); /* - * Can't use rcu_nocb_lock_irqsave() while we are in - * SEGCBLIST_SOFTIRQ_ONLY mode. + * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING + * is set. */ raw_spin_lock_irqsave(&rdp->nocb_lock, flags); /* * We didn't take the nocb lock while working on the - * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. + * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode). * Every modifications that have been done previously on * rdp->cblist must be visible remotely by the nocb kthreads * upon wake up after reading the cblist flags. @@ -1084,6 +1084,14 @@ static long rcu_nocb_rdp_offload(void *arg) rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + /* + * All kthreads are ready to work, we can finally relieve rcu_core() and + * enable nocb bypass. + */ + rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE); + rcu_nocb_unlock_irqrestore(rdp, flags); + return ret; } @@ -1154,8 +1162,8 @@ void __init rcu_init_nohz(void) if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE); } rcu_organize_nocb_kthreads(); } -- cgit v1.2.3 From fbb94cbd70d41c7511460896dfc7f9ea5da704b3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Oct 2021 02:08:08 +0200 Subject: rcu/nocb: Invoke rcu_core() at the start of deoffloading On PREEMPT_RT, if rcu_core() is preempted by the de-offloading process, some work, such as callbacks acceleration and invocation, may be left unattended due to the volatile checks on the offloaded state. In the worst case this work is postponed until the next rcu_pending() check that can take a jiffy to reach, which can be a problem in case of callbacks flooding. Solve that with invoking rcu_core() early in the de-offloading process. This way any work dismissed by an ongoing rcu_core() call fooled by a preempting deoffloading process will be caught up by a nearby future recall to rcu_core(), this time fully aware of the de-offloading state. Tested-by: Valentin Schneider Tested-by: Sebastian Andrzej Siewior Signed-off-by: Frederic Weisbecker Cc: Valentin Schneider Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Josh Triplett Cc: Joel Fernandes Cc: Boqun Feng Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 14 ++++++++++++++ kernel/rcu/rcu_segcblist.c | 6 ++---- kernel/rcu/tree.c | 17 +++++++++++++++++ kernel/rcu/tree_nocb.h | 9 +++++++++ 4 files changed, 42 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 812961b1d064..659d13a7ddaa 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -136,6 +136,20 @@ struct rcu_cblist { * |--------------------------------------------------------------------------| * | SEGCBLIST_RCU_CORE | | * | SEGCBLIST_LOCKING | | + * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_KTHREAD_CB | | + * | SEGCBLIST_KTHREAD_GP | + * | | + * | CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core() | + * | handles callbacks concurrently. Bypass enqueue is enabled. | + * | Invoke RCU core so we make sure not to preempt it in the middle with | + * | leaving some urgent work unattended within a jiffy. | + * ---------------------------------------------------------------------------- + * | + * v + * |--------------------------------------------------------------------------| + * | SEGCBLIST_RCU_CORE | | + * | SEGCBLIST_LOCKING | | * | SEGCBLIST_KTHREAD_CB | | * | SEGCBLIST_KTHREAD_GP | * | | diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index c07aab6e39ef..81145c3ece25 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -265,12 +265,10 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) */ void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) { - if (offload) { + if (offload) rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED); - } else { - rcu_segcblist_set_flags(rsclp, SEGCBLIST_RCU_CORE); + else rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED); - } } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e905d7e4ddb9..a329adfece86 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2707,6 +2707,23 @@ static __latent_entropy void rcu_core(void) unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + /* + * On RT rcu_core() can be preempted when IRQs aren't disabled. + * Therefore this function can race with concurrent NOCB (de-)offloading + * on this CPU and the below condition must be considered volatile. + * However if we race with: + * + * _ Offloading: In the worst case we accelerate or process callbacks + * concurrently with NOCB kthreads. We are guaranteed to + * call rcu_nocb_lock() if that happens. + * + * _ Deoffloading: In the worst case we miss callbacks acceleration or + * processing. This is fine because the early stage + * of deoffloading invokes rcu_core() after setting + * SEGCBLIST_RCU_CORE. So we guarantee that we'll process + * what could have been dismissed without the need to wait + * for the next rcu_pending() check in the next jiffy. + */ const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index b3e07d0bfbbf..2461fe8d0c23 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -990,6 +990,15 @@ static long rcu_nocb_rdp_deoffload(void *arg) * will refuse to put anything into the bypass. */ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); + /* + * Start with invoking rcu_core() early. This way if the current thread + * happens to preempt an ongoing call to rcu_core() in the middle, + * leaving some work dismissed because rcu_core() still thinks the rdp is + * completely offloaded, we are guaranteed a nearby future instance of + * rcu_core() to catch up. + */ + rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE); + invoke_rcu_core(); ret = rdp_offload_toggle(rdp, false, flags); swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | -- cgit v1.2.3 From 24ee940d89277602147ce1b8b4fd87b01b9a6660 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 Oct 2021 02:08:09 +0200 Subject: rcu/nocb: Make rcu_core() callbacks acceleration preempt-safe While reporting a quiescent state for a given CPU, rcu_core() takes advantage of the freshly loaded grace period sequence number and the locked rnp to accelerate the callbacks whose sequence number have been assigned a stale value. This action is only necessary when the rdp isn't offloaded, otherwise the NOCB kthreads already take care of the callbacks progression. However the check for the offloaded state is volatile because it is performed outside the IRQs disabled section. It's possible for the offloading process to preempt rcu_core() at that point on PREEMPT_RT. This is dangerous because rcu_core() may end up accelerating callbacks concurrently with NOCB kthreads without appropriate locking. Fix this with moving the offloaded check inside the rnp locking section. Reported-and-tested-by: Valentin Schneider Reviewed-by: Valentin Schneider Tested-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Josh Triplett Cc: Joel Fernandes Cc: Boqun Feng Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Thomas Gleixner Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a329adfece86..5985698f3341 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2278,7 +2278,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp) unsigned long flags; unsigned long mask; bool needwake = false; - const bool offloaded = rcu_rdp_is_offloaded(rdp); struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); @@ -2305,8 +2304,10 @@ rcu_report_qs_rdp(struct rcu_data *rdp) /* * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. + * + * NOCB kthreads have their own way to deal with that. */ - if (!offloaded) + if (!rcu_rdp_is_offloaded(rdp)) needwake = rcu_accelerate_cbs(rnp, rdp); rcu_disable_urgency_upon_qs(rdp); -- cgit v1.2.3 From b3bb02fe5a2b538ae53eda1fe591dd6c81a91ad4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Oct 2021 02:08:10 +0200 Subject: rcu/nocb: Make rcu_core() callbacks acceleration (de-)offloading safe When callbacks are offloaded, the NOCB kthreads handle the callbacks progression on behalf of rcu_core(). However during the (de-)offloading process, the kthread may not be entirely up to the task. As a result some callbacks grace period sequence number may remain stale for a while because rcu_core() won't take care of them either. Fix this with forcing callbacks acceleration from rcu_core() as long as the offloading process isn't complete. Reported-and-tested-by: Valentin Schneider Tested-by: Sebastian Andrzej Siewior Signed-off-by: Frederic Weisbecker Cc: Valentin Schneider Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Josh Triplett Cc: Joel Fernandes Cc: Boqun Feng Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5985698f3341..cb9abb80377f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2278,6 +2278,7 @@ rcu_report_qs_rdp(struct rcu_data *rdp) unsigned long flags; unsigned long mask; bool needwake = false; + bool needacc = false; struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); @@ -2305,16 +2306,29 @@ rcu_report_qs_rdp(struct rcu_data *rdp) * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. * - * NOCB kthreads have their own way to deal with that. + * NOCB kthreads have their own way to deal with that... */ - if (!rcu_rdp_is_offloaded(rdp)) + if (!rcu_rdp_is_offloaded(rdp)) { needwake = rcu_accelerate_cbs(rnp, rdp); + } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { + /* + * ...but NOCB kthreads may miss or delay callbacks acceleration + * if in the middle of a (de-)offloading process. + */ + needacc = true; + } rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ if (needwake) rcu_gp_kthread_wake(); + + if (needacc) { + rcu_nocb_lock_irqsave(rdp, flags); + rcu_accelerate_cbs_unlocked(rnp, rdp); + rcu_nocb_unlock_irqrestore(rdp, flags); + } } } -- cgit v1.2.3 From 344e219d7d2b28117daaae5fe8da2e054b53d5a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Oct 2021 02:08:11 +0200 Subject: rcu/nocb: Check a stable offloaded state to manipulate qlen_last_fqs_check It's not entirely obvious why rdp->qlen_last_fqs_check is updated before processing the queue only on offloaded rdp. There can be different effect to that, either in favour of triggering the force quiescent state path or not. For example: 1) If the number of callbacks has decreased since the last rdp->qlen_last_fqs_check update (because we recently called rcu_do_batch() and we executed below qhimark callbacks) and the number of processed callbacks on a subsequent do_batch() arranges for exceeding qhimark on non-offloaded but not on offloaded setup, then we may spare a later run to the force quiescent state slow path on __call_rcu_nocb_wake(), as compared to the non-offloaded counterpart scenario. Here is such an offloaded scenario instance: qhimark = 1000 rdp->last_qlen_last_fqs_check = 3000 rcu_segcblist_n_cbs(rdp) = 2000 rcu_do_batch() { if (offloaded) rdp->last_qlen_fqs_check = rcu_segcblist_n_cbs(rdp) // 2000 // run 1000 callback rcu_segcblist_n_cbs(rdp) = 1000 // Not updating rdp->qlen_last_fqs_check if (count < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = count; } call_rcu() * 1001 { __call_rcu_nocb_wake() { // not taking the fqs slowpath: // rcu_segcblist_n_cbs(rdp) == 2001 // rdp->qlen_last_fqs_check == 2000 // qhimark == 1000 if (len > rdp->qlen_last_fqs_check + qhimark) ... } In the case of a non-offloaded scenario, rdp->qlen_last_fqs_check would be 1000 and the fqs slowpath would have executed. 2) If the number of callbacks has increased since the last rdp->qlen_last_fqs_check update (because we recently queued below qhimark callbacks) and the number of callbacks executed in rcu_do_batch() doesn't exceed qhimark for either offloaded or non-offloaded setup, then it's possible that the offloaded scenario later run the force quiescent state slow path on __call_rcu_nocb_wake() while the non-offloaded doesn't. qhimark = 1000 rdp->last_qlen_last_fqs_check = 3000 rcu_segcblist_n_cbs(rdp) = 2000 rcu_do_batch() { if (offloaded) rdp->last_qlen_last_fqs_check = rcu_segcblist_n_cbs(rdp) // 2000 // run 100 callbacks // concurrent queued 100 rcu_segcblist_n_cbs(rdp) = 2000 // Not updating rdp->qlen_last_fqs_check if (count < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = count; } call_rcu() * 1001 { __call_rcu_nocb_wake() { // Taking the fqs slowpath: // rcu_segcblist_n_cbs(rdp) == 3001 // rdp->qlen_last_fqs_check == 2000 // qhimark == 1000 if (len > rdp->qlen_last_fqs_check + qhimark) ... } In the case of a non-offloaded scenario, rdp->qlen_last_fqs_check would be 3000 and the fqs slowpath would have executed. The reason for updating rdp->qlen_last_fqs_check when invoking callbacks for offloaded CPUs is that there is usually no point in waking up either the rcuog or rcuoc kthreads while in this state. After all, both threads are prohibited from indefinite sleeps. The exception is when some huge number of callbacks are enqueued while rcu_do_batch() is in the midst of invoking, in which case interrupting the rcuog kthread's timed sleep might get more callbacks set up for the next grace period. Reported-and-tested-by: Valentin Schneider Tested-by: Sebastian Andrzej Siewior Original-patch-by: Thomas Gleixner Signed-off-by: Frederic Weisbecker Cc: Valentin Schneider Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Josh Triplett Cc: Joel Fernandes Cc: Boqun Feng Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb9abb80377f..4cbfc4e4fa9e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2498,7 +2498,7 @@ static void rcu_do_batch(struct rcu_data *rdp) trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); - if (offloaded) + if (rcu_rdp_is_offloaded(rdp)) rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued")); -- cgit v1.2.3 From 7b65dfa32dca1be0400d43a3d5bb80ed6e04958e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Oct 2021 02:08:12 +0200 Subject: rcu/nocb: Use appropriate rcu_nocb_lock_irqsave() Instead of hardcoding IRQ save and nocb lock, use the consolidated API (and fix a comment as per Valentin Schneider's suggestion). Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Tested-by: Sebastian Andrzej Siewior Signed-off-by: Frederic Weisbecker Cc: Valentin Schneider Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Josh Triplett Cc: Joel Fernandes Cc: Boqun Feng Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4cbfc4e4fa9e..20587d035d03 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2478,12 +2478,11 @@ static void rcu_do_batch(struct rcu_data *rdp) } /* - * Extract the list of ready callbacks, disabling to prevent + * Extract the list of ready callbacks, disabling IRQs to prevent * races with call_rcu() from interrupt handlers. Leave the * callback counts, as rcu_barrier() needs to be conservative. */ - local_irq_save(flags); - rcu_nocb_lock(rdp); + rcu_nocb_lock_irqsave(rdp, flags); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); pending = rcu_segcblist_n_cbs(&rdp->cblist); div = READ_ONCE(rcu_divisor); @@ -2546,8 +2545,7 @@ static void rcu_do_batch(struct rcu_data *rdp) } } - local_irq_save(flags); - rcu_nocb_lock(rdp); + rcu_nocb_lock_irqsave(rdp, flags); rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); -- cgit v1.2.3 From 78ad37a2c50dfdb9a60e42bb9ee1da86d1fe770c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Oct 2021 02:08:13 +0200 Subject: rcu/nocb: Limit number of softirq callbacks only on softirq The current condition to limit the number of callbacks executed in a row checks the offloaded state of the rdp. Not only is it volatile but it is also misleading: the rcu_core() may well be executing callbacks concurrently with NOCB kthreads, and the offloaded state would then be verified on both cases. As a result the limit would spuriously not apply anymore on softirq while in the middle of (de-)offloading process. Fix and clarify the condition with those constraints in mind: _ If callbacks are processed either by rcuc or NOCB kthread, the call to cond_resched_tasks_rcu_qs() is enough to take care of the overload. _ If instead callbacks are processed by softirqs: * If need_resched(), exit the callbacks processing * Otherwise if CPU is idle we can continue * Otherwise exit because a softirq shouldn't interrupt a task for too long nor deprive other pending softirq vectors of the CPU. Tested-by: Valentin Schneider Tested-by: Sebastian Andrzej Siewior Signed-off-by: Frederic Weisbecker Cc: Valentin Schneider Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Josh Triplett Cc: Joel Fernandes Cc: Boqun Feng Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 20587d035d03..4f4d9ea6a9cb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2525,9 +2525,8 @@ static void rcu_do_batch(struct rcu_data *rdp) /* * Stop only if limit reached and CPU has something to do. */ - if (count >= bl && !offloaded && - (need_resched() || - (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) + if (count >= bl && in_serving_softirq() && + (need_resched() || !is_idle_task(current))) break; if (unlikely(tlimit)) { /* only call local_clock() every 32 callbacks */ -- cgit v1.2.3 From 3e61e95e2d095e308616cba4ffb640f95a480e01 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Oct 2021 02:08:14 +0200 Subject: rcu: Fix callbacks processing time limit retaining cond_resched() The callbacks processing time limit makes sure we are not exceeding a given amount of time executing the queue. However its "continue" clause bypasses the cond_resched() call on rcuc and NOCB kthreads, delaying it until we reach the limit, which can be very long... Make sure the scheduler has a higher priority than the time limit. Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Tested-by: Sebastian Andrzej Siewior Signed-off-by: Frederic Weisbecker Cc: Valentin Schneider Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Josh Triplett Cc: Joel Fernandes Cc: Boqun Feng Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4f4d9ea6a9cb..74e210ccfd38 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2525,9 +2525,21 @@ static void rcu_do_batch(struct rcu_data *rdp) /* * Stop only if limit reached and CPU has something to do. */ - if (count >= bl && in_serving_softirq() && - (need_resched() || !is_idle_task(current))) - break; + if (in_serving_softirq()) { + if (count >= bl && (need_resched() || !is_idle_task(current))) + break; + } else { + local_bh_enable(); + lockdep_assert_irqs_enabled(); + cond_resched_tasks_rcu_qs(); + lockdep_assert_irqs_enabled(); + local_bh_disable(); + } + + /* + * Make sure we don't spend too much time here and deprive other + * softirq vectors of CPU cycles. + */ if (unlikely(tlimit)) { /* only call local_clock() every 32 callbacks */ if (likely((count & 31) || local_clock() < tlimit)) @@ -2535,13 +2547,6 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Exceeded the time limit, so leave. */ break; } - if (!in_serving_softirq()) { - local_bh_enable(); - lockdep_assert_irqs_enabled(); - cond_resched_tasks_rcu_qs(); - lockdep_assert_irqs_enabled(); - local_bh_disable(); - } } rcu_nocb_lock_irqsave(rdp, flags); -- cgit v1.2.3 From a554ba288845fd3f6f12311fd76a51694233458a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Oct 2021 02:08:15 +0200 Subject: rcu: Apply callbacks processing time limit only on softirq Time limit only makes sense when callbacks are serviced in softirq mode because: _ In case we need to get back to the scheduler, cond_resched_tasks_rcu_qs() is called after each callback. _ In case some other softirq vector needs the CPU, the call to local_bh_enable() before cond_resched_tasks_rcu_qs() takes care about them via a call to do_softirq(). Therefore, make sure the time limit only applies to softirq mode. Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Tested-by: Sebastian Andrzej Siewior Signed-off-by: Frederic Weisbecker Cc: Valentin Schneider Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Josh Triplett Cc: Joel Fernandes Cc: Boqun Feng Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 74e210ccfd38..1b31fad9f1a0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2488,7 +2488,7 @@ static void rcu_do_batch(struct rcu_data *rdp) div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); - if (unlikely(bl > 100)) { + if (in_serving_softirq() && unlikely(bl > 100)) { long rrn = READ_ONCE(rcu_resched_ns); rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; @@ -2528,6 +2528,17 @@ static void rcu_do_batch(struct rcu_data *rdp) if (in_serving_softirq()) { if (count >= bl && (need_resched() || !is_idle_task(current))) break; + /* + * Make sure we don't spend too much time here and deprive other + * softirq vectors of CPU cycles. + */ + if (unlikely(tlimit)) { + /* only call local_clock() every 32 callbacks */ + if (likely((count & 31) || local_clock() < tlimit)) + continue; + /* Exceeded the time limit, so leave. */ + break; + } } else { local_bh_enable(); lockdep_assert_irqs_enabled(); @@ -2535,18 +2546,6 @@ static void rcu_do_batch(struct rcu_data *rdp) lockdep_assert_irqs_enabled(); local_bh_disable(); } - - /* - * Make sure we don't spend too much time here and deprive other - * softirq vectors of CPU cycles. - */ - if (unlikely(tlimit)) { - /* only call local_clock() every 32 callbacks */ - if (likely((count & 31) || local_clock() < tlimit)) - continue; - /* Exceeded the time limit, so leave. */ - break; - } } rcu_nocb_lock_irqsave(rdp, flags); -- cgit v1.2.3 From 0598a4d4429c0a952ac0e99e5280354cf4ccc01c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Oct 2021 02:08:16 +0200 Subject: rcu/nocb: Don't invoke local rcu core on callback overload from nocb kthread rcu_core() tries to ensure that its self-invocation in case of callbacks overload only happen in softirq/rcuc mode. Indeed it doesn't make sense to trigger local RCU core from nocb_cb kthread since it can execute on a CPU different from the target rdp. Also in case of overload, the nocb_cb kthread simply iterates a new loop of callbacks processing. However the "offloaded" check that aims at preventing misplaced rcu_core() invocations is wrong. First of all that state is volatile and second: softirq/rcuc can execute while the target rdp is offloaded. As a result rcu_core() can be invoked on the wrong CPU while in the process of (de-)offloading. Fix that with moving the rcu_core() self-invocation to rcu_core() itself, irrespective of the rdp offloaded state. Tested-by: Valentin Schneider Tested-by: Sebastian Andrzej Siewior Signed-off-by: Frederic Weisbecker Cc: Valentin Schneider Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Josh Triplett Cc: Joel Fernandes Cc: Boqun Feng Cc: Neeraj Upadhyay Cc: Uladzislau Rezki Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1b31fad9f1a0..03c038f324a0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2460,7 +2460,6 @@ static void rcu_do_batch(struct rcu_data *rdp) int div; bool __maybe_unused empty; unsigned long flags; - const bool offloaded = rcu_rdp_is_offloaded(rdp); struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count = 0; @@ -2582,9 +2581,6 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_unlock_irqrestore(rdp, flags); - /* Re-invoke RCU core processing if there are callbacks remaining. */ - if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist)) - invoke_rcu_core(); tick_dep_clear_task(current, TICK_DEP_BIT_RCU); } @@ -2771,8 +2767,12 @@ static __latent_entropy void rcu_core(void) /* If there are callbacks ready, invoke them. */ if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) && - likely(READ_ONCE(rcu_scheduler_fully_active))) + likely(READ_ONCE(rcu_scheduler_fully_active))) { rcu_do_batch(rdp); + /* Re-invoke RCU core processing if there are callbacks remaining. */ + if (rcu_segcblist_ready_cbs(&rdp->cblist)) + invoke_rcu_core(); + } /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); -- cgit v1.2.3 From cafafd67765b21334086b3fb8963ad9c5866c03d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Nov 2021 21:52:00 -0700 Subject: rcu-tasks: Create per-CPU callback lists Currently, RCU Tasks Trace (as well as the other two flavors of RCU Tasks) use a single global callback list. This works well and is simple, but expected changes in workload will cause this list to become a bottleneck. This commit therefore creates per-CPU callback lists for the various flavors of RCU Tasks, but continues queueing on a single list, namely that of CPU 0. Later commits will dynamically vary the number of lists in use to accommodate dynamic changes in workload. Reported-by: Martin Lau Cc: Neeraj Upadhyay Tested-by: kernel test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 106 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index bd44cd4794d3..30048db7aa49 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -20,11 +20,21 @@ typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); typedef void (*postgp_func_t)(struct rcu_tasks *rtp); /** - * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. + * struct rcu_tasks_percpu - Per-CPU component of definition for a Tasks-RCU-like mechanism. * @cbs_head: Head of callback list. * @cbs_tail: Tail pointer for callback list. + * @cbs_pcpu_lock: Lock protecting per-CPU callback list. + */ +struct rcu_tasks_percpu { + struct rcu_head *cbs_head; + struct rcu_head **cbs_tail; + raw_spinlock_t cbs_pcpu_lock; +}; + +/** + * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. * @cbs_wq: Wait queue allowing new callback to get kthread's attention. - * @cbs_lock: Lock protecting callback list. + * @cbs_gbl_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. * @gp_state: Grace period's most recent state transition (debugging). @@ -41,14 +51,13 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @holdouts_func: This flavor's holdout-list scan function (optional). * @postgp_func: This flavor's post-grace-period function (optional). * @call_func: This flavor's call_rcu()-equivalent function. + * @rtpcpu: This flavor's rcu_tasks_percpu structure. * @name: This flavor's textual name. * @kname: This flavor's kthread name. */ struct rcu_tasks { - struct rcu_head *cbs_head; - struct rcu_head **cbs_tail; struct wait_queue_head cbs_wq; - raw_spinlock_t cbs_lock; + raw_spinlock_t cbs_gbl_lock; int gp_state; int gp_sleep; int init_fract; @@ -65,20 +74,24 @@ struct rcu_tasks { holdouts_func_t holdouts_func; postgp_func_t postgp_func; call_rcu_func_t call_func; + struct rcu_tasks_percpu __percpu *rtpcpu; char *name; char *kname; }; -#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ -static struct rcu_tasks rt_name = \ -{ \ - .cbs_tail = &rt_name.cbs_head, \ - .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ - .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock), \ - .gp_func = gp, \ - .call_func = call, \ - .name = n, \ - .kname = #rt_name, \ +#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ +static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \ + .cbs_pcpu_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \ +}; \ +static struct rcu_tasks rt_name = \ +{ \ + .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \ + .gp_func = gp, \ + .call_func = call, \ + .rtpcpu = &rt_name ## __percpu, \ + .name = n, \ + .kname = #rt_name, \ } /* Track exiting tasks in order to allow them to be waited for. */ @@ -148,20 +161,51 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) } #endif /* #ifndef CONFIG_TINY_RCU */ +// Initialize per-CPU callback lists for the specified flavor of +// Tasks RCU. +static void cblist_init_generic(struct rcu_tasks *rtp) +{ + int cpu; + unsigned long flags; + + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + WARN_ON_ONCE(!rtpcp); + if (cpu) + raw_spin_lock_init(&rtpcp->cbs_pcpu_lock); + raw_spin_lock(&rtpcp->cbs_pcpu_lock); // irqs already disabled. + if (!WARN_ON_ONCE(rtpcp->cbs_tail)) + rtpcp->cbs_tail = &rtpcp->cbs_head; + raw_spin_unlock(&rtpcp->cbs_pcpu_lock); // irqs remain disabled. + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + +} + // Enqueue a callback for the specified flavor of Tasks RCU. static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, struct rcu_tasks *rtp) { unsigned long flags; bool needwake; + struct rcu_tasks_percpu *rtpcp; rhp->next = NULL; rhp->func = func; - raw_spin_lock_irqsave(&rtp->cbs_lock, flags); - needwake = !rtp->cbs_head; - WRITE_ONCE(*rtp->cbs_tail, rhp); - rtp->cbs_tail = &rhp->next; - raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); + local_irq_save(flags); + rtpcp = per_cpu_ptr(rtp->rtpcpu, 0 /* smp_processor_id() */); + raw_spin_lock(&rtpcp->cbs_pcpu_lock); + if (!rtpcp->cbs_tail) { + raw_spin_unlock(&rtpcp->cbs_pcpu_lock); // irqs remain disabled. + cblist_init_generic(rtp); + raw_spin_lock(&rtpcp->cbs_pcpu_lock); // irqs already disabled. + } + needwake = !rtpcp->cbs_head; + WRITE_ONCE(*rtpcp->cbs_tail, rhp); + rtpcp->cbs_tail = &rhp->next; + raw_spin_unlock_irqrestore(&rtpcp->cbs_pcpu_lock, flags); /* We can't create the thread unless interrupts are enabled. */ if (needwake && READ_ONCE(rtp->kthread_ptr)) wake_up(&rtp->cbs_wq); @@ -197,21 +241,23 @@ static int __noreturn rcu_tasks_kthread(void *arg) * This loop is terminated by the system going down. ;-) */ for (;;) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each... + set_tasks_gp_state(rtp, RTGS_WAIT_CBS); /* Pick up any new callbacks. */ - raw_spin_lock_irqsave(&rtp->cbs_lock, flags); + raw_spin_lock_irqsave(&rtpcp->cbs_pcpu_lock, flags); smp_mb__after_spinlock(); // Order updates vs. GP. - list = rtp->cbs_head; - rtp->cbs_head = NULL; - rtp->cbs_tail = &rtp->cbs_head; - raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); + list = rtpcp->cbs_head; + rtpcp->cbs_head = NULL; + rtpcp->cbs_tail = &rtpcp->cbs_head; + raw_spin_unlock_irqrestore(&rtpcp->cbs_pcpu_lock, flags); /* If there were none, wait a bit and start over. */ if (!list) { wait_event_interruptible(rtp->cbs_wq, - READ_ONCE(rtp->cbs_head)); - if (!rtp->cbs_head) { + READ_ONCE(rtpcp->cbs_head)); + if (!rtpcp->cbs_head) { WARN_ON(signal_pending(current)); set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); schedule_timeout_idle(HZ/10); @@ -279,6 +325,7 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each... pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), @@ -286,7 +333,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) data_race(rtp->n_gps), data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], - ".C"[!!data_race(rtp->cbs_head)], + ".C"[!!data_race(rtpcp->cbs_head)], s); } #endif // #ifndef CONFIG_TINY_RCU @@ -593,6 +640,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks); static int __init rcu_spawn_tasks_kthread(void) { + cblist_init_generic(&rcu_tasks); rcu_tasks.gp_sleep = HZ / 10; rcu_tasks.init_fract = HZ / 10; rcu_tasks.pregp_func = rcu_tasks_pregp_step; @@ -731,6 +779,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); static int __init rcu_spawn_tasks_rude_kthread(void) { + cblist_init_generic(&rcu_tasks_rude); rcu_tasks_rude.gp_sleep = HZ / 10; rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; @@ -1264,6 +1313,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); static int __init rcu_spawn_tasks_trace_kthread(void) { + cblist_init_generic(&rcu_tasks_trace); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { rcu_tasks_trace.gp_sleep = HZ / 10; rcu_tasks_trace.init_fract = HZ / 10; -- cgit v1.2.3 From 7a30871b6a27de1a1f418c7fd2c5dde9a46bfd16 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Nov 2021 10:51:13 -0800 Subject: rcu-tasks: Introduce ->percpu_enqueue_shift for dynamic queue selection This commit introduces a ->percpu_enqueue_shift field to the rcu_tasks structure, and uses it to shift down the CPU number in order to select a rcu_tasks_percpu structure. This field is currently set to a sufficiently large shift count to always select the CPU-0 instance of the rcu_tasks_percpu structure, and later commits will adjust this. Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 30048db7aa49..2a5fe3e04b36 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -52,6 +52,7 @@ struct rcu_tasks_percpu { * @postgp_func: This flavor's post-grace-period function (optional). * @call_func: This flavor's call_rcu()-equivalent function. * @rtpcpu: This flavor's rcu_tasks_percpu structure. + * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. * @name: This flavor's textual name. * @kname: This flavor's kthread name. */ @@ -75,6 +76,7 @@ struct rcu_tasks { postgp_func_t postgp_func; call_rcu_func_t call_func; struct rcu_tasks_percpu __percpu *rtpcpu; + int percpu_enqueue_shift; char *name; char *kname; }; @@ -91,6 +93,7 @@ static struct rcu_tasks rt_name = \ .call_func = call, \ .rtpcpu = &rt_name ## __percpu, \ .name = n, \ + .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS), \ .kname = #rt_name, \ } @@ -169,6 +172,7 @@ static void cblist_init_generic(struct rcu_tasks *rtp) unsigned long flags; raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + rtp->percpu_enqueue_shift = ilog2(nr_cpu_ids); for_each_possible_cpu(cpu) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); @@ -195,7 +199,8 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rhp->next = NULL; rhp->func = func; local_irq_save(flags); - rtpcp = per_cpu_ptr(rtp->rtpcpu, 0 /* smp_processor_id() */); + rtpcp = per_cpu_ptr(rtp->rtpcpu, + smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift)); raw_spin_lock(&rtpcp->cbs_pcpu_lock); if (!rtpcp->cbs_tail) { raw_spin_unlock(&rtpcp->cbs_pcpu_lock); // irqs remain disabled. -- cgit v1.2.3 From b14fb4fbbcd8ff62f1a7aa2c6e2603424c117943 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Nov 2021 14:14:43 -0800 Subject: rcu-tasks: Convert grace-period counter to grace-period sequence number This commit moves the rcu_tasks structure's ->n_gps grace-period-counter field to a ->task_gp_seq grce-period sequence number in order to enable use of the rcu_segcblist structure for the callback lists. This in turn permits CPUs to lag behind the RCU Tasks grace-period sequence number without suffering long-term slowdowns in callback invocation. Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 2a5fe3e04b36..ed84d59b6dbf 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -42,7 +42,7 @@ struct rcu_tasks_percpu { * @init_fract: Initial backoff sleep interval. * @gp_jiffies: Time of last @gp_state transition. * @gp_start: Most recent grace-period start in jiffies. - * @n_gps: Number of grace periods completed since boot. + * @tasks_gp_seq: Number of grace periods completed since boot. * @n_ipis: Number of IPIs sent to encourage grace periods to end. * @n_ipis_fails: Number of IPI-send failures. * @pregp_func: This flavor's pre-grace-period function (optional). @@ -64,7 +64,7 @@ struct rcu_tasks { int init_fract; unsigned long gp_jiffies; unsigned long gp_start; - unsigned long n_gps; + unsigned long tasks_gp_seq; unsigned long n_ipis; unsigned long n_ipis_fails; struct task_struct *kthread_ptr; @@ -273,8 +273,9 @@ static int __noreturn rcu_tasks_kthread(void *arg) // Wait for one grace period. set_tasks_gp_state(rtp, RTGS_WAIT_GP); rtp->gp_start = jiffies; + rcu_seq_start(&rtp->tasks_gp_seq); rtp->gp_func(rtp); - rtp->n_gps++; + rcu_seq_end(&rtp->tasks_gp_seq); /* Invoke the callbacks. */ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); @@ -335,7 +336,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), jiffies - data_race(rtp->gp_jiffies), - data_race(rtp->n_gps), + data_race(rcu_seq_current(&rtp->tasks_gp_seq)), data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], ".C"[!!data_race(rtpcp->cbs_head)], -- cgit v1.2.3 From 9b073de1c7a354af7cb7100952599dde461aee45 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Nov 2021 16:18:57 -0800 Subject: rcu_tasks: Convert bespoke callback list to rcu_segcblist structure This commit moves from a bespoke head and tail pointer in the rcu_tasks_percpu structure to an rcu_segcblist structure, thus allowing associating the grace-period sequence number with groups of callbacks. This in turn will allow callbacks to be invoked independently on different CPUs. Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 2 +- kernel/rcu/tasks.h | 52 +++++++++++++++++++++++++++++----------------------- 2 files changed, 30 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 3128b7cf8e1f..42bcd34312c2 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -112,7 +112,7 @@ config RCU_STALL_COMMON making these warnings mandatory for the tree variants. config RCU_NEED_SEGCBLIST - def_bool ( TREE_RCU || TREE_SRCU ) + def_bool ( TREE_RCU || TREE_SRCU || TASKS_RCU_GENERIC ) config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index ed84d59b6dbf..2e58d7fa2da4 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -6,6 +6,7 @@ */ #ifdef CONFIG_TASKS_RCU_GENERIC +#include "rcu_segcblist.h" //////////////////////////////////////////////////////////////////////// // @@ -21,13 +22,11 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); /** * struct rcu_tasks_percpu - Per-CPU component of definition for a Tasks-RCU-like mechanism. - * @cbs_head: Head of callback list. - * @cbs_tail: Tail pointer for callback list. + * @cblist: Callback list. * @cbs_pcpu_lock: Lock protecting per-CPU callback list. */ struct rcu_tasks_percpu { - struct rcu_head *cbs_head; - struct rcu_head **cbs_tail; + struct rcu_segcblist cblist; raw_spinlock_t cbs_pcpu_lock; }; @@ -180,8 +179,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp) if (cpu) raw_spin_lock_init(&rtpcp->cbs_pcpu_lock); raw_spin_lock(&rtpcp->cbs_pcpu_lock); // irqs already disabled. - if (!WARN_ON_ONCE(rtpcp->cbs_tail)) - rtpcp->cbs_tail = &rtpcp->cbs_head; + if (rcu_segcblist_empty(&rtpcp->cblist)) + rcu_segcblist_init(&rtpcp->cblist); raw_spin_unlock(&rtpcp->cbs_pcpu_lock); // irqs remain disabled. } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); @@ -202,14 +201,13 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rtpcp = per_cpu_ptr(rtp->rtpcpu, smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift)); raw_spin_lock(&rtpcp->cbs_pcpu_lock); - if (!rtpcp->cbs_tail) { + if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) { raw_spin_unlock(&rtpcp->cbs_pcpu_lock); // irqs remain disabled. cblist_init_generic(rtp); raw_spin_lock(&rtpcp->cbs_pcpu_lock); // irqs already disabled. } - needwake = !rtpcp->cbs_head; - WRITE_ONCE(*rtpcp->cbs_tail, rhp); - rtpcp->cbs_tail = &rhp->next; + needwake = rcu_segcblist_empty(&rtpcp->cblist); + rcu_segcblist_enqueue(&rtpcp->cblist, rhp); raw_spin_unlock_irqrestore(&rtpcp->cbs_pcpu_lock, flags); /* We can't create the thread unless interrupts are enabled. */ if (needwake && READ_ONCE(rtp->kthread_ptr)) @@ -231,8 +229,9 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) static int __noreturn rcu_tasks_kthread(void *arg) { unsigned long flags; - struct rcu_head *list; - struct rcu_head *next; + int len; + struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); + struct rcu_head *rhp; struct rcu_tasks *rtp = arg; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ @@ -253,16 +252,15 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* Pick up any new callbacks. */ raw_spin_lock_irqsave(&rtpcp->cbs_pcpu_lock, flags); smp_mb__after_spinlock(); // Order updates vs. GP. - list = rtpcp->cbs_head; - rtpcp->cbs_head = NULL; - rtpcp->cbs_tail = &rtpcp->cbs_head; + rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); + (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); raw_spin_unlock_irqrestore(&rtpcp->cbs_pcpu_lock, flags); /* If there were none, wait a bit and start over. */ - if (!list) { + if (!rcu_segcblist_pend_cbs(&rtpcp->cblist)) { wait_event_interruptible(rtp->cbs_wq, - READ_ONCE(rtpcp->cbs_head)); - if (!rtpcp->cbs_head) { + rcu_segcblist_pend_cbs(&rtpcp->cblist)); + if (!rcu_segcblist_pend_cbs(&rtpcp->cblist)) { WARN_ON(signal_pending(current)); set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); schedule_timeout_idle(HZ/10); @@ -279,14 +277,22 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* Invoke the callbacks. */ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); - while (list) { - next = list->next; + raw_spin_lock_irqsave(&rtpcp->cbs_pcpu_lock, flags); + smp_mb__after_spinlock(); // Order updates vs. GP. + rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); + rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl); + raw_spin_unlock_irqrestore(&rtpcp->cbs_pcpu_lock, flags); + len = rcl.len; + for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) { local_bh_disable(); - list->func(list); + rhp->func(rhp); local_bh_enable(); - list = next; cond_resched(); } + raw_spin_lock_irqsave(&rtpcp->cbs_pcpu_lock, flags); + rcu_segcblist_add_len(&rtpcp->cblist, -len); + (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); + raw_spin_unlock_irqrestore(&rtpcp->cbs_pcpu_lock, flags); /* Paranoid sleep to keep this from entering a tight loop */ schedule_timeout_idle(rtp->gp_sleep); } @@ -339,7 +345,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) data_race(rcu_seq_current(&rtp->tasks_gp_seq)), data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], - ".C"[!!data_race(rtpcp->cbs_head)], + ".C"[!data_race(rcu_segcblist_empty(&rtpcp->cblist))], s); } #endif // #ifndef CONFIG_TINY_RCU -- cgit v1.2.3 From 4feeb9d5f82229bf39621be755bda81521539476 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 25 Oct 2021 11:26:58 +0800 Subject: refscale: Always log the error message An OOM is a serious error that should be logged even in non-verbose runs. This commit therefore adds an unconditional SCALEOUT_ERRSTRING() macro and uses it instead of VERBOSE_SCALEOUT_ERRSTRING() when reporting an OOM. [ paulmck: Drop do-while from SCALEOUT_ERRSTRING() due to only single statement. ] Signed-off-by: Li Zhijian Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index d8ed1ca3adc0..c47685634640 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -58,8 +58,7 @@ do { \ } \ } while (0) -#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \ - do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0) +#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Joel Fernandes (Google) "); @@ -651,7 +650,7 @@ static int main_func(void *arg) result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL); buf = kzalloc(800 + 64, GFP_KERNEL); if (!result_avg || !buf) { - VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + SCALEOUT_ERRSTRING("out of memory"); goto oom_exit; } if (holdoff) @@ -837,7 +836,7 @@ ref_scale_init(void) reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (!reader_tasks) { - VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } -- cgit v1.2.3 From f71f22b67d37c91a5c4320f6e821f64eb189627d Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 29 Oct 2021 17:40:24 +0800 Subject: refscale: Add missing '\n' to flush message Add '\n' to macros to flush message for each call. Acked-by: Davidlohr Bueso Signed-off-by: Li Zhijian Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index c47685634640..5489ff7f478e 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -44,7 +44,10 @@ pr_alert("%s" SCALE_FLAG s, scale_type, ## x) #define VERBOSE_SCALEOUT(s, x...) \ - do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0) + do { \ + if (verbose) \ + pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \ + } while (0) static atomic_t verbose_batch_ctr; @@ -54,11 +57,11 @@ do { \ (verbose_batched <= 0 || \ !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) { \ schedule_timeout_uninterruptible(1); \ - pr_alert("%s" SCALE_FLAG s, scale_type, ## x); \ + pr_alert("%s" SCALE_FLAG s "\n", scale_type, ## x); \ } \ } while (0) -#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x) +#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Joel Fernandes (Google) "); @@ -841,7 +844,7 @@ ref_scale_init(void) goto unwind; } - VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders); + VERBOSE_SCALEOUT("Starting %d reader threads", nreaders); for (i = 0; i < nreaders; i++) { firsterr = torture_create_kthread(ref_scale_reader, (void *)i, -- cgit v1.2.3 From 71f6ea2a0be06f9d1833852ee4f501a76563acd3 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 29 Oct 2021 17:40:25 +0800 Subject: scftorture: Add missing '\n' to flush message Add '\n' to macros to flush message for each call. Acked-by: Davidlohr Bueso Signed-off-by: Li Zhijian Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 5d42f44e3e1a..9cff573b7eb4 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -42,10 +42,10 @@ pr_alert(SCFTORT_FLAG s, ## x) #define VERBOSE_SCFTORTOUT(s, x...) \ - do { if (verbose) pr_alert(SCFTORT_FLAG s, ## x); } while (0) + do { if (verbose) pr_alert(SCFTORT_FLAG s "\n", ## x); } while (0) #define VERBOSE_SCFTORTOUT_ERRSTRING(s, x...) \ - do { if (verbose) pr_alert(SCFTORT_FLAG "!!! " s, ## x); } while (0) + do { if (verbose) pr_alert(SCFTORT_FLAG "!!! " s "\n", ## x); } while (0) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); @@ -630,7 +630,7 @@ static int __init scf_torture_init(void) goto unwind; } - VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads\n", nthreads); + VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads", nthreads); atomic_set(&n_started, nthreads); for (i = 0; i < nthreads; i++) { -- cgit v1.2.3 From 04cf8518860167081ee49b5ff2e1616244ab760e Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 29 Oct 2021 17:40:26 +0800 Subject: scftorture: Remove unused SCFTORTOUT There are no longer any users of SCFTORTOUT(), so this commit removes it. Acked-by: Davidlohr Bueso Signed-off-by: Li Zhijian Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 9cff573b7eb4..a0df767897a1 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -38,9 +38,6 @@ #define SCFTORT_STRING "scftorture" #define SCFTORT_FLAG SCFTORT_STRING ": " -#define SCFTORTOUT(s, x...) \ - pr_alert(SCFTORT_FLAG s, ## x) - #define VERBOSE_SCFTORTOUT(s, x...) \ do { if (verbose) pr_alert(SCFTORT_FLAG s "\n", ## x); } while (0) -- cgit v1.2.3 From 86e7ed1bd57d020e35d430542bf5d689c3200568 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Fri, 29 Oct 2021 17:40:28 +0800 Subject: rcuscale: Always log error message Unconditionally log messages corresponding to errors. Acked-by: Davidlohr Bueso Signed-off-by: Li Zhijian Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuscale.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 228f143bf935..5e4f1f83d38e 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -50,8 +50,8 @@ MODULE_AUTHOR("Paul E. McKenney "); pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s) #define VERBOSE_SCALEOUT_STRING(s) \ do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0) -#define VERBOSE_SCALEOUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s); } while (0) +#define SCALEOUT_ERRSTRING(s) \ + pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s) /* * The intended use cases for the nreaders and nwriters module parameters @@ -514,11 +514,11 @@ rcu_scale_cleanup(void) * during the mid-boot phase, so have to wait till the end. */ if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - VERBOSE_SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); if (rcu_gp_is_normal() && gp_exp) - VERBOSE_SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); if (gp_exp && gp_async) - VERBOSE_SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); + SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); if (torture_cleanup_begin()) return; @@ -845,7 +845,7 @@ rcu_scale_init(void) reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { - VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } @@ -865,7 +865,7 @@ rcu_scale_init(void) kcalloc(nrealwriters, sizeof(*writer_n_durations), GFP_KERNEL); if (!writer_tasks || !writer_durations || !writer_n_durations) { - VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + SCALEOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } -- cgit v1.2.3 From 809da9bf805094e1cf032fd8000e8b77d5eec642 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 3 Nov 2021 16:30:27 +0800 Subject: scftorture: Always log error message Unconditionally log messages corresponding to errors. Acked-by: Davidlohr Bueso Signed-off-by: Li Zhijian Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index a0df767897a1..dcb0410950e4 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -41,8 +41,7 @@ #define VERBOSE_SCFTORTOUT(s, x...) \ do { if (verbose) pr_alert(SCFTORT_FLAG s "\n", ## x); } while (0) -#define VERBOSE_SCFTORTOUT_ERRSTRING(s, x...) \ - do { if (verbose) pr_alert(SCFTORT_FLAG "!!! " s "\n", ## x); } while (0) +#define SCFTORTOUT_ERRSTRING(s, x...) pr_alert(SCFTORT_FLAG "!!! " s "\n", ## x) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); @@ -584,14 +583,14 @@ static int __init scf_torture_init(void) if (weight_resched1 == 0 && weight_single1 == 0 && weight_single_rpc1 == 0 && weight_single_wait1 == 0 && weight_many1 == 0 && weight_many_wait1 == 0 && weight_all1 == 0 && weight_all_wait1 == 0) { - VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense"); + SCFTORTOUT_ERRSTRING("all zero weights makes no sense"); firsterr = -EINVAL; goto unwind; } if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST)) scf_sel_add(weight_resched1, SCF_PRIM_RESCHED, false); else if (weight_resched1) - VERBOSE_SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored"); + SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored"); scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false); scf_sel_add(weight_single_rpc1, SCF_PRIM_SINGLE_RPC, true); scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true); @@ -622,7 +621,7 @@ static int __init scf_torture_init(void) nthreads = num_online_cpus(); scf_stats_p = kcalloc(nthreads, sizeof(scf_stats_p[0]), GFP_KERNEL); if (!scf_stats_p) { - VERBOSE_SCFTORTOUT_ERRSTRING("out of memory"); + SCFTORTOUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } -- cgit v1.2.3 From 81faa4f6fba429334ff72bb5ba7696818509b5b5 Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 3 Nov 2021 16:30:28 +0800 Subject: locktorture,rcutorture,torture: Always log error message Unconditionally log messages corresponding to errors. Acked-by: Davidlohr Bueso Signed-off-by: Li Zhijian Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 9 ++------- kernel/locking/locktorture.c | 4 ++-- kernel/rcu/rcutorture.c | 8 ++++---- kernel/torture.c | 4 ++-- 4 files changed, 10 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 24f58e50a94b..63fa4196e51c 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -38,13 +38,8 @@ do { \ pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); \ } \ } while (0) -#define VERBOSE_TOROUT_ERRSTRING(s) \ -do { \ - if (verbose) { \ - verbose_torout_sleep(); \ - pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); \ - } \ -} while (0) +#define TOROUT_ERRSTRING(s) \ + pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s) void verbose_torout_sleep(void); #define torture_init_error(firsterr) \ diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 397ac13d2ef7..9c2fb613a55d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -1047,7 +1047,7 @@ static int __init lock_torture_init(void) sizeof(writer_tasks[0]), GFP_KERNEL); if (writer_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); + TOROUT_ERRSTRING("writer_tasks: Out of memory"); firsterr = -ENOMEM; goto unwind; } @@ -1058,7 +1058,7 @@ static int __init lock_torture_init(void) sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); + TOROUT_ERRSTRING("reader_tasks: Out of memory"); kfree(writer_tasks); writer_tasks = NULL; firsterr = -ENOMEM; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 503e14e62e8f..36a273589a35 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2774,7 +2774,7 @@ static int rcu_torture_read_exit(void *unused) &trs, "%s", "rcu_torture_read_exit_child"); if (IS_ERR(tsp)) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); errexit = true; tsp = NULL; break; @@ -3101,7 +3101,7 @@ rcu_torture_init(void) sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } @@ -3117,7 +3117,7 @@ rcu_torture_init(void) rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk), GFP_KERNEL); if (!reader_tasks || !rcu_torture_reader_mbchk) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } @@ -3136,7 +3136,7 @@ rcu_torture_init(void) if (nrealnocbers > 0) { nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL); if (nocb_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); + TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } diff --git a/kernel/torture.c b/kernel/torture.c index bb8f411c974b..ef27a6c82451 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -570,7 +570,7 @@ int torture_shuffle_init(long shuffint) shuffle_idle_cpu = -1; if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { - VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask"); + TOROUT_ERRSTRING("Failed to alloc mask"); return -ENOMEM; } @@ -934,7 +934,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, *tp = kthread_run(fn, arg, "%s", s); if (IS_ERR(*tp)) { ret = PTR_ERR(*tp); - VERBOSE_TOROUT_ERRSTRING(f); + TOROUT_ERRSTRING(f); *tp = NULL; } torture_shuffle_task_register(*tp); -- cgit v1.2.3 From 5ff7c9f9d7e3e0f6db5b81945fa11b69d62f433a Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Wed, 10 Nov 2021 11:37:45 -0300 Subject: rcutorture: Avoid soft lockup during cpu stall If we use the module stall_cpu option, we may get a soft lockup warning in case we also don't pass the stall_cpu_block option. Introduce the stall_no_softlockup option to avoid a soft lockup on cpu stall even if we don't use the stall_cpu_block option. Signed-off-by: Wander Lairson Costa Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 36a273589a35..bc854f935548 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "rcu.h" @@ -112,6 +113,8 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); +torture_param(bool, stall_no_softlockup, false, + "Avoid softlockup warning during cpu stall."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); torture_param(int, stall_gp_kthread, 0, @@ -2085,6 +2088,8 @@ static int rcu_torture_stall(void *args) #else schedule_timeout_uninterruptible(HZ); #endif + } else if (stall_no_softlockup) { + touch_softlockup_watchdog(); } if (stall_cpu_irqsoff) local_irq_enable(); -- cgit v1.2.3 From 82e310033d7c21a7a88427f14e0dad78d731a5cd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Nov 2021 20:55:18 -0800 Subject: rcutorture: Enable multiple concurrent callback-flood kthreads This commit converts the rcutorture.fwd_progress module parameter from bool to int, so that it specifies the number of callback-flood kthreads. Values less than zero specify one kthread per CPU, however, the number of kthreads executing concurrently is limited to the number of online CPUs. This commit also reverse the order of the need-resched and callback-flood operations to cause the callback flooding to happen more nearly at the same time. Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 6 +- kernel/rcu/rcutorture.c | 114 ++++++++++++++++++------ 2 files changed, 91 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9725c546a0d4..ba60da85d1c8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4603,8 +4603,12 @@ in seconds. rcutorture.fwd_progress= [KNL] - Enable RCU grace-period forward-progress testing + Specifies the number of kthreads to be used + for RCU grace-period forward-progress testing for the types of RCU supporting this notion. + Defaults to 1 kthread, values less than zero or + greater than the number of CPUs cause the number + of CPUs to be used. rcutorture.fwd_progress_div= [KNL] Specify the fraction of a CPU-stall-warning diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bc854f935548..9390f867ba6c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -79,7 +79,7 @@ torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); -torture_param(bool, fwd_progress, 1, "Test grace-period forward progress"); +torture_param(int, fwd_progress, 1, "Test grace-period forward progress"); torture_param(int, fwd_progress_div, 4, "Fraction of CPU stall to wait"); torture_param(int, fwd_progress_holdoff, 60, "Time between forward-progress tests (s)"); @@ -146,7 +146,7 @@ static struct task_struct *stats_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *stall_task; -static struct task_struct *fwd_prog_task; +static struct task_struct **fwd_prog_tasks; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; static struct task_struct *read_exit_task; @@ -2161,10 +2161,12 @@ struct rcu_fwd { unsigned long rcu_fwd_startat; struct rcu_launder_hist n_launders_hist[N_LAUNDERS_HIST]; unsigned long rcu_launder_gp_seq_start; + int rcu_fwd_id; }; static DEFINE_MUTEX(rcu_fwd_mutex); static struct rcu_fwd *rcu_fwds; +static unsigned long rcu_fwd_seq; static bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) @@ -2177,8 +2179,9 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--) if (rfp->n_launders_hist[i].n_launders > 0) break; - pr_alert("%s: Callback-invocation histogram (duration %lu jiffies):", - __func__, jiffies - rfp->rcu_fwd_startat); + mutex_lock(&rcu_fwd_mutex); // Serialize histograms. + pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):", + __func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat); gps_old = rfp->rcu_launder_gp_seq_start; for (j = 0; j <= i; j++) { gps = rfp->n_launders_hist[j].launder_gp_seq; @@ -2189,6 +2192,7 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) gps_old = gps; } pr_cont("\n"); + mutex_unlock(&rcu_fwd_mutex); } /* Callback function for continuous-flood RCU callbacks. */ @@ -2314,7 +2318,8 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, cver = READ_ONCE(rcu_torture_current_version) - cver; gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps); WARN_ON(!cver && gps < 2); - pr_alert("%s: Duration %ld cver %ld gps %ld\n", __func__, dur, cver, gps); + pr_alert("%s: %d Duration %ld cver %ld gps %ld\n", __func__, + rfp->rcu_fwd_id, dur, cver, gps); } if (selfpropcb) { WRITE_ONCE(fcs.stop, 1); @@ -2432,6 +2437,8 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { + int i; + long ncbs; struct rcu_fwd *rfp; mutex_lock(&rcu_fwd_mutex); @@ -2442,18 +2449,26 @@ static int rcutorture_oom_notify(struct notifier_block *self, } WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); - rcu_torture_fwd_cb_hist(rfp); - rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp->rcu_fwd_startat)) / 2); + for (i = 0; i < fwd_progress; i++) { + rcu_torture_fwd_cb_hist(&rfp[i]); + rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rfp[i].rcu_fwd_startat)) / 2); + } WRITE_ONCE(rcu_fwd_emergency_stop, true); smp_mb(); /* Emergency stop before free and wait to avoid hangs. */ - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); rcu_barrier(); - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); rcu_barrier(); - pr_info("%s: Freed %lu RCU callbacks.\n", - __func__, rcu_torture_fwd_prog_cbfree(rfp)); + ncbs = 0; + for (i = 0; i < fwd_progress; i++) + ncbs += rcu_torture_fwd_prog_cbfree(&rfp[i]); + pr_info("%s: Freed %lu RCU callbacks.\n", __func__, ncbs); smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ pr_info("%s returning after OOM processing.\n", __func__); @@ -2469,6 +2484,7 @@ static struct notifier_block rcutorture_oom_nb = { static int rcu_torture_fwd_prog(void *args) { int oldnice = task_nice(current); + unsigned long oldseq = READ_ONCE(rcu_fwd_seq); struct rcu_fwd *rfp = args; int tested = 0; int tested_tries = 0; @@ -2478,21 +2494,31 @@ static int rcu_torture_fwd_prog(void *args) if (!IS_ENABLED(CONFIG_SMP) || !IS_ENABLED(CONFIG_RCU_BOOST)) set_user_nice(current, MAX_NICE); do { - schedule_timeout_interruptible(fwd_progress_holdoff * HZ); - WRITE_ONCE(rcu_fwd_emergency_stop, false); + if (!rfp->rcu_fwd_id) { + schedule_timeout_interruptible(fwd_progress_holdoff * HZ); + WRITE_ONCE(rcu_fwd_emergency_stop, false); + WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1); + } else { + while (READ_ONCE(rcu_fwd_seq) == oldseq) + schedule_timeout_interruptible(1); + oldseq = READ_ONCE(rcu_fwd_seq); + } + pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); + if (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id) + rcu_torture_fwd_prog_cr(rfp); if (!IS_ENABLED(CONFIG_TINY_RCU) || - rcu_inkernel_boot_has_ended()) + (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id)) rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); - if (rcu_inkernel_boot_has_ended()) - rcu_torture_fwd_prog_cr(rfp); /* Avoid slow periods, better to test when busy. */ if (stutter_wait("rcu_torture_fwd_prog")) sched_set_normal(current, oldnice); } while (!torture_must_stop()); /* Short runs might not contain a valid forward-progress attempt. */ - WARN_ON(!tested && tested_tries >= 5); - pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); + if (!rfp->rcu_fwd_id) { + WARN_ON(!tested && tested_tries >= 5); + pr_alert("%s: tested %d tested_tries %d\n", __func__, tested, tested_tries); + } torture_kthread_stopping("rcu_torture_fwd_prog"); return 0; } @@ -2500,17 +2526,27 @@ static int rcu_torture_fwd_prog(void *args) /* If forward-progress checking is requested and feasible, spawn the thread. */ static int __init rcu_torture_fwd_prog_init(void) { + int i; + int ret = 0; struct rcu_fwd *rfp; if (!fwd_progress) return 0; /* Not requested, so don't do it. */ + if (fwd_progress >= nr_cpu_ids) { + VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Limiting fwd_progress to # CPUs.\n"); + fwd_progress = nr_cpu_ids; + } else if (fwd_progress < 0) { + fwd_progress = nr_cpu_ids; + } if ((!cur_ops->sync && !cur_ops->call) || !cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || cur_ops == &rcu_busted_ops) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test"); + fwd_progress = 0; return 0; } if (stall_cpu > 0) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing"); + fwd_progress = 0; if (IS_MODULE(CONFIG_RCU_TORTURE_TEST)) return -EINVAL; /* In module, can fail back to user. */ WARN_ON(1); /* Make sure rcutorture notices conflict. */ @@ -2520,29 +2556,51 @@ static int __init rcu_torture_fwd_prog_init(void) fwd_progress_holdoff = 1; if (fwd_progress_div <= 0) fwd_progress_div = 4; - rfp = kzalloc(sizeof(*rfp), GFP_KERNEL); - if (!rfp) + rfp = kcalloc(fwd_progress, sizeof(*rfp), GFP_KERNEL); + fwd_prog_tasks = kcalloc(fwd_progress, sizeof(*fwd_prog_tasks), GFP_KERNEL); + if (!rfp || !fwd_prog_tasks) { + kfree(rfp); + kfree(fwd_prog_tasks); + fwd_prog_tasks = NULL; + fwd_progress = 0; return -ENOMEM; - spin_lock_init(&rfp->rcu_fwd_lock); - rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + } + for (i = 0; i < fwd_progress; i++) { + spin_lock_init(&rfp[i].rcu_fwd_lock); + rfp[i].rcu_fwd_cb_tail = &rfp[i].rcu_fwd_cb_head; + rfp[i].rcu_fwd_id = i; + } mutex_lock(&rcu_fwd_mutex); rcu_fwds = rfp; mutex_unlock(&rcu_fwd_mutex); register_oom_notifier(&rcutorture_oom_nb); - return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); + for (i = 0; i < fwd_progress; i++) { + ret = torture_create_kthread(rcu_torture_fwd_prog, &rcu_fwds[i], fwd_prog_tasks[i]); + if (ret) { + fwd_progress = i; + return ret; + } + } + return 0; } static void rcu_torture_fwd_prog_cleanup(void) { + int i; struct rcu_fwd *rfp; - torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); - rfp = rcu_fwds; + if (!rcu_fwds || !fwd_prog_tasks) + return; + for (i = 0; i < fwd_progress; i++) + torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_tasks[i]); + unregister_oom_notifier(&rcutorture_oom_nb); mutex_lock(&rcu_fwd_mutex); + rfp = rcu_fwds; rcu_fwds = NULL; mutex_unlock(&rcu_fwd_mutex); - unregister_oom_notifier(&rcutorture_oom_nb); kfree(rfp); + kfree(fwd_prog_tasks); + fwd_prog_tasks = NULL; } /* Callback function for RCU barrier testing. */ -- cgit v1.2.3 From 613b00fbe64461f1c73e035e07c22cf3de65740b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Nov 2021 11:53:52 -0800 Subject: rcutorture: Add ability to limit callback-flood intensity The RCU tasks flavors of RCU now need concurrent callback flooding to test their ability to switch between single-queue mode and per-CPU queue mode, but their lack of heavy-duty forward-progress features rules out the use of rcutorture's current callback-flooding code. This commit therefore provides the ability to limit the intensity of the callback floods using a new ->cbflood_max field in the rcu_operations structure. When this field is zero, there is no limit, otherwise, each callback-flood kthread allocates at most ->cbflood_max callbacks. Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9390f867ba6c..1e8360d15920 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -348,6 +348,7 @@ struct rcu_torture_ops { void (*gp_kthread_dbg)(void); bool (*check_boost_failed)(unsigned long gp_state, int *cpup); int (*stall_dur)(void); + long cbflood_max; int irq_capable; int can_boost; int extendables; @@ -841,6 +842,7 @@ static struct rcu_torture_ops tasks_rude_ops = { .call = call_rcu_tasks_rude, .cb_barrier = rcu_barrier_tasks_rude, .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, + .cbflood_max = 50000, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@ -881,6 +883,7 @@ static struct rcu_torture_ops tasks_tracing_ops = { .call = call_rcu_tasks_trace, .cb_barrier = rcu_barrier_tasks_trace, .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, + .cbflood_max = 50000, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@ -2387,7 +2390,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) rfp->rcu_fwd_cb_head = rfcpn; n_launders++; n_launders_sa++; - } else { + } else if (!cur_ops->cbflood_max || cur_ops->cbflood_max > n_max_cbs) { rfcp = kmalloc(sizeof(*rfcp), GFP_KERNEL); if (WARN_ON_ONCE(!rfcp)) { schedule_timeout_interruptible(1); @@ -2397,8 +2400,11 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) n_launders_sa = 0; rfcp->rfc_gps = 0; rfcp->rfc_rfp = rfp; + } else { + rfcp = NULL; } - cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); + if (rfcp) + cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); rcu_torture_fwd_prog_cond_resched(n_launders + n_max_cbs); if (tick_nohz_full_enabled()) { local_irq_save(flags); @@ -2506,8 +2512,10 @@ static int rcu_torture_fwd_prog(void *args) pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id); if (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id) rcu_torture_fwd_prog_cr(rfp); - if (!IS_ENABLED(CONFIG_TINY_RCU) || - (rcu_inkernel_boot_has_ended() && torture_num_online_cpus() > rfp->rcu_fwd_id)) + if ((cur_ops->stall_dur && cur_ops->stall_dur() > 0) && + (!IS_ENABLED(CONFIG_TINY_RCU) || + (rcu_inkernel_boot_has_ended() && + torture_num_online_cpus() > rfp->rcu_fwd_id))) rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); /* Avoid slow periods, better to test when busy. */ @@ -2539,7 +2547,8 @@ static int __init rcu_torture_fwd_prog_init(void) fwd_progress = nr_cpu_ids; } if ((!cur_ops->sync && !cur_ops->call) || - !cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || cur_ops == &rcu_busted_ops) { + (!cur_ops->cbflood_max && (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0)) || + cur_ops == &rcu_busted_ops) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test"); fwd_progress = 0; return 0; -- cgit v1.2.3 From 53b541fbdb9c498db112216a0cc9fe4804f54742 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Nov 2021 13:51:11 -0800 Subject: rcutorture: Combine n_max_cbs from all kthreads in a callback flood With the addition of multiple callback-flood kthreads, the maximum number of callbacks from any one of those kthreads is reported in the rcutorture run summary. This commit changes this to report the sum of each kthread's maximum number of callbacks in a given callback-flooding episode. Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 9 +++++++++ tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1e8360d15920..33ea446101b3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2170,6 +2170,7 @@ struct rcu_fwd { static DEFINE_MUTEX(rcu_fwd_mutex); static struct rcu_fwd *rcu_fwds; static unsigned long rcu_fwd_seq; +static atomic_long_t rcu_fwd_max_cbs; static bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) @@ -2428,6 +2429,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, n_max_gps, n_max_cbs, cver, gps); + atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs); rcu_torture_fwd_cb_hist(rfp); } schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ @@ -2489,6 +2491,8 @@ static struct notifier_block rcutorture_oom_nb = { /* Carry out grace-period forward-progress testing. */ static int rcu_torture_fwd_prog(void *args) { + bool firsttime = true; + long max_cbs; int oldnice = task_nice(current); unsigned long oldseq = READ_ONCE(rcu_fwd_seq); struct rcu_fwd *rfp = args; @@ -2503,6 +2507,11 @@ static int rcu_torture_fwd_prog(void *args) if (!rfp->rcu_fwd_id) { schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); + if (!firsttime) { + max_cbs = atomic_long_xchg(&rcu_fwd_max_cbs, 0); + pr_alert("%s n_max_cbs: %ld\n", __func__, max_cbs); + } + firsttime = false; WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1); } else { while (READ_ONCE(rcu_fwd_seq) == oldseq) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh index fbdf162b6acd..1c4c2c727dad 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh @@ -25,7 +25,7 @@ stopstate="`grep 'End-test grace-period state: g' $i/console.log 2> /dev/null | tail -1 | sed -e 's/^\[[ 0-9.]*] //' | awk '{ print \"[\" $1 \" \" $5 \" \" $6 \" \" $7 \"]\"; }' | tr -d '\012\015'`" -fwdprog="`grep 'rcu_torture_fwd_prog_cr Duration' $i/console.log 2> /dev/null | sed -e 's/^\[[^]]*] //' | sort -k15nr | head -1 | awk '{ print $14 " " $15 }'`" +fwdprog="`grep 'rcu_torture_fwd_prog n_max_cbs: ' $i/console.log 2> /dev/null | sed -e 's/^\[[^]]*] //' | sort -k3nr | head -1 | awk '{ print $2 " " $3 }'`" if test -z "$ngps" then echo "$configfile ------- " $stopstate -- cgit v1.2.3 From 2972e3050e3517a85ca1813b227d4c302e804343 Mon Sep 17 00:00:00 2001 From: John Keeping Date: Tue, 7 Dec 2021 14:25:58 +0000 Subject: tracing: Make trace_marker{,_raw} stream-like The tracing marker files are write-only streams with no meaningful concept of file position. Using stream_open() to mark them as stream-link indicates this and has the added advantage that a single file descriptor can now be used from multiple threads without contention thanks to clearing FMODE_ATOMIC_POS. Note that this has the potential to break existing userspace by since both lseek(2) and pwrite(2) will now return ESPIPE when previously lseek would have updated the stored offset and pwrite would have appended to the trace. A survey of libtracefs and several other projects found to use trace_marker(_raw) [1][2][3] suggests that everyone limits themselves to calling write(2) and close(2) on these file descriptors so there is a good chance this will go unnoticed and the benefits of reduced overhead and lock contention seem worth the risk. [1] https://github.com/google/perfetto [2] https://github.com/intel/media-driver/ [3] https://w1.fi/cgit/hostap/ Link: https://lkml.kernel.org/r/20211207142558.347029-1-john@metanate.com Signed-off-by: John Keeping Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e3b8c906b7b4..588de6df473f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4841,6 +4841,12 @@ int tracing_open_generic_tr(struct inode *inode, struct file *filp) return 0; } +static int tracing_mark_open(struct inode *inode, struct file *filp) +{ + stream_open(inode, filp); + return tracing_open_generic_tr(inode, filp); +} + static int tracing_release(struct inode *inode, struct file *file) { struct trace_array *tr = inode->i_private; @@ -7117,9 +7123,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (tt) event_triggers_post_call(tr->trace_marker_file, tt); - if (written > 0) - *fpos += written; - return written; } @@ -7178,9 +7181,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, __buffer_unlock_commit(buffer, event); - if (written > 0) - *fpos += written; - return written; } @@ -7580,16 +7580,14 @@ static const struct file_operations tracing_free_buffer_fops = { }; static const struct file_operations tracing_mark_fops = { - .open = tracing_open_generic_tr, + .open = tracing_mark_open, .write = tracing_mark_write, - .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; static const struct file_operations tracing_mark_raw_fops = { - .open = tracing_open_generic_tr, + .open = tracing_mark_open, .write = tracing_mark_raw_write, - .llseek = generic_file_llseek, .release = tracing_release_generic_tr, }; -- cgit v1.2.3 From 74d9555580c48a04b2c3b742dfb0c80777aa0b26 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 8 Nov 2021 16:09:41 +0000 Subject: PM: hibernate: Allow ACPI hardware signature to be honoured Theoretically, when the hardware signature in FACS changes, the OS is supposed to gracefully decline to attempt to resume from S4: "If the signature has changed, OSPM will not restore the system context and can boot from scratch" In practice, Windows doesn't do this and many laptop vendors do allow the signature to change especially when docking/undocking, so it would be a bad idea to simply comply with the specification by default in the general case. However, there are use cases where we do want the compliant behaviour and we know it's safe. Specifically, when resuming virtual machines where we know the hypervisor has changed sufficiently that resume will fail. We really want to be able to *tell* the guest kernel not to try, so it boots cleanly and doesn't just crash. This patch provides a way to opt in to the spec-compliant behaviour on the command line. A follow-up patch may do this automatically for certain "known good" machines based on a DMI match, or perhaps just for all hypervisor guests since there's no good reason a hypervisor would change the hardware_signature that it exposes to guests *unless* it wants them to obey the ACPI specification. Signed-off-by: David Woodhouse Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/kernel-parameters.txt | 15 +++++++++++--- arch/x86/kernel/acpi/sleep.c | 4 +++- drivers/acpi/sleep.c | 26 ++++++++++++++++++++----- include/linux/acpi.h | 2 +- include/linux/suspend.h | 1 + kernel/power/power.h | 1 + kernel/power/swap.c | 16 +++++++++++++-- 7 files changed, 53 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9725c546a0d4..10e0a3a27516 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -225,14 +225,23 @@ For broken nForce2 BIOS resulting in XT-PIC timer. acpi_sleep= [HW,ACPI] Sleep options - Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, - old_ordering, nonvs, sci_force_enable, nobl } + Format: { s3_bios, s3_mode, s3_beep, s4_hwsig, + s4_nohwsig, old_ordering, nonvs, + sci_force_enable, nobl } See Documentation/power/video.rst for information on s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep as soon as the kernel's real-mode entry point is called. + s4_hwsig causes the kernel to check the ACPI hardware + signature during resume from hibernation, and gracefully + refuse to resume if it has changed. This complies with + the ACPI specification but not with reality, since + Windows does not do this and many laptops do change it + on docking. So the default behaviour is to allow resume + and simply warn when the signature changes, unless the + s4_hwsig option is enabled. s4_nohwsig prevents ACPI hardware signature from being - used during resume from hibernation. + used (or even warned about) during resume. old_ordering causes the ACPI 1.0 ordering of the _PTS control method, with respect to putting devices into low power states, to be enforced (the ACPI 2.0 ordering diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 3f85fcae450c..1e97f944b47d 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -139,8 +139,10 @@ static int __init acpi_sleep_setup(char *str) if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; #ifdef CONFIG_HIBERNATION + if (strncmp(str, "s4_hwsig", 8) == 0) + acpi_check_s4_hw_signature(1); if (strncmp(str, "s4_nohwsig", 10) == 0) - acpi_no_s4_hw_signature(); + acpi_check_s4_hw_signature(0); #endif if (strncmp(str, "nonvs", 5) == 0) acpi_nvs_nosave(); diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index eaa47753b758..4dfbfc69db34 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -877,11 +877,11 @@ static inline void acpi_sleep_syscore_init(void) {} #ifdef CONFIG_HIBERNATION static unsigned long s4_hardware_signature; static struct acpi_table_facs *facs; -static bool nosigcheck; +static int sigcheck = -1; /* Default behaviour is just to warn */ -void __init acpi_no_s4_hw_signature(void) +void __init acpi_check_s4_hw_signature(int check) { - nosigcheck = true; + sigcheck = check; } static int acpi_hibernation_begin(pm_message_t stage) @@ -1009,12 +1009,28 @@ static void acpi_sleep_hibernate_setup(void) hibernation_set_ops(old_suspend_ordering ? &acpi_hibernation_ops_old : &acpi_hibernation_ops); sleep_states[ACPI_STATE_S4] = 1; - if (nosigcheck) + if (!sigcheck) return; acpi_get_table(ACPI_SIG_FACS, 1, (struct acpi_table_header **)&facs); - if (facs) + if (facs) { + /* + * s4_hardware_signature is the local variable which is just + * used to warn about mismatch after we're attempting to + * resume (in violation of the ACPI specification.) + */ s4_hardware_signature = facs->hardware_signature; + + if (sigcheck > 0) { + /* + * If we're actually obeying the ACPI specification + * then the signature is written out as part of the + * swsusp header, in order to allow the boot kernel + * to gracefully decline to resume. + */ + swsusp_hardware_signature = facs->hardware_signature; + } + } } #else /* !CONFIG_HIBERNATION */ static inline void acpi_sleep_hibernate_setup(void) {} diff --git a/include/linux/acpi.h b/include/linux/acpi.h index b28f8790192a..6c0798db6bde 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -506,7 +506,7 @@ acpi_status acpi_release_memory(acpi_handle handle, struct resource *res, int acpi_resources_are_enforced(void); #ifdef CONFIG_HIBERNATION -void __init acpi_no_s4_hw_signature(void); +void __init acpi_check_s4_hw_signature(int check); #endif #ifdef CONFIG_PM_SLEEP diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 8af13ba60c7e..5785d909c321 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -446,6 +446,7 @@ extern unsigned long get_safe_page(gfp_t gfp_mask); extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); +extern u32 swsusp_hardware_signature; extern void hibernation_set_ops(const struct platform_hibernation_ops *ops); extern int hibernate(void); extern bool system_entering_hibernation(void); diff --git a/kernel/power/power.h b/kernel/power/power.h index 326f8d032eb5..b4f433943209 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -170,6 +170,7 @@ extern int swsusp_swap_in_use(void); #define SF_PLATFORM_MODE 1 #define SF_NOCOMPRESS_MODE 2 #define SF_CRC32_MODE 4 +#define SF_HW_SIG 8 /* kernel/power/hibernate.c */ extern int swsusp_check(void); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index ff326c2cb77b..ad10359030a4 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -36,6 +36,8 @@ #define HIBERNATE_SIG "S1SUSPEND" +u32 swsusp_hardware_signature; + /* * When reading an {un,}compressed image, we may restore pages in place, * in which case some architectures need these pages cleaning before they @@ -104,7 +106,8 @@ struct swap_map_handle { struct swsusp_header { char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - - sizeof(u32)]; + sizeof(u32) - sizeof(u32)]; + u32 hw_sig; u32 crc32; sector_t image; unsigned int flags; /* Flags to pass to the "boot" kernel */ @@ -312,7 +315,6 @@ static int hib_wait_io(struct hib_bio_batch *hb) /* * Saving part */ - static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) { int error; @@ -324,6 +326,10 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); swsusp_header->image = handle->first_sector; + if (swsusp_hardware_signature) { + swsusp_header->hw_sig = swsusp_hardware_signature; + flags |= SF_HW_SIG; + } swsusp_header->flags = flags; if (flags & SF_CRC32_MODE) swsusp_header->crc32 = handle->crc32; @@ -1537,6 +1543,12 @@ int swsusp_check(void) } else { error = -EINVAL; } + if (!error && swsusp_header->flags & SF_HW_SIG && + swsusp_header->hw_sig != swsusp_hardware_signature) { + pr_info("Suspend image hardware signature mismatch (%08x now %08x); aborting resume.\n", + swsusp_header->hw_sig, swsusp_hardware_signature); + error = -EINVAL; + } put: if (error) -- cgit v1.2.3 From 28e4576d556bca543b0996e9edd4b767397e24c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 9 Nov 2021 15:50:28 +0100 Subject: dma-direct: add a dma_direct_use_pool helper Add a helper to check if a potentially blocking operation should dip into the atomic pools. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 924937c54e8a..50f48e9e4598 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -156,6 +156,15 @@ again: return page; } +/* + * Check if a potentially blocking operations needs to dip into the atomic + * pools for the given device/gfp. + */ +static bool dma_direct_use_pool(struct device *dev, gfp_t gfp) +{ + return !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev); +} + static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { @@ -235,8 +244,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, */ remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP); if (remap) { - if (!gfpflags_allow_blocking(gfp) && - !is_swiotlb_for_alloc(dev)) + if (dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); } else { @@ -250,8 +258,7 @@ void *dma_direct_alloc(struct device *dev, size_t size, * Decrypting memory may block, so allocate the memory from the atomic * pools if we can't block. */ - if (force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) && - !is_swiotlb_for_alloc(dev)) + if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); /* we always manually zero the memory once we are done */ @@ -360,8 +367,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - if (force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) && - !is_swiotlb_for_alloc(dev)) + if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); page = __dma_direct_alloc_pages(dev, size, gfp); -- cgit v1.2.3 From 7d5b7cad79da76f3dad4a9f6040e524217814e5a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 6 Dec 2021 19:20:30 +0100 Subject: ftrace: Use direct_ops hash in unregister_ftrace_direct Now when we have *direct_multi interface the direct_functions hash is no longer owned just by direct_ops. It's also used by any other ftrace_ops passed to *direct_multi interface. Thus to find out that we are unregistering the last function from direct_ops, we need to check directly direct_ops's hash. Link: https://lkml.kernel.org/r/20211206182032.87248-2-jolsa@kernel.org Cc: Ingo Molnar Cc: Heiko Carstens Fixes: f64dd4627ec6 ("ftrace: Add multi direct register/unregister interface") Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 30bc880c3849..7f0594e28226 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5217,6 +5217,7 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) { struct ftrace_direct_func *direct; struct ftrace_func_entry *entry; + struct ftrace_hash *hash; int ret = -ENODEV; mutex_lock(&direct_mutex); @@ -5225,7 +5226,8 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr) if (!entry) goto out_unlock; - if (direct_functions->count == 1) + hash = direct_ops.func_hash->filter_hash; + if (hash->count == 1) unregister_ftrace_function(&direct_ops); ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0); -- cgit v1.2.3 From fea3ffa48c6d42a11dca766c89284d22eaf5603f Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 6 Dec 2021 19:20:31 +0100 Subject: ftrace: Add cleanup to unregister_ftrace_direct_multi Adding ops cleanup to unregister_ftrace_direct_multi, so it can be reused in another register call. Link: https://lkml.kernel.org/r/20211206182032.87248-3-jolsa@kernel.org Cc: Ingo Molnar Cc: Heiko Carstens Fixes: f64dd4627ec6 ("ftrace: Add multi direct register/unregister interface") Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7f0594e28226..be5f6b32a012 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5542,6 +5542,10 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) err = unregister_ftrace_function(ops); remove_direct_functions_hash(hash, addr); mutex_unlock(&direct_mutex); + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; return err; } EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); -- cgit v1.2.3 From ef8df9798d469b7c45c66664550e93469749f1e8 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 7 Dec 2021 09:57:55 +0000 Subject: sched/fair: Cleanup task_util and capacity type task_util and capacity are comparable unsigned long values. There is no need for an intermidiate implicit signed cast. Signed-off-by: Vincent Donnefort Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211207095755.859972-1-vincent.donnefort@arm.com --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f34f2f344fe9..ac5e55441cab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4070,7 +4070,8 @@ done: trace_sched_util_est_se_tp(&p->se); } -static inline int task_fits_capacity(struct task_struct *p, long capacity) +static inline int task_fits_capacity(struct task_struct *p, + unsigned long capacity) { return fits_capacity(uclamp_task_util(p), capacity); } @@ -6345,7 +6346,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) return best_cpu; } -static inline bool asym_fits_capacity(int task_util, int cpu) +static inline bool asym_fits_capacity(unsigned long task_util, int cpu) { if (static_branch_unlikely(&sched_asym_cpucapacity)) return fits_capacity(task_util, capacity_of(cpu)); -- cgit v1.2.3 From 73b6eae583f44e278e19489a411f9c1e22d530fc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 7 Dec 2021 22:47:18 +0000 Subject: bpf: Remove redundant assignment to pointer t The pointer t is being initialized with a value that is never read. The pointer is re-assigned a value a littler later on, hence the initialization is redundant and can be removed. Signed-off-by: Colin Ian King Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211207224718.59593-1-colin.i.king@gmail.com --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 01b47d4df3ab..27b7de538697 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -837,7 +837,7 @@ static const char *btf_show_name(struct btf_show *show) const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)]; const char *name = NULL, *prefix = "", *parens = ""; const struct btf_member *m = show->state.member; - const struct btf_type *t = show->state.type; + const struct btf_type *t; const struct btf_array *array; u32 id = show->state.type_id; const char *member = NULL; -- cgit v1.2.3 From 1197528aaea79ed4909aba695d18fdecc5387a36 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:28 +0100 Subject: genirq/msi: Guard sysfs code No point in building unused code when CONFIG_SYSFS=n. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20211206210223.985907940@linutronix.de --- include/linux/msi.h | 10 ++++++++++ kernel/irq/msi.c | 2 ++ 2 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index e616f94c7c58..d43b9469c88b 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -239,9 +239,19 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); +#ifdef CONFIG_SYSFS const struct attribute_group **msi_populate_sysfs(struct device *dev); void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups); +#else +static inline const struct attribute_group **msi_populate_sysfs(struct device *dev) +{ + return NULL; +} +static inline void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups) +{ +} +#endif /* * The arch hooks to setup up msi irqs. Default functions are implemented diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7f350ae59c5f..a8a0daeb22f5 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -72,6 +72,7 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) } EXPORT_SYMBOL_GPL(get_cached_msi_msg); +#ifdef CONFIG_SYSFS static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -204,6 +205,7 @@ void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_ir kfree(msi_irq_groups); } } +#endif #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN static inline void irq_chip_write_msi_msg(struct irq_data *data, -- cgit v1.2.3 From 1dd2c6a0817fd08f80dee75d7d3bd99a0c4b828d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:29 +0100 Subject: genirq/msi: Remove unused domain callbacks No users and there is no need to grow them. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211126223824.322987915@linutronix.de Link: https://lore.kernel.org/r/20211206210224.041777889@linutronix.de --- include/linux/msi.h | 11 ++++------- kernel/irq/msi.c | 5 ----- 2 files changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index d43b9469c88b..4b962f73f84a 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -304,7 +304,6 @@ struct msi_domain_info; * @msi_free: Domain specific function to free a MSI interrupts * @msi_check: Callback for verification of the domain/info/dev data * @msi_prepare: Prepare the allocation of the interrupts in the domain - * @msi_finish: Optional callback to finalize the allocation * @set_desc: Set the msi descriptor for an interrupt * @handle_error: Optional error handler if the allocation fails * @domain_alloc_irqs: Optional function to override the default allocation @@ -312,12 +311,11 @@ struct msi_domain_info; * @domain_free_irqs: Optional function to override the default free * function. * - * @get_hwirq, @msi_init and @msi_free are callbacks used by - * msi_create_irq_domain() and related interfaces + * @get_hwirq, @msi_init and @msi_free are callbacks used by the underlying + * irqdomain. * - * @msi_check, @msi_prepare, @msi_finish, @set_desc and @handle_error - * are callbacks used by msi_domain_alloc_irqs() and related - * interfaces which are based on msi_desc. + * @msi_check, @msi_prepare, @handle_error and @set_desc are callbacks used by + * msi_domain_alloc/free_irqs(). * * @domain_alloc_irqs, @domain_free_irqs can be used to override the * default allocation/free functions (__msi_domain_alloc/free_irqs). This @@ -351,7 +349,6 @@ struct msi_domain_ops { int (*msi_prepare)(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg); - void (*msi_finish)(msi_alloc_info_t *arg, int retval); void (*set_desc)(msi_alloc_info_t *arg, struct msi_desc *desc); int (*handle_error)(struct irq_domain *domain, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index a8a0daeb22f5..cd4fa264c7c6 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -562,8 +562,6 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ret = -ENOSPC; if (ops->handle_error) ret = ops->handle_error(domain, desc, ret); - if (ops->msi_finish) - ops->msi_finish(&arg, ret); return ret; } @@ -573,9 +571,6 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, } } - if (ops->msi_finish) - ops->msi_finish(&arg, 0); - can_reserve = msi_check_reservation_mode(domain, info, dev); /* -- cgit v1.2.3 From 3ba1f050c91d5ce3672dbf3a55dc2451c0b342e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:31 +0100 Subject: genirq/msi: Fixup includes Remove the kobject.h include from msi.h as it's not required and add a sysfs.h include to the core code instead. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20211206210224.103502021@linutronix.de --- include/linux/msi.h | 2 +- kernel/irq/msi.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index 4b962f73f84a..5c627750f269 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -2,7 +2,7 @@ #ifndef LINUX_MSI_H #define LINUX_MSI_H -#include +#include #include #include diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index cd4fa264c7c6..6718bab1bde3 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include "internals.h" -- cgit v1.2.3 From e58f2259b91c02974c20db7b28d39d810a21249b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:39 +0100 Subject: genirq/msi, treewide: Use a named struct for PCI/MSI attributes The unnamed struct sucks and is in the way of further cleanups. Stick the PCI related MSI data into a real data structure and cleanup all users. No functional change. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Reviewed-by: Greg Kroah-Hartman Acked-by: Kalle Valo Link: https://lore.kernel.org/r/20211206210224.374863119@linutronix.de --- arch/powerpc/platforms/cell/axon_msi.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 4 +- arch/powerpc/platforms/pseries/msi.c | 6 +- arch/sparc/kernel/pci_msi.c | 4 +- arch/x86/kernel/apic/msi.c | 2 +- arch/x86/pci/xen.c | 6 +- drivers/net/wireless/ath/ath11k/pci.c | 2 +- drivers/pci/msi.c | 116 +++++++++++++++--------------- drivers/pci/xen-pcifront.c | 2 +- include/linux/msi.h | 84 +++++++++++----------- kernel/irq/msi.c | 4 +- 11 files changed, 115 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index 82335e364c44..79e21128518c 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -212,7 +212,7 @@ static int setup_msi_msg_address(struct pci_dev *dev, struct msi_msg *msg) entry = first_pci_msi_entry(dev); for (; dn; dn = of_get_next_parent(dn)) { - if (entry->msi_attrib.is_64) { + if (entry->pci.msi_attrib.is_64) { prop = of_get_property(dn, "msi-address-64", &len); if (prop) break; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 004cd6a96c8a..8913c86009d9 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2154,10 +2154,10 @@ static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg) int rc; rc = __pnv_pci_ioda_msi_setup(phb, pdev, d->hwirq, - entry->msi_attrib.is_64, msg); + entry->pci.msi_attrib.is_64, msg); if (rc) dev_err(&pdev->dev, "Failed to setup %s-bit MSI #%ld : %d\n", - entry->msi_attrib.is_64 ? "64" : "32", d->hwirq, rc); + entry->pci.msi_attrib.is_64 ? "64" : "32", d->hwirq, rc); } /* diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 8627362f613e..8e287204eeae 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -332,7 +332,7 @@ static int check_msix_entries(struct pci_dev *pdev) expected = 0; for_each_pci_msi_entry(entry, pdev) { - if (entry->msi_attrib.entry_nr != expected) { + if (entry->pci.msi_attrib.entry_nr != expected) { pr_debug("rtas_msi: bad MSI-X entries.\n"); return -EINVAL; } @@ -449,7 +449,7 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev { struct pci_dev *pdev = to_pci_dev(dev); struct msi_desc *desc = first_pci_msi_entry(pdev); - int type = desc->msi_attrib.is_msix ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI; + int type = desc->pci.msi_attrib.is_msix ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI; return rtas_prepare_msi_irqs(pdev, nvec, type, arg); } @@ -580,7 +580,7 @@ static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq int hwirq; int i, ret; - hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_attrib.entry_nr); + hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->pci.msi_attrib.entry_nr); if (hwirq < 0) { dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq); return hwirq; diff --git a/arch/sparc/kernel/pci_msi.c b/arch/sparc/kernel/pci_msi.c index fb5899cbfa51..9ed11985768e 100644 --- a/arch/sparc/kernel/pci_msi.c +++ b/arch/sparc/kernel/pci_msi.c @@ -146,13 +146,13 @@ static int sparc64_setup_msi_irq(unsigned int *irq_p, msiqid = pick_msiq(pbm); err = ops->msi_setup(pbm, msiqid, msi, - (entry->msi_attrib.is_64 ? 1 : 0)); + (entry->pci.msi_attrib.is_64 ? 1 : 0)); if (err) goto out_msi_free; pbm->msi_irq_table[msi - pbm->msi_first] = *irq_p; - if (entry->msi_attrib.is_64) { + if (entry->pci.msi_attrib.is_64) { msg.address_hi = pbm->msi64_start >> 32; msg.address_lo = pbm->msi64_start & 0xffffffff; } else { diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index dbacb9ec8843..1656477e4169 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -163,7 +163,7 @@ int pci_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, struct msi_desc *desc = first_pci_msi_entry(pdev); init_irq_alloc_info(arg, NULL); - if (desc->msi_attrib.is_msix) { + if (desc->pci.msi_attrib.is_msix) { arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSIX; } else { arg->type = X86_IRQ_ALLOC_TYPE_PCI_MSI; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 12da00558631..2dace884d6f7 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -306,7 +306,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return -EINVAL; map_irq.table_base = pci_resource_start(dev, bir); - map_irq.entry_nr = msidesc->msi_attrib.entry_nr; + map_irq.entry_nr = msidesc->pci.msi_attrib.entry_nr; } ret = -EINVAL; @@ -398,7 +398,7 @@ static void xen_pv_teardown_msi_irqs(struct pci_dev *dev) { struct msi_desc *msidesc = first_pci_msi_entry(dev); - if (msidesc->msi_attrib.is_msix) + if (msidesc->pci.msi_attrib.is_msix) xen_pci_frontend_disable_msix(dev); else xen_pci_frontend_disable_msi(dev); @@ -414,7 +414,7 @@ static int xen_msi_domain_alloc_irqs(struct irq_domain *domain, if (WARN_ON_ONCE(!dev_is_pci(dev))) return -EINVAL; - if (first_msi_entry(dev)->msi_attrib.is_msix) + if (first_msi_entry(dev)->pci.msi_attrib.is_msix) type = PCI_CAP_ID_MSIX; else type = PCI_CAP_ID_MSI; diff --git a/drivers/net/wireless/ath/ath11k/pci.c b/drivers/net/wireless/ath/ath11k/pci.c index 3d353e7c9d5c..d9d00e499174 100644 --- a/drivers/net/wireless/ath/ath11k/pci.c +++ b/drivers/net/wireless/ath/ath11k/pci.c @@ -911,7 +911,7 @@ static int ath11k_pci_alloc_msi(struct ath11k_pci *ab_pci) } ab_pci->msi_ep_base_data = msi_desc->msg.data; - if (msi_desc->msi_attrib.is_64) + if (msi_desc->pci.msi_attrib.is_64) set_bit(ATH11K_PCI_FLAG_IS_MSI_64, &ab_pci->flags); ath11k_dbg(ab, ATH11K_DBG_PCI, "msi base data is %d\n", ab_pci->msi_ep_base_data); diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index e1aecd9b82d9..b6cd8b337210 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -138,9 +138,9 @@ void __weak arch_restore_msi_irqs(struct pci_dev *dev) static inline __attribute_const__ u32 msi_multi_mask(struct msi_desc *desc) { /* Don't shift by >= width of type */ - if (desc->msi_attrib.multi_cap >= 5) + if (desc->pci.msi_attrib.multi_cap >= 5) return 0xffffffff; - return (1 << (1 << desc->msi_attrib.multi_cap)) - 1; + return (1 << (1 << desc->pci.msi_attrib.multi_cap)) - 1; } static noinline void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set) @@ -148,14 +148,14 @@ static noinline void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 s raw_spinlock_t *lock = &desc->dev->msi_lock; unsigned long flags; - if (!desc->msi_attrib.can_mask) + if (!desc->pci.msi_attrib.can_mask) return; raw_spin_lock_irqsave(lock, flags); - desc->msi_mask &= ~clear; - desc->msi_mask |= set; - pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->mask_pos, - desc->msi_mask); + desc->pci.msi_mask &= ~clear; + desc->pci.msi_mask |= set; + pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos, + desc->pci.msi_mask); raw_spin_unlock_irqrestore(lock, flags); } @@ -171,7 +171,7 @@ static inline void pci_msi_unmask(struct msi_desc *desc, u32 mask) static inline void __iomem *pci_msix_desc_addr(struct msi_desc *desc) { - return desc->mask_base + desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + return desc->pci.mask_base + desc->pci.msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; } /* @@ -184,27 +184,27 @@ static void pci_msix_write_vector_ctrl(struct msi_desc *desc, u32 ctrl) { void __iomem *desc_addr = pci_msix_desc_addr(desc); - if (desc->msi_attrib.can_mask) + if (desc->pci.msi_attrib.can_mask) writel(ctrl, desc_addr + PCI_MSIX_ENTRY_VECTOR_CTRL); } static inline void pci_msix_mask(struct msi_desc *desc) { - desc->msix_ctrl |= PCI_MSIX_ENTRY_CTRL_MASKBIT; - pci_msix_write_vector_ctrl(desc, desc->msix_ctrl); + desc->pci.msix_ctrl |= PCI_MSIX_ENTRY_CTRL_MASKBIT; + pci_msix_write_vector_ctrl(desc, desc->pci.msix_ctrl); /* Flush write to device */ - readl(desc->mask_base); + readl(desc->pci.mask_base); } static inline void pci_msix_unmask(struct msi_desc *desc) { - desc->msix_ctrl &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; - pci_msix_write_vector_ctrl(desc, desc->msix_ctrl); + desc->pci.msix_ctrl &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT; + pci_msix_write_vector_ctrl(desc, desc->pci.msix_ctrl); } static void __pci_msi_mask_desc(struct msi_desc *desc, u32 mask) { - if (desc->msi_attrib.is_msix) + if (desc->pci.msi_attrib.is_msix) pci_msix_mask(desc); else pci_msi_mask(desc, mask); @@ -212,7 +212,7 @@ static void __pci_msi_mask_desc(struct msi_desc *desc, u32 mask) static void __pci_msi_unmask_desc(struct msi_desc *desc, u32 mask) { - if (desc->msi_attrib.is_msix) + if (desc->pci.msi_attrib.is_msix) pci_msix_unmask(desc); else pci_msi_unmask(desc, mask); @@ -256,10 +256,10 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) BUG_ON(dev->current_state != PCI_D0); - if (entry->msi_attrib.is_msix) { + if (entry->pci.msi_attrib.is_msix) { void __iomem *base = pci_msix_desc_addr(entry); - if (WARN_ON_ONCE(entry->msi_attrib.is_virtual)) + if (WARN_ON_ONCE(entry->pci.msi_attrib.is_virtual)) return; msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR); @@ -271,7 +271,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) pci_read_config_dword(dev, pos + PCI_MSI_ADDRESS_LO, &msg->address_lo); - if (entry->msi_attrib.is_64) { + if (entry->pci.msi_attrib.is_64) { pci_read_config_dword(dev, pos + PCI_MSI_ADDRESS_HI, &msg->address_hi); pci_read_config_word(dev, pos + PCI_MSI_DATA_64, &data); @@ -289,12 +289,12 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) if (dev->current_state != PCI_D0 || pci_dev_is_disconnected(dev)) { /* Don't touch the hardware now */ - } else if (entry->msi_attrib.is_msix) { + } else if (entry->pci.msi_attrib.is_msix) { void __iomem *base = pci_msix_desc_addr(entry); - u32 ctrl = entry->msix_ctrl; + u32 ctrl = entry->pci.msix_ctrl; bool unmasked = !(ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT); - if (entry->msi_attrib.is_virtual) + if (entry->pci.msi_attrib.is_virtual) goto skip; /* @@ -323,12 +323,12 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl); msgctl &= ~PCI_MSI_FLAGS_QSIZE; - msgctl |= entry->msi_attrib.multiple << 4; + msgctl |= entry->pci.msi_attrib.multiple << 4; pci_write_config_word(dev, pos + PCI_MSI_FLAGS, msgctl); pci_write_config_dword(dev, pos + PCI_MSI_ADDRESS_LO, msg->address_lo); - if (entry->msi_attrib.is_64) { + if (entry->pci.msi_attrib.is_64) { pci_write_config_dword(dev, pos + PCI_MSI_ADDRESS_HI, msg->address_hi); pci_write_config_word(dev, pos + PCI_MSI_DATA_64, @@ -376,9 +376,9 @@ static void free_msi_irqs(struct pci_dev *dev) pci_msi_teardown_msi_irqs(dev); list_for_each_entry_safe(entry, tmp, msi_list, list) { - if (entry->msi_attrib.is_msix) { + if (entry->pci.msi_attrib.is_msix) { if (list_is_last(&entry->list, msi_list)) - iounmap(entry->mask_base); + iounmap(entry->pci.mask_base); } list_del(&entry->list); @@ -420,7 +420,7 @@ static void __pci_restore_msi_state(struct pci_dev *dev) pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control); pci_msi_update_mask(entry, 0, 0); control &= ~PCI_MSI_FLAGS_QSIZE; - control |= (entry->msi_attrib.multiple << 4) | PCI_MSI_FLAGS_ENABLE; + control |= (entry->pci.msi_attrib.multiple << 4) | PCI_MSI_FLAGS_ENABLE; pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control); } @@ -449,7 +449,7 @@ static void __pci_restore_msix_state(struct pci_dev *dev) arch_restore_msi_irqs(dev); for_each_pci_msi_entry(entry, dev) - pci_msix_write_vector_ctrl(entry, entry->msix_ctrl); + pci_msix_write_vector_ctrl(entry, entry->pci.msix_ctrl); pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0); } @@ -481,24 +481,24 @@ msi_setup_entry(struct pci_dev *dev, int nvec, struct irq_affinity *affd) if (dev->dev_flags & PCI_DEV_FLAGS_HAS_MSI_MASKING) control |= PCI_MSI_FLAGS_MASKBIT; - entry->msi_attrib.is_msix = 0; - entry->msi_attrib.is_64 = !!(control & PCI_MSI_FLAGS_64BIT); - entry->msi_attrib.is_virtual = 0; - entry->msi_attrib.entry_nr = 0; - entry->msi_attrib.can_mask = !pci_msi_ignore_mask && + entry->pci.msi_attrib.is_msix = 0; + entry->pci.msi_attrib.is_64 = !!(control & PCI_MSI_FLAGS_64BIT); + entry->pci.msi_attrib.is_virtual = 0; + entry->pci.msi_attrib.entry_nr = 0; + entry->pci.msi_attrib.can_mask = !pci_msi_ignore_mask && !!(control & PCI_MSI_FLAGS_MASKBIT); - entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ - entry->msi_attrib.multi_cap = (control & PCI_MSI_FLAGS_QMASK) >> 1; - entry->msi_attrib.multiple = ilog2(__roundup_pow_of_two(nvec)); + entry->pci.msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ + entry->pci.msi_attrib.multi_cap = (control & PCI_MSI_FLAGS_QMASK) >> 1; + entry->pci.msi_attrib.multiple = ilog2(__roundup_pow_of_two(nvec)); if (control & PCI_MSI_FLAGS_64BIT) - entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64; + entry->pci.mask_pos = dev->msi_cap + PCI_MSI_MASK_64; else - entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_32; + entry->pci.mask_pos = dev->msi_cap + PCI_MSI_MASK_32; /* Save the initial mask status */ - if (entry->msi_attrib.can_mask) - pci_read_config_dword(dev, entry->mask_pos, &entry->msi_mask); + if (entry->pci.msi_attrib.can_mask) + pci_read_config_dword(dev, entry->pci.mask_pos, &entry->pci.msi_mask); out: kfree(masks); @@ -630,26 +630,26 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, goto out; } - entry->msi_attrib.is_msix = 1; - entry->msi_attrib.is_64 = 1; + entry->pci.msi_attrib.is_msix = 1; + entry->pci.msi_attrib.is_64 = 1; if (entries) - entry->msi_attrib.entry_nr = entries[i].entry; + entry->pci.msi_attrib.entry_nr = entries[i].entry; else - entry->msi_attrib.entry_nr = i; + entry->pci.msi_attrib.entry_nr = i; - entry->msi_attrib.is_virtual = - entry->msi_attrib.entry_nr >= vec_count; + entry->pci.msi_attrib.is_virtual = + entry->pci.msi_attrib.entry_nr >= vec_count; - entry->msi_attrib.can_mask = !pci_msi_ignore_mask && - !entry->msi_attrib.is_virtual; + entry->pci.msi_attrib.can_mask = !pci_msi_ignore_mask && + !entry->pci.msi_attrib.is_virtual; - entry->msi_attrib.default_irq = dev->irq; - entry->mask_base = base; + entry->pci.msi_attrib.default_irq = dev->irq; + entry->pci.mask_base = base; - if (entry->msi_attrib.can_mask) { + if (entry->pci.msi_attrib.can_mask) { addr = pci_msix_desc_addr(entry); - entry->msix_ctrl = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL); + entry->pci.msix_ctrl = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL); } list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); @@ -874,7 +874,7 @@ static void pci_msi_shutdown(struct pci_dev *dev) pci_msi_unmask(desc, msi_multi_mask(desc)); /* Restore dev->irq to its default pin-assertion IRQ */ - dev->irq = desc->msi_attrib.default_irq; + dev->irq = desc->pci.msi_attrib.default_irq; pcibios_alloc_irq(dev); } @@ -1203,7 +1203,7 @@ int pci_irq_vector(struct pci_dev *dev, unsigned int nr) struct msi_desc *entry; for_each_pci_msi_entry(entry, dev) { - if (entry->msi_attrib.entry_nr == nr) + if (entry->pci.msi_attrib.entry_nr == nr) return entry->irq; } WARN_ON_ONCE(1); @@ -1242,7 +1242,7 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr) struct msi_desc *entry; for_each_pci_msi_entry(entry, dev) { - if (entry->msi_attrib.entry_nr == nr) + if (entry->pci.msi_attrib.entry_nr == nr) return &entry->affinity->mask; } WARN_ON_ONCE(1); @@ -1295,14 +1295,14 @@ static irq_hw_number_t pci_msi_domain_calc_hwirq(struct msi_desc *desc) { struct pci_dev *dev = msi_desc_to_pci_dev(desc); - return (irq_hw_number_t)desc->msi_attrib.entry_nr | + return (irq_hw_number_t)desc->pci.msi_attrib.entry_nr | pci_dev_id(dev) << 11 | (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 27; } static inline bool pci_msi_desc_is_multi_msi(struct msi_desc *desc) { - return !desc->msi_attrib.is_msix && desc->nvec_used > 1; + return !desc->pci.msi_attrib.is_msix && desc->nvec_used > 1; } /** @@ -1326,7 +1326,7 @@ int pci_msi_domain_check_cap(struct irq_domain *domain, if (pci_msi_desc_is_multi_msi(desc) && !(info->flags & MSI_FLAG_MULTI_PCI_MSI)) return 1; - else if (desc->msi_attrib.is_msix && !(info->flags & MSI_FLAG_PCI_MSIX)) + else if (desc->pci.msi_attrib.is_msix && !(info->flags & MSI_FLAG_PCI_MSIX)) return -ENOTSUPP; return 0; diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c index d858d25b6cab..699cc9544424 100644 --- a/drivers/pci/xen-pcifront.c +++ b/drivers/pci/xen-pcifront.c @@ -263,7 +263,7 @@ static int pci_frontend_enable_msix(struct pci_dev *dev, i = 0; for_each_pci_msi_entry(entry, dev) { - op.msix_entries[i].entry = entry->msi_attrib.entry_nr; + op.msix_entries[i].entry = entry->pci.msi_attrib.entry_nr; /* Vector is useless at this point. */ op.msix_entries[i].vector = -1; i++; diff --git a/include/linux/msi.h b/include/linux/msi.h index ac6fec105edc..7e5c13f4e41b 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -68,6 +68,42 @@ static inline void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) typedef void (*irq_write_msi_msg_t)(struct msi_desc *desc, struct msi_msg *msg); +/** + * pci_msi_desc - PCI/MSI specific MSI descriptor data + * + * @msi_mask: [PCI MSI] MSI cached mask bits + * @msix_ctrl: [PCI MSI-X] MSI-X cached per vector control bits + * @is_msix: [PCI MSI/X] True if MSI-X + * @multiple: [PCI MSI/X] log2 num of messages allocated + * @multi_cap: [PCI MSI/X] log2 num of messages supported + * @can_mask: [PCI MSI/X] Masking supported? + * @is_64: [PCI MSI/X] Address size: 0=32bit 1=64bit + * @entry_nr: [PCI MSI/X] Entry which is described by this descriptor + * @default_irq:[PCI MSI/X] The default pre-assigned non-MSI irq + * @mask_pos: [PCI MSI] Mask register position + * @mask_base: [PCI MSI-X] Mask register base address + */ +struct pci_msi_desc { + union { + u32 msi_mask; + u32 msix_ctrl; + }; + struct { + u8 is_msix : 1; + u8 multiple : 3; + u8 multi_cap : 3; + u8 can_mask : 1; + u8 is_64 : 1; + u8 is_virtual : 1; + u16 entry_nr; + unsigned default_irq; + } msi_attrib; + union { + u8 mask_pos; + void __iomem *mask_base; + }; +}; + /** * platform_msi_desc - Platform device specific msi descriptor data * @msi_priv_data: Pointer to platform private data @@ -107,17 +143,7 @@ struct ti_sci_inta_msi_desc { * address or data changes * @write_msi_msg_data: Data parameter for the callback. * - * @msi_mask: [PCI MSI] MSI cached mask bits - * @msix_ctrl: [PCI MSI-X] MSI-X cached per vector control bits - * @is_msix: [PCI MSI/X] True if MSI-X - * @multiple: [PCI MSI/X] log2 num of messages allocated - * @multi_cap: [PCI MSI/X] log2 num of messages supported - * @maskbit: [PCI MSI/X] Mask-Pending bit supported? - * @is_64: [PCI MSI/X] Address size: 0=32bit 1=64bit - * @entry_nr: [PCI MSI/X] Entry which is described by this descriptor - * @default_irq:[PCI MSI/X] The default pre-assigned non-MSI irq - * @mask_pos: [PCI MSI] Mask register position - * @mask_base: [PCI MSI-X] Mask register base address + * @pci: [PCI] PCI speficic msi descriptor data * @platform: [platform] Platform device specific msi descriptor data * @fsl_mc: [fsl-mc] FSL MC device specific msi descriptor data * @inta: [INTA] TISCI based INTA specific msi descriptor data @@ -138,38 +164,10 @@ struct msi_desc { void *write_msi_msg_data; union { - /* PCI MSI/X specific data */ - struct { - union { - u32 msi_mask; - u32 msix_ctrl; - }; - struct { - u8 is_msix : 1; - u8 multiple : 3; - u8 multi_cap : 3; - u8 can_mask : 1; - u8 is_64 : 1; - u8 is_virtual : 1; - u16 entry_nr; - unsigned default_irq; - } msi_attrib; - union { - u8 mask_pos; - void __iomem *mask_base; - }; - }; - - /* - * Non PCI variants add their data structure here. New - * entries need to use a named structure. We want - * proper name spaces for this. The PCI part is - * anonymous for now as it would require an immediate - * tree wide cleanup. - */ - struct platform_msi_desc platform; - struct fsl_mc_msi_desc fsl_mc; - struct ti_sci_inta_msi_desc inta; + struct pci_msi_desc pci; + struct platform_msi_desc platform; + struct fsl_mc_msi_desc fsl_mc; + struct ti_sci_inta_msi_desc inta; }; }; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6718bab1bde3..7d78d8aff076 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -91,7 +91,7 @@ static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, return -ENODEV; if (dev_is_pci(dev)) - is_msix = entry->msi_attrib.is_msix; + is_msix = entry->pci.msi_attrib.is_msix; return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi"); } @@ -535,7 +535,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, * masking and MSI does so when the can_mask attribute is set. */ desc = first_msi_entry(dev); - return desc->msi_attrib.is_msix || desc->msi_attrib.can_mask; + return desc->pci.msi_attrib.is_msix || desc->pci.msi_attrib.can_mask; } int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, -- cgit v1.2.3 From 890337624e1fa2da079fc1c036a62d178c985280 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:27:59 +0100 Subject: genirq/msi: Handle PCI/MSI allocation fail in core code Get rid of yet another irqdomain callback and let the core code return the already available information of how many descriptors could be allocated. Signed-off-by: Thomas Gleixner Tested-by: Juergen Gross Reviewed-by: Jason Gunthorpe Reviewed-by: Greg Kroah-Hartman Acked-by: Bjorn Helgaas # PCI Link: https://lore.kernel.org/r/20211206210225.046615302@linutronix.de --- drivers/pci/msi/irqdomain.c | 13 ------------- include/linux/msi.h | 5 +---- kernel/irq/msi.c | 29 +++++++++++++++++++++++++---- 3 files changed, 26 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c index 6abd8aff2cea..a5546900244d 100644 --- a/drivers/pci/msi/irqdomain.c +++ b/drivers/pci/msi/irqdomain.c @@ -95,16 +95,6 @@ static int pci_msi_domain_check_cap(struct irq_domain *domain, return 0; } -static int pci_msi_domain_handle_error(struct irq_domain *domain, - struct msi_desc *desc, int error) -{ - /* Special handling to support __pci_enable_msi_range() */ - if (pci_msi_desc_is_multi_msi(desc) && error == -ENOSPC) - return 1; - - return error; -} - static void pci_msi_domain_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc) { @@ -115,7 +105,6 @@ static void pci_msi_domain_set_desc(msi_alloc_info_t *arg, static struct msi_domain_ops pci_msi_domain_ops_default = { .set_desc = pci_msi_domain_set_desc, .msi_check = pci_msi_domain_check_cap, - .handle_error = pci_msi_domain_handle_error, }; static void pci_msi_domain_update_dom_ops(struct msi_domain_info *info) @@ -129,8 +118,6 @@ static void pci_msi_domain_update_dom_ops(struct msi_domain_info *info) ops->set_desc = pci_msi_domain_set_desc; if (ops->msi_check == NULL) ops->msi_check = pci_msi_domain_check_cap; - if (ops->handle_error == NULL) - ops->handle_error = pci_msi_domain_handle_error; } } diff --git a/include/linux/msi.h b/include/linux/msi.h index 5248678e05d1..ba4a39c430b5 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -286,7 +286,6 @@ struct msi_domain_info; * @msi_check: Callback for verification of the domain/info/dev data * @msi_prepare: Prepare the allocation of the interrupts in the domain * @set_desc: Set the msi descriptor for an interrupt - * @handle_error: Optional error handler if the allocation fails * @domain_alloc_irqs: Optional function to override the default allocation * function. * @domain_free_irqs: Optional function to override the default free @@ -295,7 +294,7 @@ struct msi_domain_info; * @get_hwirq, @msi_init and @msi_free are callbacks used by the underlying * irqdomain. * - * @msi_check, @msi_prepare, @handle_error and @set_desc are callbacks used by + * @msi_check, @msi_prepare and @set_desc are callbacks used by * msi_domain_alloc/free_irqs(). * * @domain_alloc_irqs, @domain_free_irqs can be used to override the @@ -332,8 +331,6 @@ struct msi_domain_ops { msi_alloc_info_t *arg); void (*set_desc)(msi_alloc_info_t *arg, struct msi_desc *desc); - int (*handle_error)(struct irq_domain *domain, - struct msi_desc *desc, int error); int (*domain_alloc_irqs)(struct irq_domain *domain, struct device *dev, int nvec); void (*domain_free_irqs)(struct irq_domain *domain, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7d78d8aff076..4a7a7f0f5102 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -538,6 +538,27 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, return desc->pci.msi_attrib.is_msix || desc->pci.msi_attrib.can_mask; } +static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc, + int allocated) +{ + switch(domain->bus_token) { + case DOMAIN_BUS_PCI_MSI: + case DOMAIN_BUS_VMD_MSI: + if (IS_ENABLED(CONFIG_PCI_MSI)) + break; + fallthrough; + default: + return -ENOSPC; + } + + /* Let a failed PCI multi MSI allocation retry */ + if (desc->nvec_used > 1) + return 1; + + /* If there was a successful allocation let the caller know */ + return allocated ? allocated : -ENOSPC; +} + int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec) { @@ -546,6 +567,7 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct irq_data *irq_data; struct msi_desc *desc; msi_alloc_info_t arg = { }; + int allocated = 0; int i, ret, virq; bool can_reserve; @@ -560,16 +582,15 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, dev_to_node(dev), &arg, false, desc->affinity); if (virq < 0) { - ret = -ENOSPC; - if (ops->handle_error) - ret = ops->handle_error(domain, desc, ret); - return ret; + ret = msi_handle_pci_fail(domain, desc, allocated); + goto cleanup; } for (i = 0; i < desc->nvec_used; i++) { irq_set_msi_desc_off(virq, i, desc); irq_debugfs_copy_devname(virq + i, dev); } + allocated++; } can_reserve = msi_check_reservation_mode(domain, info, dev); -- cgit v1.2.3 From c24be24aed405d64ebcf04526614c13b2adfb1d2 Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Thu, 9 Dec 2021 02:43:17 +0000 Subject: tracing: Fix possible memory leak in __create_synth_event() error path There's error paths in __create_synth_event() after the argv is allocated that fail to free it. Add a jump to free it when necessary. Link: https://lkml.kernel.org/r/20211209024317.11783-1-linmq006@gmail.com Suggested-by: Steven Rostedt (VMware) Signed-off-by: Miaoqian Lin [ Fixed up the patch and change log ] Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 22db3ce95e74..ca9c13b2ecf4 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1237,9 +1237,8 @@ static int __create_synth_event(const char *name, const char *raw_fields) argv + consumed, &consumed, &field_version); if (IS_ERR(field)) { - argv_free(argv); ret = PTR_ERR(field); - goto err; + goto err_free_arg; } /* @@ -1262,18 +1261,19 @@ static int __create_synth_event(const char *name, const char *raw_fields) if (cmd_version > 1 && n_fields_this_loop >= 1) { synth_err(SYNTH_ERR_INVALID_CMD, errpos(field_str)); ret = -EINVAL; - goto err; + goto err_free_arg; } fields[n_fields++] = field; if (n_fields == SYNTH_FIELDS_MAX) { synth_err(SYNTH_ERR_TOO_MANY_FIELDS, 0); ret = -EINVAL; - goto err; + goto err_free_arg; } n_fields_this_loop++; } + argv_free(argv); if (consumed < argc) { synth_err(SYNTH_ERR_INVALID_CMD, 0); @@ -1281,7 +1281,6 @@ static int __create_synth_event(const char *name, const char *raw_fields) goto err; } - argv_free(argv); } if (n_fields == 0) { @@ -1307,6 +1306,8 @@ static int __create_synth_event(const char *name, const char *raw_fields) kfree(saved_fields); return ret; + err_free_arg: + argv_free(argv); err: for (i = 0; i < n_fields; i++) free_synth_field(fields[i]); -- cgit v1.2.3 From 381a4f3b38603aab47e5500609d5ec733b5d0ecb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Nov 2021 16:52:02 -0800 Subject: rcu-tasks: Use spin_lock_rcu_node() and friends This commit renames the rcu_tasks_percpu structure's ->cbs_pcpu_lock to ->lock and then uses spin_lock_rcu_node() and friends to acquire and release this lock, preparing for upcoming commits that will spread the grace-period process across multiple CPUs and kthreads. [ paulmck: Apply feedback from kernel test robot. ] Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 2e58d7fa2da4..8cf325146fb0 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -23,11 +23,11 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); /** * struct rcu_tasks_percpu - Per-CPU component of definition for a Tasks-RCU-like mechanism. * @cblist: Callback list. - * @cbs_pcpu_lock: Lock protecting per-CPU callback list. + * @lock: Lock protecting per-CPU callback list. */ struct rcu_tasks_percpu { struct rcu_segcblist cblist; - raw_spinlock_t cbs_pcpu_lock; + raw_spinlock_t __private lock; }; /** @@ -82,7 +82,7 @@ struct rcu_tasks { #define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \ - .cbs_pcpu_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \ }; \ static struct rcu_tasks rt_name = \ { \ @@ -177,11 +177,11 @@ static void cblist_init_generic(struct rcu_tasks *rtp) WARN_ON_ONCE(!rtpcp); if (cpu) - raw_spin_lock_init(&rtpcp->cbs_pcpu_lock); - raw_spin_lock(&rtpcp->cbs_pcpu_lock); // irqs already disabled. + raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock)); + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. if (rcu_segcblist_empty(&rtpcp->cblist)) rcu_segcblist_init(&rtpcp->cblist); - raw_spin_unlock(&rtpcp->cbs_pcpu_lock); // irqs remain disabled. + raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); @@ -200,15 +200,15 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, local_irq_save(flags); rtpcp = per_cpu_ptr(rtp->rtpcpu, smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift)); - raw_spin_lock(&rtpcp->cbs_pcpu_lock); + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) { - raw_spin_unlock(&rtpcp->cbs_pcpu_lock); // irqs remain disabled. + raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. cblist_init_generic(rtp); - raw_spin_lock(&rtpcp->cbs_pcpu_lock); // irqs already disabled. + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. } needwake = rcu_segcblist_empty(&rtpcp->cblist); rcu_segcblist_enqueue(&rtpcp->cblist, rhp); - raw_spin_unlock_irqrestore(&rtpcp->cbs_pcpu_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); /* We can't create the thread unless interrupts are enabled. */ if (needwake && READ_ONCE(rtp->kthread_ptr)) wake_up(&rtp->cbs_wq); @@ -250,11 +250,10 @@ static int __noreturn rcu_tasks_kthread(void *arg) set_tasks_gp_state(rtp, RTGS_WAIT_CBS); /* Pick up any new callbacks. */ - raw_spin_lock_irqsave(&rtpcp->cbs_pcpu_lock, flags); - smp_mb__after_spinlock(); // Order updates vs. GP. + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); - raw_spin_unlock_irqrestore(&rtpcp->cbs_pcpu_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); /* If there were none, wait a bit and start over. */ if (!rcu_segcblist_pend_cbs(&rtpcp->cblist)) { @@ -277,11 +276,10 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* Invoke the callbacks. */ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); - raw_spin_lock_irqsave(&rtpcp->cbs_pcpu_lock, flags); - smp_mb__after_spinlock(); // Order updates vs. GP. + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl); - raw_spin_unlock_irqrestore(&rtpcp->cbs_pcpu_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); len = rcl.len; for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) { local_bh_disable(); @@ -289,10 +287,10 @@ static int __noreturn rcu_tasks_kthread(void *arg) local_bh_enable(); cond_resched(); } - raw_spin_lock_irqsave(&rtpcp->cbs_pcpu_lock, flags); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_add_len(&rtpcp->cblist, -len); (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); - raw_spin_unlock_irqrestore(&rtpcp->cbs_pcpu_lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); /* Paranoid sleep to keep this from entering a tight loop */ schedule_timeout_idle(rtp->gp_sleep); } @@ -470,10 +468,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU // read-side critical sections waited for by rcu_tasks_postscan(). // -// Pre-grace-period update-side code is ordered before the grace via the -// ->cbs_lock and the smp_mb__after_spinlock(). Pre-grace-period read-side -// code is ordered before the grace period via synchronize_rcu() call -// in rcu_tasks_pregp_step() and by the scheduler's locks and interrupt +// Pre-grace-period update-side code is ordered before the grace +// via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code +// is ordered before the grace period via synchronize_rcu() call in +// rcu_tasks_pregp_step() and by the scheduler's locks and interrupt // disabling. /* Pre-grace-period preparation. */ -- cgit v1.2.3 From 65b629e70489b810a108fe1155da4e41a5010534 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Tue, 9 Nov 2021 16:52:14 +0530 Subject: rcu-tasks: Inspect stalled task's trc state in locked state On RCU tasks trace stall, inspect the RCU-tasks-trace specific states of stalled task in locked down state, using try_invoke_ on_locked_down_task(), to get reliable trc state of a non-running stalled task. This was tested using the following command: tools/testing/selftests/rcutorture/bin/kvm.sh --cpus 8 --configs TRACE01 \ --bootargs "rcutorture.torture_type=tasks-tracing rcutorture.stall_cpu=10 \ rcutorture.stall_cpu_block=1 rcupdate.rcu_task_stall_timeout=100" --trust-make As expected, this produced the following console output for running and sleeping tasks. [ 21.520291] INFO: rcu_tasks_trace detected stalls on tasks: [ 21.521292] P85: ... nesting: 1N cpu: 2 [ 21.521966] task:rcu_torture_sta state:D stack:15080 pid: 85 ppid: 2 flags:0x00004000 [ 21.523384] Call Trace: [ 21.523808] __schedule+0x273/0x6e0 [ 21.524428] schedule+0x35/0xa0 [ 21.524971] schedule_timeout+0x1ed/0x270 [ 21.525690] ? del_timer_sync+0x30/0x30 [ 21.526371] ? rcu_torture_writer+0x720/0x720 [ 21.527106] rcu_torture_stall+0x24a/0x270 [ 21.527816] kthread+0x115/0x140 [ 21.528401] ? set_kthread_struct+0x40/0x40 [ 21.529136] ret_from_fork+0x22/0x30 [ 21.529766] 1 holdouts [ 21.632300] INFO: rcu_tasks_trace detected stalls on tasks: [ 21.632345] rcu_torture_stall end. [ 21.633293] P85: . [ 21.633294] task:rcu_torture_sta state:R running task stack:15080 pid: 85 ppid: 2 flags:0x00004000 [ 21.633299] Call Trace: [ 21.633301] ? vprintk_emit+0xab/0x180 [ 21.633306] ? vprintk_emit+0x11a/0x180 [ 21.633308] ? _printk+0x4d/0x69 [ 21.633311] ? __default_send_IPI_shortcut+0x1f/0x40 [ paulmck: Update to new v5.16 task_call_func() name. ] Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8cf325146fb0..c11f4e646bf1 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1132,25 +1132,50 @@ static void rcu_tasks_trace_postscan(struct list_head *hop) // Any tasks that exit after this point will set ->trc_reader_checked. } +/* Communicate task state back to the RCU tasks trace stall warning request. */ +struct trc_stall_chk_rdr { + int nesting; + int ipi_to_cpu; + u8 needqs; +}; + +static int trc_check_slow_task(struct task_struct *t, void *arg) +{ + struct trc_stall_chk_rdr *trc_rdrp = arg; + + if (task_curr(t)) + return false; // It is running, so decline to inspect it. + trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting); + trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu); + trc_rdrp->needqs = READ_ONCE(t->trc_reader_special.b.need_qs); + return true; +} + /* Show the state of a task stalling the current RCU tasks trace GP. */ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) { int cpu; + struct trc_stall_chk_rdr trc_rdr; + bool is_idle_tsk = is_idle_task(t); if (*firstreport) { pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n"); *firstreport = false; } - // FIXME: This should attempt to use try_invoke_on_nonrunning_task(). cpu = task_cpu(t); - pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", - t->pid, - ".I"[READ_ONCE(t->trc_ipi_to_cpu) >= 0], - ".i"[is_idle_task(t)], - ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], - READ_ONCE(t->trc_reader_nesting), - " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)], - cpu); + if (!task_call_func(t, trc_check_slow_task, &trc_rdr)) + pr_alert("P%d: %c\n", + t->pid, + ".i"[is_idle_tsk]); + else + pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", + t->pid, + ".I"[trc_rdr.ipi_to_cpu >= 0], + ".i"[is_idle_tsk], + ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], + trc_rdr.nesting, + " N"[!!trc_rdr.needqs], + cpu); sched_show_task(t); } -- cgit v1.2.3 From 8dd593fddd630e7dcbe79e98ff5e0d0561e9ff27 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Nov 2021 11:11:32 -0800 Subject: rcu-tasks: Add a ->percpu_enqueue_lim to the rcu_tasks structure This commit adds a ->percpu_enqueue_lim field to the rcu_tasks structure. This field contains two to the power of the ->percpu_enqueue_shift field, easing construction of iterators over the per-CPU queues that might contain RCU Tasks callbacks. Such iterators will be introduced in later commits. Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index c11f4e646bf1..608af87a5065 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -52,6 +52,7 @@ struct rcu_tasks_percpu { * @call_func: This flavor's call_rcu()-equivalent function. * @rtpcpu: This flavor's rcu_tasks_percpu structure. * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. + * @percpu_enqueue_lim: Number of per-CPU callback queues in use. * @name: This flavor's textual name. * @kname: This flavor's kthread name. */ @@ -76,6 +77,7 @@ struct rcu_tasks { call_rcu_func_t call_func; struct rcu_tasks_percpu __percpu *rtpcpu; int percpu_enqueue_shift; + int percpu_enqueue_lim; char *name; char *kname; }; @@ -93,6 +95,7 @@ static struct rcu_tasks rt_name = \ .rtpcpu = &rt_name ## __percpu, \ .name = n, \ .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS), \ + .percpu_enqueue_lim = 1, \ .kname = #rt_name, \ } @@ -172,6 +175,7 @@ static void cblist_init_generic(struct rcu_tasks *rtp) raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); rtp->percpu_enqueue_shift = ilog2(nr_cpu_ids); + rtp->percpu_enqueue_lim = 1; for_each_possible_cpu(cpu) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); -- cgit v1.2.3 From 42288cb44c4b5fff7653bc392b583a2b8bd6a8c0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 8 Dec 2021 17:04:51 -0800 Subject: wait: add wake_up_pollfree() Several ->poll() implementations are special in that they use a waitqueue whose lifetime is the current task, rather than the struct file as is normally the case. This is okay for blocking polls, since a blocking poll occurs within one task; however, non-blocking polls require another solution. This solution is for the queue to be cleared before it is freed, using 'wake_up_poll(wq, EPOLLHUP | POLLFREE);'. However, that has a bug: wake_up_poll() calls __wake_up() with nr_exclusive=1. Therefore, if there are multiple "exclusive" waiters, and the wakeup function for the first one returns a positive value, only that one will be called. That's *not* what's needed for POLLFREE; POLLFREE is special in that it really needs to wake up everyone. Considering the three non-blocking poll systems: - io_uring poll doesn't handle POLLFREE at all, so it is broken anyway. - aio poll is unaffected, since it doesn't support exclusive waits. However, that's fragile, as someone could add this feature later. - epoll doesn't appear to be broken by this, since its wakeup function returns 0 when it sees POLLFREE. But this is fragile. Although there is a workaround (see epoll), it's better to define a function which always sends POLLFREE to all waiters. Add such a function. Also make it verify that the queue really becomes empty after all waiters have been woken up. Reported-by: Linus Torvalds Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211209010455.42744-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/wait.h | 26 ++++++++++++++++++++++++++ kernel/sched/wait.c | 7 +++++++ 2 files changed, 33 insertions(+) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index 2d0df57c9902..851e07da2583 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -217,6 +217,7 @@ void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); +void __wake_up_pollfree(struct wait_queue_head *wq_head); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) @@ -245,6 +246,31 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode); #define wake_up_interruptible_sync_poll_locked(x, m) \ __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m)) +/** + * wake_up_pollfree - signal that a polled waitqueue is going away + * @wq_head: the wait queue head + * + * In the very rare cases where a ->poll() implementation uses a waitqueue whose + * lifetime is tied to a task rather than to the 'struct file' being polled, + * this function must be called before the waitqueue is freed so that + * non-blocking polls (e.g. epoll) are notified that the queue is going away. + * + * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via + * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU. + */ +static inline void wake_up_pollfree(struct wait_queue_head *wq_head) +{ + /* + * For performance reasons, we don't always take the queue lock here. + * Therefore, we might race with someone removing the last entry from + * the queue, and proceed while they still hold the queue lock. + * However, rcu_read_lock() is required to be held in such cases, so we + * can safely proceed with an RCU-delayed free. + */ + if (waitqueue_active(wq_head)) + __wake_up_pollfree(wq_head); +} + #define ___wait_cond_timeout(condition) \ ({ \ bool __cond = (condition); \ diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 76577d1642a5..eca38107b32f 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -238,6 +238,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +void __wake_up_pollfree(struct wait_queue_head *wq_head) +{ + __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE)); + /* POLLFREE must have cleared the queue. */ + WARN_ON_ONCE(waitqueue_active(wq_head)); +} + /* * Note: we use "set_current_state()" _after_ the wait-queue add, * because we need a memory barrier there on SMP, so that any -- cgit v1.2.3 From 4d1114c05467b5f421d99121bff22a9633390722 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Nov 2021 13:37:34 -0800 Subject: rcu-tasks: Abstract checking of callback lists This commit adds a rcu_tasks_need_gpcb() function that returns an indication of whether another grace period is required, and if no grace period is required, whether there are callbacks that need to be invoked. The function scans all per-CPU lists currently in use. Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 61 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 608af87a5065..e63987a5af56 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -229,11 +229,38 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) wait_rcu_gp(rtp->call_func); } +// Advance callbacks and indicate whether either a grace period or +// callback invocation is needed. +static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) +{ + int cpu; + unsigned long flags; + int needgpcb = 0; + + for (cpu = 0; cpu < rtp->percpu_enqueue_lim; cpu++) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + /* Advance and accelerate any new callbacks. */ + if (rcu_segcblist_empty(&rtpcp->cblist)) + continue; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); + (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); + if (rcu_segcblist_pend_cbs(&rtpcp->cblist)) + needgpcb |= 0x3; + if (!rcu_segcblist_empty(&rtpcp->cblist)) + needgpcb |= 0x1; + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + } + return needgpcb; +} + /* RCU-tasks kthread that detects grace periods and invokes callbacks. */ static int __noreturn rcu_tasks_kthread(void *arg) { unsigned long flags; int len; + int needgpcb; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); struct rcu_head *rhp; struct rcu_tasks *rtp = arg; @@ -249,37 +276,25 @@ static int __noreturn rcu_tasks_kthread(void *arg) * This loop is terminated by the system going down. ;-) */ for (;;) { - struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each... + struct rcu_tasks_percpu *rtpcp; set_tasks_gp_state(rtp, RTGS_WAIT_CBS); - /* Pick up any new callbacks. */ - raw_spin_lock_irqsave_rcu_node(rtpcp, flags); - rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); - (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); - raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); - /* If there were none, wait a bit and start over. */ - if (!rcu_segcblist_pend_cbs(&rtpcp->cblist)) { - wait_event_interruptible(rtp->cbs_wq, - rcu_segcblist_pend_cbs(&rtpcp->cblist)); - if (!rcu_segcblist_pend_cbs(&rtpcp->cblist)) { - WARN_ON(signal_pending(current)); - set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); - schedule_timeout_idle(HZ/10); - } - continue; + wait_event_idle(rtp->cbs_wq, (needgpcb = rcu_tasks_need_gpcb(rtp))); + + if (needgpcb & 0x2) { + // Wait for one grace period. + set_tasks_gp_state(rtp, RTGS_WAIT_GP); + rtp->gp_start = jiffies; + rcu_seq_start(&rtp->tasks_gp_seq); + rtp->gp_func(rtp); + rcu_seq_end(&rtp->tasks_gp_seq); } - // Wait for one grace period. - set_tasks_gp_state(rtp, RTGS_WAIT_GP); - rtp->gp_start = jiffies; - rcu_seq_start(&rtp->tasks_gp_seq); - rtp->gp_func(rtp); - rcu_seq_end(&rtp->tasks_gp_seq); - /* Invoke the callbacks. */ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); + rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); raw_spin_lock_irqsave_rcu_node(rtpcp, flags); rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl); -- cgit v1.2.3 From 57881863ad15fbccbfa637b5e4b67cd3a4520643 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 9 Nov 2021 15:34:56 -0800 Subject: rcu-tasks: Abstract invocations of callbacks This commit adds a rcu_tasks_invoke_cbs() function that invokes all ready callbacks on all of the per-CPU lists that are currently in use. Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 56 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index e63987a5af56..6b3406a47720 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -255,14 +255,42 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) return needgpcb; } -/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ -static int __noreturn rcu_tasks_kthread(void *arg) +// Advance callbacks and invoke any that are ready. +static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp) { + int cpu; unsigned long flags; int len; - int needgpcb; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); struct rcu_head *rhp; + + for (cpu = 0; cpu < rtp->percpu_enqueue_lim; cpu++) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + if (rcu_segcblist_empty(&rtpcp->cblist)) + continue; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); + rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + len = rcl.len; + for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) { + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + cond_resched(); + } + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + rcu_segcblist_add_len(&rtpcp->cblist, -len); + (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + } +} + +/* RCU-tasks kthread that detects grace periods and invokes callbacks. */ +static int __noreturn rcu_tasks_kthread(void *arg) +{ + int needgpcb; struct rcu_tasks *rtp = arg; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ @@ -276,8 +304,6 @@ static int __noreturn rcu_tasks_kthread(void *arg) * This loop is terminated by the system going down. ;-) */ for (;;) { - struct rcu_tasks_percpu *rtpcp; - set_tasks_gp_state(rtp, RTGS_WAIT_CBS); /* If there were none, wait a bit and start over. */ @@ -292,24 +318,10 @@ static int __noreturn rcu_tasks_kthread(void *arg) rcu_seq_end(&rtp->tasks_gp_seq); } - /* Invoke the callbacks. */ + /* Invoke callbacks. */ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); - rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); - raw_spin_lock_irqsave_rcu_node(rtpcp, flags); - rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); - rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl); - raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); - len = rcl.len; - for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) { - local_bh_disable(); - rhp->func(rhp); - local_bh_enable(); - cond_resched(); - } - raw_spin_lock_irqsave_rcu_node(rtpcp, flags); - rcu_segcblist_add_len(&rtpcp->cblist, -len); - (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); - raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + rcu_tasks_invoke_cbs(rtp); + /* Paranoid sleep to keep this from entering a tight loop */ schedule_timeout_idle(rtp->gp_sleep); } -- cgit v1.2.3 From d363f833c6d88331ff013ff0970a96caa8b84653 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 10 Nov 2021 15:56:40 -0800 Subject: rcu-tasks: Use workqueues for multiple rcu_tasks_invoke_cbs() invocations If there is a flood of callbacks, it is necessary to put multiple CPUs to work invoking those callbacks. This commit therefore uses a workqueue-flooding approach to parallelize RCU Tasks callback execution. Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 75 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6b3406a47720..97bd7a667e94 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -24,10 +24,14 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * struct rcu_tasks_percpu - Per-CPU component of definition for a Tasks-RCU-like mechanism. * @cblist: Callback list. * @lock: Lock protecting per-CPU callback list. + * @rtp_work: Work queue for invoking callbacks. */ struct rcu_tasks_percpu { struct rcu_segcblist cblist; raw_spinlock_t __private lock; + struct work_struct rtp_work; + int cpu; + struct rcu_tasks *rtpp; }; /** @@ -146,6 +150,8 @@ static const char * const rcu_tasks_gp_state_names[] = { // // Generic code. +static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp); + /* Record grace-period phase and time. */ static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) { @@ -185,6 +191,9 @@ static void cblist_init_generic(struct rcu_tasks *rtp) raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. if (rcu_segcblist_empty(&rtpcp->cblist)) rcu_segcblist_init(&rtpcp->cblist); + INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); + rtpcp->cpu = cpu; + rtpcp->rtpp = rtp; raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); @@ -256,35 +265,55 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) } // Advance callbacks and invoke any that are ready. -static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp) +static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp) { int cpu; + int cpunext; unsigned long flags; int len; - struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); struct rcu_head *rhp; - - for (cpu = 0; cpu < rtp->percpu_enqueue_lim; cpu++) { - struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); - - if (rcu_segcblist_empty(&rtpcp->cblist)) - continue; - raw_spin_lock_irqsave_rcu_node(rtpcp, flags); - rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); - rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl); - raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); - len = rcl.len; - for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) { - local_bh_disable(); - rhp->func(rhp); - local_bh_enable(); - cond_resched(); + struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); + struct rcu_tasks_percpu *rtpcp_next; + + cpu = rtpcp->cpu; + cpunext = cpu * 2 + 1; + if (cpunext < rtp->percpu_enqueue_lim) { + rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); + queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); + cpunext++; + if (cpunext < rtp->percpu_enqueue_lim) { + rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); + queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); } - raw_spin_lock_irqsave_rcu_node(rtpcp, flags); - rcu_segcblist_add_len(&rtpcp->cblist, -len); - (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); - raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } + + if (rcu_segcblist_empty(&rtpcp->cblist)) + return; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); + rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + len = rcl.len; + for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) { + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + cond_resched(); + } + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + rcu_segcblist_add_len(&rtpcp->cblist, -len); + (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); +} + +// Workqueue flood to advance callbacks and invoke any that are ready. +static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp) +{ + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp = container_of(wp, struct rcu_tasks_percpu, rtp_work); + + rtp = rtpcp->rtpp; + rcu_tasks_invoke_cbs(rtp, rtpcp); } /* RCU-tasks kthread that detects grace periods and invokes callbacks. */ @@ -320,7 +349,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) /* Invoke callbacks. */ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); - rcu_tasks_invoke_cbs(rtp); + rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0)); /* Paranoid sleep to keep this from entering a tight loop */ schedule_timeout_idle(rtp->gp_sleep); -- cgit v1.2.3 From ce9b1c667f03e0aa30d3eb69d0932e010d131c49 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Nov 2021 14:53:43 -0800 Subject: rcu-tasks: Make rcu_barrier_tasks*() handle multiple callback queues Currently, rcu_barrier_tasks(), rcu_barrier_tasks_rude(), and rcu_barrier_tasks_trace() simply invoke the corresponding synchronize_rcu_tasks*() function. This works because there is only one callback queue. However, there will soon be multiple callback queues. This commit therefore scans the queues currently in use, entraining a callback on each non-empty queue. Sequence numbers and reference counts are used to synchronize this process in a manner similar to the approach taken by rcu_barrier(). Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 70 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 97bd7a667e94..8e6601c5fd2e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -25,11 +25,15 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @cblist: Callback list. * @lock: Lock protecting per-CPU callback list. * @rtp_work: Work queue for invoking callbacks. + * @barrier_q_head: RCU callback for barrier operation. + * @cpu: CPU number corresponding to this entry. + * @rtpp: Pointer to the rcu_tasks structure. */ struct rcu_tasks_percpu { struct rcu_segcblist cblist; raw_spinlock_t __private lock; struct work_struct rtp_work; + struct rcu_head barrier_q_head; int cpu; struct rcu_tasks *rtpp; }; @@ -57,6 +61,10 @@ struct rcu_tasks_percpu { * @rtpcpu: This flavor's rcu_tasks_percpu structure. * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. * @percpu_enqueue_lim: Number of per-CPU callback queues in use. + * @barrier_q_mutex: Serialize barrier operations. + * @barrier_q_count: Number of queues being waited on. + * @barrier_q_completion: Barrier wait/wakeup mechanism. + * @barrier_q_seq: Sequence number for barrier operations. * @name: This flavor's textual name. * @kname: This flavor's kthread name. */ @@ -82,6 +90,10 @@ struct rcu_tasks { struct rcu_tasks_percpu __percpu *rtpcpu; int percpu_enqueue_shift; int percpu_enqueue_lim; + struct mutex barrier_q_mutex; + atomic_t barrier_q_count; + struct completion barrier_q_completion; + unsigned long barrier_q_seq; char *name; char *kname; }; @@ -100,6 +112,8 @@ static struct rcu_tasks rt_name = \ .name = n, \ .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS), \ .percpu_enqueue_lim = 1, \ + .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \ + .barrier_q_seq = (0UL - 50UL) << RCU_SEQ_CTR_SHIFT, \ .kname = #rt_name, \ } @@ -238,6 +252,53 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) wait_rcu_gp(rtp->call_func); } +// RCU callback function for rcu_barrier_tasks_generic(). +static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) +{ + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp; + + rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head); + rtp = rtpcp->rtpp; + if (atomic_dec_and_test(&rtp->barrier_q_count)) + complete(&rtp->barrier_q_completion); +} + +// Wait for all in-flight callbacks for the specified RCU Tasks flavor. +// Operates in a manner similar to rcu_barrier(). +static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) +{ + int cpu; + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + unsigned long s = rcu_seq_snap(&rtp->barrier_q_seq); + + mutex_lock(&rtp->barrier_q_mutex); + if (rcu_seq_done(&rtp->barrier_q_seq, s)) { + smp_mb(); + mutex_unlock(&rtp->barrier_q_mutex); + return; + } + rcu_seq_start(&rtp->barrier_q_seq); + init_completion(&rtp->barrier_q_completion); + atomic_set(&rtp->barrier_q_count, 2); + for_each_possible_cpu(cpu) { + if (cpu >= smp_load_acquire(&rtp->percpu_enqueue_lim)) + break; + rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + rtpcp->barrier_q_head.func = rcu_barrier_tasks_generic_cb; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + if (rcu_segcblist_entrain(&rtpcp->cblist, &rtpcp->barrier_q_head)) + atomic_inc(&rtp->barrier_q_count); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + } + if (atomic_sub_and_test(2, &rtp->barrier_q_count)) + complete(&rtp->barrier_q_completion); + wait_for_completion(&rtp->barrier_q_completion); + rcu_seq_end(&rtp->barrier_q_seq); + mutex_unlock(&rtp->barrier_q_mutex); +} + // Advance callbacks and indicate whether either a grace period or // callback invocation is needed. static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) @@ -703,8 +764,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); */ void rcu_barrier_tasks(void) { - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks(); + rcu_barrier_tasks_generic(&rcu_tasks); } EXPORT_SYMBOL_GPL(rcu_barrier_tasks); @@ -842,8 +902,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); */ void rcu_barrier_tasks_rude(void) { - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks_rude(); + rcu_barrier_tasks_generic(&rcu_tasks_rude); } EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); @@ -1401,8 +1460,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace); */ void rcu_barrier_tasks_trace(void) { - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks_trace(); + rcu_barrier_tasks_generic(&rcu_tasks_trace); } EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); -- cgit v1.2.3 From 8610b65680390a103b58f46282a1b05f7eebbba4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 12 Nov 2021 07:33:40 -0800 Subject: rcu-tasks: Add rcupdate.rcu_task_enqueue_lim to set initial queueing This commit adds a rcupdate.rcu_task_enqueue_lim module parameter that sets the initial number of callback queues to use for the RCU Tasks family of RCU implementations. This parameter allows testing of various fanout values. Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 7 +++++++ kernel/rcu/tasks.h | 24 ++++++++++++++++++------ 2 files changed, 25 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9725c546a0d4..9b09fc5dfe66 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4805,6 +4805,13 @@ period to instead use normal non-expedited grace-period processing. + rcupdate.rcu_task_enqueue_lim= [KNL] + Set the number of callback queues to use for the + RCU Tasks family of RCU flavors. The default + of -1 allows this to be automatically (and + dynamically) adjusted. This parameter is intended + for use in testing. + rcupdate.rcu_task_ipi_delay= [KNL] Set time in jiffies during which RCU tasks will avoid sending IPIs, starting with the beginning diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8e6601c5fd2e..9d3eddaecfde 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -130,6 +130,9 @@ module_param(rcu_task_ipi_delay, int, 0644); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); +static int rcu_task_enqueue_lim __read_mostly = -1; +module_param(rcu_task_enqueue_lim, int, 0444); + /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 #define RTGS_WAIT_WAIT_CBS 1 @@ -192,10 +195,19 @@ static void cblist_init_generic(struct rcu_tasks *rtp) { int cpu; unsigned long flags; + int lim; raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); - rtp->percpu_enqueue_shift = ilog2(nr_cpu_ids); - rtp->percpu_enqueue_lim = 1; + if (rcu_task_enqueue_lim < 0) + rcu_task_enqueue_lim = nr_cpu_ids; + else if (rcu_task_enqueue_lim == 0) + rcu_task_enqueue_lim = 1; + lim = rcu_task_enqueue_lim; + + if (lim > nr_cpu_ids) + lim = nr_cpu_ids; + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids / lim)); + smp_store_release(&rtp->percpu_enqueue_lim, lim); for_each_possible_cpu(cpu) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); @@ -211,7 +223,7 @@ static void cblist_init_generic(struct rcu_tasks *rtp) raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); - + pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim)); } // Enqueue a callback for the specified flavor of Tasks RCU. @@ -307,7 +319,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) unsigned long flags; int needgpcb = 0; - for (cpu = 0; cpu < rtp->percpu_enqueue_lim; cpu++) { + for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_enqueue_lim); cpu++) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); /* Advance and accelerate any new callbacks. */ @@ -338,11 +350,11 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu cpu = rtpcp->cpu; cpunext = cpu * 2 + 1; - if (cpunext < rtp->percpu_enqueue_lim) { + if (cpunext < smp_load_acquire(&rtp->percpu_enqueue_lim)) { rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); cpunext++; - if (cpunext < rtp->percpu_enqueue_lim) { + if (cpunext < smp_load_acquire(&rtp->percpu_enqueue_lim)) { rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); } -- cgit v1.2.3 From 7d13d30bb6c54b57d196eab89dea2729a565dbd7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Nov 2021 13:38:42 -0800 Subject: rcu-tasks: Count trylocks to estimate call_rcu_tasks() contention This commit converts the unconditional raw_spin_lock_rcu_node() lock acquisition in call_rcu_tasks_generic() to a trylock followed by an unconditional acquisition if the trylock fails. If the trylock fails, the failure is counted, but the count is reset to zero on each new jiffy. This statistic will be used to determine when to move from a single callback queue to per-CPU callback queues. Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 9d3eddaecfde..231a949cdb81 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -24,6 +24,8 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * struct rcu_tasks_percpu - Per-CPU component of definition for a Tasks-RCU-like mechanism. * @cblist: Callback list. * @lock: Lock protecting per-CPU callback list. + * @rtp_jiffies: Jiffies counter value for statistics. + * @rtp_n_lock_retries: Rough lock-contention statistic. * @rtp_work: Work queue for invoking callbacks. * @barrier_q_head: RCU callback for barrier operation. * @cpu: CPU number corresponding to this entry. @@ -32,6 +34,8 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); struct rcu_tasks_percpu { struct rcu_segcblist cblist; raw_spinlock_t __private lock; + unsigned long rtp_jiffies; + unsigned long rtp_n_lock_retries; struct work_struct rtp_work; struct rcu_head barrier_q_head; int cpu; @@ -231,6 +235,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, struct rcu_tasks *rtp) { unsigned long flags; + unsigned long j; bool needwake; struct rcu_tasks_percpu *rtpcp; @@ -239,7 +244,15 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, local_irq_save(flags); rtpcp = per_cpu_ptr(rtp->rtpcpu, smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift)); - raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + j = jiffies; + if (rtpcp->rtp_jiffies != j) { + rtpcp->rtp_jiffies = j; + rtpcp->rtp_n_lock_retries = 0; + } + rtpcp->rtp_n_lock_retries++; + } if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) { raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. cblist_init_generic(rtp); -- cgit v1.2.3 From 3063b33a347c088e87516764d487e46fea3dfc94 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Nov 2021 16:16:50 -0800 Subject: rcu-tasks: Avoid raw-spinlocked wakeups from call_rcu_tasks_generic() If the caller of of call_rcu_tasks(), call_rcu_tasks_rude(), or call_rcu_tasks_trace() holds a raw spinlock, and then if call_rcu_tasks_generic() determines that the grace-period kthread must be awakened, then the wakeup might acquire a normal spinlock while a raw spinlock is held. This results in lockdep splats when the kernel is built with CONFIG_PROVE_RAW_LOCK_NESTING=y. This commit therefore defers the wakeup using irq_work_queue(). It would be nice to directly invoke wakeup when a raw spinlock is not held, but there is currently no way to check for this in all kernels. Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 231a949cdb81..6e8e4cf6fff3 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -27,6 +27,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @rtp_jiffies: Jiffies counter value for statistics. * @rtp_n_lock_retries: Rough lock-contention statistic. * @rtp_work: Work queue for invoking callbacks. + * @rtp_irq_work: IRQ work queue for deferred wakeups. * @barrier_q_head: RCU callback for barrier operation. * @cpu: CPU number corresponding to this entry. * @rtpp: Pointer to the rcu_tasks structure. @@ -37,6 +38,7 @@ struct rcu_tasks_percpu { unsigned long rtp_jiffies; unsigned long rtp_n_lock_retries; struct work_struct rtp_work; + struct irq_work rtp_irq_work; struct rcu_head barrier_q_head; int cpu; struct rcu_tasks *rtpp; @@ -102,9 +104,12 @@ struct rcu_tasks { char *kname; }; +static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp); + #define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \ .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \ + .rtp_irq_work = IRQ_WORK_INIT(call_rcu_tasks_iw_wakeup), \ }; \ static struct rcu_tasks rt_name = \ { \ @@ -230,6 +235,16 @@ static void cblist_init_generic(struct rcu_tasks *rtp) pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim)); } +// IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic(). +static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp) +{ + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work); + + rtp = rtpcp->rtpp; + wake_up(&rtp->cbs_wq); +} + // Enqueue a callback for the specified flavor of Tasks RCU. static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, struct rcu_tasks *rtp) @@ -263,7 +278,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); /* We can't create the thread unless interrupts are enabled. */ if (needwake && READ_ONCE(rtp->kthread_ptr)) - wake_up(&rtp->cbs_wq); + irq_work_queue(&rtpcp->rtp_irq_work); } // Wait for a grace period for the specified flavor of Tasks RCU. -- cgit v1.2.3 From ab97152f88a4d580b89f0b7cc3028ffac438216f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 24 Nov 2021 15:12:15 -0800 Subject: rcu-tasks: Use more callback queues if contention encountered The rcupdate.rcu_task_enqueue_lim module parameter allows system administrators to tune the number of callback queues used by the RCU Tasks flavors. However if callback storms are infrequent, it would be better to operate with a single queue on a given system unless and until that system actually needed more queues. Systems not needing more queues can then avoid the overhead of checking the extra queues and especially avoid the overhead of fanning workqueue handlers out to all CPUs to invoke callbacks. This commit therefore switches to using all the CPUs' callback queues if call_rcu_tasks_generic() encounters too much lock contention. The amount of lock contention to tolerate defaults to 100 contended lock acquisitions per jiffy, and can be adjusted using the new rcupdate.rcu_task_contend_lim module parameter. Such switching is undertaken only if the rcupdate.rcu_task_enqueue_lim module parameter is negative, which is its default value (-1). This allows savvy systems administrators to set the number of queues to some known good value and to not have to worry about the kernel doing any second guessing. [ paulmck: Apply feedback from Guillaume Tucker and kernelci. ] Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++ kernel/rcu/tasks.h | 27 +++++++++++++++++++++---- 2 files changed, 31 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9b09fc5dfe66..089f4c5f8225 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4805,6 +4805,14 @@ period to instead use normal non-expedited grace-period processing. + rcupdate.rcu_task_contend_lim= [KNL] + Set the minimum number of callback-queuing-time + lock-contention events per jiffy required to + cause the RCU Tasks flavors to switch to per-CPU + callback queuing. This switching only occurs + when rcupdate.rcu_task_enqueue_lim is set to + the default value of -1. + rcupdate.rcu_task_enqueue_lim= [KNL] Set the number of callback queues to use for the RCU Tasks family of RCU flavors. The default diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6e8e4cf6fff3..d37ab69b9db8 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -142,6 +142,10 @@ module_param(rcu_task_stall_timeout, int, 0644); static int rcu_task_enqueue_lim __read_mostly = -1; module_param(rcu_task_enqueue_lim, int, 0444); +static bool rcu_task_cb_adjust; +static int rcu_task_contend_lim __read_mostly = 100; +module_param(rcu_task_contend_lim, int, 0444); + /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 #define RTGS_WAIT_WAIT_CBS 1 @@ -207,10 +211,13 @@ static void cblist_init_generic(struct rcu_tasks *rtp) int lim; raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); - if (rcu_task_enqueue_lim < 0) - rcu_task_enqueue_lim = nr_cpu_ids; - else if (rcu_task_enqueue_lim == 0) + if (rcu_task_enqueue_lim < 0) { + rcu_task_enqueue_lim = 1; + rcu_task_cb_adjust = true; + pr_info("%s: Setting adjustable number of callback queues.\n", __func__); + } else if (rcu_task_enqueue_lim == 0) { rcu_task_enqueue_lim = 1; + } lim = rcu_task_enqueue_lim; if (lim > nr_cpu_ids) @@ -251,6 +258,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, { unsigned long flags; unsigned long j; + bool needadjust = false; bool needwake; struct rcu_tasks_percpu *rtpcp; @@ -266,7 +274,9 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rtpcp->rtp_jiffies = j; rtpcp->rtp_n_lock_retries = 0; } - rtpcp->rtp_n_lock_retries++; + if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim && + READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids) + needadjust = true; // Defer adjustment to avoid deadlock. } if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) { raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. @@ -276,6 +286,15 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, needwake = rcu_segcblist_empty(&rtpcp->cblist); rcu_segcblist_enqueue(&rtpcp->cblist, rhp); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + if (unlikely(needadjust)) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim != nr_cpu_ids) { + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); + smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); + pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } /* We can't create the thread unless interrupts are enabled. */ if (needwake && READ_ONCE(rtp->kthread_ptr)) irq_work_queue(&rtpcp->rtp_irq_work); -- cgit v1.2.3 From 2cee0789b458afa384c422b5969c1a338891fd33 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Nov 2021 11:46:33 -0800 Subject: rcu-tasks: Use separate ->percpu_dequeue_lim for callback dequeueing Decreasing the number of callback queues is a bit tricky because it is necessary to handle callbacks that were queued before the number of queues decreased, but which were not ready to invoke until afterwards. This commit takes a first step in this direction by maintaining a separate ->percpu_dequeue_lim to control callback dequeueing, in addition to the existing ->percpu_enqueue_lim which now controls only enqueueing. Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d37ab69b9db8..b4a2cab6985a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -66,7 +66,8 @@ struct rcu_tasks_percpu { * @call_func: This flavor's call_rcu()-equivalent function. * @rtpcpu: This flavor's rcu_tasks_percpu structure. * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. - * @percpu_enqueue_lim: Number of per-CPU callback queues in use. + * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing. + * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing. * @barrier_q_mutex: Serialize barrier operations. * @barrier_q_count: Number of queues being waited on. * @barrier_q_completion: Barrier wait/wakeup mechanism. @@ -96,6 +97,7 @@ struct rcu_tasks { struct rcu_tasks_percpu __percpu *rtpcpu; int percpu_enqueue_shift; int percpu_enqueue_lim; + int percpu_dequeue_lim; struct mutex barrier_q_mutex; atomic_t barrier_q_count; struct completion barrier_q_completion; @@ -121,6 +123,7 @@ static struct rcu_tasks rt_name = \ .name = n, \ .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS), \ .percpu_enqueue_lim = 1, \ + .percpu_dequeue_lim = 1, \ .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \ .barrier_q_seq = (0UL - 50UL) << RCU_SEQ_CTR_SHIFT, \ .kname = #rt_name, \ @@ -223,6 +226,7 @@ static void cblist_init_generic(struct rcu_tasks *rtp) if (lim > nr_cpu_ids) lim = nr_cpu_ids; WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids / lim)); + WRITE_ONCE(rtp->percpu_dequeue_lim, lim); smp_store_release(&rtp->percpu_enqueue_lim, lim); for_each_possible_cpu(cpu) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); @@ -290,6 +294,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim != nr_cpu_ids) { WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); + WRITE_ONCE(rtp->percpu_enqueue_lim, nr_cpu_ids); smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); } @@ -342,7 +347,7 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) init_completion(&rtp->barrier_q_completion); atomic_set(&rtp->barrier_q_count, 2); for_each_possible_cpu(cpu) { - if (cpu >= smp_load_acquire(&rtp->percpu_enqueue_lim)) + if (cpu >= smp_load_acquire(&rtp->percpu_dequeue_lim)) break; rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); rtpcp->barrier_q_head.func = rcu_barrier_tasks_generic_cb; @@ -366,7 +371,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) unsigned long flags; int needgpcb = 0; - for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_enqueue_lim); cpu++) { + for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); /* Advance and accelerate any new callbacks. */ @@ -397,11 +402,11 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu cpu = rtpcp->cpu; cpunext = cpu * 2 + 1; - if (cpunext < smp_load_acquire(&rtp->percpu_enqueue_lim)) { + if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); cpunext++; - if (cpunext < smp_load_acquire(&rtp->percpu_enqueue_lim)) { + if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); } -- cgit v1.2.3 From fd796e4139b481733a701c4d406056538f4c73cc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 29 Nov 2021 16:52:31 -0800 Subject: rcu-tasks: Use fewer callbacks queues if callback flood ends By default, when lock contention is encountered, the RCU Tasks flavors of RCU switch to using per-CPU queueing. However, if the callback flood ends, per-CPU queueing continues to be used, which introduces significant additional overhead, especially for callback invocation, which fans out a series of workqueue handlers. This commit therefore switches back to single-queue operation if at the beginning of a grace period there are very few callbacks. The definition of "very few" is set by the rcupdate.rcu_task_collapse_lim module parameter, which defaults to 10. This switch happens in two phases, with the first phase causing future callbacks to be enqueued on CPU 0's queue, but with all queues continuing to be checked for grace periods and callback invocation. The second phase checks to see if an RCU grace period has elapsed and if all remaining RCU-Tasks callbacks are queued on CPU 0. If so, only CPU 0 is checked for future grace periods and callback operation. Of course, the return of contention anywhere during this process will result in returning to per-CPU callback queueing. Reported-by: Martin Lau Cc: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 8 +++++ kernel/rcu/tasks.h | 48 +++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 089f4c5f8225..d1b0542b8564 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4805,6 +4805,14 @@ period to instead use normal non-expedited grace-period processing. + rcupdate.rcu_task_collapse_lim= [KNL] + Set the maximum number of callbacks present + at the beginning of a grace period that allows + the RCU Tasks flavors to collapse back to using + a single callback queue. This switching only + occurs when rcupdate.rcu_task_enqueue_lim is + set to the default value of -1. + rcupdate.rcu_task_contend_lim= [KNL] Set the minimum number of callback-queuing-time lock-contention events per jiffy required to diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index b4a2cab6985a..84f1d91604cc 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -68,6 +68,7 @@ struct rcu_tasks_percpu { * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing. * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing. + * @percpu_dequeue_gpseq: RCU grace-period number to propagate enqueue limit to dequeuers. * @barrier_q_mutex: Serialize barrier operations. * @barrier_q_count: Number of queues being waited on. * @barrier_q_completion: Barrier wait/wakeup mechanism. @@ -98,6 +99,7 @@ struct rcu_tasks { int percpu_enqueue_shift; int percpu_enqueue_lim; int percpu_dequeue_lim; + unsigned long percpu_dequeue_gpseq; struct mutex barrier_q_mutex; atomic_t barrier_q_count; struct completion barrier_q_completion; @@ -148,6 +150,8 @@ module_param(rcu_task_enqueue_lim, int, 0444); static bool rcu_task_cb_adjust; static int rcu_task_contend_lim __read_mostly = 100; module_param(rcu_task_contend_lim, int, 0444); +static int rcu_task_collapse_lim __read_mostly = 10; +module_param(rcu_task_collapse_lim, int, 0444); /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 @@ -269,6 +273,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, rhp->next = NULL; rhp->func = func; local_irq_save(flags); + rcu_read_lock(); rtpcp = per_cpu_ptr(rtp->rtpcpu, smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift)); if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. @@ -294,12 +299,13 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); if (rtp->percpu_enqueue_lim != nr_cpu_ids) { WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); - WRITE_ONCE(rtp->percpu_enqueue_lim, nr_cpu_ids); + WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); } raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); } + rcu_read_unlock(); /* We can't create the thread unless interrupts are enabled. */ if (needwake && READ_ONCE(rtp->kthread_ptr)) irq_work_queue(&rtpcp->rtp_irq_work); @@ -369,15 +375,25 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) { int cpu; unsigned long flags; + long n; + long ncbs = 0; + long ncbsnz = 0; int needgpcb = 0; for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); /* Advance and accelerate any new callbacks. */ - if (rcu_segcblist_empty(&rtpcp->cblist)) + if (!rcu_segcblist_n_cbs(&rtpcp->cblist)) continue; raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + // Should we shrink down to a single callback queue? + n = rcu_segcblist_n_cbs(&rtpcp->cblist); + if (n) { + ncbs += n; + if (cpu > 0) + ncbsnz += n; + } rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); if (rcu_segcblist_pend_cbs(&rtpcp->cblist)) @@ -386,6 +402,34 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) needgpcb |= 0x1; raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } + + // Shrink down to a single callback queue if appropriate. + // This is done in two stages: (1) If there are no more than + // rcu_task_collapse_lim callbacks on CPU 0 and none on any other + // CPU, limit enqueueing to CPU 0. (2) After an RCU grace period, + // if there has not been an increase in callbacks, limit dequeuing + // to CPU 0. Note the matching RCU read-side critical section in + // call_rcu_tasks_generic(). + if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim > 1) { + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); + smp_store_release(&rtp->percpu_enqueue_lim, 1); + rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); + pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name); + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } + if (rcu_task_cb_adjust && !ncbsnz && + poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq)) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) { + WRITE_ONCE(rtp->percpu_dequeue_lim, 1); + pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name); + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } + return needgpcb; } -- cgit v1.2.3 From 2ebc45c44c4f3cc4c757430b2409ece4f976892e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Nov 2021 01:37:03 +0100 Subject: rcu/nocb: Remove rcu_node structure from nocb list when de-offloaded The nocb_gp_wait() function iterates over all CPUs in its group, including even those CPUs that have been de-offloaded. This is of course suboptimal, especially if none of the CPUs within the group are currently offloaded. This will become even more of a problem once a nocb kthread is created for all possible CPUs. Therefore use a standard double linked list to link all the offloaded rcu_data structures and safely add or delete these structure as we offload or de-offload them, respectively. Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker Cc: Boqun Feng Cc: Uladzislau Rezki Cc: Josh Triplett Cc: Joel Fernandes Tested-by: Juri Lelli Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 7 +++++-- kernel/rcu/tree_nocb.h | 45 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 43 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4f6c67b3ccd5..5884380f4039 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -227,8 +227,11 @@ struct rcu_data { struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */ bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */ struct task_struct *nocb_cb_kthread; - struct rcu_data *nocb_next_cb_rdp; - /* Next rcu_data in wakeup chain. */ + struct list_head nocb_head_rdp; /* + * Head of rcu_data list in wakeup chain, + * if rdp_gp. + */ + struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */ /* The following fields are used by CB kthread, hence new cacheline. */ struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp; diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 2461fe8d0c23..8e94a5344afe 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -625,7 +625,21 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * and the global grace-period kthread are awakened if needed. */ WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { + /* + * An rcu_data structure is removed from the list after its + * CPU is de-offloaded and added to the list before that CPU is + * (re-)offloaded. If the following loop happens to be referencing + * that rcu_data structure during the time that the corresponding + * CPU is de-offloaded and then immediately re-offloaded, this + * loop's rdp pointer will be carried to the end of the list by + * the resulting pair of list operations. This can cause the loop + * to skip over some of the rcu_data structures that were supposed + * to have been scanned. Fortunately a new iteration through the + * entire loop is forced after a given CPU's rcu_data structure + * is added to the list, so the skipped-over rcu_data structures + * won't be ignored for long. + */ + list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) { bool needwake_state = false; if (!nocb_gp_enabled_cb(rdp)) @@ -1003,6 +1017,8 @@ static long rcu_nocb_rdp_deoffload(void *arg) swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); + /* Stop nocb_gp_wait() from iterating over this structure. */ + list_del_rcu(&rdp->nocb_entry_rdp); /* * Lock one last time to acquire latest callback updates from kthreads * so we can later handle callbacks locally without locking. @@ -1066,6 +1082,17 @@ static long rcu_nocb_rdp_offload(void *arg) return -EINVAL; pr_info("Offloading %d\n", rdp->cpu); + + /* + * Cause future nocb_gp_wait() invocations to iterate over + * structure, resetting ->nocb_gp_sleep and waking up the related + * "rcuog". Since nocb_gp_wait() in turn locks ->nocb_gp_lock + * before setting ->nocb_gp_sleep again, we are guaranteed to + * iterate this newly added structure before "rcuog" goes to + * sleep again. + */ + list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp); + /* * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING * is set. @@ -1268,7 +1295,6 @@ static void __init rcu_organize_nocb_kthreads(void) int nl = 0; /* Next GP kthread. */ struct rcu_data *rdp; struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */ - struct rcu_data *rdp_prev = NULL; if (!cpumask_available(rcu_nocb_mask)) return; @@ -1288,8 +1314,8 @@ static void __init rcu_organize_nocb_kthreads(void) /* New GP kthread, set up for CBs & next GP. */ gotnocbs = true; nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; - rdp->nocb_gp_rdp = rdp; rdp_gp = rdp; + INIT_LIST_HEAD(&rdp->nocb_head_rdp); if (dump_tree) { if (!firsttime) pr_cont("%s\n", gotnocbscbs @@ -1302,12 +1328,11 @@ static void __init rcu_organize_nocb_kthreads(void) } else { /* Another CB kthread, link to previous GP kthread. */ gotnocbscbs = true; - rdp->nocb_gp_rdp = rdp_gp; - rdp_prev->nocb_next_cb_rdp = rdp; if (dump_tree) pr_cont(" %d", cpu); } - rdp_prev = rdp; + rdp->nocb_gp_rdp = rdp_gp; + list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp); } if (gotnocbs && dump_tree) pr_cont("%s\n", gotnocbscbs ? "" : " (self only)"); @@ -1369,6 +1394,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) { char bufw[20]; char bufr[20]; + struct rcu_data *nocb_next_rdp; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; bool wassleep; @@ -1376,11 +1402,16 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) if (rdp->nocb_gp_rdp == rdp) show_rcu_nocb_gp_state(rdp); + nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp, + &rdp->nocb_entry_rdp, + typeof(*rdp), + nocb_entry_rdp); + sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, - rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1, + nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], "cC"[!!atomic_read(&rdp->nocb_lock_contended)], -- cgit v1.2.3 From 8d9703964697340e073305574de4f5df31a28ba9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Nov 2021 01:37:04 +0100 Subject: rcu/nocb: Prepare nocb_cb_wait() to start with a non-offloaded rdp In order to be able to toggle the offloaded state from cpusets, a nocb kthread will need to be created for all possible CPUs whenever either of the "rcu_nocbs=" or "nohz_full=" parameters are specified. Therefore, the nocb_cb_wait() kthread must be prepared to start running on a de-offloaded rdp. To accomplish this, simply move the sleeping condition to the beginning of the nocb_cb_wait() function, which prevents this kthread from attempting to invoke callbacks before the corresponding CPU is offloaded. Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker Cc: Boqun Feng Cc: Uladzislau Rezki Cc: Josh Triplett Cc: Joel Fernandes Tested-by: Juri Lelli Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_nocb.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 8e94a5344afe..cd596cf4ee79 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -803,6 +803,18 @@ static void nocb_cb_wait(struct rcu_data *rdp) bool can_sleep = true; struct rcu_node *rnp = rdp->mynode; + do { + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + nocb_cb_wait_cond(rdp)); + + // VVV Ensure CB invocation follows _sleep test. + if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + } + } while (!nocb_cb_can_run(rdp)); + + local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); @@ -855,17 +867,6 @@ static void nocb_cb_wait(struct rcu_data *rdp) if (needwake_state) swake_up_one(&rdp->nocb_state_wq); - - do { - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - nocb_cb_wait_cond(rdp)); - - // VVV Ensure CB invocation follows _sleep test. - if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); - } - } while (!nocb_cb_can_run(rdp)); } /* -- cgit v1.2.3 From a81aeaf7a1de51400374a8e3982a3cc3ff130dd1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Nov 2021 01:37:05 +0100 Subject: rcu/nocb: Optimize kthreads and rdp initialization Currently cpumask_available() is used to prevent from unwanted NOCB initialization. However if neither "rcu_nocbs=" nor "nohz_full=" parameters are passed to a kernel built with CONFIG_CPUMASK_OFFSTACK=n, the initialization path is still taken, running through all sorts of needless operations and iterations on an empty cpumask. Fix this by relying on a real initialization state instead. This also optimizes kthread creation, preventing needless iteration over all online CPUs when the kernel is booted without any offloaded CPUs. Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker Cc: Boqun Feng Cc: Uladzislau Rezki Cc: Josh Triplett Cc: Joel Fernandes Tested-by: Juri Lelli Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_nocb.h | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index cd596cf4ee79..42092de67705 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -60,6 +60,9 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. * If the list is invalid, a warning is emitted and all CPUs are offloaded. */ + +static bool rcu_nocb_is_setup; + static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); @@ -67,6 +70,7 @@ static int __init rcu_nocb_setup(char *str) pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); cpumask_setall(rcu_nocb_mask); } + rcu_nocb_is_setup = true; return 1; } __setup("rcu_nocbs=", rcu_nocb_setup); @@ -1167,13 +1171,17 @@ void __init rcu_init_nohz(void) need_rcu_nocb_mask = true; #endif /* #if defined(CONFIG_NO_HZ_FULL) */ - if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { - if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { - pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); - return; + if (need_rcu_nocb_mask) { + if (!cpumask_available(rcu_nocb_mask)) { + if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { + pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); + return; + } } + rcu_nocb_is_setup = true; } - if (!cpumask_available(rcu_nocb_mask)) + + if (!rcu_nocb_is_setup) return; #if defined(CONFIG_NO_HZ_FULL) @@ -1275,8 +1283,10 @@ static void __init rcu_spawn_nocb_kthreads(void) { int cpu; - for_each_online_cpu(cpu) - rcu_spawn_cpu_nocb_kthread(cpu); + if (rcu_nocb_is_setup) { + for_each_online_cpu(cpu) + rcu_spawn_cpu_nocb_kthread(cpu); + } } /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ -- cgit v1.2.3 From 2cf4528d6dd6f5a7f34ae07e26176a7932310eeb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Nov 2021 01:37:06 +0100 Subject: rcu/nocb: Create kthreads on all CPUs if "rcu_nocbs=" or "nohz_full=" are passed In order to be able to (de-)offload any CPU using cpusets in the future, create the NOCB data structures for all possible CPUs. For now this is done only as long as the "rcu_nocbs=" or "nohz_full=" kernel parameters are passed to avoid the unnecessary overhead for most users. Note that the rcuog and rcuoc kthreads are not created until at least one of the corresponding CPUs comes online. This approach avoids the creation of excess kthreads when firmware lies about the number of CPUs present on the system. Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker Cc: Boqun Feng Cc: Uladzislau Rezki Cc: Josh Triplett Cc: Joel Fernandes Tested-by: Juri Lelli Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_nocb.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 42092de67705..f580a6b2e74e 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1237,11 +1237,8 @@ static void rcu_spawn_one_nocb_kthread(int cpu) struct rcu_data *rdp_gp; struct task_struct *t; - /* - * If this isn't a no-CBs CPU or if it already has an rcuo kthread, - * then nothing to do. - */ - if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread) + /* If there already is an rcuo kthread, then nothing to do. */ + if (rdp->nocb_cb_kthread) return; /* If we didn't spawn the GP kthread first, reorganize! */ @@ -1269,7 +1266,7 @@ static void rcu_spawn_one_nocb_kthread(int cpu) */ static void rcu_spawn_cpu_nocb_kthread(int cpu) { - if (rcu_scheduler_fully_active) + if (rcu_scheduler_fully_active && rcu_nocb_is_setup) rcu_spawn_one_nocb_kthread(cpu); } @@ -1319,7 +1316,7 @@ static void __init rcu_organize_nocb_kthreads(void) * Should the corresponding CPU come online in the future, then * we will spawn the needed set of rcu_nocb_kthread() kthreads. */ - for_each_cpu(cpu, rcu_nocb_mask) { + for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu >= nl) { /* New GP kthread, set up for CBs & next GP. */ @@ -1343,7 +1340,8 @@ static void __init rcu_organize_nocb_kthreads(void) pr_cont(" %d", cpu); } rdp->nocb_gp_rdp = rdp_gp; - list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp); + if (cpumask_test_cpu(cpu, rcu_nocb_mask)) + list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp); } if (gotnocbs && dump_tree) pr_cont("%s\n", gotnocbscbs ? "" : " (self only)"); -- cgit v1.2.3 From d2cf0854d728c42524efc169edb3505de8c1a9dc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Nov 2021 01:37:07 +0100 Subject: rcu/nocb: Allow empty "rcu_nocbs" kernel parameter Allow the rcu_nocbs kernel parameter to be specified just by itself, without specifying any CPUs. This allows systems administrators to use "rcu_nocbs" to specify that none of the CPUs are to be offloaded at boot time, but than any of them may be offloaded at runtime via cpusets. In contrast, if the "rcu_nocbs" or "nohz_full" kernel parameters are not specified at all, then not only are none of the CPUs offloaded at boot, none of them can be offloaded at runtime, either. While in the area, modernize the description of the "rcuo" kthreads' naming scheme. Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker Cc: Boqun Feng Cc: Uladzislau Rezki Cc: Josh Triplett Cc: Joel Fernandes Tested-by: Juri Lelli Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 37 ++++++++++++++++--------- kernel/rcu/tree_nocb.h | 10 ++++--- 2 files changed, 30 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9725c546a0d4..8b5da10f932e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4343,19 +4343,30 @@ Disable the Correctable Errors Collector, see CONFIG_RAS_CEC help text. - rcu_nocbs= [KNL] - The argument is a cpu list, as described above. - - In kernels built with CONFIG_RCU_NOCB_CPU=y, set - the specified list of CPUs to be no-callback CPUs. - Invocation of these CPUs' RCU callbacks will be - offloaded to "rcuox/N" kthreads created for that - purpose, where "x" is "p" for RCU-preempt, and - "s" for RCU-sched, and "N" is the CPU number. - This reduces OS jitter on the offloaded CPUs, - which can be useful for HPC and real-time - workloads. It can also improve energy efficiency - for asymmetric multiprocessors. + rcu_nocbs[=cpu-list] + [KNL] The optional argument is a cpu list, + as described above. + + In kernels built with CONFIG_RCU_NOCB_CPU=y, + enable the no-callback CPU mode, which prevents + such CPUs' callbacks from being invoked in + softirq context. Invocation of such CPUs' RCU + callbacks will instead be offloaded to "rcuox/N" + kthreads created for that purpose, where "x" is + "p" for RCU-preempt, "s" for RCU-sched, and "g" + for the kthreads that mediate grace periods; and + "N" is the CPU number. This reduces OS jitter on + the offloaded CPUs, which can be useful for HPC + and real-time workloads. It can also improve + energy efficiency for asymmetric multiprocessors. + + If a cpulist is passed as an argument, the specified + list of CPUs is set to no-callback mode from boot. + + Otherwise, if the '=' sign and the cpulist + arguments are omitted, no CPU will be set to + no-callback mode from boot but the mode may be + toggled at runtime via cpusets. rcu_nocb_poll [KNL] Rather than requiring that offloaded CPUs diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index f580a6b2e74e..0c1802ce4764 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -66,14 +66,16 @@ static bool rcu_nocb_is_setup; static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - if (cpulist_parse(str, rcu_nocb_mask)) { - pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); - cpumask_setall(rcu_nocb_mask); + if (*str == '=') { + if (cpulist_parse(++str, rcu_nocb_mask)) { + pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); + cpumask_setall(rcu_nocb_mask); + } } rcu_nocb_is_setup = true; return 1; } -__setup("rcu_nocbs=", rcu_nocb_setup); +__setup("rcu_nocbs", rcu_nocb_setup); static int __init parse_rcu_nocb_poll(char *arg) { -- cgit v1.2.3 From 10d4703154a72fb4f30fc90a4a7212bf138c17a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Nov 2021 01:37:08 +0100 Subject: rcu/nocb: Merge rcu_spawn_cpu_nocb_kthread() and rcu_spawn_one_nocb_kthread() The rcu_spawn_one_nocb_kthread() function is called only from rcu_spawn_cpu_nocb_kthread(). Therefore, inline the former into the latter, saving a few lines of code. Reviewed-by: Neeraj Upadhyay Signed-off-by: Frederic Weisbecker Cc: Boqun Feng Cc: Uladzislau Rezki Cc: Josh Triplett Cc: Joel Fernandes Tested-by: Juri Lelli Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_nocb.h | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 0c1802ce4764..eeafb546a7a0 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1233,12 +1233,15 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread * for this CPU's group has not yet been created, spawn it as well. */ -static void rcu_spawn_one_nocb_kthread(int cpu) +static void rcu_spawn_cpu_nocb_kthread(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_data *rdp_gp; struct task_struct *t; + if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup) + return; + /* If there already is an rcuo kthread, then nothing to do. */ if (rdp->nocb_cb_kthread) return; @@ -1262,16 +1265,6 @@ static void rcu_spawn_one_nocb_kthread(int cpu) WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread); } -/* - * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthread, spawn it. - */ -static void rcu_spawn_cpu_nocb_kthread(int cpu) -{ - if (rcu_scheduler_fully_active && rcu_nocb_is_setup) - rcu_spawn_one_nocb_kthread(cpu); -} - /* * Once the scheduler is running, spawn rcuo kthreads for all online * no-CBs CPUs. This assumes that the early_initcall()s happen before -- cgit v1.2.3 From 59ec71575ab440cd5ca0aa53b2a2985b3639fad4 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Mon, 29 Nov 2021 21:37:25 +0100 Subject: ucounts: Fix rlimit max values check The semantics of the rlimit max values differs from ucounts itself. When creating a new userns, we store the current rlimit of the process in ucount_max. Thus, the value of the limit in the parent userns is saved in the created one. The problem is that now we are taking the maximum value for counter from the same userns. So for init_user_ns it will always be RLIM_INFINITY. To fix the problem we need to check the counter value with the max value stored in userns. Reproducer: su - test -c "ulimit -u 3; sleep 5 & sleep 6 & unshare -U --map-root-user sh -c 'sleep 7 & sleep 8 & date; wait'" Before: [1] 175 [2] 176 Fri Nov 26 13:48:20 UTC 2021 [1]- Done sleep 5 [2]+ Done sleep 6 After: [1] 167 [2] 168 sh: fork: retry: Resource temporarily unavailable sh: fork: retry: Resource temporarily unavailable sh: fork: retry: Resource temporarily unavailable sh: fork: retry: Resource temporarily unavailable sh: fork: retry: Resource temporarily unavailable sh: fork: retry: Resource temporarily unavailable sh: fork: retry: Resource temporarily unavailable sh: fork: Interrupted system call [1]- Done sleep 5 [2]+ Done sleep 6 Fixes: c54b245d0118 ("Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace") Reported-by: Gleb Fotengauer-Malinovskiy Signed-off-by: "Eric W. Biederman" Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/024ec805f6e16896f0b23e094773790d171d2c1c.1638218242.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/ucount.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/ucount.c b/kernel/ucount.c index 4f5613dac227..7b32c356ebc5 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -264,15 +264,16 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) { struct ucounts *iter; + long max = LONG_MAX; long ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - long max = READ_ONCE(iter->ns->ucount_max[type]); long new = atomic_long_add_return(v, &iter->ucount[type]); if (new < 0 || new > max) ret = LONG_MAX; else if (iter == ucounts) ret = new; + max = READ_ONCE(iter->ns->ucount_max[type]); } return ret; } @@ -312,15 +313,16 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type) { /* Caller must hold a reference to ucounts */ struct ucounts *iter; + long max = LONG_MAX; long dec, ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - long max = READ_ONCE(iter->ns->ucount_max[type]); long new = atomic_long_add_return(1, &iter->ucount[type]); if (new < 0 || new > max) goto unwind; if (iter == ucounts) ret = new; + max = READ_ONCE(iter->ns->ucount_max[type]); /* * Grab an extra ucount reference for the caller when * the rlimit count was previously 0. @@ -339,15 +341,16 @@ unwind: return 0; } -bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max) +bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long rlimit) { struct ucounts *iter; - if (get_ucounts_value(ucounts, type) > max) - return true; + long max = rlimit; + if (rlimit > LONG_MAX) + max = LONG_MAX; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - max = READ_ONCE(iter->ns->ucount_max[type]); if (get_ucounts_value(iter, type) > max) return true; + max = READ_ONCE(iter->ns->ucount_max[type]); } return false; } -- cgit v1.2.3 From ccf45156fd167a234baf038c11c1f367c7ccabd4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 7 Dec 2021 15:35:37 +0800 Subject: workqueue: Remove the outdated comment before wq_worker_sleeping() It isn't called with preempt disabled now. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 613917bbc4e7..2964dbb783fe 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -878,8 +878,7 @@ void wq_worker_running(struct task_struct *task) * @task: task going to sleep * * This function is called from schedule() when a busy worker is - * going to sleep. Preemption needs to be disabled to protect ->sleeping - * assignment. + * going to sleep. */ void wq_worker_sleeping(struct task_struct *task) { -- cgit v1.2.3 From 3e5f39ea33b1189ccaa4ae2a9de2bce07753d2e0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 7 Dec 2021 15:35:38 +0800 Subject: workqueue: Remove the advanced kicking of the idle workers in rebind_workers() The commit 6d25be5782e4 ("sched/core, workqueues: Distangle worker accounting from rq lock") changed the schedule callbacks for workqueue and removed the local-wake-up functionality. Now the wakingup of workers is done by normal fashion and workers not yet migrated to the specific CPU in concurrency managed pool can also be woken up by workers that already bound to the specific cpu now. So this advanced kicking of the idle workers to migrate them to the associated CPU is unneeded now. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2964dbb783fe..f7f4a5fc7736 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5057,17 +5057,6 @@ static void rebind_workers(struct worker_pool *pool) for_each_pool_worker(worker, pool) { unsigned int worker_flags = worker->flags; - /* - * A bound idle worker should actually be on the runqueue - * of the associated CPU for local wake-ups targeting it to - * work. Kick all idle workers so that they migrate to the - * associated CPU. Doing this in the same loop as - * replacing UNBOUND with REBOUND is safe as no worker will - * be bound before @pool->lock is released. - */ - if (worker_flags & WORKER_IDLE) - wake_up_process(worker->task); - /* * We want to clear UNBOUND but can't directly call * worker_clr_flags() or adjust nr_running. Atomically -- cgit v1.2.3 From 11b45b0bf402b53c94c86737a440363fc36f03cd Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 7 Dec 2021 15:35:39 +0800 Subject: workqueue: Remove outdated comment about exceptional workers in unbind_workers() Long time before, workers are not ALL bound after CPU_ONLINE, they can still be running in other CPUs before self rebinding. But the commit a9ab775bcadf ("workqueue: directly restore CPU affinity of workers from CPU_ONLINE") makes rebind_workers() bind them all. So all workers are on the CPU before the CPU is down. And the comment in unbind_workers() refers to the workers "which are still executing works from before the last CPU down" is outdated. Just removed it. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f7f4a5fc7736..ae58c6ace23f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4978,9 +4978,7 @@ static void unbind_workers(int cpu) /* * We've blocked all attach/detach operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers - * except for the ones which are still executing works from - * before the last CPU down must be on the cpu. After - * this, they may become diasporas. + * must be on the cpu. After this, they may become diasporas. */ for_each_pool_worker(worker, pool) worker->flags |= WORKER_UNBOUND; -- cgit v1.2.3 From b4ac9384ac057c5bf035fbe82fc162fa2f7b15a9 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 7 Dec 2021 15:35:40 +0800 Subject: workqueue: Remove schedule() in unbind_workers() The commit 6d25be5782e4 ("sched/core, workqueues: Distangle worker accounting from rq lock") changed the schedule callbacks for workqueue and moved the schedule callback from the wakeup code to at end of schedule() in the worker's process context. It means that the callback wq_worker_running() is guaranteed that it sees the %WORKER_UNBOUND flag after scheduled since unbind_workers() is running on the same CPU that all the pool's workers bound to. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ae58c6ace23f..499a264183ef 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4979,6 +4979,9 @@ static void unbind_workers(int cpu) * We've blocked all attach/detach operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers * must be on the cpu. After this, they may become diasporas. + * And the preemption disabled section in their sched callbacks + * are guaranteed to see WORKER_UNBOUND since the code here + * is on the same cpu. */ for_each_pool_worker(worker, pool) worker->flags |= WORKER_UNBOUND; @@ -4994,14 +4997,6 @@ static void unbind_workers(int cpu) mutex_unlock(&wq_pool_attach_mutex); - /* - * Call schedule() so that we cross rq->lock and thus can - * guarantee sched callbacks see the %WORKER_UNBOUND flag. - * This is necessary as scheduler callbacks may be invoked - * from other cpus. - */ - schedule(); - /* * Sched callbacks are disabled now. Zap nr_running. * After this, nr_running stays zero and need_more_worker() -- cgit v1.2.3 From 989442d73757868118a73b92732b549a73c9ce35 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 7 Dec 2021 15:35:41 +0800 Subject: workqueue: Move the code of waking a worker up in unbind_workers() In unbind_workers(), there are two pool->lock held sections separated by the code of zapping nr_running. wake_up_worker() needs to be in pool->lock held section and after zapping nr_running. And zapping nr_running had to be after schedule() when the local wake up functionality was in use. Now, the call to schedule() has been removed along with the local wake up functionality, so the code can be merged into the same pool->lock held section. The diffstat shows that it is other code moved down because the diff tools can not know the meaning of merging lock sections by swapping two code blocks. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 499a264183ef..403387e9a924 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1810,14 +1810,8 @@ static void worker_enter_idle(struct worker *worker) if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); - /* - * Sanity check nr_running. Because unbind_workers() releases - * pool->lock between setting %WORKER_UNBOUND and zapping - * nr_running, the warning may trigger spuriously. Check iff - * unbind is not in progress. - */ - WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && - pool->nr_workers == pool->nr_idle && + /* Sanity check nr_running. */ + WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && atomic_read(&pool->nr_running)); } @@ -4988,21 +4982,12 @@ static void unbind_workers(int cpu) pool->flags |= POOL_DISASSOCIATED; - raw_spin_unlock_irq(&pool->lock); - - for_each_pool_worker(worker, pool) { - kthread_set_per_cpu(worker->task, -1); - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); - } - - mutex_unlock(&wq_pool_attach_mutex); - /* - * Sched callbacks are disabled now. Zap nr_running. - * After this, nr_running stays zero and need_more_worker() - * and keep_working() are always true as long as the - * worklist is not empty. This pool now behaves as an - * unbound (in terms of concurrency management) pool which + * The handling of nr_running in sched callbacks are disabled + * now. Zap nr_running. After this, nr_running stays zero and + * need_more_worker() and keep_working() are always true as + * long as the worklist is not empty. This pool now behaves as + * an unbound (in terms of concurrency management) pool which * are served by workers tied to the pool. */ atomic_set(&pool->nr_running, 0); @@ -5012,9 +4997,16 @@ static void unbind_workers(int cpu) * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. */ - raw_spin_lock_irq(&pool->lock); wake_up_worker(pool); + raw_spin_unlock_irq(&pool->lock); + + for_each_pool_worker(worker, pool) { + kthread_set_per_cpu(worker->task, -1); + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); + } + + mutex_unlock(&wq_pool_attach_mutex); } } -- cgit v1.2.3 From 84f91c62d675480ffd3d870ee44c07965cbd8b21 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 7 Dec 2021 15:35:42 +0800 Subject: workqueue: Remove the cacheline_aligned for nr_running nr_running is never modified remotely after the schedule callback in wakeup path is removed. Rather nr_running is often accessed with other fields in the pool together, so the cacheline_aligned for nr_running isn't needed. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 403387e9a924..b583141c5481 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -154,6 +154,9 @@ struct worker_pool { unsigned long watchdog_ts; /* L: watchdog timestamp */ + /* The current concurrency level. */ + atomic_t nr_running; + struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ @@ -177,19 +180,12 @@ struct worker_pool { struct hlist_node hash_node; /* PL: unbound_pool_hash node */ int refcnt; /* PL: refcnt for unbound pools */ - /* - * The current concurrency level. As it's likely to be accessed - * from other CPUs during try_to_wake_up(), put it in a separate - * cacheline. - */ - atomic_t nr_running ____cacheline_aligned_in_smp; - /* * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; -} ____cacheline_aligned_in_smp; +}; /* * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS -- cgit v1.2.3 From 12305abe982740878ecdf9e22852652d8d164855 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:09 +0100 Subject: kcsan: Refactor reading of instrumented memory Factor out the switch statement reading instrumented memory into a helper read_instrumented_memory(). No functional change. Signed-off-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 51 +++++++++++++++++---------------------------------- 1 file changed, 17 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 4b84c8e7884b..6bfd3040f46b 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -325,6 +325,21 @@ static void delay_access(int type) udelay(delay); } +/* + * Reads the instrumented memory for value change detection; value change + * detection is currently done for accesses up to a size of 8 bytes. + */ +static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size) +{ + switch (size) { + case 1: return READ_ONCE(*(const u8 *)ptr); + case 2: return READ_ONCE(*(const u16 *)ptr); + case 4: return READ_ONCE(*(const u32 *)ptr); + case 8: return READ_ONCE(*(const u64 *)ptr); + default: return 0; /* Ignore; we do not diff the values. */ + } +} + void kcsan_save_irqtrace(struct task_struct *task) { #ifdef CONFIG_TRACE_IRQFLAGS @@ -482,23 +497,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned * Read the current value, to later check and infer a race if the data * was modified via a non-instrumented access, e.g. from a device. */ - old = 0; - switch (size) { - case 1: - old = READ_ONCE(*(const u8 *)ptr); - break; - case 2: - old = READ_ONCE(*(const u16 *)ptr); - break; - case 4: - old = READ_ONCE(*(const u32 *)ptr); - break; - case 8: - old = READ_ONCE(*(const u64 *)ptr); - break; - default: - break; /* ignore; we do not diff the values */ - } + old = read_instrumented_memory(ptr, size); /* * Delay this thread, to increase probability of observing a racy @@ -511,23 +510,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned * racy access. */ access_mask = ctx->access_mask; - new = 0; - switch (size) { - case 1: - new = READ_ONCE(*(const u8 *)ptr); - break; - case 2: - new = READ_ONCE(*(const u16 *)ptr); - break; - case 4: - new = READ_ONCE(*(const u32 *)ptr); - break; - case 8: - new = READ_ONCE(*(const u64 *)ptr); - break; - default: - break; /* ignore; we do not diff the values */ - } + new = read_instrumented_memory(ptr, size); diff = old ^ new; if (access_mask) -- cgit v1.2.3 From 71f8de7092cb2cf95e3f7055df139118d1445597 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:10 +0100 Subject: kcsan: Remove redundant zero-initialization of globals They are implicitly zero-initialized, remove explicit initialization. It keeps the upcoming additions to kcsan_ctx consistent with the rest. No functional change intended. Signed-off-by: Marco Elver Acked-by: Mark Rutland Signed-off-by: Paul E. McKenney --- init/init_task.c | 5 ----- kernel/kcsan/core.c | 5 ----- 2 files changed, 10 deletions(-) (limited to 'kernel') diff --git a/init/init_task.c b/init/init_task.c index 2d024066e27b..73cc8f03511a 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -182,11 +182,6 @@ struct task_struct init_task #endif #ifdef CONFIG_KCSAN .kcsan_ctx = { - .disable_count = 0, - .atomic_next = 0, - .atomic_nest_count = 0, - .in_flat_atomic = false, - .access_mask = 0, .scoped_accesses = {LIST_POISON1, NULL}, }, #endif diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 6bfd3040f46b..e34a1710b7bc 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -44,11 +44,6 @@ bool kcsan_enabled; /* Per-CPU kcsan_ctx for interrupts */ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { - .disable_count = 0, - .atomic_next = 0, - .atomic_nest_count = 0, - .in_flat_atomic = false, - .access_mask = 0, .scoped_accesses = {LIST_POISON1, NULL}, }; -- cgit v1.2.3 From 9756f64c8f2d19c0029a5827bda8ac275302ec22 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:11 +0100 Subject: kcsan: Avoid checking scoped accesses from nested contexts Avoid checking scoped accesses from nested contexts (such as nested interrupts or in scheduler code) which share the same kcsan_ctx. This is to avoid detecting false positive races of accesses in the same thread with currently scoped accesses: consider setting up a watchpoint for a non-scoped (normal) access that also "conflicts" with a current scoped access. In a nested interrupt (or in the scheduler), which shares the same kcsan_ctx, we cannot check scoped accesses set up in the parent context -- simply ignore them in this case. With the introduction of kcsan_ctx::disable_scoped, we can also clean up kcsan_check_scoped_accesses()'s recursion guard, and do not need to modify the list's prev pointer. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/kcsan.h | 1 + kernel/kcsan/core.c | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h index fc266ecb2a4d..13cef3458fed 100644 --- a/include/linux/kcsan.h +++ b/include/linux/kcsan.h @@ -21,6 +21,7 @@ */ struct kcsan_ctx { int disable_count; /* disable counter */ + int disable_scoped; /* disable scoped access counter */ int atomic_next; /* number of following atomic ops */ /* diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index e34a1710b7bc..bd359f8ee63a 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -204,15 +204,17 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip); static noinline void kcsan_check_scoped_accesses(void) { struct kcsan_ctx *ctx = get_ctx(); - struct list_head *prev_save = ctx->scoped_accesses.prev; struct kcsan_scoped_access *scoped_access; - ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */ + if (ctx->disable_scoped) + return; + + ctx->disable_scoped++; list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) { check_access(scoped_access->ptr, scoped_access->size, scoped_access->type, scoped_access->ip); } - ctx->scoped_accesses.prev = prev_save; + ctx->disable_scoped--; } /* Rules for generic atomic accesses. Called from fast-path. */ @@ -465,6 +467,15 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned goto out; } + /* + * Avoid races of scoped accesses from nested interrupts (or scheduler). + * Assume setting up a watchpoint for a non-scoped (normal) access that + * also conflicts with a current scoped access. In a nested interrupt, + * which shares the context, it would check a conflicting scoped access. + * To avoid, disable scoped access checking. + */ + ctx->disable_scoped++; + /* * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's * runtime is entered for every memory access, and potentially useful @@ -578,6 +589,7 @@ out_unlock: if (!kcsan_interrupt_watcher) local_irq_restore(irq_flags); kcsan_restore_irqtrace(current); + ctx->disable_scoped--; out: user_access_restore(ua_flags); } -- cgit v1.2.3 From 69562e4983d93e2791c0bf128b07462afbd7f4dc Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 5 Aug 2021 14:57:45 +0200 Subject: kcsan: Add core support for a subset of weak memory modeling Add support for modeling a subset of weak memory, which will enable detection of a subset of data races due to missing memory barriers. KCSAN's approach to detecting missing memory barriers is based on modeling access reordering, and enabled if `CONFIG_KCSAN_WEAK_MEMORY=y`, which depends on `CONFIG_KCSAN_STRICT=y`. The feature can be enabled or disabled at boot and runtime via the `kcsan.weak_memory` boot parameter. Each memory access for which a watchpoint is set up, is also selected for simulated reordering within the scope of its function (at most 1 in-flight access). We are limited to modeling the effects of "buffering" (delaying the access), since the runtime cannot "prefetch" accesses (therefore no acquire modeling). Once an access has been selected for reordering, it is checked along every other access until the end of the function scope. If an appropriate memory barrier is encountered, the access will no longer be considered for reordering. When the result of a memory operation should be ordered by a barrier, KCSAN can then detect data races where the conflict only occurs as a result of a missing barrier due to reordering accesses. Suggested-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/kcsan-checks.h | 10 ++- include/linux/kcsan.h | 10 ++- include/linux/sched.h | 3 + kernel/kcsan/core.c | 202 +++++++++++++++++++++++++++++++++++++++---- lib/Kconfig.kcsan | 20 +++++ scripts/Makefile.kcsan | 9 +- 6 files changed, 235 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index 5f5965246877..a1c6a89fde71 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -99,7 +99,15 @@ void kcsan_set_access_mask(unsigned long mask); /* Scoped access information. */ struct kcsan_scoped_access { - struct list_head list; + union { + struct list_head list; /* scoped_accesses list */ + /* + * Not an entry in scoped_accesses list; stack depth from where + * the access was initialized. + */ + int stack_depth; + }; + /* Access information. */ const volatile void *ptr; size_t size; diff --git a/include/linux/kcsan.h b/include/linux/kcsan.h index 13cef3458fed..c07c71f5ba4f 100644 --- a/include/linux/kcsan.h +++ b/include/linux/kcsan.h @@ -49,8 +49,16 @@ struct kcsan_ctx { */ unsigned long access_mask; - /* List of scoped accesses. */ + /* List of scoped accesses; likely to be empty. */ struct list_head scoped_accesses; + +#ifdef CONFIG_KCSAN_WEAK_MEMORY + /* + * Scoped access for modeling access reordering to detect missing memory + * barriers; only keep 1 to keep fast-path complexity manageable. + */ + struct kcsan_scoped_access reorder_access; +#endif }; /** diff --git a/include/linux/sched.h b/include/linux/sched.h index 78c351e35fec..0cd40b010487 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1339,6 +1339,9 @@ struct task_struct { #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events kcsan_save_irqtrace; #endif +#ifdef CONFIG_KCSAN_WEAK_MEMORY + int kcsan_stack_depth; +#endif #endif #if IS_ENABLED(CONFIG_KUNIT) diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index bd359f8ee63a..481f8a524089 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -40,6 +40,13 @@ module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644); module_param_named(skip_watch, kcsan_skip_watch, long, 0644); module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444); +#ifdef CONFIG_KCSAN_WEAK_MEMORY +static bool kcsan_weak_memory = true; +module_param_named(weak_memory, kcsan_weak_memory, bool, 0644); +#else +#define kcsan_weak_memory false +#endif + bool kcsan_enabled; /* Per-CPU kcsan_ctx for interrupts */ @@ -351,6 +358,67 @@ void kcsan_restore_irqtrace(struct task_struct *task) #endif } +static __always_inline int get_kcsan_stack_depth(void) +{ +#ifdef CONFIG_KCSAN_WEAK_MEMORY + return current->kcsan_stack_depth; +#else + BUILD_BUG(); + return 0; +#endif +} + +static __always_inline void add_kcsan_stack_depth(int val) +{ +#ifdef CONFIG_KCSAN_WEAK_MEMORY + current->kcsan_stack_depth += val; +#else + BUILD_BUG(); +#endif +} + +static __always_inline struct kcsan_scoped_access *get_reorder_access(struct kcsan_ctx *ctx) +{ +#ifdef CONFIG_KCSAN_WEAK_MEMORY + return ctx->disable_scoped ? NULL : &ctx->reorder_access; +#else + return NULL; +#endif +} + +static __always_inline bool +find_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, + int type, unsigned long ip) +{ + struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx); + + if (!reorder_access) + return false; + + /* + * Note: If accesses are repeated while reorder_access is identical, + * never matches the new access, because !(type & KCSAN_ACCESS_SCOPED). + */ + return reorder_access->ptr == ptr && reorder_access->size == size && + reorder_access->type == type && reorder_access->ip == ip; +} + +static inline void +set_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, + int type, unsigned long ip) +{ + struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx); + + if (!reorder_access || !kcsan_weak_memory) + return; + + reorder_access->ptr = ptr; + reorder_access->size = size; + reorder_access->type = type | KCSAN_ACCESS_SCOPED; + reorder_access->ip = ip; + reorder_access->stack_depth = get_kcsan_stack_depth(); +} + /* * Pull everything together: check_access() below contains the performance * critical operations; the fast-path (including check_access) functions should @@ -389,8 +457,10 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, * The access_mask check relies on value-change comparison. To avoid * reporting a race where e.g. the writer set up the watchpoint, but the * reader has access_mask!=0, we have to ignore the found watchpoint. + * + * reorder_access is never created from an access with access_mask set. */ - if (ctx->access_mask) + if (ctx->access_mask && !find_reorder_access(ctx, ptr, size, type, ip)) return; /* @@ -440,11 +510,13 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0; atomic_long_t *watchpoint; u64 old, new, diff; - unsigned long access_mask; enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE; + bool interrupt_watcher = kcsan_interrupt_watcher; unsigned long ua_flags = user_access_save(); struct kcsan_ctx *ctx = get_ctx(); + unsigned long access_mask = ctx->access_mask; unsigned long irq_flags = 0; + bool is_reorder_access; /* * Always reset kcsan_skip counter in slow-path to avoid underflow; see @@ -467,6 +539,17 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned goto out; } + /* + * The local CPU cannot observe reordering of its own accesses, and + * therefore we need to take care of 2 cases to avoid false positives: + * + * 1. Races of the reordered access with interrupts. To avoid, if + * the current access is reorder_access, disable interrupts. + * 2. Avoid races of scoped accesses from nested interrupts (below). + */ + is_reorder_access = find_reorder_access(ctx, ptr, size, type, ip); + if (is_reorder_access) + interrupt_watcher = false; /* * Avoid races of scoped accesses from nested interrupts (or scheduler). * Assume setting up a watchpoint for a non-scoped (normal) access that @@ -482,7 +565,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned * information is lost if dirtied by KCSAN. */ kcsan_save_irqtrace(current); - if (!kcsan_interrupt_watcher) + if (!interrupt_watcher) local_irq_save(irq_flags); watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); @@ -503,7 +586,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned * Read the current value, to later check and infer a race if the data * was modified via a non-instrumented access, e.g. from a device. */ - old = read_instrumented_memory(ptr, size); + old = is_reorder_access ? 0 : read_instrumented_memory(ptr, size); /* * Delay this thread, to increase probability of observing a racy @@ -515,8 +598,17 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned * Re-read value, and check if it is as expected; if not, we infer a * racy access. */ - access_mask = ctx->access_mask; - new = read_instrumented_memory(ptr, size); + if (!is_reorder_access) { + new = read_instrumented_memory(ptr, size); + } else { + /* + * Reordered accesses cannot be used for value change detection, + * because the memory location may no longer be accessible and + * could result in a fault. + */ + new = 0; + access_mask = 0; + } diff = old ^ new; if (access_mask) @@ -585,11 +677,20 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned */ remove_watchpoint(watchpoint); atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]); + out_unlock: - if (!kcsan_interrupt_watcher) + if (!interrupt_watcher) local_irq_restore(irq_flags); kcsan_restore_irqtrace(current); ctx->disable_scoped--; + + /* + * Reordered accesses cannot be used for value change detection, + * therefore never consider for reordering if access_mask is set. + * ASSERT_EXCLUSIVE are not real accesses, ignore them as well. + */ + if (!access_mask && !is_assert) + set_reorder_access(ctx, ptr, size, type, ip); out: user_access_restore(ua_flags); } @@ -597,7 +698,6 @@ out: static __always_inline void check_access(const volatile void *ptr, size_t size, int type, unsigned long ip) { - const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0; atomic_long_t *watchpoint; long encoded_watchpoint; @@ -608,12 +708,14 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip) if (unlikely(size == 0)) return; +again: /* * Avoid user_access_save in fast-path: find_watchpoint is safe without * user_access_save, as the address that ptr points to is only used to * check if a watchpoint exists; ptr is never dereferenced. */ - watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write, + watchpoint = find_watchpoint((unsigned long)ptr, size, + !(type & KCSAN_ACCESS_WRITE), &encoded_watchpoint); /* * It is safe to check kcsan_is_enabled() after find_watchpoint in the @@ -627,9 +729,42 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip) else { struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */ - if (unlikely(should_watch(ctx, ptr, size, type))) + if (unlikely(should_watch(ctx, ptr, size, type))) { kcsan_setup_watchpoint(ptr, size, type, ip); - else if (unlikely(ctx->scoped_accesses.prev)) + return; + } + + if (!(type & KCSAN_ACCESS_SCOPED)) { + struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx); + + if (reorder_access) { + /* + * reorder_access check: simulates reordering of + * the access after subsequent operations. + */ + ptr = reorder_access->ptr; + type = reorder_access->type; + ip = reorder_access->ip; + /* + * Upon a nested interrupt, this context's + * reorder_access can be modified (shared ctx). + * We know that upon return, reorder_access is + * always invalidated by setting size to 0 via + * __tsan_func_exit(). Therefore we must read + * and check size after the other fields. + */ + barrier(); + size = READ_ONCE(reorder_access->size); + if (size) + goto again; + } + } + + /* + * Always checked last, right before returning from runtime; + * if reorder_access is valid, checked after it was checked. + */ + if (unlikely(ctx->scoped_accesses.prev)) kcsan_check_scoped_accesses(); } } @@ -916,19 +1051,56 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(8); DEFINE_TSAN_VOLATILE_READ_WRITE(16); /* - * The below are not required by KCSAN, but can still be emitted by the - * compiler. + * Function entry and exit are used to determine the validty of reorder_access. + * Reordering of the access ends at the end of the function scope where the + * access happened. This is done for two reasons: + * + * 1. Artificially limits the scope where missing barriers are detected. + * This minimizes false positives due to uninstrumented functions that + * contain the required barriers but were missed. + * + * 2. Simplifies generating the stack trace of the access. */ void __tsan_func_entry(void *call_pc); -void __tsan_func_entry(void *call_pc) +noinline void __tsan_func_entry(void *call_pc) { + if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY)) + return; + + add_kcsan_stack_depth(1); } EXPORT_SYMBOL(__tsan_func_entry); + void __tsan_func_exit(void); -void __tsan_func_exit(void) +noinline void __tsan_func_exit(void) { + struct kcsan_scoped_access *reorder_access; + + if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY)) + return; + + reorder_access = get_reorder_access(get_ctx()); + if (!reorder_access) + goto out; + + if (get_kcsan_stack_depth() <= reorder_access->stack_depth) { + /* + * Access check to catch cases where write without a barrier + * (supposed release) was last access in function: because + * instrumentation is inserted before the real access, a data + * race due to the write giving up a c-s would only be caught if + * we do the conflicting access after. + */ + check_access(reorder_access->ptr, reorder_access->size, + reorder_access->type, reorder_access->ip); + reorder_access->size = 0; + reorder_access->stack_depth = INT_MIN; + } +out: + add_kcsan_stack_depth(-1); } EXPORT_SYMBOL(__tsan_func_exit); + void __tsan_init(void); void __tsan_init(void) { diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index e0a93ffdef30..e4394ea8068b 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -191,6 +191,26 @@ config KCSAN_STRICT closely aligns with the rules defined by the Linux-kernel memory consistency model (LKMM). +config KCSAN_WEAK_MEMORY + bool "Enable weak memory modeling to detect missing memory barriers" + default y + depends on KCSAN_STRICT + # We can either let objtool nop __tsan_func_{entry,exit}() and builtin + # atomics instrumentation in .noinstr.text, or use a compiler that can + # implement __no_kcsan to really remove all instrumentation. + depends on STACK_VALIDATION || CC_IS_GCC + help + Enable support for modeling a subset of weak memory, which allows + detecting a subset of data races due to missing memory barriers. + + Depends on KCSAN_STRICT, because the options strenghtening certain + plain accesses by default (depending on !KCSAN_STRICT) reduce the + ability to detect any data races invoving reordered accesses, in + particular reordered writes. + + Weak memory modeling relies on additional instrumentation and may + affect performance. + config KCSAN_REPORT_VALUE_CHANGE_ONLY bool "Only report races where watcher observed a data value change" default y diff --git a/scripts/Makefile.kcsan b/scripts/Makefile.kcsan index 37cb504c77e1..4c7f0d282e42 100644 --- a/scripts/Makefile.kcsan +++ b/scripts/Makefile.kcsan @@ -9,7 +9,12 @@ endif # Keep most options here optional, to allow enabling more compilers if absence # of some options does not break KCSAN nor causes false positive reports. -export CFLAGS_KCSAN := -fsanitize=thread \ - $(call cc-option,$(call cc-param,tsan-instrument-func-entry-exit=0) -fno-optimize-sibling-calls) \ +kcsan-cflags := -fsanitize=thread -fno-optimize-sibling-calls \ $(call cc-option,$(call cc-param,tsan-compound-read-before-write=1),$(call cc-option,$(call cc-param,tsan-instrument-read-before-write=1))) \ $(call cc-param,tsan-distinguish-volatile=1) + +ifndef CONFIG_KCSAN_WEAK_MEMORY +kcsan-cflags += $(call cc-option,$(call cc-param,tsan-instrument-func-entry-exit=0)) +endif + +export CFLAGS_KCSAN := $(kcsan-cflags) -- cgit v1.2.3 From 0b8b0830ac1419d7250fde31ea78793a03f3db44 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:13 +0100 Subject: kcsan: Add core memory barrier instrumentation functions Add the core memory barrier instrumentation functions. These invalidate the current in-flight reordered access based on the rules for the respective barrier types and in-flight access type. To obtain barrier instrumentation that can be disabled via __no_kcsan with appropriate compiler-support (and not just with objtool help), barrier instrumentation repurposes __atomic_signal_fence(), instead of inserting explicit calls. Crucially, __atomic_signal_fence() normally does not map to any real instructions, but is still intercepted by fsanitize=thread. As a result, like any other instrumentation done by the compiler, barrier instrumentation can be disabled with __no_kcsan. Unfortunately Clang and GCC currently differ in their __no_kcsan aka __no_sanitize_thread behaviour with respect to builtin atomics (and __tsan_func_{entry,exit}) instrumentation. This is already reflected in Kconfig.kcsan's dependencies for KCSAN_WEAK_MEMORY. A later change will introduce support for newer versions of Clang that can implement __no_kcsan to also remove the additional instrumentation introduced by KCSAN_WEAK_MEMORY. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- include/linux/kcsan-checks.h | 71 ++++++++++++++++++++++++++++++++++++++++++-- kernel/kcsan/core.c | 68 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/kcsan-checks.h b/include/linux/kcsan-checks.h index a1c6a89fde71..9d2c869167f2 100644 --- a/include/linux/kcsan-checks.h +++ b/include/linux/kcsan-checks.h @@ -36,6 +36,36 @@ */ void __kcsan_check_access(const volatile void *ptr, size_t size, int type); +/* + * See definition of __tsan_atomic_signal_fence() in kernel/kcsan/core.c. + * Note: The mappings are arbitrary, and do not reflect any real mappings of C11 + * memory orders to the LKMM memory orders and vice-versa! + */ +#define __KCSAN_BARRIER_TO_SIGNAL_FENCE_mb __ATOMIC_SEQ_CST +#define __KCSAN_BARRIER_TO_SIGNAL_FENCE_wmb __ATOMIC_ACQ_REL +#define __KCSAN_BARRIER_TO_SIGNAL_FENCE_rmb __ATOMIC_ACQUIRE +#define __KCSAN_BARRIER_TO_SIGNAL_FENCE_release __ATOMIC_RELEASE + +/** + * __kcsan_mb - full memory barrier instrumentation + */ +void __kcsan_mb(void); + +/** + * __kcsan_wmb - write memory barrier instrumentation + */ +void __kcsan_wmb(void); + +/** + * __kcsan_rmb - read memory barrier instrumentation + */ +void __kcsan_rmb(void); + +/** + * __kcsan_release - release barrier instrumentation + */ +void __kcsan_release(void); + /** * kcsan_disable_current - disable KCSAN for the current context * @@ -159,6 +189,10 @@ void kcsan_end_scoped_access(struct kcsan_scoped_access *sa); static inline void __kcsan_check_access(const volatile void *ptr, size_t size, int type) { } +static inline void __kcsan_mb(void) { } +static inline void __kcsan_wmb(void) { } +static inline void __kcsan_rmb(void) { } +static inline void __kcsan_release(void) { } static inline void kcsan_disable_current(void) { } static inline void kcsan_enable_current(void) { } static inline void kcsan_enable_current_nowarn(void) { } @@ -191,12 +225,45 @@ static inline void kcsan_end_scoped_access(struct kcsan_scoped_access *sa) { } */ #define __kcsan_disable_current kcsan_disable_current #define __kcsan_enable_current kcsan_enable_current_nowarn -#else +#else /* __SANITIZE_THREAD__ */ static inline void kcsan_check_access(const volatile void *ptr, size_t size, int type) { } static inline void __kcsan_enable_current(void) { } static inline void __kcsan_disable_current(void) { } -#endif +#endif /* __SANITIZE_THREAD__ */ + +#if defined(CONFIG_KCSAN_WEAK_MEMORY) && defined(__SANITIZE_THREAD__) +/* + * Normal barrier instrumentation is not done via explicit calls, but by mapping + * to a repurposed __atomic_signal_fence(), which normally does not generate any + * real instructions, but is still intercepted by fsanitize=thread. This means, + * like any other compile-time instrumentation, barrier instrumentation can be + * disabled with the __no_kcsan function attribute. + * + * Also see definition of __tsan_atomic_signal_fence() in kernel/kcsan/core.c. + */ +#define __KCSAN_BARRIER_TO_SIGNAL_FENCE(name) \ + static __always_inline void kcsan_##name(void) \ + { \ + barrier(); \ + __atomic_signal_fence(__KCSAN_BARRIER_TO_SIGNAL_FENCE_##name); \ + barrier(); \ + } +__KCSAN_BARRIER_TO_SIGNAL_FENCE(mb) +__KCSAN_BARRIER_TO_SIGNAL_FENCE(wmb) +__KCSAN_BARRIER_TO_SIGNAL_FENCE(rmb) +__KCSAN_BARRIER_TO_SIGNAL_FENCE(release) +#elif defined(CONFIG_KCSAN_WEAK_MEMORY) && defined(__KCSAN_INSTRUMENT_BARRIERS__) +#define kcsan_mb __kcsan_mb +#define kcsan_wmb __kcsan_wmb +#define kcsan_rmb __kcsan_rmb +#define kcsan_release __kcsan_release +#else /* CONFIG_KCSAN_WEAK_MEMORY && ... */ +static inline void kcsan_mb(void) { } +static inline void kcsan_wmb(void) { } +static inline void kcsan_rmb(void) { } +static inline void kcsan_release(void) { } +#endif /* CONFIG_KCSAN_WEAK_MEMORY && ... */ /** * __kcsan_check_read - check regular read access for races diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 481f8a524089..916060913966 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -939,6 +939,22 @@ void __kcsan_check_access(const volatile void *ptr, size_t size, int type) } EXPORT_SYMBOL(__kcsan_check_access); +#define DEFINE_MEMORY_BARRIER(name, order_before_cond) \ + void __kcsan_##name(void) \ + { \ + struct kcsan_scoped_access *sa = get_reorder_access(get_ctx()); \ + if (!sa) \ + return; \ + if (order_before_cond) \ + sa->size = 0; \ + } \ + EXPORT_SYMBOL(__kcsan_##name) + +DEFINE_MEMORY_BARRIER(mb, true); +DEFINE_MEMORY_BARRIER(wmb, sa->type & (KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND)); +DEFINE_MEMORY_BARRIER(rmb, !(sa->type & KCSAN_ACCESS_WRITE) || (sa->type & KCSAN_ACCESS_COMPOUND)); +DEFINE_MEMORY_BARRIER(release, true); + /* * KCSAN uses the same instrumentation that is emitted by supported compilers * for ThreadSanitizer (TSAN). @@ -1123,10 +1139,19 @@ EXPORT_SYMBOL(__tsan_init); * functions, whose job is to also execute the operation itself. */ +static __always_inline void kcsan_atomic_builtin_memorder(int memorder) +{ + if (memorder == __ATOMIC_RELEASE || + memorder == __ATOMIC_SEQ_CST || + memorder == __ATOMIC_ACQ_REL) + __kcsan_release(); +} + #define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \ { \ + kcsan_atomic_builtin_memorder(memorder); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC, _RET_IP_); \ } \ @@ -1136,6 +1161,7 @@ EXPORT_SYMBOL(__tsan_init); void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \ { \ + kcsan_atomic_builtin_memorder(memorder); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, \ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC, _RET_IP_); \ @@ -1148,6 +1174,7 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \ { \ + kcsan_atomic_builtin_memorder(memorder); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, \ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ @@ -1180,6 +1207,7 @@ EXPORT_SYMBOL(__tsan_init); int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \ u##bits val, int mo, int fail_mo) \ { \ + kcsan_atomic_builtin_memorder(mo); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, \ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ @@ -1195,6 +1223,7 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \ int mo, int fail_mo) \ { \ + kcsan_atomic_builtin_memorder(mo); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, \ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ @@ -1226,10 +1255,47 @@ DEFINE_TSAN_ATOMIC_OPS(64); void __tsan_atomic_thread_fence(int memorder); void __tsan_atomic_thread_fence(int memorder) { + kcsan_atomic_builtin_memorder(memorder); __atomic_thread_fence(memorder); } EXPORT_SYMBOL(__tsan_atomic_thread_fence); +/* + * In instrumented files, we emit instrumentation for barriers by mapping the + * kernel barriers to an __atomic_signal_fence(), which is interpreted specially + * and otherwise has no relation to a real __atomic_signal_fence(). No known + * kernel code uses __atomic_signal_fence(). + * + * Since fsanitize=thread instrumentation handles __atomic_signal_fence(), which + * are turned into calls to __tsan_atomic_signal_fence(), such instrumentation + * can be disabled via the __no_kcsan function attribute (vs. an explicit call + * which could not). When __no_kcsan is requested, __atomic_signal_fence() + * generates no code. + * + * Note: The result of using __atomic_signal_fence() with KCSAN enabled is + * potentially limiting the compiler's ability to reorder operations; however, + * if barriers were instrumented with explicit calls (without LTO), the compiler + * couldn't optimize much anyway. The result of a hypothetical architecture + * using __atomic_signal_fence() in normal code would be KCSAN false negatives. + */ void __tsan_atomic_signal_fence(int memorder); -void __tsan_atomic_signal_fence(int memorder) { } +noinline void __tsan_atomic_signal_fence(int memorder) +{ + switch (memorder) { + case __KCSAN_BARRIER_TO_SIGNAL_FENCE_mb: + __kcsan_mb(); + break; + case __KCSAN_BARRIER_TO_SIGNAL_FENCE_wmb: + __kcsan_wmb(); + break; + case __KCSAN_BARRIER_TO_SIGNAL_FENCE_rmb: + __kcsan_rmb(); + break; + case __KCSAN_BARRIER_TO_SIGNAL_FENCE_release: + __kcsan_release(); + break; + default: + break; + } +} EXPORT_SYMBOL(__tsan_atomic_signal_fence); -- cgit v1.2.3 From 3cc21a531252e6693ee989231d5abee6812cfd71 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:15 +0100 Subject: kcsan: Call scoped accesses reordered in reports The scoping of an access simply denotes the scope in which it may be reordered. However, in reports, it'll be less confusing to say the access is "reordered". This is more accurate when the race occurred. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan_test.c | 4 ++-- kernel/kcsan/report.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 660729238588..6e3c2b8bc608 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -213,9 +213,9 @@ static bool report_matches(const struct expect_report *r) const bool is_atomic = (ty & KCSAN_ACCESS_ATOMIC); const bool is_scoped = (ty & KCSAN_ACCESS_SCOPED); const char *const access_type_aux = - (is_atomic && is_scoped) ? " (marked, scoped)" + (is_atomic && is_scoped) ? " (marked, reordered)" : (is_atomic ? " (marked)" - : (is_scoped ? " (scoped)" : "")); + : (is_scoped ? " (reordered)" : "")); if (i == 1) { /* Access 2 */ diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index fc15077991c4..1b0e050bdf6a 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -215,9 +215,9 @@ static const char *get_access_type(int type) if (type & KCSAN_ACCESS_ASSERT) { if (type & KCSAN_ACCESS_SCOPED) { if (type & KCSAN_ACCESS_WRITE) - return "assert no accesses (scoped)"; + return "assert no accesses (reordered)"; else - return "assert no writes (scoped)"; + return "assert no writes (reordered)"; } else { if (type & KCSAN_ACCESS_WRITE) return "assert no accesses"; @@ -240,17 +240,17 @@ static const char *get_access_type(int type) case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: return "read-write (marked)"; case KCSAN_ACCESS_SCOPED: - return "read (scoped)"; + return "read (reordered)"; case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC: - return "read (marked, scoped)"; + return "read (marked, reordered)"; case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE: - return "write (scoped)"; + return "write (reordered)"; case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: - return "write (marked, scoped)"; + return "write (marked, reordered)"; case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE: - return "read-write (scoped)"; + return "read-write (reordered)"; case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC: - return "read-write (marked, scoped)"; + return "read-write (marked, reordered)"; default: BUG(); } -- cgit v1.2.3 From be3f6967ec5947dd7b2f23bf9d42bb2729889618 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:16 +0100 Subject: kcsan: Show location access was reordered to Also show the location the access was reordered to. An example report: | ================================================================== | BUG: KCSAN: data-race in test_kernel_wrong_memorder / test_kernel_wrong_memorder | | read-write to 0xffffffffc01e61a8 of 8 bytes by task 2311 on cpu 5: | test_kernel_wrong_memorder+0x57/0x90 | access_thread+0x99/0xe0 | kthread+0x2ba/0x2f0 | ret_from_fork+0x22/0x30 | | read-write (reordered) to 0xffffffffc01e61a8 of 8 bytes by task 2310 on cpu 7: | test_kernel_wrong_memorder+0x57/0x90 | access_thread+0x99/0xe0 | kthread+0x2ba/0x2f0 | ret_from_fork+0x22/0x30 | | | +-> reordered to: test_kernel_wrong_memorder+0x80/0x90 | | Reported by Kernel Concurrency Sanitizer on: | CPU: 7 PID: 2310 Comm: access_thread Not tainted 5.14.0-rc1+ #18 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 | ================================================================== Reviewed-by: Boqun Feng Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/report.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 1b0e050bdf6a..67794404042a 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -308,10 +308,12 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries /* * Skips to the first entry that matches the function of @ip, and then replaces - * that entry with @ip, returning the entries to skip. + * that entry with @ip, returning the entries to skip with @replaced containing + * the replaced entry. */ static int -replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned long ip) +replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned long ip, + unsigned long *replaced) { unsigned long symbolsize, offset; unsigned long target_func; @@ -330,6 +332,7 @@ replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned lon func -= offset; if (func == target_func) { + *replaced = stack_entries[skip]; stack_entries[skip] = ip; return skip; } @@ -342,9 +345,10 @@ fallback: } static int -sanitize_stack_entries(unsigned long stack_entries[], int num_entries, unsigned long ip) +sanitize_stack_entries(unsigned long stack_entries[], int num_entries, unsigned long ip, + unsigned long *replaced) { - return ip ? replace_stack_entry(stack_entries, num_entries, ip) : + return ip ? replace_stack_entry(stack_entries, num_entries, ip, replaced) : get_stack_skipnr(stack_entries, num_entries); } @@ -360,6 +364,14 @@ static int sym_strcmp(void *addr1, void *addr2) return strncmp(buf1, buf2, sizeof(buf1)); } +static void +print_stack_trace(unsigned long stack_entries[], int num_entries, unsigned long reordered_to) +{ + stack_trace_print(stack_entries, num_entries, 0); + if (reordered_to) + pr_err(" |\n +-> reordered to: %pS\n", (void *)reordered_to); +} + static void print_verbose_info(struct task_struct *task) { if (!task) @@ -378,10 +390,12 @@ static void print_report(enum kcsan_value_change value_change, struct other_info *other_info, u64 old, u64 new, u64 mask) { + unsigned long reordered_to = 0; unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 }; int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1); - int skipnr = sanitize_stack_entries(stack_entries, num_stack_entries, ai->ip); + int skipnr = sanitize_stack_entries(stack_entries, num_stack_entries, ai->ip, &reordered_to); unsigned long this_frame = stack_entries[skipnr]; + unsigned long other_reordered_to = 0; unsigned long other_frame = 0; int other_skipnr = 0; /* silence uninit warnings */ @@ -394,7 +408,7 @@ static void print_report(enum kcsan_value_change value_change, if (other_info) { other_skipnr = sanitize_stack_entries(other_info->stack_entries, other_info->num_stack_entries, - other_info->ai.ip); + other_info->ai.ip, &other_reordered_to); other_frame = other_info->stack_entries[other_skipnr]; /* @value_change is only known for the other thread */ @@ -434,10 +448,9 @@ static void print_report(enum kcsan_value_change value_change, other_info->ai.cpu_id); /* Print the other thread's stack trace. */ - stack_trace_print(other_info->stack_entries + other_skipnr, + print_stack_trace(other_info->stack_entries + other_skipnr, other_info->num_stack_entries - other_skipnr, - 0); - + other_reordered_to); if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) print_verbose_info(other_info->task); @@ -451,9 +464,7 @@ static void print_report(enum kcsan_value_change value_change, get_thread_desc(ai->task_pid), ai->cpu_id); } /* Print stack trace of this thread. */ - stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, - 0); - + print_stack_trace(stack_entries + skipnr, num_stack_entries - skipnr, reordered_to); if (IS_ENABLED(CONFIG_KCSAN_VERBOSE)) print_verbose_info(current); -- cgit v1.2.3 From 7310bd1f3eb9445d96b030457d59d9f84375bdd5 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:18 +0100 Subject: kcsan: test: Match reordered or normal accesses Due to reordering accesses with weak memory modeling, any access can now appear as "(reordered)". Match any permutation of accesses if CONFIG_KCSAN_WEAK_MEMORY=y, so that we effectively match an access if it is denoted "(reordered)" or not. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan_test.c | 92 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 63 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 6e3c2b8bc608..ec054879201b 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -151,7 +151,7 @@ struct expect_report { /* Check observed report matches information in @r. */ __no_kcsan -static bool report_matches(const struct expect_report *r) +static bool __report_matches(const struct expect_report *r) { const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT; bool ret = false; @@ -253,6 +253,40 @@ out: return ret; } +static __always_inline const struct expect_report * +__report_set_scoped(struct expect_report *r, int accesses) +{ + BUILD_BUG_ON(accesses > 3); + + if (accesses & 1) + r->access[0].type |= KCSAN_ACCESS_SCOPED; + else + r->access[0].type &= ~KCSAN_ACCESS_SCOPED; + + if (accesses & 2) + r->access[1].type |= KCSAN_ACCESS_SCOPED; + else + r->access[1].type &= ~KCSAN_ACCESS_SCOPED; + + return r; +} + +__no_kcsan +static bool report_matches_any_reordered(struct expect_report *r) +{ + return __report_matches(__report_set_scoped(r, 0)) || + __report_matches(__report_set_scoped(r, 1)) || + __report_matches(__report_set_scoped(r, 2)) || + __report_matches(__report_set_scoped(r, 3)); +} + +#ifdef CONFIG_KCSAN_WEAK_MEMORY +/* Due to reordering accesses, any access may appear as "(reordered)". */ +#define report_matches report_matches_any_reordered +#else +#define report_matches __report_matches +#endif + /* ===== Test kernels ===== */ static long test_sink; @@ -438,13 +472,13 @@ static noinline void test_kernel_xor_1bit(void) __no_kcsan static void test_basic(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, { test_kernel_read, &test_var, sizeof(test_var), 0 }, }, }; - static const struct expect_report never = { + struct expect_report never = { .access = { { test_kernel_read, &test_var, sizeof(test_var), 0 }, { test_kernel_read, &test_var, sizeof(test_var), 0 }, @@ -469,14 +503,14 @@ static void test_basic(struct kunit *test) __no_kcsan static void test_concurrent_races(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { /* NULL will match any address. */ { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) }, }, }; - static const struct expect_report never = { + struct expect_report never = { .access = { { test_kernel_rmw_array, NULL, 0, 0 }, { test_kernel_rmw_array, NULL, 0, 0 }, @@ -498,13 +532,13 @@ static void test_concurrent_races(struct kunit *test) __no_kcsan static void test_novalue_change(struct kunit *test) { - const struct expect_report expect_rw = { + struct expect_report expect_rw = { .access = { { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, { test_kernel_read, &test_var, sizeof(test_var), 0 }, }, }; - const struct expect_report expect_ww = { + struct expect_report expect_ww = { .access = { { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, @@ -530,13 +564,13 @@ static void test_novalue_change(struct kunit *test) __no_kcsan static void test_novalue_change_exception(struct kunit *test) { - const struct expect_report expect_rw = { + struct expect_report expect_rw = { .access = { { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, { test_kernel_read, &test_var, sizeof(test_var), 0 }, }, }; - const struct expect_report expect_ww = { + struct expect_report expect_ww = { .access = { { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, @@ -556,7 +590,7 @@ static void test_novalue_change_exception(struct kunit *test) __no_kcsan static void test_unknown_origin(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_read, &test_var, sizeof(test_var), 0 }, { NULL }, @@ -578,7 +612,7 @@ static void test_unknown_origin(struct kunit *test) __no_kcsan static void test_write_write_assume_atomic(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, { test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, @@ -604,7 +638,7 @@ static void test_write_write_assume_atomic(struct kunit *test) __no_kcsan static void test_write_write_struct(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, @@ -626,7 +660,7 @@ static void test_write_write_struct(struct kunit *test) __no_kcsan static void test_write_write_struct_part(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, { test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE }, @@ -658,7 +692,7 @@ static void test_read_atomic_write_atomic(struct kunit *test) __no_kcsan static void test_read_plain_atomic_write(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_read, &test_var, sizeof(test_var), 0 }, { test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC }, @@ -679,7 +713,7 @@ static void test_read_plain_atomic_write(struct kunit *test) __no_kcsan static void test_read_plain_atomic_rmw(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_read, &test_var, sizeof(test_var), 0 }, { test_kernel_atomic_rmw, &test_var, sizeof(test_var), @@ -701,13 +735,13 @@ static void test_read_plain_atomic_rmw(struct kunit *test) __no_kcsan static void test_zero_size_access(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, }, }; - const struct expect_report never = { + struct expect_report never = { .access = { { test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE }, { test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 }, @@ -741,7 +775,7 @@ static void test_data_race(struct kunit *test) __no_kcsan static void test_assert_exclusive_writer(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, @@ -759,7 +793,7 @@ static void test_assert_exclusive_writer(struct kunit *test) __no_kcsan static void test_assert_exclusive_access(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, { test_kernel_read, &test_var, sizeof(test_var), 0 }, @@ -777,19 +811,19 @@ static void test_assert_exclusive_access(struct kunit *test) __no_kcsan static void test_assert_exclusive_access_writer(struct kunit *test) { - const struct expect_report expect_access_writer = { + struct expect_report expect_access_writer = { .access = { { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, }, }; - const struct expect_report expect_access_access = { + struct expect_report expect_access_access = { .access = { { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, { test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE }, }, }; - const struct expect_report never = { + struct expect_report never = { .access = { { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, { test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, @@ -813,7 +847,7 @@ static void test_assert_exclusive_access_writer(struct kunit *test) __no_kcsan static void test_assert_exclusive_bits_change(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT }, { test_kernel_change_bits, &test_var, sizeof(test_var), @@ -844,13 +878,13 @@ static void test_assert_exclusive_bits_nochange(struct kunit *test) __no_kcsan static void test_assert_exclusive_writer_scoped(struct kunit *test) { - const struct expect_report expect_start = { + struct expect_report expect_start = { .access = { { test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, }, }; - const struct expect_report expect_inscope = { + struct expect_report expect_inscope = { .access = { { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED }, { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE }, @@ -871,16 +905,16 @@ static void test_assert_exclusive_writer_scoped(struct kunit *test) __no_kcsan static void test_assert_exclusive_access_scoped(struct kunit *test) { - const struct expect_report expect_start1 = { + struct expect_report expect_start1 = { .access = { { test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, { test_kernel_read, &test_var, sizeof(test_var), 0 }, }, }; - const struct expect_report expect_start2 = { + struct expect_report expect_start2 = { .access = { expect_start1.access[0], expect_start1.access[0] }, }; - const struct expect_report expect_inscope = { + struct expect_report expect_inscope = { .access = { { test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED }, { test_kernel_read, &test_var, sizeof(test_var), 0 }, @@ -985,7 +1019,7 @@ static void test_atomic_builtins(struct kunit *test) __no_kcsan static void test_1bit_value_change(struct kunit *test) { - const struct expect_report expect = { + struct expect_report expect = { .access = { { test_kernel_read, &test_var, sizeof(test_var), 0 }, { test_kernel_xor_1bit, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, -- cgit v1.2.3 From 8bc32b34817862c80a8b9a82ceb643294acd3da3 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:19 +0100 Subject: kcsan: test: Add test cases for memory barrier instrumentation Adds test cases to check that memory barriers are instrumented correctly, and detection of missing memory barriers is working as intended if CONFIG_KCSAN_STRICT=y. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan_test.c | 319 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 319 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index ec054879201b..5bf94550bcdf 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -16,9 +16,12 @@ #define pr_fmt(fmt) "kcsan_test: " fmt #include +#include +#include #include #include #include +#include #include #include #include @@ -305,6 +308,16 @@ static DEFINE_SEQLOCK(test_seqlock); __no_kcsan static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); } +/* + * Generates a delay and some accesses that enter the runtime but do not produce + * data races. + */ +static noinline void test_delay(int iter) +{ + while (iter--) + sink_value(READ_ONCE(test_sink)); +} + static noinline void test_kernel_read(void) { sink_value(test_var); } static noinline void test_kernel_write(void) @@ -466,8 +479,219 @@ static noinline void test_kernel_xor_1bit(void) kcsan_nestable_atomic_end(); } +#define TEST_KERNEL_LOCKED(name, acquire, release) \ + static noinline void test_kernel_##name(void) \ + { \ + long *flag = &test_struct.val[0]; \ + long v = 0; \ + if (!(acquire)) \ + return; \ + while (v++ < 100) { \ + test_var++; \ + barrier(); \ + } \ + release; \ + test_delay(10); \ + } + +TEST_KERNEL_LOCKED(with_memorder, + cmpxchg_acquire(flag, 0, 1) == 0, + smp_store_release(flag, 0)); +TEST_KERNEL_LOCKED(wrong_memorder, + cmpxchg_relaxed(flag, 0, 1) == 0, + WRITE_ONCE(*flag, 0)); +TEST_KERNEL_LOCKED(atomic_builtin_with_memorder, + __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED), + __atomic_store_n(flag, 0, __ATOMIC_RELEASE)); +TEST_KERNEL_LOCKED(atomic_builtin_wrong_memorder, + __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED), + __atomic_store_n(flag, 0, __ATOMIC_RELAXED)); + /* ===== Test cases ===== */ +/* + * Tests that various barriers have the expected effect on internal state. Not + * exhaustive on atomic_t operations. Unlike the selftest, also checks for + * too-strict barrier instrumentation; these can be tolerated, because it does + * not cause false positives, but at least we should be aware of such cases. + */ +static void test_barrier_nothreads(struct kunit *test) +{ +#ifdef CONFIG_KCSAN_WEAK_MEMORY + struct kcsan_scoped_access *reorder_access = ¤t->kcsan_ctx.reorder_access; +#else + struct kcsan_scoped_access *reorder_access = NULL; +#endif + arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED; + DEFINE_SPINLOCK(spinlock); + DEFINE_MUTEX(mutex); + atomic_t dummy; + + KCSAN_TEST_REQUIRES(test, reorder_access != NULL); + KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP)); + +#define __KCSAN_EXPECT_BARRIER(access_type, barrier, order_before, name) \ + do { \ + reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \ + reorder_access->size = sizeof(test_var); \ + barrier; \ + KUNIT_EXPECT_EQ_MSG(test, reorder_access->size, \ + order_before ? 0 : sizeof(test_var), \ + "improperly instrumented type=(" #access_type "): " name); \ + } while (0) +#define KCSAN_EXPECT_READ_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(0, b, o, #b) +#define KCSAN_EXPECT_WRITE_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_WRITE, b, o, #b) +#define KCSAN_EXPECT_RW_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, b, o, #b) + + /* Force creating a valid entry in reorder_access first. */ + test_var = 0; + while (test_var++ < 1000000 && reorder_access->size != sizeof(test_var)) + __kcsan_check_read(&test_var, sizeof(test_var)); + KUNIT_ASSERT_EQ(test, reorder_access->size, sizeof(test_var)); + + kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */ + + KCSAN_EXPECT_READ_BARRIER(mb(), true); + KCSAN_EXPECT_READ_BARRIER(wmb(), false); + KCSAN_EXPECT_READ_BARRIER(rmb(), true); + KCSAN_EXPECT_READ_BARRIER(smp_mb(), true); + KCSAN_EXPECT_READ_BARRIER(smp_wmb(), false); + KCSAN_EXPECT_READ_BARRIER(smp_rmb(), true); + KCSAN_EXPECT_READ_BARRIER(dma_wmb(), false); + KCSAN_EXPECT_READ_BARRIER(dma_rmb(), true); + KCSAN_EXPECT_READ_BARRIER(smp_mb__before_atomic(), true); + KCSAN_EXPECT_READ_BARRIER(smp_mb__after_atomic(), true); + KCSAN_EXPECT_READ_BARRIER(smp_mb__after_spinlock(), true); + KCSAN_EXPECT_READ_BARRIER(smp_store_mb(test_var, 0), true); + KCSAN_EXPECT_READ_BARRIER(smp_load_acquire(&test_var), false); + KCSAN_EXPECT_READ_BARRIER(smp_store_release(&test_var, 0), true); + KCSAN_EXPECT_READ_BARRIER(xchg(&test_var, 0), true); + KCSAN_EXPECT_READ_BARRIER(xchg_release(&test_var, 0), true); + KCSAN_EXPECT_READ_BARRIER(xchg_relaxed(&test_var, 0), false); + KCSAN_EXPECT_READ_BARRIER(cmpxchg(&test_var, 0, 0), true); + KCSAN_EXPECT_READ_BARRIER(cmpxchg_release(&test_var, 0, 0), true); + KCSAN_EXPECT_READ_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false); + KCSAN_EXPECT_READ_BARRIER(atomic_read(&dummy), false); + KCSAN_EXPECT_READ_BARRIER(atomic_read_acquire(&dummy), false); + KCSAN_EXPECT_READ_BARRIER(atomic_set(&dummy, 0), false); + KCSAN_EXPECT_READ_BARRIER(atomic_set_release(&dummy, 0), true); + KCSAN_EXPECT_READ_BARRIER(atomic_add(1, &dummy), false); + KCSAN_EXPECT_READ_BARRIER(atomic_add_return(1, &dummy), true); + KCSAN_EXPECT_READ_BARRIER(atomic_add_return_acquire(1, &dummy), false); + KCSAN_EXPECT_READ_BARRIER(atomic_add_return_release(1, &dummy), true); + KCSAN_EXPECT_READ_BARRIER(atomic_add_return_relaxed(1, &dummy), false); + KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add(1, &dummy), true); + KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_acquire(1, &dummy), false); + KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_release(1, &dummy), true); + KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false); + KCSAN_EXPECT_READ_BARRIER(test_and_set_bit(0, &test_var), true); + KCSAN_EXPECT_READ_BARRIER(test_and_clear_bit(0, &test_var), true); + KCSAN_EXPECT_READ_BARRIER(test_and_change_bit(0, &test_var), true); + KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock(0, &test_var), true); + KCSAN_EXPECT_READ_BARRIER(__clear_bit_unlock(0, &test_var), true); + KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); + KCSAN_EXPECT_READ_BARRIER(arch_spin_lock(&arch_spinlock), false); + KCSAN_EXPECT_READ_BARRIER(arch_spin_unlock(&arch_spinlock), true); + KCSAN_EXPECT_READ_BARRIER(spin_lock(&spinlock), false); + KCSAN_EXPECT_READ_BARRIER(spin_unlock(&spinlock), true); + KCSAN_EXPECT_READ_BARRIER(mutex_lock(&mutex), false); + KCSAN_EXPECT_READ_BARRIER(mutex_unlock(&mutex), true); + + KCSAN_EXPECT_WRITE_BARRIER(mb(), true); + KCSAN_EXPECT_WRITE_BARRIER(wmb(), true); + KCSAN_EXPECT_WRITE_BARRIER(rmb(), false); + KCSAN_EXPECT_WRITE_BARRIER(smp_mb(), true); + KCSAN_EXPECT_WRITE_BARRIER(smp_wmb(), true); + KCSAN_EXPECT_WRITE_BARRIER(smp_rmb(), false); + KCSAN_EXPECT_WRITE_BARRIER(dma_wmb(), true); + KCSAN_EXPECT_WRITE_BARRIER(dma_rmb(), false); + KCSAN_EXPECT_WRITE_BARRIER(smp_mb__before_atomic(), true); + KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_atomic(), true); + KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_spinlock(), true); + KCSAN_EXPECT_WRITE_BARRIER(smp_store_mb(test_var, 0), true); + KCSAN_EXPECT_WRITE_BARRIER(smp_load_acquire(&test_var), false); + KCSAN_EXPECT_WRITE_BARRIER(smp_store_release(&test_var, 0), true); + KCSAN_EXPECT_WRITE_BARRIER(xchg(&test_var, 0), true); + KCSAN_EXPECT_WRITE_BARRIER(xchg_release(&test_var, 0), true); + KCSAN_EXPECT_WRITE_BARRIER(xchg_relaxed(&test_var, 0), false); + KCSAN_EXPECT_WRITE_BARRIER(cmpxchg(&test_var, 0, 0), true); + KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0), true); + KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_read(&dummy), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_read_acquire(&dummy), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_set(&dummy, 0), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_set_release(&dummy, 0), true); + KCSAN_EXPECT_WRITE_BARRIER(atomic_add(1, &dummy), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return(1, &dummy), true); + KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_acquire(1, &dummy), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_release(1, &dummy), true); + KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_relaxed(1, &dummy), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add(1, &dummy), true); + KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_acquire(1, &dummy), false); + KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy), true); + KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false); + KCSAN_EXPECT_WRITE_BARRIER(test_and_set_bit(0, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(test_and_clear_bit(0, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(test_and_change_bit(0, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock(0, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(__clear_bit_unlock(0, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(arch_spin_lock(&arch_spinlock), false); + KCSAN_EXPECT_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock), true); + KCSAN_EXPECT_WRITE_BARRIER(spin_lock(&spinlock), false); + KCSAN_EXPECT_WRITE_BARRIER(spin_unlock(&spinlock), true); + KCSAN_EXPECT_WRITE_BARRIER(mutex_lock(&mutex), false); + KCSAN_EXPECT_WRITE_BARRIER(mutex_unlock(&mutex), true); + + KCSAN_EXPECT_RW_BARRIER(mb(), true); + KCSAN_EXPECT_RW_BARRIER(wmb(), true); + KCSAN_EXPECT_RW_BARRIER(rmb(), true); + KCSAN_EXPECT_RW_BARRIER(smp_mb(), true); + KCSAN_EXPECT_RW_BARRIER(smp_wmb(), true); + KCSAN_EXPECT_RW_BARRIER(smp_rmb(), true); + KCSAN_EXPECT_RW_BARRIER(dma_wmb(), true); + KCSAN_EXPECT_RW_BARRIER(dma_rmb(), true); + KCSAN_EXPECT_RW_BARRIER(smp_mb__before_atomic(), true); + KCSAN_EXPECT_RW_BARRIER(smp_mb__after_atomic(), true); + KCSAN_EXPECT_RW_BARRIER(smp_mb__after_spinlock(), true); + KCSAN_EXPECT_RW_BARRIER(smp_store_mb(test_var, 0), true); + KCSAN_EXPECT_RW_BARRIER(smp_load_acquire(&test_var), false); + KCSAN_EXPECT_RW_BARRIER(smp_store_release(&test_var, 0), true); + KCSAN_EXPECT_RW_BARRIER(xchg(&test_var, 0), true); + KCSAN_EXPECT_RW_BARRIER(xchg_release(&test_var, 0), true); + KCSAN_EXPECT_RW_BARRIER(xchg_relaxed(&test_var, 0), false); + KCSAN_EXPECT_RW_BARRIER(cmpxchg(&test_var, 0, 0), true); + KCSAN_EXPECT_RW_BARRIER(cmpxchg_release(&test_var, 0, 0), true); + KCSAN_EXPECT_RW_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false); + KCSAN_EXPECT_RW_BARRIER(atomic_read(&dummy), false); + KCSAN_EXPECT_RW_BARRIER(atomic_read_acquire(&dummy), false); + KCSAN_EXPECT_RW_BARRIER(atomic_set(&dummy, 0), false); + KCSAN_EXPECT_RW_BARRIER(atomic_set_release(&dummy, 0), true); + KCSAN_EXPECT_RW_BARRIER(atomic_add(1, &dummy), false); + KCSAN_EXPECT_RW_BARRIER(atomic_add_return(1, &dummy), true); + KCSAN_EXPECT_RW_BARRIER(atomic_add_return_acquire(1, &dummy), false); + KCSAN_EXPECT_RW_BARRIER(atomic_add_return_release(1, &dummy), true); + KCSAN_EXPECT_RW_BARRIER(atomic_add_return_relaxed(1, &dummy), false); + KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add(1, &dummy), true); + KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_acquire(1, &dummy), false); + KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_release(1, &dummy), true); + KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false); + KCSAN_EXPECT_RW_BARRIER(test_and_set_bit(0, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(test_and_clear_bit(0, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(test_and_change_bit(0, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock(0, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(__clear_bit_unlock(0, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(arch_spin_lock(&arch_spinlock), false); + KCSAN_EXPECT_RW_BARRIER(arch_spin_unlock(&arch_spinlock), true); + KCSAN_EXPECT_RW_BARRIER(spin_lock(&spinlock), false); + KCSAN_EXPECT_RW_BARRIER(spin_unlock(&spinlock), true); + KCSAN_EXPECT_RW_BARRIER(mutex_lock(&mutex), false); + KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&mutex), true); + + kcsan_nestable_atomic_end(); +} + /* Simple test with normal data race. */ __no_kcsan static void test_basic(struct kunit *test) @@ -1039,6 +1263,90 @@ static void test_1bit_value_change(struct kunit *test) KUNIT_EXPECT_TRUE(test, match); } +__no_kcsan +static void test_correct_barrier(struct kunit *test) +{ + struct expect_report expect = { + .access = { + { test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, + { test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) }, + }, + }; + bool match_expect = false; + + test_struct.val[0] = 0; /* init unlocked */ + begin_test_checks(test_kernel_with_memorder, test_kernel_with_memorder); + do { + match_expect = report_matches_any_reordered(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_FALSE(test, match_expect); +} + +__no_kcsan +static void test_missing_barrier(struct kunit *test) +{ + struct expect_report expect = { + .access = { + { test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, + { test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) }, + }, + }; + bool match_expect = false; + + test_struct.val[0] = 0; /* init unlocked */ + begin_test_checks(test_kernel_wrong_memorder, test_kernel_wrong_memorder); + do { + match_expect = report_matches_any_reordered(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY)) + KUNIT_EXPECT_TRUE(test, match_expect); + else + KUNIT_EXPECT_FALSE(test, match_expect); +} + +__no_kcsan +static void test_atomic_builtins_correct_barrier(struct kunit *test) +{ + struct expect_report expect = { + .access = { + { test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, + { test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) }, + }, + }; + bool match_expect = false; + + test_struct.val[0] = 0; /* init unlocked */ + begin_test_checks(test_kernel_atomic_builtin_with_memorder, + test_kernel_atomic_builtin_with_memorder); + do { + match_expect = report_matches_any_reordered(&expect); + } while (!end_test_checks(match_expect)); + KUNIT_EXPECT_FALSE(test, match_expect); +} + +__no_kcsan +static void test_atomic_builtins_missing_barrier(struct kunit *test) +{ + struct expect_report expect = { + .access = { + { test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) }, + { test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) }, + }, + }; + bool match_expect = false; + + test_struct.val[0] = 0; /* init unlocked */ + begin_test_checks(test_kernel_atomic_builtin_wrong_memorder, + test_kernel_atomic_builtin_wrong_memorder); + do { + match_expect = report_matches_any_reordered(&expect); + } while (!end_test_checks(match_expect)); + if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY)) + KUNIT_EXPECT_TRUE(test, match_expect); + else + KUNIT_EXPECT_FALSE(test, match_expect); +} + /* * Generate thread counts for all test cases. Values generated are in interval * [2, 5] followed by exponentially increasing thread counts from 8 to 32. @@ -1088,6 +1396,7 @@ static const void *nthreads_gen_params(const void *prev, char *desc) #define KCSAN_KUNIT_CASE(test_name) KUNIT_CASE_PARAM(test_name, nthreads_gen_params) static struct kunit_case kcsan_test_cases[] = { + KUNIT_CASE(test_barrier_nothreads), KCSAN_KUNIT_CASE(test_basic), KCSAN_KUNIT_CASE(test_concurrent_races), KCSAN_KUNIT_CASE(test_novalue_change), @@ -1112,6 +1421,10 @@ static struct kunit_case kcsan_test_cases[] = { KCSAN_KUNIT_CASE(test_seqlock_noreport), KCSAN_KUNIT_CASE(test_atomic_builtins), KCSAN_KUNIT_CASE(test_1bit_value_change), + KCSAN_KUNIT_CASE(test_correct_barrier), + KCSAN_KUNIT_CASE(test_missing_barrier), + KCSAN_KUNIT_CASE(test_atomic_builtins_correct_barrier), + KCSAN_KUNIT_CASE(test_atomic_builtins_missing_barrier), {}, }; @@ -1176,6 +1489,9 @@ static int test_init(struct kunit *test) observed.nlines = 0; spin_unlock_irqrestore(&observed.lock, flags); + if (strstr(test->name, "nothreads")) + return 0; + if (!torture_init_begin((char *)test->name, 1)) return -EBUSY; @@ -1218,6 +1534,9 @@ static void test_exit(struct kunit *test) struct task_struct **stop_thread; int i; + if (strstr(test->name, "nothreads")) + return; + if (torture_cleanup_begin()) return; -- cgit v1.2.3 From 71b0e3aeb28256712945d99ca67b3f5e3ed7e0b1 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:21 +0100 Subject: kcsan: selftest: Add test case to check memory barrier instrumentation Memory barrier instrumentation is crucial to avoid false positives. To avoid surprises, run a simple test case in the boot-time selftest to ensure memory barriers are still instrumented correctly. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/Makefile | 2 + kernel/kcsan/selftest.c | 141 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index c2bb07f5bcc7..ff47e896de3b 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -11,6 +11,8 @@ CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \ -fno-stack-protector -DDISABLE_BRANCH_PROFILING obj-y := core.o debugfs.o report.o + +KCSAN_INSTRUMENT_BARRIERS_selftest.o := y obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index b4295a3892b7..08c6b84b9ebe 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -7,10 +7,15 @@ #define pr_fmt(fmt) "kcsan: " fmt +#include +#include #include +#include #include #include #include +#include +#include #include #include "encoding.h" @@ -103,6 +108,141 @@ static bool __init test_matching_access(void) return true; } +/* + * Correct memory barrier instrumentation is critical to avoiding false + * positives: simple test to check at boot certain barriers are always properly + * instrumented. See kcsan_test for a more complete test. + */ +static bool __init test_barrier(void) +{ +#ifdef CONFIG_KCSAN_WEAK_MEMORY + struct kcsan_scoped_access *reorder_access = ¤t->kcsan_ctx.reorder_access; +#else + struct kcsan_scoped_access *reorder_access = NULL; +#endif + bool ret = true; + arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED; + DEFINE_SPINLOCK(spinlock); + atomic_t dummy; + long test_var; + + if (!reorder_access || !IS_ENABLED(CONFIG_SMP)) + return true; + +#define __KCSAN_CHECK_BARRIER(access_type, barrier, name) \ + do { \ + reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \ + reorder_access->size = 1; \ + barrier; \ + if (reorder_access->size != 0) { \ + pr_err("improperly instrumented type=(" #access_type "): " name "\n"); \ + ret = false; \ + } \ + } while (0) +#define KCSAN_CHECK_READ_BARRIER(b) __KCSAN_CHECK_BARRIER(0, b, #b) +#define KCSAN_CHECK_WRITE_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE, b, #b) +#define KCSAN_CHECK_RW_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND, b, #b) + + kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */ + + KCSAN_CHECK_READ_BARRIER(mb()); + KCSAN_CHECK_READ_BARRIER(rmb()); + KCSAN_CHECK_READ_BARRIER(smp_mb()); + KCSAN_CHECK_READ_BARRIER(smp_rmb()); + KCSAN_CHECK_READ_BARRIER(dma_rmb()); + KCSAN_CHECK_READ_BARRIER(smp_mb__before_atomic()); + KCSAN_CHECK_READ_BARRIER(smp_mb__after_atomic()); + KCSAN_CHECK_READ_BARRIER(smp_mb__after_spinlock()); + KCSAN_CHECK_READ_BARRIER(smp_store_mb(test_var, 0)); + KCSAN_CHECK_READ_BARRIER(smp_store_release(&test_var, 0)); + KCSAN_CHECK_READ_BARRIER(xchg(&test_var, 0)); + KCSAN_CHECK_READ_BARRIER(xchg_release(&test_var, 0)); + KCSAN_CHECK_READ_BARRIER(cmpxchg(&test_var, 0, 0)); + KCSAN_CHECK_READ_BARRIER(cmpxchg_release(&test_var, 0, 0)); + KCSAN_CHECK_READ_BARRIER(atomic_set_release(&dummy, 0)); + KCSAN_CHECK_READ_BARRIER(atomic_add_return(1, &dummy)); + KCSAN_CHECK_READ_BARRIER(atomic_add_return_release(1, &dummy)); + KCSAN_CHECK_READ_BARRIER(atomic_fetch_add(1, &dummy)); + KCSAN_CHECK_READ_BARRIER(atomic_fetch_add_release(1, &dummy)); + KCSAN_CHECK_READ_BARRIER(test_and_set_bit(0, &test_var)); + KCSAN_CHECK_READ_BARRIER(test_and_clear_bit(0, &test_var)); + KCSAN_CHECK_READ_BARRIER(test_and_change_bit(0, &test_var)); + KCSAN_CHECK_READ_BARRIER(clear_bit_unlock(0, &test_var)); + KCSAN_CHECK_READ_BARRIER(__clear_bit_unlock(0, &test_var)); + KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); + arch_spin_lock(&arch_spinlock); + KCSAN_CHECK_READ_BARRIER(arch_spin_unlock(&arch_spinlock)); + spin_lock(&spinlock); + KCSAN_CHECK_READ_BARRIER(spin_unlock(&spinlock)); + + KCSAN_CHECK_WRITE_BARRIER(mb()); + KCSAN_CHECK_WRITE_BARRIER(wmb()); + KCSAN_CHECK_WRITE_BARRIER(smp_mb()); + KCSAN_CHECK_WRITE_BARRIER(smp_wmb()); + KCSAN_CHECK_WRITE_BARRIER(dma_wmb()); + KCSAN_CHECK_WRITE_BARRIER(smp_mb__before_atomic()); + KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_atomic()); + KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_spinlock()); + KCSAN_CHECK_WRITE_BARRIER(smp_store_mb(test_var, 0)); + KCSAN_CHECK_WRITE_BARRIER(smp_store_release(&test_var, 0)); + KCSAN_CHECK_WRITE_BARRIER(xchg(&test_var, 0)); + KCSAN_CHECK_WRITE_BARRIER(xchg_release(&test_var, 0)); + KCSAN_CHECK_WRITE_BARRIER(cmpxchg(&test_var, 0, 0)); + KCSAN_CHECK_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0)); + KCSAN_CHECK_WRITE_BARRIER(atomic_set_release(&dummy, 0)); + KCSAN_CHECK_WRITE_BARRIER(atomic_add_return(1, &dummy)); + KCSAN_CHECK_WRITE_BARRIER(atomic_add_return_release(1, &dummy)); + KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add(1, &dummy)); + KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy)); + KCSAN_CHECK_WRITE_BARRIER(test_and_set_bit(0, &test_var)); + KCSAN_CHECK_WRITE_BARRIER(test_and_clear_bit(0, &test_var)); + KCSAN_CHECK_WRITE_BARRIER(test_and_change_bit(0, &test_var)); + KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock(0, &test_var)); + KCSAN_CHECK_WRITE_BARRIER(__clear_bit_unlock(0, &test_var)); + KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); + arch_spin_lock(&arch_spinlock); + KCSAN_CHECK_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock)); + spin_lock(&spinlock); + KCSAN_CHECK_WRITE_BARRIER(spin_unlock(&spinlock)); + + KCSAN_CHECK_RW_BARRIER(mb()); + KCSAN_CHECK_RW_BARRIER(wmb()); + KCSAN_CHECK_RW_BARRIER(rmb()); + KCSAN_CHECK_RW_BARRIER(smp_mb()); + KCSAN_CHECK_RW_BARRIER(smp_wmb()); + KCSAN_CHECK_RW_BARRIER(smp_rmb()); + KCSAN_CHECK_RW_BARRIER(dma_wmb()); + KCSAN_CHECK_RW_BARRIER(dma_rmb()); + KCSAN_CHECK_RW_BARRIER(smp_mb__before_atomic()); + KCSAN_CHECK_RW_BARRIER(smp_mb__after_atomic()); + KCSAN_CHECK_RW_BARRIER(smp_mb__after_spinlock()); + KCSAN_CHECK_RW_BARRIER(smp_store_mb(test_var, 0)); + KCSAN_CHECK_RW_BARRIER(smp_store_release(&test_var, 0)); + KCSAN_CHECK_RW_BARRIER(xchg(&test_var, 0)); + KCSAN_CHECK_RW_BARRIER(xchg_release(&test_var, 0)); + KCSAN_CHECK_RW_BARRIER(cmpxchg(&test_var, 0, 0)); + KCSAN_CHECK_RW_BARRIER(cmpxchg_release(&test_var, 0, 0)); + KCSAN_CHECK_RW_BARRIER(atomic_set_release(&dummy, 0)); + KCSAN_CHECK_RW_BARRIER(atomic_add_return(1, &dummy)); + KCSAN_CHECK_RW_BARRIER(atomic_add_return_release(1, &dummy)); + KCSAN_CHECK_RW_BARRIER(atomic_fetch_add(1, &dummy)); + KCSAN_CHECK_RW_BARRIER(atomic_fetch_add_release(1, &dummy)); + KCSAN_CHECK_RW_BARRIER(test_and_set_bit(0, &test_var)); + KCSAN_CHECK_RW_BARRIER(test_and_clear_bit(0, &test_var)); + KCSAN_CHECK_RW_BARRIER(test_and_change_bit(0, &test_var)); + KCSAN_CHECK_RW_BARRIER(clear_bit_unlock(0, &test_var)); + KCSAN_CHECK_RW_BARRIER(__clear_bit_unlock(0, &test_var)); + KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); + arch_spin_lock(&arch_spinlock); + KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock)); + spin_lock(&spinlock); + KCSAN_CHECK_RW_BARRIER(spin_unlock(&spinlock)); + + kcsan_nestable_atomic_end(); + + return ret; +} + static int __init kcsan_selftest(void) { int passed = 0; @@ -120,6 +260,7 @@ static int __init kcsan_selftest(void) RUN_TEST(test_requires); RUN_TEST(test_encode_decode); RUN_TEST(test_matching_access); + RUN_TEST(test_barrier); pr_info("selftest: %d/%d tests passed\n", passed, total); if (passed != total) -- cgit v1.2.3 From 6f3f0c98b5665f437a631882653b5baf7dd98448 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 30 Nov 2021 12:44:29 +0100 Subject: sched, kcsan: Enable memory barrier instrumentation There's no fundamental reason to disable KCSAN for scheduler code, except for excessive noise and performance concerns (instrumenting scheduler code is usually a good way to stress test KCSAN itself). However, several core sched functions imply memory barriers that are invisible to KCSAN without instrumentation, but are required to avoid false positives. Therefore, unconditionally enable instrumentation of memory barriers in scheduler code. Also update the comment to reflect this and be a bit more brief. Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/sched/Makefile | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index c7421f2d05e1..c83b37af155b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -11,11 +11,10 @@ ccflags-y += $(call cc-disable-warning, unused-but-set-variable) # that is not a function of syscall inputs. E.g. involuntary context switches. KCOV_INSTRUMENT := n -# There are numerous data races here, however, most of them are due to plain accesses. -# This would make it even harder for syzbot to find reproducers, because these -# bugs trigger without specific input. Disable by default, but should re-enable -# eventually. +# Disable KCSAN to avoid excessive noise and performance degradation. To avoid +# false positives ensure barriers implied by sched functions are instrumented. KCSAN_SANITIZE := n +KCSAN_INSTRUMENT_BARRIERS := y ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is -- cgit v1.2.3 From a70d36e6a0bd867ef42dce8ef46eb9b5a1515fb0 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Sat, 4 Dec 2021 00:38:17 +0100 Subject: kcsan: Make barrier tests compatible with lockdep The barrier tests in selftest and the kcsan_test module only need the spinlock and mutex to test correct barrier instrumentation. Therefore, these were initially placed on the stack. However, lockdep asserts that locks are in static storage, and will generate this warning: | INFO: trying to register non-static key. | The code is fine but needs lockdep annotation, or maybe | you didn't initialize this object before use? | turning off the locking correctness validator. | CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.16.0-rc1+ #3208 | Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-1ubuntu1.1 04/01/2014 | Call Trace: | | dump_stack_lvl+0x88/0xd8 | dump_stack+0x15/0x1b | register_lock_class+0x6b3/0x840 | ... | test_barrier+0x490/0x14c7 | kcsan_selftest+0x47/0xa0 | ... To fix, move the test locks into static storage. Fixing the above also revealed that lock operations are strengthened on first use with lockdep enabled, due to lockdep calling out into non-instrumented files (recall that kernel/locking/lockdep.c is not instrumented with KCSAN). Only kcsan_test checks for over-instrumentation of *_lock() operations, where we can simply "warm up" the test locks to avoid the test case failing with lockdep. Reported-by: Paul E. McKenney Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan_test.c | 37 +++++++++++++++++++++++-------------- kernel/kcsan/selftest.c | 14 +++++++------- 2 files changed, 30 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 5bf94550bcdf..2bad0820f73a 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -300,6 +300,8 @@ static struct { long val[8]; } test_struct; static DEFINE_SEQLOCK(test_seqlock); +static DEFINE_SPINLOCK(test_spinlock); +static DEFINE_MUTEX(test_mutex); /* * Helper to avoid compiler optimizing out reads, and to generate source values @@ -523,8 +525,6 @@ static void test_barrier_nothreads(struct kunit *test) struct kcsan_scoped_access *reorder_access = NULL; #endif arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED; - DEFINE_SPINLOCK(spinlock); - DEFINE_MUTEX(mutex); atomic_t dummy; KCSAN_TEST_REQUIRES(test, reorder_access != NULL); @@ -543,6 +543,15 @@ static void test_barrier_nothreads(struct kunit *test) #define KCSAN_EXPECT_WRITE_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_WRITE, b, o, #b) #define KCSAN_EXPECT_RW_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, b, o, #b) + /* + * Lockdep initialization can strengthen certain locking operations due + * to calling into instrumented files; "warm up" our locks. + */ + spin_lock(&test_spinlock); + spin_unlock(&test_spinlock); + mutex_lock(&test_mutex); + mutex_unlock(&test_mutex); + /* Force creating a valid entry in reorder_access first. */ test_var = 0; while (test_var++ < 1000000 && reorder_access->size != sizeof(test_var)) @@ -592,10 +601,10 @@ static void test_barrier_nothreads(struct kunit *test) KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); KCSAN_EXPECT_READ_BARRIER(arch_spin_lock(&arch_spinlock), false); KCSAN_EXPECT_READ_BARRIER(arch_spin_unlock(&arch_spinlock), true); - KCSAN_EXPECT_READ_BARRIER(spin_lock(&spinlock), false); - KCSAN_EXPECT_READ_BARRIER(spin_unlock(&spinlock), true); - KCSAN_EXPECT_READ_BARRIER(mutex_lock(&mutex), false); - KCSAN_EXPECT_READ_BARRIER(mutex_unlock(&mutex), true); + KCSAN_EXPECT_READ_BARRIER(spin_lock(&test_spinlock), false); + KCSAN_EXPECT_READ_BARRIER(spin_unlock(&test_spinlock), true); + KCSAN_EXPECT_READ_BARRIER(mutex_lock(&test_mutex), false); + KCSAN_EXPECT_READ_BARRIER(mutex_unlock(&test_mutex), true); KCSAN_EXPECT_WRITE_BARRIER(mb(), true); KCSAN_EXPECT_WRITE_BARRIER(wmb(), true); @@ -638,10 +647,10 @@ static void test_barrier_nothreads(struct kunit *test) KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); KCSAN_EXPECT_WRITE_BARRIER(arch_spin_lock(&arch_spinlock), false); KCSAN_EXPECT_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock), true); - KCSAN_EXPECT_WRITE_BARRIER(spin_lock(&spinlock), false); - KCSAN_EXPECT_WRITE_BARRIER(spin_unlock(&spinlock), true); - KCSAN_EXPECT_WRITE_BARRIER(mutex_lock(&mutex), false); - KCSAN_EXPECT_WRITE_BARRIER(mutex_unlock(&mutex), true); + KCSAN_EXPECT_WRITE_BARRIER(spin_lock(&test_spinlock), false); + KCSAN_EXPECT_WRITE_BARRIER(spin_unlock(&test_spinlock), true); + KCSAN_EXPECT_WRITE_BARRIER(mutex_lock(&test_mutex), false); + KCSAN_EXPECT_WRITE_BARRIER(mutex_unlock(&test_mutex), true); KCSAN_EXPECT_RW_BARRIER(mb(), true); KCSAN_EXPECT_RW_BARRIER(wmb(), true); @@ -684,10 +693,10 @@ static void test_barrier_nothreads(struct kunit *test) KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); KCSAN_EXPECT_RW_BARRIER(arch_spin_lock(&arch_spinlock), false); KCSAN_EXPECT_RW_BARRIER(arch_spin_unlock(&arch_spinlock), true); - KCSAN_EXPECT_RW_BARRIER(spin_lock(&spinlock), false); - KCSAN_EXPECT_RW_BARRIER(spin_unlock(&spinlock), true); - KCSAN_EXPECT_RW_BARRIER(mutex_lock(&mutex), false); - KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&mutex), true); + KCSAN_EXPECT_RW_BARRIER(spin_lock(&test_spinlock), false); + KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true); + KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false); + KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true); kcsan_nestable_atomic_end(); } diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index 08c6b84b9ebe..b6d4da07d80a 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -113,6 +113,7 @@ static bool __init test_matching_access(void) * positives: simple test to check at boot certain barriers are always properly * instrumented. See kcsan_test for a more complete test. */ +static DEFINE_SPINLOCK(test_spinlock); static bool __init test_barrier(void) { #ifdef CONFIG_KCSAN_WEAK_MEMORY @@ -122,7 +123,6 @@ static bool __init test_barrier(void) #endif bool ret = true; arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED; - DEFINE_SPINLOCK(spinlock); atomic_t dummy; long test_var; @@ -172,8 +172,8 @@ static bool __init test_barrier(void) KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); arch_spin_lock(&arch_spinlock); KCSAN_CHECK_READ_BARRIER(arch_spin_unlock(&arch_spinlock)); - spin_lock(&spinlock); - KCSAN_CHECK_READ_BARRIER(spin_unlock(&spinlock)); + spin_lock(&test_spinlock); + KCSAN_CHECK_READ_BARRIER(spin_unlock(&test_spinlock)); KCSAN_CHECK_WRITE_BARRIER(mb()); KCSAN_CHECK_WRITE_BARRIER(wmb()); @@ -202,8 +202,8 @@ static bool __init test_barrier(void) KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); arch_spin_lock(&arch_spinlock); KCSAN_CHECK_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock)); - spin_lock(&spinlock); - KCSAN_CHECK_WRITE_BARRIER(spin_unlock(&spinlock)); + spin_lock(&test_spinlock); + KCSAN_CHECK_WRITE_BARRIER(spin_unlock(&test_spinlock)); KCSAN_CHECK_RW_BARRIER(mb()); KCSAN_CHECK_RW_BARRIER(wmb()); @@ -235,8 +235,8 @@ static bool __init test_barrier(void) KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); arch_spin_lock(&arch_spinlock); KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock)); - spin_lock(&spinlock); - KCSAN_CHECK_RW_BARRIER(spin_unlock(&spinlock)); + spin_lock(&test_spinlock); + KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock)); kcsan_nestable_atomic_end(); -- cgit v1.2.3 From e3d2b72bbf3c580b0c5c96385777c2f483a45ab5 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 6 Dec 2021 07:41:50 +0100 Subject: kcsan: Avoid nested contexts reading inconsistent reorder_access Nested contexts, such as nested interrupts or scheduler code, share the same kcsan_ctx. When such a nested context reads an inconsistent reorder_access due to an interrupt during set_reorder_access(), we can observe the following warning: | ------------[ cut here ]------------ | Cannot find frame for torture_random kernel/torture.c:456 in stack trace | WARNING: CPU: 13 PID: 147 at kernel/kcsan/report.c:343 replace_stack_entry kernel/kcsan/report.c:343 | ... | Call Trace: | | sanitize_stack_entries kernel/kcsan/report.c:351 [inline] | print_report kernel/kcsan/report.c:409 | kcsan_report_known_origin kernel/kcsan/report.c:693 | kcsan_setup_watchpoint kernel/kcsan/core.c:658 | rcutorture_one_extend kernel/rcu/rcutorture.c:1475 | rcutorture_loop_extend kernel/rcu/rcutorture.c:1558 [inline] | ... | | ---[ end trace ee5299cb933115f5 ]--- | ================================================================== | BUG: KCSAN: data-race in _raw_spin_lock_irqsave / rcutorture_one_extend | | write (reordered) to 0xffffffff8c93b300 of 8 bytes by task 154 on cpu 12: | queued_spin_lock include/asm-generic/qspinlock.h:80 [inline] | do_raw_spin_lock include/linux/spinlock.h:185 [inline] | __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:111 [inline] | _raw_spin_lock_irqsave kernel/locking/spinlock.c:162 | try_to_wake_up kernel/sched/core.c:4003 | sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1097 | asm_sysvec_apic_timer_interrupt arch/x86/include/asm/idtentry.h:638 | set_reorder_access kernel/kcsan/core.c:416 [inline] <-- inconsistent reorder_access | kcsan_setup_watchpoint kernel/kcsan/core.c:693 | rcutorture_one_extend kernel/rcu/rcutorture.c:1475 | rcutorture_loop_extend kernel/rcu/rcutorture.c:1558 [inline] | rcu_torture_one_read kernel/rcu/rcutorture.c:1600 | rcu_torture_reader kernel/rcu/rcutorture.c:1692 | kthread kernel/kthread.c:327 | ret_from_fork arch/x86/entry/entry_64.S:295 | | read to 0xffffffff8c93b300 of 8 bytes by task 147 on cpu 13: | rcutorture_one_extend kernel/rcu/rcutorture.c:1475 | rcutorture_loop_extend kernel/rcu/rcutorture.c:1558 [inline] | ... The warning is telling us that there was a data race which KCSAN wants to report, but the function where the original access (that is now reordered) happened cannot be found in the stack trace, which prevents KCSAN from generating the right stack trace. The stack trace of "write (reordered)" now only shows where the access was reordered to, but should instead show the stack trace of the original write, with a final line saying "reordered to". At the point where set_reorder_access() is interrupted, it just set reorder_access->ptr and size, at which point size is non-zero. This is sufficient (if ctx->disable_scoped is zero) for further accesses from nested contexts to perform checking of this reorder_access. That then happened in _raw_spin_lock_irqsave(), which is called by scheduler code. However, since reorder_access->ip is still stale (ptr and size belong to a different ip not yet set) this finally leads to replace_stack_entry() not finding the frame in reorder_access->ip and generating the above warning. Fix it by ensuring that a nested context cannot access reorder_access while we update it in set_reorder_access(): set ctx->disable_scoped for the duration that reorder_access is updated, which effectively locks reorder_access and prevents concurrent use by nested contexts. Note, set_reorder_access() can do the update only if disabled_scoped is zero on entry, and must therefore set disable_scoped back to non-zero after the initial check in set_reorder_access(). Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 916060913966..fe12dfe254ec 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -412,11 +412,20 @@ set_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, if (!reorder_access || !kcsan_weak_memory) return; + /* + * To avoid nested interrupts or scheduler (which share kcsan_ctx) + * reading an inconsistent reorder_access, ensure that the below has + * exclusive access to reorder_access by disallowing concurrent use. + */ + ctx->disable_scoped++; + barrier(); reorder_access->ptr = ptr; reorder_access->size = size; reorder_access->type = type | KCSAN_ACCESS_SCOPED; reorder_access->ip = ip; reorder_access->stack_depth = get_kcsan_stack_depth(); + barrier(); + ctx->disable_scoped--; } /* -- cgit v1.2.3 From b473a3891c46393e9c4ccb4e3197d7fb259c7100 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 6 Dec 2021 07:41:51 +0100 Subject: kcsan: Only test clear_bit_unlock_is_negative_byte if arch defines it Some architectures do not define clear_bit_unlock_is_negative_byte(). Only test it when it is actually defined (similar to other usage, such as in lib/test_kasan.c). Link: https://lkml.kernel.org/r/202112050757.x67rHnFU-lkp@intel.com Reported-by: kernel test robot Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/kcsan_test.c | 8 +++++--- kernel/kcsan/selftest.c | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 2bad0820f73a..a36fca063a73 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -598,7 +598,6 @@ static void test_barrier_nothreads(struct kunit *test) KCSAN_EXPECT_READ_BARRIER(test_and_change_bit(0, &test_var), true); KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock(0, &test_var), true); KCSAN_EXPECT_READ_BARRIER(__clear_bit_unlock(0, &test_var), true); - KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); KCSAN_EXPECT_READ_BARRIER(arch_spin_lock(&arch_spinlock), false); KCSAN_EXPECT_READ_BARRIER(arch_spin_unlock(&arch_spinlock), true); KCSAN_EXPECT_READ_BARRIER(spin_lock(&test_spinlock), false); @@ -644,7 +643,6 @@ static void test_barrier_nothreads(struct kunit *test) KCSAN_EXPECT_WRITE_BARRIER(test_and_change_bit(0, &test_var), true); KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock(0, &test_var), true); KCSAN_EXPECT_WRITE_BARRIER(__clear_bit_unlock(0, &test_var), true); - KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); KCSAN_EXPECT_WRITE_BARRIER(arch_spin_lock(&arch_spinlock), false); KCSAN_EXPECT_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock), true); KCSAN_EXPECT_WRITE_BARRIER(spin_lock(&test_spinlock), false); @@ -690,7 +688,6 @@ static void test_barrier_nothreads(struct kunit *test) KCSAN_EXPECT_RW_BARRIER(test_and_change_bit(0, &test_var), true); KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock(0, &test_var), true); KCSAN_EXPECT_RW_BARRIER(__clear_bit_unlock(0, &test_var), true); - KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); KCSAN_EXPECT_RW_BARRIER(arch_spin_lock(&arch_spinlock), false); KCSAN_EXPECT_RW_BARRIER(arch_spin_unlock(&arch_spinlock), true); KCSAN_EXPECT_RW_BARRIER(spin_lock(&test_spinlock), false); @@ -698,6 +695,11 @@ static void test_barrier_nothreads(struct kunit *test) KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false); KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true); +#ifdef clear_bit_unlock_is_negative_byte + KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); +#endif kcsan_nestable_atomic_end(); } diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index b6d4da07d80a..75712959c84e 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -169,7 +169,6 @@ static bool __init test_barrier(void) KCSAN_CHECK_READ_BARRIER(test_and_change_bit(0, &test_var)); KCSAN_CHECK_READ_BARRIER(clear_bit_unlock(0, &test_var)); KCSAN_CHECK_READ_BARRIER(__clear_bit_unlock(0, &test_var)); - KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); arch_spin_lock(&arch_spinlock); KCSAN_CHECK_READ_BARRIER(arch_spin_unlock(&arch_spinlock)); spin_lock(&test_spinlock); @@ -199,7 +198,6 @@ static bool __init test_barrier(void) KCSAN_CHECK_WRITE_BARRIER(test_and_change_bit(0, &test_var)); KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock(0, &test_var)); KCSAN_CHECK_WRITE_BARRIER(__clear_bit_unlock(0, &test_var)); - KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); arch_spin_lock(&arch_spinlock); KCSAN_CHECK_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock)); spin_lock(&test_spinlock); @@ -232,12 +230,16 @@ static bool __init test_barrier(void) KCSAN_CHECK_RW_BARRIER(test_and_change_bit(0, &test_var)); KCSAN_CHECK_RW_BARRIER(clear_bit_unlock(0, &test_var)); KCSAN_CHECK_RW_BARRIER(__clear_bit_unlock(0, &test_var)); - KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); arch_spin_lock(&arch_spinlock); KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock)); spin_lock(&test_spinlock); KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock)); +#ifdef clear_bit_unlock_is_negative_byte + KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); + KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); + KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); +#endif kcsan_nestable_atomic_end(); return ret; -- cgit v1.2.3 From 345e004d023343d38088fdfea39688aa11e06ccf Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Fri, 10 Dec 2021 00:46:31 +0100 Subject: bpf: Fix incorrect state pruning for <8B spill/fill Commit 354e8f1970f8 ("bpf: Support <8-byte scalar spill and refill") introduced support in the verifier to track <8B spill/fills of scalars. The backtracking logic for the precision bit was however skipping spill/fills of less than 8B. That could cause state pruning to consider two states equivalent when they shouldn't be. As an example, consider the following bytecode snippet: 0: r7 = r1 1: call bpf_get_prandom_u32 2: r6 = 2 3: if r0 == 0 goto pc+1 4: r6 = 3 ... 8: [state pruning point] ... /* u32 spill/fill */ 10: *(u32 *)(r10 - 8) = r6 11: r8 = *(u32 *)(r10 - 8) 12: r0 = 0 13: if r8 == 3 goto pc+1 14: r0 = 1 15: exit The verifier first walks the path with R6=3. Given the support for <8B spill/fills, at instruction 13, it knows the condition is true and skips instruction 14. At that point, the backtracking logic kicks in but stops at the fill instruction since it only propagates the precision bit for 8B spill/fill. When the verifier then walks the path with R6=2, it will consider it safe at instruction 8 because R6 is not marked as needing precision. Instruction 14 is thus never walked and is then incorrectly removed as 'dead code'. It's also possible to lead the verifier to accept e.g. an out-of-bound memory access instead of causing an incorrect dead code elimination. This regression was found via Cilium's bpf-next CI where it was causing a conntrack map update to be silently skipped because the code had been removed by the verifier. This commit fixes it by enabling support for <8B spill/fills in the bactracking logic. In case of a <8B spill/fill, the full 8B stack slot will be marked as needing precision. Then, in __mark_chain_precision, any tracked register spilled in a marked slot will itself be marked as needing precision, regardless of the spill size. This logic makes two assumptions: (1) only 8B-aligned spill/fill are tracked and (2) spilled registers are only tracked if the spill and fill sizes are equal. Commit ef979017b837 ("bpf: selftest: Add verifier tests for <8-byte scalar spill and refill") covers the first assumption and the next commit in this patchset covers the second. Fixes: 354e8f1970f8 ("bpf: Support <8-byte scalar spill and refill") Signed-off-by: Paul Chaignon Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f3001937bbb9..f2f1ed34cfe9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2379,8 +2379,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, */ if (insn->src_reg != BPF_REG_FP) return 0; - if (BPF_SIZE(insn->code) != BPF_DW) - return 0; /* dreg = *(u64 *)[fp - off] was a fill from the stack. * that [fp - off] slot contains scalar that needs to be @@ -2403,8 +2401,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, /* scalars can only be spilled into stack */ if (insn->dst_reg != BPF_REG_FP) return 0; - if (BPF_SIZE(insn->code) != BPF_DW) - return 0; spi = (-insn->off - 1) / BPF_REG_SIZE; if (spi >= 64) { verbose(env, "BUG spi %d\n", spi); -- cgit v1.2.3 From 65c7cdedeb3026fabcc967a7aae2f755ad4d0783 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Sep 2021 11:24:17 -0400 Subject: genirq: Provide new interfaces for affinity hints The discussion about removing the side effect of irq_set_affinity_hint() of actually applying the cpumask (if not NULL) as affinity to the interrupt, unearthed a few unpleasantries: 1) The modular perf drivers rely on the current behaviour for the very wrong reasons. 2) While none of the other drivers prevents user space from changing the affinity, a cursorily inspection shows that there are at least expectations in some drivers. #1 needs to be cleaned up anyway, so that's not a problem #2 might result in subtle regressions especially when irqbalanced (which nowadays ignores the affinity hint) is disabled. Provide new interfaces: irq_update_affinity_hint() - Only sets the affinity hint pointer irq_set_affinity_and_hint() - Set the pointer and apply the affinity to the interrupt Make irq_set_affinity_hint() a wrapper around irq_apply_affinity_hint() and document it to be phased out. Signed-off-by: Thomas Gleixner Signed-off-by: Nitesh Narayan Lal Signed-off-by: Thomas Gleixner Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210501021832.743094-1-jesse.brandeburg@intel.com Link: https://lore.kernel.org/r/20210903152430.244937-2-nitesh@redhat.com --- include/linux/interrupt.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++- kernel/irq/manage.c | 8 +++---- 2 files changed, 56 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 1f22a30c0963..9367f1cb2e3c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -329,7 +329,46 @@ extern int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask); extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); -extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); +extern int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, + bool setaffinity); + +/** + * irq_update_affinity_hint - Update the affinity hint + * @irq: Interrupt to update + * @m: cpumask pointer (NULL to clear the hint) + * + * Updates the affinity hint, but does not change the affinity of the interrupt. + */ +static inline int +irq_update_affinity_hint(unsigned int irq, const struct cpumask *m) +{ + return __irq_apply_affinity_hint(irq, m, false); +} + +/** + * irq_set_affinity_and_hint - Update the affinity hint and apply the provided + * cpumask to the interrupt + * @irq: Interrupt to update + * @m: cpumask pointer (NULL to clear the hint) + * + * Updates the affinity hint and if @m is not NULL it applies it as the + * affinity of that interrupt. + */ +static inline int +irq_set_affinity_and_hint(unsigned int irq, const struct cpumask *m) +{ + return __irq_apply_affinity_hint(irq, m, true); +} + +/* + * Deprecated. Use irq_update_affinity_hint() or irq_set_affinity_and_hint() + * instead. + */ +static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) +{ + return irq_set_affinity_and_hint(irq, m); +} + extern int irq_update_affinity_desc(unsigned int irq, struct irq_affinity_desc *affinity); @@ -361,6 +400,18 @@ static inline int irq_can_set_affinity(unsigned int irq) static inline int irq_select_affinity(unsigned int irq) { return 0; } +static inline int irq_update_affinity_hint(unsigned int irq, + const struct cpumask *m) +{ + return -EINVAL; +} + +static inline int irq_set_affinity_and_hint(unsigned int irq, + const struct cpumask *m) +{ + return -EINVAL; +} + static inline int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7405e384e5ed..f23ffd30385b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -486,7 +486,8 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) } EXPORT_SYMBOL_GPL(irq_force_affinity); -int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) +int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, + bool setaffinity) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); @@ -495,12 +496,11 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) return -EINVAL; desc->affinity_hint = m; irq_put_desc_unlock(desc, flags); - /* set the initial affinity to prevent every interrupt being on CPU0 */ - if (m) + if (m && setaffinity) __irq_set_affinity(irq, m, false); return 0; } -EXPORT_SYMBOL_GPL(irq_set_affinity_hint); +EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint); static void irq_affinity_notify(struct work_struct *work) { -- cgit v1.2.3 From e4779015fd5d2fb8390c258268addff24d6077c7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Dec 2021 14:46:22 -0800 Subject: timers: implement usleep_idle_range() Patch series "mm/damon: Fix fake /proc/loadavg reports", v3. This patchset fixes DAMON's fake load report issue. The first patch makes yet another variant of usleep_range() for this fix, and the second patch fixes the issue of DAMON by making it using the newly introduced function. This patch (of 2): Some kernel threads such as DAMON could need to repeatedly sleep in micro seconds level. Because usleep_range() sleeps in uninterruptible state, however, such threads would make /proc/loadavg reports fake load. To help such cases, this commit implements a variant of usleep_range() called usleep_idle_range(). It is same to usleep_range() but sets the state of the current task as TASK_IDLE while sleeping. Link: https://lkml.kernel.org/r/20211126145015.15862-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211126145015.15862-2-sj@kernel.org Signed-off-by: SeongJae Park Suggested-by: Andrew Morton Reviewed-by: Thomas Gleixner Tested-by: Oleksandr Natalenko Cc: John Stultz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delay.h | 14 +++++++++++++- kernel/time/timer.c | 16 +++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/delay.h b/include/linux/delay.h index 8eacf67eb212..039e7e0c7378 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -20,6 +20,7 @@ */ #include +#include extern unsigned long loops_per_jiffy; @@ -58,7 +59,18 @@ void calibrate_delay(void); void __attribute__((weak)) calibration_delay_done(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); -void usleep_range(unsigned long min, unsigned long max); +void usleep_range_state(unsigned long min, unsigned long max, + unsigned int state); + +static inline void usleep_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_UNINTERRUPTIBLE); +} + +static inline void usleep_idle_range(unsigned long min, unsigned long max) +{ + usleep_range_state(min, max, TASK_IDLE); +} static inline void ssleep(unsigned int seconds) { diff --git a/kernel/time/timer.c b/kernel/time/timer.c index e3d2c23c413d..85f1021ad459 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2054,26 +2054,28 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); /** - * usleep_range - Sleep for an approximate time - * @min: Minimum time in usecs to sleep - * @max: Maximum time in usecs to sleep + * usleep_range_state - Sleep for an approximate time in a given state + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + * @state: State of the current task that will be while sleeping * * In non-atomic context where the exact wakeup time is flexible, use - * usleep_range() instead of udelay(). The sleep improves responsiveness + * usleep_range_state() instead of udelay(). The sleep improves responsiveness * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces * power usage by allowing hrtimers to take advantage of an already- * scheduled interrupt instead of scheduling a new one just for this sleep. */ -void __sched usleep_range(unsigned long min, unsigned long max) +void __sched usleep_range_state(unsigned long min, unsigned long max, + unsigned int state) { ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; for (;;) { - __set_current_state(TASK_UNINTERRUPTIBLE); + __set_current_state(state); /* Do not return before the requested sleep time has elapsed */ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) break; } } -EXPORT_SYMBOL(usleep_range); +EXPORT_SYMBOL(usleep_range_state); -- cgit v1.2.3 From 82762d2af31a60081162890983a83499c9c7dd74 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 18 Nov 2021 17:42:40 +0100 Subject: sched/fair: Replace CFS internal cpu_util() with cpu_util_cfs() cpu_util_cfs() was created by commit d4edd662ac16 ("sched/cpufreq: Use the DEADLINE utilization signal") to enable the access to CPU utilization from the Schedutil CPUfreq governor. Commit a07630b8b2c1 ("sched/cpufreq/schedutil: Use util_est for OPP selection") added util_est support later. The only thing cpu_util() is doing on top of what cpu_util_cfs() already does is to clamp the return value to the [0..capacity_orig] capacity range of the CPU. Integrating this into cpu_util_cfs() is not harming the existing users (Schedutil and CPUfreq cooling (latter via sched_cpu_util() wrapper)). For straightforwardness, prefer to keep using `int cpu` as the function parameter over using `struct rq *rq` which might avoid some calls to cpu_rq(cpu) -> per_cpu(runqueues, cpu) -> RELOC_HIDE(). Update cfs_util()'s documentation and reuse it for cpu_util_cfs(). Remove cpu_util(). Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20211118164240.623551-1-dietmar.eggemann@arm.com --- kernel/sched/core.c | 2 +- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/fair.c | 71 +++++----------------------------------- kernel/sched/sched.h | 44 ++++++++++++++++++++++--- 4 files changed, 50 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index beaa8be6241e..fe53e510e711 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7166,7 +7166,7 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long sched_cpu_util(int cpu, unsigned long max) { - return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max, + return effective_cpu_util(cpu, cpu_util_cfs(cpu), max, ENERGY_UTIL, NULL); } #endif /* CONFIG_SMP */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e7af18857371..26778884d9ab 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -168,7 +168,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max, + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), max, FREQUENCY_UTIL, NULL); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ac5e55441cab..095b0aa378df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1502,7 +1502,6 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); -static unsigned long cpu_util(int cpu); static inline long adjust_numa_imbalance(int imbalance, int dst_running, int dst_weight); @@ -1569,7 +1568,7 @@ static void update_numa_stats(struct task_numa_env *env, ns->load += cpu_load(rq); ns->runnable += cpu_runnable(rq); - ns->util += cpu_util(cpu); + ns->util += cpu_util_cfs(cpu); ns->nr_running += rq->cfs.h_nr_running; ns->compute_capacity += capacity_of(cpu); @@ -3240,7 +3239,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) * As is, the util number is not freq-invariant (we'd have to * implement arch_scale_freq_capacity() for that). * - * See cpu_util(). + * See cpu_util_cfs(). */ cpufreq_update_util(rq, flags); } @@ -5510,11 +5509,9 @@ static inline void hrtick_update(struct rq *rq) #endif #ifdef CONFIG_SMP -static inline unsigned long cpu_util(int cpu); - static inline bool cpu_overutilized(int cpu) { - return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); + return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu)); } static inline void update_overutilized_status(struct rq *rq) @@ -6459,58 +6456,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; } -/** - * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks. - * @cpu: the CPU to get the utilization of - * - * The unit of the return value must be the one of capacity so we can compare - * the utilization with the capacity of the CPU that is available for CFS task - * (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * The estimated utilization of a CPU is defined to be the maximum between its - * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks - * currently RUNNABLE on that CPU. - * This allows to properly represent the expected utilization of a CPU which - * has just got a big task running since a long sleep period. At the same time - * however it preserves the benefits of the "blocked utilization" in - * describing the potential for other tasks waking up on the same CPU. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - * - * Return: the (estimated) utilization for the specified CPU - */ -static inline unsigned long cpu_util(int cpu) -{ - struct cfs_rq *cfs_rq; - unsigned int util; - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - if (sched_feat(UTIL_EST)) - util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); - - return min_t(unsigned long, util, capacity_orig_of(cpu)); -} - /* * cpu_util_without: compute cpu utilization without any contributions from *p * @cpu: the CPU which utilization is requested @@ -6531,7 +6476,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) - return cpu_util(cpu); + return cpu_util_cfs(cpu); cfs_rq = &cpu_rq(cpu)->cfs; util = READ_ONCE(cfs_rq->avg.util_avg); @@ -6595,7 +6540,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) /* * Utilization (estimated) can exceed the CPU capacity, thus let's * clamp to the maximum CPU capacity to ensure consistency with - * the cpu_util call. + * cpu_util. */ return min_t(unsigned long, util, capacity_orig_of(cpu)); } @@ -6627,7 +6572,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) * During wake-up, the task isn't enqueued yet and doesn't * appear in the cfs_rq->avg.util_est.enqueued of any rq, * so just add it (if needed) to "simulate" what will be - * cpu_util() after the task has been enqueued. + * cpu_util after the task has been enqueued. */ if (dst_cpu == cpu) util_est += _task_util_est(p); @@ -8689,7 +8634,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct rq *rq = cpu_rq(i); sgs->group_load += cpu_load(rq); - sgs->group_util += cpu_util(i); + sgs->group_util += cpu_util_cfs(i); sgs->group_runnable += cpu_runnable(rq); sgs->sum_h_nr_running += rq->cfs.h_nr_running; @@ -9707,7 +9652,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, break; case migrate_util: - util = cpu_util(cpu_of(rq)); + util = cpu_util_cfs(i); /* * Don't try to pull utilization from a CPU with one diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eb971151e7e4..de53be905739 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2966,16 +2966,52 @@ static inline unsigned long cpu_util_dl(struct rq *rq) return READ_ONCE(rq->avg_dl.util_avg); } -static inline unsigned long cpu_util_cfs(struct rq *rq) +/** + * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks. + * @cpu: the CPU to get the utilization for. + * + * The unit of the return value must be the same as the one of CPU capacity + * so that CPU utilization can be compared with CPU capacity. + * + * CPU utilization is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on that CPU. + * It represents the amount of CPU capacity currently used by CFS tasks in + * the range [0..max CPU capacity] with max CPU capacity being the CPU + * capacity at f_max. + * + * The estimated CPU utilization is defined as the maximum between CPU + * utilization and sum of the estimated utilization of the currently + * runnable tasks on that CPU. It preserves a utilization "snapshot" of + * previously-executed tasks, which helps better deduce how busy a CPU will + * be when a long-sleeping task wakes up. The contribution to CPU utilization + * of such a task would be significantly decayed at this point of time. + * + * CPU utilization can be higher than the current CPU capacity + * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because + * of rounding errors as well as task migrations or wakeups of new tasks. + * CPU utilization has to be capped to fit into the [0..max CPU capacity] + * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) + * could be seen as over-utilized even though CPU1 has 20% of spare CPU + * capacity. CPU utilization is allowed to overshoot current CPU capacity + * though since this is useful for predicting the CPU capacity required + * after task migrations (scheduler-driven DVFS). + * + * Return: (Estimated) utilization for the specified CPU. + */ +static inline unsigned long cpu_util_cfs(int cpu) { - unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); + struct cfs_rq *cfs_rq; + unsigned long util; + + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); if (sched_feat(UTIL_EST)) { util = max_t(unsigned long, util, - READ_ONCE(rq->cfs.avg.util_est.enqueued)); + READ_ONCE(cfs_rq->avg.util_est.enqueued)); } - return util; + return min(util, capacity_orig_of(cpu)); } static inline unsigned long cpu_util_rt(struct rq *rq) -- cgit v1.2.3 From e161c6bf3955d737f755f8eaa3b92de4bc6bd0e7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 25 Nov 2021 21:28:52 +0100 Subject: tracing: Iterate trace_[ku]probe objects directly As suggested by Linus [1] using list_for_each_entry to iterate directly trace_[ku]probe objects so we can skip another call to container_of in these loops. [1] https://lore.kernel.org/r/CAHk-=wjakjw6-rDzDDBsuMoDCqd+9ogifR_EE1F0K-jYek1CdA@mail.gmail.com Link: https://lkml.kernel.org/r/20211125202852.406405-1-jolsa@kernel.org Suggested-by: Linus Torvalds Signed-off-by: Jiri Olsa Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 13 ++++--------- kernel/trace/trace_uprobe.c | 23 ++++++++--------------- 2 files changed, 12 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d10c01948e68..f8c26ee72de3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -327,11 +327,9 @@ static inline int __enable_trace_kprobe(struct trace_kprobe *tk) static void __disable_trace_kprobe(struct trace_probe *tp) { - struct trace_probe *pos; struct trace_kprobe *tk; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tk = container_of(pos, struct trace_kprobe, tp); + list_for_each_entry(tk, trace_probe_probe_list(tp), tp.list) { if (!trace_kprobe_is_registered(tk)) continue; if (trace_kprobe_is_return(tk)) @@ -348,7 +346,7 @@ static void __disable_trace_kprobe(struct trace_probe *tp) static int enable_trace_kprobe(struct trace_event_call *call, struct trace_event_file *file) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_kprobe *tk; bool enabled; int ret = 0; @@ -369,8 +367,7 @@ static int enable_trace_kprobe(struct trace_event_call *call, if (enabled) return 0; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tk = container_of(pos, struct trace_kprobe, tp); + list_for_each_entry(tk, trace_probe_probe_list(tp), tp.list) { if (trace_kprobe_has_gone(tk)) continue; ret = __enable_trace_kprobe(tk); @@ -559,11 +556,9 @@ static bool trace_kprobe_has_same_kprobe(struct trace_kprobe *orig, struct trace_kprobe *comp) { struct trace_probe_event *tpe = orig->tp.event; - struct trace_probe *pos; int i; - list_for_each_entry(pos, &tpe->probes, list) { - orig = container_of(pos, struct trace_kprobe, tp); + list_for_each_entry(orig, &tpe->probes, tp.list) { if (strcmp(trace_kprobe_symbol(orig), trace_kprobe_symbol(comp)) || trace_kprobe_offset(orig) != trace_kprobe_offset(comp)) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a4d5c624fe79..3bd09d612137 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -409,12 +409,10 @@ static bool trace_uprobe_has_same_uprobe(struct trace_uprobe *orig, struct trace_uprobe *comp) { struct trace_probe_event *tpe = orig->tp.event; - struct trace_probe *pos; struct inode *comp_inode = d_real_inode(comp->path.dentry); int i; - list_for_each_entry(pos, &tpe->probes, list) { - orig = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(orig, &tpe->probes, tp.list) { if (comp_inode != d_real_inode(orig->path.dentry) || comp->offset != orig->offset) continue; @@ -1072,14 +1070,12 @@ static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter) static void __probe_event_disable(struct trace_probe *tp) { - struct trace_probe *pos; struct trace_uprobe *tu; tu = container_of(tp, struct trace_uprobe, tp); WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter)); - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { if (!tu->inode) continue; @@ -1091,7 +1087,7 @@ static void __probe_event_disable(struct trace_probe *tp) static int probe_event_enable(struct trace_event_call *call, struct trace_event_file *file, filter_func_t filter) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_uprobe *tu; bool enabled; int ret; @@ -1126,8 +1122,7 @@ static int probe_event_enable(struct trace_event_call *call, if (ret) goto err_flags; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { ret = trace_uprobe_enable(tu, filter); if (ret) { __probe_event_disable(tp); @@ -1272,7 +1267,7 @@ static bool trace_uprobe_filter_add(struct trace_uprobe_filter *filter, static int uprobe_perf_close(struct trace_event_call *call, struct perf_event *event) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_uprobe *tu; int ret = 0; @@ -1284,8 +1279,7 @@ static int uprobe_perf_close(struct trace_event_call *call, if (trace_uprobe_filter_remove(tu->tp.event->filter, event)) return 0; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); if (ret) break; @@ -1297,7 +1291,7 @@ static int uprobe_perf_close(struct trace_event_call *call, static int uprobe_perf_open(struct trace_event_call *call, struct perf_event *event) { - struct trace_probe *pos, *tp; + struct trace_probe *tp; struct trace_uprobe *tu; int err = 0; @@ -1309,8 +1303,7 @@ static int uprobe_perf_open(struct trace_event_call *call, if (trace_uprobe_filter_add(tu->tp.event->filter, event)) return 0; - list_for_each_entry(pos, trace_probe_probe_list(tp), list) { - tu = container_of(pos, struct trace_uprobe, tp); + list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) { err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); if (err) { uprobe_perf_close(call, event); -- cgit v1.2.3 From 4f67cca70c0f615e9cfe6ac42244f3416ec60877 Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Thu, 30 Sep 2021 15:38:21 -0700 Subject: tracing: Do not let synth_events block other dyn_event systems during create synth_events is returning -EINVAL if the dyn_event create command does not contain ' \t'. This prevents other systems from getting called back. synth_events needs to return -ECANCELED in these cases when the command is not targeting the synth_event system. Link: https://lore.kernel.org/linux-trace-devel/20210930223821.11025-1-beaub@linux.microsoft.com Fixes: c9e759b1e8456 ("tracing: Rework synthetic event command parsing") Reviewed-by: Masami Hiramatsu Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 98e002648994..149011e34ad9 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -2053,6 +2053,13 @@ static int create_synth_event(const char *raw_command) last_cmd_set(raw_command); + name = raw_command; + + /* Don't try to process if not our system */ + if (name[0] != 's' || name[1] != ':') + return -ECANCELED; + name += 2; + p = strpbrk(raw_command, " \t"); if (!p) { synth_err(SYNTH_ERR_INVALID_CMD, 0); @@ -2061,12 +2068,6 @@ static int create_synth_event(const char *raw_command) fields = skip_spaces(p); - name = raw_command; - - if (name[0] != 's' || name[1] != ':') - return -ECANCELED; - name += 2; - /* This interface accepts group name prefix */ if (strchr(name, '/')) { len = str_has_prefix(name, SYNTH_SYSTEM "/"); -- cgit v1.2.3 From dba879672258699223b0ce61f9e5c079b0476d92 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Fri, 10 Dec 2021 09:22:45 +0800 Subject: tracing: Use memset_startat helper in trace_iterator_reset() Make use of memset_startat helper to simplify the code, there should be no functional change as a result of this patch. Link: https://lkml.kernel.org/r/20211210012245.207489-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8bd1a815ce90..64a7ec44a635 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1932,14 +1932,7 @@ extern struct trace_iterator *tracepoint_print_iter; */ static __always_inline void trace_iterator_reset(struct trace_iterator *iter) { - const size_t offset = offsetof(struct trace_iterator, seq); - - /* - * Keep gcc from complaining about overwriting more than just one - * member in the structure. - */ - memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset); - + memset_startat(iter, 0, seq); iter->pos = -1; } -- cgit v1.2.3 From 2768c1e7f9d7b82f9e129efe3677c783bc77b8f9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 10 Dec 2021 20:26:16 -0500 Subject: tracing: Use trace_iterator_reset() in tracing_read_pipe() Currently tracing_read_pipe() open codes trace_iterator_reset(). Just have it use trace_iterator_reset() instead. Link: https://lkml.kernel.org/r/20211210202616.64d432d2@gandalf.local.home Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 588de6df473f..547d82628c2e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6731,10 +6731,9 @@ waitagain: cnt = PAGE_SIZE - 1; /* reset all but tr, trace, and overruns */ - memset_startat(iter, 0, seq); + trace_iterator_reset(iter); cpumask_clear(iter->started); trace_seq_init(&iter->seq); - iter->pos = -1; trace_event_read_lock(); trace_access_lock(iter->cpu_file); -- cgit v1.2.3 From c5fb19937455095573a19ddcbff32e993ed10e35 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 10 Dec 2021 22:16:49 +0800 Subject: bpf: Add bpf_strncmp helper The helper compares two strings: one string is a null-terminated read-only string, and another string has const max storage size but doesn't need to be null-terminated. It can be used to compare file name in tracing or LSM program. Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211210141652.877186-2-houtao1@huawei.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 11 +++++++++++ kernel/bpf/helpers.c | 16 ++++++++++++++++ tools/include/uapi/linux/bpf.h | 11 +++++++++++ 4 files changed, 39 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0ceb54c6342f..7a40022e3d00 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2163,6 +2163,7 @@ extern const struct bpf_func_proto bpf_sk_getsockopt_proto; extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; +extern const struct bpf_func_proto bpf_strncmp_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c26871263f1f..2820c77e4846 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4983,6 +4983,16 @@ union bpf_attr { * Return * The number of loops performed, **-EINVAL** for invalid **flags**, * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. + * + * long bpf_strncmp(const char *s1, u32 s1_sz, const char *s2) + * Description + * Do strncmp() between **s1** and **s2**. **s1** doesn't need + * to be null-terminated and **s1_sz** is the maximum storage + * size of **s1**. **s2** must be a read-only string. + * Return + * An integer less than, equal to, or greater than zero + * if the first **s1_sz** bytes of **s1** is found to be + * less than, to match, or be greater than **s2**. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5167,6 +5177,7 @@ union bpf_attr { FN(kallsyms_lookup_name), \ FN(find_vma), \ FN(loop), \ + FN(strncmp), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 19fecfeaa9c2..8babae03d30a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -565,6 +565,20 @@ const struct bpf_func_proto bpf_strtoul_proto = { }; #endif +BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) +{ + return strncmp(s1, s2, s1_sz); +} + +const struct bpf_func_proto bpf_strncmp_proto = { + .func = bpf_strncmp, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_PTR_TO_CONST_STR, +}; + BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, struct bpf_pidns_info *, nsdata, u32, size) { @@ -1378,6 +1392,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: return &bpf_loop_proto; + case BPF_FUNC_strncmp: + return &bpf_strncmp_proto; default: break; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c26871263f1f..2820c77e4846 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4983,6 +4983,16 @@ union bpf_attr { * Return * The number of loops performed, **-EINVAL** for invalid **flags**, * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. + * + * long bpf_strncmp(const char *s1, u32 s1_sz, const char *s2) + * Description + * Do strncmp() between **s1** and **s2**. **s1** doesn't need + * to be null-terminated and **s1_sz** is the maximum storage + * size of **s1**. **s2** must be a read-only string. + * Return + * An integer less than, equal to, or greater than zero + * if the first **s1_sz** bytes of **s1** is found to be + * less than, to match, or be greater than **s2**. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5167,6 +5177,7 @@ union bpf_attr { FN(kallsyms_lookup_name), \ FN(find_vma), \ FN(loop), \ + FN(strncmp), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 4674f21071b935c237217ac02cb310522d6ad95d Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 9 Dec 2021 14:21:22 +0800 Subject: bpf: Use kmemdup() to replace kmalloc + memcpy Eliminate the follow coccicheck warning: ./kernel/bpf/btf.c:6537:13-20: WARNING opportunity for kmemdup. Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1639030882-92383-1-git-send-email-jiapeng.chong@linux.alibaba.com --- kernel/bpf/btf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8b00c6e4d6fb..baa90f3acd41 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6534,12 +6534,11 @@ static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, bpf_free_cands_from_cache(*cc); *cc = NULL; } - new_cands = kmalloc(sizeof_cands(cands->cnt), GFP_KERNEL); + new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL); if (!new_cands) { bpf_free_cands(cands); return ERR_PTR(-ENOMEM); } - memcpy(new_cands, cands, sizeof_cands(cands->cnt)); /* strdup the name, since it will stay in cache. * the cands->name points to strings in prog's BTF and the prog can be unloaded. */ -- cgit v1.2.3 From f18a499799dd0f0fdd98cf72d98d3866ce9ac60e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sat, 11 Dec 2021 18:08:19 -0800 Subject: bpf: Silence coverity false positive warning. Coverity issued the following warning: 6685 cands = bpf_core_add_cands(cands, main_btf, 1); 6686 if (IS_ERR(cands)) >>> CID 1510300: (RETURN_LOCAL) >>> Returning pointer "cands" which points to local variable "local_cand". 6687 return cands; It's a false positive. Add ERR_CAST() to silence it. Signed-off-by: Alexei Starovoitov --- kernel/bpf/btf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index baa90f3acd41..65231045a529 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6656,7 +6656,7 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) main_btf = bpf_get_btf_vmlinux(); if (IS_ERR(main_btf)) - return (void *)main_btf; + return ERR_CAST(main_btf); local_type = btf_type_by_id(local_btf, local_type_id); if (!local_type) @@ -6683,14 +6683,14 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) /* Attempt to find target candidates in vmlinux BTF first */ cands = bpf_core_add_cands(cands, main_btf, 1); if (IS_ERR(cands)) - return cands; + return ERR_CAST(cands); /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */ /* populate cache even when cands->cnt == 0 */ cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); if (IS_ERR(cc)) - return cc; + return ERR_CAST(cc); /* if vmlinux BTF has any candidate, don't go for module BTFs */ if (cc->cnt) @@ -6716,7 +6716,7 @@ check_modules: cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf)); if (IS_ERR(cands)) { btf_put(mod_btf); - return cands; + return ERR_CAST(cands); } spin_lock_bh(&btf_idr_lock); btf_put(mod_btf); -- cgit v1.2.3 From bb6728d756112596881a5fdf2040544031905840 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 8 Dec 2021 20:32:41 +0100 Subject: bpf: Allow access to int pointer arguments in tracing programs Adding support to access arguments with int pointer arguments in tracing programs. Currently we allow tracing programs to access only pointers to string (char pointer), void pointers and pointers to structs. If we try to access argument which is pointer to int, verifier will fail to load the program with; R1 type=ctx expected=fp ; int BPF_PROG(fmod_ret_test, int _a, __u64 _b, int _ret) 0: (bf) r6 = r1 ; int BPF_PROG(fmod_ret_test, int _a, __u64 _b, int _ret) 1: (79) r9 = *(u64 *)(r6 +8) func 'bpf_modify_return_test' arg1 type INT is not a struct There is no harm for the program to access int pointer argument. We are already doing that for string pointer, which is pointer to int with 1 byte size. Changing the is_string_ptr to generic integer check and renaming it to btf_type_is_int. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211208193245.172141-2-jolsa@kernel.org --- kernel/bpf/btf.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 65231045a529..a17de71abd2e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4826,7 +4826,7 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) return prog->aux->attach_btf; } -static bool is_string_ptr(struct btf *btf, const struct btf_type *t) +static bool is_int_ptr(struct btf *btf, const struct btf_type *t) { /* t comes in already as a pointer */ t = btf_type_by_id(btf, t->type); @@ -4835,8 +4835,7 @@ static bool is_string_ptr(struct btf *btf, const struct btf_type *t) if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST) t = btf_type_by_id(btf, t->type); - /* char, signed char, unsigned char */ - return btf_type_is_int(t) && t->size == 1; + return btf_type_is_int(t); } bool btf_ctx_access(int off, int size, enum bpf_access_type type, @@ -4957,7 +4956,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, */ return true; - if (is_string_ptr(btf, t)) + if (is_int_ptr(btf, t)) return true; /* this is a pointer to another type */ -- cgit v1.2.3 From f92c1e183604c20ce00eb889315fdaa8f2d9e509 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 8 Dec 2021 20:32:44 +0100 Subject: bpf: Add get_func_[arg|ret|arg_cnt] helpers Adding following helpers for tracing programs: Get n-th argument of the traced function: long bpf_get_func_arg(void *ctx, u32 n, u64 *value) Get return value of the traced function: long bpf_get_func_ret(void *ctx, u64 *value) Get arguments count of the traced function: long bpf_get_func_arg_cnt(void *ctx) The trampoline now stores number of arguments on ctx-8 address, so it's easy to verify argument index and find return value argument's position. Moving function ip address on the trampoline stack behind the number of functions arguments, so it's now stored on ctx-16 address if it's needed. All helpers above are inlined by verifier. Also bit unrelated small change - using newly added function bpf_prog_has_trampoline in check_get_func_ip. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211208193245.172141-5-jolsa@kernel.org --- arch/x86/net/bpf_jit_comp.c | 15 +++++++- include/linux/bpf.h | 5 +++ include/uapi/linux/bpf.h | 28 +++++++++++++++ kernel/bpf/trampoline.c | 8 +++++ kernel/bpf/verifier.c | 77 +++++++++++++++++++++++++++++++++++++++--- kernel/trace/bpf_trace.c | 55 +++++++++++++++++++++++++++++- tools/include/uapi/linux/bpf.h | 28 +++++++++++++++ 7 files changed, 209 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 10fab8cb3fb5..4bbcded07415 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1941,7 +1941,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i void *orig_call) { int ret, i, nr_args = m->nr_args; - int regs_off, ip_off, stack_size = nr_args * 8; + int regs_off, ip_off, args_off, stack_size = nr_args * 8; struct bpf_tramp_progs *fentry = &tprogs[BPF_TRAMP_FENTRY]; struct bpf_tramp_progs *fexit = &tprogs[BPF_TRAMP_FEXIT]; struct bpf_tramp_progs *fmod_ret = &tprogs[BPF_TRAMP_MODIFY_RETURN]; @@ -1968,6 +1968,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i * [ ... ] * RBP - regs_off [ reg_arg1 ] program's ctx pointer * + * RBP - args_off [ args count ] always + * * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag */ @@ -1978,6 +1980,10 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i regs_off = stack_size; + /* args count */ + stack_size += 8; + args_off = stack_size; + if (flags & BPF_TRAMP_F_IP_ARG) stack_size += 8; /* room for IP address argument */ @@ -1996,6 +2002,13 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */ EMIT1(0x53); /* push rbx */ + /* Store number of arguments of the traced function: + * mov rax, nr_args + * mov QWORD PTR [rbp - args_off], rax + */ + emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args); + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off); + if (flags & BPF_TRAMP_F_IP_ARG) { /* Store IP address of the traced function: * mov rax, QWORD PTR [rbp + 8] diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7a40022e3d00..965fffaf0308 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -777,6 +777,7 @@ void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); int bpf_jit_charge_modmem(u32 pages); void bpf_jit_uncharge_modmem(u32 pages); +bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) @@ -805,6 +806,10 @@ static inline bool is_bpf_image_address(unsigned long address) { return false; } +static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) +{ + return false; +} #endif struct bpf_func_info_aux { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2820c77e4846..b0383d371b9a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4993,6 +4993,31 @@ union bpf_attr { * An integer less than, equal to, or greater than zero * if the first **s1_sz** bytes of **s1** is found to be * less than, to match, or be greater than **s2**. + * + * long bpf_get_func_arg(void *ctx, u32 n, u64 *value) + * Description + * Get **n**-th argument (zero based) of the traced function (for tracing programs) + * returned in **value**. + * + * Return + * 0 on success. + * **-EINVAL** if n >= arguments count of traced function. + * + * long bpf_get_func_ret(void *ctx, u64 *value) + * Description + * Get return value of the traced function (for tracing programs) + * in **value**. + * + * Return + * 0 on success. + * **-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN. + * + * long bpf_get_func_arg_cnt(void *ctx) + * Description + * Get number of arguments of the traced function (for tracing programs). + * + * Return + * The number of arguments of the traced function. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5178,6 +5203,9 @@ union bpf_attr { FN(find_vma), \ FN(loop), \ FN(strncmp), \ + FN(get_func_arg), \ + FN(get_func_ret), \ + FN(get_func_arg_cnt), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index e98de5e73ba5..4b6974a195c1 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -27,6 +27,14 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); +bool bpf_prog_has_trampoline(const struct bpf_prog *prog) +{ + enum bpf_attach_type eatype = prog->expected_attach_type; + + return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN; +} + void *bpf_jit_alloc_exec_page(void) { void *image; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b39de3ae50f5..d74e8a99412e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6395,13 +6395,11 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, static int check_get_func_ip(struct bpf_verifier_env *env) { - enum bpf_attach_type eatype = env->prog->expected_attach_type; enum bpf_prog_type type = resolve_prog_type(env->prog); int func_id = BPF_FUNC_get_func_ip; if (type == BPF_PROG_TYPE_TRACING) { - if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT && - eatype != BPF_MODIFY_RETURN) { + if (!bpf_prog_has_trampoline(env->prog)) { verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n", func_id_name(func_id), func_id); return -ENOTSUPP; @@ -12997,6 +12995,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; + enum bpf_attach_type eatype = prog->expected_attach_type; bool expect_blinding = bpf_jit_blinding_enabled(prog); enum bpf_prog_type prog_type = resolve_prog_type(prog); struct bpf_insn *insn = prog->insnsi; @@ -13367,11 +13366,79 @@ patch_map_ops_generic: continue; } + /* Implement bpf_get_func_arg inline. */ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_arg) { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); + insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); + insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); + insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0); + insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); + insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0); + insn_buf[7] = BPF_JMP_A(1); + insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); + cnt = 9; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + + /* Implement bpf_get_func_ret inline. */ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_ret) { + if (eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN) { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); + insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); + insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0); + insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0); + cnt = 6; + } else { + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP); + cnt = 1; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + + /* Implement get_func_arg_cnt inline. */ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_arg_cnt) { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); + if (!new_prog) + return -ENOMEM; + + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + /* Implement bpf_get_func_ip inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_ip) { - /* Load IP address from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + /* Load IP address from ctx - 16 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16); new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); if (!new_prog) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 623dd0684429..cea2ca6df949 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1012,7 +1012,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { BPF_CALL_1(bpf_get_func_ip_tracing, void *, ctx) { /* This helper call is inlined by verifier. */ - return ((u64 *)ctx)[-1]; + return ((u64 *)ctx)[-2]; } static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = { @@ -1091,6 +1091,53 @@ static const struct bpf_func_proto bpf_get_branch_snapshot_proto = { .arg2_type = ARG_CONST_SIZE_OR_ZERO, }; +BPF_CALL_3(get_func_arg, void *, ctx, u32, n, u64 *, value) +{ + /* This helper call is inlined by verifier. */ + u64 nr_args = ((u64 *)ctx)[-1]; + + if ((u64) n >= nr_args) + return -EINVAL; + *value = ((u64 *)ctx)[n]; + return 0; +} + +static const struct bpf_func_proto bpf_get_func_arg_proto = { + .func = get_func_arg, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_LONG, +}; + +BPF_CALL_2(get_func_ret, void *, ctx, u64 *, value) +{ + /* This helper call is inlined by verifier. */ + u64 nr_args = ((u64 *)ctx)[-1]; + + *value = ((u64 *)ctx)[nr_args]; + return 0; +} + +static const struct bpf_func_proto bpf_get_func_ret_proto = { + .func = get_func_ret, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_LONG, +}; + +BPF_CALL_1(get_func_arg_cnt, void *, ctx) +{ + /* This helper call is inlined by verifier. */ + return ((u64 *)ctx)[-1]; +} + +static const struct bpf_func_proto bpf_get_func_arg_cnt_proto = { + .func = get_func_arg_cnt, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1629,6 +1676,12 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) NULL; case BPF_FUNC_d_path: return &bpf_d_path_proto; + case BPF_FUNC_get_func_arg: + return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_proto : NULL; + case BPF_FUNC_get_func_ret: + return bpf_prog_has_trampoline(prog) ? &bpf_get_func_ret_proto : NULL; + case BPF_FUNC_get_func_arg_cnt: + return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL; default: fn = raw_tp_prog_func_proto(func_id, prog); if (!fn && prog->expected_attach_type == BPF_TRACE_ITER) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2820c77e4846..b0383d371b9a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4993,6 +4993,31 @@ union bpf_attr { * An integer less than, equal to, or greater than zero * if the first **s1_sz** bytes of **s1** is found to be * less than, to match, or be greater than **s2**. + * + * long bpf_get_func_arg(void *ctx, u32 n, u64 *value) + * Description + * Get **n**-th argument (zero based) of the traced function (for tracing programs) + * returned in **value**. + * + * Return + * 0 on success. + * **-EINVAL** if n >= arguments count of traced function. + * + * long bpf_get_func_ret(void *ctx, u64 *value) + * Description + * Get return value of the traced function (for tracing programs) + * in **value**. + * + * Return + * 0 on success. + * **-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN. + * + * long bpf_get_func_arg_cnt(void *ctx) + * Description + * Get number of arguments of the traced function (for tracing programs). + * + * Return + * The number of arguments of the traced function. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5178,6 +5203,9 @@ union bpf_attr { FN(find_vma), \ FN(loop), \ FN(strncmp), \ + FN(get_func_arg), \ + FN(get_func_ret), \ + FN(get_func_arg_cnt), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 0e25498f8cd43c1b5aa327f373dd094e9a006da7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 28 Jun 2021 14:52:01 -0500 Subject: exit: Add and use make_task_dead. There are two big uses of do_exit. The first is it's design use to be the guts of the exit(2) system call. The second use is to terminate a task after something catastrophic has happened like a NULL pointer in kernel code. Add a function make_task_dead that is initialy exactly the same as do_exit to cover the cases where do_exit is called to handle catastrophic failure. In time this can probably be reduced to just a light wrapper around do_task_dead. For now keep it exactly the same so that there will be no behavioral differences introducing this new concept. Replace all of the uses of do_exit that use it for catastraphic task cleanup with make_task_dead to make it clear what the code is doing. As part of this rename rewind_stack_do_exit rewind_stack_and_make_dead. Signed-off-by: "Eric W. Biederman" --- arch/alpha/kernel/traps.c | 6 +++--- arch/alpha/mm/fault.c | 2 +- arch/arm/kernel/traps.c | 2 +- arch/arm/mm/fault.c | 2 +- arch/arm64/kernel/traps.c | 2 +- arch/arm64/mm/fault.c | 2 +- arch/csky/abiv1/alignment.c | 2 +- arch/csky/kernel/traps.c | 2 +- arch/csky/mm/fault.c | 2 +- arch/h8300/kernel/traps.c | 2 +- arch/h8300/mm/fault.c | 2 +- arch/hexagon/kernel/traps.c | 2 +- arch/ia64/kernel/mca_drv.c | 2 +- arch/ia64/kernel/traps.c | 2 +- arch/ia64/mm/fault.c | 2 +- arch/m68k/kernel/traps.c | 2 +- arch/m68k/mm/fault.c | 2 +- arch/microblaze/kernel/exceptions.c | 4 ++-- arch/mips/kernel/traps.c | 2 +- arch/nds32/kernel/fpu.c | 2 +- arch/nds32/kernel/traps.c | 8 ++++---- arch/nios2/kernel/traps.c | 4 ++-- arch/openrisc/kernel/traps.c | 2 +- arch/parisc/kernel/traps.c | 2 +- arch/powerpc/kernel/traps.c | 8 ++++---- arch/riscv/kernel/traps.c | 2 +- arch/riscv/mm/fault.c | 2 +- arch/s390/kernel/dumpstack.c | 2 +- arch/s390/kernel/nmi.c | 2 +- arch/sh/kernel/traps.c | 2 +- arch/sparc/kernel/traps_32.c | 4 +--- arch/sparc/kernel/traps_64.c | 4 +--- arch/x86/entry/entry_32.S | 6 +++--- arch/x86/entry/entry_64.S | 6 +++--- arch/x86/kernel/dumpstack.c | 4 ++-- arch/xtensa/kernel/traps.c | 2 +- include/linux/sched/task.h | 1 + kernel/exit.c | 9 +++++++++ tools/objtool/check.c | 3 ++- 39 files changed, 63 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c index 2ae34702456c..8a66fe544c69 100644 --- a/arch/alpha/kernel/traps.c +++ b/arch/alpha/kernel/traps.c @@ -190,7 +190,7 @@ die_if_kernel(char * str, struct pt_regs *regs, long err, unsigned long *r9_15) local_irq_enable(); while (1); } - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } #ifndef CONFIG_MATHEMU @@ -575,7 +575,7 @@ do_entUna(void * va, unsigned long opcode, unsigned long reg, printk("Bad unaligned kernel access at %016lx: %p %lx %lu\n", pc, va, opcode, reg); - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); got_exception: /* Ok, we caught the exception, but we don't want it. Is there @@ -630,7 +630,7 @@ got_exception: local_irq_enable(); while (1); } - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } /* diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index eee5102c3d88..e9193d52222e 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -204,7 +204,7 @@ retry: printk(KERN_ALERT "Unable to handle kernel paging request at " "virtual address %016lx\n", address); die_if_kernel("Oops", regs, cause, (unsigned long*)regs - 16); - do_exit(SIGKILL); + make_task_dead(SIGKILL); /* We ran out of memory, or some other thing happened to us that made us unable to handle the page fault gracefully. */ diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 195dff58bafc..b4bd2e5f17c1 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -333,7 +333,7 @@ static void oops_end(unsigned long flags, struct pt_regs *regs, int signr) if (panic_on_oops) panic("Fatal exception"); if (signr) - do_exit(signr); + make_task_dead(signr); } /* diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index bc8779d54a64..bf1a0c618c49 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -111,7 +111,7 @@ static void die_kernel_fault(const char *msg, struct mm_struct *mm, show_pte(KERN_ALERT, mm, addr); die("Oops", regs, fsr); bust_spinlocks(0); - do_exit(SIGKILL); + make_task_dead(SIGKILL); } /* diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 7b21213a570f..bdd456e4e7f4 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -235,7 +235,7 @@ void die(const char *str, struct pt_regs *regs, int err) raw_spin_unlock_irqrestore(&die_lock, flags); if (ret != NOTIFY_STOP) - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } static void arm64_show_signal(int signo, const char *str) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 9ae24e3b72be..11a28cace2d2 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -302,7 +302,7 @@ static void die_kernel_fault(const char *msg, unsigned long addr, show_pte(addr); die("Oops", regs, esr); bust_spinlocks(0); - do_exit(SIGKILL); + make_task_dead(SIGKILL); } #ifdef CONFIG_KASAN_HW_TAGS diff --git a/arch/csky/abiv1/alignment.c b/arch/csky/abiv1/alignment.c index cb2a0d94a144..5e2fb45d605c 100644 --- a/arch/csky/abiv1/alignment.c +++ b/arch/csky/abiv1/alignment.c @@ -294,7 +294,7 @@ bad_area: __func__, opcode, rz, rx, imm, addr); show_regs(regs); bust_spinlocks(0); - do_exit(SIGKILL); + make_dead_task(SIGKILL); } force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)addr); diff --git a/arch/csky/kernel/traps.c b/arch/csky/kernel/traps.c index e5fbf8653a21..88a47035b925 100644 --- a/arch/csky/kernel/traps.c +++ b/arch/csky/kernel/traps.c @@ -109,7 +109,7 @@ void die(struct pt_regs *regs, const char *str) if (panic_on_oops) panic("Fatal exception"); if (ret != NOTIFY_STOP) - do_exit(SIGSEGV); + make_dead_task(SIGSEGV); } void do_trap(struct pt_regs *regs, int signo, int code, unsigned long addr) diff --git a/arch/csky/mm/fault.c b/arch/csky/mm/fault.c index 466ad949818a..7215a46b6b8e 100644 --- a/arch/csky/mm/fault.c +++ b/arch/csky/mm/fault.c @@ -67,7 +67,7 @@ static inline void no_context(struct pt_regs *regs, unsigned long addr) pr_alert("Unable to handle kernel paging request at virtual " "addr 0x%08lx, pc: 0x%08lx\n", addr, regs->pc); die(regs, "Oops"); - do_exit(SIGKILL); + make_task_dead(SIGKILL); } static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_fault_t fault) diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c index bdbe988d8dbc..3d4e0bde37ae 100644 --- a/arch/h8300/kernel/traps.c +++ b/arch/h8300/kernel/traps.c @@ -106,7 +106,7 @@ void die(const char *str, struct pt_regs *fp, unsigned long err) dump(fp); spin_unlock_irq(&die_lock); - do_exit(SIGSEGV); + make_dead_task(SIGSEGV); } static int kstack_depth_to_print = 24; diff --git a/arch/h8300/mm/fault.c b/arch/h8300/mm/fault.c index d4bc9c16f2df..0223528565dd 100644 --- a/arch/h8300/mm/fault.c +++ b/arch/h8300/mm/fault.c @@ -51,7 +51,7 @@ asmlinkage int do_page_fault(struct pt_regs *regs, unsigned long address, printk(" at virtual address %08lx\n", address); if (!user_mode(regs)) die("Oops", regs, error_code); - do_exit(SIGKILL); + make_dead_task(SIGKILL); return 1; } diff --git a/arch/hexagon/kernel/traps.c b/arch/hexagon/kernel/traps.c index edfc35dafeb1..6dd6cf0ab711 100644 --- a/arch/hexagon/kernel/traps.c +++ b/arch/hexagon/kernel/traps.c @@ -214,7 +214,7 @@ int die(const char *str, struct pt_regs *regs, long err) panic("Fatal exception"); oops_exit(); - do_exit(err); + make_dead_task(err); return 0; } diff --git a/arch/ia64/kernel/mca_drv.c b/arch/ia64/kernel/mca_drv.c index 5bfc79be4cef..23c203639a96 100644 --- a/arch/ia64/kernel/mca_drv.c +++ b/arch/ia64/kernel/mca_drv.c @@ -176,7 +176,7 @@ mca_handler_bh(unsigned long paddr, void *iip, unsigned long ipsr) spin_unlock(&mca_bh_lock); /* This process is about to be killed itself */ - do_exit(SIGKILL); + make_task_dead(SIGKILL); } /** diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c index e13cb905930f..753642366e12 100644 --- a/arch/ia64/kernel/traps.c +++ b/arch/ia64/kernel/traps.c @@ -85,7 +85,7 @@ die (const char *str, struct pt_regs *regs, long err) if (panic_on_oops) panic("Fatal exception"); - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); return 0; } diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 02de2e70c587..4796cccbf74f 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -259,7 +259,7 @@ retry: regs = NULL; bust_spinlocks(0); if (regs) - do_exit(SIGKILL); + make_task_dead(SIGKILL); return; out_of_memory: diff --git a/arch/m68k/kernel/traps.c b/arch/m68k/kernel/traps.c index 34d6458340b0..59fc63feb0dc 100644 --- a/arch/m68k/kernel/traps.c +++ b/arch/m68k/kernel/traps.c @@ -1131,7 +1131,7 @@ void die_if_kernel (char *str, struct pt_regs *fp, int nr) pr_crit("%s: %08x\n", str, nr); show_registers(fp); add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } asmlinkage void set_esp0(unsigned long ssp) diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index ef46e77e97a5..fcb3a0d8421c 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -48,7 +48,7 @@ int send_fault_sig(struct pt_regs *regs) pr_alert("Unable to handle kernel access"); pr_cont(" at virtual address %p\n", addr); die_if_kernel("Oops", regs, 0 /*error_code*/); - do_exit(SIGKILL); + make_task_dead(SIGKILL); } return 1; diff --git a/arch/microblaze/kernel/exceptions.c b/arch/microblaze/kernel/exceptions.c index 908788497b28..fd153d5fab98 100644 --- a/arch/microblaze/kernel/exceptions.c +++ b/arch/microblaze/kernel/exceptions.c @@ -44,10 +44,10 @@ void die(const char *str, struct pt_regs *fp, long err) pr_warn("Oops: %s, sig: %ld\n", str, err); show_regs(fp); spin_unlock_irq(&die_lock); - /* do_exit() should take care of panic'ing from an interrupt + /* make_task_dead() should take care of panic'ing from an interrupt * context so we don't handle it here */ - do_exit(err); + make_task_dead(err); } /* for user application debugging */ diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index d26b0fb8ea06..a486486b2355 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -422,7 +422,7 @@ void __noreturn die(const char *str, struct pt_regs *regs) if (regs && kexec_should_crash(current)) crash_kexec(regs); - do_exit(sig); + make_task_dead(sig); } extern struct exception_table_entry __start___dbe_table[]; diff --git a/arch/nds32/kernel/fpu.c b/arch/nds32/kernel/fpu.c index 9edd7ed7d7bf..701c09a668de 100644 --- a/arch/nds32/kernel/fpu.c +++ b/arch/nds32/kernel/fpu.c @@ -223,7 +223,7 @@ inline void handle_fpu_exception(struct pt_regs *regs) } } else if (fpcsr & FPCSR_mskRIT) { if (!user_mode(regs)) - do_exit(SIGILL); + make_task_dead(SIGILL); si_signo = SIGILL; } diff --git a/arch/nds32/kernel/traps.c b/arch/nds32/kernel/traps.c index ca75d475eda4..c0a8f3344fb9 100644 --- a/arch/nds32/kernel/traps.c +++ b/arch/nds32/kernel/traps.c @@ -141,7 +141,7 @@ void __noreturn die(const char *str, struct pt_regs *regs, int err) bust_spinlocks(0); spin_unlock_irq(&die_lock); - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } EXPORT_SYMBOL(die); @@ -240,7 +240,7 @@ void unhandled_interruption(struct pt_regs *regs) pr_emerg("unhandled_interruption\n"); show_regs(regs); if (!user_mode(regs)) - do_exit(SIGKILL); + make_task_dead(SIGKILL); force_sig(SIGKILL); } @@ -251,7 +251,7 @@ void unhandled_exceptions(unsigned long entry, unsigned long addr, addr, type); show_regs(regs); if (!user_mode(regs)) - do_exit(SIGKILL); + make_task_dead(SIGKILL); force_sig(SIGKILL); } @@ -278,7 +278,7 @@ void do_revinsn(struct pt_regs *regs) pr_emerg("Reserved Instruction\n"); show_regs(regs); if (!user_mode(regs)) - do_exit(SIGILL); + make_task_dead(SIGILL); force_sig(SIGILL); } diff --git a/arch/nios2/kernel/traps.c b/arch/nios2/kernel/traps.c index 596986a74a26..85ac49d64cf7 100644 --- a/arch/nios2/kernel/traps.c +++ b/arch/nios2/kernel/traps.c @@ -37,10 +37,10 @@ void die(const char *str, struct pt_regs *regs, long err) show_regs(regs); spin_unlock_irq(&die_lock); /* - * do_exit() should take care of panic'ing from an interrupt + * make_task_dead() should take care of panic'ing from an interrupt * context so we don't handle it here */ - do_exit(err); + make_task_dead(err); } void _exception(int signo, struct pt_regs *regs, int code, unsigned long addr) diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 0898cb159fac..0446a3c34372 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -212,7 +212,7 @@ void __noreturn die(const char *str, struct pt_regs *regs, long err) __asm__ __volatile__("l.nop 1"); do {} while (1); #endif - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } /* This is normally the 'Oops' routine */ diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index b11fb26ce299..df2122c50d78 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -269,7 +269,7 @@ void die_if_kernel(char *str, struct pt_regs *regs, long err) panic("Fatal exception"); oops_exit(); - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } /* gdb uses break 4,8 */ diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 11741703d26e..a08bb7cefdc5 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -245,7 +245,7 @@ static void oops_end(unsigned long flags, struct pt_regs *regs, if (panic_on_oops) panic("Fatal exception"); - do_exit(signr); + make_task_dead(signr); } NOKPROBE_SYMBOL(oops_end); @@ -792,9 +792,9 @@ int machine_check_generic(struct pt_regs *regs) void die_mce(const char *str, struct pt_regs *regs, long err) { /* - * The machine check wants to kill the interrupted context, but - * do_exit() checks for in_interrupt() and panics in that case, so - * exit the irq/nmi before calling die. + * The machine check wants to kill the interrupted context, + * but make_task_dead() checks for in_interrupt() and panics + * in that case, so exit the irq/nmi before calling die. */ if (in_nmi()) nmi_exit(); diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 0daaa3e4630d..fe92e119e6a3 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -54,7 +54,7 @@ void die(struct pt_regs *regs, const char *str) if (panic_on_oops) panic("Fatal exception"); if (ret != NOTIFY_STOP) - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } void do_trap(struct pt_regs *regs, int signo, int code, unsigned long addr) diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index aa08dd2f8fae..42118bc728f9 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -31,7 +31,7 @@ static void die_kernel_fault(const char *msg, unsigned long addr, bust_spinlocks(0); die(regs, "Oops"); - do_exit(SIGKILL); + make_task_dead(SIGKILL); } static inline void no_context(struct pt_regs *regs, unsigned long addr) diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index 0681c55e831d..1e3233eb510a 100644 --- a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -224,5 +224,5 @@ void __noreturn die(struct pt_regs *regs, const char *str) if (panic_on_oops) panic("Fatal exception: panic_on_oops"); oops_exit(); - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 20f8e1868853..a4d8c058dd27 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -175,7 +175,7 @@ void __s390_handle_mcck(void) "malfunction (code 0x%016lx).\n", mcck.mcck_code); printk(KERN_EMERG "mcck: task: %s, pid: %d.\n", current->comm, current->pid); - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } } diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index cbe3201d4f21..01884054aeb2 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -57,7 +57,7 @@ void __noreturn die(const char *str, struct pt_regs *regs, long err) if (panic_on_oops) panic("Fatal exception"); - do_exit(SIGSEGV); + make_task_dead(SIGSEGV); } void die_if_kernel(const char *str, struct pt_regs *regs, long err) diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c index 5630e5a395e0..179aabfa712e 100644 --- a/arch/sparc/kernel/traps_32.c +++ b/arch/sparc/kernel/traps_32.c @@ -86,9 +86,7 @@ void __noreturn die_if_kernel(char *str, struct pt_regs *regs) } printk("Instruction DUMP:"); instruction_dump ((unsigned long *) regs->pc); - if(regs->psr & PSR_PS) - do_exit(SIGKILL); - do_exit(SIGSEGV); + make_task_dead((regs->psr & PSR_PS) ? SIGKILL : SIGSEGV); } void do_hw_interrupt(struct pt_regs *regs, unsigned long type) diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 6863025ed56d..21077821f427 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -2559,9 +2559,7 @@ void __noreturn die_if_kernel(char *str, struct pt_regs *regs) } if (panic_on_oops) panic("Fatal exception"); - if (regs->tstate & TSTATE_PRIV) - do_exit(SIGKILL); - do_exit(SIGSEGV); + make_task_dead((regs->tstate & TSTATE_PRIV)? SIGKILL : SIGSEGV); } EXPORT_SYMBOL(die_if_kernel); diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index ccb9d32768f3..7738fad6a85e 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1248,14 +1248,14 @@ SYM_CODE_START(asm_exc_nmi) SYM_CODE_END(asm_exc_nmi) .pushsection .text, "ax" -SYM_CODE_START(rewind_stack_do_exit) +SYM_CODE_START(rewind_stack_and_make_dead) /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp movl PER_CPU_VAR(cpu_current_top_of_stack), %esi leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp - call do_exit + call make_task_dead 1: jmp 1b -SYM_CODE_END(rewind_stack_do_exit) +SYM_CODE_END(rewind_stack_and_make_dead) .popsection diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e38a4cf795d9..f09276457942 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1429,7 +1429,7 @@ SYM_CODE_END(ignore_sysret) #endif .pushsection .text, "ax" -SYM_CODE_START(rewind_stack_do_exit) +SYM_CODE_START(rewind_stack_and_make_dead) UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp @@ -1438,6 +1438,6 @@ SYM_CODE_START(rewind_stack_do_exit) leaq -PTREGS_SIZE(%rax), %rsp UNWIND_HINT_REGS - call do_exit -SYM_CODE_END(rewind_stack_do_exit) + call make_task_dead +SYM_CODE_END(rewind_stack_and_make_dead) .popsection diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ea4fe192189d..53de044e5654 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -351,7 +351,7 @@ unsigned long oops_begin(void) } NOKPROBE_SYMBOL(oops_begin); -void __noreturn rewind_stack_do_exit(int signr); +void __noreturn rewind_stack_and_make_dead(int signr); void oops_end(unsigned long flags, struct pt_regs *regs, int signr) { @@ -386,7 +386,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) * reuse the task stack and that existing poisons are invalid. */ kasan_unpoison_task_stack(current); - rewind_stack_do_exit(signr); + rewind_stack_and_make_dead(signr); } NOKPROBE_SYMBOL(oops_end); diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 4b4dbeb2d612..9345007d474d 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -552,5 +552,5 @@ void __noreturn die(const char * str, struct pt_regs * regs, long err) if (panic_on_oops) panic("Fatal exception"); - do_exit(err); + make_task_dead(err); } diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ba88a6987400..2d4bbd9c3278 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -59,6 +59,7 @@ extern void sched_post_fork(struct task_struct *p, extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); +void __noreturn make_task_dead(int signr); extern void proc_caches_init(void); diff --git a/kernel/exit.c b/kernel/exit.c index f702a6a63686..bfa513c5b227 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -884,6 +884,15 @@ void __noreturn do_exit(long code) } EXPORT_SYMBOL_GPL(do_exit); +void __noreturn make_task_dead(int signr) +{ + /* + * Take the task off the cpu after something catastrophic has + * happened. + */ + do_exit(signr); +} + void complete_and_exit(struct completion *comp, long code) { if (comp) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 21735829b860..e6ab5687770b 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -168,6 +168,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, "panic", "do_exit", "do_task_dead", + "make_task_dead", "__module_put_and_exit", "complete_and_exit", "__reiserfs_panic", @@ -175,7 +176,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, "fortify_panic", "usercopy_abort", "machine_real_restart", - "rewind_stack_do_exit", + "rewind_stack_and_make_dead" "kunit_try_catch_throw", "xen_start_kernel", "cpu_bringup_and_idle", -- cgit v1.2.3 From 05ea0424f0e21c0ef9b47c89826e7c22ae137975 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 Nov 2021 09:33:00 -0600 Subject: exit: Move oops specific logic from do_exit into make_task_dead The beginning of do_exit has become cluttered and difficult to read as it is filled with checks to handle things that can only happen when the kernel is operating improperly. Now that we have a dedicated function for cleaning up a task when the kernel is operating improperly move the checks there. Signed-off-by: "Eric W. Biederman" --- kernel/exit.c | 78 ++++++++++++++++++++++++++--------------------------- kernel/futex/core.c | 2 +- kernel/kexec_core.c | 2 +- 3 files changed, 41 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index bfa513c5b227..d0ec6f6b41cb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -735,36 +735,8 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; - /* - * We can get here from a kernel oops, sometimes with preemption off. - * Start by checking for critical errors. - * Then fix up important state like USER_DS and preemption. - * Then do everything else. - */ - WARN_ON(blk_needs_flush_plug(tsk)); - if (unlikely(in_interrupt())) - panic("Aiee, killing interrupt handler!"); - if (unlikely(!tsk->pid)) - panic("Attempted to kill the idle task!"); - - /* - * If do_exit is called because this processes oopsed, it's possible - * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before - * continuing. Amongst other possible reasons, this is to prevent - * mm_release()->clear_child_tid() from writing to a user-controlled - * kernel address. - */ - force_uaccess_begin(); - - if (unlikely(in_atomic())) { - pr_info("note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - preempt_count_set(PREEMPT_ENABLED); - } - profile_task_exit(tsk); kcov_task_exit(tsk); @@ -773,17 +745,6 @@ void __noreturn do_exit(long code) validate_creds_for_do_exit(tsk); - /* - * We're taking recursive faults here in do_exit. Safest is to just - * leave this task alone and wait for reboot. - */ - if (unlikely(tsk->flags & PF_EXITING)) { - pr_alert("Fixing recursive fault but reboot is needed!\n"); - futex_exit_recursive(tsk); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - } - io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ @@ -889,7 +850,46 @@ void __noreturn make_task_dead(int signr) /* * Take the task off the cpu after something catastrophic has * happened. + * + * We can get here from a kernel oops, sometimes with preemption off. + * Start by checking for critical errors. + * Then fix up important state like USER_DS and preemption. + * Then do everything else. */ + struct task_struct *tsk = current; + + if (unlikely(in_interrupt())) + panic("Aiee, killing interrupt handler!"); + if (unlikely(!tsk->pid)) + panic("Attempted to kill the idle task!"); + + /* + * If make_task_dead is called because this processes oopsed, it's possible + * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before + * continuing. Amongst other possible reasons, this is to prevent + * mm_release()->clear_child_tid() from writing to a user-controlled + * kernel address. + */ + force_uaccess_begin(); + + if (unlikely(in_atomic())) { + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } + + /* + * We're taking recursive faults here in make_task_dead. Safest is to just + * leave this task alone and wait for reboot. + */ + if (unlikely(tsk->flags & PF_EXITING)) { + pr_alert("Fixing recursive fault but reboot is needed!\n"); + futex_exit_recursive(tsk); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + } + do_exit(signr); } diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 25d8a88b32e5..39a1522865b5 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -1044,7 +1044,7 @@ static void futex_cleanup(struct task_struct *tsk) * actually finished the futex cleanup. The worst case for this is that the * waiter runs through the wait loop until the state becomes visible. * - * This is called from the recursive fault handling path in do_exit(). + * This is called from the recursive fault handling path in make_task_dead(). * * This is best effort. Either the futex exit code has run already or * not. If the OWNER_DIED bit has been set on the futex then the waiter can diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 5a5d192a89ac..68480f731192 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -81,7 +81,7 @@ int kexec_should_crash(struct task_struct *p) if (crash_kexec_post_notifiers) return 0; /* - * There are 4 panic() calls in do_exit() path, each of which + * There are 4 panic() calls in make_task_dead() path, each of which * corresponds to each of these 4 conditions. */ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) -- cgit v1.2.3 From 7f80a2fd7db9a55894fd841915236aca611291b5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 Nov 2021 09:51:03 -0600 Subject: exit: Stop poorly open coding do_task_dead in make_task_dead When the kernel detects it is oops or otherwise force killing a task while it exits the code poorly attempts to permanently stop the task from scheduling. I say poorly because it is possible for a task in TASK_UINTERRUPTIBLE to be woken up. As it makes no sense for the task to continue call do_task_dead instead which actually does the work and permanently removes the task from the scheduler. Guaranteeing the task will never be woken up again. Signed-off-by: "Eric W. Biederman" --- kernel/exit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index d0ec6f6b41cb..f975cd8a2ed8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -886,8 +886,7 @@ void __noreturn make_task_dead(int signr) if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); futex_exit_recursive(tsk); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); + do_task_dead(); } do_exit(signr); -- cgit v1.2.3 From eb55e716ac1aa0de13ef5abbf1479d995582d967 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 Nov 2021 10:01:59 -0600 Subject: exit: Stop exporting do_exit Now that there are no more modular uses of do_exit remove the EXPORT_SYMBOL. Suggested-by: Christoph Hellwig Signed-off-by: "Eric W. Biederman" --- kernel/exit.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f975cd8a2ed8..57afac845a0a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -843,7 +843,6 @@ void __noreturn do_exit(long code) lockdep_free_task(tsk); do_task_dead(); } -EXPORT_SYMBOL_GPL(do_exit); void __noreturn make_task_dead(int signr) { -- cgit v1.2.3 From bbda86e988d4c124e4cfa816291cbd583ae8bfb1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 Nov 2021 10:27:36 -0600 Subject: exit: Implement kthread_exit The way the per task_struct exit_code is used by kernel threads is not quite compatible how it is used by userspace applications. The low byte of the userspace exit_code value encodes the exit signal. While kthreads just use the value as an int holding ordinary kernel function exit status like -EPERM. Add kthread_exit to clearly separate the two kinds of uses. Signed-off-by: "Eric W. Biederman" --- include/linux/kthread.h | 1 + kernel/kthread.c | 23 +++++++++++++++++++---- tools/objtool/check.c | 1 + 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 346b0f269161..22c43d419687 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -70,6 +70,7 @@ void *kthread_probe_data(struct task_struct *k); int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); +void kthread_exit(long result) __noreturn; int kthreadd(void *unused); extern struct task_struct *kthreadd_task; diff --git a/kernel/kthread.c b/kernel/kthread.c index 7113003fab63..77b7c3f23f18 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -268,6 +268,21 @@ void kthread_parkme(void) } EXPORT_SYMBOL_GPL(kthread_parkme); +/** + * kthread_exit - Cause the current kthread return @result to kthread_stop(). + * @result: The integer value to return to kthread_stop(). + * + * While kthread_exit can be called directly, it exists so that + * functions which do some additional work in non-modular code such as + * module_put_and_kthread_exit can be implemented. + * + * Does not return. + */ +void __noreturn kthread_exit(long result) +{ + do_exit(result); +} + static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; @@ -286,13 +301,13 @@ static int kthread(void *_create) done = xchg(&create->done, NULL); if (!done) { kfree(create); - do_exit(-EINTR); + kthread_exit(-EINTR); } if (!self) { create->result = ERR_PTR(-ENOMEM); complete(done); - do_exit(-ENOMEM); + kthread_exit(-ENOMEM); } self->threadfn = threadfn; @@ -326,7 +341,7 @@ static int kthread(void *_create) __kthread_parkme(self); ret = threadfn(data); } - do_exit(ret); + kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ @@ -627,7 +642,7 @@ EXPORT_SYMBOL_GPL(kthread_park); * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * - * If threadfn() may call do_exit() itself, the caller must ensure + * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e6ab5687770b..90108fe5610d 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -168,6 +168,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, "panic", "do_exit", "do_task_dead", + "kthread_exit", "make_task_dead", "__module_put_and_exit", "complete_and_exit", -- cgit v1.2.3 From ca3574bd653aba234a4b31955f2778947403be16 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 3 Dec 2021 11:00:19 -0600 Subject: exit: Rename module_put_and_exit to module_put_and_kthread_exit Update module_put_and_exit to call kthread_exit instead of do_exit. Change the name to reflect this change in functionality. All of the users of module_put_and_exit are causing the current kthread to exit so this change makes it clear what is happening. There is no functional change. Signed-off-by: "Eric W. Biederman" --- crypto/algboss.c | 4 ++-- fs/cifs/connect.c | 2 +- fs/nfs/callback.c | 4 ++-- fs/nfs/nfs4state.c | 2 +- fs/nfsd/nfssvc.c | 2 +- include/linux/module.h | 6 +++--- kernel/module.c | 6 +++--- net/bluetooth/bnep/core.c | 2 +- net/bluetooth/cmtp/core.c | 2 +- net/bluetooth/hidp/core.c | 2 +- tools/objtool/check.c | 2 +- 11 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/crypto/algboss.c b/crypto/algboss.c index 1814d2c5188a..eb5fe84efb83 100644 --- a/crypto/algboss.c +++ b/crypto/algboss.c @@ -67,7 +67,7 @@ out: complete_all(¶m->larval->completion); crypto_alg_put(¶m->larval->alg); kfree(param); - module_put_and_exit(0); + module_put_and_kthread_exit(0); } static int cryptomgr_schedule_probe(struct crypto_larval *larval) @@ -190,7 +190,7 @@ skiptest: crypto_alg_tested(param->driver, err); kfree(param); - module_put_and_exit(0); + module_put_and_kthread_exit(0); } static int cryptomgr_schedule_test(struct crypto_alg *alg) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 82577a7a5bb1..39fbe9acbf51 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1139,7 +1139,7 @@ next_pdu: } memalloc_noreclaim_restore(noreclaim_flag); - module_put_and_exit(0); + module_put_and_kthread_exit(0); } /* diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 86d856de1389..3c86a559a321 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -93,7 +93,7 @@ nfs4_callback_svc(void *vrqstp) svc_process(rqstp); } svc_exit_thread(rqstp); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } @@ -137,7 +137,7 @@ nfs41_callback_svc(void *vrqstp) } } svc_exit_thread(rqstp); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ecc4594299d6..ea41af731978 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2689,6 +2689,6 @@ static int nfs4_run_state_manager(void *ptr) allow_signal(SIGKILL); nfs4_state_manager(clp); nfs_put_client(clp); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 80431921e5d7..5ce9f14318c4 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -986,7 +986,7 @@ out: /* Release module */ mutex_unlock(&nfsd_mutex); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } diff --git a/include/linux/module.h b/include/linux/module.h index c9f1200b2312..f03be97e9ec1 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -595,9 +595,9 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, /* Look for this name: can be of form module:name. */ unsigned long module_kallsyms_lookup_name(const char *name); -extern void __noreturn __module_put_and_exit(struct module *mod, +extern void __noreturn __module_put_and_kthread_exit(struct module *mod, long code); -#define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code) +#define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code) #ifdef CONFIG_MODULE_UNLOAD int module_refcount(struct module *mod); @@ -790,7 +790,7 @@ static inline int unregister_module_notifier(struct notifier_block *nb) return 0; } -#define module_put_and_exit(code) do_exit(code) +#define module_put_and_kthread_exit(code) kthread_exit(code) static inline void print_modules(void) { diff --git a/kernel/module.c b/kernel/module.c index 84a9141a5e15..a3aa00bf270d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -337,12 +337,12 @@ static inline void add_taint_module(struct module *mod, unsigned flag, * A thread that wants to hold a reference to a module only while it * is running can call this to safely exit. nfsd and lockd use this. */ -void __noreturn __module_put_and_exit(struct module *mod, long code) +void __noreturn __module_put_and_kthread_exit(struct module *mod, long code) { module_put(mod); - do_exit(code); + kthread_exit(code); } -EXPORT_SYMBOL(__module_put_and_exit); +EXPORT_SYMBOL(__module_put_and_kthread_exit); /* Find a module section: 0 means not found. */ static unsigned int find_sec(const struct load_info *info, const char *name) diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index c9add7753b9f..40baa6b7321a 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -535,7 +535,7 @@ static int bnep_session(void *arg) up_write(&bnep_session_sem); free_netdev(dev); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 0a2d78e811cf..9bfded6b74b3 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -323,7 +323,7 @@ static int cmtp_session(void *arg) up_write(&cmtp_session_sem); kfree(session); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 80848dfc01db..5940744a8cd8 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -1305,7 +1305,7 @@ static int hidp_session_thread(void *arg) l2cap_unregister_user(session->conn, &session->user); hidp_session_put(session); - module_put_and_exit(0); + module_put_and_kthread_exit(0); return 0; } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 90108fe5610d..120e9598c11a 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -170,7 +170,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, "do_task_dead", "kthread_exit", "make_task_dead", - "__module_put_and_exit", + "__module_put_and_kthread_exit", "complete_and_exit", "__reiserfs_panic", "lbug_with_loc", -- cgit v1.2.3 From cead18552660702a4a46f58e65188fe5f36e9dfe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 Nov 2021 11:15:19 -0600 Subject: exit: Rename complete_and_exit to kthread_complete_and_exit Update complete_and_exit to call kthread_exit instead of do_exit. Change the name to reflect this change in functionality. All of the users of complete_and_exit are causing the current kthread to exit so this change makes it clear what is happening. Move the implementation of kthread_complete_and_exit from kernel/exit.c to to kernel/kthread.c. As this function is kthread specific it makes most sense to live with the kthread functions. There are no functional change. Signed-off-by: "Eric W. Biederman" --- drivers/net/wireless/rsi/rsi_91x_coex.c | 2 +- drivers/net/wireless/rsi/rsi_91x_main.c | 2 +- drivers/net/wireless/rsi/rsi_91x_sdio_ops.c | 2 +- drivers/net/wireless/rsi/rsi_91x_usb_ops.c | 2 +- drivers/pnp/pnpbios/core.c | 6 +++--- drivers/staging/rts5208/rtsx.c | 16 ++++++++-------- drivers/usb/atm/usbatm.c | 2 +- drivers/usb/gadget/function/f_mass_storage.c | 2 +- fs/jffs2/background.c | 2 +- include/linux/kernel.h | 1 - include/linux/kthread.h | 1 + kernel/exit.c | 9 --------- kernel/kthread.c | 21 +++++++++++++++++++++ lib/kunit/try-catch.c | 4 ++-- tools/objtool/check.c | 2 +- 15 files changed, 43 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/drivers/net/wireless/rsi/rsi_91x_coex.c b/drivers/net/wireless/rsi/rsi_91x_coex.c index a0c5d02ae88c..8a3d86897ea8 100644 --- a/drivers/net/wireless/rsi/rsi_91x_coex.c +++ b/drivers/net/wireless/rsi/rsi_91x_coex.c @@ -63,7 +63,7 @@ static void rsi_coex_scheduler_thread(struct rsi_common *common) rsi_coex_sched_tx_pkts(coex_cb); } while (atomic_read(&coex_cb->coex_tx_thread.thread_done) == 0); - complete_and_exit(&coex_cb->coex_tx_thread.completion, 0); + kthread_complete_and_exit(&coex_cb->coex_tx_thread.completion, 0); } int rsi_coex_recv_pkt(struct rsi_common *common, u8 *msg) diff --git a/drivers/net/wireless/rsi/rsi_91x_main.c b/drivers/net/wireless/rsi/rsi_91x_main.c index f1bf71e6c608..c7f5cec5e446 100644 --- a/drivers/net/wireless/rsi/rsi_91x_main.c +++ b/drivers/net/wireless/rsi/rsi_91x_main.c @@ -260,7 +260,7 @@ static void rsi_tx_scheduler_thread(struct rsi_common *common) if (common->init_done) rsi_core_qos_processor(common); } while (atomic_read(&common->tx_thread.thread_done) == 0); - complete_and_exit(&common->tx_thread.completion, 0); + kthread_complete_and_exit(&common->tx_thread.completion, 0); } #ifdef CONFIG_RSI_COEX diff --git a/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c b/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c index 8ace1874e5cb..b2b47a0abcbf 100644 --- a/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c +++ b/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c @@ -75,7 +75,7 @@ void rsi_sdio_rx_thread(struct rsi_common *common) rsi_dbg(INFO_ZONE, "%s: Terminated SDIO RX thread\n", __func__); atomic_inc(&sdev->rx_thread.thread_done); - complete_and_exit(&sdev->rx_thread.completion, 0); + kthread_complete_and_exit(&sdev->rx_thread.completion, 0); } /** diff --git a/drivers/net/wireless/rsi/rsi_91x_usb_ops.c b/drivers/net/wireless/rsi/rsi_91x_usb_ops.c index 4ffcdde1acb1..5130b0e72adc 100644 --- a/drivers/net/wireless/rsi/rsi_91x_usb_ops.c +++ b/drivers/net/wireless/rsi/rsi_91x_usb_ops.c @@ -56,6 +56,6 @@ void rsi_usb_rx_thread(struct rsi_common *common) out: rsi_dbg(INFO_ZONE, "%s: Terminated thread\n", __func__); skb_queue_purge(&dev->rx_q); - complete_and_exit(&dev->rx_thread.completion, 0); + kthread_complete_and_exit(&dev->rx_thread.completion, 0); } diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index 669ef4700c1a..f7e86ae9f72f 100644 --- a/drivers/pnp/pnpbios/core.c +++ b/drivers/pnp/pnpbios/core.c @@ -160,7 +160,7 @@ static int pnp_dock_thread(void *unused) * No dock to manage */ case PNP_FUNCTION_NOT_SUPPORTED: - complete_and_exit(&unload_sem, 0); + kthread_complete_and_exit(&unload_sem, 0); case PNP_SYSTEM_NOT_DOCKED: d = 0; break; @@ -170,7 +170,7 @@ static int pnp_dock_thread(void *unused) default: pnpbios_print_status("pnp_dock_thread", status); printk(KERN_WARNING "PnPBIOS: disabling dock monitoring.\n"); - complete_and_exit(&unload_sem, 0); + kthread_complete_and_exit(&unload_sem, 0); } if (d != docked) { if (pnp_dock_event(d, &now) == 0) { @@ -183,7 +183,7 @@ static int pnp_dock_thread(void *unused) } } } - complete_and_exit(&unload_sem, 0); + kthread_complete_and_exit(&unload_sem, 0); } static int pnpbios_get_resources(struct pnp_dev *dev) diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c index 91fcf85e150a..5a58dac76c88 100644 --- a/drivers/staging/rts5208/rtsx.c +++ b/drivers/staging/rts5208/rtsx.c @@ -450,13 +450,13 @@ skip_for_abort: * after the down() -- that's necessary for the thread-shutdown * case. * - * complete_and_exit() goes even further than this -- it is safe in - * the case that the thread of the caller is going away (not just - * the structure) -- this is necessary for the module-remove case. - * This is important in preemption kernels, which transfer the flow - * of execution immediately upon a complete(). + * kthread_complete_and_exit() goes even further than this -- + * it is safe in the case that the thread of the caller is going away + * (not just the structure) -- this is necessary for the module-remove + * case. This is important in preemption kernels, which transfer the + * flow of execution immediately upon a complete(). */ - complete_and_exit(&dev->control_exit, 0); + kthread_complete_and_exit(&dev->control_exit, 0); } static int rtsx_polling_thread(void *__dev) @@ -501,7 +501,7 @@ static int rtsx_polling_thread(void *__dev) mutex_unlock(&dev->dev_mutex); } - complete_and_exit(&dev->polling_exit, 0); + kthread_complete_and_exit(&dev->polling_exit, 0); } /* @@ -682,7 +682,7 @@ static int rtsx_scan_thread(void *__dev) /* Should we unbind if no devices were detected? */ } - complete_and_exit(&dev->scanning_done, 0); + kthread_complete_and_exit(&dev->scanning_done, 0); } static void rtsx_init_options(struct rtsx_chip *chip) diff --git a/drivers/usb/atm/usbatm.c b/drivers/usb/atm/usbatm.c index da17be1ef64e..e3a49d837609 100644 --- a/drivers/usb/atm/usbatm.c +++ b/drivers/usb/atm/usbatm.c @@ -969,7 +969,7 @@ static int usbatm_do_heavy_init(void *arg) instance->thread = NULL; mutex_unlock(&instance->serialize); - complete_and_exit(&instance->thread_exited, ret); + kthread_complete_and_exit(&instance->thread_exited, ret); } static int usbatm_heavy_init(struct usbatm_data *instance) diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index 752439690fda..46dd11dcb3a8 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -2547,7 +2547,7 @@ static int fsg_main_thread(void *common_) up_write(&common->filesem); /* Let fsg_unbind() know the thread has exited */ - complete_and_exit(&common->thread_notifier, 0); + kthread_complete_and_exit(&common->thread_notifier, 0); } diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 2b4d5013dc5d..6da92ecaf66d 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -161,5 +161,5 @@ static int jffs2_garbage_collect_thread(void *_c) spin_lock(&c->erase_completion_lock); c->gc_task = NULL; spin_unlock(&c->erase_completion_lock); - complete_and_exit(&c->gc_thread_exit, 0); + kthread_complete_and_exit(&c->gc_thread_exit, 0); } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 77755ac3e189..055eb203c00e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -187,7 +187,6 @@ static inline void might_fault(void) { } #endif void do_exit(long error_code) __noreturn; -void complete_and_exit(struct completion *, long) __noreturn; extern int num_to_str(char *buf, int size, unsigned long long num, unsigned int width); diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 22c43d419687..d86a7e3b9a52 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -71,6 +71,7 @@ int kthread_park(struct task_struct *k); void kthread_unpark(struct task_struct *k); void kthread_parkme(void); void kthread_exit(long result) __noreturn; +void kthread_complete_and_exit(struct completion *, long) __noreturn; int kthreadd(void *unused); extern struct task_struct *kthreadd_task; diff --git a/kernel/exit.c b/kernel/exit.c index 57afac845a0a..6c4b04531f17 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -891,15 +891,6 @@ void __noreturn make_task_dead(int signr) do_exit(signr); } -void complete_and_exit(struct completion *comp, long code) -{ - if (comp) - complete(comp); - - do_exit(code); -} -EXPORT_SYMBOL(complete_and_exit); - SYSCALL_DEFINE1(exit, int, error_code) { do_exit((error_code&0xff)<<8); diff --git a/kernel/kthread.c b/kernel/kthread.c index 77b7c3f23f18..4388d6694a7f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -283,6 +283,27 @@ void __noreturn kthread_exit(long result) do_exit(result); } +/** + * kthread_complete_and exit - Exit the current kthread. + * @comp: Completion to complete + * @code: The integer value to return to kthread_stop(). + * + * If present complete @comp and the reuturn code to kthread_stop(). + * + * A kernel thread whose module may be removed after the completion of + * @comp can use this function exit safely. + * + * Does not return. + */ +void __noreturn kthread_complete_and_exit(struct completion *comp, long code) +{ + if (comp) + complete(comp); + + kthread_exit(code); +} +EXPORT_SYMBOL(kthread_complete_and_exit); + static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; diff --git a/lib/kunit/try-catch.c b/lib/kunit/try-catch.c index 0dd434e40487..be38a2c5ecc2 100644 --- a/lib/kunit/try-catch.c +++ b/lib/kunit/try-catch.c @@ -17,7 +17,7 @@ void __noreturn kunit_try_catch_throw(struct kunit_try_catch *try_catch) { try_catch->try_result = -EFAULT; - complete_and_exit(try_catch->try_completion, -EFAULT); + kthread_complete_and_exit(try_catch->try_completion, -EFAULT); } EXPORT_SYMBOL_GPL(kunit_try_catch_throw); @@ -27,7 +27,7 @@ static int kunit_generic_run_threadfn_adapter(void *data) try_catch->try(try_catch->context); - complete_and_exit(try_catch->try_completion, 0); + kthread_complete_and_exit(try_catch->try_completion, 0); } static unsigned long kunit_test_timeout(void) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 120e9598c11a..282273a1ffa5 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -171,7 +171,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, "kthread_exit", "make_task_dead", "__module_put_and_kthread_exit", - "complete_and_exit", + "kthread_complete_and_exit", "__reiserfs_panic", "lbug_with_loc", "fortify_panic", -- cgit v1.2.3 From 40966e316f86b8cfd83abd31ccb4df729309d3e7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 2 Dec 2021 09:56:14 -0600 Subject: kthread: Ensure struct kthread is present for all kthreads Today the rules are a bit iffy and arbitrary about which kernel threads have struct kthread present. Both idle threads and thread started with create_kthread want struct kthread present so that is effectively all kernel threads. Make the rule that if PF_KTHREAD and the task is running then struct kthread is present. This will allow the kernel thread code to using tsk->exit_code with different semantics from ordinary processes. To make ensure that struct kthread is present for all kernel threads move it's allocation into copy_process. Add a deallocation of struct kthread in exec for processes that were kernel threads. Move the allocation of struct kthread for the initial thread earlier so that it is not repeated for each additional idle thread. Move the initialization of struct kthread into set_kthread_struct so that the structure is always and reliably initailized. Clear set_child_tid in free_kthread_struct to ensure the kthread struct is reliably freed during exec. The function free_kthread_struct does not need to clear vfork_done during exec as exec_mm_release called from exec_mmap has already cleared vfork_done. Signed-off-by: "Eric W. Biederman" --- fs/exec.c | 2 ++ include/linux/kthread.h | 2 +- kernel/fork.c | 4 ++++ kernel/kthread.c | 31 ++++++++++++++----------------- kernel/sched/core.c | 16 ++++++++-------- 5 files changed, 29 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 537d92c41105..59cac7c18178 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1307,6 +1307,8 @@ int begin_new_exec(struct linux_binprm * bprm) */ force_uaccess_begin(); + if (me->flags & PF_KTHREAD) + free_kthread_struct(me); me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); diff --git a/include/linux/kthread.h b/include/linux/kthread.h index d86a7e3b9a52..4f3433afb54b 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -33,7 +33,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), unsigned int cpu, const char *namefmt); -void set_kthread_struct(struct task_struct *p); +bool set_kthread_struct(struct task_struct *p); void kthread_set_per_cpu(struct task_struct *k, int cpu); bool kthread_is_per_cpu(struct task_struct *k); diff --git a/kernel/fork.c b/kernel/fork.c index 3244cc56b697..04fa3e5d97af 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2118,6 +2118,10 @@ static __latent_entropy struct task_struct *copy_process( p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); + if (p->flags & PF_KTHREAD) { + if (!set_kthread_struct(p)) + goto bad_fork_cleanup_threadgroup_lock; + } #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { diff --git a/kernel/kthread.c b/kernel/kthread.c index 4388d6694a7f..8e5f44bed027 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -93,20 +93,27 @@ static inline struct kthread *__to_kthread(struct task_struct *p) return kthread; } -void set_kthread_struct(struct task_struct *p) +bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; - if (__to_kthread(p)) - return; + if (WARN_ON_ONCE(to_kthread(p))) + return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); + if (!kthread) + return false; + + init_completion(&kthread->exited); + init_completion(&kthread->parked); + p->vfork_done = &kthread->exited; + /* * We abuse ->set_child_tid to avoid the new member and because it - * can't be wrongly copied by copy_process(). We also rely on fact - * that the caller can't exec, so PF_KTHREAD can't be cleared. + * can't be wrongly copied by copy_process(). */ p->set_child_tid = (__force void __user *)kthread; + return true; } void free_kthread_struct(struct task_struct *k) @@ -114,13 +121,13 @@ void free_kthread_struct(struct task_struct *k) struct kthread *kthread; /* - * Can be NULL if this kthread was created by kernel_thread() - * or if kmalloc() in kthread() failed. + * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread && kthread->blkcg_css); #endif + k->set_child_tid = (__force void __user *)NULL; kfree(kthread); } @@ -315,7 +322,6 @@ static int kthread(void *_create) struct kthread *self; int ret; - set_kthread_struct(current); self = to_kthread(current); /* If user was SIGKILLed, I release the structure. */ @@ -325,17 +331,8 @@ static int kthread(void *_create) kthread_exit(-EINTR); } - if (!self) { - create->result = ERR_PTR(-ENOMEM); - complete(done); - kthread_exit(-ENOMEM); - } - self->threadfn = threadfn; self->data = data; - init_completion(&self->exited); - init_completion(&self->parked); - current->vfork_done = &self->exited; /* * The new thread inherited kthreadd's priority and CPU mask. Reset diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3c9b0fda64ac..0404a8c572a1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8599,14 +8599,6 @@ void __init init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle); - /* - * The idle task doesn't need the kthread struct to function, but it - * is dressed up as a per-CPU kthread and thus needs to play the part - * if we want to avoid special-casing it in code that deals with per-CPU - * kthreads. - */ - set_kthread_struct(idle); - raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_rq_lock(rq); @@ -9427,6 +9419,14 @@ void __init sched_init(void) mmgrab(&init_mm); enter_lazy_tlb(&init_mm, current); + /* + * The idle task doesn't need the kthread struct to function, but it + * is dressed up as a per-CPU kthread and thus needs to play the part + * if we want to avoid special-casing it in code that deals with per-CPU + * kthreads. + */ + WARN_ON(set_kthread_struct(current)); + /* * Make us the idle thread. Technically, schedule() should not be * called from this thread, however somewhere below it might be, -- cgit v1.2.3 From 6b1248798eb6f6d5285db214299996ecc5dc1e6b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 3 Dec 2021 11:42:49 -0600 Subject: exit/kthread: Move the exit code for kernel threads into struct kthread The exit code of kernel threads has different semantics than the exit_code of userspace tasks. To avoid confusion and allow the userspace implementation to change as needed move the kernel thread exit code into struct kthread. Signed-off-by: "Eric W. Biederman" --- kernel/kthread.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 8e5f44bed027..9c6c532047c4 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -52,6 +52,7 @@ struct kthread_create_info struct kthread { unsigned long flags; unsigned int cpu; + int result; int (*threadfn)(void *); void *data; mm_segment_t oldfs; @@ -287,7 +288,9 @@ EXPORT_SYMBOL_GPL(kthread_parkme); */ void __noreturn kthread_exit(long result) { - do_exit(result); + struct kthread *kthread = to_kthread(current); + kthread->result = result; + do_exit(0); } /** @@ -679,7 +682,7 @@ int kthread_stop(struct task_struct *k) kthread_unpark(k); wake_up_process(k); wait_for_completion(&kthread->exited); - ret = k->exit_code; + ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); -- cgit v1.2.3 From 1f1562fcd04a485734e94390660e741c3be47867 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 5 Dec 2021 13:32:14 -0500 Subject: cgroup/cpuset: Don't let child cpusets restrict parent in default hierarchy In validate_change(), there is a check since v2.6.12 to make sure that each of the child cpusets must be a subset of a parent cpuset. IOW, it allows child cpusets to restrict what changes can be made to a parent's "cpuset.cpus". This actually violates one of the core principles of the default hierarchy where a cgroup higher up in the hierarchy should be able to change configuration however it sees fit as deligation breaks down otherwise. To address this issue, the check is now removed for the default hierarchy to free parent cpusets from being restricted by child cpusets. The check will still apply for legacy hierarchy. Suggested-by: Tejun Heo Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index d0e163a02099..0dd7d853ed17 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -616,19 +616,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) struct cpuset *c, *par; int ret; - rcu_read_lock(); - - /* Each of our child cpusets must be a subset of us */ - ret = -EBUSY; - cpuset_for_each_child(c, css, cur) - if (!is_cpuset_subset(c, trial)) - goto out; - - /* Remaining checks don't apply to root cpuset */ - ret = 0; + /* The checks don't apply to root cpuset */ if (cur == &top_cpuset) - goto out; + return 0; + rcu_read_lock(); par = parent_cs(cur); /* On legacy hierarchy, we must be a subset of our parent cpuset. */ -- cgit v1.2.3 From c8064e5b4adac5e1255cf4f3b374e75b5376e7ca Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 30 Nov 2021 11:08:07 +0100 Subject: bpf: Let bpf_warn_invalid_xdp_action() report more info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In non trivial scenarios, the action id alone is not sufficient to identify the program causing the warning. Before the previous patch, the generated stack-trace pointed out at least the involved device driver. Let's additionally include the program name and id, and the relevant device name. If the user needs additional infos, he can fetch them via a kernel probe, leveraging the arguments added here. Signed-off-by: Paolo Abeni Signed-off-by: Daniel Borkmann Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/ddb96bb975cbfddb1546cf5da60e77d5100b533c.1638189075.git.pabeni@redhat.com --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 2 +- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 2 +- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 2 +- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 2 +- drivers/net/ethernet/freescale/enetc/enetc.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 2 +- drivers/net/ethernet/intel/ice/ice_txrx.c | 2 +- drivers/net/ethernet/intel/ice/ice_xsk.c | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/igc/igc_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 2 +- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +- drivers/net/ethernet/marvell/mvneta.c | 2 +- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 +- drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 2 +- drivers/net/ethernet/microsoft/mana/mana_bpf.c | 2 +- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +- drivers/net/ethernet/qlogic/qede/qede_fp.c | 2 +- drivers/net/ethernet/sfc/rx.c | 2 +- drivers/net/ethernet/socionext/netsec.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- drivers/net/ethernet/ti/cpsw_priv.c | 2 +- drivers/net/hyperv/netvsc_bpf.c | 2 +- drivers/net/tun.c | 2 +- drivers/net/veth.c | 4 ++-- drivers/net/virtio_net.c | 4 ++-- drivers/net/xen-netfront.c | 2 +- include/linux/filter.h | 2 +- kernel/bpf/cpumap.c | 4 ++-- kernel/bpf/devmap.c | 4 ++-- net/core/dev.c | 2 +- net/core/filter.c | 6 +++--- 37 files changed, 43 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 7d5d885d85d5..3b46f1df5609 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -434,7 +434,7 @@ static int ena_xdp_execute(struct ena_ring *rx_ring, struct xdp_buff *xdp) xdp_stat = &rx_ring->rx_stats.xdp_pass; break; default: - bpf_warn_invalid_xdp_action(verdict); + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, verdict); xdp_stat = &rx_ring->rx_stats.xdp_invalid; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index c8083df5e0ab..52fad0fdeacf 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -195,7 +195,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons, *event |= BNXT_REDIRECT_EVENT; break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(bp->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(bp->dev, xdp_prog, act); diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index bb45d5df2856..30450efccad7 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -590,7 +590,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, nicvf_xdp_sq_append_pkt(nic, sq, (u64)xdp.data, dma_addr, len); return true; default: - bpf_warn_invalid_xdp_action(action); + bpf_warn_invalid_xdp_action(nic->netdev, prog, action); fallthrough; case XDP_ABORTED: trace_xdp_exception(nic->netdev, prog, action); diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index d6871437d951..c78883c3a2c8 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -2623,7 +2623,7 @@ static u32 dpaa_run_xdp(struct dpaa_priv *priv, struct qm_fd *fd, void *vaddr, } break; default: - bpf_warn_invalid_xdp_action(xdp_act); + bpf_warn_invalid_xdp_action(priv->net_dev, xdp_prog, xdp_act); fallthrough; case XDP_ABORTED: trace_xdp_exception(priv->net_dev, xdp_prog, xdp_act); diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 8e643567abce..d21ba70ef4a3 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -374,7 +374,7 @@ static u32 dpaa2_eth_run_xdp(struct dpaa2_eth_priv *priv, dpaa2_eth_xdp_enqueue(priv, ch, fd, vaddr, rx_fq->flowid); break; default: - bpf_warn_invalid_xdp_action(xdp_act); + bpf_warn_invalid_xdp_action(priv->net_dev, xdp_prog, xdp_act); fallthrough; case XDP_ABORTED: trace_xdp_exception(priv->net_dev, xdp_prog, xdp_act); diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 504e12554079..eacb41f86bdb 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1547,7 +1547,7 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, switch (xdp_act) { default: - bpf_warn_invalid_xdp_action(xdp_act); + bpf_warn_invalid_xdp_action(rx_ring->ndev, prog, xdp_act); fallthrough; case XDP_ABORTED: trace_xdp_exception(rx_ring->ndev, prog, xdp_act); diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 10a83e5385c7..b399ca649f09 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -2322,7 +2322,7 @@ static int i40e_run_xdp(struct i40e_ring *rx_ring, struct xdp_buff *xdp) result = I40E_XDP_REDIR; break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, act); fallthrough; case XDP_ABORTED: out_failure: diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index ea06e957393e..945b1bb9c6f4 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -176,7 +176,7 @@ static int i40e_run_xdp_zc(struct i40e_ring *rx_ring, struct xdp_buff *xdp) goto out_failure; break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, act); fallthrough; case XDP_ABORTED: out_failure: diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c index bc3ba19dc88f..56940bb908bc 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx.c @@ -561,7 +561,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, goto out_failure; return ICE_XDP_REDIR; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, act); fallthrough; case XDP_ABORTED: out_failure: diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index bb9a80847298..ef4b213881f3 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -483,7 +483,7 @@ ice_run_xdp_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, goto out_failure; break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, act); fallthrough; case XDP_ABORTED: out_failure: diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index dd208930fbe4..337a7197e96f 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -8422,7 +8422,7 @@ static struct sk_buff *igb_run_xdp(struct igb_adapter *adapter, result = IGB_XDP_REDIR; break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(adapter->netdev, xdp_prog, act); fallthrough; case XDP_ABORTED: out_failure: diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 142c57b7a451..7f2ce4b256d9 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2241,7 +2241,7 @@ static int __igc_xdp_run_prog(struct igc_adapter *adapter, return IGC_XDP_REDIRECT; break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(adapter->netdev, prog, act); fallthrough; case XDP_ABORTED: out_failure: diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 0f9f022260d7..265bc52aacf8 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2235,7 +2235,7 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter, result = IXGBE_XDP_REDIR; break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, act); fallthrough; case XDP_ABORTED: out_failure: diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index db2bc58dfcfd..b3fd8e5cd85b 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -131,7 +131,7 @@ static int ixgbe_run_xdp_zc(struct ixgbe_adapter *adapter, goto out_failure; break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, act); fallthrough; case XDP_ABORTED: out_failure: diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index b1dfbaff8b31..2459ecf65125 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -1070,7 +1070,7 @@ static struct sk_buff *ixgbevf_run_xdp(struct ixgbevf_adapter *adapter, goto out_failure; break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, act); fallthrough; case XDP_ABORTED: out_failure: diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index ce810fc3c1a2..23cc4b874285 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2239,7 +2239,7 @@ mvneta_run_xdp(struct mvneta_port *pp, struct mvneta_rx_queue *rxq, mvneta_xdp_put_buff(pp, rxq, xdp, sinfo, sync); break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(pp->dev, prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(pp->dev, prog, act); diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 8e5820d12362..7273a4c9dbb1 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -3823,7 +3823,7 @@ mvpp2_run_xdp(struct mvpp2_port *port, struct bpf_prog *prog, } break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(port->dev, prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(port->dev, prog, act); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index 0cc6353254bf..7c4068c5d1ac 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -1198,7 +1198,7 @@ static bool otx2_xdp_rcv_pkt_handler(struct otx2_nic *pfvf, put_page(page); break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(pfvf->netdev, prog, act); break; case XDP_ABORTED: trace_xdp_exception(pfvf->netdev, prog, act); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 650e6a1844ae..8cfc649f226b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -812,7 +812,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud trace_xdp_exception(dev, xdp_prog, act); goto xdp_drop_no_cnt; /* Drop on xmit failure */ default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dev, xdp_prog, act); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index 2f0df5cc1a2d..338d65e2c9ce 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -151,7 +151,7 @@ bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct mlx5e_dma_info *di, rq->stats->xdp_redirect++; return true; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(rq->netdev, prog, act); fallthrough; case XDP_ABORTED: xdp_abort: diff --git a/drivers/net/ethernet/microsoft/mana/mana_bpf.c b/drivers/net/ethernet/microsoft/mana/mana_bpf.c index 1bc8ff388341..1d2f948b5c00 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_bpf.c +++ b/drivers/net/ethernet/microsoft/mana/mana_bpf.c @@ -60,7 +60,7 @@ u32 mana_run_xdp(struct net_device *ndev, struct mana_rxq *rxq, break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(ndev, prog, act); } out: diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 6e38da4ad554..79257ec41987 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1944,7 +1944,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget) xdp_prog, act); continue; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(dp->netdev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dp->netdev, xdp_prog, act); diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c index 5ea9cb4311a1..b242000a77fd 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_fp.c +++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c @@ -1153,7 +1153,7 @@ static bool qede_rx_xdp(struct qede_dev *edev, qede_rx_bd_ring_consume(rxq); break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(edev->ndev, prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(edev->ndev, prog, act); diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 606750938b89..2375cef577e4 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -338,7 +338,7 @@ static bool efx_do_xdp(struct efx_nic *efx, struct efx_channel *channel, break; default: - bpf_warn_invalid_xdp_action(xdp_act); + bpf_warn_invalid_xdp_action(efx->net_dev, xdp_prog, xdp_act); efx_free_rx_buffers(rx_queue, rx_buf, 1); channel->n_rx_xdp_bad_drops++; trace_xdp_exception(efx->net_dev, xdp_prog, xdp_act); diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index de7d8bf2c226..25dcd8eda5fc 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -933,7 +933,7 @@ static u32 netsec_run_xdp(struct netsec_priv *priv, struct bpf_prog *prog, } break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(priv->ndev, prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(priv->ndev, prog, act); diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 4e05c1d92935..7f62ddc90088 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -4714,7 +4714,7 @@ static int __stmmac_xdp_run_prog(struct stmmac_priv *priv, res = STMMAC_XDP_REDIRECT; break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(priv->dev, prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(priv->dev, prog, act); diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index c99dd9735087..67c4009fd16c 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -1366,7 +1366,7 @@ int cpsw_run_xdp(struct cpsw_priv *priv, int ch, struct xdp_buff *xdp, xdp_do_flush_map(); break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(ndev, prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(ndev, prog, act); diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index aa877da113f8..7856905414eb 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -68,7 +68,7 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(ndev, prog, act); } out: diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 1572878c3403..0e5d2272f63a 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1551,7 +1551,7 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, case XDP_PASS: break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(tun->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(tun->dev, xdp_prog, act); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 38f6da24f460..700cc9374f9c 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -644,7 +644,7 @@ static struct xdp_frame *veth_xdp_rcv_one(struct veth_rq *rq, rcu_read_unlock(); goto xdp_xmit; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(rq->dev, xdp_prog, act); @@ -794,7 +794,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, rcu_read_unlock(); goto xdp_xmit; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(rq->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(rq->dev, xdp_prog, act); diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 088d9404c93d..d3e0b3a533cd 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -812,7 +812,7 @@ static struct sk_buff *receive_small(struct net_device *dev, rcu_read_unlock(); goto xdp_xmit; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(vi->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(vi->dev, xdp_prog, act); @@ -1025,7 +1025,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, rcu_read_unlock(); goto xdp_xmit; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(vi->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(vi->dev, xdp_prog, act); diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 911f43986a8c..7b7eb617051a 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -930,7 +930,7 @@ static u32 xennet_run_xdp(struct netfront_queue *queue, struct page *pdata, break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(queue->info->netdev, prog, act); } return act; diff --git a/include/linux/filter.h b/include/linux/filter.h index 68c6b5c208e7..60eec80fa1d4 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1027,7 +1027,7 @@ void xdp_do_flush(void); */ #define xdp_do_flush_map xdp_do_flush -void bpf_warn_invalid_xdp_action(u32 act); +void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 585b2b77ccc4..0421061d95f1 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -195,7 +195,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, } return; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(skb->dev, rcpu->prog, act); @@ -254,7 +254,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); fallthrough; case XDP_DROP: xdp_return_frame(xdpf); diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f02d04540c0c..6feea293ff10 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -348,7 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, frames[nframes++] = xdpf; break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dev, xdp_prog, act); @@ -507,7 +507,7 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev __skb_push(skb, skb->mac_len); break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dst->dev, dst->xdp_prog, act); diff --git a/net/core/dev.c b/net/core/dev.c index 4420086f3aeb..c431c8925eed 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4708,7 +4708,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, case XDP_PASS: break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(skb->dev, xdp_prog, act); diff --git a/net/core/filter.c b/net/core/filter.c index ad8619aa77b7..3f656391af7e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8180,13 +8180,13 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } -void bpf_warn_invalid_xdp_action(u32 act) +void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; - pr_warn_once("%s XDP return value %u, expect packet loss!\n", + pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n", act > act_max ? "Illegal" : "Driver unsupported", - act); + act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A"); } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -- cgit v1.2.3 From 82ff0c022d19c2ad69a472692bb7ee01ac07a40b Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 8 Dec 2021 14:11:21 -0600 Subject: perf: Add a counter for number of user access events in context On arm64, user space counter access will be controlled differently compared to x86. On x86, access in the strictest mode is enabled for all tasks in an MM when any event is mmap'ed. For arm64, access is explicitly requested for an event and only enabled when the event's context is active. This avoids hooks into the arch context switch code and gives better control of when access is enabled. In order to configure user space access when the PMU is enabled, it is necessary to know if any event (currently active or not) in the current context has user space accessed enabled. Add a counter similar to other counters in the context to avoid walking the event list every time. Reviewed-by: Mark Rutland Reviewed-by: Thomas Gleixner Signed-off-by: Rob Herring Link: https://lore.kernel.org/r/20211208201124.310740-3-robh@kernel.org Signed-off-by: Will Deacon --- include/linux/perf_event.h | 1 + kernel/events/core.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ba9467972c09..411e34210fbf 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -831,6 +831,7 @@ struct perf_event_context { int nr_events; int nr_active; + int nr_user; int is_active; int nr_stat; int nr_freq; diff --git a/kernel/events/core.c b/kernel/events/core.c index 30d94f68c5bd..1362b9b770d8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1808,6 +1808,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; + if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) + ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; @@ -1999,6 +2001,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; ctx->nr_events--; + if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) + ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; -- cgit v1.2.3 From 5eb6f22823e023ee13391978c329c4bd18f82552 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 14 Dec 2021 11:25:01 -0600 Subject: exit/kthread: Fix the kerneldoc comment for kthread_complete_and_exit I misspelled kthread_complete_and_exit in the kernel doc comment fix it. Link: https://lkml.kernel.org/r/202112141329.KBkyJ5ql-lkp@intel.com Link: https://lkml.kernel.org/r/202112141422.Cykr6YUS-lkp@intel.com Reported-by: kernel test robot Fixes: cead18552660 ("exit: Rename complete_and_exit to kthread_complete_and_exit") Signed-off-by: "Eric W. Biederman" --- kernel/kthread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 9c6c532047c4..c14707d15341 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -294,7 +294,7 @@ void __noreturn kthread_exit(long result) } /** - * kthread_complete_and exit - Exit the current kthread. + * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * -- cgit v1.2.3 From dd03762ab608e058c8f390ad9cf667e490089796 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Sat, 11 Dec 2021 21:17:34 +0800 Subject: arm64: Enable KCSAN This patch enables KCSAN for arm64, with updates to build rules to not use KCSAN for several incompatible compilation units. Recent GCC version(at least GCC10) made outline-atomics as the default option(unlike Clang), which will cause linker errors for kernel/kcsan/core.o. Disables the out-of-line atomics by no-outline-atomics to fix the linker errors. Meanwhile, as Mark said[1], some latent issues are needed to be fixed which isn't just a KCSAN problem, we make the KCSAN depends on EXPERT for now. Tested selftest and kcsan_test(built with GCC11 and Clang 13), and all passed. [1] https://lkml.kernel.org/r/YadiUPpJ0gADbiHQ@FVFF77S0Q05N Acked-by: Marco Elver # kernel/kcsan Tested-by: Joey Gouly Signed-off-by: Kefeng Wang Link: https://lore.kernel.org/r/20211211131734.126874-1-wangkefeng.wang@huawei.com [catalin.marinas@arm.com: added comment to justify EXPERT] Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 2 ++ arch/arm64/kernel/vdso/Makefile | 1 + arch/arm64/kvm/hyp/nvhe/Makefile | 1 + kernel/kcsan/Makefile | 1 + 4 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c4207cf9bb17..1d98e935c4d6 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -150,6 +150,8 @@ config ARM64 select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_SW_TAGS if HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_HW_TAGS if (HAVE_ARCH_KASAN && ARM64_MTE) + # Some instrumentation may be unsound, hence EXPERT + select HAVE_ARCH_KCSAN if EXPERT select HAVE_ARCH_KFENCE select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 700767dfd221..60813497a381 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -32,6 +32,7 @@ ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) $(GCC_PLUGINS_CFLAGS) \ $(CC_FLAGS_LTO) KASAN_SANITIZE := n +KCSAN_SANITIZE := n UBSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y KCOV_INSTRUMENT := n diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index c3c11974fa3b..24b2c2425b38 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -89,6 +89,7 @@ KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI) # cause crashes. Just disable it. GCOV_PROFILE := n KASAN_SANITIZE := n +KCSAN_SANITIZE := n UBSAN_SANITIZE := n KCOV_INSTRUMENT := n diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index c2bb07f5bcc7..e893b0e1d62a 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -8,6 +8,7 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \ + $(call cc-option,-mno-outline-atomics) \ -fno-stack-protector -DDISABLE_BRANCH_PROFILING obj-y := core.o debugfs.o report.o -- cgit v1.2.3 From 1815775e74541d7b498c0baf15726ad3d1247abf Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 14 Dec 2021 00:46:07 +0000 Subject: cgroup: return early if it is already on preloaded list If a cset is already on preloaded list, this means we have already setup this cset properly for migration. This patch just relocates the root cgrp lookup which isn't used anyway when the cset is already on the preloaded list. [tj@kernel.org: rephrase the commit log] Signed-off-by: Wei Yang Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f522cee8e650..4f77bf1eaf9f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2650,11 +2650,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset, if (src_cset->dead) return; - src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - if (!list_empty(&src_cset->mg_preload_node)) return; + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); + WARN_ON(src_cset->mg_src_cgrp); WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); -- cgit v1.2.3 From 6c3118c32129b4197999a8928ba776bcabd0f5c4 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Fri, 10 Dec 2021 14:55:03 -0800 Subject: signal: Skip the altstack update when not needed == Background == Support for large, "dynamic" fpstates was recently merged. This included code to ensure that sigaltstacks are sufficiently sized for these large states. A new lock was added to remove races between enabling large features and setting up sigaltstacks. == Problem == The new lock (sigaltstack_lock()) is acquired in the sigreturn path before restoring the old sigaltstack. Unfortunately, contention on the new lock causes a measurable signal handling performance regression [1]. However, the common case is that no *changes* are made to the sigaltstack state at sigreturn. == Solution == do_sigaltstack() acquires sigaltstack_lock() and is used for both sys_sigaltstack() and restoring the sigaltstack in sys_sigreturn(). Check for changes to the sigaltstack before taking the lock. If no changes were made, return before acquiring the lock. This removes lock contention from the common-case sigreturn path. [1] https://lore.kernel.org/lkml/20211207012128.GA16074@xsang-OptiPlex-9020/ Fixes: 3aac3ebea08f ("x86/signal: Implement sigaltstack size validation") Reported-by: kernel test robot Signed-off-by: Chang S. Bae Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20211210225503.12734-1-chang.seok.bae@intel.com --- kernel/signal.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index a629b11bf3e0..dfcee3888b00 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -4185,6 +4185,15 @@ do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp, ss_mode != 0)) return -EINVAL; + /* + * Return before taking any locks if no actual + * sigaltstack changes were requested. + */ + if (t->sas_ss_sp == (unsigned long)ss_sp && + t->sas_ss_size == ss_size && + t->sas_ss_flags == ss_flags) + return 0; + sigaltstack_lock(); if (ss_mode == SS_DISABLE) { ss_size = 0; -- cgit v1.2.3 From bc6e60a4fc1daef2d95367fea8ee74fc5b62b7d6 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 14 Dec 2021 19:48:54 +0800 Subject: audit: use struct_size() helper in kmalloc() Make use of struct_size() helper instead of an open-coded calucation. Link: https://github.com/KSPP/linux/issues/160 Signed-off-by: Xiu Jianfeng Reviewed-by: Kees Cook Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/audit_tree.c | 2 +- kernel/auditfilter.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 121d37e700a6..0117e7d947fd 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1446,7 +1446,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) return err; } - sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); + sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL); if (!sig_data) { if (audit_sig_sid) security_release_secctx(ctx, len); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 72324afcffef..e7315d487163 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -94,7 +94,7 @@ static struct audit_tree *alloc_tree(const char *s) { struct audit_tree *tree; - tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL); + tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL); if (tree) { refcount_set(&tree->count, 1); tree->goner = 0; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d75acb014ccd..398b4c57e921 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -637,7 +637,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) void *bufp; int i; - data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); + data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL); if (unlikely(!data)) return NULL; memset(data, 0, sizeof(*data)); -- cgit v1.2.3 From 7d3baf0afa3aa9102d6a521a8e4c41888bb79882 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 7 Dec 2021 12:51:56 +0000 Subject: bpf: Fix kernel address leakage in atomic fetch The change in commit 37086bfdc737 ("bpf: Propagate stack bounds to registers in atomics w/ BPF_FETCH") around check_mem_access() handling is buggy since this would allow for unprivileged users to leak kernel pointers. For example, an atomic fetch/and with -1 on a stack destination which holds a spilled pointer will migrate the spilled register type into a scalar, which can then be exported out of the program (since scalar != pointer) by dumping it into a map value. The original implementation of XADD was preventing this situation by using a double call to check_mem_access() one with BPF_READ and a subsequent one with BPF_WRITE, in both cases passing -1 as a placeholder value instead of register as per XADD semantics since it didn't contain a value fetch. The BPF_READ also included a check in check_stack_read_fixed_off() which rejects the program if the stack slot is of __is_pointer_value() if dst_regno < 0. The latter is to distinguish whether we're dealing with a regular stack spill/ fill or some arithmetical operation which is disallowed on non-scalars, see also 6e7e63cbb023 ("bpf: Forbid XADD on spilled pointers for unprivileged users") for more context on check_mem_access() and its handling of placeholder value -1. One minimally intrusive option to fix the leak is for the BPF_FETCH case to initially check the BPF_READ case via check_mem_access() with -1 as register, followed by the actual load case with non-negative load_reg to propagate stack bounds to registers. Fixes: 37086bfdc737 ("bpf: Propagate stack bounds to registers in atomics w/ BPF_FETCH") Reported-by: Acked-by: Brendan Jackman Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f2f1ed34cfe9..53d39db3b0fa 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4584,13 +4584,19 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i load_reg = -1; } - /* check whether we can read the memory */ + /* Check whether we can read the memory, with second call for fetch + * case to simulate the register fill. + */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, load_reg, true); + BPF_SIZE(insn->code), BPF_READ, -1, true); + if (!err && load_reg >= 0) + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, load_reg, + true); if (err) return err; - /* check whether we can write into the same memory */ + /* Check whether we can write into the same memory. */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true); if (err) -- cgit v1.2.3 From a82fe085f344ef20b452cd5f481010ff96b5c4cd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 7 Dec 2021 11:02:02 +0000 Subject: bpf: Fix kernel address leakage in atomic cmpxchg's r0 aux reg The implementation of BPF_CMPXCHG on a high level has the following parameters: .-[old-val] .-[new-val] BPF_R0 = cmpxchg{32,64}(DST_REG + insn->off, BPF_R0, SRC_REG) `-[mem-loc] `-[old-val] Given a BPF insn can only have two registers (dst, src), the R0 is fixed and used as an auxilliary register for input (old value) as well as output (returning old value from memory location). While the verifier performs a number of safety checks, it misses to reject unprivileged programs where R0 contains a pointer as old value. Through brute-forcing it takes about ~16sec on my machine to leak a kernel pointer with BPF_CMPXCHG. The PoC is basically probing for kernel addresses by storing the guessed address into the map slot as a scalar, and using the map value pointer as R0 while SRC_REG has a canary value to detect a matching address. Fix it by checking R0 for pointers, and reject if that's the case for unprivileged programs. Fixes: 5ffa25502b5a ("bpf: Add instructions for atomic_[cmp]xchg") Reported-by: Ryota Shiga (Flatt Security) Acked-by: Brendan Jackman Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 53d39db3b0fa..2d48159b58bd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4547,9 +4547,16 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (insn->imm == BPF_CMPXCHG) { /* Check comparison of R0 with memory location */ - err = check_reg_arg(env, BPF_REG_0, SRC_OP); + const u32 aux_reg = BPF_REG_0; + + err = check_reg_arg(env, aux_reg, SRC_OP); if (err) return err; + + if (is_pointer_value(env, aux_reg)) { + verbose(env, "R%d leaks addr into mem\n", aux_reg); + return -EACCES; + } } if (is_pointer_value(env, insn->src_reg)) { -- cgit v1.2.3 From 8f110f530635af44fff1f4ee100ecef0bac62510 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 13 Dec 2021 15:45:20 -0500 Subject: audit: ensure userspace is penalized the same as the kernel when under pressure Due to the audit control mutex necessary for serializing audit userspace messages we haven't been able to block/penalize userspace processes that attempt to send audit records while the system is under audit pressure. The result is that privileged userspace applications have a priority boost with respect to audit as they are not bound by the same audit queue throttling as the other tasks on the system. This patch attempts to restore some balance to the system when under audit pressure by blocking these privileged userspace tasks after they have finished their audit processing, and dropped the audit control mutex, but before they return to userspace. Reported-by: Gaosheng Cui Tested-by: Gaosheng Cui Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 0117e7d947fd..3dd8bde2c00f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1542,6 +1542,20 @@ static void audit_receive(struct sk_buff *skb) nlh = nlmsg_next(nlh, &len); } audit_ctl_unlock(); + + /* can't block with the ctrl lock, so penalize the sender now */ + if (audit_backlog_limit && + (skb_queue_len(&audit_queue) > audit_backlog_limit)) { + DECLARE_WAITQUEUE(wait, current); + + /* wake kauditd to try and flush the queue */ + wake_up_interruptible(&kauditd_wait); + + add_wait_queue_exclusive(&audit_backlog_wait, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(audit_backlog_wait_time); + remove_wait_queue(&audit_backlog_wait, &wait); + } } /* Log information about who is connecting to the audit multicast socket */ @@ -1825,7 +1839,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() * using a PID anchored in the caller's namespace * 2. generator holding the audit_cmd_mutex - we don't want to block - * while holding the mutex */ + * while holding the mutex, although we do penalize the sender + * later in audit_receive() when it is safe to block + */ if (!(auditd_test_task(current) || audit_ctl_owner_current())) { long stime = audit_backlog_wait_time; -- cgit v1.2.3 From f4b3ee3c85551d2d343a3ba159304066523f730f Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 9 Dec 2021 11:46:07 -0500 Subject: audit: improve robustness of the audit queue handling If the audit daemon were ever to get stuck in a stopped state the kernel's kauditd_thread() could get blocked attempting to send audit records to the userspace audit daemon. With the kernel thread blocked it is possible that the audit queue could grow unbounded as certain audit record generating events must be exempt from the queue limits else the system enter a deadlock state. This patch resolves this problem by lowering the kernel thread's socket sending timeout from MAX_SCHEDULE_TIMEOUT to HZ/10 and tweaks the kauditd_send_queue() function to better manage the various audit queues when connection problems occur between the kernel and the audit daemon. With this patch, the backlog may temporarily grow beyond the defined limits when the audit daemon is stopped and the system is under heavy audit pressure, but kauditd_thread() will continue to make progress and drain the queues as it would for other connection problems. For example, with the audit daemon put into a stopped state and the system configured to audit every syscall it was still possible to shutdown the system without a kernel panic, deadlock, etc.; granted, the system was slow to shutdown but that is to be expected given the extreme pressure of recording every syscall. The timeout value of HZ/10 was chosen primarily through experimentation and this developer's "gut feeling". There is likely no one perfect value, but as this scenario is limited in scope (root privileges would be needed to send SIGSTOP to the audit daemon), it is likely not worth exposing this as a tunable at present. This can always be done at a later date if it proves necessary. Cc: stable@vger.kernel.org Fixes: 5b52330bbfe63 ("audit: fix auditd/kernel connection state tracking") Reported-by: Gaosheng Cui Tested-by: Gaosheng Cui Reviewed-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/audit.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 121d37e700a6..4cebadb5f30d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -718,7 +718,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, { int rc = 0; struct sk_buff *skb; - static unsigned int failed = 0; + unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ @@ -735,32 +735,30 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, continue; } +retry: /* grab an extra skb reference in case of error */ skb_get(skb); rc = netlink_unicast(sk, skb, portid, 0); if (rc < 0) { - /* fatal failure for our queue flush attempt? */ + /* send failed - try a few times unless fatal error */ if (++failed >= retry_limit || rc == -ECONNREFUSED || rc == -EPERM) { - /* yes - error processing for the queue */ sk = NULL; if (err_hook) (*err_hook)(skb); - if (!skb_hook) - goto out; - /* keep processing with the skb_hook */ + if (rc == -EAGAIN) + rc = 0; + /* continue to drain the queue */ continue; } else - /* no - requeue to preserve ordering */ - skb_queue_head(queue, skb); + goto retry; } else { - /* it worked - drop the extra reference and continue */ + /* skb sent - drop the extra reference and continue */ consume_skb(skb); failed = 0; } } -out: return (rc >= 0 ? 0 : rc); } @@ -1609,7 +1607,8 @@ static int __net_init audit_net_init(struct net *net) audit_panic("cannot initialize netlink socket in namespace"); return -ENOMEM; } - aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + /* limit the timeout in case auditd is blocked/stopped */ + aunet->sk->sk_sndtimeo = HZ / 10; return 0; } -- cgit v1.2.3 From 3cf2b61eb06765e27fec6799292d9fb46d0b7e60 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Dec 2021 22:02:19 +0000 Subject: bpf: Fix signed bounds propagation after mov32 For the case where both s32_{min,max}_value bounds are positive, the __reg_assign_32_into_64() directly propagates them to their 64 bit counterparts, otherwise it pessimises them into [0,u32_max] universe and tries to refine them later on by learning through the tnum as per comment in mentioned function. However, that does not always happen, for example, in mov32 operation we call zext_32_to_64(dst_reg) which invokes the __reg_assign_32_into_64() as is without subsequent bounds update as elsewhere thus no refinement based on tnum takes place. Thus, not calling into the __update_reg_bounds() / __reg_deduce_bounds() / __reg_bound_offset() triplet as we do, for example, in case of ALU ops via adjust_scalar_min_max_vals(), will lead to more pessimistic bounds when dumping the full register state: Before fix: 0: (b4) w0 = -1 1: R0_w=invP4294967295 (id=0,imm=ffffffff, smin_value=4294967295,smax_value=4294967295, umin_value=4294967295,umax_value=4294967295, var_off=(0xffffffff; 0x0), s32_min_value=-1,s32_max_value=-1, u32_min_value=-1,u32_max_value=-1) 1: (bc) w0 = w0 2: R0_w=invP4294967295 (id=0,imm=ffffffff, smin_value=0,smax_value=4294967295, umin_value=4294967295,umax_value=4294967295, var_off=(0xffffffff; 0x0), s32_min_value=-1,s32_max_value=-1, u32_min_value=-1,u32_max_value=-1) Technically, the smin_value=0 and smax_value=4294967295 bounds are not incorrect, but given the register is still a constant, they break assumptions about const scalars that smin_value == smax_value and umin_value == umax_value. After fix: 0: (b4) w0 = -1 1: R0_w=invP4294967295 (id=0,imm=ffffffff, smin_value=4294967295,smax_value=4294967295, umin_value=4294967295,umax_value=4294967295, var_off=(0xffffffff; 0x0), s32_min_value=-1,s32_max_value=-1, u32_min_value=-1,u32_max_value=-1) 1: (bc) w0 = w0 2: R0_w=invP4294967295 (id=0,imm=ffffffff, smin_value=4294967295,smax_value=4294967295, umin_value=4294967295,umax_value=4294967295, var_off=(0xffffffff; 0x0), s32_min_value=-1,s32_max_value=-1, u32_min_value=-1,u32_max_value=-1) Without the smin_value == smax_value and umin_value == umax_value invariant being intact for const scalars, it is possible to leak out kernel pointers from unprivileged user space if the latter is enabled. For example, when such registers are involved in pointer arithmtics, then adjust_ptr_min_max_vals() will taint the destination register into an unknown scalar, and the latter can be exported and stored e.g. into a BPF map value. Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Reported-by: Kuee K1r0a Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2d48159b58bd..0872b6c9fb33 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8317,6 +8317,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->dst_reg); } zext_32_to_64(dst_reg); + + __update_reg_bounds(dst_reg); + __reg_deduce_bounds(dst_reg); + __reg_bound_offset(dst_reg); } } else { /* case: R = imm -- cgit v1.2.3 From e572ff80f05c33cd0cb4860f864f5c9c044280b6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Dec 2021 22:28:48 +0000 Subject: bpf: Make 32->64 bounds propagation slightly more robust Make the bounds propagation in __reg_assign_32_into_64() slightly more robust and readable by aligning it similarly as we did back in the __reg_combine_64_into_32() counterpart. Meaning, only propagate or pessimize them as a smin/smax pair. Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0872b6c9fb33..b532f1058d35 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1366,22 +1366,28 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } +static bool __reg32_bound_s64(s32 a) +{ + return a >= 0 && a <= S32_MAX; +} + static void __reg_assign_32_into_64(struct bpf_reg_state *reg) { reg->umin_value = reg->u32_min_value; reg->umax_value = reg->u32_max_value; - /* Attempt to pull 32-bit signed bounds into 64-bit bounds - * but must be positive otherwise set to worse case bounds - * and refine later from tnum. + + /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must + * be positive otherwise set to worse case bounds and refine later + * from tnum. */ - if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0) - reg->smax_value = reg->s32_max_value; - else - reg->smax_value = U32_MAX; - if (reg->s32_min_value >= 0) + if (__reg32_bound_s64(reg->s32_min_value) && + __reg32_bound_s64(reg->s32_max_value)) { reg->smin_value = reg->s32_min_value; - else + reg->smax_value = reg->s32_max_value; + } else { reg->smin_value = 0; + reg->smax_value = U32_MAX; + } } static void __reg_combine_32_into_64(struct bpf_reg_state *reg) -- cgit v1.2.3 From 6ef7f771de0182141ef1a0863f27b12963e1d184 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:18:49 +0100 Subject: genirq/msi: Use PCI device property to determine whether this is MSI or MSIX instead of consulting MSI descriptors. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221813.434156196@linutronix.de --- kernel/irq/msi.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 4a7a7f0f5102..b3f73ef0376c 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -77,21 +77,8 @@ EXPORT_SYMBOL_GPL(get_cached_msi_msg); static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct msi_desc *entry; - bool is_msix = false; - unsigned long irq; - int retval; - - retval = kstrtoul(attr->attr.name, 10, &irq); - if (retval) - return retval; - - entry = irq_get_msi_desc(irq); - if (!entry) - return -ENODEV; - - if (dev_is_pci(dev)) - is_msix = entry->pci.msi_attrib.is_msix; + /* MSI vs. MSIX is per device not per interrupt */ + bool is_msix = dev_is_pci(dev) ? to_pci_dev(dev)->msix_enabled : false; return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi"); } -- cgit v1.2.3 From 013bd8e543c2c777b586cf033c588ea82bd502db Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:18:55 +0100 Subject: device: Add device:: Msi_data pointer and struct msi_device_data Create struct msi_device_data and add a pointer of that type to struct dev_msi_info, which is part of struct device. Provide an allocator function which can be invoked from the MSI interrupt allocation code pathes. Add a properties field to the data structure as a first member so the allocation size is not zero bytes. The field will be uses later on. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221813.676660809@linutronix.de --- include/linux/device.h | 5 +++++ include/linux/msi.h | 18 ++++++++++++++++++ kernel/irq/msi.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) (limited to 'kernel') diff --git a/include/linux/device.h b/include/linux/device.h index f212b7a7b156..f0033cd93631 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -45,6 +45,7 @@ struct iommu_ops; struct iommu_group; struct dev_pin_info; struct dev_iommu; +struct msi_device_data; /** * struct subsys_interface - interfaces to device functions @@ -374,11 +375,15 @@ struct dev_links_info { /** * struct dev_msi_info - Device data related to MSI * @domain: The MSI interrupt domain associated to the device + * @data: Pointer to MSI device data */ struct dev_msi_info { #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN struct irq_domain *domain; #endif +#ifdef CONFIG_GENERIC_MSI_IRQ + struct msi_device_data *data; +#endif }; /** diff --git a/include/linux/msi.h b/include/linux/msi.h index ba4a39c430b5..7e4c8fd7c65d 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -171,6 +171,16 @@ struct msi_desc { }; }; +/** + * msi_device_data - MSI per device data + * @properties: MSI properties which are interesting to drivers + */ +struct msi_device_data { + unsigned long properties; +}; + +int msi_setup_device_data(struct device *dev); + /* Helpers to hide struct msi_desc implementation details */ #define msi_desc_to_dev(desc) ((desc)->dev) #define dev_to_msi_list(dev) (&(dev)->msi_list) @@ -233,10 +243,16 @@ void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); #ifdef CONFIG_SYSFS +int msi_device_populate_sysfs(struct device *dev); +void msi_device_destroy_sysfs(struct device *dev); + const struct attribute_group **msi_populate_sysfs(struct device *dev); void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups); #else +static inline int msi_device_populate_sysfs(struct device *dev) { return 0; } +static inline void msi_device_destroy_sysfs(struct device *dev) { } + static inline const struct attribute_group **msi_populate_sysfs(struct device *dev) { return NULL; @@ -384,6 +400,8 @@ enum { MSI_FLAG_MUST_REACTIVATE = (1 << 5), /* Is level-triggered capable, using two messages */ MSI_FLAG_LEVEL_CAPABLE = (1 << 6), + /* Populate sysfs on alloc() and destroy it on free() */ + MSI_FLAG_DEV_SYSFS = (1 << 7), }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index b3f73ef0376c..6bca6ad9bf69 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -73,6 +73,38 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) } EXPORT_SYMBOL_GPL(get_cached_msi_msg); +static void msi_device_data_release(struct device *dev, void *res) +{ + WARN_ON_ONCE(!list_empty(&dev->msi_list)); + dev->msi.data = NULL; +} + +/** + * msi_setup_device_data - Setup MSI device data + * @dev: Device for which MSI device data should be set up + * + * Return: 0 on success, appropriate error code otherwise + * + * This can be called more than once for @dev. If the MSI device data is + * already allocated the call succeeds. The allocated memory is + * automatically released when the device is destroyed. + */ +int msi_setup_device_data(struct device *dev) +{ + struct msi_device_data *md; + + if (dev->msi.data) + return 0; + + md = devres_alloc(msi_device_data_release, sizeof(*md), GFP_KERNEL); + if (!md) + return -ENOMEM; + + dev->msi.data = md; + devres_add(dev, md); + return 0; +} + #ifdef CONFIG_SYSFS static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, char *buf) -- cgit v1.2.3 From bf6e054e0e3fbc9614355b760e18c8a14f952a4e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:03 +0100 Subject: genirq/msi: Provide msi_device_populate/destroy_sysfs() Add new allocation functions which can be activated by domain info flags. They store the groups pointer in struct msi_device_data. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221813.988659194@linutronix.de --- include/linux/msi.h | 4 ++++ kernel/irq/msi.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 44 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index 7e4c8fd7c65d..1b96dc483b88 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -56,6 +56,8 @@ struct irq_data; struct msi_desc; struct pci_dev; struct platform_msi_priv_data; +struct attribute_group; + void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg); #ifdef CONFIG_GENERIC_MSI_IRQ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg); @@ -174,9 +176,11 @@ struct msi_desc { /** * msi_device_data - MSI per device data * @properties: MSI properties which are interesting to drivers + * @attrs: Pointer to the sysfs attribute group */ struct msi_device_data { unsigned long properties; + const struct attribute_group **attrs; }; int msi_setup_device_data(struct device *dev); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6bca6ad9bf69..dd65e678a46c 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -199,6 +199,20 @@ error_attrs: return ERR_PTR(ret); } +/** + * msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device + * @dev: The device (PCI, platform etc) which will get sysfs entries + */ +int msi_device_populate_sysfs(struct device *dev) +{ + const struct attribute_group **group = msi_populate_sysfs(dev); + + if (IS_ERR(group)) + return PTR_ERR(group); + dev->msi.data->attrs = group; + return 0; +} + /** * msi_destroy_sysfs - Destroy msi_irqs sysfs entries for devices * @dev: The device(PCI, platform etc) who will remove sysfs entries @@ -225,6 +239,17 @@ void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_ir kfree(msi_irq_groups); } } + +/** + * msi_device_destroy_sysfs - Destroy msi_irqs sysfs entries for a device + * @dev: The device (PCI, platform etc) for which to remove + * sysfs entries + */ +void msi_device_destroy_sysfs(struct device *dev) +{ + msi_destroy_sysfs(dev, dev->msi.data->attrs); + dev->msi.data->attrs = NULL; +} #endif #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN @@ -672,8 +697,19 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; + int ret; + + ret = ops->domain_alloc_irqs(domain, dev, nvec); + if (ret) + return ret; + + if (!(info->flags & MSI_FLAG_DEV_SYSFS)) + return 0; - return ops->domain_alloc_irqs(domain, dev, nvec); + ret = msi_device_populate_sysfs(dev); + if (ret) + msi_domain_free_irqs(domain, dev); + return ret; } void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) @@ -712,7 +748,9 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; - return ops->domain_free_irqs(domain, dev); + if (info->flags & MSI_FLAG_DEV_SYSFS) + msi_device_destroy_sysfs(dev); + ops->domain_free_irqs(domain, dev); } /** -- cgit v1.2.3 From 24cff375fdb663c2238f06693a067b9219596fdc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:08 +0100 Subject: genirq/msi: Remove the original sysfs interfaces No more users. Refactor the core code accordingly and move the global interface under CONFIG_PCI_MSI_ARCH_FALLBACKS. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221814.168362229@linutronix.de --- include/linux/msi.h | 18 +++--------------- kernel/irq/msi.c | 53 ++++++++++++++++++++--------------------------------- 2 files changed, 23 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index 1b96dc483b88..634a12962e72 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -249,22 +249,10 @@ void pci_msi_unmask_irq(struct irq_data *data); #ifdef CONFIG_SYSFS int msi_device_populate_sysfs(struct device *dev); void msi_device_destroy_sysfs(struct device *dev); - -const struct attribute_group **msi_populate_sysfs(struct device *dev); -void msi_destroy_sysfs(struct device *dev, - const struct attribute_group **msi_irq_groups); -#else +#else /* CONFIG_SYSFS */ static inline int msi_device_populate_sysfs(struct device *dev) { return 0; } static inline void msi_device_destroy_sysfs(struct device *dev) { } - -static inline const struct attribute_group **msi_populate_sysfs(struct device *dev) -{ - return NULL; -} -static inline void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups) -{ -} -#endif +#endif /* !CONFIG_SYSFS */ /* * The arch hooks to setup up msi irqs. Default functions are implemented @@ -279,7 +267,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc); void arch_teardown_msi_irq(unsigned int irq); int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void arch_teardown_msi_irqs(struct pci_dev *dev); -#endif +#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */ /* * The restore hook is still available even for fully irq domain based diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index dd65e678a46c..8e433f1a24fb 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -118,12 +118,8 @@ static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, /** * msi_populate_sysfs - Populate msi_irqs sysfs entries for devices * @dev: The device(PCI, platform etc) who will get sysfs entries - * - * Return attribute_group ** so that specific bus MSI can save it to - * somewhere during initilizing msi irqs. If devices has no MSI irq, - * return NULL; if it fails to populate sysfs, return ERR_PTR */ -const struct attribute_group **msi_populate_sysfs(struct device *dev) +static const struct attribute_group **msi_populate_sysfs(struct device *dev) { const struct attribute_group **msi_irq_groups; struct attribute **msi_attrs, *msi_attr; @@ -213,33 +209,6 @@ int msi_device_populate_sysfs(struct device *dev) return 0; } -/** - * msi_destroy_sysfs - Destroy msi_irqs sysfs entries for devices - * @dev: The device(PCI, platform etc) who will remove sysfs entries - * @msi_irq_groups: attribute_group for device msi_irqs entries - */ -void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups) -{ - struct device_attribute *dev_attr; - struct attribute **msi_attrs; - int count = 0; - - if (msi_irq_groups) { - sysfs_remove_groups(&dev->kobj, msi_irq_groups); - msi_attrs = msi_irq_groups[0]->attrs; - while (msi_attrs[count]) { - dev_attr = container_of(msi_attrs[count], - struct device_attribute, attr); - kfree(dev_attr->attr.name); - kfree(dev_attr); - ++count; - } - kfree(msi_attrs); - kfree(msi_irq_groups[0]); - kfree(msi_irq_groups); - } -} - /** * msi_device_destroy_sysfs - Destroy msi_irqs sysfs entries for a device * @dev: The device (PCI, platform etc) for which to remove @@ -247,8 +216,26 @@ void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_ir */ void msi_device_destroy_sysfs(struct device *dev) { - msi_destroy_sysfs(dev, dev->msi.data->attrs); + const struct attribute_group **msi_irq_groups = dev->msi.data->attrs; + struct device_attribute *dev_attr; + struct attribute **msi_attrs; + int count = 0; + dev->msi.data->attrs = NULL; + if (!msi_irq_groups) + return; + + sysfs_remove_groups(&dev->kobj, msi_irq_groups); + msi_attrs = msi_irq_groups[0]->attrs; + while (msi_attrs[count]) { + dev_attr = container_of(msi_attrs[count], struct device_attribute, attr); + kfree(dev_attr->attr.name); + kfree(dev_attr); + ++count; + } + kfree(msi_attrs); + kfree(msi_irq_groups[0]); + kfree(msi_irq_groups); } #endif -- cgit v1.2.3 From cf15f43acaad31dabb2646cef170a506a1d663eb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Dec 2021 23:19:23 +0100 Subject: genirq/msi: Provide interface to retrieve Linux interrupt number This allows drivers to retrieve the Linux interrupt number instead of fiddling with MSI descriptors. msi_get_virq() returns the Linux interrupt number or 0 in case that there is no entry for the given MSI index. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211210221814.780824745@linutronix.de --- include/linux/msi.h | 2 ++ kernel/irq/msi.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index d206239e6fa8..7593fc383dba 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -153,6 +153,8 @@ struct msi_device_data { int msi_setup_device_data(struct device *dev); +unsigned int msi_get_virq(struct device *dev, unsigned int index); + /* Helpers to hide struct msi_desc implementation details */ #define msi_desc_to_dev(desc) ((desc)->dev) #define dev_to_msi_list(dev) (&(dev)->msi_list) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 8e433f1a24fb..ab5e83f41188 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -105,6 +105,42 @@ int msi_setup_device_data(struct device *dev) return 0; } +/** + * msi_get_virq - Return Linux interrupt number of a MSI interrupt + * @dev: Device to operate on + * @index: MSI interrupt index to look for (0-based) + * + * Return: The Linux interrupt number on success (> 0), 0 if not found + */ +unsigned int msi_get_virq(struct device *dev, unsigned int index) +{ + struct msi_desc *desc; + bool pcimsi; + + if (!dev->msi.data) + return 0; + + pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false; + + for_each_msi_entry(desc, dev) { + /* PCI-MSI has only one descriptor for multiple interrupts. */ + if (pcimsi) { + if (desc->irq && index < desc->nvec_used) + return desc->irq + index; + break; + } + + /* + * PCI-MSIX and platform MSI use a descriptor per + * interrupt. + */ + if (desc->msi_index == index) + return desc->irq; + } + return 0; +} +EXPORT_SYMBOL_GPL(msi_get_virq); + #ifdef CONFIG_SYSFS static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, char *buf) -- cgit v1.2.3 From 125282cd4f33ecd53a24ae4807409da0e5e90fd4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:04 +0100 Subject: genirq/msi: Move descriptor list to struct msi_device_data It's only required when MSI is in use. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210747.650487479@linutronix.de --- drivers/base/core.c | 3 --- include/linux/device.h | 4 ---- include/linux/msi.h | 4 +++- kernel/irq/msi.c | 5 ++++- 4 files changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/drivers/base/core.c b/drivers/base/core.c index f26c668092d6..f8987867789f 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2874,9 +2874,6 @@ void device_initialize(struct device *dev) INIT_LIST_HEAD(&dev->devres_head); device_pm_init(dev); set_dev_node(dev, NUMA_NO_NODE); -#ifdef CONFIG_GENERIC_MSI_IRQ - INIT_LIST_HEAD(&dev->msi_list); -#endif INIT_LIST_HEAD(&dev->links.consumers); INIT_LIST_HEAD(&dev->links.suppliers); INIT_LIST_HEAD(&dev->links.defer_sync); diff --git a/include/linux/device.h b/include/linux/device.h index f0033cd93631..93459724dcde 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -423,7 +423,6 @@ struct dev_msi_info { * @pins: For device pin management. * See Documentation/driver-api/pin-control.rst for details. * @msi: MSI related data - * @msi_list: Hosts MSI descriptors * @numa_node: NUMA node this device is close to. * @dma_ops: DMA mapping operations for this device. * @dma_mask: Dma mask (if dma'ble device). @@ -519,9 +518,6 @@ struct device { struct dev_pin_info *pins; #endif struct dev_msi_info msi; -#ifdef CONFIG_GENERIC_MSI_IRQ - struct list_head msi_list; -#endif #ifdef CONFIG_DMA_OPS const struct dma_map_ops *dma_ops; #endif diff --git a/include/linux/msi.h b/include/linux/msi.h index 7593fc383dba..4223e47103ed 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -144,11 +144,13 @@ struct msi_desc { * @properties: MSI properties which are interesting to drivers * @attrs: Pointer to the sysfs attribute group * @platform_data: Platform-MSI specific data + * @list: List of MSI descriptors associated to the device */ struct msi_device_data { unsigned long properties; const struct attribute_group **attrs; struct platform_msi_priv_data *platform_data; + struct list_head list; }; int msi_setup_device_data(struct device *dev); @@ -157,7 +159,7 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index); /* Helpers to hide struct msi_desc implementation details */ #define msi_desc_to_dev(desc) ((desc)->dev) -#define dev_to_msi_list(dev) (&(dev)->msi_list) +#define dev_to_msi_list(dev) (&(dev)->msi.data->list) #define first_msi_entry(dev) \ list_first_entry(dev_to_msi_list((dev)), struct msi_desc, list) #define for_each_msi_entry(desc, dev) \ diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index ab5e83f41188..c66787daee57 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -75,7 +75,9 @@ EXPORT_SYMBOL_GPL(get_cached_msi_msg); static void msi_device_data_release(struct device *dev, void *res) { - WARN_ON_ONCE(!list_empty(&dev->msi_list)); + struct msi_device_data *md = res; + + WARN_ON_ONCE(!list_empty(&md->list)); dev->msi.data = NULL; } @@ -100,6 +102,7 @@ int msi_setup_device_data(struct device *dev) if (!md) return -ENOMEM; + INIT_LIST_HEAD(&md->list); dev->msi.data = md; devres_add(dev, md); return 0; -- cgit v1.2.3 From b5f687f97d1e112493fe0447a1fb09fbd93c334b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:05 +0100 Subject: genirq/msi: Add mutex for MSI list protection For upcoming runtime extensions of MSI-X interrupts it's required to protect the MSI descriptor list. Add a mutex to struct msi_device_data and provide lock/unlock functions. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210747.708877269@linutronix.de --- include/linux/msi.h | 5 +++++ kernel/irq/msi.c | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index 4223e47103ed..2cf6c530588d 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -3,6 +3,7 @@ #define LINUX_MSI_H #include +#include #include #include @@ -145,17 +146,21 @@ struct msi_desc { * @attrs: Pointer to the sysfs attribute group * @platform_data: Platform-MSI specific data * @list: List of MSI descriptors associated to the device + * @mutex: Mutex protecting the MSI list */ struct msi_device_data { unsigned long properties; const struct attribute_group **attrs; struct platform_msi_priv_data *platform_data; struct list_head list; + struct mutex mutex; }; int msi_setup_device_data(struct device *dev); unsigned int msi_get_virq(struct device *dev, unsigned int index); +void msi_lock_descs(struct device *dev); +void msi_unlock_descs(struct device *dev); /* Helpers to hide struct msi_desc implementation details */ #define msi_desc_to_dev(desc) ((desc)->dev) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index c66787daee57..97ec245803f0 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -103,11 +103,32 @@ int msi_setup_device_data(struct device *dev) return -ENOMEM; INIT_LIST_HEAD(&md->list); + mutex_init(&md->mutex); dev->msi.data = md; devres_add(dev, md); return 0; } +/** + * msi_lock_descs - Lock the MSI descriptor storage of a device + * @dev: Device to operate on + */ +void msi_lock_descs(struct device *dev) +{ + mutex_lock(&dev->msi.data->mutex); +} +EXPORT_SYMBOL_GPL(msi_lock_descs); + +/** + * msi_unlock_descs - Unlock the MSI descriptor storage of a device + * @dev: Device to operate on + */ +void msi_unlock_descs(struct device *dev) +{ + mutex_unlock(&dev->msi.data->mutex); +} +EXPORT_SYMBOL_GPL(msi_unlock_descs); + /** * msi_get_virq - Return Linux interrupt number of a MSI interrupt * @dev: Device to operate on -- cgit v1.2.3 From 0f62d941acf9ac3b6025692ce649b1f282b89e7f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:07 +0100 Subject: genirq/msi: Provide msi_domain_alloc/free_irqs_descs_locked() Usage sites which do allocations of the MSI descriptors before invoking msi_domain_alloc_irqs() require to lock the MSI decriptors accross the operation. Provide entry points which can be called with the MSI mutex held and lock the mutex in the existing entry points. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210747.765371053@linutronix.de --- include/linux/msi.h | 3 +++ kernel/irq/msi.c | 74 +++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 61 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index 2cf6c530588d..69c588efe85b 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -383,9 +383,12 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct irq_domain *parent); int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec); +int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev, + int nvec); int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec); void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); +void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev); void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev); struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 97ec245803f0..3b21e99bb793 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -672,10 +672,8 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, dev_to_node(dev), &arg, false, desc->affinity); - if (virq < 0) { - ret = msi_handle_pci_fail(domain, desc, allocated); - goto cleanup; - } + if (virq < 0) + return msi_handle_pci_fail(domain, desc, allocated); for (i = 0; i < desc->nvec_used; i++) { irq_set_msi_desc_off(virq, i, desc); @@ -709,7 +707,7 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, } ret = irq_domain_activate_irq(irq_data, can_reserve); if (ret) - goto cleanup; + return ret; } skip_activate: @@ -724,38 +722,63 @@ skip_activate: } } return 0; - -cleanup: - msi_domain_free_irqs(domain, dev); - return ret; } /** - * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain + * msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain * @domain: The domain to allocate from * @dev: Pointer to device struct of the device for which the interrupts * are allocated * @nvec: The number of interrupts to allocate * + * Must be invoked from within a msi_lock_descs() / msi_unlock_descs() + * pair. Use this for MSI irqdomains which implement their own vector + * allocation/free. + * * Return: %0 on success or an error code. */ -int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, - int nvec) +int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev, + int nvec) { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; int ret; + lockdep_assert_held(&dev->msi.data->mutex); + ret = ops->domain_alloc_irqs(domain, dev, nvec); if (ret) - return ret; + goto cleanup; if (!(info->flags & MSI_FLAG_DEV_SYSFS)) return 0; ret = msi_device_populate_sysfs(dev); if (ret) - msi_domain_free_irqs(domain, dev); + goto cleanup; + return 0; + +cleanup: + msi_domain_free_irqs_descs_locked(domain, dev); + return ret; +} + +/** + * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain + * @domain: The domain to allocate from + * @dev: Pointer to device struct of the device for which the interrupts + * are allocated + * @nvec: The number of interrupts to allocate + * + * Return: %0 on success or an error code. + */ +int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec) +{ + int ret; + + msi_lock_descs(dev); + ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec); + msi_unlock_descs(dev); return ret; } @@ -785,21 +808,40 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) } /** - * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev + * msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev * @domain: The domain to managing the interrupts * @dev: Pointer to device struct of the device for which the interrupts * are free + * + * Must be invoked from within a msi_lock_descs() / msi_unlock_descs() + * pair. Use this for MSI irqdomains which implement their own vector + * allocation. */ -void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev) { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; + lockdep_assert_held(&dev->msi.data->mutex); + if (info->flags & MSI_FLAG_DEV_SYSFS) msi_device_destroy_sysfs(dev); ops->domain_free_irqs(domain, dev); } +/** + * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev + * @domain: The domain to managing the interrupts + * @dev: Pointer to device struct of the device for which the interrupts + * are free + */ +void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +{ + msi_lock_descs(dev); + msi_domain_free_irqs_descs_locked(domain, dev); + msi_unlock_descs(dev); +} + /** * msi_get_domain_info - Get the MSI interrupt domain info for @domain * @domain: The interrupt domain to retrieve data from -- cgit v1.2.3 From 1046f71d7268b1680d7b044dea83c664403f6302 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:08 +0100 Subject: genirq/msi: Provide a set of advanced MSI accessors and iterators In preparation for dynamic handling of MSI-X interrupts provide a new set of MSI descriptor accessor functions and iterators. They are benefitial per se as they allow to cleanup quite some code in various MSI domain implementations. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210747.818635078@linutronix.de --- include/linux/msi.h | 33 ++++++++++++++++++ kernel/irq/msi.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index 69c588efe85b..703221f7e9ea 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -140,6 +140,18 @@ struct msi_desc { struct pci_msi_desc pci; }; +/* + * Filter values for the MSI descriptor iterators and accessor functions. + */ +enum msi_desc_filter { + /* All descriptors */ + MSI_DESC_ALL, + /* Descriptors which have no interrupt associated */ + MSI_DESC_NOTASSOCIATED, + /* Descriptors which have an interrupt associated */ + MSI_DESC_ASSOCIATED, +}; + /** * msi_device_data - MSI per device data * @properties: MSI properties which are interesting to drivers @@ -147,6 +159,7 @@ struct msi_desc { * @platform_data: Platform-MSI specific data * @list: List of MSI descriptors associated to the device * @mutex: Mutex protecting the MSI list + * @__next: Cached pointer to the next entry for iterators */ struct msi_device_data { unsigned long properties; @@ -154,6 +167,7 @@ struct msi_device_data { struct platform_msi_priv_data *platform_data; struct list_head list; struct mutex mutex; + struct msi_desc *__next; }; int msi_setup_device_data(struct device *dev); @@ -162,6 +176,25 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index); void msi_lock_descs(struct device *dev); void msi_unlock_descs(struct device *dev); +struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter); +struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter); + +/** + * msi_for_each_desc - Iterate the MSI descriptors + * + * @desc: struct msi_desc pointer used as iterator + * @dev: struct device pointer - device to iterate + * @filter: Filter for descriptor selection + * + * Notes: + * - The loop must be protected with a msi_lock_descs()/msi_unlock_descs() + * pair. + * - It is safe to remove a retrieved MSI descriptor in the loop. + */ +#define msi_for_each_desc(desc, dev, filter) \ + for ((desc) = msi_first_desc((dev), (filter)); (desc); \ + (desc) = msi_next_desc((dev), (filter))) + /* Helpers to hide struct msi_desc implementation details */ #define msi_desc_to_dev(desc) ((desc)->dev) #define dev_to_msi_list(dev) (&(dev)->msi.data->list) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 3b21e99bb793..bc67b2cafc9d 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -125,10 +125,106 @@ EXPORT_SYMBOL_GPL(msi_lock_descs); */ void msi_unlock_descs(struct device *dev) { + /* Clear the next pointer which was cached by the iterator */ + dev->msi.data->__next = NULL; mutex_unlock(&dev->msi.data->mutex); } EXPORT_SYMBOL_GPL(msi_unlock_descs); +static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter) +{ + switch (filter) { + case MSI_DESC_ALL: + return true; + case MSI_DESC_NOTASSOCIATED: + return !desc->irq; + case MSI_DESC_ASSOCIATED: + return !!desc->irq; + } + WARN_ON_ONCE(1); + return false; +} + +static struct msi_desc *msi_find_first_desc(struct device *dev, enum msi_desc_filter filter) +{ + struct msi_desc *desc; + + list_for_each_entry(desc, dev_to_msi_list(dev), list) { + if (msi_desc_match(desc, filter)) + return desc; + } + return NULL; +} + +/** + * msi_first_desc - Get the first MSI descriptor of a device + * @dev: Device to operate on + * @filter: Descriptor state filter + * + * Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs() + * must be invoked before the call. + * + * Return: Pointer to the first MSI descriptor matching the search + * criteria, NULL if none found. + */ +struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter) +{ + struct msi_desc *desc; + + if (WARN_ON_ONCE(!dev->msi.data)) + return NULL; + + lockdep_assert_held(&dev->msi.data->mutex); + + desc = msi_find_first_desc(dev, filter); + dev->msi.data->__next = desc ? list_next_entry(desc, list) : NULL; + return desc; +} +EXPORT_SYMBOL_GPL(msi_first_desc); + +static struct msi_desc *__msi_next_desc(struct device *dev, enum msi_desc_filter filter, + struct msi_desc *from) +{ + struct msi_desc *desc = from; + + list_for_each_entry_from(desc, dev_to_msi_list(dev), list) { + if (msi_desc_match(desc, filter)) + return desc; + } + return NULL; +} + +/** + * msi_next_desc - Get the next MSI descriptor of a device + * @dev: Device to operate on + * + * The first invocation of msi_next_desc() has to be preceeded by a + * successful incovation of __msi_first_desc(). Consecutive invocations are + * only valid if the previous one was successful. All these operations have + * to be done within the same MSI mutex held region. + * + * Return: Pointer to the next MSI descriptor matching the search + * criteria, NULL if none found. + */ +struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter) +{ + struct msi_device_data *data = dev->msi.data; + struct msi_desc *desc; + + if (WARN_ON_ONCE(!data)) + return NULL; + + lockdep_assert_held(&data->mutex); + + if (!data->__next) + return NULL; + + desc = __msi_next_desc(dev, filter, data->__next); + dev->msi.data->__next = desc ? list_next_entry(desc, list) : NULL; + return desc; +} +EXPORT_SYMBOL_GPL(msi_next_desc); + /** * msi_get_virq - Return Linux interrupt number of a MSI interrupt * @dev: Device to operate on -- cgit v1.2.3 From 602905253607ba892336f7bba8bb45b5be819d87 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:10 +0100 Subject: genirq/msi: Provide msi_alloc_msi_desc() and a simple allocator Provide msi_alloc_msi_desc() which takes a template MSI descriptor for initializing a newly allocated descriptor. This allows to simplify various usage sites of alloc_msi_entry() and moves the storage handling into the core code. For simple cases where only a linear vector space is required provide msi_add_simple_msi_descs() which just allocates a linear range of MSI descriptors and fills msi_desc::msi_index accordingly. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210747.873833567@linutronix.de --- include/linux/msi.h | 2 ++ kernel/irq/msi.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index 703221f7e9ea..bbb8c1e2c18b 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -247,6 +247,8 @@ static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg) } #endif /* CONFIG_PCI_MSI */ +int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc); + struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, const struct irq_affinity_desc *affinity); void free_msi_entry(struct msi_desc *entry); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index bc67b2cafc9d..6ffe75eeba59 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -60,6 +60,65 @@ void free_msi_entry(struct msi_desc *entry) kfree(entry); } +/** + * msi_add_msi_desc - Allocate and initialize a MSI descriptor + * @dev: Pointer to the device for which the descriptor is allocated + * @init_desc: Pointer to an MSI descriptor to initialize the new descriptor + * + * Return: 0 on success or an appropriate failure code. + */ +int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc) +{ + struct msi_desc *desc; + + lockdep_assert_held(&dev->msi.data->mutex); + + desc = alloc_msi_entry(dev, init_desc->nvec_used, init_desc->affinity); + if (!desc) + return -ENOMEM; + + /* Copy the MSI index and type specific data to the new descriptor. */ + desc->msi_index = init_desc->msi_index; + desc->pci = init_desc->pci; + + list_add_tail(&desc->list, &dev->msi.data->list); + return 0; +} + +/** + * msi_add_simple_msi_descs - Allocate and initialize MSI descriptors + * @dev: Pointer to the device for which the descriptors are allocated + * @index: Index for the first MSI descriptor + * @ndesc: Number of descriptors to allocate + * + * Return: 0 on success or an appropriate failure code. + */ +static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc) +{ + struct msi_desc *desc, *tmp; + LIST_HEAD(list); + unsigned int i; + + lockdep_assert_held(&dev->msi.data->mutex); + + for (i = 0; i < ndesc; i++) { + desc = alloc_msi_entry(dev, 1, NULL); + if (!desc) + goto fail; + desc->msi_index = index + i; + list_add_tail(&desc->list, &list); + } + list_splice_tail(&list, &dev->msi.data->list); + return 0; + +fail: + list_for_each_entry_safe(desc, tmp, &list, list) { + list_del(&desc->list); + free_msi_entry(desc); + } + return -ENOMEM; +} + void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { *msg = entry->msg; -- cgit v1.2.3 From 645474e2cee450131e8b8d8a69a5d9bbabd43f3f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:12 +0100 Subject: genirq/msi: Provide domain flags to allocate/free MSI descriptors automatically Provide domain info flags which tell the core to allocate simple descriptors or to free descriptors when the interrupts are freed and implement the required functionality. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210747.928198636@linutronix.de --- include/linux/msi.h | 17 +++++++++++++++++ kernel/irq/msi.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index bbb8c1e2c18b..17e47ab8d57a 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -105,6 +105,8 @@ struct pci_msi_desc { }; }; +#define MSI_MAX_INDEX ((unsigned int)USHRT_MAX) + /** * struct msi_desc - Descriptor structure for MSI based interrupts * @list: List head for management @@ -248,6 +250,17 @@ static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg) #endif /* CONFIG_PCI_MSI */ int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc); +void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter, + unsigned int first_index, unsigned int last_index); + +/** + * msi_free_msi_descs - Free MSI descriptors of a device + * @dev: Device to free the descriptors + */ +static inline void msi_free_msi_descs(struct device *dev) +{ + msi_free_msi_descs_range(dev, MSI_DESC_ALL, 0, MSI_MAX_INDEX); +} struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, const struct irq_affinity_desc *affinity); @@ -408,6 +421,10 @@ enum { MSI_FLAG_DEV_SYSFS = (1 << 7), /* MSI-X entries must be contiguous */ MSI_FLAG_MSIX_CONTIGUOUS = (1 << 8), + /* Allocate simple MSI descriptors */ + MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS = (1 << 9), + /* Free MSI descriptors */ + MSI_FLAG_FREE_MSI_DESCS = (1 << 10), }; int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask, diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6ffe75eeba59..b511dc1a0219 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -119,6 +119,32 @@ fail: return -ENOMEM; } +/** + * msi_free_msi_descs_range - Free MSI descriptors of a device + * @dev: Device to free the descriptors + * @filter: Descriptor state filter + * @first_index: Index to start freeing from + * @last_index: Last index to be freed + */ +void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter, + unsigned int first_index, unsigned int last_index) +{ + struct msi_desc *desc; + + lockdep_assert_held(&dev->msi.data->mutex); + + msi_for_each_desc(desc, dev, filter) { + /* + * Stupid for now to handle MSI device domain until the + * storage is switched over to an xarray. + */ + if (desc->msi_index < first_index || desc->msi_index > last_index) + continue; + list_del(&desc->list); + free_msi_entry(desc); + } +} + void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { *msg = entry->msg; @@ -879,6 +905,16 @@ skip_activate: return 0; } +static int msi_domain_add_simple_msi_descs(struct msi_domain_info *info, + struct device *dev, + unsigned int num_descs) +{ + if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS)) + return 0; + + return msi_add_simple_msi_descs(dev, 0, num_descs); +} + /** * msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain * @domain: The domain to allocate from @@ -901,6 +937,10 @@ int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device lockdep_assert_held(&dev->msi.data->mutex); + ret = msi_domain_add_simple_msi_descs(info, dev, nvec); + if (ret) + return ret; + ret = ops->domain_alloc_irqs(domain, dev, nvec); if (ret) goto cleanup; @@ -962,6 +1002,13 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) } } +static void msi_domain_free_msi_descs(struct msi_domain_info *info, + struct device *dev) +{ + if (info->flags & MSI_FLAG_FREE_MSI_DESCS) + msi_free_msi_descs(dev); +} + /** * msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev * @domain: The domain to managing the interrupts @@ -982,6 +1029,7 @@ void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device if (info->flags & MSI_FLAG_DEV_SYSFS) msi_device_destroy_sysfs(dev); ops->domain_free_irqs(domain, dev); + msi_domain_free_msi_descs(info, dev); } /** -- cgit v1.2.3 From a80713fea3d12344e1da18f9113c74cdb3c463f1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:42 +0100 Subject: platform-msi: Simplify platform device MSI code The allocation code is overly complex. It tries to have the MSI index space packed, which is not working when an interrupt is freed. There is no requirement for this. The only requirement is that the MSI index is unique. Move the MSI descriptor allocation into msi_domain_populate_irqs() and use the Linux interrupt number as MSI index which fulfils the unique requirement. This requires to lock the MSI descriptors which makes the lock order reverse to the regular MSI alloc/free functions vs. the domain mutex. Assign a seperate lockdep class for these MSI device domains. Signed-off-by: Thomas Gleixner Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210748.956731741@linutronix.de --- drivers/base/platform-msi.c | 88 ++++++++++----------------------------------- kernel/irq/msi.c | 45 +++++++++++------------ 2 files changed, 39 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/drivers/base/platform-msi.c b/drivers/base/platform-msi.c index 01c897c45dcc..296ea673d661 100644 --- a/drivers/base/platform-msi.c +++ b/drivers/base/platform-msi.c @@ -246,6 +246,8 @@ void *platform_msi_get_host_data(struct irq_domain *domain) return data->host_data; } +static struct lock_class_key platform_device_msi_lock_class; + /** * __platform_msi_create_device_domain - Create a platform-msi device domain * @@ -278,6 +280,13 @@ __platform_msi_create_device_domain(struct device *dev, if (err) return NULL; + /* + * Use a separate lock class for the MSI descriptor mutex on + * platform MSI device domains because the descriptor mutex nests + * into the domain mutex. See alloc/free below. + */ + lockdep_set_class(&dev->msi.data->mutex, &platform_device_msi_lock_class); + data = dev->msi.data->platform_data; data->host_data = host_data; domain = irq_domain_create_hierarchy(dev->msi.domain, 0, @@ -300,75 +309,23 @@ free_priv: return NULL; } -static void platform_msi_free_descs(struct device *dev, int base, int nvec) -{ - struct msi_desc *desc, *tmp; - - list_for_each_entry_safe(desc, tmp, dev_to_msi_list(dev), list) { - if (desc->msi_index >= base && - desc->msi_index < (base + nvec)) { - list_del(&desc->list); - free_msi_entry(desc); - } - } -} - -static int platform_msi_alloc_descs_with_irq(struct device *dev, int virq, - int nvec) -{ - struct msi_desc *desc; - int i, base = 0; - - if (!list_empty(dev_to_msi_list(dev))) { - desc = list_last_entry(dev_to_msi_list(dev), - struct msi_desc, list); - base = desc->msi_index + 1; - } - - for (i = 0; i < nvec; i++) { - desc = alloc_msi_entry(dev, 1, NULL); - if (!desc) - break; - - desc->msi_index = base + i; - desc->irq = virq + i; - - list_add_tail(&desc->list, dev_to_msi_list(dev)); - } - - if (i != nvec) { - /* Clean up the mess */ - platform_msi_free_descs(dev, base, nvec); - return -ENOMEM; - } - - return 0; -} - /** * platform_msi_device_domain_free - Free interrupts associated with a platform-msi * device domain * * @domain: The platform-msi device domain * @virq: The base irq from which to perform the free operation - * @nvec: How many interrupts to free from @virq + * @nr_irqs: How many interrupts to free from @virq */ void platform_msi_device_domain_free(struct irq_domain *domain, unsigned int virq, - unsigned int nvec) + unsigned int nr_irqs) { struct platform_msi_priv_data *data = domain->host_data; - struct msi_desc *desc, *tmp; - for_each_msi_entry_safe(desc, tmp, data->dev) { - if (WARN_ON(!desc->irq || desc->nvec_used != 1)) - return; - if (!(desc->irq >= virq && desc->irq < (virq + nvec))) - continue; - - irq_domain_free_irqs_common(domain, desc->irq, 1); - list_del(&desc->list); - free_msi_entry(desc); - } + msi_lock_descs(data->dev); + irq_domain_free_irqs_common(domain, virq, nr_irqs); + msi_free_msi_descs_range(data->dev, MSI_DESC_ALL, virq, virq + nr_irqs - 1); + msi_unlock_descs(data->dev); } /** @@ -377,7 +334,7 @@ void platform_msi_device_domain_free(struct irq_domain *domain, unsigned int vir * * @domain: The platform-msi device domain * @virq: The base irq from which to perform the allocate operation - * @nr_irqs: How many interrupts to free from @virq + * @nr_irqs: How many interrupts to allocate from @virq * * Return 0 on success, or an error code on failure. Must be called * with irq_domain_mutex held (which can only be done as part of a @@ -387,16 +344,7 @@ int platform_msi_device_domain_alloc(struct irq_domain *domain, unsigned int vir unsigned int nr_irqs) { struct platform_msi_priv_data *data = domain->host_data; - int err; - - err = platform_msi_alloc_descs_with_irq(data->dev, virq, nr_irqs); - if (err) - return err; - - err = msi_domain_populate_irqs(domain->parent, data->dev, - virq, nr_irqs, &data->arg); - if (err) - platform_msi_device_domain_free(domain, virq, nr_irqs); + struct device *dev = data->dev; - return err; + return msi_domain_populate_irqs(domain->parent, dev, virq, nr_irqs, &data->arg); } diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index b511dc1a0219..09f34e17e891 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -731,43 +731,40 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, } int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, - int virq, int nvec, msi_alloc_info_t *arg) + int virq_base, int nvec, msi_alloc_info_t *arg) { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; struct msi_desc *desc; - int ret = 0; + int ret, virq; - for_each_msi_entry(desc, dev) { - /* Don't even try the multi-MSI brain damage. */ - if (WARN_ON(!desc->irq || desc->nvec_used != 1)) { - ret = -EINVAL; - break; + msi_lock_descs(dev); + for (virq = virq_base; virq < virq_base + nvec; virq++) { + desc = alloc_msi_entry(dev, 1, NULL); + if (!desc) { + ret = -ENOMEM; + goto fail; } - if (!(desc->irq >= virq && desc->irq < (virq + nvec))) - continue; + desc->msi_index = virq; + desc->irq = virq; + list_add_tail(&desc->list, &dev->msi.data->list); ops->set_desc(arg, desc); - /* Assumes the domain mutex is held! */ - ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1, - arg); + ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); if (ret) - break; - - irq_set_msi_desc_off(desc->irq, 0, desc); - } - - if (ret) { - /* Mop up the damage */ - for_each_msi_entry(desc, dev) { - if (!(desc->irq >= virq && desc->irq < (virq + nvec))) - continue; + goto fail; - irq_domain_free_irqs_common(domain, desc->irq, 1); - } + irq_set_msi_desc(virq, desc); } + msi_unlock_descs(dev); + return 0; +fail: + for (--virq; virq >= virq_base; virq--) + irq_domain_free_irqs_common(domain, virq, 1); + msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1); + msi_unlock_descs(dev); return ret; } -- cgit v1.2.3 From ef8dd01538ea2553ab101ddce6a85a321406d9c0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:44 +0100 Subject: genirq/msi: Make interrupt allocation less convoluted There is no real reason to do several loops over the MSI descriptors instead of just doing one loop. In case of an error everything is undone anyway so it does not matter whether it's a partial or a full rollback. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210749.010234767@linutronix.de --- .clang-format | 1 - include/linux/msi.h | 6 --- kernel/irq/msi.c | 129 ++++++++++++++++++++++++++++------------------------ 3 files changed, 69 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/.clang-format b/.clang-format index 15d4eaabc6b5..fa959436bcfd 100644 --- a/.clang-format +++ b/.clang-format @@ -216,7 +216,6 @@ ForEachMacros: - 'for_each_migratetype_order' - 'for_each_msi_entry' - 'for_each_msi_entry_safe' - - 'for_each_msi_vector' - 'for_each_net' - 'for_each_net_continue_reverse' - 'for_each_netdev' diff --git a/include/linux/msi.h b/include/linux/msi.h index 17e47ab8d57a..e8dd0be17e89 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -206,12 +206,6 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter); list_for_each_entry((desc), dev_to_msi_list((dev)), list) #define for_each_msi_entry_safe(desc, tmp, dev) \ list_for_each_entry_safe((desc), (tmp), dev_to_msi_list((dev)), list) -#define for_each_msi_vector(desc, __irq, dev) \ - for_each_msi_entry((desc), (dev)) \ - if ((desc)->irq) \ - for (__irq = (desc)->irq; \ - __irq < ((desc)->irq + (desc)->nvec_used); \ - __irq++) #ifdef CONFIG_IRQ_MSI_IOMMU static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 09f34e17e891..bbe36e20a986 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -828,23 +828,74 @@ static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc, return allocated ? allocated : -ENOSPC; } +#define VIRQ_CAN_RESERVE 0x01 +#define VIRQ_ACTIVATE 0x02 +#define VIRQ_NOMASK_QUIRK 0x04 + +static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags) +{ + struct irq_data *irqd = irq_domain_get_irq_data(domain, virq); + int ret; + + if (!(vflags & VIRQ_CAN_RESERVE)) { + irqd_clr_can_reserve(irqd); + if (vflags & VIRQ_NOMASK_QUIRK) + irqd_set_msi_nomask_quirk(irqd); + } + + if (!(vflags & VIRQ_ACTIVATE)) + return 0; + + ret = irq_domain_activate_irq(irqd, vflags & VIRQ_CAN_RESERVE); + if (ret) + return ret; + /* + * If the interrupt uses reservation mode, clear the activated bit + * so request_irq() will assign the final vector. + */ + if (vflags & VIRQ_CAN_RESERVE) + irqd_clr_activated(irqd); + return 0; +} + int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec) { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; - struct irq_data *irq_data; - struct msi_desc *desc; msi_alloc_info_t arg = { }; + unsigned int vflags = 0; + struct msi_desc *desc; int allocated = 0; int i, ret, virq; - bool can_reserve; ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) return ret; - for_each_msi_entry(desc, dev) { + /* + * This flag is set by the PCI layer as we need to activate + * the MSI entries before the PCI layer enables MSI in the + * card. Otherwise the card latches a random msi message. + */ + if (info->flags & MSI_FLAG_ACTIVATE_EARLY) + vflags |= VIRQ_ACTIVATE; + + /* + * Interrupt can use a reserved vector and will not occupy + * a real device vector until the interrupt is requested. + */ + if (msi_check_reservation_mode(domain, info, dev)) { + vflags |= VIRQ_CAN_RESERVE; + /* + * MSI affinity setting requires a special quirk (X86) when + * reservation mode is active. + */ + if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) + vflags |= VIRQ_NOMASK_QUIRK; + } + + msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) { ops->set_desc(&arg, desc); virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, @@ -856,49 +907,12 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, for (i = 0; i < desc->nvec_used; i++) { irq_set_msi_desc_off(virq, i, desc); irq_debugfs_copy_devname(virq + i, dev); + ret = msi_init_virq(domain, virq + i, vflags); + if (ret) + return ret; } allocated++; } - - can_reserve = msi_check_reservation_mode(domain, info, dev); - - /* - * This flag is set by the PCI layer as we need to activate - * the MSI entries before the PCI layer enables MSI in the - * card. Otherwise the card latches a random msi message. - */ - if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) - goto skip_activate; - - for_each_msi_vector(desc, i, dev) { - if (desc->irq == i) { - virq = desc->irq; - dev_dbg(dev, "irq [%d-%d] for MSI\n", - virq, virq + desc->nvec_used - 1); - } - - irq_data = irq_domain_get_irq_data(domain, i); - if (!can_reserve) { - irqd_clr_can_reserve(irq_data); - if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) - irqd_set_msi_nomask_quirk(irq_data); - } - ret = irq_domain_activate_irq(irq_data, can_reserve); - if (ret) - return ret; - } - -skip_activate: - /* - * If these interrupts use reservation mode, clear the activated bit - * so request_irq() will assign the final vector. - */ - if (can_reserve) { - for_each_msi_vector(desc, i, dev) { - irq_data = irq_domain_get_irq_data(domain, i); - irqd_clr_activated(irq_data); - } - } return 0; } @@ -976,26 +990,21 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nve void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { - struct irq_data *irq_data; + struct irq_data *irqd; struct msi_desc *desc; int i; - for_each_msi_vector(desc, i, dev) { - irq_data = irq_domain_get_irq_data(domain, i); - if (irqd_is_activated(irq_data)) - irq_domain_deactivate_irq(irq_data); - } - - for_each_msi_entry(desc, dev) { - /* - * We might have failed to allocate an MSI early - * enough that there is no IRQ associated to this - * entry. If that's the case, don't do anything. - */ - if (desc->irq) { - irq_domain_free_irqs(desc->irq, desc->nvec_used); - desc->irq = 0; + /* Only handle MSI entries which have an interrupt associated */ + msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) { + /* Make sure all interrupts are deactivated */ + for (i = 0; i < desc->nvec_used; i++) { + irqd = irq_domain_get_irq_data(domain, desc->irq + i); + if (irqd && irqd_is_activated(irqd)) + irq_domain_deactivate_irq(irqd); } + + irq_domain_free_irqs(desc->irq, desc->nvec_used); + desc->irq = 0; } } -- cgit v1.2.3 From 495c66aca3da704e063fa373fdbe371e71d3f4ee Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:45 +0100 Subject: genirq/msi: Convert to new functions Use the new iterator functions and add locking where required. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210749.063705667@linutronix.de --- kernel/irq/msi.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index bbe36e20a986..745434efb557 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -320,6 +320,7 @@ EXPORT_SYMBOL_GPL(msi_next_desc); unsigned int msi_get_virq(struct device *dev, unsigned int index) { struct msi_desc *desc; + unsigned int ret = 0; bool pcimsi; if (!dev->msi.data) @@ -327,11 +328,12 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index) pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false; - for_each_msi_entry(desc, dev) { + msi_lock_descs(dev); + msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) { /* PCI-MSI has only one descriptor for multiple interrupts. */ if (pcimsi) { - if (desc->irq && index < desc->nvec_used) - return desc->irq + index; + if (index < desc->nvec_used) + ret = desc->irq + index; break; } @@ -339,10 +341,13 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index) * PCI-MSIX and platform MSI use a descriptor per * interrupt. */ - if (desc->msi_index == index) - return desc->irq; + if (desc->msi_index == index) { + ret = desc->irq; + break; + } } - return 0; + msi_unlock_descs(dev); + return ret; } EXPORT_SYMBOL_GPL(msi_get_virq); @@ -373,7 +378,7 @@ static const struct attribute_group **msi_populate_sysfs(struct device *dev) int i; /* Determine how many msi entries we have */ - for_each_msi_entry(entry, dev) + msi_for_each_desc(entry, dev, MSI_DESC_ALL) num_msi += entry->nvec_used; if (!num_msi) return NULL; @@ -383,7 +388,7 @@ static const struct attribute_group **msi_populate_sysfs(struct device *dev) if (!msi_attrs) return ERR_PTR(-ENOMEM); - for_each_msi_entry(entry, dev) { + msi_for_each_desc(entry, dev, MSI_DESC_ALL) { for (i = 0; i < entry->nvec_used; i++) { msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL); if (!msi_dev_attr) @@ -803,7 +808,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, * Checking the first MSI descriptor is sufficient. MSIX supports * masking and MSI does so when the can_mask attribute is set. */ - desc = first_msi_entry(dev); + desc = msi_first_desc(dev, MSI_DESC_ALL); return desc->pci.msi_attrib.is_msix || desc->pci.msi_attrib.can_mask; } -- cgit v1.2.3 From cc9a246dbf6bdef56d9eee296a1db52dd0607976 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:47 +0100 Subject: genirq/msi: Mop up old interfaces Get rid of the old iterators, alloc/free functions and adjust the core code accordingly. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210749.117395027@linutronix.de --- include/linux/msi.h | 15 --------------- kernel/irq/msi.c | 31 +++++++++++++++---------------- 2 files changed, 15 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index e8dd0be17e89..b54010ba7b0d 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -197,15 +197,7 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter); for ((desc) = msi_first_desc((dev), (filter)); (desc); \ (desc) = msi_next_desc((dev), (filter))) -/* Helpers to hide struct msi_desc implementation details */ #define msi_desc_to_dev(desc) ((desc)->dev) -#define dev_to_msi_list(dev) (&(dev)->msi.data->list) -#define first_msi_entry(dev) \ - list_first_entry(dev_to_msi_list((dev)), struct msi_desc, list) -#define for_each_msi_entry(desc, dev) \ - list_for_each_entry((desc), dev_to_msi_list((dev)), list) -#define for_each_msi_entry_safe(desc, tmp, dev) \ - list_for_each_entry_safe((desc), (tmp), dev_to_msi_list((dev)), list) #ifdef CONFIG_IRQ_MSI_IOMMU static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) @@ -231,10 +223,6 @@ static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, #endif #ifdef CONFIG_PCI_MSI -#define first_pci_msi_entry(pdev) first_msi_entry(&(pdev)->dev) -#define for_each_pci_msi_entry(desc, pdev) \ - for_each_msi_entry((desc), &(pdev)->dev) - struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc); void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg); #else /* CONFIG_PCI_MSI */ @@ -256,9 +244,6 @@ static inline void msi_free_msi_descs(struct device *dev) msi_free_msi_descs_range(dev, MSI_DESC_ALL, 0, MSI_MAX_INDEX); } -struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, - const struct irq_affinity_desc *affinity); -void free_msi_entry(struct msi_desc *entry); void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 745434efb557..e8c19740ca0c 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -19,8 +19,10 @@ #include "internals.h" +#define dev_to_msi_list(dev) (&(dev)->msi.data->list) + /** - * alloc_msi_entry - Allocate an initialized msi_desc + * msi_alloc_desc - Allocate an initialized msi_desc * @dev: Pointer to the device for which this is allocated * @nvec: The number of vectors used in this entry * @affinity: Optional pointer to an affinity mask array size of @nvec @@ -30,12 +32,11 @@ * * Return: pointer to allocated &msi_desc on success or %NULL on failure */ -struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, - const struct irq_affinity_desc *affinity) +static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec, + const struct irq_affinity_desc *affinity) { - struct msi_desc *desc; + struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); - desc = kzalloc(sizeof(*desc), GFP_KERNEL); if (!desc) return NULL; @@ -43,21 +44,19 @@ struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, desc->dev = dev; desc->nvec_used = nvec; if (affinity) { - desc->affinity = kmemdup(affinity, - nvec * sizeof(*desc->affinity), GFP_KERNEL); + desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL); if (!desc->affinity) { kfree(desc); return NULL; } } - return desc; } -void free_msi_entry(struct msi_desc *entry) +static void msi_free_desc(struct msi_desc *desc) { - kfree(entry->affinity); - kfree(entry); + kfree(desc->affinity); + kfree(desc); } /** @@ -73,7 +72,7 @@ int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc) lockdep_assert_held(&dev->msi.data->mutex); - desc = alloc_msi_entry(dev, init_desc->nvec_used, init_desc->affinity); + desc = msi_alloc_desc(dev, init_desc->nvec_used, init_desc->affinity); if (!desc) return -ENOMEM; @@ -102,7 +101,7 @@ static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsi lockdep_assert_held(&dev->msi.data->mutex); for (i = 0; i < ndesc; i++) { - desc = alloc_msi_entry(dev, 1, NULL); + desc = msi_alloc_desc(dev, 1, NULL); if (!desc) goto fail; desc->msi_index = index + i; @@ -114,7 +113,7 @@ static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsi fail: list_for_each_entry_safe(desc, tmp, &list, list) { list_del(&desc->list); - free_msi_entry(desc); + msi_free_desc(desc); } return -ENOMEM; } @@ -141,7 +140,7 @@ void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter, if (desc->msi_index < first_index || desc->msi_index > last_index) continue; list_del(&desc->list); - free_msi_entry(desc); + msi_free_desc(desc); } } @@ -745,7 +744,7 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, msi_lock_descs(dev); for (virq = virq_base; virq < virq_base + nvec; virq++) { - desc = alloc_msi_entry(dev, 1, NULL); + desc = msi_alloc_desc(dev, 1, NULL); if (!desc) { ret = -ENOMEM; goto fail; -- cgit v1.2.3 From bf5e758f02fc739589dcc6a3395c3a3eb77b5c90 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:50 +0100 Subject: genirq/msi: Simplify sysfs handling The sysfs handling for MSI is a convoluted maze and it is in the way of supporting dynamic expansion of the MSI-X vectors because it only supports a one off bulk population/free of the sysfs entries. Change it to do: 1) Creating an empty sysfs attribute group when msi_device_data is allocated 2) Populate the entries when the MSI descriptor is initialized 3) Free the entries when a MSI descriptor is detached from a Linux interrupt. 4) Provide functions for the legacy non-irqdomain fallback code to do a bulk population/free. This code won't support dynamic expansion. This makes the code simpler and reduces the number of allocations as the empty attribute group can be shared. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210749.224917330@linutronix.de --- include/linux/msi.h | 23 +++--- kernel/irq/msi.c | 198 ++++++++++++++++++++++++---------------------------- 2 files changed, 103 insertions(+), 118 deletions(-) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index 70cc6a555a8e..1a00367d2cfa 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -71,7 +71,7 @@ struct irq_data; struct msi_desc; struct pci_dev; struct platform_msi_priv_data; -struct attribute_group; +struct device_attribute; void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg); #ifdef CONFIG_GENERIC_MSI_IRQ @@ -129,6 +129,7 @@ struct pci_msi_desc { * @dev: Pointer to the device which uses this descriptor * @msg: The last set MSI message cached for reuse * @affinity: Optional pointer to a cpu affinity mask for this descriptor + * @sysfs_attr: Pointer to sysfs device attribute * * @write_msi_msg: Callback that may be called when the MSI message * address or data changes @@ -148,6 +149,9 @@ struct msi_desc { #ifdef CONFIG_IRQ_MSI_IOMMU const void *iommu_cookie; #endif +#ifdef CONFIG_SYSFS + struct device_attribute *sysfs_attrs; +#endif void (*write_msi_msg)(struct msi_desc *entry, void *data); void *write_msi_msg_data; @@ -171,7 +175,6 @@ enum msi_desc_filter { /** * msi_device_data - MSI per device data * @properties: MSI properties which are interesting to drivers - * @attrs: Pointer to the sysfs attribute group * @platform_data: Platform-MSI specific data * @list: List of MSI descriptors associated to the device * @mutex: Mutex protecting the MSI list @@ -179,7 +182,6 @@ enum msi_desc_filter { */ struct msi_device_data { unsigned long properties; - const struct attribute_group **attrs; struct platform_msi_priv_data *platform_data; struct list_head list; struct mutex mutex; @@ -264,14 +266,6 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); -#ifdef CONFIG_SYSFS -int msi_device_populate_sysfs(struct device *dev); -void msi_device_destroy_sysfs(struct device *dev); -#else /* CONFIG_SYSFS */ -static inline int msi_device_populate_sysfs(struct device *dev) { return 0; } -static inline void msi_device_destroy_sysfs(struct device *dev) { } -#endif /* !CONFIG_SYSFS */ - /* * The arch hooks to setup up msi irqs. Default functions are implemented * as weak symbols so that they /can/ be overriden by architecture specific @@ -285,6 +279,13 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc); void arch_teardown_msi_irq(unsigned int irq); int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void arch_teardown_msi_irqs(struct pci_dev *dev); +#ifdef CONFIG_SYSFS +int msi_device_populate_sysfs(struct device *dev); +void msi_device_destroy_sysfs(struct device *dev); +#else /* CONFIG_SYSFS */ +static inline int msi_device_populate_sysfs(struct device *dev) { return 0; } +static inline void msi_device_destroy_sysfs(struct device *dev) { } +#endif /* !CONFIG_SYSFS */ #endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */ /* diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index e8c19740ca0c..d290e09258bc 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -19,6 +19,7 @@ #include "internals.h" +static inline int msi_sysfs_create_group(struct device *dev); #define dev_to_msi_list(dev) (&(dev)->msi.data->list) /** @@ -178,6 +179,7 @@ static void msi_device_data_release(struct device *dev, void *res) int msi_setup_device_data(struct device *dev) { struct msi_device_data *md; + int ret; if (dev->msi.data) return 0; @@ -186,6 +188,12 @@ int msi_setup_device_data(struct device *dev) if (!md) return -ENOMEM; + ret = msi_sysfs_create_group(dev); + if (ret) { + devres_free(md); + return ret; + } + INIT_LIST_HEAD(&md->list); mutex_init(&md->mutex); dev->msi.data = md; @@ -351,6 +359,20 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index) EXPORT_SYMBOL_GPL(msi_get_virq); #ifdef CONFIG_SYSFS +static struct attribute *msi_dev_attrs[] = { + NULL +}; + +static const struct attribute_group msi_irqs_group = { + .name = "msi_irqs", + .attrs = msi_dev_attrs, +}; + +static inline int msi_sysfs_create_group(struct device *dev) +{ + return devm_device_add_group(dev, &msi_irqs_group); +} + static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -360,97 +382,74 @@ static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi"); } -/** - * msi_populate_sysfs - Populate msi_irqs sysfs entries for devices - * @dev: The device(PCI, platform etc) who will get sysfs entries - */ -static const struct attribute_group **msi_populate_sysfs(struct device *dev) +static void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { - const struct attribute_group **msi_irq_groups; - struct attribute **msi_attrs, *msi_attr; - struct device_attribute *msi_dev_attr; - struct attribute_group *msi_irq_group; - struct msi_desc *entry; - int ret = -ENOMEM; - int num_msi = 0; - int count = 0; + struct device_attribute *attrs = desc->sysfs_attrs; int i; - /* Determine how many msi entries we have */ - msi_for_each_desc(entry, dev, MSI_DESC_ALL) - num_msi += entry->nvec_used; - if (!num_msi) - return NULL; + if (!attrs) + return; - /* Dynamically create the MSI attributes for the device */ - msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL); - if (!msi_attrs) - return ERR_PTR(-ENOMEM); - - msi_for_each_desc(entry, dev, MSI_DESC_ALL) { - for (i = 0; i < entry->nvec_used; i++) { - msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL); - if (!msi_dev_attr) - goto error_attrs; - msi_attrs[count] = &msi_dev_attr->attr; - - sysfs_attr_init(&msi_dev_attr->attr); - msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d", - entry->irq + i); - if (!msi_dev_attr->attr.name) - goto error_attrs; - msi_dev_attr->attr.mode = 0444; - msi_dev_attr->show = msi_mode_show; - ++count; - } + desc->sysfs_attrs = NULL; + for (i = 0; i < desc->nvec_used; i++) { + if (attrs[i].show) + sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name); + kfree(attrs[i].attr.name); } + kfree(attrs); +} - msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL); - if (!msi_irq_group) - goto error_attrs; - msi_irq_group->name = "msi_irqs"; - msi_irq_group->attrs = msi_attrs; +static int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) +{ + struct device_attribute *attrs; + int ret, i; - msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL); - if (!msi_irq_groups) - goto error_irq_group; - msi_irq_groups[0] = msi_irq_group; + attrs = kcalloc(desc->nvec_used, sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return -ENOMEM; - ret = sysfs_create_groups(&dev->kobj, msi_irq_groups); - if (ret) - goto error_irq_groups; - - return msi_irq_groups; - -error_irq_groups: - kfree(msi_irq_groups); -error_irq_group: - kfree(msi_irq_group); -error_attrs: - count = 0; - msi_attr = msi_attrs[count]; - while (msi_attr) { - msi_dev_attr = container_of(msi_attr, struct device_attribute, attr); - kfree(msi_attr->name); - kfree(msi_dev_attr); - ++count; - msi_attr = msi_attrs[count]; + desc->sysfs_attrs = attrs; + for (i = 0; i < desc->nvec_used; i++) { + sysfs_attr_init(&attrs[i].attr); + attrs[i].attr.name = kasprintf(GFP_KERNEL, "%d", desc->irq + i); + if (!attrs[i].attr.name) { + ret = -ENOMEM; + goto fail; + } + + attrs[i].attr.mode = 0444; + attrs[i].show = msi_mode_show; + + ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name); + if (ret) { + attrs[i].show = NULL; + goto fail; + } } - kfree(msi_attrs); - return ERR_PTR(ret); + return 0; + +fail: + msi_sysfs_remove_desc(dev, desc); + return ret; } +#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS /** * msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device * @dev: The device (PCI, platform etc) which will get sysfs entries */ int msi_device_populate_sysfs(struct device *dev) { - const struct attribute_group **group = msi_populate_sysfs(dev); + struct msi_desc *desc; + int ret; - if (IS_ERR(group)) - return PTR_ERR(group); - dev->msi.data->attrs = group; + msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) { + if (desc->sysfs_attrs) + continue; + ret = msi_sysfs_populate_desc(dev, desc); + if (ret) + return ret; + } return 0; } @@ -461,28 +460,17 @@ int msi_device_populate_sysfs(struct device *dev) */ void msi_device_destroy_sysfs(struct device *dev) { - const struct attribute_group **msi_irq_groups = dev->msi.data->attrs; - struct device_attribute *dev_attr; - struct attribute **msi_attrs; - int count = 0; - - dev->msi.data->attrs = NULL; - if (!msi_irq_groups) - return; + struct msi_desc *desc; - sysfs_remove_groups(&dev->kobj, msi_irq_groups); - msi_attrs = msi_irq_groups[0]->attrs; - while (msi_attrs[count]) { - dev_attr = container_of(msi_attrs[count], struct device_attribute, attr); - kfree(dev_attr->attr.name); - kfree(dev_attr); - ++count; - } - kfree(msi_attrs); - kfree(msi_irq_groups[0]); - kfree(msi_irq_groups); + msi_for_each_desc(desc, dev, MSI_DESC_ALL) + msi_sysfs_remove_desc(dev, desc); } -#endif +#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK */ +#else /* CONFIG_SYSFS */ +static inline int msi_sysfs_create_group(struct device *dev) { return 0; } +static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; } +static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { } +#endif /* !CONFIG_SYSFS */ #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN static inline void irq_chip_write_msi_msg(struct irq_data *data, @@ -914,6 +902,12 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ret = msi_init_virq(domain, virq + i, vflags); if (ret) return ret; + + if (info->flags & MSI_FLAG_DEV_SYSFS) { + ret = msi_sysfs_populate_desc(dev, desc); + if (ret) + return ret; + } } allocated++; } @@ -958,18 +952,7 @@ int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device ret = ops->domain_alloc_irqs(domain, dev, nvec); if (ret) - goto cleanup; - - if (!(info->flags & MSI_FLAG_DEV_SYSFS)) - return 0; - - ret = msi_device_populate_sysfs(dev); - if (ret) - goto cleanup; - return 0; - -cleanup: - msi_domain_free_irqs_descs_locked(domain, dev); + msi_domain_free_irqs_descs_locked(domain, dev); return ret; } @@ -994,6 +977,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nve void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { + struct msi_domain_info *info = domain->host_data; struct irq_data *irqd; struct msi_desc *desc; int i; @@ -1008,6 +992,8 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) } irq_domain_free_irqs(desc->irq, desc->nvec_used); + if (info->flags & MSI_FLAG_DEV_SYSFS) + msi_sysfs_remove_desc(dev, desc); desc->irq = 0; } } @@ -1036,8 +1022,6 @@ void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device lockdep_assert_held(&dev->msi.data->mutex); - if (info->flags & MSI_FLAG_DEV_SYSFS) - msi_device_destroy_sysfs(dev); ops->domain_free_irqs(domain, dev); msi_domain_free_msi_descs(info, dev); } -- cgit v1.2.3 From cd6cf06590b9792340dceaa285138777f3cc4d90 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Dec 2021 23:51:52 +0100 Subject: genirq/msi: Convert storage to xarray The current linked list storage for MSI descriptors is suboptimal in several ways: 1) Looking up a MSI desciptor requires a O(n) list walk in the worst case 2) The upcoming support of runtime expansion of MSI-X vectors would need to do a full list walk to figure out whether a particular index is already associated. 3) Runtime expansion of sparse allocations is even more complex as the current implementation assumes an ordered list (increasing MSI index). Use an xarray which solves all of the above problems nicely. Signed-off-by: Thomas Gleixner Tested-by: Michael Kelley Tested-by: Nishanth Menon Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20211206210749.280627070@linutronix.de --- include/linux/msi.h | 13 ++-- kernel/irq/msi.c | 169 ++++++++++++++++++++++++---------------------------- 2 files changed, 83 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index 1a00367d2cfa..fc918a658d48 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -17,6 +17,7 @@ */ #include +#include #include #include #include @@ -123,7 +124,6 @@ struct pci_msi_desc { /** * struct msi_desc - Descriptor structure for MSI based interrupts - * @list: List head for management * @irq: The base interrupt number * @nvec_used: The number of vectors used * @dev: Pointer to the device which uses this descriptor @@ -140,7 +140,6 @@ struct pci_msi_desc { */ struct msi_desc { /* Shared device/bus type independent data */ - struct list_head list; unsigned int irq; unsigned int nvec_used; struct device *dev; @@ -176,16 +175,16 @@ enum msi_desc_filter { * msi_device_data - MSI per device data * @properties: MSI properties which are interesting to drivers * @platform_data: Platform-MSI specific data - * @list: List of MSI descriptors associated to the device - * @mutex: Mutex protecting the MSI list - * @__next: Cached pointer to the next entry for iterators + * @mutex: Mutex protecting the MSI descriptor store + * @__store: Xarray for storing MSI descriptor pointers + * @__iter_idx: Index to search the next entry for iterators */ struct msi_device_data { unsigned long properties; struct platform_msi_priv_data *platform_data; - struct list_head list; struct mutex mutex; - struct msi_desc *__next; + struct xarray __store; + unsigned long __iter_idx; }; int msi_setup_device_data(struct device *dev); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index d290e09258bc..173bc04f9fe5 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -20,7 +20,6 @@ #include "internals.h" static inline int msi_sysfs_create_group(struct device *dev); -#define dev_to_msi_list(dev) (&(dev)->msi.data->list) /** * msi_alloc_desc - Allocate an initialized msi_desc @@ -41,7 +40,6 @@ static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec, if (!desc) return NULL; - INIT_LIST_HEAD(&desc->list); desc->dev = dev; desc->nvec_used = nvec; if (affinity) { @@ -60,6 +58,17 @@ static void msi_free_desc(struct msi_desc *desc) kfree(desc); } +static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc, unsigned int index) +{ + int ret; + + desc->msi_index = index; + ret = xa_insert(&md->__store, index, desc, GFP_KERNEL); + if (ret) + msi_free_desc(desc); + return ret; +} + /** * msi_add_msi_desc - Allocate and initialize a MSI descriptor * @dev: Pointer to the device for which the descriptor is allocated @@ -77,12 +86,9 @@ int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc) if (!desc) return -ENOMEM; - /* Copy the MSI index and type specific data to the new descriptor. */ - desc->msi_index = init_desc->msi_index; + /* Copy type specific data to the new descriptor. */ desc->pci = init_desc->pci; - - list_add_tail(&desc->list, &dev->msi.data->list); - return 0; + return msi_insert_desc(dev->msi.data, desc, init_desc->msi_index); } /** @@ -95,28 +101,41 @@ int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc) */ static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc) { - struct msi_desc *desc, *tmp; - LIST_HEAD(list); - unsigned int i; + unsigned int idx, last = index + ndesc - 1; + struct msi_desc *desc; + int ret; lockdep_assert_held(&dev->msi.data->mutex); - for (i = 0; i < ndesc; i++) { + for (idx = index; idx <= last; idx++) { desc = msi_alloc_desc(dev, 1, NULL); if (!desc) + goto fail_mem; + ret = msi_insert_desc(dev->msi.data, desc, idx); + if (ret) goto fail; - desc->msi_index = index + i; - list_add_tail(&desc->list, &list); } - list_splice_tail(&list, &dev->msi.data->list); return 0; +fail_mem: + ret = -ENOMEM; fail: - list_for_each_entry_safe(desc, tmp, &list, list) { - list_del(&desc->list); - msi_free_desc(desc); + msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, last); + return ret; +} + +static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter) +{ + switch (filter) { + case MSI_DESC_ALL: + return true; + case MSI_DESC_NOTASSOCIATED: + return !desc->irq; + case MSI_DESC_ASSOCIATED: + return !!desc->irq; } - return -ENOMEM; + WARN_ON_ONCE(1); + return false; } /** @@ -129,19 +148,17 @@ fail: void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter, unsigned int first_index, unsigned int last_index) { + struct xarray *xa = &dev->msi.data->__store; struct msi_desc *desc; + unsigned long idx; lockdep_assert_held(&dev->msi.data->mutex); - msi_for_each_desc(desc, dev, filter) { - /* - * Stupid for now to handle MSI device domain until the - * storage is switched over to an xarray. - */ - if (desc->msi_index < first_index || desc->msi_index > last_index) - continue; - list_del(&desc->list); - msi_free_desc(desc); + xa_for_each_range(xa, idx, desc, first_index, last_index) { + if (msi_desc_match(desc, filter)) { + xa_erase(xa, idx); + msi_free_desc(desc); + } } } @@ -162,7 +179,8 @@ static void msi_device_data_release(struct device *dev, void *res) { struct msi_device_data *md = res; - WARN_ON_ONCE(!list_empty(&md->list)); + WARN_ON_ONCE(!xa_empty(&md->__store)); + xa_destroy(&md->__store); dev->msi.data = NULL; } @@ -194,7 +212,7 @@ int msi_setup_device_data(struct device *dev) return ret; } - INIT_LIST_HEAD(&md->list); + xa_init(&md->__store); mutex_init(&md->mutex); dev->msi.data = md; devres_add(dev, md); @@ -217,34 +235,21 @@ EXPORT_SYMBOL_GPL(msi_lock_descs); */ void msi_unlock_descs(struct device *dev) { - /* Clear the next pointer which was cached by the iterator */ - dev->msi.data->__next = NULL; + /* Invalidate the index wich was cached by the iterator */ + dev->msi.data->__iter_idx = MSI_MAX_INDEX; mutex_unlock(&dev->msi.data->mutex); } EXPORT_SYMBOL_GPL(msi_unlock_descs); -static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter) -{ - switch (filter) { - case MSI_DESC_ALL: - return true; - case MSI_DESC_NOTASSOCIATED: - return !desc->irq; - case MSI_DESC_ASSOCIATED: - return !!desc->irq; - } - WARN_ON_ONCE(1); - return false; -} - -static struct msi_desc *msi_find_first_desc(struct device *dev, enum msi_desc_filter filter) +static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_filter filter) { struct msi_desc *desc; - list_for_each_entry(desc, dev_to_msi_list(dev), list) { + xa_for_each_start(&md->__store, md->__iter_idx, desc, md->__iter_idx) { if (msi_desc_match(desc, filter)) return desc; } + md->__iter_idx = MSI_MAX_INDEX; return NULL; } @@ -261,37 +266,24 @@ static struct msi_desc *msi_find_first_desc(struct device *dev, enum msi_desc_fi */ struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter) { - struct msi_desc *desc; + struct msi_device_data *md = dev->msi.data; - if (WARN_ON_ONCE(!dev->msi.data)) + if (WARN_ON_ONCE(!md)) return NULL; - lockdep_assert_held(&dev->msi.data->mutex); + lockdep_assert_held(&md->mutex); - desc = msi_find_first_desc(dev, filter); - dev->msi.data->__next = desc ? list_next_entry(desc, list) : NULL; - return desc; + md->__iter_idx = 0; + return msi_find_desc(md, filter); } EXPORT_SYMBOL_GPL(msi_first_desc); -static struct msi_desc *__msi_next_desc(struct device *dev, enum msi_desc_filter filter, - struct msi_desc *from) -{ - struct msi_desc *desc = from; - - list_for_each_entry_from(desc, dev_to_msi_list(dev), list) { - if (msi_desc_match(desc, filter)) - return desc; - } - return NULL; -} - /** * msi_next_desc - Get the next MSI descriptor of a device * @dev: Device to operate on * * The first invocation of msi_next_desc() has to be preceeded by a - * successful incovation of __msi_first_desc(). Consecutive invocations are + * successful invocation of __msi_first_desc(). Consecutive invocations are * only valid if the previous one was successful. All these operations have * to be done within the same MSI mutex held region. * @@ -300,20 +292,18 @@ static struct msi_desc *__msi_next_desc(struct device *dev, enum msi_desc_filter */ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter) { - struct msi_device_data *data = dev->msi.data; - struct msi_desc *desc; + struct msi_device_data *md = dev->msi.data; - if (WARN_ON_ONCE(!data)) + if (WARN_ON_ONCE(!md)) return NULL; - lockdep_assert_held(&data->mutex); + lockdep_assert_held(&md->mutex); - if (!data->__next) + if (md->__iter_idx >= (unsigned long)MSI_MAX_INDEX) return NULL; - desc = __msi_next_desc(dev, filter, data->__next); - dev->msi.data->__next = desc ? list_next_entry(desc, list) : NULL; - return desc; + md->__iter_idx++; + return msi_find_desc(md, filter); } EXPORT_SYMBOL_GPL(msi_next_desc); @@ -336,21 +326,18 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index) pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false; msi_lock_descs(dev); - msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) { - /* PCI-MSI has only one descriptor for multiple interrupts. */ - if (pcimsi) { - if (index < desc->nvec_used) - ret = desc->irq + index; - break; - } - + desc = xa_load(&dev->msi.data->__store, pcimsi ? 0 : index); + if (desc && desc->irq) { /* + * PCI-MSI has only one descriptor for multiple interrupts. * PCI-MSIX and platform MSI use a descriptor per * interrupt. */ - if (desc->msi_index == index) { + if (pcimsi) { + if (index < desc->nvec_used) + ret = desc->irq + index; + } else { ret = desc->irq; - break; } } msi_unlock_descs(dev); @@ -731,16 +718,13 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, int ret, virq; msi_lock_descs(dev); - for (virq = virq_base; virq < virq_base + nvec; virq++) { - desc = msi_alloc_desc(dev, 1, NULL); - if (!desc) { - ret = -ENOMEM; - goto fail; - } + ret = msi_add_simple_msi_descs(dev, virq_base, nvec); + if (ret) + goto unlock; - desc->msi_index = virq; + for (virq = virq_base; virq < virq_base + nvec; virq++) { + desc = xa_load(&dev->msi.data->__store, virq); desc->irq = virq; - list_add_tail(&desc->list, &dev->msi.data->list); ops->set_desc(arg, desc); ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg); @@ -756,6 +740,7 @@ fail: for (--virq; virq >= virq_base; virq--) irq_domain_free_irqs_common(domain, virq, 1); msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1); +unlock: msi_unlock_descs(dev); return ret; } -- cgit v1.2.3 From aef2feda97b840ec38e9fa53d0065188453304e8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 15 Dec 2021 18:55:37 -0800 Subject: add missing bpf-cgroup.h includes We're about to break the cgroup-defs.h -> bpf-cgroup.h dependency, make sure those who actually need more than the definition of struct cgroup_bpf include bpf-cgroup.h explicitly. Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Acked-by: Tejun Heo Link: https://lore.kernel.org/bpf/20211216025538.1649516-3-kuba@kernel.org --- kernel/bpf/helpers.c | 1 + kernel/bpf/syscall.c | 1 + kernel/bpf/verifier.c | 1 + kernel/cgroup/cgroup.c | 1 + kernel/trace/trace_kprobe.c | 1 + kernel/trace/trace_uprobe.c | 1 + net/ipv4/udp.c | 1 + net/ipv6/udp.c | 1 + net/socket.c | 1 + security/device_cgroup.c | 1 + 10 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 8babae03d30a..34d6f91dec1c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2,6 +2,7 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include +#include #include #include #include diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ddd81d543203..da07bdf71697 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2,6 +2,7 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include +#include #include #include #include diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d74e8a99412e..f0604796132f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4,6 +4,7 @@ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io */ #include +#include #include #include #include diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 919194de39c8..cd4c23f7e3df 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -30,6 +30,7 @@ #include "cgroup-internal.h" +#include #include #include #include diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 33272a7b6912..4e1257f50aa3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -7,6 +7,7 @@ */ #define pr_fmt(fmt) "trace_kprobe: " fmt +#include #include #include #include diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index f5f0039d31e5..4f35514a48f3 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -7,6 +7,7 @@ */ #define pr_fmt(fmt) "trace_uprobe: " fmt +#include #include #include #include diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 69d30053fed9..99536127650b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -74,6 +74,7 @@ #define pr_fmt(fmt) "UDP: " fmt +#include #include #include #include diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6a0e569f0bb8..ba8986d12413 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -17,6 +17,7 @@ * YOSHIFUJI Hideaki @USAGI: convert /proc/net/udp6 to seq_file. */ +#include #include #include #include diff --git a/net/socket.c b/net/socket.c index 7f64a6eccf63..721a5a1b1106 100644 --- a/net/socket.c +++ b/net/socket.c @@ -52,6 +52,7 @@ * Based upon Swansea University Computer Society NET3.039 */ +#include #include #include #include diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 04375df52fc9..842889f3dcb7 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -5,6 +5,7 @@ * Copyright 2007 IBM Corp */ +#include #include #include #include -- cgit v1.2.3 From 0f55f9ed21f96630c6ec96805d42f92c0b458b37 Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Thu, 16 Dec 2021 13:33:56 -0800 Subject: bpf: Only print scratched registers and stack slots to verifier logs. When printing verifier state for any log level, print full verifier state only on function calls or on errors. Otherwise, only print the registers and stack slots that were accessed. Log size differences: verif_scale_loop6 before: 234566564 verif_scale_loop6 after: 72143943 69% size reduction kfree_skb before: 166406 kfree_skb after: 55386 69% size reduction Before: 156: (61) r0 = *(u32 *)(r1 +0) 157: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=ctx(id=0,off=0,imm=0) R2_w=invP0 R10=fp0 fp-8_w=00000000 fp-16_w=00\ 000000 fp-24_w=00000000 fp-32_w=00000000 fp-40_w=00000000 fp-48_w=00000000 fp-56_w=00000000 fp-64_w=00000000 fp-72_w=00000000 fp-80_w=00000\ 000 fp-88_w=00000000 fp-96_w=00000000 fp-104_w=00000000 fp-112_w=00000000 fp-120_w=00000000 fp-128_w=00000000 fp-136_w=00000000 fp-144_w=00\ 000000 fp-152_w=00000000 fp-160_w=00000000 fp-168_w=00000000 fp-176_w=00000000 fp-184_w=00000000 fp-192_w=00000000 fp-200_w=00000000 fp-208\ _w=00000000 fp-216_w=00000000 fp-224_w=00000000 fp-232_w=00000000 fp-240_w=00000000 fp-248_w=00000000 fp-256_w=00000000 fp-264_w=00000000 f\ p-272_w=00000000 fp-280_w=00000000 fp-288_w=00000000 fp-296_w=00000000 fp-304_w=00000000 fp-312_w=00000000 fp-320_w=00000000 fp-328_w=00000\ 000 fp-336_w=00000000 fp-344_w=00000000 fp-352_w=00000000 fp-360_w=00000000 fp-368_w=00000000 fp-376_w=00000000 fp-384_w=00000000 fp-392_w=\ 00000000 fp-400_w=00000000 fp-408_w=00000000 fp-416_w=00000000 fp-424_w=00000000 fp-432_w=00000000 fp-440_w=00000000 fp-448_w=00000000 ; return skb->len; 157: (95) exit Func#4 is safe for any args that match its prototype Validating get_constant() func#5... 158: R1=invP(id=0) R10=fp0 ; int get_constant(long val) 158: (bf) r0 = r1 159: R0_w=invP(id=1) R1=invP(id=1) R10=fp0 ; return val - 122; 159: (04) w0 += -122 160: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=invP(id=1) R10=fp0 ; return val - 122; 160: (95) exit Func#5 is safe for any args that match its prototype Validating get_skb_ifindex() func#6... 161: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0 ; int get_skb_ifindex(int val, struct __sk_buff *skb, int var) 161: (bc) w0 = w3 162: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0 After: 156: (61) r0 = *(u32 *)(r1 +0) 157: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1=ctx(id=0,off=0,imm=0) ; return skb->len; 157: (95) exit Func#4 is safe for any args that match its prototype Validating get_constant() func#5... 158: R1=invP(id=0) R10=fp0 ; int get_constant(long val) 158: (bf) r0 = r1 159: R0_w=invP(id=1) R1=invP(id=1) ; return val - 122; 159: (04) w0 += -122 160: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return val - 122; 160: (95) exit Func#5 is safe for any args that match its prototype Validating get_skb_ifindex() func#6... 161: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R3=invP(id=0) R10=fp0 ; int get_skb_ifindex(int val, struct __sk_buff *skb, int var) 161: (bc) w0 = w3 162: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R3=invP(id=0) Signed-off-by: Christy Lee Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211216213358.3374427-2-christylee@fb.com --- include/linux/bpf_verifier.h | 7 +++ kernel/bpf/verifier.c | 83 +++++++++++++++++++++----- tools/testing/selftests/bpf/prog_tests/align.c | 30 +++++----- 3 files changed, 91 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 182b16a91084..c66f238c538d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -474,6 +474,13 @@ struct bpf_verifier_env { /* longest register parentage chain walked for liveness marking */ u32 longest_mark_read_walk; bpfptr_t fd_array; + + /* bit mask to keep track of whether a register has been accessed + * since the last time the function state was printed + */ + u32 scratched_regs; + /* Same as scratched_regs but for stack slots */ + u64 scratched_stack_slots; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f0604796132f..ded6e816dcd9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -609,6 +609,44 @@ static const char *kernel_type_name(const struct btf* btf, u32 id) return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } +static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) +{ + env->scratched_regs |= 1U << regno; +} + +static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) +{ + env->scratched_stack_slots |= 1UL << spi; +} + +static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) +{ + return (env->scratched_regs >> regno) & 1; +} + +static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno) +{ + return (env->scratched_stack_slots >> regno) & 1; +} + +static bool verifier_state_scratched(const struct bpf_verifier_env *env) +{ + return env->scratched_regs || env->scratched_stack_slots; +} + +static void mark_verifier_state_clean(struct bpf_verifier_env *env) +{ + env->scratched_regs = 0U; + env->scratched_stack_slots = 0UL; +} + +/* Used for printing the entire verifier state. */ +static void mark_verifier_state_scratched(struct bpf_verifier_env *env) +{ + env->scratched_regs = ~0U; + env->scratched_stack_slots = ~0UL; +} + /* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack. */ @@ -624,7 +662,8 @@ static void scrub_spilled_slot(u8 *stype) } static void print_verifier_state(struct bpf_verifier_env *env, - const struct bpf_func_state *state) + const struct bpf_func_state *state, + bool print_all) { const struct bpf_reg_state *reg; enum bpf_reg_type t; @@ -637,6 +676,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, t = reg->type; if (t == NOT_INIT) continue; + if (!print_all && !reg_scratched(env, i)) + continue; verbose(env, " R%d", i); print_liveness(env, reg->live); verbose(env, "=%s", reg_type_str[t]); @@ -726,6 +767,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, types_buf[BPF_REG_SIZE] = 0; if (!valid) continue; + if (!print_all && !stack_slot_scratched(env, i)) + continue; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, state->stack[i].spilled_ptr.live); if (is_spilled_reg(&state->stack[i])) { @@ -751,6 +794,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (state->in_async_callback_fn) verbose(env, " async_cb"); verbose(env, "\n"); + mark_verifier_state_clean(env); } /* copy array src of length n * size bytes to dst. dst is reallocated if it's too @@ -1541,6 +1585,7 @@ static void init_func_state(struct bpf_verifier_env *env, state->frameno = frameno; state->subprogno = subprogno; init_reg_state(env, state); + mark_verifier_state_scratched(env); } /* Similar to push_stack(), but for async callbacks */ @@ -2228,6 +2273,8 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return -EINVAL; } + mark_reg_scratched(env, regno); + reg = ®s[regno]; rw64 = is_reg64(env, insn, regno, reg, t); if (t == SRC_OP) { @@ -2678,7 +2725,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, reg->precise = true; } if (env->log.level & BPF_LOG_LEVEL) { - print_verifier_state(env, func); + print_verifier_state(env, func, false); verbose(env, "parent %s regs=%x stack=%llx marks\n", new_marks ? "didn't have" : "already had", reg_mask, stack_mask); @@ -2837,6 +2884,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, env->insn_aux_data[insn_idx].sanitize_stack_spill = true; } + mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { @@ -2958,6 +3006,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, slot = -i - 1; spi = slot / BPF_REG_SIZE; stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + mark_stack_slot_scratched(env, spi); if (!env->allow_ptr_leaks && *stype != NOT_INIT @@ -3376,7 +3425,7 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, * to make sure our theoretical access will be safe. */ if (env->log.level & BPF_LOG_LEVEL) - print_verifier_state(env, state); + print_verifier_state(env, state, false); /* The minimum value is only important with signed * comparisons where we can't assume the floor of a @@ -6011,9 +6060,9 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); - print_verifier_state(env, caller); + print_verifier_state(env, caller, true); verbose(env, "callee:\n"); - print_verifier_state(env, callee); + print_verifier_state(env, callee, true); } return 0; } @@ -6228,9 +6277,9 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) *insn_idx = callee->callsite + 1; if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); - print_verifier_state(env, callee); + print_verifier_state(env, callee, true); verbose(env, "to caller at %d:\n", *insn_idx); - print_verifier_state(env, caller); + print_verifier_state(env, caller, true); } /* clear everything in the callee */ free_func_state(callee); @@ -8249,12 +8298,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, state); + print_verifier_state(env, state, true); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, state); + print_verifier_state(env, state, true); verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } @@ -9389,7 +9438,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EACCES; } if (env->log.level & BPF_LOG_LEVEL) - print_verifier_state(env, this_branch->frame[this_branch->curframe]); + print_verifier_state(env, this_branch->frame[this_branch->curframe], false); return 0; } @@ -11259,14 +11308,17 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level & BPF_LOG_LEVEL2 || (env->log.level & BPF_LOG_LEVEL && do_print_state)) { - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "%d:", env->insn_idx); - else + if (env->log.level & BPF_LOG_LEVEL2) { + if (verifier_state_scratched(env)) + verbose(env, "%d:", env->insn_idx); + } else { verbose(env, "\nfrom %d to %d%s:", env->prev_insn_idx, env->insn_idx, env->cur_state->speculative ? " (speculative execution)" : ""); - print_verifier_state(env, state->frame[state->curframe]); + } + print_verifier_state(env, state->frame[state->curframe], + false); do_print_state = false; } @@ -11488,6 +11540,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: + mark_verifier_state_scratched(env); update_branch_counts(env, env->cur_state); err = pop_stack(env, &prev_insn_idx, &env->insn_idx, pop_log); @@ -14148,6 +14201,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) } } + mark_verifier_state_clean(env); + if (IS_ERR(btf_vmlinux)) { /* Either gcc or pahole or kernel are broken. */ verbose(env, "in-kernel BTF is malformed\n"); diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 837f67c6bfda..aeb2080a67f7 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -39,8 +39,8 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {1, "R1=ctx(id=0,off=0,imm=0)"}, - {1, "R10=fp0"}, + {0, "R1=ctx(id=0,off=0,imm=0)"}, + {0, "R10=fp0"}, {1, "R3_w=inv2"}, {2, "R3_w=inv4"}, {3, "R3_w=inv8"}, @@ -67,8 +67,8 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {1, "R1=ctx(id=0,off=0,imm=0)"}, - {1, "R10=fp0"}, + {0, "R1=ctx(id=0,off=0,imm=0)"}, + {0, "R10=fp0"}, {1, "R3_w=inv1"}, {2, "R3_w=inv2"}, {3, "R3_w=inv4"}, @@ -96,8 +96,8 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {1, "R1=ctx(id=0,off=0,imm=0)"}, - {1, "R10=fp0"}, + {0, "R1=ctx(id=0,off=0,imm=0)"}, + {0, "R10=fp0"}, {1, "R3_w=inv4"}, {2, "R3_w=inv8"}, {3, "R3_w=inv10"}, @@ -118,8 +118,8 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {1, "R1=ctx(id=0,off=0,imm=0)"}, - {1, "R10=fp0"}, + {0, "R1=ctx(id=0,off=0,imm=0)"}, + {0, "R10=fp0"}, {1, "R3_w=inv7"}, {2, "R3_w=inv7"}, {3, "R3_w=inv14"}, @@ -161,13 +161,13 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {7, "R0_w=pkt(id=0,off=8,r=8,imm=0)"}, + {6, "R0_w=pkt(id=0,off=8,r=8,imm=0)"}, {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, {8, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, {9, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, {10, "R3_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, {11, "R3_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, - {18, "R3=pkt_end(id=0,off=0,imm=0)"}, + {13, "R3_w=pkt_end(id=0,off=0,imm=0)"}, {18, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, {19, "R4_w=inv(id=0,umax_value=8160,var_off=(0x0; 0x1fe0))"}, {20, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, @@ -234,10 +234,10 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"}, + {3, "R5_w=pkt(id=0,off=0,r=0,imm=0)"}, {5, "R5_w=pkt(id=0,off=14,r=0,imm=0)"}, {6, "R4_w=pkt(id=0,off=14,r=0,imm=0)"}, - {10, "R2=pkt(id=0,off=0,r=18,imm=0)"}, + {9, "R2=pkt(id=0,off=0,r=18,imm=0)"}, {10, "R5=pkt(id=0,off=14,r=18,imm=0)"}, {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, {14, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"}, @@ -296,7 +296,7 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, + {6, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* Offset is added to packet pointer R5, resulting in * known fixed offset, and variable offset from R6. @@ -386,7 +386,7 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, + {6, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* Adding 14 makes R6 be (4n+2) */ {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, @@ -458,7 +458,7 @@ static struct bpf_align_test tests[] = { /* Checked s>=0 */ {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, + {12, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. * We checked the bounds, but it might have been able -- cgit v1.2.3 From 2e5766483c8c5cf886b4dc647a1741738dde7d79 Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Thu, 16 Dec 2021 19:42:45 -0800 Subject: bpf: Right align verifier states in verifier logs. Make the verifier logs more readable, print the verifier states on the corresponding instruction line. If the previous line was not a bpf instruction, then print the verifier states on its own line. Before: Validating test_pkt_access_subprog3() func#3... 86: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R10=fp0 ; int test_pkt_access_subprog3(int val, struct __sk_buff *skb) 86: (bf) r6 = r2 87: R2=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) 87: (bc) w7 = w1 88: R1=invP(id=0) R7_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123)); 88: (bf) r1 = r6 89: R1_w=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) 89: (85) call pc+9 Func#4 is global and valid. Skipping. 90: R0_w=invP(id=0) 90: (bc) w8 = w0 91: R0_w=invP(id=0) R8_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123)); 91: (b7) r1 = 123 92: R1_w=invP123 92: (85) call pc+65 Func#5 is global and valid. Skipping. 93: R0=invP(id=0) After: 86: R1=invP(id=0) R2=ctx(id=0,off=0,imm=0) R10=fp0 ; int test_pkt_access_subprog3(int val, struct __sk_buff *skb) 86: (bf) r6 = r2 ; R2=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) 87: (bc) w7 = w1 ; R1=invP(id=0) R7_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123)); 88: (bf) r1 = r6 ; R1_w=ctx(id=0,off=0,imm=0) R6_w=ctx(id=0,off=0,imm=0) 89: (85) call pc+9 Func#4 is global and valid. Skipping. 90: R0_w=invP(id=0) 90: (bc) w8 = w0 ; R0_w=invP(id=0) R8_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) ; return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123)); 91: (b7) r1 = 123 ; R1_w=invP123 92: (85) call pc+65 Func#5 is global and valid. Skipping. 93: R0=invP(id=0) Signed-off-by: Christy Lee Acked-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 + kernel/bpf/verifier.c | 57 ++++++--- tools/testing/selftests/bpf/prog_tests/align.c | 169 ++++++++++++++----------- 3 files changed, 131 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c66f238c538d..ee931398f311 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -388,6 +388,8 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) #define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) #define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS) #define BPF_LOG_KERNEL (BPF_LOG_MASK + 1) /* kernel internal flag */ +#define BPF_LOG_MIN_ALIGNMENT 8U +#define BPF_LOG_ALIGNMENT 40U static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { @@ -481,6 +483,7 @@ struct bpf_verifier_env { u32 scratched_regs; /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; + u32 prev_log_len, prev_insn_print_len; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ded6e816dcd9..72c4a6bbb5bc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -797,6 +797,25 @@ static void print_verifier_state(struct bpf_verifier_env *env, mark_verifier_state_clean(env); } +static inline u32 vlog_alignment(u32 pos) +{ + return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT), + BPF_LOG_MIN_ALIGNMENT) - pos - 1; +} + +static void print_insn_state(struct bpf_verifier_env *env, + const struct bpf_func_state *state) +{ + if (env->prev_log_len && env->prev_log_len == env->log.len_used) { + /* remove new line character */ + bpf_vlog_reset(&env->log, env->prev_log_len - 1); + verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' '); + } else { + verbose(env, "%d:", env->insn_idx); + } + print_verifier_state(env, state, false); +} + /* copy array src of length n * size bytes to dst. dst is reallocated if it's too * small to hold src. This is different from krealloc since we don't want to preserve * the contents of dst. @@ -2725,10 +2744,10 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, reg->precise = true; } if (env->log.level & BPF_LOG_LEVEL) { - print_verifier_state(env, func, false); - verbose(env, "parent %s regs=%x stack=%llx marks\n", + verbose(env, "parent %s regs=%x stack=%llx marks:", new_marks ? "didn't have" : "already had", reg_mask, stack_mask); + print_verifier_state(env, func, true); } if (!reg_mask && !stack_mask) @@ -3423,11 +3442,8 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, /* We may have adjusted the register pointing to memory region, so we * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. - */ - if (env->log.level & BPF_LOG_LEVEL) - print_verifier_state(env, state, false); - - /* The minimum value is only important with signed + * + * The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our * index'es we need to make sure that whatever we use @@ -9438,7 +9454,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EACCES; } if (env->log.level & BPF_LOG_LEVEL) - print_verifier_state(env, this_branch->frame[this_branch->curframe], false); + print_insn_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -11306,19 +11322,12 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (env->log.level & BPF_LOG_LEVEL2 || - (env->log.level & BPF_LOG_LEVEL && do_print_state)) { - if (env->log.level & BPF_LOG_LEVEL2) { - if (verifier_state_scratched(env)) - verbose(env, "%d:", env->insn_idx); - } else { - verbose(env, "\nfrom %d to %d%s:", - env->prev_insn_idx, env->insn_idx, - env->cur_state->speculative ? - " (speculative execution)" : ""); - } - print_verifier_state(env, state->frame[state->curframe], - false); + if (env->log.level & BPF_LOG_LEVEL2 && do_print_state) { + verbose(env, "\nfrom %d to %d%s:", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); + print_verifier_state(env, state->frame[state->curframe], true); do_print_state = false; } @@ -11329,9 +11338,15 @@ static int do_check(struct bpf_verifier_env *env) .private_data = env, }; + if (verifier_state_scratched(env)) + print_insn_state(env, state->frame[state->curframe]); + verbose_linfo(env, env->insn_idx, "; "); + env->prev_log_len = env->log.len_used; verbose(env, "%d: ", env->insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + env->prev_insn_print_len = env->log.len_used - env->prev_log_len; + env->prev_log_len = env->log.len_used; } if (bpf_prog_is_dev_bound(env->prog->aux)) { diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index aeb2080a67f7..0ee29e11eaee 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -41,11 +41,11 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1=ctx(id=0,off=0,imm=0)"}, {0, "R10=fp0"}, - {1, "R3_w=inv2"}, - {2, "R3_w=inv4"}, - {3, "R3_w=inv8"}, - {4, "R3_w=inv16"}, - {5, "R3_w=inv32"}, + {0, "R3_w=inv2"}, + {1, "R3_w=inv4"}, + {2, "R3_w=inv8"}, + {3, "R3_w=inv16"}, + {4, "R3_w=inv32"}, }, }, { @@ -69,17 +69,17 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1=ctx(id=0,off=0,imm=0)"}, {0, "R10=fp0"}, - {1, "R3_w=inv1"}, - {2, "R3_w=inv2"}, - {3, "R3_w=inv4"}, - {4, "R3_w=inv8"}, - {5, "R3_w=inv16"}, - {6, "R3_w=inv1"}, - {7, "R4_w=inv32"}, - {8, "R4_w=inv16"}, - {9, "R4_w=inv8"}, - {10, "R4_w=inv4"}, - {11, "R4_w=inv2"}, + {0, "R3_w=inv1"}, + {1, "R3_w=inv2"}, + {2, "R3_w=inv4"}, + {3, "R3_w=inv8"}, + {4, "R3_w=inv16"}, + {5, "R3_w=inv1"}, + {6, "R4_w=inv32"}, + {7, "R4_w=inv16"}, + {8, "R4_w=inv8"}, + {9, "R4_w=inv4"}, + {10, "R4_w=inv2"}, }, }, { @@ -98,12 +98,12 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1=ctx(id=0,off=0,imm=0)"}, {0, "R10=fp0"}, - {1, "R3_w=inv4"}, - {2, "R3_w=inv8"}, - {3, "R3_w=inv10"}, - {4, "R4_w=inv8"}, - {5, "R4_w=inv12"}, - {6, "R4_w=inv14"}, + {0, "R3_w=inv4"}, + {1, "R3_w=inv8"}, + {2, "R3_w=inv10"}, + {3, "R4_w=inv8"}, + {4, "R4_w=inv12"}, + {5, "R4_w=inv14"}, }, }, { @@ -120,10 +120,10 @@ static struct bpf_align_test tests[] = { .matches = { {0, "R1=ctx(id=0,off=0,imm=0)"}, {0, "R10=fp0"}, + {0, "R3_w=inv7"}, {1, "R3_w=inv7"}, - {2, "R3_w=inv7"}, - {3, "R3_w=inv14"}, - {4, "R3_w=inv56"}, + {2, "R3_w=inv14"}, + {3, "R3_w=inv56"}, }, }, @@ -162,18 +162,18 @@ static struct bpf_align_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { {6, "R0_w=pkt(id=0,off=8,r=8,imm=0)"}, - {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {8, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, - {9, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - {10, "R3_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, - {11, "R3_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, - {13, "R3_w=pkt_end(id=0,off=0,imm=0)"}, - {18, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {19, "R4_w=inv(id=0,umax_value=8160,var_off=(0x0; 0x1fe0))"}, - {20, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, - {21, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, - {22, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - {23, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, + {6, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {7, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, + {8, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {9, "R3_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, + {10, "R3_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, + {12, "R3_w=pkt_end(id=0,off=0,imm=0)"}, + {17, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {18, "R4_w=inv(id=0,umax_value=8160,var_off=(0x0; 0x1fe0))"}, + {19, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, + {20, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, + {21, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {22, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, }, }, { @@ -194,16 +194,16 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {8, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, - {9, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, - {10, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, - {11, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, - {12, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, - {13, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, - {14, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, - {15, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, - {16, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, + {6, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {7, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, + {8, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {9, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, + {10, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"}, + {11, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, + {12, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {13, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"}, + {14, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"}, + {15, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"}, }, }, { @@ -234,14 +234,14 @@ static struct bpf_align_test tests[] = { }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .matches = { - {3, "R5_w=pkt(id=0,off=0,r=0,imm=0)"}, - {5, "R5_w=pkt(id=0,off=14,r=0,imm=0)"}, - {6, "R4_w=pkt(id=0,off=14,r=0,imm=0)"}, + {2, "R5_w=pkt(id=0,off=0,r=0,imm=0)"}, + {4, "R5_w=pkt(id=0,off=14,r=0,imm=0)"}, + {5, "R4_w=pkt(id=0,off=14,r=0,imm=0)"}, {9, "R2=pkt(id=0,off=0,r=18,imm=0)"}, {10, "R5=pkt(id=0,off=14,r=18,imm=0)"}, {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"}, + {13, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"}, {14, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"}, - {15, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"}, }, }, { @@ -297,7 +297,7 @@ static struct bpf_align_test tests[] = { * alignment of 4. */ {6, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, - {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {7, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* Offset is added to packet pointer R5, resulting in * known fixed offset, and variable offset from R6. */ @@ -313,11 +313,11 @@ static struct bpf_align_test tests[] = { /* Variable offset is added to R5 packet pointer, * resulting in auxiliary alignment of 4. */ - {18, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {17, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* Constant offset is added to R5, resulting in * reg->off of 14. */ - {19, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {18, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off * (14) which is 16. Then the variable offset is 4-byte @@ -329,18 +329,18 @@ static struct bpf_align_test tests[] = { /* Constant offset is added to R5 packet pointer, * resulting in reg->off value of 14. */ - {26, "R5_w=pkt(id=0,off=14,r=8"}, + {25, "R5_w=pkt(id=0,off=14,r=8"}, /* Variable offset is added to R5, resulting in a * variable offset of (4n). */ - {27, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {26, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* Constant is added to R5 again, setting reg->off to 18. */ - {28, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {27, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* And once more we add a variable; resulting var_off * is still (4n), fixed offset is not changed. * Also, we create a new reg->id. */ - {29, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"}, + {28, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (18) * which is 20. Then the variable offset is (4n), so @@ -387,12 +387,12 @@ static struct bpf_align_test tests[] = { * alignment of 4. */ {6, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, - {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {7, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* Adding 14 makes R6 be (4n+2) */ - {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {8, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, /* Packet pointer has (4n+2) offset */ {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, - {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, + {12, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so @@ -403,12 +403,12 @@ static struct bpf_align_test tests[] = { /* Newly read value in R6 was shifted left by 2, so has * known alignment of 4. */ - {18, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {17, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* Added (4n) to packet pointer's (4n+2) var_off, giving * another (4n+2). */ {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, - {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, + {20, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"}, /* At the time the word size load is performed from R5, * its total fixed offset is NET_IP_ALIGN + reg->off (0) * which is 2. Then the variable offset is (4n+2), so @@ -448,18 +448,18 @@ static struct bpf_align_test tests[] = { .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, .matches = { - {4, "R5_w=pkt_end(id=0,off=0,imm=0)"}, + {3, "R5_w=pkt_end(id=0,off=0,imm=0)"}, /* (ptr - ptr) << 2 == unknown, (4n) */ - {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"}, + {5, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"}, /* (4n) + 14 == (4n+2). We blow our bounds, because * the add could overflow. */ - {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, + {6, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>=0 */ {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {12, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, - {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, + {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, + {12, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. * We checked the bounds, but it might have been able * to overflow if the packet pointer started in the @@ -502,14 +502,14 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, - {9, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {6, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, + {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* Adding 14 makes R6 be (4n+2) */ - {10, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, + {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"}, /* New unknown value in R7 is (4n) */ - {11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, + {10, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"}, /* Subtracting it from R6 blows our unsigned bounds */ - {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, + {11, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>= 0 */ {14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"}, /* At the time the word size load is performed from R5, @@ -556,14 +556,14 @@ static struct bpf_align_test tests[] = { /* Calculated offset in R6 has unknown value, but known * alignment of 4. */ - {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, - {10, "R6_w=inv(id=0,umax_value=60,var_off=(0x0; 0x3c))"}, + {6, "R2_w=pkt(id=0,off=0,r=8,imm=0)"}, + {9, "R6_w=inv(id=0,umax_value=60,var_off=(0x0; 0x3c))"}, /* Adding 14 makes R6 be (4n+2) */ - {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"}, + {10, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"}, /* Subtracting from packet pointer overflows ubounds */ {13, "R5_w=pkt(id=2,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"}, /* New unknown value in R7 is (4n), >= 76 */ - {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"}, + {14, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"}, /* Adding it to packet pointer gives nice bounds again */ {16, "R5_w=pkt(id=3,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"}, /* At the time the word size load is performed from R5, @@ -625,12 +625,15 @@ static int do_test_single(struct bpf_align_test *test) line_ptr = strtok(bpf_vlog_copy, "\n"); for (i = 0; i < MAX_MATCHES; i++) { struct bpf_reg_match m = test->matches[i]; + int tmp; if (!m.match) break; while (line_ptr) { cur_line = -1; sscanf(line_ptr, "%u: ", &cur_line); + if (cur_line == -1) + sscanf(line_ptr, "from %u to %u: ", &tmp, &cur_line); if (cur_line == m.line) break; line_ptr = strtok(NULL, "\n"); @@ -642,7 +645,19 @@ static int do_test_single(struct bpf_align_test *test) printf("%s", bpf_vlog); break; } + /* Check the next line as well in case the previous line + * did not have a corresponding bpf insn. Example: + * func#0 @0 + * 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 + * 0: (b7) r3 = 2 ; R3_w=inv2 + */ if (!strstr(line_ptr, m.match)) { + cur_line = -1; + line_ptr = strtok(NULL, "\n"); + sscanf(line_ptr, "%u: ", &cur_line); + } + if (cur_line != m.line || !line_ptr || + !strstr(line_ptr, m.match)) { printf("Failed to find match %u: %s\n", m.line, m.match); ret = 1; -- cgit v1.2.3 From 496f3324048b6ce024af19ac84a03f6bdef93b7d Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Thu, 16 Dec 2021 13:33:58 -0800 Subject: Only output backtracking information in log level 2 Backtracking information is very verbose, don't print it in log level 1 to improve readability. Signed-off-by: Christy Lee Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211216213358.3374427-4-christylee@fb.com --- kernel/bpf/verifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 72c4a6bbb5bc..07d7d6124a18 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2398,7 +2398,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (insn->code == 0) return 0; - if (env->log.level & BPF_LOG_LEVEL) { + if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask); verbose(env, "%d: ", idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); @@ -2656,7 +2656,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, DECLARE_BITMAP(mask, 64); u32 history = st->jmp_history_cnt; - if (env->log.level & BPF_LOG_LEVEL) + if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); for (i = last_idx;;) { if (skip_first) { @@ -2743,7 +2743,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, new_marks = true; reg->precise = true; } - if (env->log.level & BPF_LOG_LEVEL) { + if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "parent %s regs=%x stack=%llx marks:", new_marks ? "didn't have" : "already had", reg_mask, stack_mask); -- cgit v1.2.3 From 4e8c11b6b3f0b6a283e898344f154641eda94266 Mon Sep 17 00:00:00 2001 From: Yu Liao Date: Mon, 13 Dec 2021 21:57:27 +0800 Subject: timekeeping: Really make sure wall_to_monotonic isn't positive Even after commit e1d7ba873555 ("time: Always make sure wall_to_monotonic isn't positive") it is still possible to make wall_to_monotonic positive by running the following code: int main(void) { struct timespec time; clock_gettime(CLOCK_MONOTONIC, &time); time.tv_nsec = 0; clock_settime(CLOCK_REALTIME, &time); return 0; } The reason is that the second parameter of timespec64_compare(), ts_delta, may be unnormalized because the delta is calculated with an open coded substraction which causes the comparison of tv_sec to yield the wrong result: wall_to_monotonic = { .tv_sec = -10, .tv_nsec = 900000000 } ts_delta = { .tv_sec = -9, .tv_nsec = -900000000 } That makes timespec64_compare() claim that wall_to_monotonic < ts_delta, but actually the result should be wall_to_monotonic > ts_delta. After normalization, the result of timespec64_compare() is correct because the tv_sec comparison is not longer misleading: wall_to_monotonic = { .tv_sec = -10, .tv_nsec = 900000000 } ts_delta = { .tv_sec = -10, .tv_nsec = 100000000 } Use timespec64_sub() to ensure that ts_delta is normalized, which fixes the issue. Fixes: e1d7ba873555 ("time: Always make sure wall_to_monotonic isn't positive") Signed-off-by: Yu Liao Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211213135727.1656662-1-liaoyu15@huawei.com --- kernel/time/timekeeping.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b348749a9fc6..dcdcb85121e4 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1306,8 +1306,7 @@ int do_settimeofday64(const struct timespec64 *ts) timekeeping_forward_now(tk); xt = tk_xtime(tk); - ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; - ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; + ts_delta = timespec64_sub(*ts, xt); if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) { ret = -EINVAL; -- cgit v1.2.3 From 8f556a326c93213927e683fc32bbf5be1b62540a Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 17 Dec 2021 15:42:07 +0800 Subject: locking/rtmutex: Fix incorrect condition in rtmutex_spin_on_owner() Optimistic spinning needs to be terminated when the spinning waiter is not longer the top waiter on the lock, but the condition is negated. It terminates if the waiter is the top waiter, which is defeating the whole purpose. Fixes: c3123c431447 ("locking/rtmutex: Dont dereference waiter lockless") Signed-off-by: Zqiang Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211217074207.77425-1-qiang1.zhang@intel.com --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 0c6a48dfcecb..1f25a4d7de27 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1380,7 +1380,7 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock, * - the VCPU on which owner runs is preempted */ if (!owner->on_cpu || need_resched() || - rt_mutex_waiter_is_top_waiter(lock, waiter) || + !rt_mutex_waiter_is_top_waiter(lock, waiter) || vcpu_is_preempted(task_cpu(owner))) { res = false; break; -- cgit v1.2.3 From 48946bd6a5d695c50b34546864b79c1f910a33c1 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:45 -0800 Subject: bpf: Replace ARG_XXX_OR_NULL with ARG_XXX | PTR_MAYBE_NULL We have introduced a new type to make bpf_arg composable, by reserving high bits of bpf_arg to represent flags of a type. One of the flags is PTR_MAYBE_NULL which indicates a pointer may be NULL. When applying this flag to an arg_type, it means the arg can take NULL pointer. This patch switches the qualified arg_types to use this flag. The arg_types changed in this patch include: 1. ARG_PTR_TO_MAP_VALUE_OR_NULL 2. ARG_PTR_TO_MEM_OR_NULL 3. ARG_PTR_TO_CTX_OR_NULL 4. ARG_PTR_TO_SOCKET_OR_NULL 5. ARG_PTR_TO_ALLOC_MEM_OR_NULL 6. ARG_PTR_TO_STACK_OR_NULL This patch does not eliminate the use of these arg_types, instead it makes them an alias to the 'ARG_XXX | PTR_MAYBE_NULL'. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-3-haoluo@google.com --- include/linux/bpf.h | 15 +++++++++------ kernel/bpf/verifier.c | 39 ++++++++++++++------------------------- 2 files changed, 23 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 41bb3687cc85..765bd7cc4272 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -331,13 +331,11 @@ enum bpf_arg_type { ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ - ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */ /* the following constraints used to prototype bpf_memcmp() and other * functions that access data on eBPF program stack */ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ - ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */ ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, * helper function must fill all bytes or clear * them in error case. @@ -347,26 +345,31 @@ enum bpf_arg_type { ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ARG_PTR_TO_CTX, /* pointer to context */ - ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ - ARG_PTR_TO_SOCKET_OR_NULL, /* pointer to bpf_sock (fullsock) or NULL */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ - ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ ARG_PTR_TO_FUNC, /* pointer to a bpf program function */ - ARG_PTR_TO_STACK_OR_NULL, /* pointer to stack or NULL */ + ARG_PTR_TO_STACK, /* pointer to stack */ ARG_PTR_TO_CONST_STR, /* pointer to a null terminated read-only string */ ARG_PTR_TO_TIMER, /* pointer to bpf_timer */ __BPF_ARG_TYPE_MAX, + /* Extended arg_types. */ + ARG_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE, + ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, + ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, + ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, + ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, + ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 07d7d6124a18..6095a75c1118 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -475,14 +475,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) return type == ARG_PTR_TO_SOCK_COMMON; } -static bool arg_type_may_be_null(enum bpf_arg_type type) +static bool type_may_be_null(u32 type) { - return type == ARG_PTR_TO_MAP_VALUE_OR_NULL || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_CTX_OR_NULL || - type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL || - type == ARG_PTR_TO_STACK_OR_NULL; + return type & PTR_MAYBE_NULL; } /* Determine whether the function releases some resources allocated by another @@ -5016,9 +5011,8 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { - return type == ARG_PTR_TO_MEM || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_UNINIT_MEM; + return base_type(type) == ARG_PTR_TO_MEM || + base_type(type) == ARG_PTR_TO_UNINIT_MEM; } static bool arg_type_is_mem_size(enum bpf_arg_type type) @@ -5155,31 +5149,26 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, - [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types, [ARG_CONST_SIZE] = &scalar_types, [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_MAP_PTR] = &const_map_ptr_types, [ARG_PTR_TO_CTX] = &context_types, - [ARG_PTR_TO_CTX_OR_NULL] = &context_types, [ARG_PTR_TO_SOCK_COMMON] = &sock_types, #ifdef CONFIG_NET [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, #endif [ARG_PTR_TO_SOCKET] = &fullsock_types, - [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, - [ARG_PTR_TO_MEM_OR_NULL] = &mem_types, [ARG_PTR_TO_UNINIT_MEM] = &mem_types, [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, - [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, [ARG_PTR_TO_FUNC] = &func_ptr_types, - [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, + [ARG_PTR_TO_STACK] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, }; @@ -5193,7 +5182,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, const struct bpf_reg_types *compatible; int i, j; - compatible = compatible_reg_types[arg_type]; + compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); return -EFAULT; @@ -5274,15 +5263,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; } - if (arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { + if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || + base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { err = resolve_map_arg_type(env, meta, &arg_type); if (err) return err; } - if (register_is_null(reg) && arg_type_may_be_null(arg_type)) + if (register_is_null(reg) && type_may_be_null(arg_type)) /* A NULL register has a SCALAR_VALUE type, so skip * type checking. */ @@ -5351,10 +5339,11 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (arg_type == ARG_PTR_TO_MAP_VALUE || - (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && - !register_is_null(reg)) || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || + base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + if (type_may_be_null(arg_type) && register_is_null(reg)) + return 0; + /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ -- cgit v1.2.3 From 3c4807322660d4290ac9062c034aed6b87243861 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:46 -0800 Subject: bpf: Replace RET_XXX_OR_NULL with RET_XXX | PTR_MAYBE_NULL We have introduced a new type to make bpf_ret composable, by reserving high bits to represent flags. One of the flag is PTR_MAYBE_NULL, which indicates a pointer may be NULL. When applying this flag to ret_types, it means the returned value could be a NULL pointer. This patch switches the qualified arg_types to use this flag. The ret_types changed in this patch include: 1. RET_PTR_TO_MAP_VALUE_OR_NULL 2. RET_PTR_TO_SOCKET_OR_NULL 3. RET_PTR_TO_TCP_SOCK_OR_NULL 4. RET_PTR_TO_SOCK_COMMON_OR_NULL 5. RET_PTR_TO_ALLOC_MEM_OR_NULL 6. RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL 7. RET_PTR_TO_BTF_ID_OR_NULL This patch doesn't eliminate the use of these names, instead it makes them aliases to 'RET_PTR_TO_XXX | PTR_MAYBE_NULL'. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-4-haoluo@google.com --- include/linux/bpf.h | 19 ++++++++++++------- kernel/bpf/helpers.c | 2 +- kernel/bpf/verifier.c | 52 +++++++++++++++++++++++++-------------------------- 3 files changed, 39 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 765bd7cc4272..975a1d5951bd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -382,17 +382,22 @@ enum bpf_return_type { RET_INTEGER, /* function returns integer */ RET_VOID, /* function doesn't return anything */ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ - RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ - RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ - RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ - RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ - RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ - RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ - RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ + RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ + RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ + RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ + RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ __BPF_RET_TYPE_MAX, + /* Extended ret_types. */ + RET_PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE, + RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, + RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, + RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 34d6f91dec1c..c49dc5cbe0a7 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -682,7 +682,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6095a75c1118..ccc068c5c5f2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6473,6 +6473,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn int *insn_idx_p) { const struct bpf_func_proto *fn = NULL; + enum bpf_return_type ret_type; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; int insn_idx = *insn_idx_p; @@ -6612,13 +6613,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; /* update return register (already marked as written above) */ - if (fn->ret_type == RET_INTEGER) { + ret_type = fn->ret_type; + if (ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); - } else if (fn->ret_type == RET_VOID) { + } else if (ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; - } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || - fn->ret_type == RET_PTR_TO_MAP_VALUE) { + } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) { /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -6632,28 +6633,27 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].map_uid = meta.map_uid; - if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { + if (type_may_be_null(ret_type)) { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + } else { regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; if (map_value_has_spin_lock(meta.map_ptr)) regs[BPF_REG_0].id = ++env->id_gen; - } else { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; } - } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; regs[BPF_REG_0].mem_size = meta.mem_size; - } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { + } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); @@ -6672,28 +6672,28 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } regs[BPF_REG_0].type = - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? - PTR_TO_MEM : PTR_TO_MEM_OR_NULL; + (ret_type & PTR_MAYBE_NULL) ? + PTR_TO_MEM_OR_NULL : PTR_TO_MEM; regs[BPF_REG_0].mem_size = tsize; } else { regs[BPF_REG_0].type = - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? - PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; + (ret_type & PTR_MAYBE_NULL) ? + PTR_TO_BTF_ID_OR_NULL : PTR_TO_BTF_ID; regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || - fn->ret_type == RET_PTR_TO_BTF_ID) { + } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ? - PTR_TO_BTF_ID : - PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = (ret_type & PTR_MAYBE_NULL) ? + PTR_TO_BTF_ID_OR_NULL : + PTR_TO_BTF_ID; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { - verbose(env, "invalid return type %d of func %s#%d\n", - fn->ret_type, func_id_name(func_id), func_id); + verbose(env, "invalid return type %u of func %s#%d\n", + base_type(ret_type), func_id_name(func_id), + func_id); return -EINVAL; } /* current BPF helper definitions are only coming from @@ -6702,8 +6702,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].btf = btf_vmlinux; regs[BPF_REG_0].btf_id = ret_btf_id; } else { - verbose(env, "unknown return type %d of func %s#%d\n", - fn->ret_type, func_id_name(func_id), func_id); + verbose(env, "unknown return type %u of func %s#%d\n", + base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; } -- cgit v1.2.3 From c25b2ae136039ffa820c26138ed4a5e5f3ab3841 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:47 -0800 Subject: bpf: Replace PTR_TO_XXX_OR_NULL with PTR_TO_XXX | PTR_MAYBE_NULL We have introduced a new type to make bpf_reg composable, by allocating bits in the type to represent flags. One of the flags is PTR_MAYBE_NULL which indicates a pointer may be NULL. This patch switches the qualified reg_types to use this flag. The reg_types changed in this patch include: 1. PTR_TO_MAP_VALUE_OR_NULL 2. PTR_TO_SOCKET_OR_NULL 3. PTR_TO_SOCK_COMMON_OR_NULL 4. PTR_TO_TCP_SOCK_OR_NULL 5. PTR_TO_BTF_ID_OR_NULL 6. PTR_TO_MEM_OR_NULL 7. PTR_TO_RDONLY_BUF_OR_NULL 8. PTR_TO_RDWR_BUF_OR_NULL Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20211217003152.48334-5-haoluo@google.com --- include/linux/bpf.h | 18 +-- include/linux/bpf_verifier.h | 4 + kernel/bpf/btf.c | 7 +- kernel/bpf/map_iter.c | 4 +- kernel/bpf/verifier.c | 298 ++++++++++++++++++------------------------- net/core/bpf_sk_storage.c | 2 +- net/core/sock_map.c | 2 +- 7 files changed, 147 insertions(+), 188 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 975a1d5951bd..c3de62267b84 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -465,18 +465,15 @@ enum bpf_reg_type { PTR_TO_CTX, /* reg points to bpf_context */ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ PTR_TO_MAP_VALUE, /* reg points to map element value */ - PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ + PTR_TO_MAP_KEY, /* reg points to a map element key */ PTR_TO_STACK, /* reg == frame_pointer + offset */ PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ - PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ - PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ - PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ /* PTR_TO_BTF_ID points to a kernel struct that does not need @@ -494,18 +491,21 @@ enum bpf_reg_type { * been checked for null. Used primarily to inform the verifier * an explicit null check is required for this struct. */ - PTR_TO_BTF_ID_OR_NULL, PTR_TO_MEM, /* reg points to valid memory region */ - PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ - PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ - PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ PTR_TO_FUNC, /* reg points to a bpf program function */ - PTR_TO_MAP_KEY, /* reg points to a map element key */ __BPF_REG_TYPE_MAX, + /* Extended reg_types. */ + PTR_TO_MAP_VALUE_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE, + PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCKET, + PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, + PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, + PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, + PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MEM, + /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 34e4ceaca3c7..143401d4c9d9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -18,6 +18,8 @@ * that converting umax_value to int cannot overflow. */ #define BPF_MAX_VAR_SIZ (1 << 29) +/* size of type_str_buf in bpf_verifier. */ +#define TYPE_STR_BUF_LEN 64 /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that @@ -484,6 +486,8 @@ struct bpf_verifier_env { /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; u32 prev_log_len, prev_insn_print_len; + /* buffer used in reg_type_str() to generate reg_type string */ + char type_str_buf[TYPE_STR_BUF_LEN]; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a17de71abd2e..4e2ca7bea6c4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4940,10 +4940,13 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + u32 type, flag; + type = base_type(ctx_arg_info->reg_type); + flag = type_flag(ctx_arg_info->reg_type); if (ctx_arg_info->offset == off && - (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL || - ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) { + (type == PTR_TO_RDWR_BUF || type == PTR_TO_RDONLY_BUF) && + (flag & PTR_MAYBE_NULL)) { info->reg_type = ctx_arg_info->reg_type; return true; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 6a9542af4212..631f0e44b7a9 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), - PTR_TO_RDONLY_BUF_OR_NULL }, + PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, { offsetof(struct bpf_iter__bpf_map_elem, value), - PTR_TO_RDWR_BUF_OR_NULL }, + PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, }, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ccc068c5c5f2..97e9d3f31443 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -442,18 +442,6 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON; } -static bool reg_type_may_be_null(enum bpf_reg_type type) -{ - return type == PTR_TO_MAP_VALUE_OR_NULL || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_BTF_ID_OR_NULL || - type == PTR_TO_MEM_OR_NULL || - type == PTR_TO_RDONLY_BUF_OR_NULL || - type == PTR_TO_RDWR_BUF_OR_NULL; -} - static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return reg->type == PTR_TO_MAP_VALUE && @@ -462,12 +450,9 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { - return type == PTR_TO_SOCKET || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_MEM || - type == PTR_TO_MEM_OR_NULL; + return base_type(type) == PTR_TO_SOCKET || + base_type(type) == PTR_TO_TCP_SOCK || + base_type(type) == PTR_TO_MEM; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -537,39 +522,52 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) insn->imm == BPF_CMPXCHG; } -/* string representation of 'enum bpf_reg_type' */ -static const char * const reg_type_str[] = { - [NOT_INIT] = "?", - [SCALAR_VALUE] = "inv", - [PTR_TO_CTX] = "ctx", - [CONST_PTR_TO_MAP] = "map_ptr", - [PTR_TO_MAP_VALUE] = "map_value", - [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", - [PTR_TO_STACK] = "fp", - [PTR_TO_PACKET] = "pkt", - [PTR_TO_PACKET_META] = "pkt_meta", - [PTR_TO_PACKET_END] = "pkt_end", - [PTR_TO_FLOW_KEYS] = "flow_keys", - [PTR_TO_SOCKET] = "sock", - [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", - [PTR_TO_SOCK_COMMON] = "sock_common", - [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", - [PTR_TO_TCP_SOCK] = "tcp_sock", - [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", - [PTR_TO_TP_BUFFER] = "tp_buffer", - [PTR_TO_XDP_SOCK] = "xdp_sock", - [PTR_TO_BTF_ID] = "ptr_", - [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", - [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", - [PTR_TO_MEM] = "mem", - [PTR_TO_MEM_OR_NULL] = "mem_or_null", - [PTR_TO_RDONLY_BUF] = "rdonly_buf", - [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", - [PTR_TO_RDWR_BUF] = "rdwr_buf", - [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", - [PTR_TO_FUNC] = "func", - [PTR_TO_MAP_KEY] = "map_key", -}; +/* string representation of 'enum bpf_reg_type' + * + * Note that reg_type_str() can not appear more than once in a single verbose() + * statement. + */ +static const char *reg_type_str(struct bpf_verifier_env *env, + enum bpf_reg_type type) +{ + char postfix[16] = {0}; + static const char * const str[] = { + [NOT_INIT] = "?", + [SCALAR_VALUE] = "inv", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_STACK] = "fp", + [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", + [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", + [PTR_TO_SOCKET] = "sock", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", + [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", + [PTR_TO_MEM] = "mem", + [PTR_TO_RDONLY_BUF] = "rdonly_buf", + [PTR_TO_RDWR_BUF] = "rdwr_buf", + [PTR_TO_FUNC] = "func", + [PTR_TO_MAP_KEY] = "map_key", + }; + + if (type & PTR_MAYBE_NULL) { + if (base_type(type) == PTR_TO_BTF_ID || + base_type(type) == PTR_TO_PERCPU_BTF_ID) + strncpy(postfix, "or_null_", 16); + else + strncpy(postfix, "_or_null", 16); + } + + snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s", + str[base_type(type)], postfix); + return env->type_str_buf; +} static char slot_type_char[] = { [STACK_INVALID] = '?', @@ -675,7 +673,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, continue; verbose(env, " R%d", i); print_liveness(env, reg->live); - verbose(env, "=%s", reg_type_str[t]); + verbose(env, "=%s", reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && @@ -683,9 +681,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID || - t == PTR_TO_BTF_ID_OR_NULL || - t == PTR_TO_PERCPU_BTF_ID) + if (base_type(t) == PTR_TO_BTF_ID || + base_type(t) == PTR_TO_PERCPU_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -694,10 +691,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) verbose(env, ",r=%d", reg->range); - else if (t == CONST_PTR_TO_MAP || - t == PTR_TO_MAP_KEY || - t == PTR_TO_MAP_VALUE || - t == PTR_TO_MAP_VALUE_OR_NULL) + else if (base_type(t) == CONST_PTR_TO_MAP || + base_type(t) == PTR_TO_MAP_KEY || + base_type(t) == PTR_TO_MAP_VALUE) verbose(env, ",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); @@ -769,7 +765,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (is_spilled_reg(&state->stack[i])) { reg = &state->stack[i].spilled_ptr; t = reg->type; - verbose(env, "=%s", reg_type_str[t]); + verbose(env, "=%s", reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) @@ -1202,8 +1198,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) { - switch (reg->type) { - case PTR_TO_MAP_VALUE_OR_NULL: { + if (base_type(reg->type) == PTR_TO_MAP_VALUE) { const struct bpf_map *map = reg->map_ptr; if (map->inner_map_meta) { @@ -1222,32 +1217,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) } else { reg->type = PTR_TO_MAP_VALUE; } - break; - } - case PTR_TO_SOCKET_OR_NULL: - reg->type = PTR_TO_SOCKET; - break; - case PTR_TO_SOCK_COMMON_OR_NULL: - reg->type = PTR_TO_SOCK_COMMON; - break; - case PTR_TO_TCP_SOCK_OR_NULL: - reg->type = PTR_TO_TCP_SOCK; - break; - case PTR_TO_BTF_ID_OR_NULL: - reg->type = PTR_TO_BTF_ID; - break; - case PTR_TO_MEM_OR_NULL: - reg->type = PTR_TO_MEM; - break; - case PTR_TO_RDONLY_BUF_OR_NULL: - reg->type = PTR_TO_RDONLY_BUF; - break; - case PTR_TO_RDWR_BUF_OR_NULL: - reg->type = PTR_TO_RDWR_BUF; - break; - default: - WARN_ONCE(1, "unknown nullable register type"); + return; } + + reg->type &= ~PTR_MAYBE_NULL; } static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) @@ -2103,7 +2076,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, break; if (parent->live & REG_LIVE_DONE) { verbose(env, "verifier BUG type %s var_off %lld off %d\n", - reg_type_str[parent->type], + reg_type_str(env, parent->type), parent->var_off.value, parent->off); return -EFAULT; } @@ -2768,9 +2741,8 @@ static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) static bool is_spillable_regtype(enum bpf_reg_type type) { - switch (type) { + switch (base_type(type)) { case PTR_TO_MAP_VALUE: - case PTR_TO_MAP_VALUE_OR_NULL: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: @@ -2779,21 +2751,14 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID_OR_NULL: case PTR_TO_RDONLY_BUF: - case PTR_TO_RDONLY_BUF_OR_NULL: case PTR_TO_RDWR_BUF: - case PTR_TO_RDWR_BUF_OR_NULL: case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: - case PTR_TO_MEM_OR_NULL: case PTR_TO_FUNC: case PTR_TO_MAP_KEY: return true; @@ -3633,7 +3598,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) { + if (base_type(*reg_type) == PTR_TO_BTF_ID) { *btf = info.btf; *btf_id = info.btf_id; } else { @@ -3701,7 +3666,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, } verbose(env, "R%d invalid %s access off=%d size=%d\n", - regno, reg_type_str[reg->type], off, size); + regno, reg_type_str(env, reg->type), off, size); return -EACCES; } @@ -4466,7 +4431,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { mark_reg_known_zero(env, regs, value_regno); - if (reg_type_may_be_null(reg_type)) + if (type_may_be_null(reg_type)) regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the @@ -4474,8 +4439,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (reg_type == PTR_TO_BTF_ID || - reg_type == PTR_TO_BTF_ID_OR_NULL) { + if (base_type(reg_type) == PTR_TO_BTF_ID) { regs[value_regno].btf = btf; regs[value_regno].btf_id = btf_id; } @@ -4528,7 +4492,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); + regno, reg_type_str(env, reg->type)); return -EACCES; } err = check_sock_access(env, insn_idx, regno, off, size, t); @@ -4547,7 +4511,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == PTR_TO_RDONLY_BUF) { if (t == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); + regno, reg_type_str(env, reg->type)); return -EACCES; } err = check_buffer_access(env, reg, regno, off, size, false, @@ -4563,7 +4527,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EACCES; } @@ -4630,7 +4594,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i is_sk_reg(env, insn->dst_reg)) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, - reg_type_str[reg_state(env, insn->dst_reg)->type]); + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; } @@ -4850,9 +4814,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, register_is_null(reg)) return 0; - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], - reg_type_str[PTR_TO_STACK]); + verbose(env, "R%d type=%s ", regno, + reg_type_str(env, reg->type)); + verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK)); return -EACCES; } } @@ -4863,7 +4827,7 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, if (register_is_null(reg)) return 0; - if (reg_type_may_be_null(reg->type)) { + if (type_may_be_null(reg->type)) { /* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller. @@ -5197,10 +5161,10 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, goto found; } - verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); + verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, type)); for (j = 0; j + 1 < i; j++) - verbose(env, "%s, ", reg_type_str[compatible->types[j]]); - verbose(env, "%s\n", reg_type_str[compatible->types[j]]); + verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); + verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); return -EACCES; found: @@ -6474,6 +6438,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn { const struct bpf_func_proto *fn = NULL; enum bpf_return_type ret_type; + enum bpf_type_flag ret_flag; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; int insn_idx = *insn_idx_p; @@ -6614,6 +6579,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* update return register (already marked as written above) */ ret_type = fn->ret_type; + ret_flag = type_flag(fn->ret_type); if (ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); @@ -6633,25 +6599,23 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].map_uid = meta.map_uid; - if (type_may_be_null(ret_type)) { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; - } else { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; - if (map_value_has_spin_lock(meta.map_ptr)) - regs[BPF_REG_0].id = ++env->id_gen; + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; + if (!type_may_be_null(ret_type) && + map_value_has_spin_lock(meta.map_ptr)) { + regs[BPF_REG_0].id = ++env->id_gen; } } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag; } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag; } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = meta.mem_size; } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t; @@ -6671,14 +6635,10 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn tname, PTR_ERR(ret)); return -EINVAL; } - regs[BPF_REG_0].type = - (ret_type & PTR_MAYBE_NULL) ? - PTR_TO_MEM_OR_NULL : PTR_TO_MEM; + regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { - regs[BPF_REG_0].type = - (ret_type & PTR_MAYBE_NULL) ? - PTR_TO_BTF_ID_OR_NULL : PTR_TO_BTF_ID; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } @@ -6686,9 +6646,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = (ret_type & PTR_MAYBE_NULL) ? - PTR_TO_BTF_ID_OR_NULL : - PTR_TO_BTF_ID; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { verbose(env, "invalid return type %u of func %s#%d\n", @@ -6707,7 +6665,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } - if (reg_type_may_be_null(regs[BPF_REG_0].type)) + if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; if (is_ptr_cast_function(func_id)) { @@ -6916,25 +6874,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "math between %s pointer and %lld is not allowed\n", - reg_type_str[type], val); + reg_type_str(env, type), val); return false; } if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { verbose(env, "%s pointer offset %d is not allowed\n", - reg_type_str[type], reg->off); + reg_type_str(env, type), reg->off); return false; } if (smin == S64_MIN) { verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", - reg_type_str[type]); + reg_type_str(env, type)); return false; } if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { verbose(env, "value %lld makes %s pointer be out of bounds\n", - smin, reg_type_str[type]); + smin, reg_type_str(env, type)); return false; } @@ -7311,11 +7269,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - switch (ptr_reg->type) { - case PTR_TO_MAP_VALUE_OR_NULL: + if (ptr_reg->type & PTR_MAYBE_NULL) { verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", - dst, reg_type_str[ptr_reg->type]); + dst, reg_type_str(env, ptr_reg->type)); return -EACCES; + } + + switch (base_type(ptr_reg->type)) { case CONST_PTR_TO_MAP: /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) @@ -7323,14 +7283,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, fallthrough; case PTR_TO_PACKET_END: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: verbose(env, "R%d pointer arithmetic on %s prohibited\n", - dst, reg_type_str[ptr_reg->type]); + dst, reg_type_str(env, ptr_reg->type)); return -EACCES; default: break; @@ -9049,7 +9006,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, struct bpf_reg_state *reg, u32 id, bool is_null) { - if (reg_type_may_be_null(reg->type) && reg->id == id && + if (type_may_be_null(reg->type) && reg->id == id && !WARN_ON_ONCE(!reg->id)) { /* Old offset (both fixed and variable parts) should * have been known-zero, because we don't allow pointer @@ -9427,7 +9384,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && - reg_type_may_be_null(dst_reg->type)) { + type_may_be_null(dst_reg->type)) { /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ @@ -9681,7 +9638,7 @@ static int check_return_code(struct bpf_verifier_env *env) /* enforce return zero from async callbacks like timer */ if (reg->type != SCALAR_VALUE) { verbose(env, "In async callback the register R0 is not a known value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } @@ -9695,7 +9652,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (is_subprog) { if (reg->type != SCALAR_VALUE) { verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } return 0; @@ -9759,7 +9716,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } @@ -10616,7 +10573,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return true; if (rcur->type == NOT_INIT) return false; - switch (rold->type) { + switch (base_type(rold->type)) { case SCALAR_VALUE: if (env->explore_alu_limits) return false; @@ -10638,6 +10595,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: + /* a PTR_TO_MAP_VALUE could be safe to use as a + * PTR_TO_MAP_VALUE_OR_NULL into the same map. + * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- + * checked, doing so could have affected others with the same + * id, and we can't check for that because we lost the id when + * we converted to a PTR_TO_MAP_VALUE. + */ + if (type_may_be_null(rold->type)) { + if (!type_may_be_null(rcur->type)) + return false; + if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) + return false; + /* Check our ids match any regs they're supposed to */ + return check_ids(rold->id, rcur->id, idmap); + } + /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. * 'id' is not compared, since it's only used for maps with @@ -10649,20 +10622,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); - case PTR_TO_MAP_VALUE_OR_NULL: - /* a PTR_TO_MAP_VALUE could be safe to use as a - * PTR_TO_MAP_VALUE_OR_NULL into the same map. - * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- - * checked, doing so could have affected others with the same - * id, and we can't check for that because we lost the id when - * we converted to a PTR_TO_MAP_VALUE. - */ - if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL) - return false; - if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) - return false; - /* Check our ids match any regs they're supposed to */ - return check_ids(rold->id, rcur->id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: if (rcur->type != rold->type) @@ -10691,11 +10650,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, case PTR_TO_PACKET_END: case PTR_TO_FLOW_KEYS: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: /* Only valid matches are exact, which memcmp() above * would have accepted @@ -11221,17 +11177,13 @@ next: /* Return true if it's OK to have the same insn return a different type. */ static bool reg_type_mismatch_ok(enum bpf_reg_type type) { - switch (type) { + switch (base_type(type)) { case PTR_TO_CTX: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID_OR_NULL: return false; default: return true; @@ -11457,7 +11409,7 @@ static int do_check(struct bpf_verifier_env *env) if (is_ctx_reg(env, insn->dst_reg)) { verbose(env, "BPF_ST stores into R%d %s is not allowed\n", insn->dst_reg, - reg_type_str[reg_state(env, insn->dst_reg)->type]); + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; } diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 68d2cbf8331a..4cb5ef8eddbc 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -929,7 +929,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), PTR_TO_BTF_ID_OR_NULL }, { offsetof(struct bpf_iter__bpf_sk_storage_map, value), - PTR_TO_RDWR_BUF_OR_NULL }, + PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, }, .seq_info = &iter_seq_info, }; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 4ca4b11f4e5f..96d4ea7e6918 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1564,7 +1564,7 @@ static struct bpf_iter_reg sock_map_iter_reg = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), - PTR_TO_RDONLY_BUF_OR_NULL }, + PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, -- cgit v1.2.3 From 20b2aff4bc15bda809f994761d5719827d66c0b4 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:48 -0800 Subject: bpf: Introduce MEM_RDONLY flag This patch introduce a flag MEM_RDONLY to tag a reg value pointing to read-only memory. It makes the following changes: 1. PTR_TO_RDWR_BUF -> PTR_TO_BUF 2. PTR_TO_RDONLY_BUF -> PTR_TO_BUF | MEM_RDONLY Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-6-haoluo@google.com --- include/linux/bpf.h | 8 +++-- kernel/bpf/btf.c | 3 +- kernel/bpf/map_iter.c | 4 +-- kernel/bpf/verifier.c | 84 ++++++++++++++++++++++++++++------------------- net/core/bpf_sk_storage.c | 2 +- net/core/sock_map.c | 2 +- 6 files changed, 60 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c3de62267b84..126048110bdb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -311,7 +311,10 @@ enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = PTR_MAYBE_NULL, + /* MEM is read-only. */ + MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_RDONLY, }; /* Max number of base types. */ @@ -492,8 +495,7 @@ enum bpf_reg_type { * an explicit null check is required for this struct. */ PTR_TO_MEM, /* reg points to valid memory region */ - PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ - PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ + PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ PTR_TO_FUNC, /* reg points to a bpf program function */ __BPF_REG_TYPE_MAX, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 4e2ca7bea6c4..d1447b075c73 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4944,8 +4944,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, type = base_type(ctx_arg_info->reg_type); flag = type_flag(ctx_arg_info->reg_type); - if (ctx_arg_info->offset == off && - (type == PTR_TO_RDWR_BUF || type == PTR_TO_RDONLY_BUF) && + if (ctx_arg_info->offset == off && type == PTR_TO_BUF && (flag & PTR_MAYBE_NULL)) { info->reg_type = ctx_arg_info->reg_type; return true; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 631f0e44b7a9..b0fa190b0979 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), - PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__bpf_map_elem, value), - PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL }, }, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 97e9d3f31443..e0a8a55ea3df 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -455,6 +455,11 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) base_type(type) == PTR_TO_MEM; } +static bool type_is_rdonly_mem(u32 type) +{ + return type & MEM_RDONLY; +} + static bool arg_type_may_be_refcounted(enum bpf_arg_type type) { return type == ARG_PTR_TO_SOCK_COMMON; @@ -530,7 +535,7 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) static const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) { - char postfix[16] = {0}; + char postfix[16] = {0}, prefix[16] = {0}; static const char * const str[] = { [NOT_INIT] = "?", [SCALAR_VALUE] = "inv", @@ -550,8 +555,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", [PTR_TO_MEM] = "mem", - [PTR_TO_RDONLY_BUF] = "rdonly_buf", - [PTR_TO_RDWR_BUF] = "rdwr_buf", + [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", }; @@ -564,8 +568,11 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(postfix, "_or_null", 16); } - snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s", - str[base_type(type)], postfix); + if (type & MEM_RDONLY) + strncpy(prefix, "rdonly_", 16); + + snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", + prefix, str[base_type(type)], postfix); return env->type_str_buf; } @@ -2755,8 +2762,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_TCP_SOCK: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_RDONLY_BUF: - case PTR_TO_RDWR_BUF: + case PTR_TO_BUF: case PTR_TO_PERCPU_BTF_ID: case PTR_TO_MEM: case PTR_TO_FUNC: @@ -4508,22 +4514,28 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); - } else if (reg->type == PTR_TO_RDONLY_BUF) { - if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str(env, reg->type)); - return -EACCES; + } else if (base_type(reg->type) == PTR_TO_BUF) { + bool rdonly_mem = type_is_rdonly_mem(reg->type); + const char *buf_info; + u32 *max_access; + + if (rdonly_mem) { + if (t == BPF_WRITE) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + buf_info = "rdonly"; + max_access = &env->prog->aux->max_rdonly_access; + } else { + buf_info = "rdwr"; + max_access = &env->prog->aux->max_rdwr_access; } + err = check_buffer_access(env, reg, regno, off, size, false, - "rdonly", - &env->prog->aux->max_rdonly_access); - if (!err && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_RDWR_BUF) { - err = check_buffer_access(env, reg, regno, off, size, false, - "rdwr", - &env->prog->aux->max_rdwr_access); - if (!err && t == BPF_READ && value_regno >= 0) + buf_info, max_access); + + if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, @@ -4771,8 +4783,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + const char *buf_info; + u32 *max_access; - switch (reg->type) { + switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, @@ -4791,18 +4805,20 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); - case PTR_TO_RDONLY_BUF: - if (meta && meta->raw_mode) - return -EACCES; - return check_buffer_access(env, reg, regno, reg->off, - access_size, zero_size_allowed, - "rdonly", - &env->prog->aux->max_rdonly_access); - case PTR_TO_RDWR_BUF: + case PTR_TO_BUF: + if (type_is_rdonly_mem(reg->type)) { + if (meta && meta->raw_mode) + return -EACCES; + + buf_info = "rdonly"; + max_access = &env->prog->aux->max_rdonly_access; + } else { + buf_info = "rdwr"; + max_access = &env->prog->aux->max_rdwr_access; + } return check_buffer_access(env, reg, regno, reg->off, access_size, zero_size_allowed, - "rdwr", - &env->prog->aux->max_rdwr_access); + buf_info, max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, @@ -5081,8 +5097,8 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, - PTR_TO_RDONLY_BUF, - PTR_TO_RDWR_BUF, + PTR_TO_BUF, + PTR_TO_BUF | MEM_RDONLY, }, }; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 4cb5ef8eddbc..ea61dfe19c86 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -929,7 +929,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), PTR_TO_BTF_ID_OR_NULL }, { offsetof(struct bpf_iter__bpf_sk_storage_map, value), - PTR_TO_RDWR_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL }, }, .seq_info = &iter_seq_info, }; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 96d4ea7e6918..9618ab6d7cc9 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1564,7 +1564,7 @@ static struct bpf_iter_reg sock_map_iter_reg = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__sockmap, key), - PTR_TO_RDONLY_BUF | PTR_MAYBE_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__sockmap, sk), PTR_TO_BTF_ID_OR_NULL }, }, -- cgit v1.2.3 From cf9f2f8d62eca810afbd1ee6cc0800202b000e57 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:49 -0800 Subject: bpf: Convert PTR_TO_MEM_OR_NULL to composable types. Remove PTR_TO_MEM_OR_NULL and replace it with PTR_TO_MEM combined with flag PTR_MAYBE_NULL. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-7-haoluo@google.com --- include/linux/bpf.h | 1 - kernel/bpf/btf.c | 2 +- kernel/bpf/verifier.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 126048110bdb..567d83bf28f9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -506,7 +506,6 @@ enum bpf_reg_type { PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON, PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK, PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | PTR_TO_BTF_ID, - PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | PTR_TO_MEM, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d1447b075c73..d948b5be3bb8 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5859,7 +5859,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, return -EINVAL; } - reg->type = PTR_TO_MEM_OR_NULL; + reg->type = PTR_TO_MEM | PTR_MAYBE_NULL; reg->id = ++env->id_gen; continue; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e0a8a55ea3df..9073337ac66f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13578,7 +13578,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, i); else if (regs[i].type == SCALAR_VALUE) mark_reg_unknown(env, regs, i); - else if (regs[i].type == PTR_TO_MEM_OR_NULL) { + else if (base_type(regs[i].type) == PTR_TO_MEM) { const u32 mem_size = regs[i].mem_size; mark_reg_known_zero(env, regs, i); -- cgit v1.2.3 From 34d3a78c681e8e7844b43d1a2f4671a04249c821 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:50 -0800 Subject: bpf: Make per_cpu_ptr return rdonly PTR_TO_MEM. Tag the return type of {per, this}_cpu_ptr with RDONLY_MEM. The returned value of this pair of helpers is kernel object, which can not be updated by bpf programs. Previously these two helpers return PTR_OT_MEM for kernel objects of scalar type, which allows one to directly modify the memory. Now with RDONLY_MEM tagging, the verifier will reject programs that write into RDONLY_MEM. Fixes: 63d9b80dcf2c ("bpf: Introducte bpf_this_cpu_ptr()") Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Fixes: 4976b718c355 ("bpf: Introduce pseudo_btf_id") Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-8-haoluo@google.com --- kernel/bpf/helpers.c | 4 ++-- kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c49dc5cbe0a7..6a65e2a62b01 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -682,7 +682,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; @@ -695,7 +695,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .func = bpf_this_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9073337ac66f..f49b3d334f4e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4399,15 +4399,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } - } else if (reg->type == PTR_TO_MEM) { + } else if (base_type(reg->type) == PTR_TO_MEM) { + bool rdonly_mem = type_is_rdonly_mem(reg->type); + + if (type_may_be_null(reg->type)) { + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } + + if (t == BPF_WRITE && rdonly_mem) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into mem\n", value_regno); return -EACCES; } + err = check_mem_region_access(env, regno, off, size, reg->mem_size, false); - if (!err && t == BPF_READ && value_regno >= 0) + if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; @@ -6654,6 +6669,13 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { + /* MEM_RDONLY may be carried from ret_flag, but it + * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise + * it will confuse the check of PTR_TO_BTF_ID in + * check_mem_access(). + */ + ret_flag &= ~MEM_RDONLY; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; @@ -9455,7 +9477,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->type = aux->btf_var.reg_type; - switch (dst_reg->type) { + switch (base_type(dst_reg->type)) { case PTR_TO_MEM: dst_reg->mem_size = aux->btf_var.mem_size; break; @@ -11678,7 +11700,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, err = -EINVAL; goto err_put; } - aux->btf_var.reg_type = PTR_TO_MEM; + aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; -- cgit v1.2.3 From 216e3cd2f28dbbf1fe86848e0e29e6693b9f0a20 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 16 Dec 2021 16:31:51 -0800 Subject: bpf: Add MEM_RDONLY for helper args that are pointers to rdonly mem. Some helper functions may modify its arguments, for example, bpf_d_path, bpf_get_stack etc. Previously, their argument types were marked as ARG_PTR_TO_MEM, which is compatible with read-only mem types, such as PTR_TO_RDONLY_BUF. Therefore it's legitimate, but technically incorrect, to modify a read-only memory by passing it into one of such helper functions. This patch tags the bpf_args compatible with immutable memory with MEM_RDONLY flag. The arguments that don't have this flag will be only compatible with mutable memory types, preventing the helper from modifying a read-only memory. The bpf_args that have MEM_RDONLY are compatible with both mutable memory and immutable memory. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217003152.48334-9-haoluo@google.com --- include/linux/bpf.h | 4 ++- kernel/bpf/btf.c | 2 +- kernel/bpf/cgroup.c | 2 +- kernel/bpf/helpers.c | 8 +++--- kernel/bpf/ringbuf.c | 2 +- kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 20 ++++++++++++--- kernel/trace/bpf_trace.c | 26 ++++++++++---------- net/core/filter.c | 64 ++++++++++++++++++++++++------------------------ 9 files changed, 73 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 567d83bf28f9..26753139d5b4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -311,7 +311,9 @@ enum bpf_type_flag { /* PTR may be NULL. */ PTR_MAYBE_NULL = BIT(0 + BPF_BASE_TYPE_BITS), - /* MEM is read-only. */ + /* MEM is read-only. When applied on bpf_arg, it indicates the arg is + * compatible with both mutable and immutable memory. + */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), __BPF_TYPE_LAST_FLAG = MEM_RDONLY, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index d948b5be3bb8..b3fddfb5bc84 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6353,7 +6353,7 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { .func = bpf_btf_find_by_name_kind, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 43eb3501721b..514b4681a90a 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1789,7 +1789,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6a65e2a62b01..01cfdf40c838 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -531,7 +531,7 @@ const struct bpf_func_proto bpf_strtol_proto = { .func = bpf_strtol, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, @@ -559,7 +559,7 @@ const struct bpf_func_proto bpf_strtoul_proto = { .func = bpf_strtoul, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, @@ -645,7 +645,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1026,7 +1026,7 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg1_type = ARG_PTR_TO_MEM_OR_NULL, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_CONST_STR, - .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 9e0c10c6892a..638d7fd7b375 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -444,7 +444,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = { .func = bpf_ringbuf_output, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index da07bdf71697..fa4505f9b611 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4773,7 +4773,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f49b3d334f4e..ca5cd0de804c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5113,7 +5113,6 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MAP_VALUE, PTR_TO_MEM, PTR_TO_BUF, - PTR_TO_BUF | MEM_RDONLY, }, }; @@ -5183,6 +5182,21 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, return -EFAULT; } + /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY + * + * Same for MAYBE_NULL: + * + * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL + * + * Therefore we fold these flags depending on the arg_type before comparison. + */ + if (arg_type & MEM_RDONLY) + type &= ~MEM_RDONLY; + if (arg_type & PTR_MAYBE_NULL) + type &= ~PTR_MAYBE_NULL; + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; if (expected == NOT_INIT) @@ -5192,14 +5206,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, goto found; } - verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, type)); + verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type)); for (j = 0; j + 1 < i; j++) verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); return -EACCES; found: - if (type == PTR_TO_BTF_ID) { + if (reg->type == PTR_TO_BTF_ID) { if (!arg_btf_id) { if (!compatible->btf_id) { verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cea2ca6df949..21aa30644219 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -345,7 +345,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -394,7 +394,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { .func = bpf_trace_printk, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, }; @@ -450,9 +450,9 @@ static const struct bpf_func_proto bpf_trace_vprintk_proto = { .func = bpf_trace_vprintk, .gpl_only = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, - .arg3_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -492,9 +492,9 @@ static const struct bpf_func_proto bpf_seq_printf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -509,7 +509,7 @@ static const struct bpf_func_proto bpf_seq_write_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -533,7 +533,7 @@ static const struct bpf_func_proto bpf_seq_printf_btf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &btf_seq_file_ids[0], - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -694,7 +694,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1004,7 +1004,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -1334,7 +1334,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1556,7 +1556,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1610,7 +1610,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { .gpl_only = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; diff --git a/net/core/filter.c b/net/core/filter.c index 3f656391af7e..606ab5a98a1a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1712,7 +1712,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE, .arg5_type = ARG_ANYTHING, }; @@ -2017,9 +2017,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = { .gpl_only = false, .pkt_access = true, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM_OR_NULL, + .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_MEM_OR_NULL, + .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; @@ -2540,7 +2540,7 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM_OR_NULL, + .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; @@ -4173,7 +4173,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4187,7 +4187,7 @@ const struct bpf_func_proto bpf_skb_output_proto = { .arg1_btf_id = &bpf_skb_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4370,7 +4370,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -4396,7 +4396,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -4566,7 +4566,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -4580,7 +4580,7 @@ const struct bpf_func_proto bpf_xdp_output_proto = { .arg1_btf_id = &bpf_xdp_output_btf_ids[0], .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -5066,7 +5066,7 @@ const struct bpf_func_proto bpf_sk_setsockopt_proto = { .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5100,7 +5100,7 @@ static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5134,7 +5134,7 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -5309,7 +5309,7 @@ static const struct bpf_func_proto bpf_bind_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -5897,7 +5897,7 @@ static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5907,7 +5907,7 @@ static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -5950,7 +5950,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -6038,7 +6038,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE }; @@ -6263,7 +6263,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6282,7 +6282,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6301,7 +6301,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6338,7 +6338,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6361,7 +6361,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6384,7 +6384,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = { .pkt_access = true, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6403,7 +6403,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6422,7 +6422,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6441,7 +6441,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .gpl_only = false, .ret_type = RET_PTR_TO_SOCKET_OR_NULL, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, @@ -6754,9 +6754,9 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = { .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -6823,9 +6823,9 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = { .pkt_access = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE, }; @@ -7054,7 +7054,7 @@ static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; -- cgit v1.2.3 From 3363bd0cfbb80dfcd25003cd3815b0ad8b68d0ff Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 17 Dec 2021 07:20:24 +0530 Subject: bpf: Extend kfunc with PTR_TO_CTX, PTR_TO_MEM argument support Allow passing PTR_TO_CTX, if the kfunc expects a matching struct type, and punt to PTR_TO_MEM block if reg->type does not fall in one of PTR_TO_BTF_ID or PTR_TO_SOCK* types. This will be used by future commits to get access to XDP and TC PTR_TO_CTX, and pass various data (flags, l4proto, netns_id, etc.) encoded in opts struct passed as pointer to kfunc. For PTR_TO_MEM support, arguments are currently limited to pointer to scalar, or pointer to struct composed of scalars. This is done so that unsafe scenarios (like passing PTR_TO_MEM where PTR_TO_BTF_ID of in-kernel valid structure is expected, which may have pointers) are avoided. Since the argument checking happens basd on argument register type, it is not easy to ascertain what the expected type is. In the future, support for PTR_TO_MEM for kfunc can be extended to serve other usecases. The struct type whose pointer is passed in may have maximum nesting depth of 4, all recursively composed of scalars or struct with scalars. Future commits will add negative tests that check whether these restrictions imposed for kfunc arguments are duly rejected by BPF verifier or not. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211217015031.1278167-4-memxor@gmail.com --- kernel/bpf/btf.c | 94 +++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index b3fddfb5bc84..33bb8ae4a804 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5576,12 +5576,53 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { #endif }; +/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ +static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int rec) +{ + const struct btf_type *member_type; + const struct btf_member *member; + u32 i; + + if (!btf_type_is_struct(t)) + return false; + + for_each_member(i, t, member) { + const struct btf_array *array; + + member_type = btf_type_skip_modifiers(btf, member->type, NULL); + if (btf_type_is_struct(member_type)) { + if (rec >= 3) { + bpf_log(log, "max struct nesting depth exceeded\n"); + return false; + } + if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1)) + return false; + continue; + } + if (btf_type_is_array(member_type)) { + array = btf_type_array(member_type); + if (!array->nelems) + return false; + member_type = btf_type_skip_modifiers(btf, array->type, NULL); + if (!btf_type_is_scalar(member_type)) + return false; + continue; + } + if (!btf_type_is_scalar(member_type)) + return false; + } + return true; +} + static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, bool ptr_to_mem_ok) { struct bpf_verifier_log *log = &env->log; + bool is_kfunc = btf_is_kernel(btf); const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; @@ -5634,7 +5675,20 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - if (btf_is_kernel(btf)) { + if (btf_get_prog_ctx_type(log, btf, t, + env->prog->type, i)) { + /* If function expects ctx type in BTF check that caller + * is passing PTR_TO_CTX. + */ + if (reg->type != PTR_TO_CTX) { + bpf_log(log, + "arg#%d expected pointer to ctx, but got %s\n", + i, btf_type_str(t)); + return -EINVAL; + } + if (check_ctx_reg(env, reg, regno)) + return -EINVAL; + } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || reg2btf_ids[reg->type])) { const struct btf_type *reg_ref_t; const struct btf *reg_btf; const char *reg_ref_tname; @@ -5650,14 +5704,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; - } else if (reg2btf_ids[reg->type]) { + } else { reg_btf = btf_vmlinux; reg_ref_id = *reg2btf_ids[reg->type]; - } else { - bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n", - func_name, i, - btf_type_str(ref_t), ref_tname, regno); - return -EINVAL; } reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, @@ -5673,23 +5722,24 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_tname); return -EINVAL; } - } else if (btf_get_prog_ctx_type(log, btf, t, - env->prog->type, i)) { - /* If function expects ctx type in BTF check that caller - * is passing PTR_TO_CTX. - */ - if (reg->type != PTR_TO_CTX) { - bpf_log(log, - "arg#%d expected pointer to ctx, but got %s\n", - i, btf_type_str(t)); - return -EINVAL; - } - if (check_ctx_reg(env, reg, regno)) - return -EINVAL; } else if (ptr_to_mem_ok) { const struct btf_type *resolve_ret; u32 type_size; + if (is_kfunc) { + /* Permit pointer to mem, but only when argument + * type is pointer to scalar, or struct composed + * (recursively) of scalars. + */ + if (!btf_type_is_scalar(ref_t) && + !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) { + bpf_log(log, + "arg#%d pointer type %s %s must point to scalar or struct with scalar\n", + i, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + } + resolve_ret = btf_resolve_size(btf, ref_t, &type_size); if (IS_ERR(resolve_ret)) { bpf_log(log, @@ -5702,6 +5752,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (check_mem_reg(env, reg, regno, type_size)) return -EINVAL; } else { + bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i, + is_kfunc ? "kernel " : "", func_name, func_id); return -EINVAL; } } @@ -5751,7 +5803,7 @@ int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs) { - return btf_check_func_arg_match(env, btf, func_id, regs, false); + return btf_check_func_arg_match(env, btf, func_id, regs, true); } /* Convert BTF of a function into bpf_reg_state if possible -- cgit v1.2.3 From 6692c98c7df53502adb8b8b73ab9bcbd399f7a06 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 20 Dec 2021 10:20:14 -0600 Subject: fork: Stop protecting back_fork_cleanup_cgroup_lock with CONFIG_NUMA Mark Brown reported: > This is also causing further build errors including but not limited to: > > /tmp/next/build/kernel/fork.c: In function 'copy_process': > /tmp/next/build/kernel/fork.c:2106:4: error: label 'bad_fork_cleanup_threadgroup_lock' used but not defined > 2106 | goto bad_fork_cleanup_threadgroup_lock; > | ^~~~ It turns out that I messed up and was depending upon a label protected by an ifdef. Move the label out of the ifdef as the ifdef around the label no longer makes sense (if it ever did). Link: https://lkml.kernel.org/r/YbugCP144uxXvRsk@sirena.org.uk Fixes: 40966e316f86 ("kthread: Ensure struct kthread is present for all kthreads") Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 04fa3e5d97af..23ad62965fbf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2464,8 +2464,8 @@ bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); -bad_fork_cleanup_threadgroup_lock: #endif +bad_fork_cleanup_threadgroup_lock: delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); -- cgit v1.2.3 From ff8288ff475e47544569359772f88f2b39fd2cf9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 20 Dec 2021 10:42:18 -0600 Subject: fork: Rename bad_fork_cleanup_threadgroup_lock to bad_fork_cleanup_delayacct I just fixed a bug in copy_process when using the label bad_fork_cleanup_threadgroup_lock. While fixing the bug I looked closer at the label and realized it has been misnamed since 568ac888215c ("cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork"). Fix the name so that fork is easier to understand. Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 23ad62965fbf..0816be1bb044 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2120,14 +2120,14 @@ static __latent_entropy struct task_struct *copy_process( cgroup_fork(p); if (p->flags & PF_KTHREAD) { if (!set_kthread_struct(p)) - goto bad_fork_cleanup_threadgroup_lock; + goto bad_fork_cleanup_delayacct; } #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup_threadgroup_lock; + goto bad_fork_cleanup_delayacct; } #endif #ifdef CONFIG_CPUSETS @@ -2465,7 +2465,7 @@ bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_put(p->mempolicy); #endif -bad_fork_cleanup_threadgroup_lock: +bad_fork_cleanup_delayacct: delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); -- cgit v1.2.3 From 1a5e91d8375fc8369207cc0b9894a324f2bbf1d9 Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Mon, 13 Dec 2021 02:14:02 -0500 Subject: swiotlb: Add swiotlb bounce buffer remap function for HV IVM In Isolation VM with AMD SEV, bounce buffer needs to be accessed via extra address space which is above shared_gpa_boundary (E.G 39 bit address line) reported by Hyper-V CPUID ISOLATION_CONFIG. The access physical address will be original physical address + shared_gpa_boundary. The shared_gpa_boundary in the AMD SEV SNP spec is called virtual top of memory(vTOM). Memory addresses below vTOM are automatically treated as private while memory above vTOM is treated as shared. Expose swiotlb_unencrypted_base for platforms to set unencrypted memory base offset and platform calls swiotlb_update_mem_attributes() to remap swiotlb mem to unencrypted address space. memremap() can not be called in the early stage and so put remapping code into swiotlb_update_mem_attributes(). Store remap address and use it to copy data from/to swiotlb bounce buffer. Signed-off-by: Tianyu Lan Acked-by: Christoph Hellwig Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20211213071407.314309-2-ltykernel@gmail.com Signed-off-by: Wei Liu --- include/linux/swiotlb.h | 6 ++++++ kernel/dma/swiotlb.c | 43 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 569272871375..f6c3638255d5 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -73,6 +73,9 @@ extern enum swiotlb_force swiotlb_force; * @end: The end address of the swiotlb memory pool. Used to do a quick * range check to see if the memory was in fact allocated by this * API. + * @vaddr: The vaddr of the swiotlb memory pool. The swiotlb memory pool + * may be remapped in the memory encrypted case and store virtual + * address for bounce buffer operation. * @nslabs: The number of IO TLB blocks (in groups of 64) between @start and * @end. For default swiotlb, this is command line adjustable via * setup_io_tlb_npages. @@ -92,6 +95,7 @@ extern enum swiotlb_force swiotlb_force; struct io_tlb_mem { phys_addr_t start; phys_addr_t end; + void *vaddr; unsigned long nslabs; unsigned long used; unsigned int index; @@ -186,4 +190,6 @@ static inline bool is_swiotlb_for_alloc(struct device *dev) } #endif /* CONFIG_DMA_RESTRICTED_POOL */ +extern phys_addr_t swiotlb_unencrypted_base; + #endif /* __LINUX_SWIOTLB_H */ diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 8e840fbbed7c..b36c1cdd0c4f 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -50,6 +50,7 @@ #include #include +#include #include #include #include @@ -72,6 +73,8 @@ enum swiotlb_force swiotlb_force; struct io_tlb_mem io_tlb_default_mem; +phys_addr_t swiotlb_unencrypted_base; + /* * Max segment that we can provide which (if pages are contingous) will * not be bounced (unless SWIOTLB_FORCE is set). @@ -155,6 +158,27 @@ static inline unsigned long nr_slots(u64 val) return DIV_ROUND_UP(val, IO_TLB_SIZE); } +/* + * Remap swioltb memory in the unencrypted physical address space + * when swiotlb_unencrypted_base is set. (e.g. for Hyper-V AMD SEV-SNP + * Isolation VMs). + */ +static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes) +{ + void *vaddr = NULL; + + if (swiotlb_unencrypted_base) { + phys_addr_t paddr = mem->start + swiotlb_unencrypted_base; + + vaddr = memremap(paddr, bytes, MEMREMAP_WB); + if (!vaddr) + pr_err("Failed to map the unencrypted memory %pa size %lx.\n", + &paddr, bytes); + } + + return vaddr; +} + /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to @@ -172,7 +196,12 @@ void __init swiotlb_update_mem_attributes(void) vaddr = phys_to_virt(mem->start); bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); - memset(vaddr, 0, bytes); + + mem->vaddr = swiotlb_mem_remap(mem, bytes); + if (!mem->vaddr) + mem->vaddr = vaddr; + + memset(mem->vaddr, 0, bytes); } static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, @@ -196,7 +225,17 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; } + + /* + * If swiotlb_unencrypted_base is set, the bounce buffer memory will + * be remapped and cleared in swiotlb_update_mem_attributes. + */ + if (swiotlb_unencrypted_base) + return; + memset(vaddr, 0, bytes); + mem->vaddr = vaddr; + return; } int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) @@ -371,7 +410,7 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size phys_addr_t orig_addr = mem->slots[index].orig_addr; size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); - unsigned char *vaddr = phys_to_virt(tlb_addr); + unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start; unsigned int tlb_offset, orig_addr_offset; if (orig_addr == INVALID_PHYS_ADDR) -- cgit v1.2.3 From 30561b51cc8d1daa27a48eb29dd9424858576b19 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Fri, 17 Dec 2021 09:01:51 +0800 Subject: audit: use struct_size() helper in audit_[send|make]_reply() Make use of struct_size() helper instead of an open-coded calculation. Link: https://github.com/KSPP/linux/issues/160 Signed-off-by: Xiu Jianfeng Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/auditfilter.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 3dd8bde2c00f..bc21d5007047 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1459,7 +1459,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) security_release_secctx(ctx, len); } audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, - sig_data, sizeof(*sig_data) + len); + sig_data, struct_size(sig_data, ctx, len)); kfree(sig_data); break; case AUDIT_TTY_GET: { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 398b4c57e921..1bbe25f47c17 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1092,7 +1092,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q) break; skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1, data, - sizeof(*data) + data->buflen); + struct_size(data, buf, data->buflen)); if (skb) skb_queue_tail(q, skb); kfree(data); -- cgit v1.2.3 From 361c81dbc58c8aa230e1f2d556045fa7bc3eb4a3 Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Mon, 20 Dec 2021 16:28:27 -0300 Subject: blktrace: switch trace spinlock to a raw spinlock The running_trace_lock protects running_trace_list and is acquired within the tracepoint which implies disabled preemption. The spinlock_t typed lock can not be acquired with disabled preemption on PREEMPT_RT because it becomes a sleeping lock. The runtime of the tracepoint depends on the number of entries in running_trace_list and has no limit. The blk-tracer is considered debug code and higher latencies here are okay. Make running_trace_lock a raw_spinlock_t. Signed-off-by: Wander Lairson Costa Link: https://lore.kernel.org/r/20211220192827.38297-1-wander@redhat.com Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 431e41bc4c23..af68a67179b4 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -34,7 +34,7 @@ static struct trace_array *blk_tr; static bool blk_tracer_enabled __read_mostly; static LIST_HEAD(running_trace_list); -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock); +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(running_trace_lock); /* Select an alternative, minimalistic output than the original one */ #define TRACE_BLK_OPT_CLASSIC 0x1 @@ -121,12 +121,12 @@ static void trace_note_tsk(struct task_struct *tsk) struct blk_trace *bt; tsk->btrace_seq = blktrace_seq; - spin_lock_irqsave(&running_trace_lock, flags); + raw_spin_lock_irqsave(&running_trace_lock, flags); list_for_each_entry(bt, &running_trace_list, running_list) { trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm), 0); } - spin_unlock_irqrestore(&running_trace_lock, flags); + raw_spin_unlock_irqrestore(&running_trace_lock, flags); } static void trace_note_time(struct blk_trace *bt) @@ -666,9 +666,9 @@ static int __blk_trace_startstop(struct request_queue *q, int start) blktrace_seq++; smp_mb(); bt->trace_state = Blktrace_running; - spin_lock_irq(&running_trace_lock); + raw_spin_lock_irq(&running_trace_lock); list_add(&bt->running_list, &running_trace_list); - spin_unlock_irq(&running_trace_lock); + raw_spin_unlock_irq(&running_trace_lock); trace_note_time(bt); ret = 0; @@ -676,9 +676,9 @@ static int __blk_trace_startstop(struct request_queue *q, int start) } else { if (bt->trace_state == Blktrace_running) { bt->trace_state = Blktrace_stopped; - spin_lock_irq(&running_trace_lock); + raw_spin_lock_irq(&running_trace_lock); list_del_init(&bt->running_list); - spin_unlock_irq(&running_trace_lock); + raw_spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); ret = 0; } @@ -1608,9 +1608,9 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt->trace_state == Blktrace_running) { bt->trace_state = Blktrace_stopped; - spin_lock_irq(&running_trace_lock); + raw_spin_lock_irq(&running_trace_lock); list_del_init(&bt->running_list); - spin_unlock_irq(&running_trace_lock); + raw_spin_unlock_irq(&running_trace_lock); relay_flush(bt->rchan); } -- cgit v1.2.3 From dd621ee0cf8eb32445c8f5f26d3b7555953071d8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 21 Dec 2021 11:41:14 -0600 Subject: kthread: Warn about failed allocations for the init kthread Failed allocates are not expected when setting up the initial task and it is not really possible to handle them either. So I added a warning to report if such an allocation failure ever happens. Correct the sense of the warning so it warns when an allocation failure happens not when the allocation succeeded. Oops. Reported-by: kernel test robot Reported-by: Stephen Rothwell Reported-by: Naresh Kamboju Link: https://lkml.kernel.org/r/20211221231611.785b74cf@canb.auug.org.au Link: https://lkml.kernel.org/r/CA+G9fYvLaR5CF777CKeWTO+qJFTN6vAvm95gtzN+7fw3Wi5hkA@mail.gmail.com Link: https://lkml.kernel.org/r/20211216102956.GC10708@xsang-OptiPlex-9020 Fixes: 40966e316f86 ("kthread: Ensure struct kthread is present for all kthreads") Signed-off-by: "Eric W. Biederman" --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0404a8c572a1..ee222b89c692 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9425,7 +9425,7 @@ void __init sched_init(void) * if we want to avoid special-casing it in code that deals with per-CPU * kthreads. */ - WARN_ON(set_kthread_struct(current)); + WARN_ON(!set_kthread_struct(current)); /* * Make us the idle thread. Technically, schedule() should not be -- cgit v1.2.3 From 0dd668d2080c46cf914e131f341fa114a34c5a20 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Mon, 20 Dec 2021 19:30:48 +0800 Subject: bpf: Use struct_size() helper In an effort to avoid open-coded arithmetic in the kernel, use the struct_size() helper instead of open-coded calculation. Signed-off-by: Xiu Jianfeng Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://github.com/KSPP/linux/issues/160 Link: https://lore.kernel.org/bpf/20211220113048.2859-1-xiujianfeng@huawei.com --- kernel/bpf/local_storage.c | 3 +-- kernel/bpf/reuseport_array.c | 6 +----- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 035e9e3a7132..23f7f9d08a62 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -163,8 +163,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, return 0; } - new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) + - map->value_size, + new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size), __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, map->numa_node); if (!new) diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 93a55391791a..556a769b5b80 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -152,16 +152,12 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; - u64 array_size; if (!bpf_capable()) return ERR_PTR(-EPERM); - array_size = sizeof(*array); - array_size += (u64)attr->max_entries * sizeof(struct sock *); - /* allocate all map elements and zero-initialize them */ - array = bpf_map_area_alloc(array_size, numa_node); + array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node); if (!array) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 00580f03af5eb2a527875b4a80a5effd95bda2fa Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 22 Dec 2021 16:57:50 -0600 Subject: kthread: Never put_user the set_child_tid address Kernel threads abuse set_child_tid. Historically that has been fine as set_child_tid was initialized after the kernel thread had been forked. Unfortunately storing struct kthread in set_child_tid after the thread is running makes struct kthread being unusable for storing result codes of the thread. When set_child_tid is set to struct kthread during fork that results in schedule_tail writing the thread id to the beggining of struct kthread (if put_user does not realize it is a kernel address). Solve this by skipping the put_user for all kthreads. Reported-by: Nathan Chancellor Link: https://lkml.kernel.org/r/YcNsG0Lp94V13whH@archlinux-ax161 Signed-off-by: "Eric W. Biederman" --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ee222b89c692..d8adbea77be1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4908,7 +4908,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) finish_task_switch(prev); preempt_enable(); - if (current->set_child_tid) + if (!(current->flags & PF_KTHREAD) && current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); calculate_sigpending(); -- cgit v1.2.3 From e368cd72880360ffe9b298349ae96286dd121499 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 21 Dec 2021 06:57:45 -0800 Subject: Documentation: livepatch: Add livepatch API page The livepatch subsystem has several exported functions and objects with kerneldoc comments. Though the livepatch documentation contains handwritten descriptions of all of these exported functions, they are currently not pulled into the docs build using the kernel-doc directive. In order to allow readers of the documentation to see the full kerneldoc comments in the generated documentation files, this change adds a new Documentation/livepatch/api.rst page which contains kernel-doc directives to link the kerneldoc comments directly in the documentation. With this, all of the hand-written descriptions of the APIs now cross-reference the kerneldoc comments on the new Livepatching APIs page, and running ./scripts/find-unused-docs.sh on kernel/livepatch no longer shows any files as missing documentation. Note that all of the handwritten API descriptions were left alone with the exception of Documentation/livepatch/system-state.rst, which was updated to allow the cross-referencing to work correctly. The file now follows the cross-referencing formatting guidance specified in Documentation/doc-guide/kernel-doc.rst. Furthermore, some comments around klp_shadow_free_all() were updated to say <_, id> rather than <*, id> to match the rest of the file, and to prevent the docs build from emitting an "Inline emphasis start-string without end string" error. Signed-off-by: David Vernet Reviewed-by: Petr Mladek Acked-by: Miroslav Benes Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211221145743.4098360-1-void@manifault.com --- Documentation/livepatch/api.rst | 30 ++++++++++++++++++++++++++++++ Documentation/livepatch/index.rst | 1 + Documentation/livepatch/shadow-vars.rst | 4 ++-- Documentation/livepatch/system-state.rst | 4 ++-- kernel/livepatch/shadow.c | 6 +++--- 5 files changed, 38 insertions(+), 7 deletions(-) create mode 100644 Documentation/livepatch/api.rst (limited to 'kernel') diff --git a/Documentation/livepatch/api.rst b/Documentation/livepatch/api.rst new file mode 100644 index 000000000000..78944b63d74b --- /dev/null +++ b/Documentation/livepatch/api.rst @@ -0,0 +1,30 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +Livepatching APIs +================= + +Livepatch Enablement +==================== + +.. kernel-doc:: kernel/livepatch/core.c + :export: + + +Shadow Variables +================ + +.. kernel-doc:: kernel/livepatch/shadow.c + :export: + +System State Changes +==================== + +.. kernel-doc:: kernel/livepatch/state.c + :export: + +Object Types +============ + +.. kernel-doc:: include/linux/livepatch.h + :identifiers: klp_patch klp_object klp_func klp_callbacks klp_state diff --git a/Documentation/livepatch/index.rst b/Documentation/livepatch/index.rst index 43cce5fad705..cebf1c71d4a5 100644 --- a/Documentation/livepatch/index.rst +++ b/Documentation/livepatch/index.rst @@ -14,6 +14,7 @@ Kernel Livepatching shadow-vars system-state reliable-stacktrace + api .. only:: subproject and html diff --git a/Documentation/livepatch/shadow-vars.rst b/Documentation/livepatch/shadow-vars.rst index 6a7d43a8787d..7a7098bfb5c8 100644 --- a/Documentation/livepatch/shadow-vars.rst +++ b/Documentation/livepatch/shadow-vars.rst @@ -82,8 +82,8 @@ to do actions that can be done only once when a new variable is allocated. - call destructor function if defined - free shadow variable -* klp_shadow_free_all() - detach and free all <*, id> shadow variables - - find and remove any <*, id> references from global hashtable +* klp_shadow_free_all() - detach and free all <_, id> shadow variables + - find and remove any <_, id> references from global hashtable - if found diff --git a/Documentation/livepatch/system-state.rst b/Documentation/livepatch/system-state.rst index c6d127c2d9aa..7a3935fd812b 100644 --- a/Documentation/livepatch/system-state.rst +++ b/Documentation/livepatch/system-state.rst @@ -52,12 +52,12 @@ struct klp_state: The state can be manipulated using two functions: - - *klp_get_state(patch, id)* + - klp_get_state() - Get struct klp_state associated with the given livepatch and state id. - - *klp_get_prev_state(id)* + - klp_get_prev_state() - Get struct klp_state associated with the given feature id and already installed livepatches. diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index e5c9fb295ba9..c2e724d97ddf 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c @@ -272,12 +272,12 @@ void klp_shadow_free(void *obj, unsigned long id, klp_shadow_dtor_t dtor) EXPORT_SYMBOL_GPL(klp_shadow_free); /** - * klp_shadow_free_all() - detach and free all <*, id> shadow variables + * klp_shadow_free_all() - detach and free all <_, id> shadow variables * @id: data identifier * @dtor: custom callback that can be used to unregister the variable * and/or free data that the shadow variable points to (optional) * - * This function releases the memory for all <*, id> shadow variable + * This function releases the memory for all <_, id> shadow variable * instances, callers should stop referencing them accordingly. */ void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor) @@ -288,7 +288,7 @@ void klp_shadow_free_all(unsigned long id, klp_shadow_dtor_t dtor) spin_lock_irqsave(&klp_shadow_lock, flags); - /* Delete all <*, id> from hash */ + /* Delete all <_, id> from hash */ hash_for_each(klp_shadow_hash, i, shadow, node) { if (klp_shadow_match(shadow, shadow->obj, id)) klp_shadow_free_struct(shadow, dtor); -- cgit v1.2.3 From 71d2bcec2d4d69ff109c497e6611d6c53c8926d4 Mon Sep 17 00:00:00 2001 From: Philipp Rudo Date: Fri, 24 Dec 2021 21:12:39 -0800 Subject: kernel/crash_core: suppress unknown crashkernel parameter warning When booting with crashkernel= on the kernel command line a warning similar to Kernel command line: ro console=ttyS0 crashkernel=256M Unknown kernel command line parameters "crashkernel=256M", will be passed to user space. is printed. This comes from crashkernel= being parsed independent from the kernel parameter handling mechanism. So the code in init/main.c doesn't know that crashkernel= is a valid kernel parameter and prints this incorrect warning. Suppress the warning by adding a dummy early_param handler for crashkernel=. Link: https://lkml.kernel.org/r/20211208133443.6867-1-prudo@redhat.com Fixes: 86d1919a4fb0 ("init: print out unknown kernel parameters") Signed-off-by: Philipp Rudo Acked-by: Baoquan He Cc: Andrew Halaney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/crash_core.c b/kernel/crash_core.c index eb53f5ec62c9..256cf6db573c 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -295,6 +296,16 @@ int __init parse_crashkernel_low(char *cmdline, "crashkernel=", suffix_tbl[SUFFIX_LOW]); } +/* + * Add a dummy early_param handler to mark crashkernel= as a known command line + * parameter and suppress incorrect warnings in init/main.c. + */ +static int __init parse_crashkernel_dummy(char *arg) +{ + return 0; +} +early_param("crashkernel", parse_crashkernel_dummy); + Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { -- cgit v1.2.3 From ee6d3dd4ed48ab24b74bab3c3977b8218518247d Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Fri, 24 Dec 2021 23:13:45 +0000 Subject: driver core: make kobj_type constant. This way instances of kobj_type (which contain function pointers) can be stored in .rodata, which means that they cannot be [easily/accidentally] modified at runtime. Signed-off-by: Wedson Almeida Filho Link: https://lore.kernel.org/r/20211224231345.777370-1-wedsonaf@google.com Signed-off-by: Greg Kroah-Hartman --- Documentation/core-api/kobject.rst | 4 ++-- drivers/base/bus.c | 2 +- drivers/base/core.c | 2 +- include/linux/kobject.h | 8 ++++---- kernel/params.c | 2 +- lib/kobject.c | 8 ++++---- 6 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/Documentation/core-api/kobject.rst b/Documentation/core-api/kobject.rst index 2739f8b72575..d3b5bf9f643a 100644 --- a/Documentation/core-api/kobject.rst +++ b/Documentation/core-api/kobject.rst @@ -118,7 +118,7 @@ Initialization of kobjects Code which creates a kobject must, of course, initialize that object. Some of the internal fields are setup with a (mandatory) call to kobject_init():: - void kobject_init(struct kobject *kobj, struct kobj_type *ktype); + void kobject_init(struct kobject *kobj, const struct kobj_type *ktype); The ktype is required for a kobject to be created properly, as every kobject must have an associated kobj_type. After calling kobject_init(), to @@ -156,7 +156,7 @@ kobject_name():: There is a helper function to both initialize and add the kobject to the kernel at the same time, called surprisingly enough kobject_init_and_add():: - int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype, + int kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...); The arguments are the same as the individual kobject_init() and diff --git a/drivers/base/bus.c b/drivers/base/bus.c index bdc98c5713d5..a64454f5f8c0 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -165,7 +165,7 @@ static struct kobj_type bus_ktype = { static int bus_uevent_filter(struct kset *kset, struct kobject *kobj) { - struct kobj_type *ktype = get_ktype(kobj); + const struct kobj_type *ktype = get_ktype(kobj); if (ktype == &bus_ktype) return 1; diff --git a/drivers/base/core.c b/drivers/base/core.c index fd034d742447..d712ea11066b 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2263,7 +2263,7 @@ static struct kobj_type device_ktype = { static int dev_uevent_filter(struct kset *kset, struct kobject *kobj) { - struct kobj_type *ktype = get_ktype(kobj); + const struct kobj_type *ktype = get_ktype(kobj); if (ktype == &device_ktype) { struct device *dev = kobj_to_dev(kobj); diff --git a/include/linux/kobject.h b/include/linux/kobject.h index c740062b4b1a..683172b2e094 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -66,7 +66,7 @@ struct kobject { struct list_head entry; struct kobject *parent; struct kset *kset; - struct kobj_type *ktype; + const struct kobj_type *ktype; struct kernfs_node *sd; /* sysfs directory entry */ struct kref kref; #ifdef CONFIG_DEBUG_KOBJECT_RELEASE @@ -90,13 +90,13 @@ static inline const char *kobject_name(const struct kobject *kobj) return kobj->name; } -extern void kobject_init(struct kobject *kobj, struct kobj_type *ktype); +extern void kobject_init(struct kobject *kobj, const struct kobj_type *ktype); extern __printf(3, 4) __must_check int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...); extern __printf(4, 5) __must_check int kobject_init_and_add(struct kobject *kobj, - struct kobj_type *ktype, struct kobject *parent, + const struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...); extern void kobject_del(struct kobject *kobj); @@ -217,7 +217,7 @@ static inline void kset_put(struct kset *k) kobject_put(&k->kobj); } -static inline struct kobj_type *get_ktype(struct kobject *kobj) +static inline const struct kobj_type *get_ktype(struct kobject *kobj) { return kobj->ktype; } diff --git a/kernel/params.c b/kernel/params.c index 8299bd764e42..9b90e3c4d3c0 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -928,7 +928,7 @@ static const struct sysfs_ops module_sysfs_ops = { static int uevent_filter(struct kset *kset, struct kobject *kobj) { - struct kobj_type *ktype = get_ktype(kobj); + const struct kobj_type *ktype = get_ktype(kobj); if (ktype == &module_ktype) return 1; diff --git a/lib/kobject.c b/lib/kobject.c index 4a56f519139d..56fa037501b5 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -65,7 +65,7 @@ void kobject_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid) */ static int populate_dir(struct kobject *kobj) { - struct kobj_type *t = get_ktype(kobj); + const struct kobj_type *t = get_ktype(kobj); struct attribute *attr; int error = 0; int i; @@ -346,7 +346,7 @@ EXPORT_SYMBOL(kobject_set_name); * to kobject_put(), not by a call to kfree directly to ensure that all of * the memory is cleaned up properly. */ -void kobject_init(struct kobject *kobj, struct kobj_type *ktype) +void kobject_init(struct kobject *kobj, const struct kobj_type *ktype) { char *err_str; @@ -461,7 +461,7 @@ EXPORT_SYMBOL(kobject_add); * same type of error handling after a call to kobject_add() and kobject * lifetime rules are the same here. */ -int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype, +int kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...) { va_list args; @@ -679,7 +679,7 @@ EXPORT_SYMBOL(kobject_get_unless_zero); static void kobject_cleanup(struct kobject *kobj) { struct kobject *parent = kobj->parent; - struct kobj_type *t = get_ktype(kobj); + const struct kobj_type *t = get_ktype(kobj); const char *name = kobj->name; pr_debug("kobject: '%s' (%p): %s, parent %p\n", -- cgit v1.2.3 From cf6299b6101903c31bddb0065804b2121ed510c7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 27 Dec 2021 17:39:24 +0100 Subject: kobject: remove kset from struct kset_uevent_ops callbacks There is no need to pass the pointer to the kset in the struct kset_uevent_ops callbacks as no one uses it, so just remove that pointer entirely. Reviewed-by: Rafael J. Wysocki Reviewed-by: Wedson Almeida Filho Link: https://lore.kernel.org/r/20211227163924.3970661-1-gregkh@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- Documentation/core-api/kobject.rst | 7 +++---- Documentation/translations/zh_CN/core-api/kobject.rst | 7 +++---- drivers/base/bus.c | 2 +- drivers/base/core.c | 11 +++++------ drivers/dma-buf/dma-buf-sysfs-stats.c | 2 +- fs/dlm/lockspace.c | 3 +-- fs/gfs2/sys.c | 3 +-- include/linux/kobject.h | 7 +++---- kernel/params.c | 2 +- lib/kobject_uevent.c | 6 +++--- 10 files changed, 22 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/Documentation/core-api/kobject.rst b/Documentation/core-api/kobject.rst index d3b5bf9f643a..3d6e3107315d 100644 --- a/Documentation/core-api/kobject.rst +++ b/Documentation/core-api/kobject.rst @@ -373,10 +373,9 @@ If a kset wishes to control the uevent operations of the kobjects associated with it, it can use the struct kset_uevent_ops to handle it:: struct kset_uevent_ops { - int (* const filter)(struct kset *kset, struct kobject *kobj); - const char *(* const name)(struct kset *kset, struct kobject *kobj); - int (* const uevent)(struct kset *kset, struct kobject *kobj, - struct kobj_uevent_env *env); + int (* const filter)(struct kobject *kobj); + const char *(* const name)(struct kobject *kobj); + int (* const uevent)(struct kobject *kobj, struct kobj_uevent_env *env); }; diff --git a/Documentation/translations/zh_CN/core-api/kobject.rst b/Documentation/translations/zh_CN/core-api/kobject.rst index b7c37794cc7f..95634083dca0 100644 --- a/Documentation/translations/zh_CN/core-api/kobject.rst +++ b/Documentation/translations/zh_CN/core-api/kobject.rst @@ -325,10 +325,9 @@ ksets 结构体kset_uevent_ops来处理它:: struct kset_uevent_ops { - int (* const filter)(struct kset *kset, struct kobject *kobj); - const char *(* const name)(struct kset *kset, struct kobject *kobj); - int (* const uevent)(struct kset *kset, struct kobject *kobj, - struct kobj_uevent_env *env); + int (* const filter)(struct kobject *kobj); + const char *(* const name)(struct kobject *kobj); + int (* const uevent)(struct kobject *kobj, struct kobj_uevent_env *env); }; diff --git a/drivers/base/bus.c b/drivers/base/bus.c index a64454f5f8c0..97936ec49bde 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -163,7 +163,7 @@ static struct kobj_type bus_ktype = { .release = bus_release, }; -static int bus_uevent_filter(struct kset *kset, struct kobject *kobj) +static int bus_uevent_filter(struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); diff --git a/drivers/base/core.c b/drivers/base/core.c index d712ea11066b..60d703ebd123 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2261,7 +2261,7 @@ static struct kobj_type device_ktype = { }; -static int dev_uevent_filter(struct kset *kset, struct kobject *kobj) +static int dev_uevent_filter(struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); @@ -2275,7 +2275,7 @@ static int dev_uevent_filter(struct kset *kset, struct kobject *kobj) return 0; } -static const char *dev_uevent_name(struct kset *kset, struct kobject *kobj) +static const char *dev_uevent_name(struct kobject *kobj) { struct device *dev = kobj_to_dev(kobj); @@ -2286,8 +2286,7 @@ static const char *dev_uevent_name(struct kset *kset, struct kobject *kobj) return NULL; } -static int dev_uevent(struct kset *kset, struct kobject *kobj, - struct kobj_uevent_env *env) +static int dev_uevent(struct kobject *kobj, struct kobj_uevent_env *env) { struct device *dev = kobj_to_dev(kobj); int retval = 0; @@ -2382,7 +2381,7 @@ static ssize_t uevent_show(struct device *dev, struct device_attribute *attr, /* respect filter */ if (kset->uevent_ops && kset->uevent_ops->filter) - if (!kset->uevent_ops->filter(kset, &dev->kobj)) + if (!kset->uevent_ops->filter(&dev->kobj)) goto out; env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); @@ -2390,7 +2389,7 @@ static ssize_t uevent_show(struct device *dev, struct device_attribute *attr, return -ENOMEM; /* let the kset specific function add its keys */ - retval = kset->uevent_ops->uevent(kset, &dev->kobj, env); + retval = kset->uevent_ops->uevent(&dev->kobj, env); if (retval) goto out; diff --git a/drivers/dma-buf/dma-buf-sysfs-stats.c b/drivers/dma-buf/dma-buf-sysfs-stats.c index 053baadcada9..2bba0babcb62 100644 --- a/drivers/dma-buf/dma-buf-sysfs-stats.c +++ b/drivers/dma-buf/dma-buf-sysfs-stats.c @@ -132,7 +132,7 @@ void dma_buf_stats_teardown(struct dma_buf *dmabuf) /* Statistics files do not need to send uevents. */ -static int dmabuf_sysfs_uevent_filter(struct kset *kset, struct kobject *kobj) +static int dmabuf_sysfs_uevent_filter(struct kobject *kobj) { return 0; } diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 10eddfa6c3d7..0bbb346cb892 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -216,8 +216,7 @@ static int do_uevent(struct dlm_ls *ls, int in) return ls->ls_uevent_result; } -static int dlm_uevent(struct kset *kset, struct kobject *kobj, - struct kobj_uevent_env *env) +static int dlm_uevent(struct kobject *kobj, struct kobj_uevent_env *env) { struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index c0a34d9ddee4..a6002b2d146d 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -767,8 +767,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) wait_for_completion(&sdp->sd_kobj_unregister); } -static int gfs2_uevent(struct kset *kset, struct kobject *kobj, - struct kobj_uevent_env *env) +static int gfs2_uevent(struct kobject *kobj, struct kobj_uevent_env *env) { struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); struct super_block *s = sdp->sd_vfs; diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 683172b2e094..ad90b49824dc 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -153,10 +153,9 @@ struct kobj_uevent_env { }; struct kset_uevent_ops { - int (* const filter)(struct kset *kset, struct kobject *kobj); - const char *(* const name)(struct kset *kset, struct kobject *kobj); - int (* const uevent)(struct kset *kset, struct kobject *kobj, - struct kobj_uevent_env *env); + int (* const filter)(struct kobject *kobj); + const char *(* const name)(struct kobject *kobj); + int (* const uevent)(struct kobject *kobj, struct kobj_uevent_env *env); }; struct kobj_attribute { diff --git a/kernel/params.c b/kernel/params.c index 9b90e3c4d3c0..5b92310425c5 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -926,7 +926,7 @@ static const struct sysfs_ops module_sysfs_ops = { .store = module_attr_store, }; -static int uevent_filter(struct kset *kset, struct kobject *kobj) +static int uevent_filter(struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c index c87d5b6a8a55..7c44b7ae4c5c 100644 --- a/lib/kobject_uevent.c +++ b/lib/kobject_uevent.c @@ -501,7 +501,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, } /* skip the event, if the filter returns zero. */ if (uevent_ops && uevent_ops->filter) - if (!uevent_ops->filter(kset, kobj)) { + if (!uevent_ops->filter(kobj)) { pr_debug("kobject: '%s' (%p): %s: filter function " "caused the event to drop!\n", kobject_name(kobj), kobj, __func__); @@ -510,7 +510,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, /* originating subsystem */ if (uevent_ops && uevent_ops->name) - subsystem = uevent_ops->name(kset, kobj); + subsystem = uevent_ops->name(kobj); else subsystem = kobject_name(&kset->kobj); if (!subsystem) { @@ -554,7 +554,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, /* let the kset specific function add its stuff */ if (uevent_ops && uevent_ops->uevent) { - retval = uevent_ops->uevent(kset, kobj, env); + retval = uevent_ops->uevent(kobj, env); if (retval) { pr_debug("kobject: '%s' (%p): %s: uevent() returned " "%d\n", kobject_name(kobj), kobj, -- cgit v1.2.3 From 5abb065dca7301de90b7c44bbcdc378e49e4d362 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 1 Dec 2021 22:28:14 +0100 Subject: notifier: Return an error when a callback has already been registered Return -EEXIST when a notifier callback has already been registered on a notifier chain. This should avoid any homegrown registration tracking at the callsite like https://lore.kernel.org/amd-gfx/20210512013058.6827-1-mukul.joshi@amd.com for example. This version is an alternative of https://lore.kernel.org/r/20211108101157.15189-1-bp@alien8.de which needed to touch every caller not checking the registration routine's return value. Signed-off-by: Borislav Petkov Reviewed-by: Sebastian Andrzej Siewior Acked-by: Alan Stern Link: https://lore.kernel.org/r/YcSWNdUBS8A2ZB3s@zn.tnic --- kernel/notifier.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index b8251dc0bc0f..ba005ebf4730 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -20,12 +20,13 @@ BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); */ static int notifier_chain_register(struct notifier_block **nl, - struct notifier_block *n) + struct notifier_block *n) { while ((*nl) != NULL) { if (unlikely((*nl) == n)) { - WARN(1, "double register detected"); - return 0; + WARN(1, "notifier callback %ps already registered", + n->notifier_call); + return -EEXIST; } if (n->priority > (*nl)->priority) break; @@ -134,7 +135,7 @@ static int notifier_call_chain_robust(struct notifier_block **nl, * * Adds a notifier to an atomic notifier chain. * - * Currently always returns zero. + * Returns 0 on success, %-EEXIST on error. */ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, struct notifier_block *n) @@ -216,7 +217,7 @@ NOKPROBE_SYMBOL(atomic_notifier_call_chain); * Adds a notifier to a blocking notifier chain. * Must be called in process context. * - * Currently always returns zero. + * Returns 0 on success, %-EEXIST on error. */ int blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *n) @@ -335,7 +336,7 @@ EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); * Adds a notifier to a raw notifier chain. * All locking must be provided by the caller. * - * Currently always returns zero. + * Returns 0 on success, %-EEXIST on error. */ int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *n) @@ -406,7 +407,7 @@ EXPORT_SYMBOL_GPL(raw_notifier_call_chain); * Adds a notifier to an SRCU notifier chain. * Must be called in process context. * - * Currently always returns zero. + * Returns 0 on success, %-EEXIST on error. */ int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *n) -- cgit v1.2.3 From b6459415b384cb829f0b2a4268f211c789f6cf0b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 28 Dec 2021 16:49:13 -0800 Subject: net: Don't include filter.h from net/sock.h sock.h is pretty heavily used (5k objects rebuilt on x86 after it's touched). We can drop the include of filter.h from it and add a forward declaration of struct sk_filter instead. This decreases the number of rebuilt objects when bpf.h is touched from ~5k to ~1k. There's a lot of missing includes this was masking. Primarily in networking tho, this time. Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Acked-by: Marc Kleine-Budde Acked-by: Florian Fainelli Acked-by: Nikolay Aleksandrov Acked-by: Stefano Garzarella Link: https://lore.kernel.org/bpf/20211229004913.513372-1-kuba@kernel.org --- drivers/bluetooth/btqca.c | 1 + drivers/infiniband/core/cache.c | 1 + drivers/infiniband/hw/irdma/ctrl.c | 2 ++ drivers/infiniband/hw/irdma/uda.c | 2 ++ drivers/infiniband/hw/mlx5/doorbell.c | 1 + drivers/infiniband/hw/mlx5/qp.c | 1 + drivers/net/amt.c | 1 + drivers/net/appletalk/ipddp.c | 1 + drivers/net/bonding/bond_main.c | 1 + drivers/net/can/usb/peak_usb/pcan_usb.c | 1 + drivers/net/dsa/microchip/ksz8795.c | 1 + drivers/net/dsa/xrs700x/xrs700x.c | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 1 + drivers/net/ethernet/huawei/hinic/hinic_tx.c | 1 + drivers/net/ethernet/intel/ice/ice_devlink.c | 2 ++ drivers/net/ethernet/intel/ice/ice_txrx_lib.c | 2 ++ drivers/net/ethernet/intel/igc/igc_xdp.c | 1 + drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en/qos.c | 1 + drivers/net/ethernet/sfc/efx.c | 1 + drivers/net/ethernet/sfc/efx_channels.c | 1 + drivers/net/ethernet/sfc/efx_common.c | 1 + drivers/net/hamradio/hdlcdrv.c | 1 + drivers/net/hamradio/scc.c | 1 + drivers/net/loopback.c | 1 + drivers/net/vrf.c | 1 + drivers/net/wireless/ath/ath11k/debugfs.c | 2 ++ drivers/net/wireless/realtek/rtw89/debug.c | 2 ++ fs/nfs/dir.c | 1 + fs/nfs/fs_context.c | 1 + fs/select.c | 1 + include/linux/bpf_local_storage.h | 1 + include/linux/dsa/loop.h | 1 + include/net/ipv6.h | 2 ++ include/net/route.h | 1 + include/net/sock.h | 2 +- include/net/xdp_sock.h | 1 + kernel/sysctl.c | 1 + net/bluetooth/bnep/sock.c | 1 + net/bluetooth/eir.h | 2 ++ net/bluetooth/hidp/sock.c | 1 + net/bluetooth/l2cap_sock.c | 1 + net/bridge/br_ioctl.c | 1 + net/caif/caif_socket.c | 1 + net/core/devlink.c | 1 + net/core/flow_dissector.c | 1 + net/core/lwt_bpf.c | 1 + net/core/sock_diag.c | 1 + net/core/sysctl_net_core.c | 1 + net/decnet/dn_nsp_in.c | 1 + net/dsa/dsa_priv.h | 1 + net/ethtool/ioctl.c | 1 + net/ipv4/nexthop.c | 1 + net/ipv6/ip6_fib.c | 1 + net/ipv6/seg6_local.c | 1 + net/iucv/af_iucv.c | 1 + net/kcm/kcmsock.c | 1 + net/netfilter/nfnetlink_hook.c | 1 + net/netfilter/nft_reject_netdev.c | 1 + net/netlink/af_netlink.c | 2 ++ net/packet/af_packet.c | 1 + net/rose/rose_in.c | 1 + net/sched/sch_frag.c | 1 + net/smc/smc_ib.c | 2 ++ net/smc/smc_ism.c | 1 + net/unix/af_unix.c | 1 + net/vmw_vsock/af_vsock.c | 1 + net/xdp/xskmap.c | 1 + net/xfrm/xfrm_state.c | 1 + net/xfrm/xfrm_user.c | 1 + 70 files changed, 80 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index be04d74037d2..f7bf311d7910 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -6,6 +6,7 @@ */ #include #include +#include #include #include diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 0c98dd3dee67..b79f816a7203 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -33,6 +33,7 @@ * SOFTWARE. */ +#include #include #include #include diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 7264f8c2f7d5..3141a9c85de5 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB /* Copyright (c) 2015 - 2021 Intel Corporation */ +#include + #include "osdep.h" #include "status.h" #include "hmc.h" diff --git a/drivers/infiniband/hw/irdma/uda.c b/drivers/infiniband/hw/irdma/uda.c index f5b1b6150cdc..7a9988ddbd01 100644 --- a/drivers/infiniband/hw/irdma/uda.c +++ b/drivers/infiniband/hw/irdma/uda.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB /* Copyright (c) 2016 - 2021 Intel Corporation */ +#include + #include "osdep.h" #include "status.h" #include "hmc.h" diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c index 6398e2f48579..e32111117a5e 100644 --- a/drivers/infiniband/hw/mlx5/doorbell.c +++ b/drivers/infiniband/hw/mlx5/doorbell.c @@ -32,6 +32,7 @@ #include #include +#include #include #include "mlx5_ib.h" diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index e5abbcfc1d57..29475cf8c7c3 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -30,6 +30,7 @@ * SOFTWARE. */ +#include #include #include #include diff --git a/drivers/net/amt.c b/drivers/net/amt.c index b732ee9a50ef..a1c7a8acd9c8 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c index 5566daefbff4..d558535390f9 100644 --- a/drivers/net/appletalk/ipddp.c +++ b/drivers/net/appletalk/ipddp.c @@ -23,6 +23,7 @@ * of the GNU General Public License, incorporated herein by reference. */ +#include #include #include #include diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 0f39ad2af81c..d483f1102a9e 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c index 876218752766..ac6772fe9746 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb.c @@ -8,6 +8,7 @@ * * Many thanks to Klaus Hitschler */ +#include #include #include #include diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index 013e9c02be71..991b9c6b6ce7 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/dsa/xrs700x/xrs700x.c b/drivers/net/dsa/xrs700x/xrs700x.c index 35fa19ddaf19..0730352cdd57 100644 --- a/drivers/net/dsa/xrs700x/xrs700x.c +++ b/drivers/net/dsa/xrs700x/xrs700x.c @@ -5,6 +5,7 @@ */ #include +#include #include #include #include diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 951c4c569a9b..4da31b1b84f9 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -9,6 +9,7 @@ #include #include +#include #include #include "bnxt_hsi.h" #include "bnxt.h" diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c b/drivers/net/ethernet/huawei/hinic/hinic_tx.c index a984a7a6dd2e..8d59babbf476 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c @@ -4,6 +4,7 @@ * Copyright(c) 2017 Huawei Technologies Co., Ltd */ +#include #include #include #include diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.c b/drivers/net/ethernet/intel/ice/ice_devlink.c index 1cfe918db8b9..716ec8616ff0 100644 --- a/drivers/net/ethernet/intel/ice/ice_devlink.c +++ b/drivers/net/ethernet/intel/ice/ice_devlink.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020, Intel Corporation. */ +#include + #include "ice.h" #include "ice_lib.h" #include "ice_devlink.h" diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index 1dd7e84f41f8..9520b140bdbf 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019, Intel Corporation. */ +#include + #include "ice_txrx_lib.h" #include "ice_eswitch.h" #include "ice_lib.h" diff --git a/drivers/net/ethernet/intel/igc/igc_xdp.c b/drivers/net/ethernet/intel/igc/igc_xdp.c index a8cf5374be47..aeeb34e64610 100644 --- a/drivers/net/ethernet/intel/igc/igc_xdp.c +++ b/drivers/net/ethernet/intel/igc/igc_xdp.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020, Intel Corporation. */ +#include #include #include "igc.h" diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index f1c10f2bda78..40acfe12adc9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -33,6 +33,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c index 50977f01a050..00449df98a5e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/qos.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. */ +#include #include "en.h" #include "params.h" diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index a8c252e2b252..302dc835ac3d 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -5,6 +5,7 @@ * Copyright 2005-2013 Solarflare Communications Inc. */ +#include #include #include #include diff --git a/drivers/net/ethernet/sfc/efx_channels.c b/drivers/net/ethernet/sfc/efx_channels.c index 3dbea028b325..b015d1f2e204 100644 --- a/drivers/net/ethernet/sfc/efx_channels.c +++ b/drivers/net/ethernet/sfc/efx_channels.c @@ -10,6 +10,7 @@ #include "net_driver.h" #include +#include #include "efx_channels.h" #include "efx.h" #include "efx_common.h" diff --git a/drivers/net/ethernet/sfc/efx_common.c b/drivers/net/ethernet/sfc/efx_common.c index f187631b2c5c..af37c990217e 100644 --- a/drivers/net/ethernet/sfc/efx_common.c +++ b/drivers/net/ethernet/sfc/efx_common.c @@ -9,6 +9,7 @@ */ #include "net_driver.h" +#include #include #include #include diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c index b0edb91bb10a..8297411e87ea 100644 --- a/drivers/net/hamradio/hdlcdrv.c +++ b/drivers/net/hamradio/hdlcdrv.c @@ -30,6 +30,7 @@ /*****************************************************************************/ #include +#include #include #include #include diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c index 3d59dac063ac..f90830d3dfa6 100644 --- a/drivers/net/hamradio/scc.c +++ b/drivers/net/hamradio/scc.c @@ -148,6 +148,7 @@ /* ----------------------------------------------------------------------- */ +#include #include #include #include diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index a1c77cc00416..ed0edf5884ef 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include /* For the statistics structure. */ diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index b4c64226b7ca..e0b1ab99a359 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/wireless/ath/ath11k/debugfs.c b/drivers/net/wireless/ath/ath11k/debugfs.c index dba055d085be..eb8b4f20c95e 100644 --- a/drivers/net/wireless/ath/ath11k/debugfs.c +++ b/drivers/net/wireless/ath/ath11k/debugfs.c @@ -3,6 +3,8 @@ * Copyright (c) 2018-2020 The Linux Foundation. All rights reserved. */ +#include + #include "debugfs.h" #include "core.h" diff --git a/drivers/net/wireless/realtek/rtw89/debug.c b/drivers/net/wireless/realtek/rtw89/debug.c index 1e85808aaf4b..be761157ea22 100644 --- a/drivers/net/wireless/realtek/rtw89/debug.c +++ b/drivers/net/wireless/realtek/rtw89/debug.c @@ -2,6 +2,8 @@ /* Copyright(c) 2019-2020 Realtek Corporation */ +#include + #include "coex.h" #include "debug.h" #include "fw.h" diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 731d31015b6a..347793626f19 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -18,6 +18,7 @@ * 6 Jun 1999 Cache readdir lookups in the page cache. -DaveM */ +#include #include #include #include diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 0d444a90f513..ea17fa1f31ec 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -10,6 +10,7 @@ * Split from fs/nfs/super.c by David Howells */ +#include #include #include #include diff --git a/fs/select.c b/fs/select.c index 945896d0ac9e..02cd8cb5e69f 100644 --- a/fs/select.c +++ b/fs/select.c @@ -15,6 +15,7 @@ * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). */ +#include #include #include #include diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 24496bc28e7b..a2b625960ffe 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -8,6 +8,7 @@ #define _BPF_LOCAL_STORAGE_H #include +#include #include #include #include diff --git a/include/linux/dsa/loop.h b/include/linux/dsa/loop.h index 5a3470bcc8a7..b8fef35591aa 100644 --- a/include/linux/dsa/loop.h +++ b/include/linux/dsa/loop.h @@ -2,6 +2,7 @@ #ifndef DSA_LOOP_H #define DSA_LOOP_H +#include #include #include #include diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 53ac7707ca70..3afcb128e064 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -21,6 +21,8 @@ #include #include +struct ip_tunnel_info; + #define SIN6_LEN_RFC2133 24 #define IPV6_MAXPLEN 65535 diff --git a/include/net/route.h b/include/net/route.h index 2e6c0e153e3a..4c858dcf1aa8 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -43,6 +43,7 @@ #define RT_CONN_FLAGS(sk) (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE)) #define RT_CONN_FLAGS_TOS(sk,tos) (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE)) +struct ip_tunnel_info; struct fib_nh; struct fib_info; struct uncached_list; diff --git a/include/net/sock.h b/include/net/sock.h index 37f878564d25..40f6406b9ca5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include @@ -249,6 +248,7 @@ struct sock_common { }; struct bpf_local_storage; +struct sk_filter; /** * struct sock - network layer representation of sockets diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index fff069d2ed1b..3057e1a4a11c 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -6,6 +6,7 @@ #ifndef _LINUX_XDP_SOCK_H #define _LINUX_XDP_SOCK_H +#include #include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 083be6af29d7..d7ed1dffa426 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index d515571b2afb..57d509d77cb4 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -24,6 +24,7 @@ SOFTWARE IS DISCLAIMED. */ +#include #include #include diff --git a/net/bluetooth/eir.h b/net/bluetooth/eir.h index 724662f8f8b1..05e2e917fc25 100644 --- a/net/bluetooth/eir.h +++ b/net/bluetooth/eir.h @@ -5,6 +5,8 @@ * Copyright (C) 2021 Intel Corporation */ +#include + void eir_create(struct hci_dev *hdev, u8 *data); u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr); diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 595fb3c9d6c3..369ed92dac99 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -20,6 +20,7 @@ SOFTWARE IS DISCLAIMED. */ +#include #include #include diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 4574c5cb1b59..dc50737b785b 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -29,6 +29,7 @@ #include #include +#include #include #include diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index db4ab2c2ce18..9b54d7d0bfc4 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index e12fd3cad619..2b8892d502f7 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -6,6 +6,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ +#include #include #include #include diff --git a/net/core/devlink.c b/net/core/devlink.c index 0a9349a02cad..492a26d3c3f1 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -7,6 +7,7 @@ * Copyright (c) 2016 Jiri Pirko */ +#include #include #include #include diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 257976cb55ce..de1109f2cfcf 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 2f7940bcf715..349480ef68a5 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -2,6 +2,7 @@ /* Copyright (c) 2016 Thomas Graf */ +#include #include #include #include diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index c9c45b935f99..f7cf74cdd3db 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -1,5 +1,6 @@ /* License: GPL */ +#include #include #include #include diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 5f88526ad61c..7b4d485aac7a 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -6,6 +6,7 @@ * Added /proc/sys/net/core directory entry (empty =) ). [MS] */ +#include #include #include #include diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 7ab788f41a3f..c59be5b04479 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -38,6 +38,7 @@ *******************************************************************************/ #include +#include #include #include #include diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 38ce5129a33d..0194a969c9b5 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -8,6 +8,7 @@ #define __DSA_PRIV_H #include +#include #include #include #include diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 9a113d893521..b2cdba1b4aae 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -8,6 +8,7 @@ */ #include +#include #include #include #include diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 1319d093cdda..eeafeccebb8d 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 0371d2c14145..463c37dea449 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -15,6 +15,7 @@ #define pr_fmt(fmt) "IPv6: " fmt +#include #include #include #include diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 2dc40b3f373e..a5eea182149d 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -7,6 +7,7 @@ * eBPF support: Mathieu Xhonneux */ +#include #include #include #include diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 49ecbe8d176a..a1760add5bf1 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -13,6 +13,7 @@ #define KMSG_COMPONENT "af_iucv" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#include #include #include #include diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 11a715d76a4f..71899e5a5a11 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index d5c719c9e36c..71e29adac48b 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include diff --git a/net/netfilter/nft_reject_netdev.c b/net/netfilter/nft_reject_netdev.c index d89f68754f42..61cd8c4ac385 100644 --- a/net/netfilter/nft_reject_netdev.c +++ b/net/netfilter/nft_reject_netdev.c @@ -4,6 +4,7 @@ * Copyright (c) 2020 Jose M. Guisado */ +#include #include #include #include diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4be2d97ff93e..7b344035bfe3 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -20,8 +20,10 @@ #include +#include #include #include +#include #include #include #include diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a1ffdb48cc47..3ca4f890371a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -49,6 +49,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include +#include #include #include #include diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index 6af786d66b03..4d67f36dce1b 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -9,6 +9,7 @@ * diagrams as the code is not obvious and probably very easy to break. */ #include +#include #include #include #include diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c index 8c06381391d6..cd85a69820b1 100644 --- a/net/sched/sch_frag.c +++ b/net/sched/sch_frag.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +#include #include #include #include diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index d93055ec17ae..905604c378ad 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -12,6 +12,8 @@ * Author(s): Ursula Braun */ +#include +#include #include #include #include diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index fd28cc498b98..a2084ecdb97e 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -6,6 +6,7 @@ * Copyright IBM Corp. 2018 */ +#include #include #include #include diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4d6e33bbd446..c19569819866 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include #include diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index ed0df839c38c..3235261f138d 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -85,6 +85,7 @@ * TCP_LISTEN - listening */ +#include #include #include #include diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index 2e48d0e094d9..65b53fb3de13 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -4,6 +4,7 @@ */ #include +#include #include #include #include diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index a2f4001221d1..0407272a990c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -14,6 +14,7 @@ * */ +#include #include #include #include diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 7c36cc1f3d79..e3e26f4da6c2 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -11,6 +11,7 @@ * */ +#include #include #include #include -- cgit v1.2.3 From 3ccdcee28415c4226de05438b4d89eb5514edf73 Mon Sep 17 00:00:00 2001 From: Haimin Zhang Date: Wed, 29 Dec 2021 19:20:02 +0800 Subject: bpf: Add missing map_get_next_key method to bloom filter map. Without it, kernel crashes in map_get_next_key(). Fixes: 9330986c0300 ("bpf: Add bloom filter map implementation") Reported-by: TCS Robot Signed-off-by: Haimin Zhang Signed-off-by: Alexei Starovoitov Acked-by: Joanne Koong Link: https://lore.kernel.org/bpf/1640776802-22421-1-git-send-email-tcs.kernel@gmail.com --- kernel/bpf/bloom_filter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 277a05e9c984..b141a1346f72 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -82,6 +82,11 @@ static int bloom_map_delete_elem(struct bpf_map *map, void *value) return -EOPNOTSUPP; } +static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EOPNOTSUPP; +} + static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) { u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits; @@ -192,6 +197,7 @@ const struct bpf_map_ops bloom_filter_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = bloom_map_alloc, .map_free = bloom_map_free, + .map_get_next_key = bloom_map_get_next_key, .map_push_elem = bloom_map_push_elem, .map_peek_elem = bloom_map_peek_elem, .map_pop_elem = bloom_map_pop_elem, -- cgit v1.2.3 From 0fe4b381a59ebc53522fce579b281a67a9e1bee6 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 24 Dec 2021 15:29:15 +0000 Subject: bpf: Allow bpf_local_storage to be used by sleepable programs Other maps like hashmaps are already available to sleepable programs. Sleepable BPF programs run under trace RCU. Allow task, sk and inode storage to be used from sleepable programs. This allows sleepable and non-sleepable programs to provide shareable annotations on kernel objects. Sleepable programs run in trace RCU where as non-sleepable programs run in a normal RCU critical section i.e. __bpf_prog_enter{_sleepable} and __bpf_prog_exit{_sleepable}) (rcu_read_lock or rcu_read_lock_trace). In order to make the local storage maps accessible to both sleepable and non-sleepable programs, one needs to call both call_rcu_tasks_trace and call_rcu to wait for both trace and classical RCU grace periods to expire before freeing memory. Paul's work on call_rcu_tasks_trace allows us to have per CPU queueing for call_rcu_tasks_trace. This behaviour can be achieved by setting rcupdate.rcu_task_enqueue_lim= boot parameter. In light of these new performance changes and to keep the local storage code simple, avoid adding a new flag for sleepable maps / local storage to select the RCU synchronization (trace / classical). Also, update the dereferencing of the pointers to use rcu_derference_check (with either the trace or normal RCU locks held) with a common bpf_rcu_lock_held helper method. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20211224152916.1550677-2-kpsingh@kernel.org --- include/linux/bpf_local_storage.h | 5 ++++ kernel/bpf/bpf_inode_storage.c | 6 ++++- kernel/bpf/bpf_local_storage.c | 50 +++++++++++++++++++++++++++++---------- kernel/bpf/bpf_task_storage.c | 6 ++++- kernel/bpf/verifier.c | 3 +++ net/core/bpf_sk_storage.c | 8 ++++++- 6 files changed, 62 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index a2b625960ffe..37b3906af8b1 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -17,6 +17,9 @@ #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 +#define bpf_rcu_lock_held() \ + (rcu_read_lock_held() || rcu_read_lock_trace_held() || \ + rcu_read_lock_bh_held()) struct bpf_local_storage_map_bucket { struct hlist_head list; raw_spinlock_t lock; @@ -162,4 +165,6 @@ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags); +void bpf_local_storage_free_rcu(struct rcu_head *rcu); + #endif /* _BPF_LOCAL_STORAGE_H */ diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 96ceed0e0fb5..e29d9e3d853e 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -17,6 +17,7 @@ #include #include #include +#include DEFINE_BPF_STORAGE_CACHE(inode_cache); @@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, if (!bsb) return NULL; - inode_storage = rcu_dereference(bsb->storage); + inode_storage = + rcu_dereference_check(bsb->storage, bpf_rcu_lock_held()); if (!inode_storage) return NULL; @@ -172,6 +174,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, { struct bpf_local_storage_data *sdata; + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; @@ -204,6 +207,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, BPF_CALL_2(bpf_inode_storage_delete, struct bpf_map *, map, struct inode *, inode) { + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!inode) return -EINVAL; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b305270b7a4b..71de2a89869c 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -11,6 +11,9 @@ #include #include #include +#include +#include +#include #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE) @@ -81,6 +84,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return NULL; } +void bpf_local_storage_free_rcu(struct rcu_head *rcu) +{ + struct bpf_local_storage *local_storage; + + local_storage = container_of(rcu, struct bpf_local_storage, rcu); + kfree_rcu(local_storage, rcu); +} + +static void bpf_selem_free_rcu(struct rcu_head *rcu) +{ + struct bpf_local_storage_elem *selem; + + selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + kfree_rcu(selem, rcu); +} + /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. @@ -93,7 +112,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, bool free_local_storage; void *owner; - smap = rcu_dereference(SDATA(selem)->smap); + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); owner = local_storage->owner; /* All uncharging on the owner must be done first. @@ -118,12 +137,12 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, * * Although the unlock will be done under * rcu_read_lock(), it is more intutivie to - * read if kfree_rcu(local_storage, rcu) is done + * read if the freeing of the storage is done * after the raw_spin_unlock_bh(&local_storage->lock). * * Hence, a "bool free_local_storage" is returned - * to the caller which then calls the kfree_rcu() - * after unlock. + * to the caller which then calls then frees the storage after + * all the RCU grace periods have expired. */ } hlist_del_init_rcu(&selem->snode); @@ -131,8 +150,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - kfree_rcu(selem, rcu); - + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); return free_local_storage; } @@ -146,7 +164,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) /* selem has already been unlinked from sk */ return; - local_storage = rcu_dereference(selem->local_storage); + local_storage = rcu_dereference_check(selem->local_storage, + bpf_rcu_lock_held()); raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( @@ -154,7 +173,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_local_storage) - kfree_rcu(local_storage, rcu); + call_rcu_tasks_trace(&local_storage->rcu, + bpf_local_storage_free_rcu); } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ -174,7 +194,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) /* selem has already be unlinked from smap */ return; - smap = rcu_dereference(SDATA(selem)->smap); + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, selem); raw_spin_lock_irqsave(&b->lock, flags); if (likely(selem_linked_to_map(selem))) @@ -213,12 +233,14 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem; /* Fast path (cache hit) */ - sdata = rcu_dereference(local_storage->cache[smap->cache_idx]); + sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx], + bpf_rcu_lock_held()); if (sdata && rcu_access_pointer(sdata->smap) == smap) return sdata; /* Slow path (cache miss) */ - hlist_for_each_entry_rcu(selem, &local_storage->list, snode) + hlist_for_each_entry_rcu(selem, &local_storage->list, snode, + rcu_read_lock_trace_held()) if (rcu_access_pointer(SDATA(selem)->smap) == smap) break; @@ -306,7 +328,8 @@ int bpf_local_storage_alloc(void *owner, * bucket->list, first_selem can be freed immediately * (instead of kfree_rcu) because * bpf_local_storage_map_free() does a - * synchronize_rcu() before walking the bucket->list. + * synchronize_rcu_mult (waiting for both sleepable and + * normal programs) before walking the bucket->list. * Hence, no one is accessing selem from the * bucket->list under rcu_read_lock(). */ @@ -342,7 +365,8 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, !map_value_has_spin_lock(&smap->map))) return ERR_PTR(-EINVAL); - local_storage = rcu_dereference(*owner_storage(smap, owner)); + local_storage = rcu_dereference_check(*owner_storage(smap, owner), + bpf_rcu_lock_held()); if (!local_storage || hlist_empty(&local_storage->list)) { /* Very first elem for the owner */ err = check_flags(NULL, map_flags); diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index bb69aea1a777..5da7bed0f5f6 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -17,6 +17,7 @@ #include #include #include +#include DEFINE_BPF_STORAGE_CACHE(task_cache); @@ -59,7 +60,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, struct bpf_local_storage *task_storage; struct bpf_local_storage_map *smap; - task_storage = rcu_dereference(task->bpf_storage); + task_storage = + rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held()); if (!task_storage) return NULL; @@ -229,6 +231,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, { struct bpf_local_storage_data *sdata; + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; @@ -260,6 +263,7 @@ BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, { int ret; + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!task) return -EINVAL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ca5cd0de804c..133599dfe2a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11874,6 +11874,9 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } break; case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_INODE_STORAGE: + case BPF_MAP_TYPE_SK_STORAGE: + case BPF_MAP_TYPE_TASK_STORAGE: break; default: verbose(env, diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index ea61dfe19c86..d9c37fd10809 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -13,6 +13,7 @@ #include #include #include +#include DEFINE_BPF_STORAGE_CACHE(sk_cache); @@ -22,7 +23,8 @@ bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit) struct bpf_local_storage *sk_storage; struct bpf_local_storage_map *smap; - sk_storage = rcu_dereference(sk->sk_bpf_storage); + sk_storage = + rcu_dereference_check(sk->sk_bpf_storage, bpf_rcu_lock_held()); if (!sk_storage) return NULL; @@ -258,6 +260,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, { struct bpf_local_storage_data *sdata; + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE) return (unsigned long)NULL; @@ -288,6 +291,7 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk, BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk) { + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!sk || !sk_fullsock(sk)) return -EINVAL; @@ -416,6 +420,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags) { + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (in_hardirq() || in_nmi()) return (unsigned long)NULL; @@ -425,6 +430,7 @@ BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, struct sock *, sk) { + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (in_hardirq() || in_nmi()) return -EPERM; -- cgit v1.2.3 From 3b80b73a4b3de38f72cd79e1a157449917f2bcb5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 29 Dec 2021 17:27:41 -0800 Subject: net: Add includes masked by netdevice.h including uapi/bpf.h Add missing includes unmasked by the subsequent change. Mostly network drivers missing an include for XDP_PACKET_HEADROOM. Signed-off-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211230012742.770642-2-kuba@kernel.org --- drivers/net/ethernet/amazon/ena/ena_netdev.h | 1 + drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 1 + drivers/net/ethernet/microsoft/mana/mana_en.c | 2 ++ drivers/net/ethernet/stmicro/stmmac/stmmac.h | 1 + drivers/net/ethernet/ti/cpsw_priv.h | 2 ++ include/net/ip6_fib.h | 1 + kernel/bpf/net_namespace.c | 1 + 7 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.h b/drivers/net/ethernet/amazon/ena/ena_netdev.h index 0c39fc2fa345..9391c7101fba 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.h +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.h @@ -14,6 +14,7 @@ #include #include #include +#include #include "ena_com.h" #include "ena_eth_com.h" diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c index 50bbe79fb93d..4367edbdd579 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "nic_reg.h" #include "nic.h" diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index c1d5a374b967..2ece9e90dc50 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Copyright (c) 2021, Microsoft Corporation. */ +#include + #include #include #include diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h index 4f5292cadf54..d42b6af32d6e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h @@ -22,6 +22,7 @@ #include #include #include +#include struct stmmac_resources { void __iomem *addr; diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h index f33c882eb70e..74555970730c 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.h +++ b/drivers/net/ethernet/ti/cpsw_priv.h @@ -6,6 +6,8 @@ #ifndef DRIVERS_NET_ETHERNET_TI_CPSW_PRIV_H_ #define DRIVERS_NET_ETHERNET_TI_CPSW_PRIV_H_ +#include + #include "davinci_cpdma.h" #define CPSW_DEBUG (NETIF_MSG_HW | NETIF_MSG_WOL | \ diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 83b8070d1cc9..a9a4ccc0cdb5 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -20,6 +20,7 @@ #include #include #include +#include #ifdef CONFIG_IPV6_MULTIPLE_TABLES #define FIB6_TABLE_HASHSZ 256 diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 542f275bf252..868cc2c43899 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include -- cgit v1.2.3 From 9e6b19a66d9b6b94395478fe79c5a3ccba181ad3 Mon Sep 17 00:00:00 2001 From: Leon Huayra Date: Wed, 29 Dec 2021 22:44:22 +0800 Subject: bpf: Fix typo in a comment in bpf lpm_trie. Fix typo in a comment in trie_update_elem(). Signed-off-by: Leon Huayra Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211229144422.70339-1-hffilwlqm@gmail.com --- kernel/bpf/lpm_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 423549d2c52e..5763cc7ac4f1 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -412,7 +412,7 @@ static int trie_update_elem(struct bpf_map *map, rcu_assign_pointer(im_node->child[1], node); } - /* Finally, assign the intermediate node to the determined spot */ + /* Finally, assign the intermediate node to the determined slot */ rcu_assign_pointer(*slot, im_node); out: -- cgit v1.2.3 From 5ef3dd20555e8e878ac390a71e658db5fd02845c Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 21 Dec 2021 07:39:31 -0800 Subject: livepatch: Fix kobject refcount bug on klp_init_patch_early failure path When enabling a klp patch with klp_enable_patch(), klp_init_patch_early() is invoked to initialize the kobjects for the patch itself, as well as the 'struct klp_object' and 'struct klp_func' objects that comprise it. However, there are some error paths in klp_enable_patch() where some kobjects may have been initialized with kobject_init(), but an error code is still returned due to e.g. a 'struct klp_object' having a NULL funcs pointer. In these paths, the initial reference of the kobject of the 'struct klp_patch' may never be released, along with one or more of its objects and their functions, as kobject_put() is not invoked on the cleanup path if klp_init_patch_early() returns an error code. For example, if an object entry such as the following were added to the sample livepatch module's klp patch, it would cause the vmlinux klp_object, and its klp_func which updates 'cmdline_proc_show', to never be released: static struct klp_object objs[] = { { /* name being NULL means vmlinux */ .funcs = funcs, }, { /* NULL funcs -- would cause reference leak */ .name = "kvm", }, { } }; Without this change, if CONFIG_DEBUG_KOBJECT is enabled, and the sample klp patch is loaded, the kobjects (the patch, the vmlinux 'struct klp_object', and its func) are observed as initialized, but never released, in the dmesg log output. With the change, these kobject references no longer fail to be released as the error case is properly handled before they are initialized. Signed-off-by: David Vernet Reviewed-by: Petr Mladek Acked-by: Miroslav Benes Acked-by: Josh Poimboeuf Signed-off-by: Petr Mladek --- kernel/livepatch/core.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 335d988bd811..7d228cdb44c5 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -862,14 +862,11 @@ static void klp_init_object_early(struct klp_patch *patch, list_add_tail(&obj->node, &patch->obj_list); } -static int klp_init_patch_early(struct klp_patch *patch) +static void klp_init_patch_early(struct klp_patch *patch) { struct klp_object *obj; struct klp_func *func; - if (!patch->objs) - return -EINVAL; - INIT_LIST_HEAD(&patch->list); INIT_LIST_HEAD(&patch->obj_list); kobject_init(&patch->kobj, &klp_ktype_patch); @@ -879,20 +876,12 @@ static int klp_init_patch_early(struct klp_patch *patch) init_completion(&patch->finish); klp_for_each_object_static(patch, obj) { - if (!obj->funcs) - return -EINVAL; - klp_init_object_early(patch, obj); klp_for_each_func_static(obj, func) { klp_init_func_early(obj, func); } } - - if (!try_module_get(patch->mod)) - return -ENODEV; - - return 0; } static int klp_init_patch(struct klp_patch *patch) @@ -1024,10 +1013,17 @@ err: int klp_enable_patch(struct klp_patch *patch) { int ret; + struct klp_object *obj; - if (!patch || !patch->mod) + if (!patch || !patch->mod || !patch->objs) return -EINVAL; + klp_for_each_object_static(patch, obj) { + if (!obj->funcs) + return -EINVAL; + } + + if (!is_livepatch_module(patch->mod)) { pr_err("module %s is not marked as a livepatch module\n", patch->mod->name); @@ -1051,11 +1047,10 @@ int klp_enable_patch(struct klp_patch *patch) return -EINVAL; } - ret = klp_init_patch_early(patch); - if (ret) { - mutex_unlock(&klp_mutex); - return ret; - } + if (!try_module_get(patch->mod)) + return -ENODEV; + + klp_init_patch_early(patch); ret = klp_init_patch(patch); if (ret) -- cgit v1.2.3 From 50a0f3f55e382b313e7cbebdf8ccf1593296e16f Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Sat, 25 Dec 2021 10:51:15 +0800 Subject: livepatch: Fix missing unlock on error in klp_enable_patch() Add missing unlock when try_module_get() fails in klp_enable_patch(). Fixes: 5ef3dd20555e8e8 ("livepatch: Fix kobject refcount bug on klp_init_patch_early failure path") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Acked-by: David Vernet Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211225025115.475348-1-yangyingliang@huawei.com --- kernel/livepatch/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 7d228cdb44c5..585494ec464f 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -1047,8 +1047,10 @@ int klp_enable_patch(struct klp_patch *patch) return -EINVAL; } - if (!try_module_get(patch->mod)) + if (!try_module_get(patch->mod)) { + mutex_unlock(&klp_mutex); return -ENODEV; + } klp_init_patch_early(patch); -- cgit v1.2.3 From 2deb55d9f57bb7a877c0d77115cc4077e1e974ff Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 4 Jan 2022 16:11:19 +0000 Subject: swiotlb: Add CONFIG_HAS_IOMEM check around swiotlb_mem_remap() HAS_IOMEM option may not be selected on some platforms (e.g, s390) and this will cause compilation failure due to missing memremap() implementation. Fix it by stubbing out swiotlb_mem_remap when CONFIG_HAS_IOMEM is not set. Reported-by: kernel test robot Signed-off-by: Tianyu Lan Acked-by: Christoph Hellwig Signed-off-by: Wei Liu --- kernel/dma/swiotlb.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b36c1cdd0c4f..f1e7ea160b43 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -163,6 +163,7 @@ static inline unsigned long nr_slots(u64 val) * when swiotlb_unencrypted_base is set. (e.g. for Hyper-V AMD SEV-SNP * Isolation VMs). */ +#ifdef CONFIG_HAS_IOMEM static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes) { void *vaddr = NULL; @@ -178,6 +179,12 @@ static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes) return vaddr; } +#else +static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes) +{ + return NULL; +} +#endif /* * Early SWIOTLB allocation may be too early to allow an architecture to -- cgit v1.2.3 From e60b0d12a95dcf16a63225cead4541567f5cb517 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 5 Jan 2022 11:35:13 -0800 Subject: bpf: Don't promote bogus looking registers after null check. If we ever get to a point again where we convert a bogus looking _or_null typed register containing a non-zero fixed or variable offset, then lets not reset these bounds to zero since they are not and also don't promote the register to a type, but instead leave it as _or_null. Converting to a unknown register could be an avenue as well, but then if we run into this case it would allow to leak a kernel pointer this way. Fixes: f1174f77b50c ("bpf/verifier: rework value tracking") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b70c66c6db3b..c8d9e761173b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9079,15 +9079,15 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, { if (type_may_be_null(reg->type) && reg->id == id && !WARN_ON_ONCE(!reg->id)) { - /* Old offset (both fixed and variable parts) should - * have been known-zero, because we don't allow pointer - * arithmetic on pointers that might be NULL. - */ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0) || reg->off)) { - __mark_reg_known_zero(reg); - reg->off = 0; + /* Old offset (both fixed and variable parts) should + * have been known-zero, because we don't allow pointer + * arithmetic on pointers that might be NULL. If we + * see this happening, don't convert the register. + */ + return; } if (is_null) { reg->type = SCALAR_VALUE; -- cgit v1.2.3 From a5bebc4f00dee47113eed48098c68e88b5ba70e8 Mon Sep 17 00:00:00 2001 From: Kris Van Hees Date: Wed, 5 Jan 2022 16:01:50 -0500 Subject: bpf: Fix verifier support for validation of async callbacks Commit bfc6bb74e4f1 ("bpf: Implement verifier support for validation of async callbacks.") added support for BPF_FUNC_timer_set_callback to the __check_func_call() function. The test in __check_func_call() is flaweed because it can mis-interpret a regular BPF-to-BPF pseudo-call as a BPF_FUNC_timer_set_callback callback call. Consider the conditional in the code: if (insn->code == (BPF_JMP | BPF_CALL) && insn->imm == BPF_FUNC_timer_set_callback) { The BPF_FUNC_timer_set_callback has value 170. This means that if you have a BPF program that contains a pseudo-call with an instruction delta of 170, this conditional will be found to be true by the verifier, and it will interpret the pseudo-call as a callback. This leads to a mess with the verification of the program because it makes the wrong assumptions about the nature of this call. Solution: include an explicit check to ensure that insn->src_reg == 0. This ensures that calls cannot be mis-interpreted as an async callback call. Fixes: bfc6bb74e4f1 ("bpf: Implement verifier support for validation of async callbacks.") Signed-off-by: Kris Van Hees Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220105210150.GH1559@oracle.com --- kernel/bpf/verifier.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c8d9e761173b..bfb45381fb3f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6031,6 +6031,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn } if (insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback) { struct bpf_verifier_state *async_cb; -- cgit v1.2.3 From 823e670f7ed616d0ce993075c8afe0217885f79d Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 23 Dec 2021 16:04:38 +0530 Subject: tracing: Fix check for trace_percpu_buffer validity in get_trace_buf() With the new osnoise tracer, we are seeing the below splat: Kernel attempted to read user page (c7d880000) - exploit attempt? (uid: 0) BUG: Unable to handle kernel data access on read at 0xc7d880000 Faulting instruction address: 0xc0000000002ffa10 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries ... NIP [c0000000002ffa10] __trace_array_vprintk.part.0+0x70/0x2f0 LR [c0000000002ff9fc] __trace_array_vprintk.part.0+0x5c/0x2f0 Call Trace: [c0000008bdd73b80] [c0000000001c49cc] put_prev_task_fair+0x3c/0x60 (unreliable) [c0000008bdd73be0] [c000000000301430] trace_array_printk_buf+0x70/0x90 [c0000008bdd73c00] [c0000000003178b0] trace_sched_switch_callback+0x250/0x290 [c0000008bdd73c90] [c000000000e70d60] __schedule+0x410/0x710 [c0000008bdd73d40] [c000000000e710c0] schedule+0x60/0x130 [c0000008bdd73d70] [c000000000030614] interrupt_exit_user_prepare_main+0x264/0x270 [c0000008bdd73de0] [c000000000030a70] syscall_exit_prepare+0x150/0x180 [c0000008bdd73e10] [c00000000000c174] system_call_vectored_common+0xf4/0x278 osnoise tracer on ppc64le is triggering osnoise_taint() for negative duration in get_int_safe_duration() called from trace_sched_switch_callback()->thread_exit(). The problem though is that the check for a valid trace_percpu_buffer is incorrect in get_trace_buf(). The check is being done after calculating the pointer for the current cpu, rather than on the main percpu pointer. Fix the check to be against trace_percpu_buffer. Link: https://lkml.kernel.org/r/a920e4272e0b0635cf20c444707cbce1b2c8973d.1640255304.git.naveen.n.rao@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: e2ace001176dc9 ("tracing: Choose static tp_printk buffer by explicit nesting count") Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 88de94da596b..e1f55851e53f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3217,7 +3217,7 @@ static char *get_trace_buf(void) { struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer); - if (!buffer || buffer->nesting >= 4) + if (!trace_percpu_buffer || buffer->nesting >= 4) return NULL; buffer->nesting++; -- cgit v1.2.3 From f28439db470cca8b6b082239314e9fd10bd39034 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 23 Dec 2021 16:04:39 +0530 Subject: tracing: Tag trace_percpu_buffer as a percpu pointer Tag trace_percpu_buffer as a percpu pointer to resolve warnings reported by sparse: /linux/kernel/trace/trace.c:3218:46: warning: incorrect type in initializer (different address spaces) /linux/kernel/trace/trace.c:3218:46: expected void const [noderef] __percpu *__vpp_verify /linux/kernel/trace/trace.c:3218:46: got struct trace_buffer_struct * /linux/kernel/trace/trace.c:3234:9: warning: incorrect type in initializer (different address spaces) /linux/kernel/trace/trace.c:3234:9: expected void const [noderef] __percpu *__vpp_verify /linux/kernel/trace/trace.c:3234:9: got int * Link: https://lkml.kernel.org/r/ebabd3f23101d89cb75671b68b6f819f5edc830b.1640255304.git.naveen.n.rao@linux.vnet.ibm.com Cc: stable@vger.kernel.org Reported-by: kernel test robot Fixes: 07d777fe8c398 ("tracing: Add percpu buffers for trace_printk()") Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e1f55851e53f..78ea542ce3bc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3207,7 +3207,7 @@ struct trace_buffer_struct { char buffer[4][TRACE_BUF_SIZE]; }; -static struct trace_buffer_struct *trace_percpu_buffer; +static struct trace_buffer_struct __percpu *trace_percpu_buffer; /* * This allows for lockless recording. If we're nested too deeply, then @@ -3236,7 +3236,7 @@ static void put_trace_buf(void) static int alloc_percpu_trace_buffer(void) { - struct trace_buffer_struct *buffers; + struct trace_buffer_struct __percpu *buffers; if (trace_percpu_buffer) return 0; -- cgit v1.2.3 From d53ad5d8b218a885e95080d4d3d556b16b91b1b9 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Mon, 3 Jan 2022 16:08:09 +0100 Subject: xdp: Move conversion to xdp_frame out of map functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All map redirect functions except XSK maps convert xdp_buff to xdp_frame before enqueueing it. So move this conversion of out the map functions and into xdp_do_redirect(). This removes a bit of duplicated code, but more importantly it makes it possible to support caller-allocated xdp_frame structures, which will be added in a subsequent commit. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220103150812.87914-5-toke@redhat.com --- include/linux/bpf.h | 20 ++++++++++---------- kernel/bpf/cpumap.c | 8 +------- kernel/bpf/devmap.c | 32 +++++++++++--------------------- net/core/filter.c | 24 +++++++++++++++++------- 4 files changed, 39 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 26753139d5b4..6e947cd91152 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1669,17 +1669,17 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); struct btf *bpf_get_btf_vmlinux(void); /* Map specifics */ -struct xdp_buff; +struct xdp_frame; struct sk_buff; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; void __dev_flush(void); -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx); -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx); -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); @@ -1688,7 +1688,7 @@ int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb, bool exclude_ingress); void __cpu_map_flush(void); -int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx); int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, struct sk_buff *skb); @@ -1866,26 +1866,26 @@ static inline void __dev_flush(void) { } -struct xdp_buff; +struct xdp_frame; struct bpf_dtab_netdev; struct bpf_cpu_map_entry; static inline -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; } static inline -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { return 0; @@ -1913,7 +1913,7 @@ static inline void __cpu_map_flush(void) } static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, - struct xdp_buff *xdp, + struct xdp_frame *xdpf, struct net_device *dev_rx) { return 0; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 0421061d95f1..b3e6b9422238 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -746,15 +746,9 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) list_add(&bq->flush_node, flush_list); } -int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx) { - struct xdp_frame *xdpf; - - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - /* Info needed when constructing SKB on remote CPU */ xdpf->dev_rx = dev_rx; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 6feea293ff10..fe019dbdb3f0 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -467,24 +467,19 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, bq->q[bq->count++] = xdpf; } -static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { - struct xdp_frame *xdpf; int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; - err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + err = xdp_ok_fwd_dev(dev, xdpf->len); if (unlikely(err)) return err; - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - bq_enqueue(dev, xdpf, dev_rx, xdp_prog); return 0; } @@ -520,27 +515,27 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev return act; } -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { - return __xdp_enqueue(dev, xdp, dev_rx, NULL); + return __xdp_enqueue(dev, xdpf, dev_rx, NULL); } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct net_device *dev = dst->dev; - return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); + return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog); } -static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp) +static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) { if (!obj || !obj->dev->netdev_ops->ndo_xdp_xmit) return false; - if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data)) + if (xdp_ok_fwd_dev(obj->dev, xdpf->len)) return false; return true; @@ -586,14 +581,13 @@ static int get_upper_ifindexes(struct net_device *dev, int *indexes) return n; } -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; - struct xdp_frame *xdpf; int num_excluded = 0; unsigned int i; int err; @@ -603,15 +597,11 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, excluded_devices[num_excluded++] = dev_rx->ifindex; } - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); - if (!is_valid_dst(dst, xdp)) + if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) @@ -634,7 +624,7 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, lockdep_is_held(&dtab->index_lock)) { - if (!is_valid_dst(dst, xdp)) + if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, diff --git a/net/core/filter.c b/net/core/filter.c index cac2be559ab0..e2b83056246c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3964,12 +3964,24 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; u32 map_id = ri->map_id; + struct xdp_frame *xdpf; struct bpf_map *map; int err; ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; + if (map_type == BPF_MAP_TYPE_XSKMAP) { + err = __xsk_map_redirect(fwd, xdp); + goto out; + } + + xdpf = xdp_convert_buff_to_frame(xdp); + if (unlikely(!xdpf)) { + err = -EOVERFLOW; + goto err; + } + switch (map_type) { case BPF_MAP_TYPE_DEVMAP: fallthrough; @@ -3977,17 +3989,14 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, map = READ_ONCE(ri->map); if (unlikely(map)) { WRITE_ONCE(ri->map, NULL); - err = dev_map_enqueue_multi(xdp, dev, map, + err = dev_map_enqueue_multi(xdpf, dev, map, ri->flags & BPF_F_EXCLUDE_INGRESS); } else { - err = dev_map_enqueue(fwd, xdp, dev); + err = dev_map_enqueue(fwd, xdpf, dev); } break; case BPF_MAP_TYPE_CPUMAP: - err = cpu_map_enqueue(fwd, xdp, dev); - break; - case BPF_MAP_TYPE_XSKMAP: - err = __xsk_map_redirect(fwd, xdp); + err = cpu_map_enqueue(fwd, xdpf, dev); break; case BPF_MAP_TYPE_UNSPEC: if (map_id == INT_MAX) { @@ -3996,7 +4005,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, err = -EINVAL; break; } - err = dev_xdp_enqueue(fwd, xdp, dev); + err = dev_xdp_enqueue(fwd, xdpf, dev); break; } fallthrough; @@ -4004,6 +4013,7 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, err = -EBADRQC; } +out: if (unlikely(err)) goto err; -- cgit v1.2.3 From 1756d7994ad85c2479af6ae5a9750b92324685af Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Jan 2022 11:02:28 -1000 Subject: cgroup: Use open-time credentials for process migraton perm checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cgroup process migration permission checks are performed at write time as whether a given operation is allowed or not is dependent on the content of the write - the PID. This currently uses current's credentials which is a potential security weakness as it may allow scenarios where a less privileged process tricks a more privileged one into writing into a fd that it created. This patch makes both cgroup2 and cgroup1 process migration interfaces to use the credentials saved at the time of open (file->f_cred) instead of current's. Reported-by: "Eric W. Biederman" Suggested-by: Linus Torvalds Fixes: 187fe84067bd ("cgroup: require write perm on common ancestor when moving processes on the default hierarchy") Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 7 ++++--- kernel/cgroup/cgroup.c | 9 ++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 81c9e0685948..0e7369103ba6 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -504,10 +504,11 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, goto out_unlock; /* - * Even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. + * Even if we're attaching all tasks in the thread group, we only need + * to check permissions on one of them. Check permissions using the + * credentials from file open to protect against inherited fd attacks. */ - cred = current_cred(); + cred = of->file->f_cred; tcred = get_task_cred(task); if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 919194de39c8..2632e46da1d4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4892,6 +4892,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, { struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; + const struct cred *saved_cred; ssize_t ret; bool locked; @@ -4909,9 +4910,15 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); - /* process and thread migrations follow same delegation rule */ + /* + * Process and thread migrations follow same delegation rule. Check + * permissions using the credentials from file open to protect against + * inherited fd attacks. + */ + saved_cred = override_creds(of->file->f_cred); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, of->file->f_path.dentry->d_sb, threadgroup); + revert_creds(saved_cred); if (ret) goto out_finish; -- cgit v1.2.3 From 0d2b5955b36250a9428c832664f2079cbf723bec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Jan 2022 11:02:29 -1000 Subject: cgroup: Allocate cgroup_file_ctx for kernfs_open_file->priv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit of->priv is currently used by each interface file implementation to store private information. This patch collects the current two private data usages into struct cgroup_file_ctx which is allocated and freed by the common path. This allows generic private data which applies to multiple files, which will be used to in the following patch. Note that cgroup_procs iterator is now embedded as procs.iter in the new cgroup_file_ctx so that it doesn't need to be allocated and freed separately. v2: union dropped from cgroup_file_ctx and the procs iterator is embedded in cgroup_file_ctx as suggested by Linus. v3: Michal pointed out that cgroup1's procs pidlist uses of->priv too. Converted. Didn't change to embedded allocation as cgroup1 pidlists get stored for caching. Signed-off-by: Tejun Heo Cc: Linus Torvalds Reviewed-by: Michal Koutný --- kernel/cgroup/cgroup-internal.h | 17 +++++++++++++ kernel/cgroup/cgroup-v1.c | 26 ++++++++++---------- kernel/cgroup/cgroup.c | 53 ++++++++++++++++++++++++++--------------- 3 files changed, 65 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index bfbeabc17a9d..cf637bc4ab45 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -65,6 +65,23 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) return container_of(kfc, struct cgroup_fs_context, kfc); } +struct cgroup_pidlist; + +struct cgroup_file_ctx { + struct { + void *trigger; + } psi; + + struct { + bool started; + struct css_task_iter iter; + } procs; + + struct { + struct cgroup_pidlist *pidlist; + } procs1; +}; + /* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 0e7369103ba6..41e0837a5a0b 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -394,6 +394,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * next pid to display, if any */ struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; @@ -403,25 +404,24 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) mutex_lock(&cgrp->pidlist_mutex); /* - * !NULL @of->priv indicates that this isn't the first start() - * after open. If the matching pidlist is around, we can use that. - * Look for it. Note that @of->priv can't be used directly. It - * could already have been destroyed. + * !NULL @ctx->procs1.pidlist indicates that this isn't the first + * start() after open. If the matching pidlist is around, we can use + * that. Look for it. Note that @ctx->procs1.pidlist can't be used + * directly. It could already have been destroyed. */ - if (of->priv) - of->priv = cgroup_pidlist_find(cgrp, type); + if (ctx->procs1.pidlist) + ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); /* * Either this is the first start() after open or the matching * pidlist has been destroyed inbetween. Create a new one. */ - if (!of->priv) { - ret = pidlist_array_load(cgrp, type, - (struct cgroup_pidlist **)&of->priv); + if (!ctx->procs1.pidlist) { + ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); if (ret) return ERR_PTR(ret); } - l = of->priv; + l = ctx->procs1.pidlist; if (pid) { int end = l->length; @@ -449,7 +449,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_pidlist *l = ctx->procs1.pidlist; if (l) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, @@ -460,7 +461,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_pidlist *l = ctx->procs1.pidlist; pid_t *p = v; pid_t *end = l->list + l->length; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 2632e46da1d4..a84631d08d98 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3630,6 +3630,7 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { + struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; @@ -3648,7 +3649,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return PTR_ERR(new); } - psi_trigger_replace(&of->priv, new); + psi_trigger_replace(&ctx->psi.trigger, new); cgroup_put(cgrp); @@ -3679,12 +3680,16 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { - return psi_trigger_poll(&of->priv, of->file, pt); + struct cgroup_file_ctx *ctx = of->priv; + + return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } static void cgroup_pressure_release(struct kernfs_open_file *of) { - psi_trigger_replace(&of->priv, NULL); + struct cgroup_file_ctx *ctx = of->priv; + + psi_trigger_replace(&ctx->psi.trigger, NULL); } bool cgroup_psi_enabled(void) @@ -3811,18 +3816,31 @@ static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); + struct cgroup_file_ctx *ctx; + int ret; - if (cft->open) - return cft->open(of); - return 0; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + of->priv = ctx; + + if (!cft->open) + return 0; + + ret = cft->open(of); + if (ret) + kfree(ctx); + return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); + struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); + kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, @@ -4751,21 +4769,21 @@ void css_task_iter_end(struct css_task_iter *it) static void cgroup_procs_release(struct kernfs_open_file *of) { - if (of->priv) { - css_task_iter_end(of->priv); - kfree(of->priv); - } + struct cgroup_file_ctx *ctx = of->priv; + + if (ctx->procs.started) + css_task_iter_end(&ctx->procs.iter); } static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; if (pos) (*pos)++; - return css_task_iter_next(it); + return css_task_iter_next(&ctx->procs.iter); } static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, @@ -4773,21 +4791,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct css_task_iter *it = &ctx->procs.iter; /* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos. */ - if (!it) { + if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); - - it = kzalloc(sizeof(*it), GFP_KERNEL); - if (!it) - return ERR_PTR(-ENOMEM); - of->priv = it; css_task_iter_start(&cgrp->self, iter_flags, it); + ctx->procs.started = true; } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); -- cgit v1.2.3 From e57457641613fef0d147ede8bd6a3047df588b95 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Jan 2022 11:02:29 -1000 Subject: cgroup: Use open-time cgroup namespace for process migration perm checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cgroup process migration permission checks are performed at write time as whether a given operation is allowed or not is dependent on the content of the write - the PID. This currently uses current's cgroup namespace which is a potential security weakness as it may allow scenarios where a less privileged process tricks a more privileged one into writing into a fd that it created. This patch makes cgroup remember the cgroup namespace at the time of open and uses it for migration permission checks instad of current's. Note that this only applies to cgroup2 as cgroup1 doesn't have namespace support. This also fixes a use-after-free bug on cgroupns reported in https://lore.kernel.org/r/00000000000048c15c05d0083397@google.com Note that backporting this fix also requires the preceding patch. Reported-by: "Eric W. Biederman" Suggested-by: Linus Torvalds Cc: Michal Koutný Cc: Oleg Nesterov Reviewed-by: Michal Koutný Reported-by: syzbot+50f5cf33a284ce738b62@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/00000000000048c15c05d0083397@google.com Fixes: 5136f6365ce3 ("cgroup: implement "nsdelegate" mount option") Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 2 ++ kernel/cgroup/cgroup.c | 28 +++++++++++++++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index cf637bc4ab45..6e36e854b512 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -68,6 +68,8 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) struct cgroup_pidlist; struct cgroup_file_ctx { + struct cgroup_namespace *ns; + struct { void *trigger; } psi; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a84631d08d98..cafb8c114a21 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3822,14 +3822,19 @@ static int cgroup_file_open(struct kernfs_open_file *of) ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; + + ctx->ns = current->nsproxy->cgroup_ns; + get_cgroup_ns(ctx->ns); of->priv = ctx; if (!cft->open) return 0; ret = cft->open(of); - if (ret) + if (ret) { + put_cgroup_ns(ctx->ns); kfree(ctx); + } return ret; } @@ -3840,13 +3845,14 @@ static void cgroup_file_release(struct kernfs_open_file *of) if (cft->release) cft->release(of); + put_cgroup_ns(ctx->ns); kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; @@ -3863,7 +3869,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && - ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) @@ -4853,9 +4859,9 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb) + struct super_block *sb, + struct cgroup_namespace *ns) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *com_cgrp = src_cgrp; int ret; @@ -4884,11 +4890,12 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, static int cgroup_attach_permissions(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb, bool threadgroup) + struct super_block *sb, bool threadgroup, + struct cgroup_namespace *ns) { int ret = 0; - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb); + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); if (ret) return ret; @@ -4905,6 +4912,7 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp, static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, bool threadgroup) { + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; const struct cred *saved_cred; @@ -4932,7 +4940,8 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, */ saved_cred = override_creds(of->file->f_cred); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, threadgroup); + of->file->f_path.dentry->d_sb, + threadgroup, ctx->ns); revert_creds(saved_cred); if (ret) goto out_finish; @@ -6152,7 +6161,8 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) goto err; ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, - !(kargs->flags & CLONE_THREAD)); + !(kargs->flags & CLONE_THREAD), + current->nsproxy->cgroup_ns); if (ret) goto err; -- cgit v1.2.3 From 0da41f7348fff193d01d031ce255088fa98324b7 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 25 Dec 2021 00:09:31 +0000 Subject: cgroup: rstat: explicitly put loop variant in while MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of do while unconditionally, let's put the loop variant in while. Signed-off-by: Wei Yang Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 1abe74114527..bc6993258271 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -124,12 +124,10 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, prstatc = cgroup_rstat_cpu(parent, cpu); nextp = &prstatc->updated_children; - while (true) { + while (*nextp != pos) { struct cgroup_rstat_cpu *nrstatc; nrstatc = cgroup_rstat_cpu(*nextp, cpu); - if (*nextp == pos) - break; WARN_ON_ONCE(*nextp == parent); nextp = &nrstatc->updated_next; } -- cgit v1.2.3 From f5f60d235e7058da13a643c33fc7599c05ec0b73 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 25 Dec 2021 00:09:32 +0000 Subject: cgroup/rstat: check updated_next only for root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit dc26532aed0a ("cgroup: rstat: punt root-level optimization to individual controllers"), each rstat on updated_children list has its ->updated_next not NULL. This means we can remove the check on ->updated_next, if we make sure the subtree from @root is on list, which could be done by checking updated_next for root. tj: Coding style fixes. Signed-off-by: Wei Yang Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index bc6993258271..9d331ba44870 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -88,6 +88,7 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, struct cgroup *root, int cpu) { struct cgroup_rstat_cpu *rstatc; + struct cgroup *parent; if (pos == root) return NULL; @@ -96,10 +97,14 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * We're gonna walk down to the first leaf and visit/remove it. We * can pick whatever unvisited node as the starting point. */ - if (!pos) + if (!pos) { pos = root; - else + /* return NULL if this subtree is not on-list */ + if (!cgroup_rstat_cpu(pos, cpu)->updated_next) + return NULL; + } else { pos = cgroup_parent(pos); + } /* walk down to the first leaf */ while (true) { @@ -115,31 +120,25 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * However, due to the way we traverse, @pos will be the first * child in most cases. The only exception is @root. */ - if (rstatc->updated_next) { - struct cgroup *parent = cgroup_parent(pos); - - if (parent) { - struct cgroup_rstat_cpu *prstatc; - struct cgroup **nextp; - - prstatc = cgroup_rstat_cpu(parent, cpu); - nextp = &prstatc->updated_children; - while (*nextp != pos) { - struct cgroup_rstat_cpu *nrstatc; - - nrstatc = cgroup_rstat_cpu(*nextp, cpu); - WARN_ON_ONCE(*nextp == parent); - nextp = &nrstatc->updated_next; - } - *nextp = rstatc->updated_next; - } + parent = cgroup_parent(pos); + if (parent) { + struct cgroup_rstat_cpu *prstatc; + struct cgroup **nextp; - rstatc->updated_next = NULL; - return pos; + prstatc = cgroup_rstat_cpu(parent, cpu); + nextp = &prstatc->updated_children; + while (*nextp != pos) { + struct cgroup_rstat_cpu *nrstatc; + + nrstatc = cgroup_rstat_cpu(*nextp, cpu); + WARN_ON_ONCE(*nextp == parent); + nextp = &nrstatc->updated_next; + } + *nextp = rstatc->updated_next; } - /* only happens for @root */ - return NULL; + rstatc->updated_next = NULL; + return pos; } /* see cgroup_rstat_flush() */ -- cgit v1.2.3 From 703f7066f40599c290babdb79dd61319264987e9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 7 Dec 2021 13:17:33 +0100 Subject: random: remove unused irq_flags argument from add_interrupt_randomness() Since commit ee3e00e9e7101 ("random: use registers from interrupted code for CPU's w/o a cycle counter") the irq_flags argument is no longer used. Remove unused irq_flags. Cc: Borislav Petkov Cc: Dave Hansen Cc: Dexuan Cui Cc: H. Peter Anvin Cc: Haiyang Zhang Cc: Ingo Molnar Cc: K. Y. Srinivasan Cc: Stephen Hemminger Cc: Thomas Gleixner Cc: Wei Liu Cc: linux-hyperv@vger.kernel.org Cc: x86@kernel.org Signed-off-by: Sebastian Andrzej Siewior Acked-by: Wei Liu Signed-off-by: Jason A. Donenfeld --- arch/x86/kernel/cpu/mshyperv.c | 2 +- drivers/char/random.c | 4 ++-- drivers/hv/vmbus_drv.c | 2 +- include/linux/random.h | 2 +- kernel/irq/handle.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index ff55df60228f..2a0f83678911 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -79,7 +79,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) inc_irq_stat(hyperv_stimer0_count); if (hv_stimer0_handler) hv_stimer0_handler(); - add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0); + add_interrupt_randomness(HYPERV_STIMER0_VECTOR); ack_APIC_irq(); set_irq_regs(old_regs); diff --git a/drivers/char/random.c b/drivers/char/random.c index c81485e2f126..4b0753eb8bfb 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -200,7 +200,7 @@ * void add_device_randomness(const void *buf, unsigned int size); * void add_input_randomness(unsigned int type, unsigned int code, * unsigned int value); - * void add_interrupt_randomness(int irq, int irq_flags); + * void add_interrupt_randomness(int irq); * void add_disk_randomness(struct gendisk *disk); * void add_hwgenerator_randomness(const char *buffer, size_t count, * size_t entropy); @@ -1253,7 +1253,7 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs) return *ptr; } -void add_interrupt_randomness(int irq, int irq_flags) +void add_interrupt_randomness(int irq) { struct entropy_store *r; struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness); diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 392c1ac4f819..7ae04ccb1043 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -1381,7 +1381,7 @@ static void vmbus_isr(void) tasklet_schedule(&hv_cpu->msg_dpc); } - add_interrupt_randomness(vmbus_interrupt, 0); + add_interrupt_randomness(vmbus_interrupt); } static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) diff --git a/include/linux/random.h b/include/linux/random.h index f45b8be3e3c4..c45b2693e51f 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -35,7 +35,7 @@ static inline void add_latent_entropy(void) {} extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value) __latent_entropy; -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy; +extern void add_interrupt_randomness(int irq) __latent_entropy; extern void get_random_bytes(void *buf, int nbytes); extern int wait_for_random_bytes(void); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 27182003b879..68c76c3caaf5 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -197,7 +197,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) retval = __handle_irq_event_percpu(desc, &flags); - add_interrupt_randomness(desc->irq_data.irq, flags); + add_interrupt_randomness(desc->irq_data.irq); if (!irq_settings_no_debug(desc)) note_interrupt(desc, retval); -- cgit v1.2.3 From 5320eb42dec7a7ef3ab7da3c5c0d7f889a5181e5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 7 Dec 2021 13:17:34 +0100 Subject: irq: remove unused flags argument from __handle_irq_event_percpu() The __IRQF_TIMER bit from the flags argument was used in add_interrupt_randomness() to distinguish the timer interrupt from other interrupts. This is no longer the case. Remove the flags argument from __handle_irq_event_percpu(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jason A. Donenfeld --- kernel/irq/chip.c | 4 +--- kernel/irq/handle.c | 9 ++------- kernel/irq/internals.h | 2 +- 3 files changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f895265d7548..c09324663088 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -575,8 +575,6 @@ EXPORT_SYMBOL_GPL(handle_simple_irq); */ void handle_untracked_irq(struct irq_desc *desc) { - unsigned int flags = 0; - raw_spin_lock(&desc->lock); if (!irq_may_run(desc)) @@ -593,7 +591,7 @@ void handle_untracked_irq(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); - __handle_irq_event_percpu(desc, &flags); + __handle_irq_event_percpu(desc); raw_spin_lock(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 68c76c3caaf5..9489f93b3db3 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -136,7 +136,7 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); } -irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags) +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int irq = desc->irq_data.irq; @@ -174,10 +174,6 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags } __irq_wake_thread(desc, action); - - fallthrough; /* to add to randomness */ - case IRQ_HANDLED: - *flags |= action->flags; break; default: @@ -193,9 +189,8 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval; - unsigned int flags = 0; - retval = __handle_irq_event_percpu(desc, &flags); + retval = __handle_irq_event_percpu(desc); add_interrupt_randomness(desc->irq_data.irq); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 54363527feea..99cbdf55a8bd 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -103,7 +103,7 @@ extern int __irq_get_irqchip_state(struct irq_data *data, extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); -irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc *desc); -- cgit v1.2.3 From f5bdb34bf0c9314548f2d8e2360b703ff3610303 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 29 Dec 2021 13:56:47 -0800 Subject: livepatch: Avoid CPU hogging with cond_resched When initializing a 'struct klp_object' in klp_init_object_loaded(), and performing relocations in klp_resolve_symbols(), klp_find_object_symbol() is invoked to look up the address of a symbol in an already-loaded module (or vmlinux). This, in turn, calls kallsyms_on_each_symbol() or module_kallsyms_on_each_symbol() to find the address of the symbol that is being patched. It turns out that symbol lookups often take up the most CPU time when enabling and disabling a patch, and may hog the CPU and cause other tasks on that CPU's runqueue to starve -- even in paths where interrupts are enabled. For example, under certain workloads, enabling a KLP patch with many objects or functions may cause ksoftirqd to be starved, and thus for interrupts to be backlogged and delayed. This may end up causing TCP retransmits on the host where the KLP patch is being applied, and in general, may cause any interrupts serviced by softirqd to be delayed while the patch is being applied. So as to ensure that kallsyms_on_each_symbol() does not end up hogging the CPU, this patch adds a call to cond_resched() in kallsyms_on_each_symbol() and module_kallsyms_on_each_symbol(), which are invoked when doing a symbol lookup in vmlinux and a module respectively. Without this patch, if a live-patch is applied on a 36-core Intel host with heavy TCP traffic, a ~10x spike is observed in TCP retransmits while the patch is being applied. Additionally, collecting sched events with perf indicates that ksoftirqd is awakened ~1.3 seconds before it's eventually scheduled. With the patch, no increase in TCP retransmit events is observed, and ksoftirqd is scheduled shortly after it's awakened. Signed-off-by: David Vernet Acked-by: Miroslav Benes Acked-by: Song Liu Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211229215646.830451-1-void@manifault.com --- kernel/kallsyms.c | 1 + kernel/module.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 0ba87982d017..2a9afe484aec 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -223,6 +223,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, ret = fn(data, namebuf, NULL, kallsyms_sym_address(i)); if (ret != 0) return ret; + cond_resched(); } return 0; } diff --git a/kernel/module.c b/kernel/module.c index 40ec9a030eec..c96160f7f3f5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4462,6 +4462,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, mod, kallsyms_symbol_value(sym)); if (ret != 0) goto out; + + cond_resched(); } } out: -- cgit v1.2.3 From d4296faebd337e5f76c0fddb815de33d2b0ad118 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Sun, 19 Dec 2021 10:41:54 +0800 Subject: cpuset: convert 'allowed' in __cpuset_node_allowed() to be boolean Convert 'allowed' in __cpuset_node_allowed() to be boolean since the return types of node_isset() and __cpuset_node_allowed() are both boolean. Signed-off-by: Qi Zheng Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0dd7d853ed17..dc653ab26e50 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3528,7 +3528,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) bool __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ - int allowed; /* is allocation in zone z allowed? */ + bool allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) -- cgit v1.2.3 From e32cf5dfbe227b355776948b2c9b5691b84d1cbd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 22 Dec 2021 22:10:09 -0600 Subject: kthread: Generalize pf_io_worker so it can point to struct kthread The point of using set_child_tid to hold the kthread pointer was that it already did what is necessary. There are now restrictions on when set_child_tid can be initialized and when set_child_tid can be used in schedule_tail. Which indicates that continuing to use set_child_tid to hold the kthread pointer is a bad idea. Instead of continuing to use the set_child_tid field of task_struct generalize the pf_io_worker field of task_struct and use it to hold the kthread pointer. Rename pf_io_worker (which is a void * pointer) to worker_private so it can be used to store kthreads struct kthread pointer. Update the kthread code to store the kthread pointer in the worker_private field. Remove the places where set_child_tid had to be dealt with carefully because kthreads also used it. Link: https://lkml.kernel.org/r/CAHk-=wgtFAA9SbVYg0gR1tqPMC17-NYcs0GQkaYg1bGhh1uJQQ@mail.gmail.com Link: https://lkml.kernel.org/r/87a6grvqy8.fsf_-_@email.froward.int.ebiederm.org Suggested-by: Linus Torvalds Signed-off-by: "Eric W. Biederman" --- fs/io-wq.c | 6 +++--- fs/io-wq.h | 2 +- include/linux/sched.h | 4 ++-- kernel/fork.c | 8 +------- kernel/kthread.c | 14 +++++--------- kernel/sched/core.c | 2 +- 6 files changed, 13 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/fs/io-wq.c b/fs/io-wq.c index 88202de519f6..e4fc7384b40c 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -657,7 +657,7 @@ loop: */ void io_wq_worker_running(struct task_struct *tsk) { - struct io_worker *worker = tsk->pf_io_worker; + struct io_worker *worker = tsk->worker_private; if (!worker) return; @@ -675,7 +675,7 @@ void io_wq_worker_running(struct task_struct *tsk) */ void io_wq_worker_sleeping(struct task_struct *tsk) { - struct io_worker *worker = tsk->pf_io_worker; + struct io_worker *worker = tsk->worker_private; if (!worker) return; @@ -694,7 +694,7 @@ void io_wq_worker_sleeping(struct task_struct *tsk) static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, struct task_struct *tsk) { - tsk->pf_io_worker = worker; + tsk->worker_private = worker; worker->task = tsk; set_cpus_allowed_ptr(tsk, wqe->cpu_mask); tsk->flags |= PF_NO_SETAFFINITY; diff --git a/fs/io-wq.h b/fs/io-wq.h index 41bf37674a49..c7c23947cbcd 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -200,6 +200,6 @@ static inline void io_wq_worker_running(struct task_struct *tsk) static inline bool io_wq_current_is_worker(void) { return in_task() && (current->flags & PF_IO_WORKER) && - current->pf_io_worker; + current->worker_private; } #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 78c351e35fec..52f2fdffa3ab 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -987,8 +987,8 @@ struct task_struct { /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; - /* PF_IO_WORKER */ - void *pf_io_worker; + /* PF_KTHREAD | PF_IO_WORKER */ + void *worker_private; u64 utime; u64 stime; diff --git a/kernel/fork.c b/kernel/fork.c index 0816be1bb044..6f0293cb29c9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -950,7 +950,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - tsk->pf_io_worker = NULL; + tsk->worker_private = NULL; account_kernel_stack(tsk, 1); @@ -2032,12 +2032,6 @@ static __latent_entropy struct task_struct *copy_process( siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } - /* - * This _must_ happen before we call free_task(), i.e. before we jump - * to any of the bad_fork_* labels. This is to avoid freeing - * p->set_child_tid which is (ab)used as a kthread's data pointer for - * kernel threads (PF_KTHREAD). - */ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? diff --git a/kernel/kthread.c b/kernel/kthread.c index c14707d15341..261a3c3b9c6c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -72,7 +72,7 @@ enum KTHREAD_BITS { static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); - return (__force void *)k->set_child_tid; + return k->worker_private; } /* @@ -80,7 +80,7 @@ static inline struct kthread *to_kthread(struct task_struct *k) * * Per construction; when: * - * (p->flags & PF_KTHREAD) && p->set_child_tid + * (p->flags & PF_KTHREAD) && p->worker_private * * the task is both a kthread and struct kthread is persistent. However * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and @@ -88,7 +88,7 @@ static inline struct kthread *to_kthread(struct task_struct *k) */ static inline struct kthread *__to_kthread(struct task_struct *p) { - void *kthread = (__force void *)p->set_child_tid; + void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; @@ -109,11 +109,7 @@ bool set_kthread_struct(struct task_struct *p) init_completion(&kthread->parked); p->vfork_done = &kthread->exited; - /* - * We abuse ->set_child_tid to avoid the new member and because it - * can't be wrongly copied by copy_process(). - */ - p->set_child_tid = (__force void __user *)kthread; + p->worker_private = kthread; return true; } @@ -128,7 +124,7 @@ void free_kthread_struct(struct task_struct *k) #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread && kthread->blkcg_css); #endif - k->set_child_tid = (__force void __user *)NULL; + k->worker_private = NULL; kfree(kthread); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d8adbea77be1..ee222b89c692 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4908,7 +4908,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) finish_task_switch(prev); preempt_enable(); - if (!(current->flags & PF_KTHREAD) && current->set_child_tid) + if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); calculate_sigpending(); -- cgit v1.2.3 From 912616f142bfeb1dc41f40dbe7ce38331886a94a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 5 Jan 2022 16:30:21 -0600 Subject: exit: Guarantee make_task_dead leaks the tsk when calling do_task_exit Change the task state to EXIT_DEAD and take an extra rcu_refernce to guarantee the task will not be reaped and that it will not be freed. Link: https://lkml.kernel.org/r/YdUzjrLAlRiNLQp2@zeniv-ca.linux.org.uk Pointed-out-by: Al Viro Fixes: 7f80a2fd7db9 ("exit: Stop poorly open coding do_task_dead in make_task_dead") Signed-off-by: "Eric W. Biederman" --- kernel/exit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 6c4b04531f17..db4eeb7fc680 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -885,6 +885,8 @@ void __noreturn make_task_dead(int signr) if (unlikely(tsk->flags & PF_EXITING)) { pr_alert("Fixing recursive fault but reboot is needed!\n"); futex_exit_recursive(tsk); + tsk->exit_state = EXIT_DEAD; + refcount_inc(&tsk->rcu_users); do_task_dead(); } -- cgit v1.2.3 From de77c3a5b95c95a4915142071643d94e3e1ada35 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 7 Jan 2022 12:18:12 -0600 Subject: exit: Move force_uaccess back into do_exit With kernel threads on architectures that still have set_fs/get_fs running as KERNEL_DS moving force_uaccess_begin does not appear safe. Calling force_uaccess_begin is a noop on anything people care about. Update the comment to explain why this code while looking like an obvious candidate for moving to make_task_dead probably needs to remain in do_exit until set_fs/get_fs are entirely removed from the kernel. Fixes: 05ea0424f0e2 ("exit: Move oops specific logic from do_exit into make_task_dead") Suggested-by: Al Viro Link: https://lkml.kernel.org/r/YdUxGKRcSiDy8jGg@zeniv-ca.linux.org.uk Signed-off-by: "Eric W. Biederman" --- kernel/exit.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index db4eeb7fc680..fc0726cb22db 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -737,6 +737,20 @@ void __noreturn do_exit(long code) WARN_ON(blk_needs_flush_plug(tsk)); + /* + * If do_dead is called because this processes oopsed, it's possible + * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before + * continuing. Amongst other possible reasons, this is to prevent + * mm_release()->clear_child_tid() from writing to a user-controlled + * kernel address. + * + * On uptodate architectures force_uaccess_begin is a noop. On + * architectures that still have set_fs/get_fs in addition to handling + * oopses handles kernel threads that run as set_fs(KERNEL_DS) by + * default. + */ + force_uaccess_begin(); + profile_task_exit(tsk); kcov_task_exit(tsk); @@ -862,15 +876,6 @@ void __noreturn make_task_dead(int signr) if (unlikely(!tsk->pid)) panic("Attempted to kill the idle task!"); - /* - * If make_task_dead is called because this processes oopsed, it's possible - * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before - * continuing. Amongst other possible reasons, this is to prevent - * mm_release()->clear_child_tid() from writing to a user-controlled - * kernel address. - */ - force_uaccess_begin(); - if (unlikely(in_atomic())) { pr_info("note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), -- cgit v1.2.3 From a0287db0f1d6918919203ba31fd7cda59bf889e8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 8 Jan 2022 09:34:50 -0600 Subject: signal: Have prepare_signal detect coredumps using signal->core_state In preparation for removing the flag SIGNAL_GROUP_COREDUMP, change prepare_signal to test signal->core_state instead of the flag SIGNAL_GROUP_COREDUMP. Both fields are protected by siglock and both live in signal_struct so there are no real tradeoffs here, just a change to which field is being tested. Link: https://lkml.kernel.org/r/20211213225350.27481-1-ebiederm@xmission.com Link: https://lkml.kernel.org/r/875yqu14co.fsf_-_@email.froward.int.ebiederm.org Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 8272cac5f429..f95a4423519d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -906,8 +906,8 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) struct task_struct *t; sigset_t flush; - if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { - if (!(signal->flags & SIGNAL_GROUP_EXIT)) + if ((signal->flags & SIGNAL_GROUP_EXIT) || signal->core_state) { + if (signal->core_state) return sig == SIGKILL; /* * The process is in the middle of dying, nothing to do. -- cgit v1.2.3 From 7ba03471ac4ad2432e5ccf67d9d4ab03c177578a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 8 Jan 2022 11:01:12 -0600 Subject: signal: Make coredump handling explicit in complete_signal Ever since commit 6cd8f0acae34 ("coredump: ensure that SIGKILL always kills the dumping thread") it has been possible for a SIGKILL received during a coredump to set SIGNAL_GROUP_EXIT and trigger a process shutdown (for a second time). Update the logic to explicitly allow coredumps so that coredumps can set SIGNAL_GROUP_EXIT and shutdown like an ordinary process. Link: https://lkml.kernel.org/r/87zgo6ytyf.fsf_-_@email.froward.int.ebiederm.org Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f95a4423519d..0706c1345a71 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1032,7 +1032,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type) * then start taking the whole group down immediately. */ if (sig_fatal(p, sig) && - !(signal->flags & SIGNAL_GROUP_EXIT) && + (signal->core_state || !(signal->flags & SIGNAL_GROUP_EXIT)) && !sigismember(&t->real_blocked, sig) && (sig == SIGKILL || !p->ptrace)) { /* -- cgit v1.2.3 From 2f824d4d197e02275562359a2ae5274177ce500c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 8 Jan 2022 09:48:31 -0600 Subject: signal: Remove SIGNAL_GROUP_COREDUMP After the previous cleanups "signal->core_state" is set whenever SIGNAL_GROUP_COREDUMP is set and "signal->core_state" is tested whenver the code wants to know if a coredump is in progress. The remaining tests of SIGNAL_GROUP_COREDUMP also test to see if SIGNAL_GROUP_EXIT is set. Similarly the only place that sets SIGNAL_GROUP_COREDUMP also sets SIGNAL_GROUP_EXIT. Which makes SIGNAL_GROUP_COREDUMP unecessary and redundant. So stop setting SIGNAL_GROUP_COREDUMP, stop testing SIGNAL_GROUP_COREDUMP, and remove it's definition. With the setting of SIGNAL_GROUP_COREDUMP gone, coredump_finish no longer needs to clear SIGNAL_GROUP_COREDUMP out of signal->flags by setting SIGNAL_GROUP_EXIT. Link: https://lkml.kernel.org/r/20211213225350.27481-5-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- fs/coredump.c | 3 +-- include/linux/sched/signal.h | 3 +-- kernel/signal.c | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/fs/coredump.c b/fs/coredump.c index 0864941a879b..fee1c57aee89 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -353,7 +353,7 @@ static int zap_process(struct task_struct *start, int exit_code) int nr = 0; /* ignore all signals except SIGKILL, see prepare_signal() */ - start->signal->flags = SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP; + start->signal->flags = SIGNAL_GROUP_EXIT; start->signal->group_exit_code = exit_code; start->signal->group_stop_count = 0; @@ -427,7 +427,6 @@ static void coredump_finish(bool core_dumped) if (core_dumped && !__fatal_signal_pending(current)) current->signal->group_exit_code |= 0x80; current->signal->group_exit_task = NULL; - current->signal->flags = SIGNAL_GROUP_EXIT; next = current->signal->core_state->dumper.next; current->signal->core_state = NULL; spin_unlock_irq(¤t->sighand->siglock); diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index fa26d2a58413..ecc10e148799 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -256,7 +256,6 @@ struct signal_struct { #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ -#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */ /* * Pending notifications to parent. */ @@ -272,7 +271,7 @@ struct signal_struct { static inline void signal_set_stop_flags(struct signal_struct *sig, unsigned int flags) { - WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); + WARN_ON(sig->flags & SIGNAL_GROUP_EXIT); sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } diff --git a/kernel/signal.c b/kernel/signal.c index 0706c1345a71..bae231bc2f4a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -906,7 +906,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) struct task_struct *t; sigset_t flush; - if ((signal->flags & SIGNAL_GROUP_EXIT) || signal->core_state) { + if (signal->flags & SIGNAL_GROUP_EXIT) { if (signal->core_state) return sig == SIGKILL; /* -- cgit v1.2.3 From 60700e38fb68e800607ca7a027060d5419fc5798 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 6 Jun 2021 13:47:53 -0500 Subject: signal: Rename group_exit_task group_exec_task The only remaining user of group_exit_task is exec. Rename the field so that it is clear which part of the code uses it. Update the comment above the definition of group_exec_task to document how it is currently used. Link: https://lkml.kernel.org/r/20211213225350.27481-7-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- fs/exec.c | 8 ++++---- include/linux/sched/signal.h | 12 ++++-------- kernel/exit.c | 4 ++-- 3 files changed, 10 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 59cac7c18178..9d2925811011 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1054,7 +1054,7 @@ static int de_thread(struct task_struct *tsk) return -EAGAIN; } - sig->group_exit_task = tsk; + sig->group_exec_task = tsk; sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk)) sig->notify_count--; @@ -1082,7 +1082,7 @@ static int de_thread(struct task_struct *tsk) write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that - * exit_notify() can't miss ->group_exit_task + * exit_notify() can't miss ->group_exec_task */ sig->notify_count = -1; if (likely(leader->exit_state)) @@ -1149,7 +1149,7 @@ static int de_thread(struct task_struct *tsk) release_task(leader); } - sig->group_exit_task = NULL; + sig->group_exec_task = NULL; sig->notify_count = 0; no_thread_group: @@ -1162,7 +1162,7 @@ no_thread_group: killed: /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); - sig->group_exit_task = NULL; + sig->group_exec_task = NULL; sig->notify_count = 0; read_unlock(&tasklist_lock); return -EAGAIN; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index ecc10e148799..d3248aba5183 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -109,13 +109,9 @@ struct signal_struct { /* thread group exit support */ int group_exit_code; - /* overloaded: - * - notify group_exit_task when ->count is equal to notify_count - * - everyone except group_exit_task is stopped during signal delivery - * of fatal signals, group_exit_task processes the signal. - */ + /* notify group_exec_task when notify_count is less or equal to 0 */ int notify_count; - struct task_struct *group_exit_task; + struct task_struct *group_exec_task; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; @@ -275,11 +271,11 @@ static inline void signal_set_stop_flags(struct signal_struct *sig, sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } -/* If true, all threads except ->group_exit_task have pending SIGKILL */ +/* If true, all threads except ->group_exec_task have pending SIGKILL */ static inline int signal_group_exit(const struct signal_struct *sig) { return (sig->flags & SIGNAL_GROUP_EXIT) || - (sig->group_exit_task != NULL); + (sig->group_exec_task != NULL); } extern void flush_signals(struct task_struct *); diff --git a/kernel/exit.c b/kernel/exit.c index fc0726cb22db..b05578abbf26 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -116,7 +116,7 @@ static void __exit_signal(struct task_struct *tsk) * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) - wake_up_process(sig->group_exit_task); + wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); @@ -697,7 +697,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) - wake_up_process(tsk->signal->group_exit_task); + wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { -- cgit v1.2.3 From 49697335e0b441b0553598c1b48ee9ebb053d2f1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 24 Jun 2021 02:14:30 -0500 Subject: signal: Remove the helper signal_group_exit This helper is misleading. It tests for an ongoing exec as well as the process having received a fatal signal. Sometimes it is appropriate to treat an on-going exec differently than a process that is shutting down due to a fatal signal. In particular taking the fast path out of exit_signals instead of retargeting signals is not appropriate during exec, and not changing the the exit code in do_group_exit during exec. Removing the helper makes it more obvious what is going on as both cases must be coded for explicitly. While removing the helper fix the two cases where I have observed using signal_group_exit resulted in the wrong result. In exit_signals only test for SIGNAL_GROUP_EXIT so that signals are retargetted during an exec. In do_group_exit use 0 as the exit code during an exec as de_thread does not set group_exit_code. As best as I can determine group_exit_code has been is set to 0 most of the time during de_thread. During a thread group stop group_exit_code is set to the stop signal and when the thread group receives SIGCONT group_exit_code is reset to 0. Link: https://lkml.kernel.org/r/20211213225350.27481-8-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- fs/coredump.c | 5 +++-- fs/exec.c | 2 +- include/linux/sched/signal.h | 7 ------- kernel/exit.c | 8 ++++++-- kernel/signal.c | 8 +++++--- 5 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/fs/coredump.c b/fs/coredump.c index c92ffc0bf2c2..7dece20b162b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -372,11 +372,12 @@ static int zap_process(struct task_struct *start, int exit_code) static int zap_threads(struct task_struct *tsk, struct core_state *core_state, int exit_code) { + struct signal_struct *signal = tsk->signal; int nr = -EAGAIN; spin_lock_irq(&tsk->sighand->siglock); - if (!signal_group_exit(tsk->signal)) { - tsk->signal->core_state = core_state; + if (!(signal->flags & SIGNAL_GROUP_EXIT) && !signal->group_exec_task) { + signal->core_state = core_state; nr = zap_process(tsk, exit_code); clear_tsk_thread_flag(tsk, TIF_SIGPENDING); tsk->flags |= PF_DUMPCORE; diff --git a/fs/exec.c b/fs/exec.c index 9d2925811011..82db656ca709 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1045,7 +1045,7 @@ static int de_thread(struct task_struct *tsk) * Kill all other threads in the thread group. */ spin_lock_irq(lock); - if (signal_group_exit(sig)) { + if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) { /* * Another group action in progress, just * return so that the signal is processed. diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index d3248aba5183..b6ecb9fc4cd2 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -271,13 +271,6 @@ static inline void signal_set_stop_flags(struct signal_struct *sig, sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } -/* If true, all threads except ->group_exec_task have pending SIGKILL */ -static inline int signal_group_exit(const struct signal_struct *sig) -{ - return (sig->flags & SIGNAL_GROUP_EXIT) || - (sig->group_exec_task != NULL); -} - extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); diff --git a/kernel/exit.c b/kernel/exit.c index b05578abbf26..861cfb1e2f77 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -914,15 +914,19 @@ do_group_exit(int exit_code) BUG_ON(exit_code & 0x80); /* core dumps don't get here */ - if (signal_group_exit(sig)) + if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; + else if (sig->group_exec_task) + exit_code = 0; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); - if (signal_group_exit(sig)) + if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; + else if (sig->group_exec_task) + exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; diff --git a/kernel/signal.c b/kernel/signal.c index bae231bc2f4a..167b8e196a79 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2386,7 +2386,8 @@ static bool do_signal_stop(int signr) WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK); if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) || - unlikely(signal_group_exit(sig))) + unlikely(sig->flags & SIGNAL_GROUP_EXIT) || + unlikely(sig->group_exec_task)) return false; /* * There is no group stop already in progress. We must @@ -2693,7 +2694,8 @@ relock: enum pid_type type; /* Has this task already been marked for death? */ - if (signal_group_exit(signal)) { + if ((signal->flags & SIGNAL_GROUP_EXIT) || + signal->group_exec_task) { ksig->info.si_signo = signr = SIGKILL; sigdelset(¤t->pending.signal, SIGKILL); trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, @@ -2949,7 +2951,7 @@ void exit_signals(struct task_struct *tsk) */ cgroup_threadgroup_change_begin(tsk); - if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { + if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { tsk->flags |= PF_EXITING; cgroup_threadgroup_change_end(tsk); return; -- cgit v1.2.3 From 6410349ea5e177f3e53c2006d2041eed47e986ae Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 21 Dec 2021 19:10:27 -0800 Subject: signal: clean up kernel-doc comments Fix kernel-doc warnings in kernel/signal.c: kernel/signal.c:1830: warning: Function parameter or member 'force_coredump' not described in 'force_sig_seccomp' kernel/signal.c:2873: warning: missing initial short description on line: * signal_delivered - Also add a closing parenthesis to the comments in signal_delivered(). Signed-off-by: Randy Dunlap Cc: Alexander Viro Cc: Richard Weinberger Cc: Andrew Morton Cc: "Eric W. Biederman" Cc: Jens Axboe Cc: Peter Zijlstra Cc: Marco Elver Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20211222031027.29694-1-rdunlap@infradead.org Signed-off-by: Eric W. Biederman --- kernel/signal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 167b8e196a79..6324104cf244 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1823,6 +1823,7 @@ int force_sig_perf(void __user *addr, u32 type, u64 sig_data) * force_sig_seccomp - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland * @reason: filter-supplied reason code to send to userland (via si_errno) + * @force_coredump: true to trigger a coredump * * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info. */ @@ -2872,13 +2873,13 @@ out: } /** - * signal_delivered - + * signal_delivered - called after signal delivery to update blocked signals * @ksig: kernel signal struct * @stepping: nonzero if debugger single-step or block-step in use * * This function should be called when a signal has successfully been * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask - * is always blocked, and the signal itself is blocked unless %SA_NODEFER + * is always blocked), and the signal itself is blocked unless %SA_NODEFER * is set in @ksig->ka.sa.sa_flags. Tracing is notified. */ static void signal_delivered(struct ksignal *ksig, int stepping) -- cgit v1.2.3 From 2d4bcf886e42f0f4846a3d9bdc3a90d278903a2e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 8 Jan 2022 11:23:02 -0600 Subject: exit: Remove profile_task_exit & profile_munmap When I say remove I mean remove. All profile_task_exit and profile_munmap do is call a blocking notifier chain. The helpers profile_task_register and profile_task_unregister are not called anywhere in the tree. Which means this is all dead code. So remove the dead code and make it easier to read do_exit. Reviewed-by: Christoph Hellwig Link: https://lkml.kernel.org/r/20220103213312.9144-1-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- include/linux/profile.h | 26 ------------------------- kernel/exit.c | 1 - kernel/profile.c | 50 ------------------------------------------------- mm/mmap.c | 1 - 4 files changed, 78 deletions(-) (limited to 'kernel') diff --git a/include/linux/profile.h b/include/linux/profile.h index fd18ca96f557..f7eb2b57d890 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -31,11 +31,6 @@ static inline int create_proc_profile(void) } #endif -enum profile_type { - PROFILE_TASK_EXIT, - PROFILE_MUNMAP -}; - #ifdef CONFIG_PROFILING extern int prof_on __read_mostly; @@ -66,23 +61,14 @@ static inline void profile_hit(int type, void *ip) struct task_struct; struct mm_struct; -/* task is in do_exit() */ -void profile_task_exit(struct task_struct * task); - /* task is dead, free task struct ? Returns 1 if * the task was taken, 0 if the task should be freed. */ int profile_handoff_task(struct task_struct * task); -/* sys_munmap */ -void profile_munmap(unsigned long addr); - int task_handoff_register(struct notifier_block * n); int task_handoff_unregister(struct notifier_block * n); -int profile_event_register(enum profile_type, struct notifier_block * n); -int profile_event_unregister(enum profile_type, struct notifier_block * n); - #else #define prof_on 0 @@ -117,19 +103,7 @@ static inline int task_handoff_unregister(struct notifier_block * n) return -ENOSYS; } -static inline int profile_event_register(enum profile_type t, struct notifier_block * n) -{ - return -ENOSYS; -} - -static inline int profile_event_unregister(enum profile_type t, struct notifier_block * n) -{ - return -ENOSYS; -} - -#define profile_task_exit(a) do { } while (0) #define profile_handoff_task(a) (0) -#define profile_munmap(a) do { } while (0) #endif /* CONFIG_PROFILING */ diff --git a/kernel/exit.c b/kernel/exit.c index 861cfb1e2f77..64e907bc87d5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -751,7 +751,6 @@ void __noreturn do_exit(long code) */ force_uaccess_begin(); - profile_task_exit(tsk); kcov_task_exit(tsk); coredump_task_exit(tsk); diff --git a/kernel/profile.c b/kernel/profile.c index eb9c7f0f5ac5..9355cc934a96 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -135,14 +135,7 @@ int __ref profile_init(void) /* Profile event notifications */ -static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); static ATOMIC_NOTIFIER_HEAD(task_free_notifier); -static BLOCKING_NOTIFIER_HEAD(munmap_notifier); - -void profile_task_exit(struct task_struct *task) -{ - blocking_notifier_call_chain(&task_exit_notifier, 0, task); -} int profile_handoff_task(struct task_struct *task) { @@ -151,11 +144,6 @@ int profile_handoff_task(struct task_struct *task) return (ret == NOTIFY_OK) ? 1 : 0; } -void profile_munmap(unsigned long addr) -{ - blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); -} - int task_handoff_register(struct notifier_block *n) { return atomic_notifier_chain_register(&task_free_notifier, n); @@ -168,44 +156,6 @@ int task_handoff_unregister(struct notifier_block *n) } EXPORT_SYMBOL_GPL(task_handoff_unregister); -int profile_event_register(enum profile_type type, struct notifier_block *n) -{ - int err = -EINVAL; - - switch (type) { - case PROFILE_TASK_EXIT: - err = blocking_notifier_chain_register( - &task_exit_notifier, n); - break; - case PROFILE_MUNMAP: - err = blocking_notifier_chain_register( - &munmap_notifier, n); - break; - } - - return err; -} -EXPORT_SYMBOL_GPL(profile_event_register); - -int profile_event_unregister(enum profile_type type, struct notifier_block *n) -{ - int err = -EINVAL; - - switch (type) { - case PROFILE_TASK_EXIT: - err = blocking_notifier_chain_unregister( - &task_exit_notifier, n); - break; - case PROFILE_MUNMAP: - err = blocking_notifier_chain_unregister( - &munmap_notifier, n); - break; - } - - return err; -} -EXPORT_SYMBOL_GPL(profile_event_unregister); - #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) /* * Each cpu has a pair of open-addressed hashtables for pending diff --git a/mm/mmap.c b/mm/mmap.c index bfb0ea164a90..70318c2a47c3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2928,7 +2928,6 @@ EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { addr = untagged_addr(addr); - profile_munmap(addr); return __vm_munmap(addr, len, true); } -- cgit v1.2.3 From 2873cd31a20c25b5e763b35e5fb886f0938c6dd5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 8 Jan 2022 10:03:24 -0600 Subject: exit: Remove profile_handoff_task All profile_handoff_task does is notify the task_free_notifier chain. The helpers task_handoff_register and task_handoff_unregister are used to add and delete entries from that chain and are never called. So remove the dead code and make it much easier to read and reason about __put_task_struct. Suggested-by: Al Viro Link: https://lkml.kernel.org/r/87fspyw6m0.fsf@email.froward.int.ebiederm.org Signed-off-by: "Eric W. Biederman" --- include/linux/profile.h | 19 ------------------- kernel/fork.c | 4 +--- kernel/profile.c | 23 ----------------------- 3 files changed, 1 insertion(+), 45 deletions(-) (limited to 'kernel') diff --git a/include/linux/profile.h b/include/linux/profile.h index f7eb2b57d890..11db1ec516e2 100644 --- a/include/linux/profile.h +++ b/include/linux/profile.h @@ -61,14 +61,6 @@ static inline void profile_hit(int type, void *ip) struct task_struct; struct mm_struct; -/* task is dead, free task struct ? Returns 1 if - * the task was taken, 0 if the task should be freed. - */ -int profile_handoff_task(struct task_struct * task); - -int task_handoff_register(struct notifier_block * n); -int task_handoff_unregister(struct notifier_block * n); - #else #define prof_on 0 @@ -93,17 +85,6 @@ static inline void profile_hit(int type, void *ip) return; } -static inline int task_handoff_register(struct notifier_block * n) -{ - return -ENOSYS; -} - -static inline int task_handoff_unregister(struct notifier_block * n) -{ - return -ENOSYS; -} - -#define profile_handoff_task(a) (0) #endif /* CONFIG_PROFILING */ diff --git a/kernel/fork.c b/kernel/fork.c index 6f0293cb29c9..494539ecb6d3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -754,9 +754,7 @@ void __put_task_struct(struct task_struct *tsk) delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); sched_core_free(tsk); - - if (!profile_handoff_task(tsk)) - free_task(tsk); + free_task(tsk); } EXPORT_SYMBOL_GPL(__put_task_struct); diff --git a/kernel/profile.c b/kernel/profile.c index 9355cc934a96..37640a0bd8a3 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -133,29 +133,6 @@ int __ref profile_init(void) return -ENOMEM; } -/* Profile event notifications */ - -static ATOMIC_NOTIFIER_HEAD(task_free_notifier); - -int profile_handoff_task(struct task_struct *task) -{ - int ret; - ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); - return (ret == NOTIFY_OK) ? 1 : 0; -} - -int task_handoff_register(struct notifier_block *n) -{ - return atomic_notifier_chain_register(&task_free_notifier, n); -} -EXPORT_SYMBOL_GPL(task_handoff_register); - -int task_handoff_unregister(struct notifier_block *n) -{ - return atomic_notifier_chain_unregister(&task_free_notifier, n); -} -EXPORT_SYMBOL_GPL(task_handoff_unregister); - #if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) /* * Each cpu has a pair of open-addressed hashtables for pending -- cgit v1.2.3 From 270b6541e603a7fae0cad7af3dc3bca6adb343f3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 23 Dec 2021 10:05:19 -0600 Subject: exit: Coredumps reach do_group_exit The comment about coredumps not reaching do_group_exit and the corresponding BUG_ON are bogus. What happens and has happened for years is that get_signal calls do_coredump (which sets SIGNAL_GROUP_EXIT and group_exit_code) and then do_group_exit passing the signal number. Then do_group_exit ignores the exit_code it is passed and uses signal->group_exit_code from the coredump. The comment and BUG_ON were correct when they were added during the 2.5 development cycle, but became obsolete and incorrect when get_signal was changed to fall through to do_group_exit after do_coredump in 2.6.10-rc2. So remove the stale comment and BUG_ON Fixes: 63bd6144f191 ("[PATCH] Invalid BUG_ONs in signal.c") History-Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Link: https://lkml.kernel.org/r/20220103213312.9144-2-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- kernel/exit.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 64e907bc87d5..db86307077d4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -911,8 +911,6 @@ do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; - BUG_ON(exit_code & 0x80); /* core dumps don't get here */ - if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; else if (sig->group_exec_task) -- cgit v1.2.3 From 907c311f37ba04ccebd00a9b9f3ba718e318a1de Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 21 Dec 2021 10:11:01 -0600 Subject: exit: Fix the exit_code for wait_task_zombie The function wait_task_zombie is defined to always returns the process not thread exit status. Unfortunately when process group exit support was added to wait_task_zombie the WNOWAIT case was overlooked. Usually tsk->exit_code and tsk->signal->group_exit_code will be in sync so fixing this is bug probably has no effect in practice. But fix it anyway so that people aren't scratching their heads about why the two code paths are different. History-Tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Fixes: 2c66151cbc2c ("[PATCH] sys_exit() threading improvements, BK-curr") Link: https://lkml.kernel.org/r/20220103213312.9144-3-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- kernel/exit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index db86307077d4..b00a25bb4ab9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1018,7 +1018,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { - status = p->exit_code; + status = (p->signal->flags & SIGNAL_GROUP_EXIT) + ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); -- cgit v1.2.3 From 1b5a42d9c85f0e731f01c8d1129001fd8531a8a0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 3 Jan 2022 11:32:36 -0600 Subject: taskstats: Cleanup the use of task->exit_code In the function bacct_add_task the code reading task->exit_code was introduced in commit f3cef7a99469 ("[PATCH] csa: basic accounting over taskstats"), and it is not entirely clear what the taskstats interface is trying to return as only returning the exit_code of the first task in a process doesn't make a lot of sense. As best as I can figure the intent is to return task->exit_code after a task exits. The field is returned with per task fields, so the exit_code of the entire process is not wanted. Only the value of the first task is returned so this is not a useful way to get the per task ptrace stop code. The ordinary case of returning this value is returning after a task exits, which also precludes use for getting a ptrace value. It is common to for the first task of a process to also be the last task of a process so this field may have done something reasonable by accident in testing. Make ac_exitcode a reliable per task value by always returning it for every exited task. Setting ac_exitcode in a sensible mannter makes it possible to continue to provide this value going forward. Cc: Balbir Singh Fixes: f3cef7a99469 ("[PATCH] csa: basic accounting over taskstats") Link: https://lkml.kernel.org/r/20220103213312.9144-5-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- kernel/tsacct.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/tsacct.c b/kernel/tsacct.c index f00de83d0246..1d261fbe367b 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -38,11 +38,10 @@ void bacct_add_tsk(struct user_namespace *user_ns, stats->ac_btime = clamp_t(time64_t, btime, 0, U32_MAX); stats->ac_btime64 = btime; - if (thread_group_leader(tsk)) { + if (tsk->flags & PF_EXITING) stats->ac_exitcode = tsk->exit_code; - if (tsk->flags & PF_FORKNOEXEC) - stats->ac_flag |= AFORK; - } + if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC)) + stats->ac_flag |= AFORK; if (tsk->flags & PF_SUPERPRIV) stats->ac_flag |= ASU; if (tsk->flags & PF_DUMPCORE) -- cgit v1.2.3 From 6707d0fc60576fa8ef2dfa2f9009b606df35ba24 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 20 Dec 2021 17:15:09 -0600 Subject: ptrace: Remove second setting of PT_SEIZED in ptrace_attach The code is totally redundant remove it. Link: https://lkml.kernel.org/r/20220103213312.9144-6-ebiederm@xmission.com Signed-off-by: "Eric W. Biederman" --- kernel/ptrace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index f8589bf8d7dc..eea265082e97 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -419,8 +419,6 @@ static int ptrace_attach(struct task_struct *task, long request, if (task->ptrace) goto unlock_tasklist; - if (seize) - flags |= PT_SEIZED; task->ptrace = flags; ptrace_link(task, current); -- cgit v1.2.3 From 9ec5a7d16899ed9062cc4c3dd3a13e1771411ab3 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Jan 2022 08:04:11 -0600 Subject: tracing: Change event_command func() to parse() The name of the func() callback on event_command is too generic and is easily confused with other callbacks with that name, so change it to something that reflects its actual purpose. In this case, the main purpose of the callback is to parse an event command, so call it parse() instead. Link: https://lkml.kernel.org/r/7784e321840752ed88aac0b349c0c685fc9247b1.1641823001.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 19 ++++++++++++------- kernel/trace/trace_eprobe.c | 8 ++++---- kernel/trace/trace_events_hist.c | 26 +++++++++++++------------- kernel/trace/trace_events_trigger.c | 30 +++++++++++++++--------------- 4 files changed, 44 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 64a7ec44a635..3b2b1bfc686f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1578,9 +1578,9 @@ extern int event_enable_trigger_print(struct seq_file *m, struct event_trigger_data *data); extern void event_enable_trigger_free(struct event_trigger_ops *ops, struct event_trigger_data *data); -extern int event_enable_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param); +extern int event_enable_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); extern int event_enable_register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, @@ -1702,7 +1702,7 @@ struct event_trigger_ops { * All the methods below, except for @set_filter() and @unreg_all(), * must be implemented. * - * @func: The callback function responsible for parsing and + * @parse: The callback function responsible for parsing and * registering the trigger written to the 'trigger' file by the * user. It allocates the trigger instance and registers it with * the appropriate trace event. It makes use of the other @@ -1737,15 +1737,20 @@ struct event_trigger_ops { * * @get_trigger_ops: The callback function invoked to retrieve the * event_trigger_ops implementation associated with the command. + * This callback function allows a single event_command to + * support multiple trigger implementations via different sets of + * event_trigger_ops, depending on the value of the @param + * string. */ struct event_command { struct list_head list; char *name; enum event_trigger_type trigger_type; int flags; - int (*func)(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *params); + int (*parse)(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, + char *param_and_filter); int (*reg)(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 88487752d307..84d5bfa34a99 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -549,9 +549,9 @@ static struct event_trigger_ops eprobe_trigger_ops = { .free = eprobe_trigger_free, }; -static int eprobe_trigger_cmd_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { return -1; } @@ -580,7 +580,7 @@ static struct event_command event_trigger_cmd = { .name = "eprobe", .trigger_type = ETT_EVENT_EPROBE, .flags = EVENT_CMD_FL_NEEDS_REC, - .func = eprobe_trigger_cmd_func, + .parse = eprobe_trigger_cmd_parse, .reg = eprobe_trigger_reg_func, .unreg = eprobe_trigger_unreg_func, .unreg_all = NULL, diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9b8da439149c..89bbbbd3a3f5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2761,9 +2761,9 @@ static char *find_trigger_filter(struct hist_trigger_data *hist_data, } static struct event_command trigger_hist_cmd; -static int event_hist_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param); +static int event_hist_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param); static bool compatible_keys(struct hist_trigger_data *target_hist_data, struct hist_trigger_data *hist_data, @@ -2966,8 +2966,8 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, var_hist->hist_data = hist_data; /* Create the new histogram with our variable */ - ret = event_hist_trigger_func(&trigger_hist_cmd, file, - "", "hist", cmd); + ret = event_hist_trigger_parse(&trigger_hist_cmd, file, + "", "hist", cmd); if (ret) { kfree(cmd); kfree(var_hist->cmd); @@ -5729,8 +5729,8 @@ static void unregister_field_var_hists(struct hist_trigger_data *hist_data) for (i = 0; i < hist_data->n_field_var_hists; i++) { file = hist_data->field_var_hists[i]->hist_data->event_file; cmd = hist_data->field_var_hists[i]->cmd; - ret = event_hist_trigger_func(&trigger_hist_cmd, file, - "!hist", "hist", cmd); + ret = event_hist_trigger_parse(&trigger_hist_cmd, file, + "!hist", "hist", cmd); WARN_ON_ONCE(ret < 0); } } @@ -6146,9 +6146,9 @@ static void hist_unreg_all(struct trace_event_file *file) } } -static int event_hist_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +static int event_hist_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT; struct event_trigger_data *trigger_data; @@ -6331,7 +6331,7 @@ static struct event_command trigger_hist_cmd = { .name = "hist", .trigger_type = ETT_EVENT_HIST, .flags = EVENT_CMD_FL_NEEDS_REC, - .func = event_hist_trigger_func, + .parse = event_hist_trigger_parse, .reg = hist_register_trigger, .unreg = hist_unregister_trigger, .unreg_all = hist_unreg_all, @@ -6446,7 +6446,7 @@ static void hist_enable_unreg_all(struct trace_event_file *file) static struct event_command trigger_hist_enable_cmd = { .name = ENABLE_HIST_STR, .trigger_type = ETT_HIST_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, @@ -6457,7 +6457,7 @@ static struct event_command trigger_hist_enable_cmd = { static struct event_command trigger_hist_disable_cmd = { .name = DISABLE_HIST_STR, .trigger_type = ETT_HIST_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .unreg_all = hist_enable_unreg_all, diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 3d5c07239a2a..15aae07cbe61 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -245,7 +245,7 @@ int trigger_process_regex(struct trace_event_file *file, char *buff) mutex_lock(&trigger_cmd_mutex); list_for_each_entry(p, &trigger_commands, list) { if (strcmp(p->name, command) == 0) { - ret = p->func(p, file, buff, command, next); + ret = p->parse(p, file, buff, command, next); goto out_unlock; } } @@ -622,7 +622,7 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, } /** - * event_trigger_callback - Generic event_command @func implementation + * event_trigger_parse - Generic event_command @parse implementation * @cmd_ops: The command ops, used for trigger registration * @file: The trace_event_file associated with the event * @glob: The raw string used to register the trigger @@ -632,15 +632,15 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, * Common implementation for event command parsing and trigger * instantiation. * - * Usually used directly as the @func method in event command + * Usually used directly as the @parse method in event command * implementations. * * Return: 0 on success, errno otherwise */ static int -event_trigger_callback(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +event_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { struct event_trigger_data *trigger_data; struct event_trigger_ops *trigger_ops; @@ -1069,7 +1069,7 @@ onoff_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_traceon_cmd = { .name = "traceon", .trigger_type = ETT_TRACE_ONOFF, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, .get_trigger_ops = onoff_get_trigger_ops, @@ -1080,7 +1080,7 @@ static struct event_command trigger_traceoff_cmd = { .name = "traceoff", .trigger_type = ETT_TRACE_ONOFF, .flags = EVENT_CMD_FL_POST_TRIGGER, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, .get_trigger_ops = onoff_get_trigger_ops, @@ -1157,7 +1157,7 @@ snapshot_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_snapshot_cmd = { .name = "snapshot", .trigger_type = ETT_SNAPSHOT, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_snapshot_trigger, .unreg = unregister_trigger, .get_trigger_ops = snapshot_get_trigger_ops, @@ -1249,7 +1249,7 @@ static struct event_command trigger_stacktrace_cmd = { .name = "stacktrace", .trigger_type = ETT_STACKTRACE, .flags = EVENT_CMD_FL_POST_TRIGGER, - .func = event_trigger_callback, + .parse = event_trigger_parse, .reg = register_trigger, .unreg = unregister_trigger, .get_trigger_ops = stacktrace_get_trigger_ops, @@ -1380,9 +1380,9 @@ static struct event_trigger_ops event_disable_count_trigger_ops = { .free = event_enable_trigger_free, }; -int event_enable_trigger_func(struct event_command *cmd_ops, - struct trace_event_file *file, - char *glob, char *cmd, char *param) +int event_enable_trigger_parse(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, char *cmd, char *param) { struct trace_event_file *event_enable_file; struct enable_trigger_data *enable_data; @@ -1628,7 +1628,7 @@ event_enable_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_enable_cmd = { .name = ENABLE_EVENT_STR, .trigger_type = ETT_EVENT_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .get_trigger_ops = event_enable_get_trigger_ops, @@ -1638,7 +1638,7 @@ static struct event_command trigger_enable_cmd = { static struct event_command trigger_disable_cmd = { .name = DISABLE_EVENT_STR, .trigger_type = ETT_EVENT_ENABLE, - .func = event_enable_trigger_func, + .parse = event_enable_trigger_parse, .reg = event_enable_register_trigger, .unreg = event_enable_unregister_trigger, .get_trigger_ops = event_enable_get_trigger_ops, -- cgit v1.2.3 From fb339e531bfccbd12d49b165f37636e62778b69f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Jan 2022 08:04:12 -0600 Subject: tracing: Change event_trigger_ops func() to trigger() The name of the func() callback on event_trigger_ops is too generic and is easily confused with other callbacks with that name, so change it to something that reflects its actual purpose. In this case, the main purpose of the callback is to implement an event trigger, so call it trigger() instead. Also add some more documentation to event_trigger_ops describing the callbacks a bit better. Link: https://lkml.kernel.org/r/36ab812e3ee74ee03ae0043fda41a858ee728c00.1641823001.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 19 +++++++++++++++---- kernel/trace/trace_eprobe.c | 2 +- kernel/trace/trace_events_hist.c | 12 ++++++------ kernel/trace/trace_events_trigger.c | 30 +++++++++++++++--------------- 4 files changed, 37 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3b2b1bfc686f..13f23082f256 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1619,10 +1619,20 @@ extern int register_trigger_hist_enable_disable_cmds(void); * The methods in this structure provide per-event trigger hooks for * various trigger operations. * + * The @init and @free methods are used during trigger setup and + * teardown, typically called from an event_command's @parse() + * function implementation. + * + * The @print method is used to print the trigger spec. + * + * The @trigger method is the function that actually implements the + * trigger and is called in the context of the triggering event + * whenever that event occurs. + * * All the methods below, except for @init() and @free(), must be * implemented. * - * @func: The trigger 'probe' function called when the triggering + * @trigger: The trigger 'probe' function called when the triggering * event occurs. The data passed into this callback is the data * that was supplied to the event_command @reg() function that * registered the trigger (see struct event_command) along with @@ -1651,9 +1661,10 @@ extern int register_trigger_hist_enable_disable_cmds(void); * (see trace_event_triggers.c). */ struct event_trigger_ops { - void (*func)(struct event_trigger_data *data, - struct trace_buffer *buffer, void *rec, - struct ring_buffer_event *rbe); + void (*trigger)(struct event_trigger_data *data, + struct trace_buffer *buffer, + void *rec, + struct ring_buffer_event *rbe); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); void (*free)(struct event_trigger_ops *ops, diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 84d5bfa34a99..6d363fd8a1e4 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -543,7 +543,7 @@ static void eprobe_trigger_func(struct event_trigger_data *data, } static struct event_trigger_ops eprobe_trigger_ops = { - .func = eprobe_trigger_func, + .trigger = eprobe_trigger_func, .print = eprobe_trigger_print, .init = eprobe_trigger_init, .free = eprobe_trigger_free, diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 89bbbbd3a3f5..229ce5c2dfd3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5759,7 +5759,7 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops, } static struct event_trigger_ops event_hist_trigger_ops = { - .func = event_hist_trigger, + .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_init, .free = event_hist_trigger_free, @@ -5793,7 +5793,7 @@ static void event_hist_trigger_named_free(struct event_trigger_ops *ops, } static struct event_trigger_ops event_hist_trigger_named_ops = { - .func = event_hist_trigger, + .trigger = event_hist_trigger, .print = event_hist_trigger_print, .init = event_hist_trigger_named_init, .free = event_hist_trigger_named_free, @@ -6383,28 +6383,28 @@ hist_enable_count_trigger(struct event_trigger_data *data, } static struct event_trigger_ops hist_enable_trigger_ops = { - .func = hist_enable_trigger, + .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops hist_enable_count_trigger_ops = { - .func = hist_enable_count_trigger, + .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops hist_disable_trigger_ops = { - .func = hist_enable_trigger, + .trigger = hist_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops hist_disable_count_trigger_ops = { - .func = hist_enable_count_trigger, + .trigger = hist_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 15aae07cbe61..24aceeb50dc0 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -68,7 +68,7 @@ event_triggers_call(struct trace_event_file *file, if (data->paused) continue; if (!rec) { - data->ops->func(data, buffer, rec, event); + data->ops->trigger(data, buffer, rec, event); continue; } filter = rcu_dereference_sched(data->filter); @@ -78,7 +78,7 @@ event_triggers_call(struct trace_event_file *file, tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data, buffer, rec, event); + data->ops->trigger(data, buffer, rec, event); } return tt; } @@ -106,7 +106,7 @@ event_triggers_post_call(struct trace_event_file *file, if (data->paused) continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data, NULL, NULL, NULL); + data->ops->trigger(data, NULL, NULL, NULL); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -1023,28 +1023,28 @@ traceoff_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, } static struct event_trigger_ops traceon_trigger_ops = { - .func = traceon_trigger, + .trigger = traceon_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops traceon_count_trigger_ops = { - .func = traceon_count_trigger, + .trigger = traceon_count_trigger, .print = traceon_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops traceoff_trigger_ops = { - .func = traceoff_trigger, + .trigger = traceoff_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops traceoff_count_trigger_ops = { - .func = traceoff_count_trigger, + .trigger = traceoff_count_trigger, .print = traceoff_trigger_print, .init = event_trigger_init, .free = event_trigger_free, @@ -1135,14 +1135,14 @@ snapshot_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, } static struct event_trigger_ops snapshot_trigger_ops = { - .func = snapshot_trigger, + .trigger = snapshot_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops snapshot_count_trigger_ops = { - .func = snapshot_count_trigger, + .trigger = snapshot_count_trigger, .print = snapshot_trigger_print, .init = event_trigger_init, .free = event_trigger_free, @@ -1226,14 +1226,14 @@ stacktrace_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, } static struct event_trigger_ops stacktrace_trigger_ops = { - .func = stacktrace_trigger, + .trigger = stacktrace_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, }; static struct event_trigger_ops stacktrace_count_trigger_ops = { - .func = stacktrace_count_trigger, + .trigger = stacktrace_count_trigger, .print = stacktrace_trigger_print, .init = event_trigger_init, .free = event_trigger_free, @@ -1353,28 +1353,28 @@ void event_enable_trigger_free(struct event_trigger_ops *ops, } static struct event_trigger_ops event_enable_trigger_ops = { - .func = event_enable_trigger, + .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops event_enable_count_trigger_ops = { - .func = event_enable_count_trigger, + .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops event_disable_trigger_ops = { - .func = event_enable_trigger, + .trigger = event_enable_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, }; static struct event_trigger_ops event_disable_count_trigger_ops = { - .func = event_enable_count_trigger, + .trigger = event_enable_count_trigger, .print = event_enable_trigger_print, .init = event_trigger_init, .free = event_enable_trigger_free, -- cgit v1.2.3 From 2378a2d6b6cf863bdd566aae495336c72bdaec99 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Jan 2022 08:04:13 -0600 Subject: tracing: Remove ops param from event_command reg()/unreg() callbacks The event_trigger_ops for an event_command are already accessible via event_trigger_data.ops so remove the redundant ops from the callback. Link: https://lkml.kernel.org/r/4c6f2a41820452f9cacddc7634ad442928aa2aa6.1641823001.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 4 ---- kernel/trace/trace_eprobe.c | 12 ++++++------ kernel/trace/trace_events_hist.c | 10 +++++----- kernel/trace/trace_events_trigger.c | 22 +++++++++------------- 4 files changed, 20 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 13f23082f256..22a1e8635acf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1582,11 +1582,9 @@ extern int event_enable_trigger_parse(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *param); extern int event_enable_register_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); extern void event_enable_unregister_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *test, struct trace_event_file *file); extern void trigger_data_free(struct event_trigger_data *data); @@ -1763,11 +1761,9 @@ struct event_command { char *glob, char *cmd, char *param_and_filter); int (*reg)(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); void (*unreg)(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); void (*unreg_all)(struct trace_event_file *file); diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 6d363fd8a1e4..191db32dec46 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -556,16 +556,16 @@ static int eprobe_trigger_cmd_parse(struct event_command *cmd_ops, return -1; } -static int eprobe_trigger_reg_func(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *data, - struct trace_event_file *file) +static int eprobe_trigger_reg_func(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) { return -1; } -static void eprobe_trigger_unreg_func(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *data, - struct trace_event_file *file) +static void eprobe_trigger_unreg_func(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) { } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 229ce5c2dfd3..5e6a988a8a51 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5910,7 +5910,7 @@ static bool hist_trigger_match(struct event_trigger_data *data, return true; } -static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, +static int hist_register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { @@ -6062,7 +6062,7 @@ static bool hist_trigger_check_refs(struct event_trigger_data *data, return false; } -static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, +static void hist_unregister_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { @@ -6262,7 +6262,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_free; } - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); se_name = trace_event_name(file->event_call); se = find_synth_event(se_name); if (se) @@ -6271,7 +6271,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops, goto out_free; } - ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + ret = cmd_ops->reg(glob, trigger_data, file); /* * The above returns on success the # of triggers registered, * but if it didn't register any it returns zero. Consider no @@ -6314,7 +6314,7 @@ enable: return ret; out_unreg: - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); out_free: if (cmd_ops->set_filter) cmd_ops->set_filter(NULL, trigger_data, NULL); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 24aceeb50dc0..d40b857db572 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -540,7 +540,6 @@ void update_cond_flag(struct trace_event_file *file) /** * register_trigger - Generic event_command @reg implementation * @glob: The raw string used to register the trigger - * @ops: The trigger ops associated with the trigger * @data: Trigger-specific data to associate with the trigger * @file: The trace_event_file associated with the event * @@ -551,7 +550,7 @@ void update_cond_flag(struct trace_event_file *file) * * Return: 0 on success, errno otherwise */ -static int register_trigger(char *glob, struct event_trigger_ops *ops, +static int register_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { @@ -589,7 +588,6 @@ out: /** * unregister_trigger - Generic event_command @unreg implementation * @glob: The raw string used to register the trigger - * @ops: The trigger ops associated with the trigger * @test: Trigger-specific data used to find the trigger to remove * @file: The trace_event_file associated with the event * @@ -598,7 +596,7 @@ out: * Usually used directly as the @unreg method in event command * implementations. */ -static void unregister_trigger(char *glob, struct event_trigger_ops *ops, +static void unregister_trigger(char *glob, struct event_trigger_data *test, struct trace_event_file *file) { @@ -673,7 +671,7 @@ event_trigger_parse(struct event_command *cmd_ops, INIT_LIST_HEAD(&trigger_data->named_list); if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); kfree(trigger_data); ret = 0; goto out; @@ -708,14 +706,14 @@ event_trigger_parse(struct event_command *cmd_ops, out_reg: /* Up the trigger_data count to make sure reg doesn't free it on failure */ event_trigger_init(trigger_ops, trigger_data); - ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + ret = cmd_ops->reg(glob, trigger_data, file); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. * Consider no functions a failure too. */ if (!ret) { - cmd_ops->unreg(glob, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob, trigger_data, file); ret = -ENOENT; } else if (ret > 0) ret = 0; @@ -1116,14 +1114,14 @@ snapshot_count_trigger(struct event_trigger_data *data, } static int -register_snapshot_trigger(char *glob, struct event_trigger_ops *ops, +register_snapshot_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { if (tracing_alloc_snapshot_instance(file->tr) != 0) return 0; - return register_trigger(glob, ops, data, file); + return register_trigger(glob, data, file); } static int @@ -1455,7 +1453,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, trigger_data->private_data = enable_data; if (glob[0] == '!') { - cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); + cmd_ops->unreg(glob+1, trigger_data, file); kfree(trigger_data); kfree(enable_data); ret = 0; @@ -1502,7 +1500,7 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, ret = trace_event_enable_disable(event_enable_file, 1, 1); if (ret < 0) goto out_put; - ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file); + ret = cmd_ops->reg(glob, trigger_data, file); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. @@ -1532,7 +1530,6 @@ int event_enable_trigger_parse(struct event_command *cmd_ops, } int event_enable_register_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file) { @@ -1574,7 +1571,6 @@ out: } void event_enable_unregister_trigger(char *glob, - struct event_trigger_ops *ops, struct event_trigger_data *test, struct trace_event_file *file) { -- cgit v1.2.3 From 86599dbe2c5272588f859858239d1f52321eb0f9 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 10 Jan 2022 08:04:14 -0600 Subject: tracing: Add helper functions to simplify event_command.parse() callback handling The event_command.parse() callback is responsible for parsing and registering triggers. The existing command implementions for this callback duplicate a lot of the same code, so to clean up and consolidate those implementations, introduce a handful of helper functions for implementors to use. This also makes it easier for new commands to be implemented and allows them to focus more on the customizations they provide rather than obscuring and complicating it with boilerplate code. Link: https://lkml.kernel.org/r/c1ff71f594d45177706571132bd3119491097221.1641823001.git.zanussi@kernel.org Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 24 +++ kernel/trace/trace_events_trigger.c | 342 ++++++++++++++++++++++++++++++++++++ 2 files changed, 366 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 22a1e8635acf..d038ddbf1bea 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1610,6 +1610,30 @@ get_named_trigger_data(struct event_trigger_data *data); extern int register_event_command(struct event_command *cmd); extern int unregister_event_command(struct event_command *cmd); extern int register_trigger_hist_enable_disable_cmds(void); +extern bool event_trigger_check_remove(const char *glob); +extern bool event_trigger_empty_param(const char *param); +extern int event_trigger_separate_filter(char *param_and_filter, char **param, + char **filter, bool param_required); +extern struct event_trigger_data * +event_trigger_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data); +extern int event_trigger_parse_num(char *trigger, + struct event_trigger_data *trigger_data); +extern int event_trigger_set_filter(struct event_command *cmd_ops, + struct trace_event_file *file, + char *param, + struct event_trigger_data *trigger_data); +extern void event_trigger_reset_filter(struct event_command *cmd_ops, + struct event_trigger_data *trigger_data); +extern int event_trigger_register(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + char *cmd, + char *trigger, + struct event_trigger_data *trigger_data, + int *n_registered); /** * struct event_trigger_ops - callbacks for trace event triggers diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index d40b857db572..d00fee705f9c 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -619,6 +619,348 @@ static void unregister_trigger(char *glob, data->ops->free(data->ops, data); } +/* + * Event trigger parsing helper functions. + * + * These functions help make it easier to write an event trigger + * parsing function i.e. the struct event_command.parse() callback + * function responsible for parsing and registering a trigger command + * written to the 'trigger' file. + * + * A trigger command (or just 'trigger' for short) takes the form: + * [trigger] [if filter] + * + * The struct event_command.parse() callback (and other struct + * event_command functions) refer to several components of a trigger + * command. Those same components are referenced by the event trigger + * parsing helper functions defined below. These components are: + * + * cmd - the trigger command name + * glob - the trigger command name optionally prefaced with '!' + * param_and_filter - text following cmd and ':' + * param - text following cmd and ':' and stripped of filter + * filter - the optional filter text following (and including) 'if' + * + * To illustrate the use of these componenents, here are some concrete + * examples. For the following triggers: + * + * echo 'traceon:5 if pid == 0' > trigger + * - 'traceon' is both cmd and glob + * - '5 if pid == 0' is the param_and_filter + * - '5' is the param + * - 'if pid == 0' is the filter + * + * echo 'enable_event:sys:event:n' > trigger + * - 'enable_event' is both cmd and glob + * - 'sys:event:n' is the param_and_filter + * - 'sys:event:n' is the param + * - there is no filter + * + * echo 'hist:keys=pid if prio > 50' > trigger + * - 'hist' is both cmd and glob + * - 'keys=pid if prio > 50' is the param_and_filter + * - 'keys=pid' is the param + * - 'if prio > 50' is the filter + * + * echo '!enable_event:sys:event:n' > trigger + * - 'enable_event' the cmd + * - '!enable_event' is the glob + * - 'sys:event:n' is the param_and_filter + * - 'sys:event:n' is the param + * - there is no filter + * + * echo 'traceoff' > trigger + * - 'traceoff' is both cmd and glob + * - there is no param_and_filter + * - there is no param + * - there is no filter + * + * There are a few different categories of event trigger covered by + * these helpers: + * + * - triggers that don't require a parameter e.g. traceon + * - triggers that do require a parameter e.g. enable_event and hist + * - triggers that though they may not require a param may support an + * optional 'n' param (n = number of times the trigger should fire) + * e.g.: traceon:5 or enable_event:sys:event:n + * - triggers that do not support an 'n' param e.g. hist + * + * These functions can be used or ignored as necessary - it all + * depends on the complexity of the trigger, and the granularity of + * the functions supported reflects the fact that some implementations + * may need to customize certain aspects of their implementations and + * won't need certain functions. For instance, the hist trigger + * implementation doesn't use event_trigger_separate_filter() because + * it has special requirements for handling the filter. + */ + +/** + * event_trigger_check_remove - check whether an event trigger specifies remove + * @glob: The trigger command string, with optional remove(!) operator + * + * The event trigger callback implementations pass in 'glob' as a + * parameter. This is the command name either with or without a + * remove(!) operator. This function simply parses the glob and + * determines whether the command corresponds to a trigger removal or + * a trigger addition. + * + * Return: true if this is a remove command, false otherwise + */ +bool event_trigger_check_remove(const char *glob) +{ + return (glob && glob[0] == '!') ? true : false; +} + +/** + * event_trigger_empty_param - check whether the param is empty + * @param: The trigger param string + * + * The event trigger callback implementations pass in 'param' as a + * parameter. This corresponds to the string following the command + * name minus the command name. This function can be called by a + * callback implementation for any command that requires a param; a + * callback that doesn't require a param can ignore it. + * + * Return: true if this is an empty param, false otherwise + */ +bool event_trigger_empty_param(const char *param) +{ + return !param; +} + +/** + * event_trigger_separate_filter - separate an event trigger from a filter + * @param: The param string containing trigger and possibly filter + * @trigger: outparam, will be filled with a pointer to the trigger + * @filter: outparam, will be filled with a pointer to the filter + * @param_required: Specifies whether or not the param string is required + * + * Given a param string of the form '[trigger] [if filter]', this + * function separates the filter from the trigger and returns the + * trigger in *trigger and the filter in *filter. Either the *trigger + * or the *filter may be set to NULL by this function - if not set to + * NULL, they will contain strings corresponding to the trigger and + * filter. + * + * There are two cases that need to be handled with respect to the + * passed-in param: either the param is required, or it is not + * required. If @param_required is set, and there's no param, it will + * return -EINVAL. If @param_required is not set and there's a param + * that starts with a number, that corresponds to the case of a + * trigger with :n (n = number of times the trigger should fire) and + * the parsing continues normally; otherwise the function just returns + * and assumes param just contains a filter and there's nothing else + * to do. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_separate_filter(char *param_and_filter, char **param, + char **filter, bool param_required) +{ + int ret = 0; + + *param = *filter = NULL; + + if (!param_and_filter) { + if (param_required) + ret = -EINVAL; + goto out; + } + + /* + * Here we check for an optional param. The only legal + * optional param is :n, and if that's the case, continue + * below. Otherwise we assume what's left is a filter and + * return it as the filter string for the caller to deal with. + */ + if (!param_required && param_and_filter && !isdigit(param_and_filter[0])) { + *filter = param_and_filter; + goto out; + } + + /* + * Separate the param from the filter (param [if filter]). + * Here we have either an optional :n param or a required + * param and an optional filter. + */ + *param = strsep(¶m_and_filter, " \t"); + + /* + * Here we have a filter, though it may be empty. + */ + if (param_and_filter) { + *filter = skip_spaces(param_and_filter); + if (!**filter) + *filter = NULL; + } +out: + return ret; +} + +/** + * event_trigger_alloc - allocate and init event_trigger_data for a trigger + * @cmd_ops: The event_command operations for the trigger + * @cmd: The cmd string + * @param: The param string + * @private_data: User data to associate with the event trigger + * + * Allocate an event_trigger_data instance and initialize it. The + * @cmd_ops are used along with the @cmd and @param to get the + * trigger_ops to assign to the event_trigger_data. @private_data can + * also be passed in and associated with the event_trigger_data. + * + * Use event_trigger_free() to free an event_trigger_data object. + * + * Return: The trigger_data object success, NULL otherwise + */ +struct event_trigger_data *event_trigger_alloc(struct event_command *cmd_ops, + char *cmd, + char *param, + void *private_data) +{ + struct event_trigger_data *trigger_data; + struct event_trigger_ops *trigger_ops; + + trigger_ops = cmd_ops->get_trigger_ops(cmd, param); + + trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); + if (!trigger_data) + return NULL; + + trigger_data->count = -1; + trigger_data->ops = trigger_ops; + trigger_data->cmd_ops = cmd_ops; + trigger_data->private_data = private_data; + + INIT_LIST_HEAD(&trigger_data->list); + INIT_LIST_HEAD(&trigger_data->named_list); + RCU_INIT_POINTER(trigger_data->filter, NULL); + + return trigger_data; +} + +/** + * event_trigger_parse_num - parse and return the number param for a trigger + * @param: The param string + * @trigger_data: The trigger_data for the trigger + * + * Parse the :n (n = number of times the trigger should fire) param + * and set the count variable in the trigger_data to the parsed count. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_parse_num(char *param, + struct event_trigger_data *trigger_data) +{ + char *number; + int ret = 0; + + if (param) { + number = strsep(¶m, ":"); + + if (!strlen(number)) + return -EINVAL; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &trigger_data->count); + } + + return ret; +} + +/** + * event_trigger_set_filter - set an event trigger's filter + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @param: The string containing the filter + * @trigger_data: The trigger_data for the trigger + * + * Set the filter for the trigger. If the filter is NULL, just return + * without error. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_set_filter(struct event_command *cmd_ops, + struct trace_event_file *file, + char *param, + struct event_trigger_data *trigger_data) +{ + if (param && cmd_ops->set_filter) + return cmd_ops->set_filter(param, trigger_data, file); + + return 0; +} + +/** + * event_trigger_reset_filter - reset an event trigger's filter + * @cmd_ops: The event_command operations for the trigger + * @trigger_data: The trigger_data for the trigger + * + * Reset the filter for the trigger to no filter. + */ +void event_trigger_reset_filter(struct event_command *cmd_ops, + struct event_trigger_data *trigger_data) +{ + if (cmd_ops->set_filter) + cmd_ops->set_filter(NULL, trigger_data, NULL); +} + +/** + * event_trigger_register - register an event trigger + * @cmd_ops: The event_command operations for the trigger + * @file: The event file for the trigger's event + * @glob: The trigger command string, with optional remove(!) operator + * @cmd: The cmd string + * @param: The param string + * @trigger_data: The trigger_data for the trigger + * @n_registered: optional outparam, the number of triggers registered + * + * Register an event trigger. The @cmd_ops are used to call the + * cmd_ops->reg() function which actually does the registration. The + * cmd_ops->reg() function returns the number of triggers registered, + * which is assigned to n_registered, if n_registered is non-NULL. + * + * Return: 0 on success, errno otherwise + */ +int event_trigger_register(struct event_command *cmd_ops, + struct trace_event_file *file, + char *glob, + char *cmd, + char *param, + struct event_trigger_data *trigger_data, + int *n_registered) +{ + int ret; + + if (n_registered) + *n_registered = 0; + + ret = cmd_ops->reg(glob, trigger_data, file); + /* + * The above returns on success the # of functions enabled, + * but if it didn't find any functions it returns zero. + * Consider no functions a failure too. + */ + if (!ret) { + cmd_ops->unreg(glob, trigger_data, file); + ret = -ENOENT; + } else if (ret > 0) { + if (n_registered) + *n_registered = ret; + /* Just return zero, not the number of enabled functions */ + ret = 0; + } + + return ret; +} + +/* + * End event trigger parsing helper functions. + */ + /** * event_trigger_parse - Generic event_command @parse implementation * @cmd_ops: The command ops, used for trigger registration -- cgit v1.2.3 From 74a5257a0c175810d620b5e631c4e7554955ac25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jan 2022 19:12:45 +0100 Subject: genirq/msi: Populate sysfs entry only once The MSI entries for multi-MSI are populated en bloc for the MSI descriptor, but the current code invokes the population inside the per interrupt loop which triggers a warning in the sysfs code and causes the interrupt allocation to fail. Move it outside of the loop so it works correctly for single and multi-MSI. Fixes: bf5e758f02fc ("genirq/msi: Simplify sysfs handling") Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Borislav Petkov Link: https://lore.kernel.org/r/87leznqx2a.ffs@tglx --- kernel/irq/msi.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 173bc04f9fe5..2bdfce5edafd 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -887,12 +887,11 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, ret = msi_init_virq(domain, virq + i, vflags); if (ret) return ret; - - if (info->flags & MSI_FLAG_DEV_SYSFS) { - ret = msi_sysfs_populate_desc(dev, desc); - if (ret) - return ret; - } + } + if (info->flags & MSI_FLAG_DEV_SYSFS) { + ret = msi_sysfs_populate_desc(dev, desc); + if (ret) + return ret; } allocated++; } -- cgit v1.2.3 From 1e9d74660d4df625b0889e77018f9e94727ceacd Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 8 Jan 2022 13:46:23 +0000 Subject: bpf: Fix mount source show for bpffs We noticed our tc ebpf tools can't start after we upgrade our in-house kernel version from 4.19 to 5.10. That is because of the behaviour change in bpffs caused by commit d2935de7e4fd ("vfs: Convert bpf to use the new mount API"). In our tc ebpf tools, we do strict environment check. If the environment is not matched, we won't allow to start the ebpf progs. One of the check is whether bpffs is properly mounted. The mount information of bpffs in kernel-4.19 and kernel-5.10 are as follows: - kernel 4.19 $ mount -t bpf bpffs /sys/fs/bpf $ mount -t bpf bpffs on /sys/fs/bpf type bpf (rw,relatime) - kernel 5.10 $ mount -t bpf bpffs /sys/fs/bpf $ mount -t bpf none on /sys/fs/bpf type bpf (rw,relatime) The device name in kernel-5.10 is displayed as none instead of bpffs, then our environment check fails. Currently we modify the tools to adopt to the kernel behaviour change, but I think we'd better change the kernel code to keep the behavior consistent. After this change, the mount information will be displayed the same with the behavior in kernel-4.19, for example: $ mount -t bpf bpffs /sys/fs/bpf $ mount -t bpf bpffs on /sys/fs/bpf type bpf (rw,relatime) Fixes: d2935de7e4fd ("vfs: Convert bpf to use the new mount API") Suggested-by: Daniel Borkmann Signed-off-by: Yafang Shao Signed-off-by: Daniel Borkmann Acked-by: Christian Brauner Cc: David Howells Cc: Al Viro Link: https://lore.kernel.org/bpf/20220108134623.32467-1-laoar.shao@gmail.com --- kernel/bpf/inode.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 80da1db47c68..5a8d9f7467bf 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -648,12 +648,22 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) int opt; opt = fs_parse(fc, bpf_fs_parameters, param, &result); - if (opt < 0) + if (opt < 0) { /* We might like to report bad mount options here, but * traditionally we've ignored all mount options, so we'd * better continue to ignore non-existing options for bpf. */ - return opt == -ENOPARAM ? 0 : opt; + if (opt == -ENOPARAM) { + opt = vfs_parse_fs_param_source(fc, param); + if (opt != -ENOPARAM) + return opt; + + return 0; + } + + if (opt < 0) + return opt; + } switch (opt) { case OPT_MODE: -- cgit v1.2.3 From 343e53754b21ae45530623222aa079fecd3cf942 Mon Sep 17 00:00:00 2001 From: Christy Lee Date: Fri, 7 Jan 2022 16:58:54 -0800 Subject: bpf: Fix incorrect integer literal used for marking scratched stack. env->scratched_stack_slots is a 64-bit value, we should use ULL instead of UL literal values. Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Christy Lee Acked-by: Song Liu Link: https://lore.kernel.org/r/20220108005854.658596-1-christylee@fb.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bfb45381fb3f..a8587210907d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -616,7 +616,7 @@ static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) { - env->scratched_stack_slots |= 1UL << spi; + env->scratched_stack_slots |= 1ULL << spi; } static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) @@ -637,14 +637,14 @@ static bool verifier_state_scratched(const struct bpf_verifier_env *env) static void mark_verifier_state_clean(struct bpf_verifier_env *env) { env->scratched_regs = 0U; - env->scratched_stack_slots = 0UL; + env->scratched_stack_slots = 0ULL; } /* Used for printing the entire verifier state. */ static void mark_verifier_state_scratched(struct bpf_verifier_env *env) { env->scratched_regs = ~0U; - env->scratched_stack_slots = ~0UL; + env->scratched_stack_slots = ~0ULL; } /* The reg state of a pointer or a bounded scalar was saved when -- cgit v1.2.3 From 9dc3c3f691bca10d3aa94887eee33bf629840b23 Mon Sep 17 00:00:00 2001 From: Yu Chen Date: Mon, 22 Nov 2021 06:26:48 -0800 Subject: module: Remove outdated comment Since commit e513cc1c07e2 ("module: Remove stop_machine from module unloading") this comment is no longer correct. Remove it. Signed-off-by: Yu Chen Signed-off-by: Luis Chamberlain --- kernel/module.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 84a9141a5e15..320ec908045f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -958,7 +958,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, } } - /* Stop the machine so refcounts can't move and disable module. */ ret = try_stop_module(mod, flags, &forced); if (ret != 0) goto out; -- cgit v1.2.3 From b1ae6dc41eaaa98bb75671e0f3665bfda248c3e7 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 5 Jan 2022 13:55:12 -0800 Subject: module: add in-kernel support for decompressing Current scheme of having userspace decompress kernel modules before loading them into the kernel runs afoul of LoadPin security policy, as it loses link between the source of kernel module on the disk and binary blob that is being loaded into the kernel. To solve this issue let's implement decompression in kernel, so that we can pass a file descriptor of compressed module file into finit_module() which will keep LoadPin happy. To let userspace know what compression/decompression scheme kernel supports it will create /sys/module/compression attribute. kmod can read this attribute and decide if it can pass compressed file to finit_module(). New MODULE_INIT_COMPRESSED_DATA flag indicates that the kernel should attempt to decompress the data read from file descriptor prior to trying load the module. To simplify things kernel will only implement single decompression method matching compression method selected when generating modules. This patch implements gzip and xz; more can be added later, Signed-off-by: Dmitry Torokhov Signed-off-by: Luis Chamberlain --- include/uapi/linux/module.h | 1 + init/Kconfig | 13 +++ kernel/Makefile | 1 + kernel/module-internal.h | 19 ++++ kernel/module.c | 35 ++++-- kernel/module_decompress.c | 271 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 329 insertions(+), 11 deletions(-) create mode 100644 kernel/module_decompress.c (limited to 'kernel') diff --git a/include/uapi/linux/module.h b/include/uapi/linux/module.h index 50d98ec5e866..03a33ffffcba 100644 --- a/include/uapi/linux/module.h +++ b/include/uapi/linux/module.h @@ -5,5 +5,6 @@ /* Flags for sys_finit_module: */ #define MODULE_INIT_IGNORE_MODVERSIONS 1 #define MODULE_INIT_IGNORE_VERMAGIC 2 +#define MODULE_INIT_COMPRESSED_FILE 4 #endif /* _UAPI_LINUX_MODULE_H */ diff --git a/init/Kconfig b/init/Kconfig index f2ae41e6717f..faf3a4b5cc8f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2274,6 +2274,19 @@ config MODULE_COMPRESS_ZSTD endchoice +config MODULE_DECOMPRESS + bool "Support in-kernel module decompression" + depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ + select ZLIB_INFLATE if MODULE_COMPRESS_GZIP + select XZ_DEC if MODULE_COMPRESS_XZ + help + + Support for decompressing kernel modules by the kernel itself + instead of relying on userspace to perform this task. Useful when + load pinning security policy is enabled. + + If unsure, say N. + config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS bool "Allow loading of modules with missing namespace imports" help diff --git a/kernel/Makefile b/kernel/Makefile index 186c49582f45..56f4ee97f328 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -67,6 +67,7 @@ obj-y += up.o endif obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_MODULE_DECOMPRESS) += module_decompress.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 33783abc377b..8c381c99062f 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h @@ -22,6 +22,11 @@ struct load_info { bool sig_ok; #ifdef CONFIG_KALLSYMS unsigned long mod_kallsyms_init_off; +#endif +#ifdef CONFIG_MODULE_DECOMPRESS + struct page **pages; + unsigned int max_pages; + unsigned int used_pages; #endif struct { unsigned int sym, str, mod, vers, info, pcpu; @@ -29,3 +34,17 @@ struct load_info { }; extern int mod_verify_sig(const void *mod, struct load_info *info); + +#ifdef CONFIG_MODULE_DECOMPRESS +int module_decompress(struct load_info *info, const void *buf, size_t size); +void module_decompress_cleanup(struct load_info *info); +#else +static inline int module_decompress(struct load_info *info, + const void *buf, size_t size) +{ + return -EOPNOTSUPP; +} +static inline void module_decompress_cleanup(struct load_info *info) +{ +} +#endif diff --git a/kernel/module.c b/kernel/module.c index 320ec908045f..34fe2824eb56 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3173,9 +3173,12 @@ out: return err; } -static void free_copy(struct load_info *info) +static void free_copy(struct load_info *info, int flags) { - vfree(info->hdr); + if (flags & MODULE_INIT_COMPRESSED_FILE) + module_decompress_cleanup(info); + else + vfree(info->hdr); } static int rewrite_section_headers(struct load_info *info, int flags) @@ -4124,7 +4127,7 @@ static int load_module(struct load_info *info, const char __user *uargs, } /* Get rid of temporary copy. */ - free_copy(info); + free_copy(info, flags); /* Done! */ trace_module_load(mod); @@ -4173,7 +4176,7 @@ static int load_module(struct load_info *info, const char __user *uargs, module_deallocate(mod, info); free_copy: - free_copy(info); + free_copy(info, flags); return err; } @@ -4200,7 +4203,8 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { struct load_info info = { }; - void *hdr = NULL; + void *buf = NULL; + int len; int err; err = may_init_module(); @@ -4210,15 +4214,24 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS - |MODULE_INIT_IGNORE_VERMAGIC)) + |MODULE_INIT_IGNORE_VERMAGIC + |MODULE_INIT_COMPRESSED_FILE)) return -EINVAL; - err = kernel_read_file_from_fd(fd, 0, &hdr, INT_MAX, NULL, + len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL, READING_MODULE); - if (err < 0) - return err; - info.hdr = hdr; - info.len = err; + if (len < 0) + return len; + + if (flags & MODULE_INIT_COMPRESSED_FILE) { + err = module_decompress(&info, buf, len); + vfree(buf); /* compressed data is no longer needed */ + if (err) + return err; + } else { + info.hdr = buf; + info.len = len; + } return load_module(&info, uargs, flags); } diff --git a/kernel/module_decompress.c b/kernel/module_decompress.c new file mode 100644 index 000000000000..aeefd95a3337 --- /dev/null +++ b/kernel/module_decompress.c @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright 2021 Google LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "module-internal.h" + +static int module_extend_max_pages(struct load_info *info, unsigned int extent) +{ + struct page **new_pages; + + new_pages = kvmalloc_array(info->max_pages + extent, + sizeof(info->pages), GFP_KERNEL); + if (!new_pages) + return -ENOMEM; + + memcpy(new_pages, info->pages, info->max_pages * sizeof(info->pages)); + kvfree(info->pages); + info->pages = new_pages; + info->max_pages += extent; + + return 0; +} + +static struct page *module_get_next_page(struct load_info *info) +{ + struct page *page; + int error; + + if (info->max_pages == info->used_pages) { + error = module_extend_max_pages(info, info->used_pages); + if (error) + return ERR_PTR(error); + } + + page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!page) + return ERR_PTR(-ENOMEM); + + info->pages[info->used_pages++] = page; + return page; +} + +#ifdef CONFIG_MODULE_COMPRESS_GZIP +#include +#define MODULE_COMPRESSION gzip +#define MODULE_DECOMPRESS_FN module_gzip_decompress + +/* + * Calculate length of the header which consists of signature, header + * flags, time stamp and operating system ID (10 bytes total), plus + * an optional filename. + */ +static size_t module_gzip_header_len(const u8 *buf, size_t size) +{ + const u8 signature[] = { 0x1f, 0x8b, 0x08 }; + size_t len = 10; + + if (size < len || memcmp(buf, signature, sizeof(signature))) + return 0; + + if (buf[3] & 0x08) { + do { + /* + * If we can't find the end of the file name we must + * be dealing with a corrupted file. + */ + if (len == size) + return 0; + } while (buf[len++] != '\0'); + } + + return len; +} + +static ssize_t module_gzip_decompress(struct load_info *info, + const void *buf, size_t size) +{ + struct z_stream_s s = { 0 }; + size_t new_size = 0; + size_t gzip_hdr_len; + ssize_t retval; + int rc; + + gzip_hdr_len = module_gzip_header_len(buf, size); + if (!gzip_hdr_len) { + pr_err("not a gzip compressed module\n"); + return -EINVAL; + } + + s.next_in = buf + gzip_hdr_len; + s.avail_in = size - gzip_hdr_len; + + s.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); + if (!s.workspace) + return -ENOMEM; + + rc = zlib_inflateInit2(&s, -MAX_WBITS); + if (rc != Z_OK) { + pr_err("failed to initialize decompresser: %d\n", rc); + retval = -EINVAL; + goto out; + } + + do { + struct page *page = module_get_next_page(info); + if (!page) { + retval = -ENOMEM; + goto out_inflate_end; + } + + s.next_out = kmap(page); + s.avail_out = PAGE_SIZE; + rc = zlib_inflate(&s, 0); + kunmap(page); + + new_size += PAGE_SIZE - s.avail_out; + } while (rc == Z_OK); + + if (rc != Z_STREAM_END) { + pr_err("decompression failed with status %d\n", rc); + retval = -EINVAL; + goto out_inflate_end; + } + + retval = new_size; + +out_inflate_end: + zlib_inflateEnd(&s); +out: + kfree(s.workspace); + return retval; +} +#elif CONFIG_MODULE_COMPRESS_XZ +#include +#define MODULE_COMPRESSION xz +#define MODULE_DECOMPRESS_FN module_xz_decompress + +static ssize_t module_xz_decompress(struct load_info *info, + const void *buf, size_t size) +{ + static const u8 signature[] = { 0xfd, '7', 'z', 'X', 'Z', 0 }; + struct xz_dec *xz_dec; + struct xz_buf xz_buf; + enum xz_ret xz_ret; + size_t new_size = 0; + ssize_t retval; + + if (size < sizeof(signature) || + memcmp(buf, signature, sizeof(signature))) { + pr_err("not an xz compressed module\n"); + return -EINVAL; + } + + xz_dec = xz_dec_init(XZ_DYNALLOC, (u32)-1); + if (!xz_dec) + return -ENOMEM; + + xz_buf.in_size = size; + xz_buf.in = buf; + xz_buf.in_pos = 0; + + do { + struct page *page = module_get_next_page(info); + if (!page) { + retval = -ENOMEM; + goto out; + } + + xz_buf.out = kmap(page); + xz_buf.out_pos = 0; + xz_buf.out_size = PAGE_SIZE; + xz_ret = xz_dec_run(xz_dec, &xz_buf); + kunmap(page); + + new_size += xz_buf.out_pos; + } while (xz_buf.out_pos == PAGE_SIZE && xz_ret == XZ_OK); + + if (xz_ret != XZ_STREAM_END) { + pr_err("decompression failed with status %d\n", xz_ret); + retval = -EINVAL; + goto out; + } + + retval = new_size; + + out: + xz_dec_end(xz_dec); + return retval; +} +#else +#error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS" +#endif + +int module_decompress(struct load_info *info, const void *buf, size_t size) +{ + unsigned int n_pages; + ssize_t data_size; + int error; + + /* + * Start with number of pages twice as big as needed for + * compressed data. + */ + n_pages = DIV_ROUND_UP(size, PAGE_SIZE) * 2; + error = module_extend_max_pages(info, n_pages); + + data_size = MODULE_DECOMPRESS_FN(info, buf, size); + if (data_size < 0) { + error = data_size; + goto err; + } + + info->hdr = vmap(info->pages, info->used_pages, VM_MAP, PAGE_KERNEL); + if (!info->hdr) { + error = -ENOMEM; + goto err; + } + + info->len = data_size; + return 0; + +err: + module_decompress_cleanup(info); + return error; +} + +void module_decompress_cleanup(struct load_info *info) +{ + int i; + + if (info->hdr) + vunmap(info->hdr); + + for (i = 0; i < info->used_pages; i++) + __free_page(info->pages[i]); + + kvfree(info->pages); + + info->pages = NULL; + info->max_pages = info->used_pages = 0; +} + +static ssize_t compression_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION)); +} +static struct kobj_attribute module_compression_attr = __ATTR_RO(compression); + +static int __init module_decompress_sysfs_init(void) +{ + int error; + + error = sysfs_create_file(&module_kset->kobj, + &module_compression_attr.attr); + if (error) + pr_warn("Failed to create 'compression' attribute"); + + return 0; +} +late_initcall(module_decompress_sysfs_init); -- cgit v1.2.3 From 285ac8dca4df48e9a29fcc1c7f27602e1299a819 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 13 Jan 2022 09:24:17 +0000 Subject: kernel: Fix spelling mistake "compresser" -> "compressor" There is a spelling mistake in a pr_err error message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Luis Chamberlain --- kernel/module_decompress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module_decompress.c b/kernel/module_decompress.c index aeefd95a3337..b01c69c2ff99 100644 --- a/kernel/module_decompress.c +++ b/kernel/module_decompress.c @@ -106,7 +106,7 @@ static ssize_t module_gzip_decompress(struct load_info *info, rc = zlib_inflateInit2(&s, -MAX_WBITS); if (rc != Z_OK) { - pr_err("failed to initialize decompresser: %d\n", rc); + pr_err("failed to initialize decompressor: %d\n", rc); retval = -EINVAL; goto out; } -- cgit v1.2.3 From 289e7b0f7eb47b87a0441e6c81336316f301eb39 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 13 Dec 2021 11:08:53 +0100 Subject: tracing: Account bottom half disabled sections. Disabling only bottom halves via local_bh_disable() disables also preemption but this remains invisible to tracing. On a CONFIG_PREEMPT kernel one might wonder why there is no scheduling happening despite the N flag in the trace. The reason might be the a rcu_read_lock_bh() section. Add a 'b' to the tracing output if in task context with disabled bottom halves. Link: https://lkml.kernel.org/r/YbcbtdtC/bjCKo57@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Steven Rostedt --- include/linux/trace_events.h | 1 + kernel/trace/trace.c | 6 ++++-- kernel/trace/trace_output.c | 4 ++++ 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 3900404aa063..70c069aef02c 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -172,6 +172,7 @@ enum trace_flag_type { TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, TRACE_FLAG_NMI = 0x40, + TRACE_FLAG_BH_OFF = 0x80, }; #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 547d82628c2e..a73d78dcda2c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2603,6 +2603,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) trace_flags |= TRACE_FLAG_HARDIRQ; if (in_serving_softirq()) trace_flags |= TRACE_FLAG_SOFTIRQ; + if (softirq_count() >> (SOFTIRQ_SHIFT + 1)) + trace_flags |= TRACE_FLAG_BH_OFF; if (tif_need_resched()) trace_flags |= TRACE_FLAG_NEED_RESCHED; @@ -4190,7 +4192,7 @@ unsigned long trace_total_entries(struct trace_array *tr) static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n" - "# / _-----=> irqs-off \n" + "# / _-----=> irqs-off/BH-disabled\n" "# | / _----=> need-resched \n" "# || / _---=> hardirq/softirq \n" "# ||| / _--=> preempt-depth \n" @@ -4231,7 +4233,7 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file print_event_info(buf, m); - seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s _-----=> irqs-off/BH-disabled\n", prec, space); seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3547e7176ff7..8aa493d25c73 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -445,14 +445,18 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) char irqs_off; int hardirq; int softirq; + int bh_off; int nmi; nmi = entry->flags & TRACE_FLAG_NMI; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + bh_off = entry->flags & TRACE_FLAG_BH_OFF; irqs_off = + (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + bh_off ? 'b' : (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.'; -- cgit v1.2.3 From 8c7224245557707c613f130431cafbaaa4889615 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Tue, 14 Dec 2021 09:28:02 +0800 Subject: tracing/uprobes: Check the return value of kstrdup() for tu->filename kstrdup() returns NULL when some internal memory errors happen, it is better to check the return value of it so to catch the memory error in time. Link: https://lkml.kernel.org/r/tencent_3C2E330722056D7891D2C83F29C802734B06@qq.com Acked-by: Masami Hiramatsu Fixes: 33ea4b24277b ("perf/core: Implement the 'perf_uprobe' PMU") Signed-off-by: Xiaoke Wang Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3bd09d612137..08b0e8417302 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1609,6 +1609,11 @@ create_local_trace_uprobe(char *name, unsigned long offs, tu->path = path; tu->ref_ctr_offset = ref_ctr_offset; tu->filename = kstrdup(name, GFP_KERNEL); + if (!tu->filename) { + ret = -ENOMEM; + goto error; + } + init_trace_event_call(tu); ptype = is_ret_probe(tu) ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL; -- cgit v1.2.3 From 1c1857d400355e96f0fe8b32adc6fa7594d03b52 Mon Sep 17 00:00:00 2001 From: Xiaoke Wang Date: Tue, 14 Dec 2021 10:26:46 +0800 Subject: tracing/probes: check the return value of kstrndup() for pbuf kstrndup() is a memory allocation-related function, it returns NULL when some internal memory errors happen. It is better to check the return value of it so to catch the memory error in time. Link: https://lkml.kernel.org/r/tencent_4D6E270731456EB88712ED7F13883C334906@qq.com Acked-by: Masami Hiramatsu Fixes: a42e3c4de964 ("tracing/probe: Add immediate string parameter support") Signed-off-by: Xiaoke Wang Signed-off-by: Steven Rostedt --- kernel/trace/trace_probe.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8a3822818bf8..73d90179b51b 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -356,6 +356,8 @@ static int __parse_imm_string(char *str, char **pbuf, int offs) return -EINVAL; } *pbuf = kstrndup(str, len - 1, GFP_KERNEL); + if (!*pbuf) + return -ENOMEM; return 0; } -- cgit v1.2.3 From 72b3942a173c387b27860ba1069636726e208777 Mon Sep 17 00:00:00 2001 From: Yinan Liu Date: Sun, 12 Dec 2021 19:33:58 +0800 Subject: scripts: ftrace - move the sort-processing in ftrace_init When the kernel starts, the initialization of ftrace takes up a portion of the time (approximately 6~8ms) to sort mcount addresses. We can save this time by moving mcount-sorting to compile time. Link: https://lkml.kernel.org/r/20211212113358.34208-2-yinan@linux.alibaba.com Signed-off-by: Yinan Liu Reported-by: kernel test robot Reported-by: kernel test robot Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 11 ++++- scripts/Makefile | 6 ++- scripts/link-vmlinux.sh | 6 +-- scripts/sorttable.c | 2 + scripts/sorttable.h | 120 +++++++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 137 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 30bc880c3849..9ca63df6553a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6406,8 +6406,15 @@ static int ftrace_process_locs(struct module *mod, if (!count) return 0; - sort(start, count, sizeof(*start), - ftrace_cmp_ips, NULL); + /* + * Sorting mcount in vmlinux at build time depend on + * CONFIG_BUILDTIME_TABLE_SORT, while mcount loc in + * modules can not be sorted at build time. + */ + if (!IS_ENABLED(CONFIG_BUILDTIME_TABLE_SORT) || mod) { + sort(start, count, sizeof(*start), + ftrace_cmp_ips, NULL); + } start_pg = ftrace_allocate_pages(count); if (!start_pg) diff --git a/scripts/Makefile b/scripts/Makefile index 9adb6d247818..b082d2f93357 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -17,6 +17,7 @@ hostprogs-always-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert hostprogs-always-$(CONFIG_SYSTEM_REVOCATION_LIST) += extract-cert HOSTCFLAGS_sorttable.o = -I$(srctree)/tools/include +HOSTLDLIBS_sorttable = -lpthread HOSTCFLAGS_asn1_compiler.o = -I$(srctree)/include HOSTCFLAGS_sign-file.o = $(CRYPTO_CFLAGS) HOSTLDLIBS_sign-file = $(CRYPTO_LIBS) @@ -29,7 +30,10 @@ ARCH := x86 endif HOSTCFLAGS_sorttable.o += -I$(srctree)/tools/arch/x86/include HOSTCFLAGS_sorttable.o += -DUNWINDER_ORC_ENABLED -HOSTLDLIBS_sorttable = -lpthread +endif + +ifdef CONFIG_DYNAMIC_FTRACE +HOSTCFLAGS_sorttable.o += -DMCOUNT_SORT_ENABLED endif # The following programs are only built on demand diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 5cdd9bc5c385..dd9955f45774 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -400,6 +400,9 @@ if [ -n "${CONFIG_DEBUG_INFO_BTF}" -a -n "${CONFIG_BPF}" ]; then ${RESOLVE_BTFIDS} vmlinux fi +info SYSMAP System.map +mksysmap vmlinux System.map + if [ -n "${CONFIG_BUILDTIME_TABLE_SORT}" ]; then info SORTTAB vmlinux if ! sorttable vmlinux; then @@ -408,9 +411,6 @@ if [ -n "${CONFIG_BUILDTIME_TABLE_SORT}" ]; then fi fi -info SYSMAP System.map -mksysmap vmlinux System.map - # step a (see comment above) if [ -n "${CONFIG_KALLSYMS}" ]; then mksysmap ${kallsyms_vmlinux} .tmp_System.map diff --git a/scripts/sorttable.c b/scripts/sorttable.c index b7c2ad71f9cf..70bdc787ddfb 100644 --- a/scripts/sorttable.c +++ b/scripts/sorttable.c @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include #include diff --git a/scripts/sorttable.h b/scripts/sorttable.h index 7b9745cf8c70..1e8b77928fa4 100644 --- a/scripts/sorttable.h +++ b/scripts/sorttable.h @@ -19,6 +19,9 @@ #undef extable_ent_size #undef compare_extable +#undef get_mcount_loc +#undef sort_mcount_loc +#undef elf_mcount_loc #undef do_sort #undef Elf_Addr #undef Elf_Ehdr @@ -41,6 +44,9 @@ #ifdef SORTTABLE_64 # define extable_ent_size 16 # define compare_extable compare_extable_64 +# define get_mcount_loc get_mcount_loc_64 +# define sort_mcount_loc sort_mcount_loc_64 +# define elf_mcount_loc elf_mcount_loc_64 # define do_sort do_sort_64 # define Elf_Addr Elf64_Addr # define Elf_Ehdr Elf64_Ehdr @@ -62,6 +68,9 @@ #else # define extable_ent_size 8 # define compare_extable compare_extable_32 +# define get_mcount_loc get_mcount_loc_32 +# define sort_mcount_loc sort_mcount_loc_32 +# define elf_mcount_loc elf_mcount_loc_32 # define do_sort do_sort_32 # define Elf_Addr Elf32_Addr # define Elf_Ehdr Elf32_Ehdr @@ -84,8 +93,6 @@ #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) /* ORC unwinder only support X86_64 */ -#include -#include #include #define ERRSTR_MAXSZ 256 @@ -191,7 +198,64 @@ static int compare_extable(const void *a, const void *b) return 1; return 0; } +#ifdef MCOUNT_SORT_ENABLED +struct elf_mcount_loc { + Elf_Ehdr *ehdr; + Elf_Shdr *init_data_sec; + uint_t start_mcount_loc; + uint_t stop_mcount_loc; +}; + +/* Sort the addresses stored between __start_mcount_loc to __stop_mcount_loc in vmlinux */ +static void *sort_mcount_loc(void *arg) +{ + struct elf_mcount_loc *emloc = (struct elf_mcount_loc *)arg; + uint_t offset = emloc->start_mcount_loc - _r(&(emloc->init_data_sec)->sh_addr) + + _r(&(emloc->init_data_sec)->sh_offset); + uint_t count = emloc->stop_mcount_loc - emloc->start_mcount_loc; + unsigned char *start_loc = (void *)emloc->ehdr + offset; + + qsort(start_loc, count/sizeof(uint_t), sizeof(uint_t), compare_extable); + return NULL; +} + +/* Get the address of __start_mcount_loc and __stop_mcount_loc in System.map */ +static void get_mcount_loc(uint_t *_start, uint_t *_stop) +{ + FILE *file_start, *file_stop; + char start_buff[20]; + char stop_buff[20]; + int len = 0; + + file_start = popen(" grep start_mcount System.map | awk '{print $1}' ", "r"); + if (!file_start) { + fprintf(stderr, "get start_mcount_loc error!"); + return; + } + + file_stop = popen(" grep stop_mcount System.map | awk '{print $1}' ", "r"); + if (!file_stop) { + fprintf(stderr, "get stop_mcount_loc error!"); + pclose(file_start); + return; + } + + while (fgets(start_buff, sizeof(start_buff), file_start) != NULL) { + len = strlen(start_buff); + start_buff[len - 1] = '\0'; + } + *_start = strtoul(start_buff, NULL, 16); + + while (fgets(stop_buff, sizeof(stop_buff), file_stop) != NULL) { + len = strlen(stop_buff); + stop_buff[len - 1] = '\0'; + } + *_stop = strtoul(stop_buff, NULL, 16); + pclose(file_start); + pclose(file_stop); +} +#endif static int do_sort(Elf_Ehdr *ehdr, char const *const fname, table_sort_t custom_sort) @@ -217,6 +281,12 @@ static int do_sort(Elf_Ehdr *ehdr, int idx; unsigned int shnum; unsigned int shstrndx; +#ifdef MCOUNT_SORT_ENABLED + struct elf_mcount_loc mstruct; + uint_t _start_mcount_loc = 0; + uint_t _stop_mcount_loc = 0; + pthread_t mcount_sort_thread; +#endif #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) unsigned int orc_ip_size = 0; unsigned int orc_size = 0; @@ -253,6 +323,17 @@ static int do_sort(Elf_Ehdr *ehdr, symtab_shndx = (Elf32_Word *)((const char *)ehdr + _r(&s->sh_offset)); +#ifdef MCOUNT_SORT_ENABLED + /* locate the .init.data section in vmlinux */ + if (!strcmp(secstrings + idx, ".init.data")) { + get_mcount_loc(&_start_mcount_loc, &_stop_mcount_loc); + mstruct.ehdr = ehdr; + mstruct.init_data_sec = s; + mstruct.start_mcount_loc = _start_mcount_loc; + mstruct.stop_mcount_loc = _stop_mcount_loc; + } +#endif + #if defined(SORTTABLE_64) && defined(UNWINDER_ORC_ENABLED) /* locate the ORC unwind tables */ if (!strcmp(secstrings + idx, ".orc_unwind_ip")) { @@ -294,6 +375,23 @@ static int do_sort(Elf_Ehdr *ehdr, goto out; } #endif + +#ifdef MCOUNT_SORT_ENABLED + if (!mstruct.init_data_sec || !_start_mcount_loc || !_stop_mcount_loc) { + fprintf(stderr, + "incomplete mcount's sort in file: %s\n", + fname); + goto out; + } + + /* create thread to sort mcount_loc concurrently */ + if (pthread_create(&mcount_sort_thread, NULL, &sort_mcount_loc, &mstruct)) { + fprintf(stderr, + "pthread_create mcount_sort_thread failed '%s': %s\n", + strerror(errno), fname); + goto out; + } +#endif if (!extab_sec) { fprintf(stderr, "no __ex_table in file: %s\n", fname); goto out; @@ -376,5 +474,23 @@ out: } } #endif + +#ifdef MCOUNT_SORT_ENABLED + if (mcount_sort_thread) { + void *retval = NULL; + /* wait for mcount sort done */ + rc = pthread_join(mcount_sort_thread, &retval); + if (rc) { + fprintf(stderr, + "pthread_join failed '%s': %s\n", + strerror(errno), fname); + } else if (retval) { + rc = -1; + fprintf(stderr, + "failed to sort mcount '%s': %s\n", + (char *)retval, fname); + } + } +#endif return rc; } -- cgit v1.2.3 From 8147dc78e6e4b645f8277bdf377f2193ddfcdee1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 6 Dec 2021 15:18:58 -0500 Subject: ftrace: Add test to make sure compiled time sorts work Now that ftrace function pointers are sorted at compile time, add a test that makes sure they are sorted at run time. This test is only run if it is configured in. Link: https://lkml.kernel.org/r/20211206151858.4d21a24d@gandalf.local.home Cc: Yinan Liu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 14 ++++++++++++++ kernel/trace/ftrace.c | 23 +++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 420ff4bc67fd..f468767bc287 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -915,6 +915,20 @@ config EVENT_TRACE_TEST_SYSCALLS TBD - enable a way to actually call the syscalls as we test their events +config FTRACE_SORT_STARTUP_TEST + bool "Verify compile time sorting of ftrace functions" + depends on DYNAMIC_FTRACE + depends on BUILDTIME_TABLE_SORT + help + Sorting of the mcount_loc sections that is used to find the + where the ftrace knows where to patch functions for tracing + and other callbacks is done at compile time. But if the sort + is not done correctly, it will cause non-deterministic failures. + When this is set, the sorted sections will be verified that they + are in deed sorted and will warn if they are not. + + If unsure, say N + config RING_BUFFER_STARTUP_TEST bool "Ring buffer startup self test" depends on RING_BUFFER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9ca63df6553a..403e485bf091 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6388,6 +6388,27 @@ static int ftrace_cmp_ips(const void *a, const void *b) return 0; } +#ifdef CONFIG_FTRACE_SORT_STARTUP_TEST +static void test_is_sorted(unsigned long *start, unsigned long count) +{ + int i; + + for (i = 1; i < count; i++) { + if (WARN(start[i - 1] > start[i], + "[%d] %pS at %lx is not sorted with %pS at %lx\n", i, + (void *)start[i - 1], start[i - 1], + (void *)start[i], start[i])) + break; + } + if (i == count) + pr_info("ftrace section at %px sorted properly\n", start); +} +#else +static void test_is_sorted(unsigned long *start, unsigned long count) +{ +} +#endif + static int ftrace_process_locs(struct module *mod, unsigned long *start, unsigned long *end) @@ -6414,6 +6435,8 @@ static int ftrace_process_locs(struct module *mod, if (!IS_ENABLED(CONFIG_BUILDTIME_TABLE_SORT) || mod) { sort(start, count, sizeof(*start), ftrace_cmp_ips, NULL); + } else { + test_is_sorted(start, count); } start_pg = ftrace_allocate_pages(count); -- cgit v1.2.3 From 3e2a56e6f639492311e0a8533f0a7aed60816308 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Jan 2022 17:56:56 -0500 Subject: tracing: Have syscall trace events use trace_event_buffer_lock_reserve() Currently, the syscall trace events call trace_buffer_lock_reserve() directly, which means that it misses out on some of the filtering optimizations provided by the helper function trace_event_buffer_lock_reserve(). Have the syscall trace events call that instead, as it was missed when adding the update to use the temp buffer when filtering. Link: https://lkml.kernel.org/r/20220107225839.823118570@goodmis.org Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Tom Zanussi Reviewed-by: Masami Hiramatsu Fixes: 0fc1b09ff1ff4 ("tracing: Use temp buffer when filtering events") Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8bfcd3b09422..f755bde42fd0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -323,8 +323,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) trace_ctx = tracing_gen_ctx(); - buffer = tr->array_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, + event = trace_event_buffer_lock_reserve(&buffer, trace_file, sys_data->enter_event->event.type, size, trace_ctx); if (!event) return; @@ -367,8 +366,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) trace_ctx = tracing_gen_ctx(); - buffer = tr->array_buffer.buffer; - event = trace_buffer_lock_reserve(buffer, + event = trace_event_buffer_lock_reserve(&buffer, trace_file, sys_data->exit_event->event.type, sizeof(*entry), trace_ctx); if (!event) -- cgit v1.2.3 From 77360f9bbc7e5e2ab7a2c8b4c0244fbbfcfc6f62 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 10 Jan 2022 11:55:32 -0500 Subject: tracing: Add test for user space strings when filtering on string pointers Pingfan reported that the following causes a fault: echo "filename ~ \"cpu\"" > events/syscalls/sys_enter_openat/filter echo 1 > events/syscalls/sys_enter_at/enable The reason is that trace event filter treats the user space pointer defined by "filename" as a normal pointer to compare against the "cpu" string. The following bug happened: kvm-03-guest16 login: [72198.026181] BUG: unable to handle page fault for address: 00007fffaae8ef60 #PF: supervisor read access in kernel mode #PF: error_code(0x0001) - permissions violation PGD 80000001008b7067 P4D 80000001008b7067 PUD 2393f1067 PMD 2393ec067 PTE 8000000108f47867 Oops: 0001 [#1] PREEMPT SMP PTI CPU: 1 PID: 1 Comm: systemd Kdump: loaded Not tainted 5.14.0-32.el9.x86_64 #1 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:strlen+0x0/0x20 Code: 48 89 f9 74 09 48 83 c1 01 80 39 00 75 f7 31 d2 44 0f b6 04 16 44 88 04 11 48 83 c2 01 45 84 c0 75 ee c3 0f 1f 80 00 00 00 00 <80> 3f 00 74 10 48 89 f8 48 83 c0 01 80 38 00 75 f7 48 29 f8 c3 31 RSP: 0018:ffffb5b900013e48 EFLAGS: 00010246 RAX: 0000000000000018 RBX: ffff8fc1c49ede00 RCX: 0000000000000000 RDX: 0000000000000020 RSI: ffff8fc1c02d601c RDI: 00007fffaae8ef60 RBP: 00007fffaae8ef60 R08: 0005034f4ddb8ea4 R09: 0000000000000000 R10: ffff8fc1c02d601c R11: 0000000000000000 R12: ffff8fc1c8a6e380 R13: 0000000000000000 R14: ffff8fc1c02d6010 R15: ffff8fc1c00453c0 FS: 00007fa86123db40(0000) GS:ffff8fc2ffd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fffaae8ef60 CR3: 0000000102880001 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: filter_pred_pchar+0x18/0x40 filter_match_preds+0x31/0x70 ftrace_syscall_enter+0x27a/0x2c0 syscall_trace_enter.constprop.0+0x1aa/0x1d0 do_syscall_64+0x16/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fa861d88664 The above happened because the kernel tried to access user space directly and triggered a "supervisor read access in kernel mode" fault. Worse yet, the memory could not even be loaded yet, and a SEGFAULT could happen as well. This could be true for kernel space accessing as well. To be even more robust, test both kernel and user space strings. If the string fails to read, then simply have the filter fail. Note, TASK_SIZE is used to determine if the pointer is user or kernel space and the appropriate strncpy_from_kernel/user_nofault() function is used to copy the memory. For some architectures, the compare to TASK_SIZE may always pick user space or kernel space. If it gets it wrong, the only thing is that the filter will fail to match. In the future, this needs to be fixed to have the event denote which should be used. But failing a filter is much better than panicing the machine, and that can be solved later. Link: https://lore.kernel.org/all/20220107044951.22080-1-kernelfans@gmail.com/ Link: https://lkml.kernel.org/r/20220110115532.536088fd@gandalf.local.home Cc: stable@vger.kernel.org Cc: Ingo Molnar Cc: Andrew Morton Cc: Masami Hiramatsu Cc: Tom Zanussi Reported-by: Pingfan Liu Tested-by: Pingfan Liu Fixes: 87a342f5db69d ("tracing/filters: Support filtering for char * strings") Signed-off-by: Steven Rostedt --- Documentation/trace/events.rst | 10 ++++++ kernel/trace/trace_events_filter.c | 66 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 73 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst index 8ddb9b09451c..45e66a60a816 100644 --- a/Documentation/trace/events.rst +++ b/Documentation/trace/events.rst @@ -230,6 +230,16 @@ Currently the caret ('^') for an error always appears at the beginning of the filter string; the error message should still be useful though even without more accurate position info. +5.2.1 Filter limitations +------------------------ + +If a filter is placed on a string pointer ``(char *)`` that does not point +to a string on the ring buffer, but instead points to kernel or user space +memory, then, for safety reasons, at most 1024 bytes of the content is +copied onto a temporary buffer to do the compare. If the copy of the memory +faults (the pointer points to memory that should not be accessed), then the +string compare will be treated as not matching. + 5.3 Clearing filters -------------------- diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 996920ed1812..2e9ef64e9ee9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -5,6 +5,7 @@ * Copyright (C) 2009 Tom Zanussi */ +#include #include #include #include @@ -654,6 +655,47 @@ DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); +/* user space strings temp buffer */ +#define USTRING_BUF_SIZE 1024 + +struct ustring_buffer { + char buffer[USTRING_BUF_SIZE]; +}; + +static __percpu struct ustring_buffer *ustring_per_cpu; + +static __always_inline char *test_string(char *str) +{ + struct ustring_buffer *ubuf; + char __user *ustr; + char *kstr; + + if (!ustring_per_cpu) + return NULL; + + ubuf = this_cpu_ptr(ustring_per_cpu); + kstr = ubuf->buffer; + + /* + * We use TASK_SIZE to denote user or kernel space, but this will + * not work for all architectures. If it picks the wrong one, it may + * just fail the filter (but will not bug). + * + * TODO: Have a way to properly denote which one this is for. + */ + if (likely((unsigned long)str >= TASK_SIZE)) { + /* For safety, do not trust the string pointer */ + if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + return NULL; + } else { + /* user space address? */ + ustr = (char __user *)str; + if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + return NULL; + } + return kstr; +} + /* Filter predicate for fixed sized arrays of characters */ static int filter_pred_string(struct filter_pred *pred, void *event) { @@ -671,10 +713,16 @@ static int filter_pred_string(struct filter_pred *pred, void *event) static int filter_pred_pchar(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); + char *str; int cmp, match; - int len = strlen(*addr) + 1; /* including tailing '\0' */ + int len; - cmp = pred->regex.match(*addr, &pred->regex, len); + str = test_string(*addr); + if (!str) + return 0; + + len = strlen(str) + 1; /* including tailing '\0' */ + cmp = pred->regex.match(str, &pred->regex, len); match = cmp ^ pred->not; @@ -1348,8 +1396,17 @@ static int parse_pred(const char *str, void *data, pred->fn = filter_pred_strloc; } else if (field->filter_type == FILTER_RDYN_STRING) pred->fn = filter_pred_strrelloc; - else + else { + + if (!ustring_per_cpu) { + /* Once allocated, keep it around for good */ + ustring_per_cpu = alloc_percpu(struct ustring_buffer); + if (!ustring_per_cpu) + goto err_mem; + } + pred->fn = filter_pred_pchar; + } /* go past the last quote */ i++; @@ -1415,6 +1472,9 @@ static int parse_pred(const char *str, void *data, err_free: kfree(pred); return -EINVAL; +err_mem: + kfree(pred); + return -ENOMEM; } enum { -- cgit v1.2.3 From dfea08a2116fe327f79d8f4d4b2cf6e0c88be11f Mon Sep 17 00:00:00 2001 From: Xiangyang Zhang Date: Fri, 7 Jan 2022 23:02:42 +0800 Subject: tracing/kprobes: 'nmissed' not showed correctly for kretprobe The 'nmissed' column of the 'kprobe_profile' file for kretprobe is not showed correctly, kretprobe can be skipped by two reasons, shortage of kretprobe_instance which is counted by tk->rp.nmissed, and kprobe itself is missed by some reason, so to show the sum. Link: https://lkml.kernel.org/r/20220107150242.5019-1-xyz.sun.ok@gmail.com Cc: stable@vger.kernel.org Fixes: 4a846b443b4e ("tracing/kprobes: Cleanup kprobe tracer code") Acked-by: Masami Hiramatsu Signed-off-by: Xiangyang Zhang Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f8c26ee72de3..3d85323278ed 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1170,15 +1170,18 @@ static int probes_profile_seq_show(struct seq_file *m, void *v) { struct dyn_event *ev = v; struct trace_kprobe *tk; + unsigned long nmissed; if (!is_trace_kprobe(ev)) return 0; tk = to_trace_kprobe(ev); + nmissed = trace_kprobe_is_return(tk) ? + tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed; seq_printf(m, " %-44s %15lu %15lu\n", trace_probe_name(&tk->tp), trace_kprobe_nhit(tk), - tk->rp.kp.nmissed); + nmissed); return 0; } -- cgit v1.2.3 From 6e1b4bd1911d814077d77e2ac6529d74ee68c0f6 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Mon, 10 Jan 2022 00:22:32 +0800 Subject: tracing: Remove duplicate warnings when calling trace_create_file() Since the same warning message is already printed in the trace_create_file() function, there is no need to print it again. Link: https://lkml.kernel.org/r/20220109162232.361747-1-ytcoode@gmail.com Signed-off-by: Yuntao Wang Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 92be9cb1d7d4..3147614c1812 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3461,10 +3461,8 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events, tr, &ftrace_tr_enable_fops); - if (!entry) { - pr_warn("Could not create tracefs 'enable' entry\n"); + if (!entry) return -ENOMEM; - } /* There are not as crucial, just warn if they are not created */ @@ -3480,17 +3478,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n"); /* ring buffer internal formats */ - entry = trace_create_file("header_page", TRACE_MODE_READ, d_events, + trace_create_file("header_page", TRACE_MODE_READ, d_events, ring_buffer_print_page_header, &ftrace_show_header_fops); - if (!entry) - pr_warn("Could not create tracefs 'header_page' entry\n"); - entry = trace_create_file("header_event", TRACE_MODE_READ, d_events, + trace_create_file("header_event", TRACE_MODE_READ, d_events, ring_buffer_print_entry_header, &ftrace_show_header_fops); - if (!entry) - pr_warn("Could not create tracefs 'header_event' entry\n"); tr->event_dir = d_events; -- cgit v1.2.3 From 0878355b51f5f26632e652c848a8e174bb02d22d Mon Sep 17 00:00:00 2001 From: Nikita Yushchenko Date: Sun, 9 Jan 2022 18:34:59 +0300 Subject: tracing/osnoise: Properly unhook events if start_per_cpu_kthreads() fails If start_per_cpu_kthreads() called from osnoise_workload_start() returns error, event hooks are left in broken state: unhook_irq_events() called but unhook_thread_events() and unhook_softirq_events() not called, and trace_osnoise_callback_enabled flag not cleared. On the next tracer enable, hooks get not installed due to trace_osnoise_callback_enabled flag. And on the further tracer disable an attempt to remove non-installed hooks happened, hitting a WARN_ON_ONCE() in tracepoint_remove_func(). Fix the error path by adding the missing part of cleanup. While at this, introduce osnoise_unhook_events() to avoid code duplication between this error path and normal tracer disable. Link: https://lkml.kernel.org/r/20220109153459.3701773-1-nikita.yushchenko@virtuozzo.com Cc: stable@vger.kernel.org Fixes: bce29ac9ce0b ("trace: Add osnoise tracer") Acked-by: Daniel Bristot de Oliveira Signed-off-by: Nikita Yushchenko Signed-off-by: Steven Rostedt --- kernel/trace/trace_osnoise.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 4719a848bf17..36d9d5be08b4 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -2122,6 +2122,13 @@ out_unhook_irq: return -EINVAL; } +static void osnoise_unhook_events(void) +{ + unhook_thread_events(); + unhook_softirq_events(); + unhook_irq_events(); +} + /* * osnoise_workload_start - start the workload and hook to events */ @@ -2154,7 +2161,14 @@ static int osnoise_workload_start(void) retval = start_per_cpu_kthreads(); if (retval) { - unhook_irq_events(); + trace_osnoise_callback_enabled = false; + /* + * Make sure that ftrace_nmi_enter/exit() see + * trace_osnoise_callback_enabled as false before continuing. + */ + barrier(); + + osnoise_unhook_events(); return retval; } @@ -2185,9 +2199,7 @@ static void osnoise_workload_stop(void) stop_per_cpu_kthreads(); - unhook_irq_events(); - unhook_softirq_events(); - unhook_thread_events(); + osnoise_unhook_events(); } static void osnoise_tracer_start(struct trace_array *tr) -- cgit v1.2.3 From f37c3bbc635994eda203a6da4ba0f9d05165a8d6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Jan 2022 20:08:40 -0500 Subject: tracing: Add ustring operation to filtering string pointers Since referencing user space pointers is special, if the user wants to filter on a field that is a pointer to user space, then they need to specify it. Add a ".ustring" attribute to the field name for filters to state that the field is pointing to user space such that the kernel can take the appropriate action to read that pointer. Link: https://lore.kernel.org/all/yt9d8rvmt2jq.fsf@linux.ibm.com/ Fixes: 77360f9bbc7e ("tracing: Add test for user space strings when filtering on string pointers") Tested-by: Sven Schnelle Signed-off-by: Steven Rostedt --- Documentation/trace/events.rst | 9 +++++ kernel/trace/trace_events_filter.c | 81 +++++++++++++++++++++++++++----------- 2 files changed, 66 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst index 45e66a60a816..c47f381d0c00 100644 --- a/Documentation/trace/events.rst +++ b/Documentation/trace/events.rst @@ -198,6 +198,15 @@ The glob (~) accepts a wild card character (\*,?) and character classes prev_comm ~ "*sh*" prev_comm ~ "ba*sh" +If the field is a pointer that points into user space (for example +"filename" from sys_enter_openat), then you have to append ".ustring" to the +field name:: + + filename.ustring ~ "password" + +As the kernel will have to know how to retrieve the memory that the pointer +is at from user space. + 5.2 Setting filters ------------------- diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2e9ef64e9ee9..b458a9afa2c0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -665,6 +665,23 @@ struct ustring_buffer { static __percpu struct ustring_buffer *ustring_per_cpu; static __always_inline char *test_string(char *str) +{ + struct ustring_buffer *ubuf; + char *kstr; + + if (!ustring_per_cpu) + return NULL; + + ubuf = this_cpu_ptr(ustring_per_cpu); + kstr = ubuf->buffer; + + /* For safety, do not trust the string pointer */ + if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + return NULL; + return kstr; +} + +static __always_inline char *test_ustring(char *str) { struct ustring_buffer *ubuf; char __user *ustr; @@ -676,23 +693,11 @@ static __always_inline char *test_string(char *str) ubuf = this_cpu_ptr(ustring_per_cpu); kstr = ubuf->buffer; - /* - * We use TASK_SIZE to denote user or kernel space, but this will - * not work for all architectures. If it picks the wrong one, it may - * just fail the filter (but will not bug). - * - * TODO: Have a way to properly denote which one this is for. - */ - if (likely((unsigned long)str >= TASK_SIZE)) { - /* For safety, do not trust the string pointer */ - if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) - return NULL; - } else { - /* user space address? */ - ustr = (char __user *)str; - if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) - return NULL; - } + /* user space address? */ + ustr = (char __user *)str; + if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + return NULL; + return kstr; } @@ -709,24 +714,42 @@ static int filter_pred_string(struct filter_pred *pred, void *event) return match; } +static __always_inline int filter_pchar(struct filter_pred *pred, char *str) +{ + int cmp, match; + int len; + + len = strlen(str) + 1; /* including tailing '\0' */ + cmp = pred->regex.match(str, &pred->regex, len); + + match = cmp ^ pred->not; + + return match; +} /* Filter predicate for char * pointers */ static int filter_pred_pchar(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); char *str; - int cmp, match; - int len; str = test_string(*addr); if (!str) return 0; - len = strlen(str) + 1; /* including tailing '\0' */ - cmp = pred->regex.match(str, &pred->regex, len); + return filter_pchar(pred, str); +} - match = cmp ^ pred->not; +/* Filter predicate for char * pointers in user space*/ +static int filter_pred_pchar_user(struct filter_pred *pred, void *event) +{ + char **addr = (char **)(event + pred->offset); + char *str; - return match; + str = test_ustring(*addr); + if (!str) + return 0; + + return filter_pchar(pred, str); } /* @@ -1232,6 +1255,7 @@ static int parse_pred(const char *str, void *data, struct filter_pred *pred = NULL; char num_buf[24]; /* Big enough to hold an address */ char *field_name; + bool ustring = false; char q; u64 val; int len; @@ -1266,6 +1290,12 @@ static int parse_pred(const char *str, void *data, return -EINVAL; } + /* See if the field is a user space string */ + if ((len = str_has_prefix(str + i, ".ustring"))) { + ustring = true; + i += len; + } + while (isspace(str[i])) i++; @@ -1405,7 +1435,10 @@ static int parse_pred(const char *str, void *data, goto err_mem; } - pred->fn = filter_pred_pchar; + if (ustring) + pred->fn = filter_pred_pchar_user; + else + pred->fn = filter_pred_pchar; } /* go past the last quote */ i++; -- cgit v1.2.3 From a97ac8cb24a3c3ad74794adb83717ef1605d1b47 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 13 Jan 2022 16:51:52 -0800 Subject: module: fix signature check failures when using in-kernel decompression The new flag MODULE_INIT_COMPRESSED_FILE unintentionally trips check in module_sig_check(). The check was supposed to catch case when version info or magic was removed from a signed module, making signature invalid, but it was coded too broadly and was catching this new flag as well. Change the check to only test the 2 particular flags affecting signature validity. Fixes: b1ae6dc41eaa ("module: add in-kernel support for decompressing") Signed-off-by: Dmitry Torokhov Reviewed-by: Douglas Anderson Signed-off-by: Luis Chamberlain --- kernel/module.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 34fe2824eb56..387ee77bdbd6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2883,12 +2883,13 @@ static int module_sig_check(struct load_info *info, int flags) const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; const char *reason; const void *mod = info->hdr; - + bool mangled_module = flags & (MODULE_INIT_IGNORE_MODVERSIONS | + MODULE_INIT_IGNORE_VERMAGIC); /* - * Require flags == 0, as a module with version information - * removed is no longer the module that was signed + * Do not allow mangled modules as a module with version information + * removed is no longer the module that was signed. */ - if (flags == 0 && + if (!mangled_module && info->len > markerlen && memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) { /* We truncate the module to discard the signature */ -- cgit v1.2.3 From 800977f6f32e452cba6b04ef21d2f5383ca29209 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:02:52 -0800 Subject: kthread: add the helper function kthread_run_on_cpu() Add a new helper function kthread_run_on_cpu(), which includes kthread_create_on_cpu/wake_up_process(). In some cases, use kthread_run_on_cpu() directly instead of kthread_create_on_node/kthread_bind/wake_up_process() or kthread_create_on_cpu/wake_up_process() or kthreadd_create/kthread_bind/wake_up_process() to simplify the code. [akpm@linux-foundation.org: export kthread_create_on_cpu to modules] Link: https://lkml.kernel.org/r/20211022025711.3673-2-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Cai Huoqing Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kthread.h | 25 +++++++++++++++++++++++++ kernel/kthread.c | 1 + 2 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 346b0f269161..db47aae7c481 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -56,6 +56,31 @@ bool kthread_is_per_cpu(struct task_struct *k); __k; \ }) +/** + * kthread_run_on_cpu - create and wake a cpu bound thread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @cpu: The cpu on which the thread should be bound, + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. + * + * Description: Convenient wrapper for kthread_create_on_cpu() + * followed by wake_up_process(). Returns the kthread or + * ERR_PTR(-ENOMEM). + */ +static inline struct task_struct * +kthread_run_on_cpu(int (*threadfn)(void *data), void *data, + unsigned int cpu, const char *namefmt) +{ + struct task_struct *p; + + p = kthread_create_on_cpu(threadfn, data, cpu, namefmt); + if (!IS_ERR(p)) + wake_up_process(p); + + return p; +} + void free_kthread_struct(struct task_struct *k); void kthread_bind(struct task_struct *k, unsigned int cpu); void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask); diff --git a/kernel/kthread.c b/kernel/kthread.c index 7113003fab63..4ed9e7bce9e8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -523,6 +523,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), to_kthread(p)->cpu = cpu; return p; } +EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { -- cgit v1.2.3 From 64ed3a049e3e81b801e7c5bb052416152443f585 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:02:59 -0800 Subject: ring-buffer: make use of the helper function kthread_run_on_cpu() Replace kthread_create/kthread_bind/wake_up_process() with kthread_run_on_cpu() to simplify the code. Link: https://lkml.kernel.org/r/20211022025711.3673-4-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/ring_buffer.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2699e9e562b1..05dfc7a12d3d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -5898,16 +5898,13 @@ static __init int test_ringbuffer(void) rb_data[cpu].buffer = buffer; rb_data[cpu].cpu = cpu; rb_data[cpu].cnt = cpu; - rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], - "rbtester/%d", cpu); + rb_threads[cpu] = kthread_run_on_cpu(rb_test, &rb_data[cpu], + cpu, "rbtester/%u"); if (WARN_ON(IS_ERR(rb_threads[cpu]))) { pr_cont("FAILED\n"); ret = PTR_ERR(rb_threads[cpu]); goto out_free; } - - kthread_bind(rb_threads[cpu], cpu); - wake_up_process(rb_threads[cpu]); } /* Now create the rb hammer! */ -- cgit v1.2.3 From 3b9cb4ba4b54ecc6cf7d04ea9085d2ad2be48733 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:03:02 -0800 Subject: rcutorture: make use of the helper function kthread_run_on_cpu() Replace kthread_create_on_node/kthread_bind/wake_up_process() with kthread_run_on_cpu() to simplify the code. Link: https://lkml.kernel.org/r/20211022025711.3673-5-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcu/rcutorture.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8b410d982990..42bc66a2f170 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1992,9 +1992,8 @@ static int rcutorture_booster_init(unsigned int cpu) mutex_lock(&boost_mutex); rcu_torture_disable_rt_throttle(); VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, - cpu_to_node(cpu), - "rcu_torture_boost"); + boost_tasks[cpu] = kthread_run_on_cpu(rcu_torture_boost, NULL, + cpu, "rcu_torture_boost_%u"); if (IS_ERR(boost_tasks[cpu])) { retval = PTR_ERR(boost_tasks[cpu]); VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed"); @@ -2003,8 +2002,6 @@ static int rcutorture_booster_init(unsigned int cpu) mutex_unlock(&boost_mutex); return retval; } - kthread_bind(boost_tasks[cpu], cpu); - wake_up_process(boost_tasks[cpu]); mutex_unlock(&boost_mutex); return 0; } -- cgit v1.2.3 From 11e4e3523da98c065a6c249013ace0d388e41c25 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:03:06 -0800 Subject: trace/osnoise: make use of the helper function kthread_run_on_cpu() Replace kthread_create_on_cpu/wake_up_process() with kthread_run_on_cpu() to simplify the code. Link: https://lkml.kernel.org/r/20211022025711.3673-6-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/trace_osnoise.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 7520d43aed55..89d6cbac6f10 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1701,7 +1701,7 @@ static int start_kthread(unsigned int cpu) snprintf(comm, 24, "osnoise/%d", cpu); } - kthread = kthread_create_on_cpu(main, NULL, cpu, comm); + kthread = kthread_run_on_cpu(main, NULL, cpu, comm); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); @@ -1710,7 +1710,6 @@ static int start_kthread(unsigned int cpu) } per_cpu(per_cpu_osnoise_var, cpu).kthread = kthread; - wake_up_process(kthread); return 0; } -- cgit v1.2.3 From ff78f6679d2e223e073fcbdc8f70b6bc0abadf99 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 14 Jan 2022 14:03:10 -0800 Subject: trace/hwlat: make use of the helper function kthread_run_on_cpu() Replace kthread_create_on_cpu/wake_up_process() with kthread_run_on_cpu() to simplify the code. Link: https://lkml.kernel.org/r/20211022025711.3673-7-caihuoqing@baidu.com Signed-off-by: Cai Huoqing Cc: Bernard Metzler Cc: Daniel Bristot de Oliveira Cc: Davidlohr Bueso Cc: Doug Ledford Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Joel Fernandes (Google) Cc: Josh Triplett Cc: Lai Jiangshan Cc: Mathieu Desnoyers Cc: "Paul E . McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/trace_hwlat.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 56bb7b890578..d440ddd5fd8b 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -491,18 +491,14 @@ static void stop_per_cpu_kthreads(void) static int start_cpu_kthread(unsigned int cpu) { struct task_struct *kthread; - char comm[24]; - snprintf(comm, 24, "hwlatd/%d", cpu); - - kthread = kthread_create_on_cpu(kthread_fn, NULL, cpu, comm); + kthread = kthread_run_on_cpu(kthread_fn, NULL, cpu, "hwlatd/%u"); if (IS_ERR(kthread)) { pr_err(BANNER "could not start sampling thread\n"); return -ENOMEM; } per_cpu(hwlat_per_cpu_data, cpu).kthread = kthread; - wake_up_process(kthread); return 0; } -- cgit v1.2.3 From 9a10064f5625d5572c3626c1516e0bebc6c9fe9b Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Fri, 14 Jan 2022 14:05:59 -0800 Subject: mm: add a field to store names for private anonymous memory In many userspace applications, and especially in VM based applications like Android uses heavily, there are multiple different allocators in use. At a minimum there is libc malloc and the stack, and in many cases there are libc malloc, the stack, direct syscalls to mmap anonymous memory, and multiple VM heaps (one for small objects, one for big objects, etc.). Each of these layers usually has its own tools to inspect its usage; malloc by compiling a debug version, the VM through heap inspection tools, and for direct syscalls there is usually no way to track them. On Android we heavily use a set of tools that use an extended version of the logic covered in Documentation/vm/pagemap.txt to walk all pages mapped in userspace and slice their usage by process, shared (COW) vs. unique mappings, backing, etc. This can account for real physical memory usage even in cases like fork without exec (which Android uses heavily to share as many private COW pages as possible between processes), Kernel SamePage Merging, and clean zero pages. It produces a measurement of the pages that only exist in that process (USS, for unique), and a measurement of the physical memory usage of that process with the cost of shared pages being evenly split between processes that share them (PSS). If all anonymous memory is indistinguishable then figuring out the real physical memory usage (PSS) of each heap requires either a pagemap walking tool that can understand the heap debugging of every layer, or for every layer's heap debugging tools to implement the pagemap walking logic, in which case it is hard to get a consistent view of memory across the whole system. Tracking the information in userspace leads to all sorts of problems. It either needs to be stored inside the process, which means every process has to have an API to export its current heap information upon request, or it has to be stored externally in a filesystem that somebody needs to clean up on crashes. It needs to be readable while the process is still running, so it has to have some sort of synchronization with every layer of userspace. Efficiently tracking the ranges requires reimplementing something like the kernel vma trees, and linking to it from every layer of userspace. It requires more memory, more syscalls, more runtime cost, and more complexity to separately track regions that the kernel is already tracking. This patch adds a field to /proc/pid/maps and /proc/pid/smaps to show a userspace-provided name for anonymous vmas. The names of named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps as [anon:]. Userspace can set the name for a region of memory by calling prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name) Setting the name to NULL clears it. The name length limit is 80 bytes including NUL-terminator and is checked to contain only printable ascii characters (including space), except '[',']','\','$' and '`'. Ascii strings are being used to have a descriptive identifiers for vmas, which can be understood by the users reading /proc/pid/maps or /proc/pid/smaps. Names can be standardized for a given system and they can include some variable parts such as the name of the allocator or a library, tid of the thread using it, etc. The name is stored in a pointer in the shared union in vm_area_struct that points to a null terminated string. Anonymous vmas with the same name (equivalent strings) and are otherwise mergeable will be merged. The name pointers are not shared between vmas even if they contain the same name. The name pointer is stored in a union with fields that are only used on file-backed mappings, so it does not increase memory usage. CONFIG_ANON_VMA_NAME kernel configuration is introduced to enable this feature. It keeps the feature disabled by default to prevent any additional memory overhead and to avoid confusing procfs parsers on systems which are not ready to support named anonymous vmas. The patch is based on the original patch developed by Colin Cross, more specifically on its latest version [1] posted upstream by Sumit Semwal. It used a userspace pointer to store vma names. In that design, name pointers could be shared between vmas. However during the last upstreaming attempt, Kees Cook raised concerns [2] about this approach and suggested to copy the name into kernel memory space, perform validity checks [3] and store as a string referenced from vm_area_struct. One big concern is about fork() performance which would need to strdup anonymous vma names. Dave Hansen suggested experimenting with worst-case scenario of forking a process with 64k vmas having longest possible names [4]. I ran this experiment on an ARM64 Android device and recorded a worst-case regression of almost 40% when forking such a process. This regression is addressed in the followup patch which replaces the pointer to a name with a refcounted structure that allows sharing the name pointer between vmas of the same name. Instead of duplicating the string during fork() or when splitting a vma it increments the refcount. [1] https://lore.kernel.org/linux-mm/20200901161459.11772-4-sumit.semwal@linaro.org/ [2] https://lore.kernel.org/linux-mm/202009031031.D32EF57ED@keescook/ [3] https://lore.kernel.org/linux-mm/202009031022.3834F692@keescook/ [4] https://lore.kernel.org/linux-mm/5d0358ab-8c47-2f5f-8e43-23b89d6a8e95@intel.com/ Changes for prctl(2) manual page (in the options section): PR_SET_VMA Sets an attribute specified in arg2 for virtual memory areas starting from the address specified in arg3 and spanning the size specified in arg4. arg5 specifies the value of the attribute to be set. Note that assigning an attribute to a virtual memory area might prevent it from being merged with adjacent virtual memory areas due to the difference in that attribute's value. Currently, arg2 must be one of: PR_SET_VMA_ANON_NAME Set a name for anonymous virtual memory areas. arg5 should be a pointer to a null-terminated string containing the name. The name length including null byte cannot exceed 80 bytes. If arg5 is NULL, the name of the appropriate anonymous virtual memory areas will be reset. The name can contain only printable ascii characters (including space), except '[',']','\','$' and '`'. This feature is available only if the kernel is built with the CONFIG_ANON_VMA_NAME option enabled. [surenb@google.com: docs: proc.rst: /proc/PID/maps: fix malformed table] Link: https://lkml.kernel.org/r/20211123185928.2513763-1-surenb@google.com [surenb: rebased over v5.15-rc6, replaced userpointer with a kernel copy, added input sanitization and CONFIG_ANON_VMA_NAME config. The bulk of the work here was done by Colin Cross, therefore, with his permission, keeping him as the author] Link: https://lkml.kernel.org/r/20211019215511.3771969-2-surenb@google.com Signed-off-by: Colin Cross Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Cc: Stephen Rothwell Cc: Al Viro Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Rientjes Cc: "Eric W. Biederman" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Glauber Cc: Johannes Weiner Cc: John Stultz Cc: Mel Gorman Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Rob Landley Cc: "Serge E. Hallyn" Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.rst | 6 +- fs/proc/task_mmu.c | 12 +++- fs/userfaultfd.c | 7 +- include/linux/mm.h | 13 +++- include/linux/mm_types.h | 64 ++++++++++++++++-- include/uapi/linux/prctl.h | 3 + kernel/fork.c | 2 + kernel/sys.c | 63 ++++++++++++++++++ mm/Kconfig | 14 ++++ mm/madvise.c | 129 +++++++++++++++++++++++++++++++++++-- mm/mempolicy.c | 3 +- mm/mlock.c | 2 +- mm/mmap.c | 38 ++++++----- mm/mprotect.c | 2 +- 14 files changed, 324 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 8d7f141c6fc7..061744c436d9 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -426,12 +426,14 @@ with the memory region, as the case would be with BSS (uninitialized data). The "pathname" shows the name associated file for this mapping. If the mapping is not associated with a file: - ======= ==================================== + ============= ==================================== [heap] the heap of the program [stack] the stack of the main process [vdso] the "virtual dynamic shared object", the kernel system call handler - ======= ==================================== + [anon:] an anonymous mapping that has been + named by userspace + ============= ==================================== or if empty, the mapping is anonymous. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ad667dbc96f5..e6998652fd67 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -308,6 +308,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { + const char *anon_name; + if (!mm) { name = "[vdso]"; goto done; @@ -319,8 +321,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - if (is_stack(vma)) + if (is_stack(vma)) { name = "[stack]"; + goto done; + } + + anon_name = vma_anon_name(vma); + if (anon_name) { + seq_pad(m, ' '); + seq_printf(m, "[anon:%s]", anon_name); + } } done: diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 22bf14ab2d16..5b2af7b82776 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -877,7 +877,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, vma_anon_name(vma)); if (prev) vma = prev; else @@ -1436,7 +1436,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - ((struct vm_userfaultfd_ctx){ ctx })); + ((struct vm_userfaultfd_ctx){ ctx }), + vma_anon_name(vma)); if (prev) { vma = prev; goto next; @@ -1613,7 +1614,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, vma_anon_name(vma)); if (prev) { vma = prev; goto next; diff --git a/include/linux/mm.h b/include/linux/mm.h index a7e4a9e7d807..7000442984b9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2658,7 +2658,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx); + struct mempolicy *, struct vm_userfaultfd_ctx, const char *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@ -3391,5 +3391,16 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) return 0; } +#ifdef CONFIG_ANON_VMA_NAME +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name); +#else +static inline int +madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) { + return 0; +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c3a6e6209600..799e2ee626b2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -426,11 +426,19 @@ struct vm_area_struct { /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. + * + * For private anonymous mappings, a pointer to a null terminated string + * containing the name given to the vma, or NULL if unnamed. */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; + + union { + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; + /* Serialized by mmap_sem. */ + char *anon_name; + }; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@ -875,4 +883,52 @@ typedef struct { unsigned long val; } swp_entry_t; +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling vma_anon_name() and while using + * the returned pointer. + */ +extern const char *vma_anon_name(struct vm_area_struct *vma); + +/* + * mmap_lock should be read-locked for orig_vma->vm_mm. + * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be + * isolated. + */ +extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma); + +/* + * mmap_lock should be write-locked or vma should have been isolated under + * write-locked mmap_lock protection. + */ +extern void free_vma_anon_name(struct vm_area_struct *vma); + +/* mmap_lock should be read-locked */ +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + const char *vma_name = vma_anon_name(vma); + + /* either both NULL, or pointers to same string */ + if (vma_name == name) + return true; + + return name && vma_name && !strcmp(name, vma_name); +} +#else /* CONFIG_ANON_VMA_NAME */ +static inline const char *vma_anon_name(struct vm_area_struct *vma) +{ + return NULL; +} +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_vma_anon_name(struct vm_area_struct *vma) {} +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + return true; +} +#endif /* CONFIG_ANON_VMA_NAME */ + #endif /* _LINUX_MM_TYPES_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index bb73e9a0b24f..e998764f0262 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -272,4 +272,7 @@ struct prctl_mm_map { # define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 # define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2 +#define PR_SET_VMA 0x53564d41 +# define PR_SET_VMA_ANON_NAME 0 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 3244cc56b697..4cf20b5f2da3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -365,12 +365,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; + dup_vma_anon_name(orig, new); } return new; } void vm_area_free(struct vm_area_struct *vma) { + free_vma_anon_name(vma); kmem_cache_free(vm_area_cachep, vma); } diff --git a/kernel/sys.c b/kernel/sys.c index 8fdac0d90504..2450a9f33cb0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2261,6 +2261,66 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) +#ifdef CONFIG_ANON_VMA_NAME + +#define ANON_VMA_NAME_MAX_LEN 80 +#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" + +static inline bool is_valid_name_char(char ch) +{ + /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ + return ch > 0x1f && ch < 0x7f && + !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); +} + +static int prctl_set_vma(unsigned long opt, unsigned long addr, + unsigned long size, unsigned long arg) +{ + struct mm_struct *mm = current->mm; + const char __user *uname; + char *name, *pch; + int error; + + switch (opt) { + case PR_SET_VMA_ANON_NAME: + uname = (const char __user *)arg; + if (uname) { + name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); + + if (IS_ERR(name)) + return PTR_ERR(name); + + for (pch = name; *pch != '\0'; pch++) { + if (!is_valid_name_char(*pch)) { + kfree(name); + return -EINVAL; + } + } + } else { + /* Reset the name */ + name = NULL; + } + + mmap_write_lock(mm); + error = madvise_set_anon_name(mm, addr, size, name); + mmap_write_unlock(mm); + kfree(name); + break; + default: + error = -EINVAL; + } + + return error; +} + +#else /* CONFIG_ANON_VMA_NAME */ +static int prctl_set_vma(unsigned long opt, unsigned long start, + unsigned long size, unsigned long arg) +{ + return -EINVAL; +} +#endif /* CONFIG_ANON_VMA_NAME */ + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2530,6 +2590,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif + case PR_SET_VMA: + error = prctl_set_vma(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; diff --git a/mm/Kconfig b/mm/Kconfig index 356f4f2c779e..53d7485fc38f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -900,6 +900,20 @@ config IO_MAPPING config SECRETMEM def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED +config ANON_VMA_NAME + bool "Anonymous VMA name support" + depends on PROC_FS && ADVISE_SYSCALLS && MMU + + help + Allow naming anonymous virtual memory areas. + + This feature allows assigning names to virtual memory areas. Assigned + names can be later retrieved from /proc/pid/maps and /proc/pid/smaps + and help identifying individual anonymous memory areas. + Assigning a name to anonymous virtual memory area might prevent that + area from being merged with adjacent virtual memory areas due to the + difference in their name. + source "mm/damon/Kconfig" endmenu diff --git a/mm/madvise.c b/mm/madvise.c index 4b9c5509990c..413bbc6e40a0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -62,19 +63,84 @@ static int madvise_need_mmap_write(int behavior) } } +#ifdef CONFIG_ANON_VMA_NAME +static inline bool has_vma_anon_name(struct vm_area_struct *vma) +{ + return !vma->vm_file && vma->anon_name; +} + +const char *vma_anon_name(struct vm_area_struct *vma) +{ + if (!has_vma_anon_name(vma)) + return NULL; + + mmap_assert_locked(vma->vm_mm); + + return vma->anon_name; +} + +void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ + if (!has_vma_anon_name(orig_vma)) + return; + + new_vma->anon_name = kstrdup(orig_vma->anon_name, GFP_KERNEL); +} + +void free_vma_anon_name(struct vm_area_struct *vma) +{ + if (!has_vma_anon_name(vma)) + return; + + kfree(vma->anon_name); + vma->anon_name = NULL; +} + +/* mmap_lock should be write-locked */ +static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +{ + if (!name) { + free_vma_anon_name(vma); + return 0; + } + + if (vma->anon_name) { + /* Same name, nothing to do here */ + if (!strcmp(name, vma->anon_name)) + return 0; + + free_vma_anon_name(vma); + } + vma->anon_name = kstrdup(name, GFP_KERNEL); + if (!vma->anon_name) + return -ENOMEM; + + return 0; +} +#else /* CONFIG_ANON_VMA_NAME */ +static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +{ + if (name) + return -EINVAL; + + return 0; +} +#endif /* CONFIG_ANON_VMA_NAME */ /* * Update the vm_flags on region of a vma, splitting it or merging it as * necessary. Must be called with mmap_sem held for writing; */ static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long new_flags) + unsigned long end, unsigned long new_flags, + const char *name) { struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; - if (new_flags == vma->vm_flags) { + if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) { *prev = vma; return 0; } @@ -82,7 +148,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, name); if (*prev) { vma = *prev; goto success; @@ -111,6 +177,11 @@ success: * vm_flags is protected by the mmap_lock held in write mode. */ vma->vm_flags = new_flags; + if (!vma->vm_file) { + error = replace_vma_anon_name(vma, name); + if (error) + return error; + } return 0; } @@ -938,7 +1009,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, break; } - error = madvise_update_vma(vma, prev, start, end, new_flags); + error = madvise_update_vma(vma, prev, start, end, new_flags, + vma_anon_name(vma)); out: /* @@ -1118,6 +1190,55 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, return unmapped_error; } +#ifdef CONFIG_ANON_VMA_NAME +static int madvise_vma_anon_name(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long name) +{ + int error; + + /* Only anonymous mappings can be named */ + if (vma->vm_file) + return -EBADF; + + error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, + (const char *)name); + + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) +{ + unsigned long end; + unsigned long len; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + return madvise_walk_vmas(mm, start, end, (unsigned long)name, + madvise_vma_anon_name); +} +#endif /* CONFIG_ANON_VMA_NAME */ /* * The madvise(2) system call. * diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f6248affaf38..679f47b3a079 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -810,7 +810,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx); + new_pol, vma->vm_userfaultfd_ctx, + vma_anon_name(vma)); if (prev) { vma = prev; next = vma->vm_next; diff --git a/mm/mlock.c b/mm/mlock.c index e263d62ae2d0..8f584eddd305 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index bfb0ea164a90..85edb0011453 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1029,7 +1029,8 @@ again: */ static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -1047,6 +1048,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) return 0; + if (!is_same_vma_anon_name(vma, anon_name)) + return 0; return 1; } @@ -1079,9 +1082,10 @@ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -1100,9 +1104,10 @@ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1113,9 +1118,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, } /* - * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out - * whether that can be merged with its predecessor or its successor. - * Or both (it neatly fills a hole). + * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), + * figure out whether that can be merged with its predecessor or its + * successor. Or both (it neatly fills a hole). * * In most cases - when called for mmap, brk or mremap - [addr,end) is * certain not to be mapped by the time vma_merge is called; but when @@ -1160,7 +1165,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1190,7 +1196,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { /* * OK, it can. Can we now merge in the successor as well? */ @@ -1199,7 +1205,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx) && + vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1222,7 +1228,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL, next); @@ -1754,7 +1760,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * Can we just expand an old mapping? */ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -1803,7 +1809,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ if (unlikely(vm_flags != vma->vm_flags && prev)) { merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (merge) { /* ->mmap() can change vma->vm_file and fput the original file. So * fput the vma->vm_file here or we would add an extra fput for file @@ -3056,7 +3062,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -3249,7 +3255,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index e552f5e0ccbd..0138dfcdb1d8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (*pprev) { vma = *pprev; VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); -- cgit v1.2.3 From 17fca131cee21724ee953a17c185c14e9533af5b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 Jan 2022 14:06:07 -0800 Subject: mm: move anon_vma declarations to linux/mm_inline.h The patch to add anonymous vma names causes a build failure in some configurations: include/linux/mm_types.h: In function 'is_same_vma_anon_name': include/linux/mm_types.h:924:37: error: implicit declaration of function 'strcmp' [-Werror=implicit-function-declaration] 924 | return name && vma_name && !strcmp(name, vma_name); | ^~~~~~ include/linux/mm_types.h:22:1: note: 'strcmp' is defined in header ''; did you forget to '#include '? This should not really be part of linux/mm_types.h in the first place, as that header is meant to only contain structure defintions and need a minimum set of indirect includes itself. While the header clearly includes more than it should at this point, let's not make it worse by including string.h as well, which would pull in the expensive (compile-speed wise) fortify-string logic. Move the new functions into a separate header that only needs to be included in a couple of locations. Link: https://lkml.kernel.org/r/20211207125710.2503446-1-arnd@kernel.org Fixes: "mm: add a field to store names for private anonymous memory" Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Colin Cross Cc: Eric Biederman Cc: Kees Cook Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Stephen Rothwell Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 1 + fs/userfaultfd.c | 1 + include/linux/mm_inline.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 48 --------------------------------------------- kernel/fork.c | 1 + mm/madvise.c | 1 + mm/mmap.c | 1 + 7 files changed, 55 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e6998652fd67..18f8c3acbb85 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5b2af7b82776..e26b10132d47 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index e2ec68b0515c..47d96d2647ca 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -4,6 +4,7 @@ #include #include +#include /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? @@ -135,4 +136,53 @@ static __always_inline void del_page_from_lru_list(struct page *page, { lruvec_del_folio(lruvec, page_folio(page)); } + +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling vma_anon_name() and while using + * the returned pointer. + */ +extern const char *vma_anon_name(struct vm_area_struct *vma); + +/* + * mmap_lock should be read-locked for orig_vma->vm_mm. + * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be + * isolated. + */ +extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma); + +/* + * mmap_lock should be write-locked or vma should have been isolated under + * write-locked mmap_lock protection. + */ +extern void free_vma_anon_name(struct vm_area_struct *vma); + +/* mmap_lock should be read-locked */ +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + const char *vma_name = vma_anon_name(vma); + + /* either both NULL, or pointers to same string */ + if (vma_name == name) + return true; + + return name && vma_name && !strcmp(name, vma_name); +} +#else /* CONFIG_ANON_VMA_NAME */ +static inline const char *vma_anon_name(struct vm_area_struct *vma) +{ + return NULL; +} +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_vma_anon_name(struct vm_area_struct *vma) {} +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + return true; +} +#endif /* CONFIG_ANON_VMA_NAME */ + #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 449b6eafc695..4d5fb84eed5e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -890,52 +890,4 @@ typedef struct { unsigned long val; } swp_entry_t; -#ifdef CONFIG_ANON_VMA_NAME -/* - * mmap_lock should be read-locked when calling vma_anon_name() and while using - * the returned pointer. - */ -extern const char *vma_anon_name(struct vm_area_struct *vma); - -/* - * mmap_lock should be read-locked for orig_vma->vm_mm. - * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be - * isolated. - */ -extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma); - -/* - * mmap_lock should be write-locked or vma should have been isolated under - * write-locked mmap_lock protection. - */ -extern void free_vma_anon_name(struct vm_area_struct *vma); - -/* mmap_lock should be read-locked */ -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) -{ - const char *vma_name = vma_anon_name(vma); - - /* either both NULL, or pointers to same string */ - if (vma_name == name) - return true; - - return name && vma_name && !strcmp(name, vma_name); -} -#else /* CONFIG_ANON_VMA_NAME */ -static inline const char *vma_anon_name(struct vm_area_struct *vma) -{ - return NULL; -} -static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) {} -static inline void free_vma_anon_name(struct vm_area_struct *vma) {} -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) -{ - return true; -} -#endif /* CONFIG_ANON_VMA_NAME */ - #endif /* _LINUX_MM_TYPES_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 4cf20b5f2da3..75737e566441 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/madvise.c b/mm/madvise.c index c63aacbbfa78..5604064df464 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/mmap.c b/mm/mmap.c index 85edb0011453..77733b113c40 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 39c65a94cd9661532be150e88f8b02f4a6844a35 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jan 2022 14:07:17 -0800 Subject: mm/pagealloc: sysctl: change watermark_scale_factor max limit to 30% For embedded systems with low total memory, having to run applications with relatively large memory requirements, 10% max limitation for watermark_scale_factor poses an issue of triggering direct reclaim every time such application is started. This results in slow application startup times and bad end-user experience. By increasing watermark_scale_factor max limit we allow vendors more flexibility to choose the right level of kswapd aggressiveness for their device and workload requirements. Link: https://lkml.kernel.org/r/20211124193604.2758863-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Dave Hansen Cc: Vlastimil Babka Cc: Mel Gorman Cc: Jonathan Corbet Cc: Zhang Yi Cc: Fengfei Xi Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 2 +- kernel/sysctl.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 5e795202111f..f4804ce37c58 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -948,7 +948,7 @@ how much memory needs to be free before kswapd goes back to sleep. The unit is in fractions of 10,000. The default value of 10 means the distances between watermarks are 0.1% of the available memory in the -node/system. The maximum value is 1000, or 10% of memory. +node/system. The maximum value is 3000, or 30% of memory. A high rate of threads entering direct reclaim (allocstall) or kswapd going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 083be6af29d7..2ab4edb6e450 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -122,6 +122,7 @@ static unsigned long long_max = LONG_MAX; static int one_hundred = 100; static int two_hundred = 200; static int one_thousand = 1000; +static int three_thousand = 3000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -2959,7 +2960,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = watermark_scale_factor_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &one_thousand, + .extra2 = &three_thousand, }, { .procname = "percpu_pagelist_high_fraction", -- cgit v1.2.3 From a674e48c5443d12a8a43c3ac42367aa39505d506 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 14 Jan 2022 14:07:41 -0800 Subject: dma/pool: create dma atomic pool only if dma zone has managed pages Currently three dma atomic pools are initialized as long as the relevant kernel codes are built in. While in kdump kernel of x86_64, this is not right when trying to create atomic_pool_dma, because there's no managed pages in DMA zone. In the case, DMA zone only has low 1M memory presented and locked down by memblock allocator. So no pages are added into buddy of DMA zone. Please check commit f1d4d47c5851 ("x86/setup: Always reserve the first 1M of RAM"). Then in kdump kernel of x86_64, it always prints below failure message: DMA: preallocated 128 KiB GFP_KERNEL pool for atomic allocations swapper/0: page allocation failure: order:5, mode:0xcc1(GFP_KERNEL|GFP_DMA), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.13.0-0.rc5.20210611git929d931f2b40.42.fc35.x86_64 #1 Hardware name: Dell Inc. PowerEdge R910/0P658H, BIOS 2.12.0 06/04/2018 Call Trace: dump_stack+0x7f/0xa1 warn_alloc.cold+0x72/0xd6 __alloc_pages_slowpath.constprop.0+0xf29/0xf50 __alloc_pages+0x24d/0x2c0 alloc_page_interleave+0x13/0xb0 atomic_pool_expand+0x118/0x210 __dma_atomic_pool_init+0x45/0x93 dma_atomic_pool_init+0xdb/0x176 do_one_initcall+0x67/0x320 kernel_init_freeable+0x290/0x2dc kernel_init+0xa/0x111 ret_from_fork+0x22/0x30 Mem-Info: ...... DMA: failed to allocate 128 KiB GFP_KERNEL|GFP_DMA pool for atomic allocation DMA: preallocated 128 KiB GFP_KERNEL|GFP_DMA32 pool for atomic allocations Here, let's check if DMA zone has managed pages, then create atomic_pool_dma if yes. Otherwise just skip it. Link: https://lkml.kernel.org/r/20211223094435.248523-3-bhe@redhat.com Fixes: 6f599d84231f ("x86/kdump: Always reserve the low 1M when the crashkernel option is specified") Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Acked-by: John Donnelly Reviewed-by: David Hildenbrand Cc: Marek Szyprowski Cc: Robin Murphy Cc: Borislav Petkov Cc: Christoph Lameter Cc: David Laight Cc: David Rientjes Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma/pool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 5f84e6cdb78e..4d40dcce7604 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -203,7 +203,7 @@ static int __init dma_atomic_pool_init(void) GFP_KERNEL); if (!atomic_pool_kernel) ret = -ENOMEM; - if (IS_ENABLED(CONFIG_ZONE_DMA)) { + if (has_managed_dma()) { atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL | GFP_DMA); if (!atomic_pool_dma) @@ -226,7 +226,7 @@ static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) if (prev == NULL) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) return atomic_pool_dma32; - if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + if (atomic_pool_dma && (gfp & GFP_DMA)) return atomic_pool_dma; return atomic_pool_kernel; } -- cgit v1.2.3 From 21b084fdf2a49ca1634e8e360e9ab6f9ff0dee11 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 14 Jan 2022 14:08:21 -0800 Subject: mm/mempolicy: wire up syscall set_mempolicy_home_node Link: https://lkml.kernel.org/r/20211202123810.267175-4-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/syscalls.h | 3 +++ include/uapi/asm-generic/unistd.h | 5 ++++- kernel/sys_ni.c | 1 + 21 files changed, 27 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index ca5a32228cd6..3515bc4f16a4 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -489,3 +489,4 @@ # 557 reserved for memfd_secret 558 common process_mrelease sys_process_mrelease 559 common futex_waitv sys_futex_waitv +560 common set_mempolicy_home_node sys_ni_syscall diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 543100151f2b..ac964612d8b0 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -463,3 +463,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 6bdb5f5db438..4e65da3445c7 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 450 +#define __NR_compat_syscalls 451 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 41ea1195e44b..604a2053d006 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -905,6 +905,8 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) __SYSCALL(__NR_process_mrelease, sys_process_mrelease) #define __NR_futex_waitv 449 __SYSCALL(__NR_futex_waitv, sys_futex_waitv) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 707ae121f6d3..78b1d03e86e1 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -370,3 +370,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 45bc32a41b90..b1f3940bc298 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -449,3 +449,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 2204bde3ce4a..820145e47350 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -455,3 +455,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 72d02d363f36..253ff994ed2e 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -388,3 +388,4 @@ # 447 reserved for memfd_secret 448 n32 process_mrelease sys_process_mrelease 449 n32 futex_waitv sys_futex_waitv +450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index e2c481fcede6..3f1886ad9d80 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -364,3 +364,4 @@ # 447 reserved for memfd_secret 448 n64 process_mrelease sys_process_mrelease 449 n64 futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 3714c97b2643..8f243e35a7b2 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -437,3 +437,4 @@ # 447 reserved for memfd_secret 448 o32 process_mrelease sys_process_mrelease 449 o32 futex_waitv sys_futex_waitv +450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 358c00000755..68b46fe2f17c 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -447,3 +447,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 15109af9d075..2600b4237292 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -529,3 +529,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index ed9c5c2eafad..799147658dee 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -452,3 +452,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index d9539d28bdaa..2de85c977f54 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -452,3 +452,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 46adabcb1720..4398cc6fb68d 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -495,3 +495,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 7e25543693de..320480a8db4f 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -454,3 +454,4 @@ 447 i386 memfd_secret sys_memfd_secret 448 i386 process_mrelease sys_process_mrelease 449 i386 futex_waitv sys_futex_waitv +450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index fe8f8dd157b4..c84d12608cd2 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -371,6 +371,7 @@ 447 common memfd_secret sys_memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 3e3e1a506bed..52c94ab5c205 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -420,3 +420,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 528a478dbda8..819c0cb00b6d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1057,6 +1057,9 @@ asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type ru const void __user *rule_attr, __u32 flags); asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags); asmlinkage long sys_memfd_secret(unsigned int flags); +asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, + unsigned long home_node, + unsigned long flags); /* * Architecture-specific system calls diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 4557a8b6086f..1c48b0ae3ba3 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -883,8 +883,11 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) #define __NR_futex_waitv 449 __SYSCALL(__NR_futex_waitv, sys_futex_waitv) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) + #undef __NR_syscalls -#define __NR_syscalls 450 +#define __NR_syscalls 451 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d1944258cfc0..a492f159624f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -297,6 +297,7 @@ COND_SYSCALL(get_mempolicy); COND_SYSCALL(set_mempolicy); COND_SYSCALL(migrate_pages); COND_SYSCALL(move_pages); +COND_SYSCALL(set_mempolicy_home_node); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); -- cgit v1.2.3 From 9b51d9d866482a703646fd4c07e433c3d9d88efd Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 14 Aug 2021 14:17:05 -0700 Subject: cpumask: replace cpumask_next_* with cpumask_first_* where appropriate cpumask_first() is a more effective analogue of 'next' version if n == -1 (which means start == 0). This patch replaces 'next' with 'first' where things look trivial. There's no cpumask_first_zero() function, so create it. Signed-off-by: Yury Norov Tested-by: Wolfram Sang --- block/blk-mq.c | 2 +- drivers/net/virtio_net.c | 2 +- drivers/soc/fsl/qbman/bman_portal.c | 2 +- drivers/soc/fsl/qbman/qman_portal.c | 2 +- include/linux/cpumask.h | 16 ++++++++++++++++ kernel/time/clocksource.c | 4 ++-- 6 files changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/block/blk-mq.c b/block/blk-mq.c index 8874a63ae952..ef56e753ab38 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2967,7 +2967,7 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, struct blk_mq_hw_ctx *hctx) { - if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) + if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu) return false; if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) return false; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b107835242ad..8c70ab3c436a 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2101,7 +2101,7 @@ static void virtnet_set_affinity(struct virtnet_info *vi) stragglers = num_cpu >= vi->curr_queue_pairs ? num_cpu % vi->curr_queue_pairs : 0; - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); for (i = 0; i < vi->curr_queue_pairs; i++) { group_size = stride + (i < stragglers ? 1 : 0); diff --git a/drivers/soc/fsl/qbman/bman_portal.c b/drivers/soc/fsl/qbman/bman_portal.c index acda8a5637c5..4d7b9caee1c4 100644 --- a/drivers/soc/fsl/qbman/bman_portal.c +++ b/drivers/soc/fsl/qbman/bman_portal.c @@ -155,7 +155,7 @@ static int bman_portal_probe(struct platform_device *pdev) } spin_lock(&bman_lock); - cpu = cpumask_next_zero(-1, &portal_cpus); + cpu = cpumask_first_zero(&portal_cpus); if (cpu >= nr_cpu_ids) { __bman_portals_probed = 1; /* unassigned portal, skip init */ diff --git a/drivers/soc/fsl/qbman/qman_portal.c b/drivers/soc/fsl/qbman/qman_portal.c index 96f74a1dc603..e23b60618c1a 100644 --- a/drivers/soc/fsl/qbman/qman_portal.c +++ b/drivers/soc/fsl/qbman/qman_portal.c @@ -248,7 +248,7 @@ static int qman_portal_probe(struct platform_device *pdev) pcfg->pools = qm_get_pools_sdqcr(); spin_lock(&qman_lock); - cpu = cpumask_next_zero(-1, &portal_cpus); + cpu = cpumask_first_zero(&portal_cpus); if (cpu >= nr_cpu_ids) { __qman_portals_probed = 1; /* unassigned portal, skip init */ diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index c4e1b9ea0ba4..64dae70d31f5 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -123,6 +123,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return 0; } +static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) +{ + return 0; +} + static inline unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { @@ -201,6 +206,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp) return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } +/** + * cpumask_first_zero - get the first unset cpu in a cpumask + * @srcp: the cpumask pointer + * + * Returns >= nr_cpu_ids if all cpus are set. + */ +static inline unsigned int cpumask_first_zero(const struct cpumask *srcp) +{ + return find_first_zero_bit(cpumask_bits(srcp), nr_cpumask_bits); +} + /** * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 * @src1p: the first input diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b8a14d2fb5ba..f29d1a524fa5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -257,7 +257,7 @@ static void clocksource_verify_choose_cpus(void) return; /* Make sure to select at least one CPU other than the current CPU. */ - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); if (cpu == smp_processor_id()) cpu = cpumask_next(cpu, cpu_online_mask); if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) @@ -279,7 +279,7 @@ static void clocksource_verify_choose_cpus(void) cpu = prandom_u32() % nr_cpu_ids; cpu = cpumask_next(cpu - 1, cpu_online_mask); if (cpu >= nr_cpu_ids) - cpu = cpumask_next(-1, cpu_online_mask); + cpu = cpumask_first(cpu_online_mask); if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) cpumask_set_cpu(cpu, &cpus_chosen); } -- cgit v1.2.3 From be80a1d3f9dbe5aee79a325964f7037fe2d92f30 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 10 Jan 2022 14:05:49 +0000 Subject: bpf: Generalize check_ctx_reg for reuse with other types Generalize the check_ctx_reg() helper function into a more generic named one so that it can be reused for other register types as well to check whether their offset is non-zero. No functional change. Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- kernel/bpf/btf.c | 2 +- kernel/bpf/verifier.c | 21 +++++++++++---------- 3 files changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 143401d4c9d9..e9993172f892 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -519,8 +519,8 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); -int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno); +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 33bb8ae4a804..e16dafeb2450 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5686,7 +5686,7 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, i, btf_type_str(t)); return -EINVAL; } - if (check_ctx_reg(env, reg, regno)) + if (check_ptr_off_reg(env, reg, regno)) return -EINVAL; } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || reg2btf_ids[reg->type])) { const struct btf_type *reg_ref_t; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a8587210907d..9b8334068e71 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3969,16 +3969,16 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) { - /* Access to ctx or passing it to a helper is only allowed in - * its original, unmodified form. + /* Access to this pointer-typed register or passing it to a helper + * is only allowed in its original, unmodified form. */ if (reg->off) { - verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", - regno, reg->off); + verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); return -EACCES; } @@ -3986,7 +3986,8 @@ int check_ctx_reg(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); return -EACCES; } @@ -4437,7 +4438,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_ctx_reg(env, reg, regno); + err = check_ptr_off_reg(env, reg, regno); if (err < 0) return err; @@ -5305,7 +5306,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return err; if (type == PTR_TO_CTX) { - err = check_ctx_reg(env, reg, regno); + err = check_ptr_off_reg(env, reg, regno); if (err < 0) return err; } @@ -9651,7 +9652,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } - err = check_ctx_reg(env, ®s[ctx_reg], ctx_reg); + err = check_ptr_off_reg(env, ®s[ctx_reg], ctx_reg); if (err < 0) return err; -- cgit v1.2.3 From d400a6cf1c8a57cdf10f35220ead3284320d85ff Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 14 Jan 2022 13:58:36 +0000 Subject: bpf: Mark PTR_TO_FUNC register initially with zero offset Similar as with other pointer types where we use ldimm64, clear the register content to zero first, and then populate the PTR_TO_FUNC type and subprogno number. Currently this is not done, and leads to reuse of stale register tracking data. Given for special ldimm64 cases we always clear the register offset, make it common for all cases, so it won't be forgotten in future. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9b8334068e71..ffec0baaf2b6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9508,9 +9508,13 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - if (insn->src_reg == BPF_PSEUDO_BTF_ID) { - mark_reg_known_zero(env, regs, insn->dst_reg); + /* All special src_reg cases are listed below. From this point onwards + * we either succeed and assign a corresponding dst_reg->type after + * zeroing the offset, or fail and reject the program. + */ + mark_reg_known_zero(env, regs, insn->dst_reg); + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { dst_reg->type = aux->btf_var.reg_type; switch (base_type(dst_reg->type)) { case PTR_TO_MEM: @@ -9548,7 +9552,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } map = env->used_maps[aux->map_index]; - mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || -- cgit v1.2.3 From 6788ab23508bddb0a9d88e104284922cb2c22b77 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 10 Jan 2022 14:40:40 +0000 Subject: bpf: Generally fix helper register offset check Right now the assertion on check_ptr_off_reg() is only enforced for register types PTR_TO_CTX (and open coded also for PTR_TO_BTF_ID), however, this is insufficient since many other PTR_TO_* register types such as PTR_TO_FUNC do not handle/expect register offsets when passed to helper functions. Given this can slip-through easily when adding new types, make this an explicit allow-list and reject all other current and future types by default if this is encountered. Also, extend check_ptr_off_reg() to handle PTR_TO_BTF_ID as well instead of duplicating it. For PTR_TO_BTF_ID, reg->off is used for BTF to match expected BTF ids if struct offset is used. This part still needs to be allowed, but the dynamic off from the tnum must be rejected. Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ffec0baaf2b6..e0b3f4d683eb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3969,14 +3969,15 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +static int __check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + bool fixed_off_ok) { /* Access to this pointer-typed register or passing it to a helper * is only allowed in its original, unmodified form. */ - if (reg->off) { + if (!fixed_off_ok && reg->off) { verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", reg_type_str(env, reg->type), regno, reg->off); return -EACCES; @@ -3994,6 +3995,12 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, return 0; } +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + return __check_ptr_off_reg(env, reg, regno, false); +} + static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, @@ -5245,12 +5252,6 @@ found: kernel_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } - - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", - regno); - return -EACCES; - } } return 0; @@ -5305,10 +5306,26 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (err) return err; - if (type == PTR_TO_CTX) { - err = check_ptr_off_reg(env, reg, regno); + switch ((u32)type) { + case SCALAR_VALUE: + /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_BUF: + case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_STACK: + break; + /* All the rest must be rejected: */ + default: + err = __check_ptr_off_reg(env, reg, regno, + type == PTR_TO_BTF_ID); if (err < 0) return err; + break; } skip_type_check: -- cgit v1.2.3 From 64620e0a1e712a778095bd35cbb277dc2259281f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 11 Jan 2022 14:43:41 +0000 Subject: bpf: Fix out of bounds access for ringbuf helpers Both bpf_ringbuf_submit() and bpf_ringbuf_discard() have ARG_PTR_TO_ALLOC_MEM in their bpf_func_proto definition as their first argument. They both expect the result from a prior bpf_ringbuf_reserve() call which has a return type of RET_PTR_TO_ALLOC_MEM_OR_NULL. Meaning, after a NULL check in the code, the verifier will promote the register type in the non-NULL branch to a PTR_TO_MEM and in the NULL branch to a known zero scalar. Generally, pointer arithmetic on PTR_TO_MEM is allowed, so the latter could have an offset. The ARG_PTR_TO_ALLOC_MEM expects a PTR_TO_MEM register type. However, the non- zero result from bpf_ringbuf_reserve() must be fed into either bpf_ringbuf_submit() or bpf_ringbuf_discard() but with the original offset given it will then read out the struct bpf_ringbuf_hdr mapping. The verifier missed to enforce a zero offset, so that out of bounds access can be triggered which could be used to escalate privileges if unprivileged BPF was enabled (disabled by default in kernel). Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: (SecCoder Security Lab) Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e0b3f4d683eb..c72c57a6684f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5318,9 +5318,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case PTR_TO_STACK: + /* Some of the argument types nevertheless require a + * zero register offset. + */ + if (arg_type == ARG_PTR_TO_ALLOC_MEM) + goto force_off_check; break; /* All the rest must be rejected: */ default: +force_off_check: err = __check_ptr_off_reg(env, reg, regno, type == PTR_TO_BTF_ID); if (err < 0) -- cgit v1.2.3 From a672b2e36a648afb04ad3bda93b6bda947a479a5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 13 Jan 2022 11:11:30 +0000 Subject: bpf: Fix ringbuf memory type confusion when passing to helpers The bpf_ringbuf_submit() and bpf_ringbuf_discard() have ARG_PTR_TO_ALLOC_MEM in their bpf_func_proto definition as their first argument, and thus both expect the result from a prior bpf_ringbuf_reserve() call which has a return type of RET_PTR_TO_ALLOC_MEM_OR_NULL. While the non-NULL memory from bpf_ringbuf_reserve() can be passed to other helpers, the two sinks (bpf_ringbuf_submit(), bpf_ringbuf_discard()) right now only enforce a register type of PTR_TO_MEM. This can lead to potential type confusion since it would allow other PTR_TO_MEM memory to be passed into the two sinks which did not come from bpf_ringbuf_reserve(). Add a new MEM_ALLOC composable type attribute for PTR_TO_MEM, and enforce that: - bpf_ringbuf_reserve() returns NULL or PTR_TO_MEM | MEM_ALLOC - bpf_ringbuf_submit() and bpf_ringbuf_discard() only take PTR_TO_MEM | MEM_ALLOC but not plain PTR_TO_MEM arguments via ARG_PTR_TO_ALLOC_MEM - however, other helpers might treat PTR_TO_MEM | MEM_ALLOC as plain PTR_TO_MEM to populate the memory area when they use ARG_PTR_TO_{UNINIT_,}MEM in their func proto description Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Reported-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Alexei Starovoitov --- include/linux/bpf.h | 9 +++++++-- kernel/bpf/verifier.c | 6 +++++- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6e947cd91152..fa517ae604ad 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -316,7 +316,12 @@ enum bpf_type_flag { */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = MEM_RDONLY, + /* MEM was "allocated" from a different helper, and cannot be mixed + * with regular non-MEM_ALLOC'ed MEM types. + */ + MEM_ALLOC = BIT(2 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_ALLOC, }; /* Max number of base types. */ @@ -400,7 +405,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, - RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c72c57a6684f..a39eedecc93a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -570,6 +570,8 @@ static const char *reg_type_str(struct bpf_verifier_env *env, if (type & MEM_RDONLY) strncpy(prefix, "rdonly_", 16); + if (type & MEM_ALLOC) + strncpy(prefix, "alloc_", 16); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -5135,6 +5137,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, + PTR_TO_MEM | MEM_ALLOC, PTR_TO_BUF, }, }; @@ -5152,7 +5155,7 @@ static const struct bpf_reg_types int_ptr_types = { static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; -static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; +static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -5315,6 +5318,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, case PTR_TO_MAP_VALUE: case PTR_TO_MEM: case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_MEM | MEM_ALLOC: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case PTR_TO_STACK: -- cgit v1.2.3 From d6986ce24fc00b0638bd29efe8fb7ba7619ed2aa Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 19 Jan 2022 18:08:43 -0800 Subject: kthread: dynamically allocate memory to store kthread's full name When I was implementing a new per-cpu kthread cfs_migration, I found the comm of it "cfs_migration/%u" is truncated due to the limitation of TASK_COMM_LEN. For example, the comm of the percpu thread on CPU10~19 all have the same name "cfs_migration/1", which will confuse the user. This issue is not critical, because we can get the corresponding CPU from the task's Cpus_allowed. But for kthreads corresponding to other hardware devices, it is not easy to get the detailed device info from task comm, for example, jbd2/nvme0n1p2- xfs-reclaim/sdf Currently there are so many truncated kthreads: rcu_tasks_kthre rcu_tasks_rude_ rcu_tasks_trace poll_mpt3sas0_s ext4-rsv-conver xfs-reclaim/sd{a, b, c, ...} xfs-blockgc/sd{a, b, c, ...} xfs-inodegc/sd{a, b, c, ...} audit_send_repl ecryptfs-kthrea vfio-irqfd-clea jbd2/nvme0n1p2- ... We can shorten these names to work around this problem, but it may be not applied to all of the truncated kthreads. Take 'jbd2/nvme0n1p2-' for example, it is a nice name, and it is not a good idea to shorten it. One possible way to fix this issue is extending the task comm size, but as task->comm is used in lots of places, that may cause some potential buffer overflows. Another more conservative approach is introducing a new pointer to store kthread's full name if it is truncated, which won't introduce too much overhead as it is in the non-critical path. Finally we make a dicision to use the second approach. See also the discussions in this thread: https://lore.kernel.org/lkml/20211101060419.4682-1-laoar.shao@gmail.com/ After this change, the full name of these truncated kthreads will be displayed via /proc/[pid]/comm: rcu_tasks_kthread rcu_tasks_rude_kthread rcu_tasks_trace_kthread poll_mpt3sas0_statu ext4-rsv-conversion xfs-reclaim/sdf1 xfs-blockgc/sdf1 xfs-inodegc/sdf1 audit_send_reply ecryptfs-kthread vfio-irqfd-cleanup jbd2/nvme0n1p2-8 Link: https://lkml.kernel.org/r/20211120112850.46047-1-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: David Hildenbrand Reviewed-by: Petr Mladek Suggested-by: Petr Mladek Suggested-by: Steven Rostedt Cc: Mathieu Desnoyers Cc: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Michal Miroslaw Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Matthew Wilcox Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 3 +++ include/linux/kthread.h | 1 + kernel/kthread.c | 32 ++++++++++++++++++++++++++++++-- 3 files changed, 34 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/proc/array.c b/fs/proc/array.c index ff869a66b34e..4321aa63835d 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -92,6 +92,7 @@ #include #include #include +#include #include #include "internal.h" @@ -102,6 +103,8 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) if (p->flags & PF_WQ_WORKER) wq_worker_comm(tcomm, sizeof(tcomm), p); + else if (p->flags & PF_KTHREAD) + get_kthread_comm(tcomm, sizeof(tcomm), p); else __get_task_comm(tcomm, sizeof(tcomm), p); diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 346b0f269161..2a5c04494663 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -33,6 +33,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), unsigned int cpu, const char *namefmt); +void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk); void set_kthread_struct(struct task_struct *p); void kthread_set_per_cpu(struct task_struct *k, int cpu); diff --git a/kernel/kthread.c b/kernel/kthread.c index 7113003fab63..a70cd5dc94e3 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -60,6 +60,8 @@ struct kthread { #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif + /* To store the full name if task comm is truncated. */ + char *full_name; }; enum KTHREAD_BITS { @@ -93,6 +95,18 @@ static inline struct kthread *__to_kthread(struct task_struct *p) return kthread; } +void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) +{ + struct kthread *kthread = to_kthread(tsk); + + if (!kthread || !kthread->full_name) { + __get_task_comm(buf, buf_size, tsk); + return; + } + + strscpy_pad(buf, kthread->full_name, buf_size); +} + void set_kthread_struct(struct task_struct *p) { struct kthread *kthread; @@ -118,9 +132,13 @@ void free_kthread_struct(struct task_struct *k) * or if kmalloc() in kthread() failed. */ kthread = to_kthread(k); + if (!kthread) + return; + #ifdef CONFIG_BLK_CGROUP - WARN_ON_ONCE(kthread && kthread->blkcg_css); + WARN_ON_ONCE(kthread->blkcg_css); #endif + kfree(kthread->full_name); kfree(kthread); } @@ -406,12 +424,22 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), task = create->result; if (!IS_ERR(task)) { char name[TASK_COMM_LEN]; + va_list aq; + int len; /* * task is already visible to other tasks, so updating * COMM must be protected. */ - vsnprintf(name, sizeof(name), namefmt, args); + va_copy(aq, args); + len = vsnprintf(name, sizeof(name), namefmt, aq); + va_end(aq); + if (len >= TASK_COMM_LEN) { + struct kthread *kthread = to_kthread(task); + + /* leave it truncated when out of memory. */ + kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args); + } set_task_comm(task, name); } kfree(create); -- cgit v1.2.3 From 7f8ca0edfe07d271ba6bef3cef5ec7fc1bbe8a68 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 19 Jan 2022 18:08:47 -0800 Subject: kernel/sys.c: only take tasklist_lock for get/setpriority(PRIO_PGRP) PRIO_PGRP needs the tasklist_lock mainly to serialize vs setpgid(2), to protect against any concurrent change_pid(PIDTYPE_PGID) that can move the task from one hlist to another while iterating. However, the remaining can only rely only on RCU: PRIO_PROCESS only does the task lookup and never iterates over tasklist and we already have an rcu-aware stable pointer. PRIO_USER is already racy vs setuid(2) so with creds being rcu protected, we can end up seeing stale data. When removing the tasklist_lock there can be a race with (i) fork but this is benign as the child's nice is inherited and the new task is not observable by the user yet either, hence the return semantics do not differ. And (ii) a race with exit, which is a small window and can cause us to miss a task which was removed from the list and it had the highest nice. Similarly change the buggy do_each_thread/while_each_thread combo in PRIO_USER for the rcu-safe for_each_process_thread flavor, which doesn't make use of next_thread/p->thread_group. [akpm@linux-foundation.org: coding style fixes] Link: https://lkml.kernel.org/r/20211210182250.43734-1-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 8fdac0d90504..34bbe8cd1f04 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -220,7 +220,6 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) niceval = MAX_NICE; rcu_read_lock(); - read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: if (who) @@ -235,9 +234,11 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) pgrp = find_vpid(who); else pgrp = task_pgrp(current); + read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); @@ -249,16 +250,15 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) if (!user) goto out_unlock; /* No processes for this user */ } - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); - } while_each_thread(g, p); + } if (!uid_eq(uid, cred->uid)) free_uid(user); /* For find_user() */ break; } out_unlock: - read_unlock(&tasklist_lock); rcu_read_unlock(); out: return error; @@ -283,7 +283,6 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) return -EINVAL; rcu_read_lock(); - read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: if (who) @@ -301,11 +300,13 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); + read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); @@ -317,19 +318,18 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) if (!user) goto out_unlock; /* No processes for this user */ } - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } - } while_each_thread(g, p); + } if (!uid_eq(uid, cred->uid)) free_uid(user); /* for find_user() */ break; } out_unlock: - read_unlock(&tasklist_lock); rcu_read_unlock(); return retval; -- cgit v1.2.3 From 23b36fec7e14f8cf1c17e832e53dd4761e0dfe83 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 19 Jan 2022 18:09:56 -0800 Subject: panic: use error_report_end tracepoint on warnings Introduce the error detector "warning" to the error_report event and use the error_report_end tracepoint at the end of a warning report. This allows in-kernel tests but also userspace to more easily determine if a warning occurred without polling kernel logs. [akpm@linux-foundation.org: add comma to enum list, per Andy] Link: https://lkml.kernel.org/r/20211115085630.1756817-1-elver@google.com Signed-off-by: Marco Elver Cc: Steven Rostedt Cc: Ingo Molnar Cc: Alexander Potapenko Cc: Petr Mladek Cc: Luis Chamberlain Cc: Wei Liu Cc: Mike Rapoport Cc: Arnd Bergmann Cc: John Ogness Cc: Andy Shevchenko Cc: Alexander Popov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/error_report.h | 8 +++++--- kernel/panic.c | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/error_report.h b/include/trace/events/error_report.h index 96f64bf218b2..a1922a800e6f 100644 --- a/include/trace/events/error_report.h +++ b/include/trace/events/error_report.h @@ -17,14 +17,16 @@ enum error_detector { ERROR_DETECTOR_KFENCE, - ERROR_DETECTOR_KASAN + ERROR_DETECTOR_KASAN, + ERROR_DETECTOR_WARN, }; #endif /* __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY */ -#define error_detector_list \ +#define error_detector_list \ EM(ERROR_DETECTOR_KFENCE, "kfence") \ - EMe(ERROR_DETECTOR_KASAN, "kasan") + EM(ERROR_DETECTOR_KASAN, "kasan") \ + EMe(ERROR_DETECTOR_WARN, "warning") /* Always end the list with an EMe. */ #undef EM diff --git a/kernel/panic.c b/kernel/panic.c index cefd7d82366f..8e299cae1615 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #define PANIC_TIMER_STEP 100 @@ -609,6 +610,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, print_irqtrace_events(current); print_oops_end_marker(); + trace_error_report_end(ERROR_DETECTOR_WARN, (unsigned long)caller); /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); -- cgit v1.2.3 From e83a4472bf9f556d01984048e398e64246c4dd6f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 19 Jan 2022 18:09:59 -0800 Subject: panic: remove oops_id The oops id has been added as part of the end of trace marker for the kerneloops.org project. The id is used to automatically identify duplicate submissions of the same report. Identical looking reports with different a id can be considered as the same oops occurred again. The early initialisation of the oops_id can create a warning if the random core is not yet fully initialized. On PREEMPT_RT it is problematic if the id is initialized on demand from non preemptible context. The kernel oops project is not available since 2017. Remove the oops_id and use 0 in the output in case parser rely on it. Link: https://bugs.debian.org/953172 Link: https://lkml.kernel.org/r/Ybdi16aP2NEugWHq@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Cc: Arjan van de Ven Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 8e299cae1615..55b50e052ec3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -534,26 +534,9 @@ void oops_enter(void) trigger_all_cpu_backtrace(); } -/* - * 64-bit random ID for oopses: - */ -static u64 oops_id; - -static int init_oops_id(void) -{ - if (!oops_id) - get_random_bytes(&oops_id, sizeof(oops_id)); - else - oops_id++; - - return 0; -} -late_initcall(init_oops_id); - static void print_oops_end_marker(void) { - init_oops_id(); - pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); + pr_warn("---[ end trace %016llx ]---\n", 0ULL); } /* -- cgit v1.2.3 From a3d5dc908a5f572ce3e31fe83fd2459a1c3c5422 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 19 Jan 2022 18:10:02 -0800 Subject: delayacct: support swapin delay accounting for swapping without blkio Currently delayacct accounts swapin delay only for swapping that cause blkio. If we use zram for swapping, tools/accounting/getdelays can't get any SWAP delay. It's useful to get zram swapin delay information, for example to adjust compress algorithm or /proc/sys/vm/swappiness. Reference to PSI, it accounts any kind of swapping by doing its work in swap_readpage(), no matter whether swapping causes blkio. Let delayacct do the similar work. Link: https://lkml.kernel.org/r/20211112083813.8559-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reported-by: Zeal Robot Cc: Balbir Singh Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 44 ++++++++++++++++++++++---------------------- kernel/delayacct.c | 33 ++++++++++++++++++--------------- mm/memory.c | 4 ---- mm/page_io.c | 3 +++ 4 files changed, 43 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index af7e6eb50283..b96d68f310a2 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -9,14 +9,6 @@ #include -/* - * Per-task flags relevant to delay accounting - * maintained privately to avoid exhausting similar flags in sched.h:PF_* - * Used to set current->delays->flags - */ -#define DELAYACCT_PF_SWAPIN 0x00000001 /* I am doing a swapin */ -#define DELAYACCT_PF_BLKIO 0x00000002 /* I am waiting on IO */ - #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info { raw_spinlock_t lock; @@ -37,13 +29,13 @@ struct task_delay_info { * associated with the operation is added to XXX_delay. * XXX_delay contains the accumulated delay time in nanoseconds. */ - u64 blkio_start; /* Shared by blkio, swapin */ + u64 blkio_start; u64 blkio_delay; /* wait for sync block io completion */ - u64 swapin_delay; /* wait for swapin block io completion */ + u64 swapin_start; + u64 swapin_delay; /* wait for swapin */ u32 blkio_count; /* total count of the number of sync block */ /* io operations performed */ - u32 swapin_count; /* total count of the number of swapin block */ - /* io operations performed */ + u32 swapin_count; /* total count of swapin */ u64 freepages_start; u64 freepages_delay; /* wait for memory reclaim */ @@ -79,14 +71,8 @@ extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); extern void __delayacct_thrashing_start(void); extern void __delayacct_thrashing_end(void); - -static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) -{ - if (p->delays) - return (p->delays->flags & DELAYACCT_PF_BLKIO); - else - return 0; -} +extern void __delayacct_swapin_start(void); +extern void __delayacct_swapin_end(void); static inline void delayacct_set_flag(struct task_struct *p, int flag) { @@ -123,7 +109,6 @@ static inline void delayacct_blkio_start(void) if (!static_branch_unlikely(&delayacct_key)) return; - delayacct_set_flag(current, DELAYACCT_PF_BLKIO); if (current->delays) __delayacct_blkio_start(); } @@ -135,7 +120,6 @@ static inline void delayacct_blkio_end(struct task_struct *p) if (p->delays) __delayacct_blkio_end(p); - delayacct_clear_flag(p, DELAYACCT_PF_BLKIO); } static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) @@ -169,6 +153,18 @@ static inline void delayacct_thrashing_end(void) __delayacct_thrashing_end(); } +static inline void delayacct_swapin_start(void) +{ + if (current->delays) + __delayacct_swapin_start(); +} + +static inline void delayacct_swapin_end(void) +{ + if (current->delays) + __delayacct_swapin_end(); +} + #else static inline void delayacct_set_flag(struct task_struct *p, int flag) {} @@ -199,6 +195,10 @@ static inline void delayacct_thrashing_start(void) {} static inline void delayacct_thrashing_end(void) {} +static inline void delayacct_swapin_start(void) +{} +static inline void delayacct_swapin_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 51530d5b15a8..97699848c1f0 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -100,19 +100,10 @@ void __delayacct_blkio_start(void) */ void __delayacct_blkio_end(struct task_struct *p) { - struct task_delay_info *delays = p->delays; - u64 *total; - u32 *count; - - if (p->delays->flags & DELAYACCT_PF_SWAPIN) { - total = &delays->swapin_delay; - count = &delays->swapin_count; - } else { - total = &delays->blkio_delay; - count = &delays->blkio_count; - } - - delayacct_end(&delays->lock, &delays->blkio_start, total, count); + delayacct_end(&p->delays->lock, + &p->delays->blkio_start, + &p->delays->blkio_delay, + &p->delays->blkio_count); } int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -179,8 +170,7 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) unsigned long flags; raw_spin_lock_irqsave(&tsk->delays->lock, flags); - ret = nsec_to_clock_t(tsk->delays->blkio_delay + - tsk->delays->swapin_delay); + ret = nsec_to_clock_t(tsk->delays->blkio_delay); raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return ret; } @@ -210,3 +200,16 @@ void __delayacct_thrashing_end(void) ¤t->delays->thrashing_delay, ¤t->delays->thrashing_count); } + +void __delayacct_swapin_start(void) +{ + current->delays->swapin_start = local_clock(); +} + +void __delayacct_swapin_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->swapin_start, + ¤t->delays->swapin_delay, + ¤t->delays->swapin_count); +} diff --git a/mm/memory.c b/mm/memory.c index 8f1de811a1dc..ced3274c3deb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3507,7 +3507,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!si)) goto out; - delayacct_set_flag(current, DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry, vma, vmf->address); swapcache = page; @@ -3555,7 +3554,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; - delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto unlock; } @@ -3569,13 +3567,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; - delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto out_release; } locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); - delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); if (!locked) { ret |= VM_FAULT_RETRY; goto out_release; diff --git a/mm/page_io.c b/mm/page_io.c index 9725c7e1eeea..0bf8e40f4e57 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -25,6 +25,7 @@ #include #include #include +#include void end_swap_bio_write(struct bio *bio) { @@ -370,6 +371,7 @@ int swap_readpage(struct page *page, bool synchronous) * significant part of overall IO time. */ psi_memstall_enter(&pflags); + delayacct_swapin_start(); if (frontswap_load(page) == 0) { SetPageUptodate(page); @@ -432,6 +434,7 @@ int swap_readpage(struct page *page, bool synchronous) out: psi_memstall_leave(&pflags); + delayacct_swapin_end(); return ret; } -- cgit v1.2.3 From 5bf18281534451bf1ad56a45a3085cd7ad46860d Mon Sep 17 00:00:00 2001 From: wangyong Date: Wed, 19 Jan 2022 18:10:15 -0800 Subject: delayacct: track delays from memory compact Delay accounting does not track the delay of memory compact. When there is not enough free memory, tasks can spend a amount of their time waiting for compact. To get the impact of tasks in direct memory compact, measure the delay when allocating memory through memory compact. Also update tools/accounting/getdelays.c: / # ./getdelays_next -di -p 304 print delayacct stats ON printing IO accounting PID 304 CPU count real total virtual total delay total delay average 277 780000000 849039485 18877296 0.068ms IO count delay total delay average 0 0 0ms SWAP count delay total delay average 0 0 0ms RECLAIM count delay total delay average 5 11088812685 2217ms THRASHING count delay total delay average 0 0 0ms COMPACT count delay total delay average 3 72758 0ms watch: read=0, write=0, cancelled_write=0 Link: https://lkml.kernel.org/r/1638619795-71451-1-git-send-email-wang.yong12@zte.com.cn Signed-off-by: wangyong Reviewed-by: Jiang Xuexin Reviewed-by: Zhang Wenya Reviewed-by: Yang Yang Reviewed-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 28 ++++++++++++++++++++++++++++ include/uapi/linux/taskstats.h | 6 +++++- kernel/delayacct.c | 16 ++++++++++++++++ mm/page_alloc.c | 3 +++ tools/accounting/getdelays.c | 8 +++++++- 5 files changed, 59 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 435c3654a0ff..3e03d010bd2e 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -42,8 +42,12 @@ struct task_delay_info { u64 thrashing_start; u64 thrashing_delay; /* wait for thrashing page */ + u64 compact_start; + u64 compact_delay; /* wait for memory compact */ + u32 freepages_count; /* total count of memory reclaim */ u32 thrashing_count; /* total count of thrash waits */ + u32 compact_count; /* total count of memory compact */ }; #endif @@ -72,6 +76,8 @@ extern void __delayacct_thrashing_start(void); extern void __delayacct_thrashing_end(void); extern void __delayacct_swapin_start(void); extern void __delayacct_swapin_end(void); +extern void __delayacct_compact_start(void); +extern void __delayacct_compact_end(void); static inline void delayacct_tsk_init(struct task_struct *tsk) { @@ -170,6 +176,24 @@ static inline void delayacct_swapin_end(void) __delayacct_swapin_end(); } +static inline void delayacct_compact_start(void) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (current->delays) + __delayacct_compact_start(); +} + +static inline void delayacct_compact_end(void) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (current->delays) + __delayacct_compact_end(); +} + #else static inline void delayacct_init(void) {} @@ -200,6 +224,10 @@ static inline void delayacct_swapin_start(void) {} static inline void delayacct_swapin_end(void) {} +static inline void delayacct_compact_start(void) +{} +static inline void delayacct_compact_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index ccbd08709321..12327d32378f 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 10 +#define TASKSTATS_VERSION 11 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -172,6 +172,10 @@ struct taskstats { /* v10: 64-bit btime to avoid overflow */ __u64 ac_btime64; /* 64-bit begin time */ + + /* Delay waiting for memory compact */ + __u64 compact_count; + __u64 compact_delay_total; }; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 97699848c1f0..c5e8cea9e05f 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -155,10 +155,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; + tmp = d->compact_delay_total + tsk->delays->compact_delay; + d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; + d->compact_count += tsk->delays->compact_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -213,3 +216,16 @@ void __delayacct_swapin_end(void) ¤t->delays->swapin_delay, ¤t->delays->swapin_count); } + +void __delayacct_compact_start(void) +{ + current->delays->compact_start = local_clock(); +} + +void __delayacct_compact_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->compact_start, + ¤t->delays->compact_delay, + ¤t->delays->compact_count); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c5952749ad40..635063f49671 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -4348,6 +4349,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; psi_memstall_enter(&pflags); + delayacct_compact_start(); noreclaim_flag = memalloc_noreclaim_save(); *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, @@ -4355,6 +4357,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); + delayacct_compact_end(); if (*compact_result == COMPACT_SKIPPED) return NULL; diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 5ef1c15e88ad..11e86739456d 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -205,6 +205,8 @@ static void print_delayacct(struct taskstats *t) "RECLAIM %12s%15s%15s\n" " %15llu%15llu%15llums\n" "THRASHING%12s%15s%15s\n" + " %15llu%15llu%15llums\n" + "COMPACT %12s%15s%15s\n" " %15llu%15llu%15llums\n", "count", "real total", "virtual total", "delay total", "delay average", @@ -228,7 +230,11 @@ static void print_delayacct(struct taskstats *t) "count", "delay total", "delay average", (unsigned long long)t->thrashing_count, (unsigned long long)t->thrashing_delay_total, - average_ms(t->thrashing_delay_total, t->thrashing_count)); + average_ms(t->thrashing_delay_total, t->thrashing_count), + "count", "delay total", "delay average", + (unsigned long long)t->compact_count, + (unsigned long long)t->compact_delay_total, + average_ms(t->compact_delay_total, t->compact_count)); } static void task_context_switch_counts(struct taskstats *t) -- cgit v1.2.3 From 0aaa8977acbf3996d351f51b3b15295943092f63 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 19 Jan 2022 18:10:18 -0800 Subject: configs: introduce debug.config for CI-like setup Some general debugging features like kmemleak, KASAN, lockdep, UBSAN etc help fix many viruses like a microscope. On the other hand, those features are scatter around and mixed up with more situational debugging options making them difficult to consume properly. This cold help amplify the general debugging/testing efforts and help establish sensitive default values for those options across the broad. This could also help different distros to collaborate on maintaining debug-flavored kernels. The config is based on years' experiences running daily CI inside the largest enterprise Linux distro company to seek regressions on linux-next builds on different bare-metal and virtual platforms. It can be used for example, $ make ARCH=arm64 defconfig debug.config Since KASAN and KCSAN can't be enabled together, we will need to create a separate one for KCSAN later as well. Link: https://lkml.kernel.org/r/20211115134754.7334-1-quic_qiancai@quicinc.com Signed-off-by: Qian Cai Acked-by: Paul E. McKenney Cc: Marco Elver Cc: Dmitry Vyukov Cc: Daniel Thompson Cc: Masahiro Yamada Cc: Naresh Kamboju Cc: "Stephen Rothwell" Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs/debug.config | 105 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 kernel/configs/debug.config (limited to 'kernel') diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config new file mode 100644 index 000000000000..e9ffb0cc1eec --- /dev/null +++ b/kernel/configs/debug.config @@ -0,0 +1,105 @@ +# The config is based on running daily CI for enterprise Linux distros to +# seek regressions on linux-next builds on different bare-metal and virtual +# platforms. It can be used for example, +# +# $ make ARCH=arm64 defconfig debug.config +# +# Keep alphabetically sorted inside each section. +# +# printk and dmesg options +# +CONFIG_DEBUG_BUGVERBOSE=y +CONFIG_DYNAMIC_DEBUG=y +CONFIG_PRINTK_CALLER=y +CONFIG_PRINTK_TIME=y +CONFIG_SYMBOLIC_ERRNAME=y +# +# Compile-time checks and compiler options +# +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_SECTION_MISMATCH=y +CONFIG_FRAME_WARN=2048 +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +# +# Generic Kernel Debugging Instruments +# +# CONFIG_UBSAN_ALIGNMENT is not set +# CONFIG_UBSAN_DIV_ZERO is not set +# CONFIG_UBSAN_TRAP is not set +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +CONFIG_DEBUG_FS=y +CONFIG_DEBUG_FS_ALLOW_ALL=y +CONFIG_DEBUG_IRQFLAGS=y +CONFIG_UBSAN=y +CONFIG_UBSAN_BOOL=y +CONFIG_UBSAN_BOUNDS=y +CONFIG_UBSAN_ENUM=y +CONFIG_UBSAN_SHIFT=y +CONFIG_UBSAN_UNREACHABLE=y +# +# Memory Debugging +# +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +# CONFIG_DEBUG_WX is not set +# CONFIG_KFENCE is not set +# CONFIG_PAGE_POISONING is not set +# CONFIG_SLUB_STATS is not set +CONFIG_PAGE_EXTENSION=y +CONFIG_PAGE_OWNER=y +CONFIG_DEBUG_KMEMLEAK=y +CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y +CONFIG_DEBUG_OBJECTS=y +CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1 +CONFIG_DEBUG_OBJECTS_FREE=y +CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y +CONFIG_DEBUG_OBJECTS_RCU_HEAD=y +CONFIG_DEBUG_OBJECTS_TIMERS=y +CONFIG_DEBUG_OBJECTS_WORK=y +CONFIG_DEBUG_PER_CPU_MAPS=y +CONFIG_DEBUG_STACK_USAGE=y +CONFIG_DEBUG_VIRTUAL=y +CONFIG_DEBUG_VM=y +CONFIG_DEBUG_VM_PGFLAGS=y +CONFIG_DEBUG_VM_RB=y +CONFIG_DEBUG_VM_VMACACHE=y +CONFIG_GENERIC_PTDUMP=y +CONFIG_KASAN=y +CONFIG_KASAN_GENERIC=y +CONFIG_KASAN_INLINE=y +CONFIG_KASAN_VMALLOC=y +CONFIG_PTDUMP_DEBUGFS=y +CONFIG_SCHED_STACK_END_CHECK=y +CONFIG_SLUB_DEBUG_ON=y +# +# Debug Oops, Lockups and Hangs +# +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_DEBUG_ATOMIC_SLEEP=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_PANIC_TIMEOUT=0 +CONFIG_SOFTLOCKUP_DETECTOR=y +# +# Lock Debugging (spinlocks, mutexes, etc...) +# +# CONFIG_PROVE_RAW_LOCK_NESTING is not set +CONFIG_PROVE_LOCKING=y +# +# Debug kernel data structures +# +CONFIG_BUG_ON_DATA_CORRUPTION=y +# +# RCU Debugging +# +CONFIG_PROVE_RCU=y +CONFIG_PROVE_RCU_LIST=y +# +# Tracers +# +CONFIG_BRANCH_PROFILE_NONE=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_FTRACE=y +CONFIG_FUNCTION_TRACER=y -- cgit v1.2.3 From 78e36f3b0dae586f623c4a37ec5eb5496f5abbe1 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:10:55 -0800 Subject: sysctl: move some boundary constants from sysctl.c to sysctl_vals sysctl has helpers which let us specify boundary values for a min or max int value. Since these are used for a boundary check only they don't change, so move these variables to sysctl_vals to avoid adding duplicate variables. This will help with our cleanup of kernel/sysctl.c. [akpm@linux-foundation.org: update it for "mm/pagealloc: sysctl: change watermark_scale_factor max limit to 30%"] [mcgrof@kernel.org: major rebase] Link: https://lkml.kernel.org/r/20211123202347.818157-3-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 2 +- include/linux/sysctl.h | 13 ++++++++++--- kernel/sysctl.c | 45 +++++++++++++++++++-------------------------- 3 files changed, 30 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 55a54973f061..cd8bbf13d0f0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -26,7 +26,7 @@ static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { 0, 1, INT_MAX }; +const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX }; EXPORT_SYMBOL(sysctl_vals); /* Support for permanently empty directories */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index d3ab7969b6b5..47cf70c8eb93 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -38,9 +38,16 @@ struct ctl_table_header; struct ctl_dir; /* Keep the same order as in fs/proc/proc_sysctl.c */ -#define SYSCTL_ZERO ((void *)&sysctl_vals[0]) -#define SYSCTL_ONE ((void *)&sysctl_vals[1]) -#define SYSCTL_INT_MAX ((void *)&sysctl_vals[2]) +#define SYSCTL_NEG_ONE ((void *)&sysctl_vals[0]) +#define SYSCTL_ZERO ((void *)&sysctl_vals[1]) +#define SYSCTL_ONE ((void *)&sysctl_vals[2]) +#define SYSCTL_TWO ((void *)&sysctl_vals[3]) +#define SYSCTL_FOUR ((void *)&sysctl_vals[4]) +#define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[5]) +#define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[6]) +#define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[7]) +#define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[8]) +#define SYSCTL_INT_MAX ((void *)&sysctl_vals[9]) extern const int sysctl_vals[]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ef77be575d87..901a9dfe4d62 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -114,16 +114,9 @@ static int sixty = 60; #endif -static int __maybe_unused neg_one = -1; -static int __maybe_unused two = 2; -static int __maybe_unused four = 4; static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; -static int one_hundred = 100; -static int two_hundred = 200; -static int one_thousand = 1000; -static int three_thousand = 3000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -1964,7 +1957,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, + .extra1 = SYSCTL_NEG_ONE, .extra2 = SYSCTL_ONE, }, #endif @@ -2306,7 +2299,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax_sysadmin, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, #endif { @@ -2566,7 +2559,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &neg_one, + .extra1 = SYSCTL_NEG_ONE, }, #endif #ifdef CONFIG_RT_MUTEXES @@ -2628,7 +2621,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_cpu_time_max_percent_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "perf_event_max_stack", @@ -2646,7 +2639,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_thousand, + .extra2 = SYSCTL_ONE_THOUSAND, }, #endif { @@ -2677,7 +2670,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = bpf_unpriv_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "bpf_stats_enabled", @@ -2731,7 +2724,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = overcommit_policy_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "panic_on_oom", @@ -2740,7 +2733,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "oom_kill_allocating_task", @@ -2785,7 +2778,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = dirty_background_ratio_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_background_bytes", @@ -2802,7 +2795,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = dirty_ratio_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "dirty_bytes", @@ -2842,7 +2835,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two_hundred, + .extra2 = SYSCTL_TWO_HUNDRED, }, #ifdef CONFIG_HUGETLB_PAGE { @@ -2899,7 +2892,7 @@ static struct ctl_table vm_table[] = { .mode = 0200, .proc_handler = drop_caches_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &four, + .extra2 = SYSCTL_FOUR, }, #ifdef CONFIG_COMPACTION { @@ -2916,7 +2909,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = compaction_proactiveness_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "extfrag_threshold", @@ -2961,7 +2954,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = watermark_scale_factor_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &three_thousand, + .extra2 = SYSCTL_THREE_THOUSAND, }, { .procname = "percpu_pagelist_high_fraction", @@ -3040,7 +3033,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, { .procname = "min_slab_ratio", @@ -3049,7 +3042,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = sysctl_min_slab_ratio_sysctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &one_hundred, + .extra2 = SYSCTL_ONE_HUNDRED, }, #endif #ifdef CONFIG_SMP @@ -3339,7 +3332,7 @@ static struct ctl_table fs_table[] = { .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "protected_regular", @@ -3348,7 +3341,7 @@ static struct ctl_table fs_table[] = { .mode = 0600, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, { .procname = "suid_dumpable", @@ -3357,7 +3350,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax_coredump, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = SYSCTL_TWO, }, #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) { -- cgit v1.2.3 From bbe7a10ed83a5fa0b0ff6161ecdc4e65a0e9c993 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:00 -0800 Subject: hung_task: move hung_task sysctl interface to hung_task.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move hung_task sysctl interface to hung_task.c and use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: commit log refresh and fixed 2-3 0day reported compile issues] Link: https://lkml.kernel.org/r/20211123202347.818157-4-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Paul Turner Cc: Peter Zijlstra Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/sysctl.h | 14 +------- kernel/hung_task.c | 81 ++++++++++++++++++++++++++++++++++++++++++-- kernel/sysctl.c | 61 --------------------------------- 3 files changed, 79 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 304f431178fd..c19dd5a2c05c 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -7,20 +7,8 @@ struct ctl_table; #ifdef CONFIG_DETECT_HUNG_TASK - -#ifdef CONFIG_SMP -extern unsigned int sysctl_hung_task_all_cpu_backtrace; -#else -#define sysctl_hung_task_all_cpu_backtrace 0 -#endif /* CONFIG_SMP */ - -extern int sysctl_hung_task_check_count; -extern unsigned int sysctl_hung_task_panic; +/* used for hung_task and block/ */ extern unsigned long sysctl_hung_task_timeout_secs; -extern unsigned long sysctl_hung_task_check_interval_secs; -extern int sysctl_hung_task_warnings; -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); #else /* Avoid need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9888e2bc8c76..52501e5f7655 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -63,7 +63,9 @@ static struct task_struct *watchdog_task; * Should we dump all CPUs backtraces in a hung task event? * Defaults to 0, can be changed via sysctl. */ -unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#else +#define sysctl_hung_task_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ /* @@ -222,11 +224,13 @@ static long hung_timeout_jiffies(unsigned long last_checked, MAX_SCHEDULE_TIMEOUT; } +#ifdef CONFIG_SYSCTL /* * Process updating of timeout sysctl */ -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -241,6 +245,76 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ +static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ); +static struct ctl_table hung_task_sysctls[] = { +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = &sysctl_hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ + { + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hung_task_check_count", + .data = &sysctl_hung_task_check_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "hung_task_timeout_secs", + .data = &sysctl_hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)&hung_task_timeout_max, + }, + { + .procname = "hung_task_check_interval_secs", + .data = &sysctl_hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)&hung_task_timeout_max, + }, + { + .procname = "hung_task_warnings", + .data = &sysctl_hung_task_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_NEG_ONE, + }, + {} +}; + +static void __init hung_task_sysctl_init(void) +{ + register_sysctl_init("kernel", hung_task_sysctls); +} +#else +#define hung_task_sysctl_init() do { } while (0) +#endif /* CONFIG_SYSCTL */ + + static atomic_t reset_hung_task = ATOMIC_INIT(0); void reset_hung_task_detector(void) @@ -310,6 +384,7 @@ static int __init hung_task_init(void) pm_notifier(hungtask_pm_notify, 0); watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); + hung_task_sysctl_init(); return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 901a9dfe4d62..a4e00abe6f78 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -134,13 +134,6 @@ static int minolduid; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -/* - * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs - * and hung_task_check_interval_secs - */ -#ifdef CONFIG_DETECT_HUNG_TASK -static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); -#endif #ifdef CONFIG_INOTIFY_USER #include @@ -2508,60 +2501,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_DETECT_HUNG_TASK -#ifdef CONFIG_SMP - { - .procname = "hung_task_all_cpu_backtrace", - .data = &sysctl_hung_task_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ - { - .procname = "hung_task_panic", - .data = &sysctl_hung_task_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "hung_task_check_count", - .data = &sysctl_hung_task_check_count, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "hung_task_timeout_secs", - .data = &sysctl_hung_task_timeout_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_check_interval_secs", - .data = &sysctl_hung_task_check_interval_secs, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_dohung_task_timeout_secs, - .extra2 = &hung_task_timeout_max, - }, - { - .procname = "hung_task_warnings", - .data = &sysctl_hung_task_warnings, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_NEG_ONE, - }, -#endif #ifdef CONFIG_RT_MUTEXES { .procname = "max_lock_depth", -- cgit v1.2.3 From dd0693fdf054f2ed37202ed56649ae2cccd29474 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:05 -0800 Subject: watchdog: move watchdog sysctl interface to watchdog.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic of proc sysctl. So, move the watchdog syscl interface to watchdog.c. Use register_sysctl() to register the sysctl interface to avoid merge conflicts when different features modify sysctl.c at the same time. [mcgrof@kernel.org: justify the move on the commit log] Link: https://lkml.kernel.org/r/20211123202347.818157-5-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Paul Turner Cc: Peter Zijlstra Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 96 --------------------------------------------------- kernel/watchdog.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a4e00abe6f78..55e25996e3cf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -103,16 +103,10 @@ #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include #endif -#ifdef CONFIG_LOCKUP_DETECTOR -#include -#endif #if defined(CONFIG_SYSCTL) /* Constants used for minimum and maximum */ -#ifdef CONFIG_LOCKUP_DETECTOR -static int sixty = 60; -#endif static unsigned long zero_ul; static unsigned long one_ul = 1; @@ -2309,96 +2303,6 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_LOCKUP_DETECTOR) - { - .procname = "watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_thresh", - .data = &watchdog_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_watchdog_thresh, - .extra1 = SYSCTL_ZERO, - .extra2 = &sixty, - }, - { - .procname = "nmi_watchdog", - .data = &nmi_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = NMI_WATCHDOG_SYSCTL_PERM, - .proc_handler = proc_nmi_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "watchdog_cpumask", - .data = &watchdog_cpumask_bits, - .maxlen = NR_CPUS, - .mode = 0644, - .proc_handler = proc_watchdog_cpumask, - }, -#ifdef CONFIG_SOFTLOCKUP_DETECTOR - { - .procname = "soft_watchdog", - .data = &soft_watchdog_user_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_soft_watchdog, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "softlockup_panic", - .data = &softlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "softlockup_all_cpu_backtrace", - .data = &sysctl_softlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR - { - .procname = "hardlockup_panic", - .data = &hardlockup_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#ifdef CONFIG_SMP - { - .procname = "hardlockup_all_cpu_backtrace", - .data = &sysctl_hardlockup_all_cpu_backtrace, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SMP */ -#endif -#endif - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) { .procname = "unknown_nmi_panic", diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ad912511a0c0..99afb88d2e85 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -740,6 +740,106 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, mutex_unlock(&watchdog_mutex); return err; } + +static const int sixty = 60; + +static struct ctl_table watchdog_sysctls[] = { + { + .procname = "watchdog", + .data = &watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_thresh", + .data = &watchdog_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_watchdog_thresh, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&sixty, + }, + { + .procname = "nmi_watchdog", + .data = &nmi_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = NMI_WATCHDOG_SYSCTL_PERM, + .proc_handler = proc_nmi_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "watchdog_cpumask", + .data = &watchdog_cpumask_bits, + .maxlen = NR_CPUS, + .mode = 0644, + .proc_handler = proc_watchdog_cpumask, + }, +#ifdef CONFIG_SOFTLOCKUP_DETECTOR + { + .procname = "soft_watchdog", + .data = &soft_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_soft_watchdog, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif +#ifdef CONFIG_HARDLOCKUP_DETECTOR + { + .procname = "hardlockup_panic", + .data = &hardlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#ifdef CONFIG_SMP + { + .procname = "hardlockup_all_cpu_backtrace", + .data = &sysctl_hardlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ +#endif + {} +}; + +static void __init watchdog_sysctl_init(void) +{ + register_sysctl_init("kernel", watchdog_sysctls); +} +#else +#define watchdog_sysctl_init() do { } while (0) #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) @@ -753,4 +853,5 @@ void __init lockup_detector_init(void) if (!watchdog_nmi_probe()) nmi_watchdog_available = true; lockup_detector_setup(); + watchdog_sysctl_init(); } -- cgit v1.2.3 From f628867da46f8867e1854e43d7200e42ec22eee2 Mon Sep 17 00:00:00 2001 From: Stephen Kitt Date: Fri, 21 Jan 2022 22:11:09 -0800 Subject: sysctl: make ngroups_max const ngroups_max is a read-only sysctl entry, reflecting NGROUPS_MAX. Make it const, in the same way as cap_last_cap. Link: https://lkml.kernel.org/r/20211123202347.818157-6-mcgrof@kernel.org Signed-off-by: Stephen Kitt Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Tetsuo Handa Cc: Xiaoming Ni Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 55e25996e3cf..29babf1f0d4a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -125,7 +125,7 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; static int maxolduid = 65535; static int minolduid; -static int ngroups_max = NGROUPS_MAX; +static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -2291,7 +2291,7 @@ static struct ctl_table kern_table[] = { #endif { .procname = "ngroups_max", - .data = &ngroups_max, + .data = (void *)&ngroups_max, .maxlen = sizeof (int), .mode = 0444, .proc_handler = proc_dointvec, -- cgit v1.2.3 From d73840ec2f747b860331bbba53677d0ce38fb9c1 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:14 -0800 Subject: sysctl: use const for typically used max/min proc sysctls When proc_dointvec_minmax() or proc_doulongvec_minmax() are used we are using the extra1 and extra2 parameters on the sysctl table only for a min and max boundary, these extra1 and extra2 arguments are then used for read-only operations. So make them const to reflect this. [mcgrof@kernel.org: commit log love] Link: https://lkml.kernel.org/r/20211123202347.818157-7-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 53 ++++++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 29babf1f0d4a..134c2cb9ed4e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -108,27 +108,26 @@ /* Constants used for minimum and maximum */ -static unsigned long zero_ul; -static unsigned long one_ul = 1; -static unsigned long long_max = LONG_MAX; +static const unsigned long zero_ul; +static const unsigned long one_ul = 1; +static const unsigned long long_max = LONG_MAX; #ifdef CONFIG_PRINTK -static int ten_thousand = 10000; +static const int ten_thousand = 10000; #endif #ifdef CONFIG_PERF_EVENTS -static int six_hundred_forty_kb = 640 * 1024; +static const int six_hundred_forty_kb = 640 * 1024; #endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ -static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; +static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ -static int maxolduid = 65535; -static int minolduid; +static const int maxolduid = 65535; +static const int minolduid; static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; - #ifdef CONFIG_INOTIFY_USER #include #endif @@ -172,8 +171,8 @@ int sysctl_legacy_va_layout; #endif #ifdef CONFIG_COMPACTION -static int min_extfrag_threshold; -static int max_extfrag_threshold = 1000; +static const int min_extfrag_threshold; +static const int max_extfrag_threshold = 1000; #endif #endif /* CONFIG_SYSCTL */ @@ -2177,8 +2176,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = (void *)&minolduid, + .extra2 = (void *)&maxolduid, }, { .procname = "overflowgid", @@ -2186,8 +2185,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = (void *)&minolduid, + .extra2 = (void *)&maxolduid, }, #ifdef CONFIG_S390 { @@ -2261,7 +2260,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &ten_thousand, + .extra2 = (void *)&ten_thousand, }, { .procname = "printk_devkmsg", @@ -2473,7 +2472,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = perf_event_max_stack_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &six_hundred_forty_kb, + .extra2 = (void *)&six_hundred_forty_kb, }, { .procname = "perf_event_max_contexts_per_stack", @@ -2629,7 +2628,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirty_background_bytes), .mode = 0644, .proc_handler = dirty_background_bytes_handler, - .extra1 = &one_ul, + .extra1 = (void *)&one_ul, }, { .procname = "dirty_ratio", @@ -2646,7 +2645,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(vm_dirty_bytes), .mode = 0644, .proc_handler = dirty_bytes_handler, - .extra1 = &dirty_bytes_min, + .extra1 = (void *)&dirty_bytes_min, }, { .procname = "dirty_writeback_centisecs", @@ -2760,8 +2759,8 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &min_extfrag_threshold, - .extra2 = &max_extfrag_threshold, + .extra1 = (void *)&min_extfrag_threshold, + .extra2 = (void *)&max_extfrag_threshold, }, { .procname = "compact_unevictable_allowed", @@ -3047,8 +3046,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &zero_ul, - .extra2 = &long_max, + .extra1 = (void *)&zero_ul, + .extra2 = (void *)&long_max, }, { .procname = "nr_open", @@ -3072,8 +3071,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = (void *)&minolduid, + .extra2 = (void *)&maxolduid, }, { .procname = "overflowgid", @@ -3081,8 +3080,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &minolduid, - .extra2 = &maxolduid, + .extra1 = (void *)&minolduid, + .extra2 = (void *)&maxolduid, }, #ifdef CONFIG_FILE_LOCKING { -- cgit v1.2.3 From 2452dcb9f7f2bab5b47344148bc23a732be9195c Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:19 -0800 Subject: sysctl: use SYSCTL_ZERO to replace some static int zero uses Use the variable SYSCTL_ZERO to replace some static int boundary variables with a value of 0 (minolduid, min_extfrag_threshold, min_wakeup_granularity_ns). Link: https://lkml.kernel.org/r/20211123202347.818157-8-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jan Kara Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 134c2cb9ed4e..a2175a305f10 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -123,7 +123,7 @@ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static const int maxolduid = 65535; -static const int minolduid; +/* minolduid is SYSCTL_ZERO */ static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -171,7 +171,7 @@ int sysctl_legacy_va_layout; #endif #ifdef CONFIG_COMPACTION -static const int min_extfrag_threshold; +/* min_extfrag_threshold is SYSCTL_ZERO */; static const int max_extfrag_threshold = 1000; #endif @@ -2176,7 +2176,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&minolduid, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&maxolduid, }, { @@ -2185,7 +2185,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&minolduid, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&maxolduid, }, #ifdef CONFIG_S390 @@ -2759,7 +2759,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&min_extfrag_threshold, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&max_extfrag_threshold, }, { @@ -3071,7 +3071,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&minolduid, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&maxolduid, }, { @@ -3080,7 +3080,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)&minolduid, + .extra1 = SYSCTL_ZERO, .extra2 = (void *)&maxolduid, }, #ifdef CONFIG_FILE_LOCKING -- cgit v1.2.3 From 86b12b6c5d6b46e64bf2e8080528781032e4bd90 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:24 -0800 Subject: aio: move aio sysctl to aio.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. Move aio sysctl to aio.c and use the new register_sysctl_init() to register the sysctl interface for aio. [mcgrof@kernel.org: adjust commit log to justify the move] Link: https://lkml.kernel.org/r/20211123202347.818157-9-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Reviewed-by: Jan Kara Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 31 +++++++++++++++++++++++++++++-- include/linux/aio.h | 4 ---- kernel/sysctl.c | 17 ----------------- 3 files changed, 29 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/fs/aio.c b/fs/aio.c index f6f1cbffef9e..4ceba13a7db0 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -220,9 +220,35 @@ struct aio_kiocb { /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); -unsigned long aio_nr; /* current system wide number of aio requests */ -unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ +static unsigned long aio_nr; /* current system wide number of aio requests */ +static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ +#ifdef CONFIG_SYSCTL +static struct ctl_table aio_sysctls[] = { + { + .procname = "aio-nr", + .data = &aio_nr, + .maxlen = sizeof(aio_nr), + .mode = 0444, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "aio-max-nr", + .data = &aio_max_nr, + .maxlen = sizeof(aio_max_nr), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + {} +}; + +static void __init aio_sysctl_init(void) +{ + register_sysctl_init("fs", aio_sysctls); +} +#else +#define aio_sysctl_init() do { } while (0) +#endif static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; @@ -275,6 +301,7 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); + aio_sysctl_init(); return 0; } __initcall(aio_setup); diff --git a/include/linux/aio.h b/include/linux/aio.h index b83e68dd006f..86892a4fe7c8 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -20,8 +20,4 @@ static inline void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) { } #endif /* CONFIG_AIO */ -/* for sysctl: */ -extern unsigned long aio_nr; -extern unsigned long aio_max_nr; - #endif /* __LINUX__AIO_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a2175a305f10..822555a0ec76 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -20,7 +20,6 @@ */ #include -#include #include #include #include @@ -3111,22 +3110,6 @@ static struct ctl_table fs_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_AIO - { - .procname = "aio-nr", - .data = &aio_nr, - .maxlen = sizeof(aio_nr), - .mode = 0444, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "aio-max-nr", - .data = &aio_max_nr, - .maxlen = sizeof(aio_max_nr), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, -#endif /* CONFIG_AIO */ #ifdef CONFIG_INOTIFY_USER { .procname = "inotify", -- cgit v1.2.3 From 49a4de75719b6c0f1f375df9908a95cef1e34945 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:29 -0800 Subject: dnotify: move dnotify sysctl to dnotify.c The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move dnotify sysctls to dnotify.c and use the new register_sysctl_init() to register the sysctl interface. [mcgrof@kernel.org: adjust the commit log to justify the move] Link: https://lkml.kernel.org/r/20211123202347.818157-10-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Acked-by: Jan Kara Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Benjamin LaHaise Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Kees Cook Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Qing Wang Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Phillip Potter Cc: Rodrigo Vivi Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/dnotify/dnotify.c | 21 ++++++++++++++++++++- include/linux/dnotify.h | 1 - kernel/sysctl.c | 10 ---------- 3 files changed, 20 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index d5ebebb034ff..829dd4a61b66 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -19,7 +19,25 @@ #include #include -int dir_notify_enable __read_mostly = 1; +static int dir_notify_enable __read_mostly = 1; +#ifdef CONFIG_SYSCTL +static struct ctl_table dnotify_sysctls[] = { + { + .procname = "dir-notify-enable", + .data = &dir_notify_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; +static void __init dnotify_sysctl_init(void) +{ + register_sysctl_init("fs", dnotify_sysctls); +} +#else +#define dnotify_sysctl_init() do { } while (0) +#endif static struct kmem_cache *dnotify_struct_cache __read_mostly; static struct kmem_cache *dnotify_mark_cache __read_mostly; @@ -386,6 +404,7 @@ static int __init dnotify_init(void) dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops); if (IS_ERR(dnotify_group)) panic("unable to allocate fsnotify group for dnotify\n"); + dnotify_sysctl_init(); return 0; } diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h index b87c3b85a166..b1d26f9f1c9f 100644 --- a/include/linux/dnotify.h +++ b/include/linux/dnotify.h @@ -29,7 +29,6 @@ struct dnotify_struct { FS_CREATE | FS_RENAME |\ FS_MOVED_FROM | FS_MOVED_TO) -extern int dir_notify_enable; extern void dnotify_flush(struct file *, fl_owner_t); extern int fcntl_dirnotify(int, struct file *, unsigned long); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 822555a0ec76..fb37825cd3cd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,7 +49,6 @@ #include #include #include -#include #include #include #include @@ -3091,15 +3090,6 @@ static struct ctl_table fs_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_DNOTIFY - { - .procname = "dir-notify-enable", - .data = &dir_notify_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_MMU #ifdef CONFIG_FILE_LOCKING { -- cgit v1.2.3 From 7b9ad122b52c9839e1f68f16c907990a6ad6f793 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:11:59 -0800 Subject: inotify: simplify subdirectory registration with register_sysctl() There is no need to user boiler plate code to specify a set of base directories we're going to stuff sysctls under. Simplify this by using register_sysctl() and specifying the directory path directly. Move inotify_user sysctl to inotify_user.c while at it to remove clutter from kernel/sysctl.c. [mcgrof@kernel.org: remember to register fanotify_table] Link: https://lkml.kernel.org/r/YZ5A6iWLb0h3N3RC@bombadil.infradead.org [mcgrof@kernel.org: update commit log to reflect new path we decided to take] Link: https://lkml.kernel.org/r/20211123202422.819032-7-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 10 +++++++++- fs/notify/inotify/inotify_user.c | 11 ++++++++++- include/linux/fanotify.h | 2 -- include/linux/inotify.h | 3 --- kernel/sysctl.c | 21 --------------------- 5 files changed, 19 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 73a3e939c921..73b1615f9d96 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -59,7 +59,7 @@ static int fanotify_max_queued_events __read_mostly; static long ft_zero = 0; static long ft_int_max = INT_MAX; -struct ctl_table fanotify_table[] = { +static struct ctl_table fanotify_table[] = { { .procname = "max_user_groups", .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], @@ -88,6 +88,13 @@ struct ctl_table fanotify_table[] = { }, { } }; + +static void __init fanotify_sysctls_init(void) +{ + register_sysctl("fs/fanotify", fanotify_table); +} +#else +#define fanotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ /* @@ -1743,6 +1750,7 @@ static int __init fanotify_user_setup(void) init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = FANOTIFY_DEFAULT_MAX_GROUPS; init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks; + fanotify_sysctls_init(); return 0; } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 29fca3284bb5..54583f62dc44 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -58,7 +58,7 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly; static long it_zero = 0; static long it_int_max = INT_MAX; -struct ctl_table inotify_table[] = { +static struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], @@ -87,6 +87,14 @@ struct ctl_table inotify_table[] = { }, { } }; + +static void __init inotify_sysctls_init(void) +{ + register_sysctl("fs/inotify", inotify_table); +} + +#else +#define inotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) @@ -849,6 +857,7 @@ static int __init inotify_user_setup(void) inotify_max_queued_events = 16384; init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max; + inotify_sysctls_init(); return 0; } diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index 3afdf339d53c..419cadcd7ff5 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -5,8 +5,6 @@ #include #include -extern struct ctl_table fanotify_table[]; /* for sysctl */ - #define FAN_GROUP_FLAG(group, flag) \ ((group)->fanotify_data.flags & (flag)) diff --git a/include/linux/inotify.h b/include/linux/inotify.h index 6a24905f6e1e..8d20caa1b268 100644 --- a/include/linux/inotify.h +++ b/include/linux/inotify.h @@ -7,11 +7,8 @@ #ifndef _LINUX_INOTIFY_H #define _LINUX_INOTIFY_H -#include #include -extern struct ctl_table inotify_table[]; /* for sysctl */ - #define ALL_INOTIFY_BITS (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | \ IN_MOVED_TO | IN_CREATE | IN_DELETE | \ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fb37825cd3cd..2ee0e76888d8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,13 +126,6 @@ static const int maxolduid = 65535; static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; -#ifdef CONFIG_INOTIFY_USER -#include -#endif -#ifdef CONFIG_FANOTIFY -#include -#endif - #ifdef CONFIG_PROC_SYSCTL /** @@ -3100,20 +3093,6 @@ static struct ctl_table fs_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_INOTIFY_USER - { - .procname = "inotify", - .mode = 0555, - .child = inotify_table, - }, -#endif -#ifdef CONFIG_FANOTIFY - { - .procname = "fanotify", - .mode = 0555, - .child = fanotify_table, - }, -#endif #ifdef CONFIG_EPOLL { .procname = "epoll", -- cgit v1.2.3 From a8f5de894f76f1c73f4a068d04897a5e2f873825 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:09 -0800 Subject: eventpoll: simplify sysctl declaration with register_sysctl() The kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the epoll_table sysctl to fs/eventpoll.c and use register_sysctl(). Link: https://lkml.kernel.org/r/20211123202422.819032-9-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: "Eric W. Biederman" Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Tetsuo Handa Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Jani Nikula Cc: John Ogness Cc: Martin K. Petersen Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 10 +++++++++- include/linux/poll.h | 2 -- include/linux/sysctl.h | 1 - kernel/sysctl.c | 7 ------- 4 files changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 06f4c5ae1451..e2daa940ebce 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -307,7 +307,7 @@ static void unlist_file(struct epitems_head *head) static long long_zero; static long long_max = LONG_MAX; -struct ctl_table epoll_table[] = { +static struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, @@ -319,6 +319,13 @@ struct ctl_table epoll_table[] = { }, { } }; + +static void __init epoll_sysctls_init(void) +{ + register_sysctl("fs/epoll", epoll_table); +} +#else +#define epoll_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static const struct file_operations eventpoll_fops; @@ -2378,6 +2385,7 @@ static int __init eventpoll_init(void) /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); + epoll_sysctls_init(); ephead_cache = kmem_cache_create("ep_head", sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); diff --git a/include/linux/poll.h b/include/linux/poll.h index 1cdc32b1f1b0..a9e0e1c2d1f2 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -8,12 +8,10 @@ #include #include #include -#include #include #include #include -extern struct ctl_table epoll_table[]; /* for sysctl */ /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating additional memory. */ #ifdef __clang__ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 47cf70c8eb93..6dd0f277f844 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -219,7 +219,6 @@ extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; extern struct ctl_table random_table[]; extern struct ctl_table firmware_config_table[]; -extern struct ctl_table epoll_table[]; #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2ee0e76888d8..09fa72299f18 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3093,13 +3093,6 @@ static struct ctl_table fs_table[] = { .proc_handler = proc_dointvec, }, #endif -#ifdef CONFIG_EPOLL - { - .procname = "epoll", - .mode = 0555, - .child = epoll_table, - }, -#endif #endif { .procname = "protected_symlinks", -- cgit v1.2.3 From 6aad36d421d8bfe156508fa4edfe67827234cf0f Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:13 -0800 Subject: firmware_loader: move firmware sysctl to its own files Patch series "sysctl: 3rd set of kernel/sysctl cleanups", v2. This is the third set of patches to help address cleaning the kitchen seink in kernel/sysctl.c and to move sysctls away to where they are actually implemented / used. This patch (of 8): kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the firmware configuration sysctl table to the only place where it is used, and make it clear that if sysctls are disabled this is not used. [akpm@linux-foundation.org: export register_firmware_config_sysctl and unregister_firmware_config_sysctl to modules] [akpm@linux-foundation.org: use EXPORT_SYMBOL_NS_GPL instead] [sfr@canb.auug.org.au: fix that so it compiles] Link: https://lkml.kernel.org/r/20211201160626.401d828d@canb.auug.org.au [mcgrof@kernel.org: major commit log update to justify the move] Link: https://lkml.kernel.org/r/20211124231435.1445213-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211124231435.1445213-2-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Signed-off-by: Stephen Rothwell Cc: Kees Cook Cc: Iurii Zaikin Cc: Eric Biederman Cc: Stephen Kitt Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: "Theodore Ts'o" Cc: Al Viro Cc: Petr Mladek Cc: Sergey Senozhatsky Cc: Steven Rostedt (VMware) Cc: John Ogness Cc: Douglas Gilbert Cc: James E.J. Bottomley Cc: Martin K. Petersen Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Mark Fasheh Cc: Paul Turner Cc: Peter Zijlstra Cc: Phillip Potter Cc: Qing Wang Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Suren Baghdasaryan Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/firmware_loader/fallback.c | 7 ++++++- drivers/base/firmware_loader/fallback.h | 11 +++++++++++ drivers/base/firmware_loader/fallback_table.c | 25 +++++++++++++++++++++++-- include/linux/sysctl.h | 1 - kernel/sysctl.c | 7 ------- 5 files changed, 40 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/drivers/base/firmware_loader/fallback.c b/drivers/base/firmware_loader/fallback.c index d7d63c1aa993..4afb0e9312c0 100644 --- a/drivers/base/firmware_loader/fallback.c +++ b/drivers/base/firmware_loader/fallback.c @@ -199,11 +199,16 @@ static struct class firmware_class = { int register_sysfs_loader(void) { - return class_register(&firmware_class); + int ret = class_register(&firmware_class); + + if (ret != 0) + return ret; + return register_firmware_config_sysctl(); } void unregister_sysfs_loader(void) { + unregister_firmware_config_sysctl(); class_unregister(&firmware_class); } diff --git a/drivers/base/firmware_loader/fallback.h b/drivers/base/firmware_loader/fallback.h index 3af7205b302f..9f3055d3b4ca 100644 --- a/drivers/base/firmware_loader/fallback.h +++ b/drivers/base/firmware_loader/fallback.h @@ -42,6 +42,17 @@ void fw_fallback_set_default_timeout(void); int register_sysfs_loader(void); void unregister_sysfs_loader(void); +#ifdef CONFIG_SYSCTL +extern int register_firmware_config_sysctl(void); +extern void unregister_firmware_config_sysctl(void); +#else +static inline int register_firmware_config_sysctl(void) +{ + return 0; +} +static inline void unregister_firmware_config_sysctl(void) { } +#endif /* CONFIG_SYSCTL */ + #else /* CONFIG_FW_LOADER_USER_HELPER */ static inline int firmware_fallback_sysfs(struct firmware *fw, const char *name, struct device *device, diff --git a/drivers/base/firmware_loader/fallback_table.c b/drivers/base/firmware_loader/fallback_table.c index 46a731dede6f..e5ac098d0742 100644 --- a/drivers/base/firmware_loader/fallback_table.c +++ b/drivers/base/firmware_loader/fallback_table.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +25,7 @@ struct firmware_fallback_config fw_fallback_config = { EXPORT_SYMBOL_NS_GPL(fw_fallback_config, FIRMWARE_LOADER_PRIVATE); #ifdef CONFIG_SYSCTL -struct ctl_table firmware_config_table[] = { +static struct ctl_table firmware_config_table[] = { { .procname = "force_sysfs_fallback", .data = &fw_fallback_config.force_sysfs_fallback, @@ -45,4 +46,24 @@ struct ctl_table firmware_config_table[] = { }, { } }; -#endif + +static struct ctl_table_header *firmware_config_sysct_table_header; +int register_firmware_config_sysctl(void) +{ + firmware_config_sysct_table_header = + register_sysctl("kernel/firmware_config", + firmware_config_table); + if (!firmware_config_sysct_table_header) + return -ENOMEM; + return 0; +} +EXPORT_SYMBOL_NS_GPL(register_firmware_config_sysctl, FIRMWARE_LOADER_PRIVATE); + +void unregister_firmware_config_sysctl(void) +{ + unregister_sysctl_table(firmware_config_sysct_table_header); + firmware_config_sysct_table_header = NULL; +} +EXPORT_SYMBOL_NS_GPL(unregister_firmware_config_sysctl, FIRMWARE_LOADER_PRIVATE); + +#endif /* CONFIG_SYSCTL */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 6dd0f277f844..3985e9c80155 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -218,7 +218,6 @@ extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; extern struct ctl_table random_table[]; -extern struct ctl_table firmware_config_table[]; #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 09fa72299f18..5fc3ae16e995 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2154,13 +2154,6 @@ static struct ctl_table kern_table[] = { .mode = 0555, .child = usermodehelper_table, }, -#ifdef CONFIG_FW_LOADER_USER_HELPER - { - .procname = "firmware_config", - .mode = 0555, - .child = firmware_config_table, - }, -#endif { .procname = "overflowuid", .data = &overflowuid, -- cgit v1.2.3 From 5475e8f03c80bbce7b43a57d861f5acc44a60b22 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:18 -0800 Subject: random: move the random sysctl declarations to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the random sysctls to their own file and use register_sysctl_init(). [mcgrof@kernel.org: commit log update to justify the move] Link: https://lkml.kernel.org/r/20211124231435.1445213-3-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/random.c | 14 ++++++++++++-- include/linux/sysctl.h | 1 - kernel/sysctl.c | 5 ----- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/drivers/char/random.c b/drivers/char/random.c index b411182df6f6..68613f0b6887 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1992,8 +1992,7 @@ static int proc_do_entropy(struct ctl_table *table, int write, void *buffer, } static int sysctl_poolsize = POOL_BITS; -extern struct ctl_table random_table[]; -struct ctl_table random_table[] = { +static struct ctl_table random_table[] = { { .procname = "poolsize", .data = &sysctl_poolsize, @@ -2055,6 +2054,17 @@ struct ctl_table random_table[] = { #endif { } }; + +/* + * rand_initialize() is called before sysctl_init(), + * so we cannot call register_sysctl_init() in rand_initialize() + */ +static int __init random_sysctls_init(void) +{ + register_sysctl_init("kernel/random", random_table); + return 0; +} +device_initcall(random_sysctls_init); #endif /* CONFIG_SYSCTL */ struct batched_entropy { diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 3985e9c80155..fce05a060bc5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -217,7 +217,6 @@ extern int unaligned_dump_stack; extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; -extern struct ctl_table random_table[]; #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5fc3ae16e995..67a5b04f2f84 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2144,11 +2144,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_max_threads, }, - { - .procname = "random", - .mode = 0555, - .child = random_table, - }, { .procname = "usermodehelper", .mode = 0555, -- cgit v1.2.3 From 3ba442d5331ff4d4339faf4986d0841386196f92 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:28 -0800 Subject: fs: move binfmt_misc sysctl to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. This moves the binfmt_misc sysctl to its own file to help remove clutter from kernel/sysctl.c. Link: https://lkml.kernel.org/r/20211124231435.1445213-5-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 6 +++++- kernel/sysctl.c | 7 ------- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index e1eae7ea823a..ddea6acbddde 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -822,7 +822,11 @@ static int __init init_misc_binfmt(void) int err = register_filesystem(&bm_fs_type); if (!err) insert_binfmt(&misc_format); - return err; + if (!register_sysctl_mount_point("fs/binfmt_misc")) { + pr_warn("Failed to create fs/binfmt_misc sysctl mount point"); + return -ENOMEM; + } + return 0; } static void __exit exit_misc_binfmt(void) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 67a5b04f2f84..fb3cc7a1d4ad 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3127,13 +3127,6 @@ static struct ctl_table fs_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, -#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) - { - .procname = "binfmt_misc", - .mode = 0555, - .child = sysctl_mount_point, - }, -#endif { .procname = "pipe-max-size", .data = &pipe_max_size, -- cgit v1.2.3 From faaa357a55e03490fb280ac211be2298e635b220 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:33 -0800 Subject: printk: move printk sysctl to printk/sysctl.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move printk sysctl from kernel/sysctl.c to kernel/printk/sysctl.c. Use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: fixed compile issues when PRINTK is not set, commit log update] Link: https://lkml.kernel.org/r/20211124231435.1445213-6-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/Makefile | 5 ++- kernel/printk/internal.h | 6 ++++ kernel/printk/printk.c | 1 + kernel/printk/sysctl.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 68 -------------------------------------- 5 files changed, 96 insertions(+), 69 deletions(-) create mode 100644 kernel/printk/sysctl.c (limited to 'kernel') diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index d118739874c0..f5b388e810b9 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -2,5 +2,8 @@ obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o -obj-$(CONFIG_PRINTK) += printk_ringbuffer.o obj-$(CONFIG_PRINTK_INDEX) += index.o + +obj-$(CONFIG_PRINTK) += printk_support.o +printk_support-y := printk_ringbuffer.o +printk_support-$(CONFIG_SYSCTL) += sysctl.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 9f3ed2fdb721..6b1c4b399845 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -4,6 +4,12 @@ */ #include +#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) +void __init printk_sysctl_init(void); +#else +#define printk_sysctl_init() do { } while (0) +#endif + #ifdef CONFIG_PRINTK /* Flags for a single printk record. */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 155229f0cf0f..363a493bded8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3211,6 +3211,7 @@ static int __init printk_late_init(void) ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online", console_cpu_notify, NULL); WARN_ON(ret < 0); + printk_sysctl_init(); return 0; } late_initcall(printk_late_init); diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c new file mode 100644 index 000000000000..653ae04aab7f --- /dev/null +++ b/kernel/printk/sysctl.c @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * sysctl.c: General linux system control interface + */ + +#include +#include +#include +#include +#include "internal.h" + +static const int ten_thousand = 10000; + +static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} + +static struct ctl_table printk_sysctls[] = { + { + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_ratelimit", + .data = &printk_ratelimit_state.interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "printk_ratelimit_burst", + .data = &printk_ratelimit_state.burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&ten_thousand, + }, + { + .procname = "printk_devkmsg", + .data = devkmsg_log_str, + .maxlen = DEVKMSG_STR_MAX_SIZE, + .mode = 0644, + .proc_handler = devkmsg_sysctl_set_loglvl, + }, + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "kptr_restrict", + .data = &kptr_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + {} +}; + +void __init printk_sysctl_init(void) +{ + register_sysctl_init("kernel", printk_sysctls); +} diff --git a/kernel/sysctl.c b/kernel/sysctl.c index fb3cc7a1d4ad..509f782483db 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -908,17 +908,6 @@ static int proc_taint(struct ctl_table *table, int write, return err; } -#ifdef CONFIG_PRINTK -static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); -} -#endif - /** * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure * @min: pointer to minimum allowable value @@ -2210,63 +2199,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, -#if defined CONFIG_PRINTK - { - .procname = "printk", - .data = &console_loglevel, - .maxlen = 4*sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_ratelimit", - .data = &printk_ratelimit_state.interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "printk_ratelimit_burst", - .data = &printk_ratelimit_state.burst, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "printk_delay", - .data = &printk_delay_msec, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&ten_thousand, - }, - { - .procname = "printk_devkmsg", - .data = devkmsg_log_str, - .maxlen = DEVKMSG_STR_MAX_SIZE, - .mode = 0644, - .proc_handler = devkmsg_sysctl_set_loglvl, - }, - { - .procname = "dmesg_restrict", - .data = &dmesg_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "kptr_restrict", - .data = &kptr_restrict, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_sysadmin, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, -#endif { .procname = "ngroups_max", .data = (void *)&ngroups_max, -- cgit v1.2.3 From 26d1c80fd61e59d5e2eb6fda00e0148a6704ddeb Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:38 -0800 Subject: scsi/sg: move sg-big-buff sysctl to scsi/sg.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the sg-big-buff sysctl from kernel/sysctl.c to drivers/scsi/sg.c and use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: commit log update] Link: https://lkml.kernel.org/r/20211124231435.1445213-7-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/scsi/sg.c | 35 ++++++++++++++++++++++++++++++++++- include/scsi/sg.h | 4 ---- kernel/sysctl.c | 12 ------------ 3 files changed, 34 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ad12b3261845..6b43e97bd417 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -77,7 +77,7 @@ static int sg_proc_init(void); #define SG_DEFAULT_TIMEOUT mult_frac(SG_DEFAULT_TIMEOUT_USER, HZ, USER_HZ) -int sg_big_buff = SG_DEF_RESERVED_SIZE; +static int sg_big_buff = SG_DEF_RESERVED_SIZE; /* N.B. This variable is readable and writeable via /proc/scsi/sg/def_reserved_size . Each time sg_open() is called a buffer of this size (or less if there is not enough memory) will be reserved @@ -1634,6 +1634,37 @@ MODULE_PARM_DESC(scatter_elem_sz, "scatter gather element " MODULE_PARM_DESC(def_reserved_size, "size of buffer reserved for each fd"); MODULE_PARM_DESC(allow_dio, "allow direct I/O (default: 0 (disallow))"); +#ifdef CONFIG_SYSCTL +#include + +static struct ctl_table sg_sysctls[] = { + { + .procname = "sg-big-buff", + .data = &sg_big_buff, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + {} +}; + +static struct ctl_table_header *hdr; +static void register_sg_sysctls(void) +{ + if (!hdr) + hdr = register_sysctl("kernel", sg_sysctls); +} + +static void unregister_sg_sysctls(void) +{ + if (hdr) + unregister_sysctl_table(hdr); +} +#else +#define register_sg_sysctls() do { } while (0) +#define unregister_sg_sysctls() do { } while (0) +#endif /* CONFIG_SYSCTL */ + static int __init init_sg(void) { @@ -1666,6 +1697,7 @@ init_sg(void) return 0; } class_destroy(sg_sysfs_class); + register_sg_sysctls(); err_out: unregister_chrdev_region(MKDEV(SCSI_GENERIC_MAJOR, 0), SG_MAX_DEVS); return rc; @@ -1674,6 +1706,7 @@ err_out: static void __exit exit_sg(void) { + unregister_sg_sysctls(); #ifdef CONFIG_SCSI_PROC_FS remove_proc_subtree("scsi/sg", NULL); #endif /* CONFIG_SCSI_PROC_FS */ diff --git a/include/scsi/sg.h b/include/scsi/sg.h index 843cefb8efce..068e35d36557 100644 --- a/include/scsi/sg.h +++ b/include/scsi/sg.h @@ -29,10 +29,6 @@ * For utility and test programs see: http://sg.danny.cz/sg/sg3_utils.html */ -#ifdef __KERNEL__ -extern int sg_big_buff; /* for sysctl */ -#endif - typedef struct sg_iovec /* same structure as used by readv() Linux system */ { /* call. It defines one scatter-gather element. */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 509f782483db..b0cced3808d4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,9 +95,6 @@ #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) #include #endif -#ifdef CONFIG_CHR_DEV_SG -#include -#endif #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include #endif @@ -2090,15 +2087,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dostring, }, #endif -#ifdef CONFIG_CHR_DEV_SG - { - .procname = "sg-big-buff", - .data = &sg_big_buff, - .maxlen = sizeof (int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_BSD_PROCESS_ACCT { .procname = "acct", -- cgit v1.2.3 From 0df8bdd5e3b3e557ce2c2575fce0c64c5dd1045a Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:12:43 -0800 Subject: stackleak: move stack_erasing sysctl to stackleak.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the stack_erasing sysctl from kernel/sysctl.c to kernel/stackleak.c and use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: commit log update] Link: https://lkml.kernel.org/r/20211124231435.1445213-8-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stackleak.h | 5 ----- kernel/stackleak.c | 26 ++++++++++++++++++++++++-- kernel/sysctl.c | 14 -------------- 3 files changed, 24 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/stackleak.h b/include/linux/stackleak.h index a59db2f08e76..ccaab2043fcd 100644 --- a/include/linux/stackleak.h +++ b/include/linux/stackleak.h @@ -23,11 +23,6 @@ static inline void stackleak_task_init(struct task_struct *t) # endif } -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE -int stack_erasing_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -#endif - #else /* !CONFIG_GCC_PLUGIN_STACKLEAK */ static inline void stackleak_task_init(struct task_struct *t) { } #endif diff --git a/kernel/stackleak.c b/kernel/stackleak.c index ce161a8e8d97..66b8af394e58 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -16,11 +16,13 @@ #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE #include #include +#include static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass); -int stack_erasing_sysctl(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +#ifdef CONFIG_SYSCTL +static int stack_erasing_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; int state = !static_branch_unlikely(&stack_erasing_bypass); @@ -42,6 +44,26 @@ int stack_erasing_sysctl(struct ctl_table *table, int write, state ? "enabled" : "disabled"); return ret; } +static struct ctl_table stackleak_sysctls[] = { + { + .procname = "stack_erasing", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = stack_erasing_sysctl, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static int __init stackleak_sysctls_init(void) +{ + register_sysctl_init("kernel", stackleak_sysctls); + return 0; +} +late_initcall(stackleak_sysctls_init); +#endif /* CONFIG_SYSCTL */ #define skip_erasing() static_branch_unlikely(&stack_erasing_bypass) #else diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b0cced3808d4..3e06dafdd2c3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,9 +95,6 @@ #if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) #include #endif -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE -#include -#endif #if defined(CONFIG_SYSCTL) @@ -2442,17 +2439,6 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, -#endif -#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE - { - .procname = "stack_erasing", - .data = NULL, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = stack_erasing_sysctl, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #endif { } }; -- cgit v1.2.3 From b1f2aff888af54a057c2c3c0d88a13ef5d37b52a Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:48 -0800 Subject: sysctl: share unsigned long const values Provide a way to share unsigned long values. This will allow others to not have to re-invent these values. Link: https://lkml.kernel.org/r/20211124231435.1445213-9-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Amir Goldstein Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Benjamin LaHaise Cc: Clemens Ladisch Cc: David Airlie Cc: Douglas Gilbert Cc: Eric Biederman Cc: Greg Kroah-Hartman Cc: Iurii Zaikin Cc: James E.J. Bottomley Cc: Jani Nikula Cc: Jani Nikula Cc: Jan Kara Cc: Joel Becker Cc: John Ogness Cc: Joonas Lahtinen Cc: Joseph Qi Cc: Julia Lawall Cc: Kees Cook Cc: Lukas Middendorf Cc: Mark Fasheh Cc: Martin K. Petersen Cc: Paul Turner Cc: Peter Zijlstra Cc: Petr Mladek Cc: Phillip Potter Cc: Qing Wang Cc: "Rafael J. Wysocki" Cc: Rodrigo Vivi Cc: Sebastian Reichel Cc: Sergey Senozhatsky Cc: Stephen Kitt Cc: Steven Rostedt (VMware) Cc: Suren Baghdasaryan Cc: Tetsuo Handa Cc: "Theodore Ts'o" Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 3 +++ include/linux/sysctl.h | 6 ++++++ kernel/sysctl.c | 9 +++------ 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 6bb2d9a3513d..2eb5987091ea 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -29,6 +29,9 @@ static const struct inode_operations proc_sys_dir_operations; const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX }; EXPORT_SYMBOL(sysctl_vals); +const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; +EXPORT_SYMBOL_GPL(sysctl_long_vals); + /* Support for permanently empty directories */ struct ctl_table sysctl_mount_point[] = { diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 746c098a6ff5..2de6d20d191b 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -51,6 +51,12 @@ struct ctl_dir; extern const int sysctl_vals[]; +#define SYSCTL_LONG_ZERO ((void *)&sysctl_long_vals[0]) +#define SYSCTL_LONG_ONE ((void *)&sysctl_long_vals[1]) +#define SYSCTL_LONG_MAX ((void *)&sysctl_long_vals[2]) + +extern const unsigned long sysctl_long_vals[]; + typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e06dafdd2c3..1b8a1cb8aac9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -100,9 +100,6 @@ /* Constants used for minimum and maximum */ -static const unsigned long zero_ul; -static const unsigned long one_ul = 1; -static const unsigned long long_max = LONG_MAX; #ifdef CONFIG_PRINTK static const int ten_thousand = 10000; #endif @@ -2513,7 +2510,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(dirty_background_bytes), .mode = 0644, .proc_handler = dirty_background_bytes_handler, - .extra1 = (void *)&one_ul, + .extra1 = SYSCTL_LONG_ONE, }, { .procname = "dirty_ratio", @@ -2931,8 +2928,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = (void *)&zero_ul, - .extra2 = (void *)&long_max, + .extra1 = SYSCTL_LONG_ZERO, + .extra2 = SYSCTL_LONG_MAX, }, { .procname = "nr_open", -- cgit v1.2.3 From 1d67fe585049d3e2448b997af78c68cbf90ada09 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:52 -0800 Subject: fs: move inode sysctls to its own file Patch series "sysctl: 4th set of kernel/sysctl cleanups". This is slimming down the fs uses of kernel/sysctl.c to the point that the next step is to just get rid of the fs base directory for it and move that elsehwere, so that next patch series starts dealing with that to demo how we can end up cleaning up a full base directory from kernel/sysctl.c, one at a time. This patch (of 9): kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the inode sysctls to its own file. Since we are no longer using this outside of fs/ remove the extern declaration of its respective proc helper. We use early_initcall() as it is the earliest we can use. [arnd@arndb.de: avoid unused-variable warning] Link: https://lkml.kernel.org/r/20211203190123.874239-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20211129205548.605569-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211129205548.605569-2-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Kees Cook Cc: Iurii Zaikin Cc: Xiaoming Ni Cc: Eric Biederman Cc: Stephen Kitt Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Andy Shevchenko Cc: Jeff Layton Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 39 ++++++++++++++++++++++++++++++++------- include/linux/fs.h | 3 --- kernel/sysctl.c | 14 -------------- 3 files changed, 32 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/fs/inode.c b/fs/inode.c index 980e7b7a5460..63324df6fa27 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -67,11 +67,6 @@ const struct address_space_operations empty_aops = { }; EXPORT_SYMBOL(empty_aops); -/* - * Statistics gathering.. - */ -struct inodes_stat_t inodes_stat; - static DEFINE_PER_CPU(unsigned long, nr_inodes); static DEFINE_PER_CPU(unsigned long, nr_unused); @@ -106,13 +101,43 @@ long get_nr_dirty_inodes(void) * Handle nr_inode sysctl */ #ifdef CONFIG_SYSCTL -int proc_nr_inodes(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +/* + * Statistics gathering.. + */ +static struct inodes_stat_t inodes_stat; + +static int proc_nr_inodes(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } + +static struct ctl_table inodes_sysctls[] = { + { + .procname = "inode-nr", + .data = &inodes_stat, + .maxlen = 2*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { + .procname = "inode-state", + .data = &inodes_stat, + .maxlen = 7*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { } +}; + +static int __init init_fs_inode_sysctls(void) +{ + register_sysctl_init("fs", inodes_sysctls); + return 0; +} +early_initcall(init_fs_inode_sysctls); #endif static int no_open(struct inode *inode, struct file *file) diff --git a/include/linux/fs.h b/include/linux/fs.h index c8510da6cc6d..044de67c8167 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -82,7 +82,6 @@ extern void __init files_maxfiles_init(void); extern struct files_stat_struct files_stat; extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; -extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; extern int sysctl_protected_symlinks; extern int sysctl_protected_hardlinks; @@ -3537,8 +3536,6 @@ int proc_nr_files(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int proc_nr_inodes(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1b8a1cb8aac9..402c9d3246bf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2901,20 +2901,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "inode-nr", - .data = &inodes_stat, - .maxlen = 2*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, - { - .procname = "inode-state", - .data = &inodes_stat, - .maxlen = 7*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_inodes, - }, { .procname = "file-nr", .data = &files_stat, -- cgit v1.2.3 From 204d5a24e15562b2816825c0f9b49d26814b77be Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:56 -0800 Subject: fs: move fs stat sysctls to file_table.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. We can create the sysctl dynamically on early init for fs stat to help with this clutter. This dusts off the fs stat syctls knobs and puts them into where they are declared. Link: https://lkml.kernel.org/r/20211129205548.605569-3-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file_table.c | 47 +++++++++++++++++++++++++++++++++++++++-------- include/linux/fs.h | 3 --- kernel/sysctl.c | 25 ------------------------- 3 files changed, 39 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/fs/file_table.c b/fs/file_table.c index 45437f8e1003..57edef16dce4 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -33,7 +33,7 @@ #include "internal.h" /* sysctl tunables... */ -struct files_stat_struct files_stat = { +static struct files_stat_struct files_stat = { .max_files = NR_FILE }; @@ -75,22 +75,53 @@ unsigned long get_max_files(void) } EXPORT_SYMBOL_GPL(get_max_files); +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) + /* * Handle nr_files sysctl */ -#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -int proc_nr_files(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_nr_files(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } -#else -int proc_nr_files(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) + +static struct ctl_table fs_stat_sysctls[] = { + { + .procname = "file-nr", + .data = &files_stat, + .maxlen = sizeof(files_stat), + .mode = 0444, + .proc_handler = proc_nr_files, + }, + { + .procname = "file-max", + .data = &files_stat.max_files, + .maxlen = sizeof(files_stat.max_files), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = SYSCTL_LONG_ZERO, + .extra2 = SYSCTL_LONG_MAX, + }, + { + .procname = "nr_open", + .data = &sysctl_nr_open, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &sysctl_nr_open_min, + .extra2 = &sysctl_nr_open_max, + }, + { } +}; + +static int __init init_fs_stat_sysctls(void) { - return -ENOSYS; + register_sysctl_init("fs", fs_stat_sysctls); + return 0; } +fs_initcall(init_fs_stat_sysctls); #endif static struct file *__alloc_file(int flags, const struct cred *cred) diff --git a/include/linux/fs.h b/include/linux/fs.h index 044de67c8167..1e6761966120 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -79,7 +79,6 @@ extern void __init inode_init_early(void); extern void __init files_init(void); extern void __init files_maxfiles_init(void); -extern struct files_stat_struct files_stat; extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; extern int leases_enable, lease_break_time; @@ -3532,8 +3531,6 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; -int proc_nr_files(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int __init list_bdev_fs_names(char *buf, size_t size); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 402c9d3246bf..d2c411b15854 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2901,31 +2901,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "file-nr", - .data = &files_stat, - .maxlen = sizeof(files_stat), - .mode = 0444, - .proc_handler = proc_nr_files, - }, - { - .procname = "file-max", - .data = &files_stat.max_files, - .maxlen = sizeof(files_stat.max_files), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = SYSCTL_LONG_ZERO, - .extra2 = SYSCTL_LONG_MAX, - }, - { - .procname = "nr_open", - .data = &sysctl_nr_open, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &sysctl_nr_open_min, - .extra2 = &sysctl_nr_open_max, - }, { .procname = "dentry-state", .data = &dentry_stat, -- cgit v1.2.3 From c8c0c239d5ab1e3e8d2bb0453ce642fe2c6357ec Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:12:59 -0800 Subject: fs: move dcache sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the dcache sysctl clutter out of kernel/sysctl.c. This is a small one-off entry, perhaps later we can simplify this representation, but for now we use the helpers we have. We won't know how we can simplify this further untl we're fully done with the cleanup. [arnd@arndb.de: avoid unused-function warning] Link: https://lkml.kernel.org/r/20211203190123.874239-2-arnd@kernel.org Link: https://lkml.kernel.org/r/20211129205548.605569-4-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 37 +++++++++++++++++++++++++++++++------ include/linux/dcache.h | 10 ---------- include/linux/fs.h | 2 -- kernel/sysctl.c | 7 ------- 4 files changed, 31 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/fs/dcache.c b/fs/dcache.c index cf871a81f4fd..c84269c6e8bf 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -115,10 +115,13 @@ static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); } - -/* Statistics gathering. */ -struct dentry_stat_t dentry_stat = { - .age_limit = 45, +struct dentry_stat_t { + long nr_dentry; + long nr_unused; + long age_limit; /* age in seconds */ + long want_pages; /* pages requested by system */ + long nr_negative; /* # of unused negative dentries */ + long dummy; /* Reserved for future use */ }; static DEFINE_PER_CPU(long, nr_dentry); @@ -126,6 +129,10 @@ static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +/* Statistics gathering. */ +static struct dentry_stat_t dentry_stat = { + .age_limit = 45, +}; /* * Here we resort to our own counters instead of using generic per-cpu counters @@ -167,14 +174,32 @@ static long get_nr_dentry_negative(void) return sum < 0 ? 0 : sum; } -int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, - size_t *lenp, loff_t *ppos) +static int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } + +static struct ctl_table fs_dcache_sysctls[] = { + { + .procname = "dentry-state", + .data = &dentry_stat, + .maxlen = 6*sizeof(long), + .mode = 0444, + .proc_handler = proc_nr_dentry, + }, + { } +}; + +static int __init init_fs_dcache_sysctls(void) +{ + register_sysctl_init("fs", fs_dcache_sysctls); + return 0; +} +fs_initcall(init_fs_dcache_sysctls); #endif /* diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 9e23d33bb6f1..f5bba51480b2 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -61,16 +61,6 @@ extern const struct qstr empty_name; extern const struct qstr slash_name; extern const struct qstr dotdot_name; -struct dentry_stat_t { - long nr_dentry; - long nr_unused; - long age_limit; /* age in seconds */ - long want_pages; /* pages requested by system */ - long nr_negative; /* # of unused negative dentries */ - long dummy; /* Reserved for future use */ -}; -extern struct dentry_stat_t dentry_stat; - /* * Try to keep struct dentry aligned on 64 byte cachelines (this will * give reasonable cacheline footprint with larger lines without the diff --git a/include/linux/fs.h b/include/linux/fs.h index 1e6761966120..9b856d5da9e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3531,8 +3531,6 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; -int proc_nr_dentry(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d2c411b15854..4d182fb5c22e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2901,13 +2901,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "dentry-state", - .data = &dentry_stat, - .maxlen = 6*sizeof(long), - .mode = 0444, - .proc_handler = proc_nr_dentry, - }, { .procname = "overflowuid", .data = &fs_overflowuid, -- cgit v1.2.3 From 54771613e8a7dbbba2a205ddf1b33e25a290b3fd Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:03 -0800 Subject: sysctl: move maxolduid as a sysctl specific const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The maxolduid value is only shared for sysctl purposes for use on a max range. Just stuff this into our shared const array. [akpm@linux-foundation.org: fix sysctl_vals[], per Mickaël] Link: https://lkml.kernel.org/r/20211129205548.605569-5-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Signed-off-by: Mickaël Salaün Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 2 +- include/linux/sysctl.h | 3 +++ kernel/sysctl.c | 12 ++++-------- 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 2eb5987091ea..b9d53b53d769 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -26,7 +26,7 @@ static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; /* shared constants to be used in various sysctls */ -const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX }; +const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX, 65535 }; EXPORT_SYMBOL(sysctl_vals); const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 2de6d20d191b..bb921eb8a02d 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -49,6 +49,9 @@ struct ctl_dir; #define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[8]) #define SYSCTL_INT_MAX ((void *)&sysctl_vals[9]) +/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +#define SYSCTL_MAXOLDUID ((void *)&sysctl_vals[10]) + extern const int sysctl_vals[]; #define SYSCTL_LONG_ZERO ((void *)&sysctl_long_vals[0]) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4d182fb5c22e..5bbb4c59dd1a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -110,10 +110,6 @@ static const int six_hundred_forty_kb = 640 * 1024; /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE; -/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ -static const int maxolduid = 65535; -/* minolduid is SYSCTL_ZERO */ - static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -2127,7 +2123,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&maxolduid, + .extra2 = SYSCTL_MAXOLDUID, }, { .procname = "overflowgid", @@ -2136,7 +2132,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&maxolduid, + .extra2 = SYSCTL_MAXOLDUID, }, #ifdef CONFIG_S390 { @@ -2908,7 +2904,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&maxolduid, + .extra2 = SYSCTL_MAXOLDUID, }, { .procname = "overflowgid", @@ -2917,7 +2913,7 @@ static struct ctl_table fs_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&maxolduid, + .extra2 = SYSCTL_MAXOLDUID, }, #ifdef CONFIG_FILE_LOCKING { -- cgit v1.2.3 From d1d8ac9edf10e83f16d13f009439585a1f93ccb2 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:06 -0800 Subject: fs: move shared sysctls to fs/sysctls.c To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move sysctls which are shared between filesystems into a common file outside of kernel/sysctl.c. Link: https://lkml.kernel.org/r/20211129205548.605569-6-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Makefile | 1 + fs/sysctls.c | 38 ++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 18 ------------------ 3 files changed, 39 insertions(+), 18 deletions(-) create mode 100644 fs/sysctls.c (limited to 'kernel') diff --git a/fs/Makefile b/fs/Makefile index 84c5e4cdfee5..ea8770d124da 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -28,6 +28,7 @@ obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-y += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o +obj-$(CONFIG_SYSCTL) += sysctls.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o diff --git a/fs/sysctls.c b/fs/sysctls.c new file mode 100644 index 000000000000..54216cd1ecd7 --- /dev/null +++ b/fs/sysctls.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * /proc/sys/fs shared sysctls + * + * These sysctls are shared between different filesystems. + */ +#include +#include + +static struct ctl_table fs_shared_sysctls[] = { + { + .procname = "overflowuid", + .data = &fs_overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, + { + .procname = "overflowgid", + .data = &fs_overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, + { } +}; + +static int __init init_fs_shared_sysctls(void) +{ + register_sysctl_init("fs", fs_shared_sysctls); + return 0; +} + +early_initcall(init_fs_shared_sysctls); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5bbb4c59dd1a..6d9d2001b790 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2897,24 +2897,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "overflowuid", - .data = &fs_overflowuid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_MAXOLDUID, - }, - { - .procname = "overflowgid", - .data = &fs_overflowgid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_MAXOLDUID, - }, #ifdef CONFIG_FILE_LOCKING { .procname = "leases-enable", -- cgit v1.2.3 From dd81faa88340a1fe8cd81c8ecbadd8e95c58549c Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:10 -0800 Subject: fs: move locking sysctls where they are used kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. The locking fs sysctls are only used on fs/locks.c, so move them there. Link: https://lkml.kernel.org/r/20211129205548.605569-7-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 34 ++++++++++++++++++++++++++++++++-- include/linux/fs.h | 4 ---- kernel/sysctl.c | 20 -------------------- 3 files changed, 32 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/fs/locks.c b/fs/locks.c index 0fca9d680978..8c6df10cd9ed 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -62,6 +62,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -88,8 +89,37 @@ static int target_leasetype(struct file_lock *fl) return fl->fl_type; } -int leases_enable = 1; -int lease_break_time = 45; +static int leases_enable = 1; +static int lease_break_time = 45; + +#ifdef CONFIG_SYSCTL +static struct ctl_table locks_sysctls[] = { + { + .procname = "leases-enable", + .data = &leases_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_MMU + { + .procname = "lease-break-time", + .data = &lease_break_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif /* CONFIG_MMU */ + {} +}; + +static int __init init_fs_locks_sysctls(void) +{ + register_sysctl_init("fs", locks_sysctls); + return 0; +} +early_initcall(init_fs_locks_sysctls); +#endif /* CONFIG_SYSCTL */ /* * The global file_lock_list is only used for displaying /proc/locks, so we diff --git a/include/linux/fs.h b/include/linux/fs.h index 9b856d5da9e2..0e08c3dd8f75 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -82,10 +82,6 @@ extern void __init files_maxfiles_init(void); extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; extern int leases_enable, lease_break_time; -extern int sysctl_protected_symlinks; -extern int sysctl_protected_hardlinks; -extern int sysctl_protected_fifos; -extern int sysctl_protected_regular; typedef __kernel_rwf_t rwf_t; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6d9d2001b790..073948e9d165 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2897,26 +2897,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { -#ifdef CONFIG_FILE_LOCKING - { - .procname = "leases-enable", - .data = &leases_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#ifdef CONFIG_MMU -#ifdef CONFIG_FILE_LOCKING - { - .procname = "lease-break-time", - .data = &lease_break_time, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif -#endif { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, -- cgit v1.2.3 From 9c011be132972ff94bde2ae99064e29f94e85c68 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:13 -0800 Subject: fs: move namei sysctls to its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move namei's own sysctl knobs to its own file. Other than the move we also avoid initializing two static variables to 0 as this is not needed: * sysctl_protected_symlinks * sysctl_protected_hardlinks Link: https://lkml.kernel.org/r/20211129205548.605569-8-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++---- include/linux/fs.h | 1 - kernel/sysctl.c | 36 --------------------------------- 3 files changed, 54 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/fs/namei.c b/fs/namei.c index d81f04f8d818..b867a92c078e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1020,10 +1020,60 @@ static inline void put_link(struct nameidata *nd) path_put(&last->link); } -int sysctl_protected_symlinks __read_mostly = 0; -int sysctl_protected_hardlinks __read_mostly = 0; -int sysctl_protected_fifos __read_mostly; -int sysctl_protected_regular __read_mostly; +static int sysctl_protected_symlinks __read_mostly; +static int sysctl_protected_hardlinks __read_mostly; +static int sysctl_protected_fifos __read_mostly; +static int sysctl_protected_regular __read_mostly; + +#ifdef CONFIG_SYSCTL +static struct ctl_table namei_sysctls[] = { + { + .procname = "protected_symlinks", + .data = &sysctl_protected_symlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_hardlinks", + .data = &sysctl_protected_hardlinks, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "protected_fifos", + .data = &sysctl_protected_fifos, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { + .procname = "protected_regular", + .data = &sysctl_protected_regular, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { } +}; + +static int __init init_fs_namei_sysctls(void) +{ + register_sysctl_init("fs", namei_sysctls); + return 0; +} +fs_initcall(init_fs_namei_sysctls); + +#endif /* CONFIG_SYSCTL */ /** * may_follow_link - Check symlink following for unsafe situations diff --git a/include/linux/fs.h b/include/linux/fs.h index 0e08c3dd8f75..9617dea24978 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -81,7 +81,6 @@ extern void __init files_maxfiles_init(void); extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; -extern int leases_enable, lease_break_time; typedef __kernel_rwf_t rwf_t; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 073948e9d165..53fb4692facc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2897,42 +2897,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "protected_symlinks", - .data = &sysctl_protected_symlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_hardlinks", - .data = &sysctl_protected_hardlinks, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "protected_fifos", - .data = &sysctl_protected_fifos, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, - { - .procname = "protected_regular", - .data = &sysctl_protected_regular, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, { .procname = "suid_dumpable", .data = &suid_dumpable, -- cgit v1.2.3 From 66ad398634c21e0a42ce10002ae06c39352da0d1 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:17 -0800 Subject: fs: move fs/exec.c sysctls into its own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the fs/exec.c respective sysctls to its own file. Since checkpatch complains about style issues with the old code, this move also fixes a few of those minor style issues: * Use pr_warn() instead of prink(WARNING * New empty lines are wanted at the beginning of routines Link: https://lkml.kernel.org/r/20211129205548.605569-9-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 66 ------------------------------------------ 2 files changed, 90 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 3c3c366a9bcf..310750340e4a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -2099,3 +2100,92 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd, argv, envp, flags); } #endif + +#ifdef CONFIG_SYSCTL + +static void validate_coredump_safety(void) +{ +#ifdef CONFIG_COREDUMP + if (suid_dumpable == SUID_DUMP_ROOT && + core_pattern[0] != '/' && core_pattern[0] != '|') { + pr_warn( +"Unsafe core_pattern used with fs.suid_dumpable=2.\n" +"Pipe handler or fully qualified core dump path required.\n" +"Set kernel.core_pattern before fs.suid_dumpable.\n" + ); + } +#endif +} + +static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!error) + validate_coredump_safety(); + return error; +} + +static struct ctl_table fs_exec_sysctls[] = { + { + .procname = "suid_dumpable", + .data = &suid_dumpable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_coredump, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { } +}; + +#ifdef CONFIG_COREDUMP + +static int proc_dostring_coredump(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dostring(table, write, buffer, lenp, ppos); + + if (!error) + validate_coredump_safety(); + return error; +} + +static struct ctl_table kernel_exec_sysctls[] = { + { + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "core_pattern", + .data = core_pattern, + .maxlen = CORENAME_MAX_SIZE, + .mode = 0644, + .proc_handler = proc_dostring_coredump, + }, + { + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; +#endif + +static int __init init_fs_exec_sysctls(void) +{ + register_sysctl_init("fs", fs_exec_sysctls); +#ifdef CONFIG_COREDUMP + register_sysctl_init("kernel", kernel_exec_sysctls); +#endif + return 0; +} + +fs_initcall(init_fs_exec_sysctls); +#endif /* CONFIG_SYSCTL */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 53fb4692facc..8bab82aa0192 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1117,40 +1117,6 @@ static int proc_dopipe_max_size(struct ctl_table *table, int write, do_proc_dopipe_max_size_conv, NULL); } -static void validate_coredump_safety(void) -{ -#ifdef CONFIG_COREDUMP - if (suid_dumpable == SUID_DUMP_ROOT && - core_pattern[0] != '/' && core_pattern[0] != '|') { - printk(KERN_WARNING -"Unsafe core_pattern used with fs.suid_dumpable=2.\n" -"Pipe handler or fully qualified core dump path required.\n" -"Set kernel.core_pattern before fs.suid_dumpable.\n" - ); - } -#endif -} - -static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error) - validate_coredump_safety(); - return error; -} - -#ifdef CONFIG_COREDUMP -static int proc_dostring_coredump(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dostring(table, write, buffer, lenp, ppos); - if (!error) - validate_coredump_safety(); - return error; -} -#endif - #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -1874,29 +1840,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_COREDUMP - { - .procname = "core_uses_pid", - .data = &core_uses_pid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "core_pattern", - .data = core_pattern, - .maxlen = CORENAME_MAX_SIZE, - .mode = 0644, - .proc_handler = proc_dostring_coredump, - }, - { - .procname = "core_pipe_limit", - .data = &core_pipe_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -2897,15 +2840,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "suid_dumpable", - .data = &suid_dumpable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax_coredump, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, - }, { .procname = "pipe-max-size", .data = &pipe_max_size, -- cgit v1.2.3 From 1998f19324d24df7de4e74d81503b4299eb99e7d Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:20 -0800 Subject: fs: move pipe sysctls to is own file kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. So move the pipe sysctls to its own file. Link: https://lkml.kernel.org/r/20211129205548.605569-10-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Andy Shevchenko Cc: Antti Palosaari Cc: Eric Biederman Cc: Iurii Zaikin Cc: "J. Bruce Fields" Cc: Jeff Layton Cc: Kees Cook Cc: Lukas Middendorf Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 64 ++++++++++++++++++++++++++++++++++++++++++++--- include/linux/pipe_fs_i.h | 4 --- include/linux/sysctl.h | 6 +++++ kernel/sysctl.c | 61 +++++--------------------------------------- 4 files changed, 73 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/fs/pipe.c b/fs/pipe.c index 6d4342bad9f1..cc28623a67b6 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -50,13 +51,13 @@ * The max size that a non-root user is allowed to grow the pipe. Can * be set by root in /proc/sys/fs/pipe-max-size */ -unsigned int pipe_max_size = 1048576; +static unsigned int pipe_max_size = 1048576; /* Maximum allocatable pages per user. Hard limit is unset by default, soft * matches default values. */ -unsigned long pipe_user_pages_hard; -unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; +static unsigned long pipe_user_pages_hard; +static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; /* * We use head and tail indices that aren't masked off, except at the point of @@ -1428,6 +1429,60 @@ static struct file_system_type pipe_fs_type = { .kill_sb = kill_anon_super, }; +#ifdef CONFIG_SYSCTL +static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, + unsigned int *valp, + int write, void *data) +{ + if (write) { + unsigned int val; + + val = round_pipe_size(*lvalp); + if (val == 0) + return -EINVAL; + + *valp = val; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long) val; + } + + return 0; +} + +static int proc_dopipe_max_size(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_douintvec(table, write, buffer, lenp, ppos, + do_proc_dopipe_max_size_conv, NULL); +} + +static struct ctl_table fs_pipe_sysctls[] = { + { + .procname = "pipe-max-size", + .data = &pipe_max_size, + .maxlen = sizeof(pipe_max_size), + .mode = 0644, + .proc_handler = proc_dopipe_max_size, + }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { } +}; +#endif + static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); @@ -1439,6 +1494,9 @@ static int __init init_pipe_fs(void) unregister_filesystem(&pipe_fs_type); } } +#ifdef CONFIG_SYSCTL + register_sysctl_init("fs", fs_pipe_sysctls); +#endif return err; } diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index fc5642431b92..c00c618ef290 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -238,10 +238,6 @@ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); -extern unsigned int pipe_max_size; -extern unsigned long pipe_user_pages_hard; -extern unsigned long pipe_user_pages_soft; - /* Wait for a pipe to be readable/writable while dropping the pipe lock */ void pipe_wait_readable(struct pipe_inode_info *); void pipe_wait_writable(struct pipe_inode_info *); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index bb921eb8a02d..4294e9668bd5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -221,6 +221,12 @@ extern void __register_sysctl_init(const char *path, struct ctl_table *table, extern struct ctl_table_header *register_sysctl_mount_point(const char *path); void do_sysctl_args(void); +int do_proc_douintvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data); extern int pwrsw_enabled; extern int unaligned_enabled; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8bab82aa0192..f79ec2cf8721 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include #include @@ -761,12 +760,12 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); } -static int do_proc_douintvec(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data) +int do_proc_douintvec(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, + unsigned int *valp, + int write, void *data), + void *data) { return __do_proc_douintvec(table->data, table, write, buffer, lenp, ppos, conv, data); @@ -1090,33 +1089,6 @@ int proc_dou8vec_minmax(struct ctl_table *table, int write, } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); -static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) -{ - if (write) { - unsigned int val; - - val = round_pipe_size(*lvalp); - if (val == 0) - return -EINVAL; - - *valp = val; - } else { - unsigned int val = *valp; - *lvalp = (unsigned long) val; - } - - return 0; -} - -static int proc_dopipe_max_size(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_dopipe_max_size_conv, NULL); -} - #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -2840,27 +2812,6 @@ static struct ctl_table vm_table[] = { }; static struct ctl_table fs_table[] = { - { - .procname = "pipe-max-size", - .data = &pipe_max_size, - .maxlen = sizeof(pipe_max_size), - .mode = 0644, - .proc_handler = proc_dopipe_max_size, - }, - { - .procname = "pipe-user-pages-hard", - .data = &pipe_user_pages_hard, - .maxlen = sizeof(pipe_user_pages_hard), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, - { - .procname = "pipe-user-pages-soft", - .data = &pipe_user_pages_soft, - .maxlen = sizeof(pipe_user_pages_soft), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - }, { .procname = "mount-max", .data = &sysctl_mount_max, -- cgit v1.2.3 From 51cb8dfc5a5c39e6c70376b9dc9a14d624a9d271 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:24 -0800 Subject: sysctl: add and use base directory declarer and registration helper Patch series "sysctl: add and use base directory declarer and registration helper". In this patch series we start addressing base directories, and so we start with the "fs" sysctls. The end goal is we end up completely moving all "fs" sysctl knobs out from kernel/sysctl. This patch (of 6): Add a set of helpers which can be used to declare and register base directory sysctls on their own. We do this so we can later move each of the base sysctl directories like "fs", "kernel", etc, to their own respective files instead of shoving the declarations and registrations all on kernel/sysctl.c. The lazy approach has caught up and with this, we just end up extending the list of base directories / sysctls on one file and this makes maintenance difficult due to merge conflicts from many developers. The declarations are used first by kernel/sysctl.c for registration its own base which over time we'll try to clean up. It will be used in the next patch to demonstrate how to cleanly deal with base sysctl directories. [mcgrof@kernel.org: null-terminate the ctl_table arrays] Link: https://lkml.kernel.org/r/YafJY3rXDYnjK/gs@bombadil.infradead.org Link: https://lkml.kernel.org/r/20211129211943.640266-1-mcgrof@kernel.org Link: https://lkml.kernel.org/r/20211129211943.640266-2-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Kees Cook Cc: Iurii Zaikin Cc: Xiaoming Ni Cc: Eric Biederman Cc: Stephen Kitt Cc: Lukas Middendorf Cc: Antti Palosaari Cc: Christian Brauner Cc: Eric Biggers Cc: "Naveen N. Rao" Cc: "David S. Miller" Cc: Masami Hiramatsu Cc: Anil S Keshavamurthy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 9 +++++++++ include/linux/sysctl.h | 24 ++++++++++++++++++++++++ kernel/sysctl.c | 41 ++++++++++------------------------------- 3 files changed, 43 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b9d53b53d769..95dfd195e04e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1646,6 +1646,15 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table *table) } EXPORT_SYMBOL(register_sysctl_table); +int __register_sysctl_base(struct ctl_table *base_table) +{ + struct ctl_table_header *hdr; + + hdr = register_sysctl_table(base_table); + kmemleak_not_leak(hdr); + return 0; +} + static void put_links(struct ctl_table_header *header) { struct ctl_table_set *root_set = &sysctl_table_root.default_set; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 4294e9668bd5..02134a8abad7 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -194,6 +194,20 @@ struct ctl_path { #ifdef CONFIG_SYSCTL +#define DECLARE_SYSCTL_BASE(_name, _table) \ +static struct ctl_table _name##_base_table[] = { \ + { \ + .procname = #_name, \ + .mode = 0555, \ + .child = _table, \ + }, \ + { }, \ +} + +extern int __register_sysctl_base(struct ctl_table *base_table); + +#define register_sysctl_base(_name) __register_sysctl_base(_name##_base_table) + void proc_sys_poll_notify(struct ctl_table_poll *poll); extern void setup_sysctl_set(struct ctl_table_set *p, @@ -236,6 +250,16 @@ extern int no_unaligned_warning; extern struct ctl_table sysctl_mount_point[]; #else /* CONFIG_SYSCTL */ + +#define DECLARE_SYSCTL_BASE(_name, _table) + +static inline int __register_sysctl_base(struct ctl_table *base_table) +{ + return 0; +} + +#define register_sysctl_base(table) __register_sysctl_base(table) + static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) { return NULL; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f79ec2cf8721..12456a535059 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2851,41 +2851,20 @@ static struct ctl_table dev_table[] = { { } }; -static struct ctl_table sysctl_base_table[] = { - { - .procname = "kernel", - .mode = 0555, - .child = kern_table, - }, - { - .procname = "vm", - .mode = 0555, - .child = vm_table, - }, - { - .procname = "fs", - .mode = 0555, - .child = fs_table, - }, - { - .procname = "debug", - .mode = 0555, - .child = debug_table, - }, - { - .procname = "dev", - .mode = 0555, - .child = dev_table, - }, - { } -}; +DECLARE_SYSCTL_BASE(kernel, kern_table); +DECLARE_SYSCTL_BASE(vm, vm_table); +DECLARE_SYSCTL_BASE(fs, fs_table); +DECLARE_SYSCTL_BASE(debug, debug_table); +DECLARE_SYSCTL_BASE(dev, dev_table); int __init sysctl_init(void) { - struct ctl_table_header *hdr; + register_sysctl_base(kernel); + register_sysctl_base(vm); + register_sysctl_base(fs); + register_sysctl_base(debug); + register_sysctl_base(dev); - hdr = register_sysctl_table(sysctl_base_table); - kmemleak_not_leak(hdr); return 0; } #endif /* CONFIG_SYSCTL */ -- cgit v1.2.3 From ab171b952c6e065779687b44041038efdadb3915 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:27 -0800 Subject: fs: move namespace sysctls and declare fs base directory This moves the namespace sysctls to its own file as part of the kernel/sysctl.c spring cleaning Since we have now removed all sysctls for "fs", we now have to declare it on the filesystem code, we do that using the new helper, which reduces boiler plate code. We rename init_fs_shared_sysctls() to init_fs_sysctls() to reflect that now fs/sysctls.c is taking on the burden of being the first to register the base directory as well. Lastly, since init code will load in the order in which we link it we have to move the sysctl code to be linked in early, so that its early init routine runs prior to other fs code. This way, other filesystem code can register their own sysctls using the helpers after this: * register_sysctl_init() * register_sysctl() Link: https://lkml.kernel.org/r/20211129211943.640266-3-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Makefile | 3 ++- fs/namespace.c | 24 +++++++++++++++++++++++- fs/sysctls.c | 9 +++++---- include/linux/mount.h | 3 --- kernel/sysctl.c | 14 -------------- 5 files changed, 30 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/fs/Makefile b/fs/Makefile index ea8770d124da..dab324aea08f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -6,6 +6,8 @@ # Rewritten to use lists instead of if-statements. # +obj-$(CONFIG_SYSCTL) += sysctls.o + obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ ioctl.o readdir.o select.o dcache.o inode.o \ @@ -28,7 +30,6 @@ obj-y += notify/ obj-$(CONFIG_EPOLL) += eventpoll.o obj-y += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o -obj-$(CONFIG_SYSCTL) += sysctls.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o diff --git a/fs/namespace.c b/fs/namespace.c index dc31ad6b370f..40b994a29e90 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -37,7 +37,7 @@ #include "internal.h" /* Maximum number of mounts in a mount namespace */ -unsigned int sysctl_mount_max __read_mostly = 100000; +static unsigned int sysctl_mount_max __read_mostly = 100000; static unsigned int m_hash_mask __read_mostly; static unsigned int m_hash_shift __read_mostly; @@ -4620,3 +4620,25 @@ const struct proc_ns_operations mntns_operations = { .install = mntns_install, .owner = mntns_owner, }; + +#ifdef CONFIG_SYSCTL +static struct ctl_table fs_namespace_sysctls[] = { + { + .procname = "mount-max", + .data = &sysctl_mount_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + }, + { } +}; + +static int __init init_fs_namespace_sysctls(void) +{ + register_sysctl_init("fs", fs_namespace_sysctls); + return 0; +} +fs_initcall(init_fs_namespace_sysctls); + +#endif /* CONFIG_SYSCTL */ diff --git a/fs/sysctls.c b/fs/sysctls.c index 54216cd1ecd7..c701273c9432 100644 --- a/fs/sysctls.c +++ b/fs/sysctls.c @@ -29,10 +29,11 @@ static struct ctl_table fs_shared_sysctls[] = { { } }; -static int __init init_fs_shared_sysctls(void) +DECLARE_SYSCTL_BASE(fs, fs_shared_sysctls); + +static int __init init_fs_sysctls(void) { - register_sysctl_init("fs", fs_shared_sysctls); - return 0; + return register_sysctl_base(fs); } -early_initcall(init_fs_shared_sysctls); +early_initcall(init_fs_sysctls); diff --git a/include/linux/mount.h b/include/linux/mount.h index 5d92a7e1a742..7f18a7555dff 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -113,9 +113,6 @@ extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list); extern void mark_mounts_for_expiry(struct list_head *mounts); extern dev_t name_to_dev_t(const char *name); - -extern unsigned int sysctl_mount_max; - extern bool path_is_mountpoint(const struct path *path); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 12456a535059..cdc3dc5f0005 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2811,18 +2811,6 @@ static struct ctl_table vm_table[] = { { } }; -static struct ctl_table fs_table[] = { - { - .procname = "mount-max", - .data = &sysctl_mount_max, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - }, - { } -}; - static struct ctl_table debug_table[] = { #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE { @@ -2853,7 +2841,6 @@ static struct ctl_table dev_table[] = { DECLARE_SYSCTL_BASE(kernel, kern_table); DECLARE_SYSCTL_BASE(vm, vm_table); -DECLARE_SYSCTL_BASE(fs, fs_table); DECLARE_SYSCTL_BASE(debug, debug_table); DECLARE_SYSCTL_BASE(dev, dev_table); @@ -2861,7 +2848,6 @@ int __init sysctl_init(void) { register_sysctl_base(kernel); register_sysctl_base(vm); - register_sysctl_base(fs); register_sysctl_base(debug); register_sysctl_base(dev); -- cgit v1.2.3 From d8c0418aac78e661b5283c9d6a1dfc61d44f26fd Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 21 Jan 2022 22:13:31 -0800 Subject: kernel/sysctl.c: rename sysctl_init() to sysctl_init_bases() Rename sysctl_init() to sysctl_init_bases() so to reflect exactly what this is doing. Link: https://lkml.kernel.org/r/20211129211943.640266-4-mcgrof@kernel.org Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Cc: Xiaoming Ni Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/alignment.c | 2 +- arch/sh/mm/alignment.c | 2 +- fs/proc/proc_sysctl.c | 4 ++-- include/linux/sysctl.h | 2 +- kernel/sysctl.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index adbb3817d0be..6f499559d193 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -1005,7 +1005,7 @@ static int __init noalign_setup(char *__unused) __setup("noalign", noalign_setup); /* - * This needs to be done after sysctl_init, otherwise sys/ will be + * This needs to be done after sysctl_init_bases(), otherwise sys/ will be * overwritten. Actually, this shouldn't be in sys/ at all since * it isn't a sysctl, and it doesn't contain sysctl information. * We now locate it in /proc/cpu/alignment instead. diff --git a/arch/sh/mm/alignment.c b/arch/sh/mm/alignment.c index fb517b82a87b..d802801ceb93 100644 --- a/arch/sh/mm/alignment.c +++ b/arch/sh/mm/alignment.c @@ -161,7 +161,7 @@ static const struct proc_ops alignment_proc_ops = { }; /* - * This needs to be done after sysctl_init, otherwise sys/ will be + * This needs to be done after sysctl_init_bases(), otherwise sys/ will be * overwritten. Actually, this shouldn't be in sys/ at all since * it isn't a sysctl, and it doesn't contain sysctl information. * We now locate it in /proc/cpu/alignment instead. diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 95dfd195e04e..7d9cfc730bd4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1419,7 +1419,7 @@ EXPORT_SYMBOL(register_sysctl); * Context: Can only be called after your respective sysctl base path has been * registered. So for instance, most base directories are registered early on * init before init levels are processed through proc_sys_init() and - * sysctl_init(). + * sysctl_init_bases(). */ void __init __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name) @@ -1768,7 +1768,7 @@ int __init proc_sys_init(void) proc_sys_root->proc_dir_ops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; - return sysctl_init(); + return sysctl_init_bases(); } struct sysctl_alias { diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 02134a8abad7..180adf7da785 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -228,7 +228,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); -extern int sysctl_init(void); +extern int sysctl_init_bases(void); extern void __register_sysctl_init(const char *path, struct ctl_table *table, const char *table_name); #define register_sysctl_init(path, table) __register_sysctl_init(path, table, #table) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cdc3dc5f0005..bb07183cd305 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2844,7 +2844,7 @@ DECLARE_SYSCTL_BASE(vm, vm_table); DECLARE_SYSCTL_BASE(debug, debug_table); DECLARE_SYSCTL_BASE(dev, dev_table); -int __init sysctl_init(void) +int __init sysctl_init_bases(void) { register_sysctl_base(kernel); register_sysctl_base(vm); -- cgit v1.2.3 From fdcd4073fccc6f989308be3f1d61d8a68cd990ce Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:13:34 -0800 Subject: printk: fix build warning when CONFIG_PRINTK=n build warning when CONFIG_PRINTK=n kernel/printk/printk.c:175:5: warning: no previous prototype for 'devkmsg_sysctl_set_loglvl' [-Wmissing-prototypes] devkmsg_sysctl_set_loglvl() is only used in sysctl.c when CONFIG_PRINTK=y, but it participates in the build when CONFIG_PRINTK=n. So add compile dependency CONFIG_PRINTK=y && CONFIG_SYSCTL=y to fix the build warning. Link: https://lkml.kernel.org/r/20211129211943.640266-5-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 4 ---- kernel/printk/internal.h | 2 ++ kernel/printk/printk.c | 3 ++- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index 9497f6b98339..1522df223c0f 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -183,10 +183,6 @@ extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, extern int printk_delay_msec; extern int dmesg_restrict; -extern int -devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buf, - size_t *lenp, loff_t *ppos); - extern void wake_up_klogd(void); char *log_buf_addr_get(void); diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 6b1c4b399845..d947ca6c84f9 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -6,6 +6,8 @@ #if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) void __init printk_sysctl_init(void); +int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #else #define printk_sysctl_init() do { } while (0) #endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 363a493bded8..82abfaf3c2aa 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -171,7 +171,7 @@ static int __init control_devkmsg(char *str) __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; - +#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL) int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -210,6 +210,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, return 0; } +#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ /* Number of registered extended console drivers. */ static int nr_ext_console_drivers; -- cgit v1.2.3 From f0bc21b268c1464603192a00851cdbbf7c2cdc36 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:13:38 -0800 Subject: fs/coredump: move coredump sysctls into its own file This moves the fs/coredump.c respective sysctls to its own file. Link: https://lkml.kernel.org/r/20211129211943.640266-6-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/coredump.c | 66 ++++++++++++++++++++++++++++++++++++++++++++---- fs/exec.c | 55 ---------------------------------------- include/linux/coredump.h | 10 +++++--- kernel/sysctl.c | 2 -- 4 files changed, 67 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/fs/coredump.c b/fs/coredump.c index 7dece20b162b..1c060c0a2d72 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -52,9 +53,9 @@ #include -int core_uses_pid; -unsigned int core_pipe_limit; -char core_pattern[CORENAME_MAX_SIZE] = "core"; +static int core_uses_pid; +static unsigned int core_pipe_limit; +static char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; struct core_name { @@ -62,8 +63,6 @@ struct core_name { int used, size; }; -/* The maximal length of core_pattern is also specified in sysctl.c */ - static int expand_corename(struct core_name *cn, int size) { char *corename = krealloc(cn->corename, size, GFP_KERNEL); @@ -893,6 +892,63 @@ int dump_align(struct coredump_params *cprm, int align) } EXPORT_SYMBOL(dump_align); +#ifdef CONFIG_SYSCTL + +void validate_coredump_safety(void) +{ + if (suid_dumpable == SUID_DUMP_ROOT && + core_pattern[0] != '/' && core_pattern[0] != '|') { + pr_warn( +"Unsafe core_pattern used with fs.suid_dumpable=2.\n" +"Pipe handler or fully qualified core dump path required.\n" +"Set kernel.core_pattern before fs.suid_dumpable.\n" + ); + } +} + +static int proc_dostring_coredump(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int error = proc_dostring(table, write, buffer, lenp, ppos); + + if (!error) + validate_coredump_safety(); + return error; +} + +static struct ctl_table coredump_sysctls[] = { + { + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "core_pattern", + .data = core_pattern, + .maxlen = CORENAME_MAX_SIZE, + .mode = 0644, + .proc_handler = proc_dostring_coredump, + }, + { + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static int __init init_fs_coredump_sysctls(void) +{ + register_sysctl_init("kernel", coredump_sysctls); + return 0; +} +fs_initcall(init_fs_coredump_sysctls); +#endif /* CONFIG_SYSCTL */ + /* * The purpose of always_dump_vma() is to make sure that special kernel mappings * that are useful for post-mortem analysis are included in every core dump. diff --git a/fs/exec.c b/fs/exec.c index 310750340e4a..79f2c9483302 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2103,20 +2103,6 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd, #ifdef CONFIG_SYSCTL -static void validate_coredump_safety(void) -{ -#ifdef CONFIG_COREDUMP - if (suid_dumpable == SUID_DUMP_ROOT && - core_pattern[0] != '/' && core_pattern[0] != '|') { - pr_warn( -"Unsafe core_pattern used with fs.suid_dumpable=2.\n" -"Pipe handler or fully qualified core dump path required.\n" -"Set kernel.core_pattern before fs.suid_dumpable.\n" - ); - } -#endif -} - static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -2140,50 +2126,9 @@ static struct ctl_table fs_exec_sysctls[] = { { } }; -#ifdef CONFIG_COREDUMP - -static int proc_dostring_coredump(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) -{ - int error = proc_dostring(table, write, buffer, lenp, ppos); - - if (!error) - validate_coredump_safety(); - return error; -} - -static struct ctl_table kernel_exec_sysctls[] = { - { - .procname = "core_uses_pid", - .data = &core_uses_pid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "core_pattern", - .data = core_pattern, - .maxlen = CORENAME_MAX_SIZE, - .mode = 0644, - .proc_handler = proc_dostring_coredump, - }, - { - .procname = "core_pipe_limit", - .data = &core_pipe_limit, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { } -}; -#endif - static int __init init_fs_exec_sysctls(void) { register_sysctl_init("fs", fs_exec_sysctls); -#ifdef CONFIG_COREDUMP - register_sysctl_init("kernel", kernel_exec_sysctls); -#endif return 0; } diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 78fcd776b185..248a68c668b4 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -14,10 +14,6 @@ struct core_vma_metadata { unsigned long dump_size; }; -extern int core_uses_pid; -extern char core_pattern[]; -extern unsigned int core_pipe_limit; - /* * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. @@ -37,4 +33,10 @@ extern void do_coredump(const kernel_siginfo_t *siginfo); static inline void do_coredump(const kernel_siginfo_t *siginfo) {} #endif +#if defined(CONFIG_COREDUMP) && defined(CONFIG_SYSCTL) +extern void validate_coredump_safety(void); +#else +static inline void validate_coredump_safety(void) {} +#endif + #endif /* _LINUX_COREDUMP_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bb07183cd305..c78585292005 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -62,12 +62,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include -- cgit v1.2.3 From a737a3c6744bc822d1e6a837fef550e665ddf877 Mon Sep 17 00:00:00 2001 From: Xiaoming Ni Date: Fri, 21 Jan 2022 22:13:41 -0800 Subject: kprobe: move sysctl_kprobes_optimization to kprobes.c kernel/sysctl.c is a kitchen sink where everyone leaves their dirty dishes, this makes it very difficult to maintain. To help with this maintenance let's start by moving sysctls to places where they actually belong. The proc sysctl maintainers do not want to know what sysctl knobs you wish to add for your own piece of code, we just care about the core logic. Move sysctl_kprobes_optimization from kernel/sysctl.c to kernel/kprobes.c. Use register_sysctl() to register the sysctl interface. [mcgrof@kernel.org: fix compile issue when CONFIG_OPTPROBES is disabled] Link: https://lkml.kernel.org/r/20211129211943.640266-7-mcgrof@kernel.org Signed-off-by: Xiaoming Ni Signed-off-by: Luis Chamberlain Cc: Al Viro Cc: Anil S Keshavamurthy Cc: Antti Palosaari Cc: Christian Brauner Cc: "David S. Miller" Cc: Eric Biederman Cc: Eric Biggers Cc: Iurii Zaikin Cc: Kees Cook Cc: Lukas Middendorf Cc: Masami Hiramatsu Cc: "Naveen N. Rao" Cc: Stephen Kitt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 6 ------ kernel/kprobes.c | 30 ++++++++++++++++++++++++++---- kernel/sysctl.c | 12 ------------ 3 files changed, 26 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 8c8f7a4d93af..19b884353b15 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -348,12 +348,6 @@ extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs); DEFINE_INSN_CACHE_OPS(optinsn); -#ifdef CONFIG_SYSCTL -extern int sysctl_kprobes_optimization; -extern int proc_kprobes_optimization_handler(struct ctl_table *table, - int write, void *buffer, - size_t *length, loff_t *ppos); -#endif /* CONFIG_SYSCTL */ extern void wait_for_kprobe_optimizer(void); #else /* !CONFIG_OPTPROBES */ static inline void wait_for_kprobe_optimizer(void) { } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 21eccc961bba..94cab8c9ce56 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -48,6 +48,9 @@ #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) +#if !defined(CONFIG_OPTPROBES) || !defined(CONFIG_SYSCTL) +#define kprobe_sysctls_init() do { } while (0) +#endif static int kprobes_initialized; /* kprobe_table can be accessed by @@ -938,10 +941,10 @@ static void unoptimize_all_kprobes(void) } static DEFINE_MUTEX(kprobe_sysctl_mutex); -int sysctl_kprobes_optimization; -int proc_kprobes_optimization_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, - loff_t *ppos) +static int sysctl_kprobes_optimization; +static int proc_kprobes_optimization_handler(struct ctl_table *table, + int write, void *buffer, + size_t *length, loff_t *ppos) { int ret; @@ -957,6 +960,24 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, return ret; } + +static struct ctl_table kprobe_sysctls[] = { + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + +static void __init kprobe_sysctls_init(void) +{ + register_sysctl_init("debug", kprobe_sysctls); +} #endif /* CONFIG_SYSCTL */ /* Put a breakpoint for a probe. */ @@ -2584,6 +2605,7 @@ static int __init init_kprobes(void) err = register_module_notifier(&kprobe_module_nb); kprobes_initialized = (err == 0); + kprobe_sysctls_init(); return err; } early_initcall(init_kprobes); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c78585292005..c322bb629d10 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include @@ -2818,17 +2817,6 @@ static struct ctl_table debug_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, -#endif -#if defined(CONFIG_OPTPROBES) - { - .procname = "kprobes-optimization", - .data = &sysctl_kprobes_optimization, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_kprobes_optimization_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #endif { } }; -- cgit v1.2.3 From e565a8ed1ee4b481539b66cd6f54df9ecf1e9861 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 21 Jan 2022 22:13:45 -0800 Subject: kernel/sysctl.c: remove unused variable ten_thousand The const variable ten_thousand is not used, it is redundant and can be removed. Cleans up clang warning: kernel/sysctl.c:99:18: warning: unused variable 'ten_thousand' [-Wunused-const-variable] static const int ten_thousand = 10000; Link: https://lkml.kernel.org/r/20211221184501.574670-1-colin.i.king@gmail.com Fixes: c26da54dc8ca ("printk: move printk sysctl to printk/sysctl.c") Signed-off-by: Colin Ian King Acked-by: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c322bb629d10..e92e8d21d6bf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -96,9 +96,6 @@ /* Constants used for minimum and maximum */ -#ifdef CONFIG_PRINTK -static const int ten_thousand = 10000; -#endif #ifdef CONFIG_PERF_EVENTS static const int six_hundred_forty_kb = 640 * 1024; #endif -- cgit v1.2.3 From 1622ed7d0743201293094162c26019d2573ecacb Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Fri, 21 Jan 2022 22:13:48 -0800 Subject: sysctl: returns -EINVAL when a negative value is passed to proc_doulongvec_minmax When we pass a negative value to the proc_doulongvec_minmax() function, the function returns 0, but the corresponding interface value does not change. we can easily reproduce this problem with the following commands: cd /proc/sys/fs/epoll echo -1 > max_user_watches; echo $?; cat max_user_watches This function requires a non-negative number to be passed in, so when a negative number is passed in, -EINVAL is returned. Link: https://lkml.kernel.org/r/20211220092627.3744624-1-libaokun1@huawei.com Signed-off-by: Baokun Li Reported-by: Hulk Robot Acked-by: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e92e8d21d6bf..5ae443b2882e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1145,10 +1145,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); - if (err) + if (err || neg) { + err = -EINVAL; break; - if (neg) - continue; + } + val = convmul * val / convdiv; if ((min && val < *min) || (max && val > *max)) { err = -EINVAL; -- cgit v1.2.3 From 4a57d6bbaecd28c8175dc5da013009e4158018c2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 21 Jan 2022 22:14:10 -0800 Subject: locking/rwlocks: introduce write_lock_nested In preparation for converting bit_spin_lock to rwlock in zsmalloc so that multiple writers of zspages can run at the same time but those zspages are supposed to be different zspage instance. Thus, it's not deadlock. This patch adds write_lock_nested to support the case for LOCKDEP. [minchan@kernel.org: fix write_lock_nested for RT] Link: https://lkml.kernel.org/r/YZfrMTAXV56HFWJY@google.com [bigeasy@linutronix.de: fixup write_lock_nested() implementation] Link: https://lkml.kernel.org/r/20211123170134.y6xb7pmpgdn4m3bn@linutronix.de Link: https://lkml.kernel.org/r/20211115185909.3949505-8-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Sebastian Andrzej Siewior Acked-by: Peter Zijlstra (Intel) Acked-by: Sebastian Andrzej Siewior Tested-by: Sebastian Andrzej Siewior Cc: Mike Galbraith Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: Naresh Kamboju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rwlock.h | 6 ++++++ include/linux/rwlock_api_smp.h | 8 ++++++++ include/linux/rwlock_rt.h | 10 ++++++++++ include/linux/spinlock_api_up.h | 1 + kernel/locking/spinlock.c | 10 ++++++++++ kernel/locking/spinlock_rt.c | 12 ++++++++++++ 6 files changed, 47 insertions(+) (limited to 'kernel') diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h index 2c0ad417ce3c..8f416c5e929e 100644 --- a/include/linux/rwlock.h +++ b/include/linux/rwlock.h @@ -55,6 +55,12 @@ do { \ #define write_lock(lock) _raw_write_lock(lock) #define read_lock(lock) _raw_read_lock(lock) +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define write_lock_nested(lock, subclass) _raw_write_lock_nested(lock, subclass) +#else +#define write_lock_nested(lock, subclass) _raw_write_lock(lock) +#endif + #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define read_lock_irqsave(lock, flags) \ diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h index f1db6f17c4fb..dceb0a59b692 100644 --- a/include/linux/rwlock_api_smp.h +++ b/include/linux/rwlock_api_smp.h @@ -17,6 +17,7 @@ void __lockfunc _raw_read_lock(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_write_lock(rwlock_t *lock) __acquires(lock); +void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) __acquires(lock); void __lockfunc _raw_read_lock_bh(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_write_lock_bh(rwlock_t *lock) __acquires(lock); void __lockfunc _raw_read_lock_irq(rwlock_t *lock) __acquires(lock); @@ -209,6 +210,13 @@ static inline void __raw_write_lock(rwlock_t *lock) LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); } +static inline void __raw_write_lock_nested(rwlock_t *lock, int subclass) +{ + preempt_disable(); + rwlock_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock); +} + #endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */ static inline void __raw_write_unlock(rwlock_t *lock) diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h index 49c1f3842ed5..8544ff05e594 100644 --- a/include/linux/rwlock_rt.h +++ b/include/linux/rwlock_rt.h @@ -28,6 +28,7 @@ extern void rt_read_lock(rwlock_t *rwlock); extern int rt_read_trylock(rwlock_t *rwlock); extern void rt_read_unlock(rwlock_t *rwlock); extern void rt_write_lock(rwlock_t *rwlock); +extern void rt_write_lock_nested(rwlock_t *rwlock, int subclass); extern int rt_write_trylock(rwlock_t *rwlock); extern void rt_write_unlock(rwlock_t *rwlock); @@ -83,6 +84,15 @@ static __always_inline void write_lock(rwlock_t *rwlock) rt_write_lock(rwlock); } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static __always_inline void write_lock_nested(rwlock_t *rwlock, int subclass) +{ + rt_write_lock_nested(rwlock, subclass); +} +#else +#define write_lock_nested(lock, subclass) rt_write_lock(((void)(subclass), (lock))) +#endif + static __always_inline void write_lock_bh(rwlock_t *rwlock) { local_bh_disable(); diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index d0d188861ad6..b8ba00ccccde 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -59,6 +59,7 @@ #define _raw_spin_lock_nested(lock, subclass) __LOCK(lock) #define _raw_read_lock(lock) __LOCK(lock) #define _raw_write_lock(lock) __LOCK(lock) +#define _raw_write_lock_nested(lock, subclass) __LOCK(lock) #define _raw_spin_lock_bh(lock) __LOCK_BH(lock) #define _raw_read_lock_bh(lock) __LOCK_BH(lock) #define _raw_write_lock_bh(lock) __LOCK_BH(lock) diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index b562f9289372..7f49baaa4979 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -300,6 +300,16 @@ void __lockfunc _raw_write_lock(rwlock_t *lock) __raw_write_lock(lock); } EXPORT_SYMBOL(_raw_write_lock); + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +#define __raw_write_lock_nested(lock, subclass) __raw_write_lock(((void)(subclass), (lock))) +#endif + +void __lockfunc _raw_write_lock_nested(rwlock_t *lock, int subclass) +{ + __raw_write_lock_nested(lock, subclass); +} +EXPORT_SYMBOL(_raw_write_lock_nested); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index 9e396a09fe0f..48a19ed8486d 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -239,6 +239,18 @@ void __sched rt_write_lock(rwlock_t *rwlock) } EXPORT_SYMBOL(rt_write_lock); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __sched rt_write_lock_nested(rwlock_t *rwlock, int subclass) +{ + rtlock_might_resched(); + rwlock_acquire(&rwlock->dep_map, subclass, 0, _RET_IP_); + rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_write_lock_nested); +#endif + void __sched rt_read_unlock(rwlock_t *rwlock) { rwlock_release(&rwlock->dep_map, _RET_IP_); -- cgit v1.2.3 From 359745d78351c6f5442435f81549f0207ece28aa Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 21 Jan 2022 22:14:23 -0800 Subject: proc: remove PDE_DATA() completely Remove PDE_DATA() completely and replace it with pde_data(). [akpm@linux-foundation.org: fix naming clash in drivers/nubus/proc.c] [akpm@linux-foundation.org: now fix it properly] Link: https://lkml.kernel.org/r/20211124081956.87711-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Christian Brauner Cc: Alexey Dobriyan Cc: Alexey Gladkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/srm_env.c | 4 +-- arch/arm/kernel/atags_proc.c | 2 +- arch/ia64/kernel/salinfo.c | 10 +++--- arch/powerpc/kernel/proc_powerpc.c | 4 +-- arch/sh/mm/alignment.c | 2 +- arch/xtensa/platforms/iss/simdisk.c | 4 +-- drivers/acpi/proc.c | 2 +- drivers/hwmon/dell-smm-hwmon.c | 4 +-- drivers/net/bonding/bond_procfs.c | 8 ++--- drivers/net/wireless/cisco/airo.c | 22 ++++++------- drivers/net/wireless/intersil/hostap/hostap_ap.c | 16 +++++----- .../net/wireless/intersil/hostap/hostap_download.c | 2 +- drivers/net/wireless/intersil/hostap/hostap_proc.c | 24 +++++++-------- drivers/net/wireless/ray_cs.c | 2 +- drivers/nubus/proc.c | 36 +++++++++++----------- drivers/parisc/led.c | 4 +-- drivers/pci/proc.c | 10 +++--- drivers/platform/x86/thinkpad_acpi.c | 4 +-- drivers/platform/x86/toshiba_acpi.c | 16 +++++----- drivers/pnp/isapnp/proc.c | 2 +- drivers/pnp/pnpbios/proc.c | 4 +-- drivers/scsi/scsi_proc.c | 4 +-- drivers/usb/gadget/function/rndis.c | 4 +-- drivers/zorro/proc.c | 2 +- fs/afs/proc.c | 6 ++-- fs/ext4/mballoc.c | 14 ++++----- fs/jbd2/journal.c | 2 +- fs/proc/proc_net.c | 8 ++--- include/linux/proc_fs.h | 4 +-- include/linux/seq_file.h | 2 +- ipc/util.c | 2 +- kernel/irq/proc.c | 8 ++--- kernel/resource.c | 4 +-- net/atm/proc.c | 4 +-- net/bluetooth/af_bluetooth.c | 8 ++--- net/can/bcm.c | 2 +- net/can/proc.c | 2 +- net/core/neighbour.c | 6 ++-- net/core/pktgen.c | 6 ++-- net/ipv4/netfilter/ipt_CLUSTERIP.c | 6 ++-- net/ipv4/raw.c | 8 ++--- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/udp.c | 6 ++-- net/netfilter/x_tables.c | 10 +++--- net/netfilter/xt_hashlimit.c | 18 +++++------ net/netfilter/xt_recent.c | 4 +-- net/sunrpc/auth_gss/svcauth_gss.c | 4 +-- net/sunrpc/cache.c | 24 +++++++-------- net/sunrpc/stats.c | 2 +- sound/core/info.c | 4 +-- 50 files changed, 178 insertions(+), 180 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/srm_env.c b/arch/alpha/kernel/srm_env.c index 528d2be58182..217b4dca51dd 100644 --- a/arch/alpha/kernel/srm_env.c +++ b/arch/alpha/kernel/srm_env.c @@ -83,14 +83,14 @@ static int srm_env_proc_show(struct seq_file *m, void *v) static int srm_env_proc_open(struct inode *inode, struct file *file) { - return single_open(file, srm_env_proc_show, PDE_DATA(inode)); + return single_open(file, srm_env_proc_show, pde_data(inode)); } static ssize_t srm_env_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { int res; - unsigned long id = (unsigned long)PDE_DATA(file_inode(file)); + unsigned long id = (unsigned long)pde_data(file_inode(file)); char *buf = (char *) __get_free_page(GFP_USER); unsigned long ret1, ret2; diff --git a/arch/arm/kernel/atags_proc.c b/arch/arm/kernel/atags_proc.c index 3c2faf2bd124..3ec2afe78423 100644 --- a/arch/arm/kernel/atags_proc.c +++ b/arch/arm/kernel/atags_proc.c @@ -13,7 +13,7 @@ struct buffer { static ssize_t atags_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct buffer *b = PDE_DATA(file_inode(file)); + struct buffer *b = pde_data(file_inode(file)); return simple_read_from_buffer(buf, count, ppos, b->data, b->size); } diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index a25ab9b37953..bd3ba276e69c 100644 --- a/arch/ia64/kernel/salinfo.c +++ b/arch/ia64/kernel/salinfo.c @@ -282,7 +282,7 @@ salinfo_event_open(struct inode *inode, struct file *file) static ssize_t salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { - struct salinfo_data *data = PDE_DATA(file_inode(file)); + struct salinfo_data *data = pde_data(file_inode(file)); char cmd[32]; size_t size; int i, n, cpu = -1; @@ -340,7 +340,7 @@ static const struct proc_ops salinfo_event_proc_ops = { static int salinfo_log_open(struct inode *inode, struct file *file) { - struct salinfo_data *data = PDE_DATA(inode); + struct salinfo_data *data = pde_data(inode); if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -365,7 +365,7 @@ salinfo_log_open(struct inode *inode, struct file *file) static int salinfo_log_release(struct inode *inode, struct file *file) { - struct salinfo_data *data = PDE_DATA(inode); + struct salinfo_data *data = pde_data(inode); if (data->state == STATE_NO_DATA) { vfree(data->log_buffer); @@ -433,7 +433,7 @@ retry: static ssize_t salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { - struct salinfo_data *data = PDE_DATA(file_inode(file)); + struct salinfo_data *data = pde_data(file_inode(file)); u8 *buf; u64 bufsize; @@ -494,7 +494,7 @@ salinfo_log_clear(struct salinfo_data *data, int cpu) static ssize_t salinfo_log_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - struct salinfo_data *data = PDE_DATA(file_inode(file)); + struct salinfo_data *data = pde_data(file_inode(file)); char cmd[32]; size_t size; u32 offset; diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c index 877817471e3c..6a029f2378e1 100644 --- a/arch/powerpc/kernel/proc_powerpc.c +++ b/arch/powerpc/kernel/proc_powerpc.c @@ -25,7 +25,7 @@ static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes loff_t *ppos) { return simple_read_from_buffer(buf, nbytes, ppos, - PDE_DATA(file_inode(file)), PAGE_SIZE); + pde_data(file_inode(file)), PAGE_SIZE); } static int page_map_mmap( struct file *file, struct vm_area_struct *vma ) @@ -34,7 +34,7 @@ static int page_map_mmap( struct file *file, struct vm_area_struct *vma ) return -EINVAL; remap_pfn_range(vma, vma->vm_start, - __pa(PDE_DATA(file_inode(file))) >> PAGE_SHIFT, + __pa(pde_data(file_inode(file))) >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot); return 0; } diff --git a/arch/sh/mm/alignment.c b/arch/sh/mm/alignment.c index d802801ceb93..3a76a766f423 100644 --- a/arch/sh/mm/alignment.c +++ b/arch/sh/mm/alignment.c @@ -140,7 +140,7 @@ static int alignment_proc_open(struct inode *inode, struct file *file) static ssize_t alignment_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *pos) { - int *data = PDE_DATA(file_inode(file)); + int *data = pde_data(file_inode(file)); char mode; if (count > 0) { diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 07b642c1916a..8eb6ad1a3a1d 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -208,7 +208,7 @@ static int simdisk_detach(struct simdisk *dev) static ssize_t proc_read_simdisk(struct file *file, char __user *buf, size_t size, loff_t *ppos) { - struct simdisk *dev = PDE_DATA(file_inode(file)); + struct simdisk *dev = pde_data(file_inode(file)); const char *s = dev->filename; if (s) { ssize_t n = simple_read_from_buffer(buf, size, ppos, @@ -225,7 +225,7 @@ static ssize_t proc_write_simdisk(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char *tmp = memdup_user_nul(buf, count); - struct simdisk *dev = PDE_DATA(file_inode(file)); + struct simdisk *dev = pde_data(file_inode(file)); int err; if (IS_ERR(tmp)) diff --git a/drivers/acpi/proc.c b/drivers/acpi/proc.c index 0cca7991f186..4322f2da6d10 100644 --- a/drivers/acpi/proc.c +++ b/drivers/acpi/proc.c @@ -127,7 +127,7 @@ static int acpi_system_wakeup_device_open_fs(struct inode *inode, struct file *file) { return single_open(file, acpi_system_wakeup_device_seq_show, - PDE_DATA(inode)); + pde_data(inode)); } static const struct proc_ops acpi_system_wakeup_device_proc_ops = { diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index d401f9acf450..9949eeb79378 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -451,7 +451,7 @@ static int i8k_get_power_status(void) static long i8k_ioctl(struct file *fp, unsigned int cmd, unsigned long arg) { - struct dell_smm_data *data = PDE_DATA(file_inode(fp)); + struct dell_smm_data *data = pde_data(file_inode(fp)); int __user *argp = (int __user *)arg; int speed, err; int val = 0; @@ -585,7 +585,7 @@ static int i8k_proc_show(struct seq_file *seq, void *offset) static int i8k_open_fs(struct inode *inode, struct file *file) { - return single_open(file, i8k_proc_show, PDE_DATA(inode)); + return single_open(file, i8k_proc_show, pde_data(inode)); } static const struct proc_ops i8k_proc_ops = { diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c index 2ec11af5f0cc..46b150e6289e 100644 --- a/drivers/net/bonding/bond_procfs.c +++ b/drivers/net/bonding/bond_procfs.c @@ -11,7 +11,7 @@ static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { - struct bonding *bond = PDE_DATA(file_inode(seq->file)); + struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; loff_t off = 0; @@ -30,7 +30,7 @@ static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) static void *bond_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct bonding *bond = PDE_DATA(file_inode(seq->file)); + struct bonding *bond = pde_data(file_inode(seq->file)); struct list_head *iter; struct slave *slave; bool found = false; @@ -57,7 +57,7 @@ static void bond_info_seq_stop(struct seq_file *seq, void *v) static void bond_info_show_master(struct seq_file *seq) { - struct bonding *bond = PDE_DATA(file_inode(seq->file)); + struct bonding *bond = pde_data(file_inode(seq->file)); const struct bond_opt_value *optval; struct slave *curr, *primary; int i; @@ -175,7 +175,7 @@ static void bond_info_show_master(struct seq_file *seq) static void bond_info_show_slave(struct seq_file *seq, const struct slave *slave) { - struct bonding *bond = PDE_DATA(file_inode(seq->file)); + struct bonding *bond = pde_data(file_inode(seq->file)); seq_printf(seq, "\nSlave Interface: %s\n", slave->dev->name); seq_printf(seq, "MII Status: %s\n", bond_slave_link_status(slave->link)); diff --git a/drivers/net/wireless/cisco/airo.c b/drivers/net/wireless/cisco/airo.c index 45594f003ef7..452d08545d31 100644 --- a/drivers/net/wireless/cisco/airo.c +++ b/drivers/net/wireless/cisco/airo.c @@ -4672,7 +4672,7 @@ static ssize_t proc_write(struct file *file, static int proc_status_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *apriv = dev->ml_priv; CapabilityRid cap_rid; StatusRid status_rid; @@ -4756,7 +4756,7 @@ static int proc_stats_rid_open(struct inode *inode, u16 rid) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *apriv = dev->ml_priv; StatsRid stats; int i, j; @@ -4819,7 +4819,7 @@ static inline int sniffing_mode(struct airo_info *ai) static void proc_config_on_close(struct inode *inode, struct file *file) { struct proc_data *data = file->private_data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; char *line; @@ -5030,7 +5030,7 @@ static const char *get_rmode(__le16 mode) static int proc_config_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; int i; __le16 mode; @@ -5120,7 +5120,7 @@ static int proc_config_open(struct inode *inode, struct file *file) static void proc_SSID_on_close(struct inode *inode, struct file *file) { struct proc_data *data = file->private_data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; SsidRid SSID_rid; int i; @@ -5156,7 +5156,7 @@ static void proc_SSID_on_close(struct inode *inode, struct file *file) static void proc_APList_on_close(struct inode *inode, struct file *file) { struct proc_data *data = file->private_data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; APListRid *APList_rid = &ai->APList; int i; @@ -5280,7 +5280,7 @@ static int set_wep_tx_idx(struct airo_info *ai, u16 index, int perm, int lock) static void proc_wepkey_on_close(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; int i, rc; char key[16]; @@ -5331,7 +5331,7 @@ static void proc_wepkey_on_close(struct inode *inode, struct file *file) static int proc_wepkey_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; char *ptr; WepKeyRid wkr; @@ -5379,7 +5379,7 @@ static int proc_wepkey_open(struct inode *inode, struct file *file) static int proc_SSID_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; int i; char *ptr; @@ -5423,7 +5423,7 @@ static int proc_SSID_open(struct inode *inode, struct file *file) static int proc_APList_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; int i; char *ptr; @@ -5462,7 +5462,7 @@ static int proc_APList_open(struct inode *inode, struct file *file) static int proc_BSSList_open(struct inode *inode, struct file *file) { struct proc_data *data; - struct net_device *dev = PDE_DATA(inode); + struct net_device *dev = pde_data(inode); struct airo_info *ai = dev->ml_priv; char *ptr; BSSListRid BSSList_rid; diff --git a/drivers/net/wireless/intersil/hostap/hostap_ap.c b/drivers/net/wireless/intersil/hostap/hostap_ap.c index 8bcc1cdcb75b..462ccc7d7d1a 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_ap.c +++ b/drivers/net/wireless/intersil/hostap/hostap_ap.c @@ -69,7 +69,7 @@ static void prism2_send_mgmt(struct net_device *dev, #if !defined(PRISM2_NO_PROCFS_DEBUG) && defined(CONFIG_PROC_FS) static int ap_debug_proc_show(struct seq_file *m, void *v) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); seq_printf(m, "BridgedUnicastFrames=%u\n", ap->bridged_unicast); seq_printf(m, "BridgedMulticastFrames=%u\n", ap->bridged_multicast); @@ -320,7 +320,7 @@ void hostap_deauth_all_stas(struct net_device *dev, struct ap_data *ap, static int ap_control_proc_show(struct seq_file *m, void *v) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); char *policy_txt; struct mac_entry *entry; @@ -352,20 +352,20 @@ static int ap_control_proc_show(struct seq_file *m, void *v) static void *ap_control_proc_start(struct seq_file *m, loff_t *_pos) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); spin_lock_bh(&ap->mac_restrictions.lock); return seq_list_start_head(&ap->mac_restrictions.mac_list, *_pos); } static void *ap_control_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); return seq_list_next(v, &ap->mac_restrictions.mac_list, _pos); } static void ap_control_proc_stop(struct seq_file *m, void *v) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); spin_unlock_bh(&ap->mac_restrictions.lock); } @@ -554,20 +554,20 @@ static int prism2_ap_proc_show(struct seq_file *m, void *v) static void *prism2_ap_proc_start(struct seq_file *m, loff_t *_pos) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); spin_lock_bh(&ap->sta_table_lock); return seq_list_start_head(&ap->sta_list, *_pos); } static void *prism2_ap_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); return seq_list_next(v, &ap->sta_list, _pos); } static void prism2_ap_proc_stop(struct seq_file *m, void *v) { - struct ap_data *ap = PDE_DATA(file_inode(m->file)); + struct ap_data *ap = pde_data(file_inode(m->file)); spin_unlock_bh(&ap->sta_table_lock); } diff --git a/drivers/net/wireless/intersil/hostap/hostap_download.c b/drivers/net/wireless/intersil/hostap/hostap_download.c index 7c6a5a6d1d45..3672291ced5c 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_download.c +++ b/drivers/net/wireless/intersil/hostap/hostap_download.c @@ -227,7 +227,7 @@ static int prism2_download_aux_dump_proc_open(struct inode *inode, struct file * sizeof(struct prism2_download_aux_dump)); if (ret == 0) { struct seq_file *m = file->private_data; - m->private = PDE_DATA(inode); + m->private = pde_data(inode); } return ret; } diff --git a/drivers/net/wireless/intersil/hostap/hostap_proc.c b/drivers/net/wireless/intersil/hostap/hostap_proc.c index 51c847d98755..61f68786056f 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_proc.c +++ b/drivers/net/wireless/intersil/hostap/hostap_proc.c @@ -97,20 +97,20 @@ static int prism2_wds_proc_show(struct seq_file *m, void *v) static void *prism2_wds_proc_start(struct seq_file *m, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); read_lock_bh(&local->iface_lock); return seq_list_start(&local->hostap_interfaces, *_pos); } static void *prism2_wds_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); return seq_list_next(v, &local->hostap_interfaces, _pos); } static void prism2_wds_proc_stop(struct seq_file *m, void *v) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); read_unlock_bh(&local->iface_lock); } @@ -123,7 +123,7 @@ static const struct seq_operations prism2_wds_proc_seqops = { static int prism2_bss_list_proc_show(struct seq_file *m, void *v) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); struct list_head *ptr = v; struct hostap_bss_info *bss; @@ -151,21 +151,21 @@ static int prism2_bss_list_proc_show(struct seq_file *m, void *v) static void *prism2_bss_list_proc_start(struct seq_file *m, loff_t *_pos) __acquires(&local->lock) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); spin_lock_bh(&local->lock); return seq_list_start_head(&local->bss_list, *_pos); } static void *prism2_bss_list_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); return seq_list_next(v, &local->bss_list, _pos); } static void prism2_bss_list_proc_stop(struct seq_file *m, void *v) __releases(&local->lock) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); spin_unlock_bh(&local->lock); } @@ -198,7 +198,7 @@ static int prism2_crypt_proc_show(struct seq_file *m, void *v) static ssize_t prism2_pda_proc_read(struct file *file, char __user *buf, size_t count, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(file)); + local_info_t *local = pde_data(file_inode(file)); size_t off; if (local->pda == NULL || *_pos >= PRISM2_PDA_SIZE) @@ -272,7 +272,7 @@ static int prism2_io_debug_proc_read(char *page, char **start, off_t off, #ifndef PRISM2_NO_STATION_MODES static int prism2_scan_results_proc_show(struct seq_file *m, void *v) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); unsigned long entry; int i, len; struct hfa384x_hostscan_result *scanres; @@ -322,7 +322,7 @@ static int prism2_scan_results_proc_show(struct seq_file *m, void *v) static void *prism2_scan_results_proc_start(struct seq_file *m, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); spin_lock_bh(&local->lock); /* We have a header (pos 0) + N results to show (pos 1...N) */ @@ -333,7 +333,7 @@ static void *prism2_scan_results_proc_start(struct seq_file *m, loff_t *_pos) static void *prism2_scan_results_proc_next(struct seq_file *m, void *v, loff_t *_pos) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); ++*_pos; if (*_pos > local->last_scan_results_count) @@ -343,7 +343,7 @@ static void *prism2_scan_results_proc_next(struct seq_file *m, void *v, loff_t * static void prism2_scan_results_proc_stop(struct seq_file *m, void *v) { - local_info_t *local = PDE_DATA(file_inode(m->file)); + local_info_t *local = pde_data(file_inode(m->file)); spin_unlock_bh(&local->lock); } diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c index e3a3dc3e45b4..2987ad9271f6 100644 --- a/drivers/net/wireless/ray_cs.c +++ b/drivers/net/wireless/ray_cs.c @@ -2746,7 +2746,7 @@ static ssize_t int_proc_write(struct file *file, const char __user *buffer, nr = nr * 10 + c; p++; } while (--len); - *(int *)PDE_DATA(file_inode(file)) = nr; + *(int *)pde_data(file_inode(file)) = nr; return count; } diff --git a/drivers/nubus/proc.c b/drivers/nubus/proc.c index 88e1f9a0faaf..1fd667852271 100644 --- a/drivers/nubus/proc.c +++ b/drivers/nubus/proc.c @@ -93,30 +93,30 @@ struct nubus_proc_pde_data { static struct nubus_proc_pde_data * nubus_proc_alloc_pde_data(unsigned char *ptr, unsigned int size) { - struct nubus_proc_pde_data *pde_data; + struct nubus_proc_pde_data *pded; - pde_data = kmalloc(sizeof(*pde_data), GFP_KERNEL); - if (!pde_data) + pded = kmalloc(sizeof(*pded), GFP_KERNEL); + if (!pded) return NULL; - pde_data->res_ptr = ptr; - pde_data->res_size = size; - return pde_data; + pded->res_ptr = ptr; + pded->res_size = size; + return pded; } static int nubus_proc_rsrc_show(struct seq_file *m, void *v) { struct inode *inode = m->private; - struct nubus_proc_pde_data *pde_data; + struct nubus_proc_pde_data *pded; - pde_data = PDE_DATA(inode); - if (!pde_data) + pded = pde_data(inode); + if (!pded) return 0; - if (pde_data->res_size > m->size) + if (pded->res_size > m->size) return -EFBIG; - if (pde_data->res_size) { + if (pded->res_size) { int lanes = (int)proc_get_parent_data(inode); struct nubus_dirent ent; @@ -124,11 +124,11 @@ static int nubus_proc_rsrc_show(struct seq_file *m, void *v) return 0; ent.mask = lanes; - ent.base = pde_data->res_ptr; + ent.base = pded->res_ptr; ent.data = 0; - nubus_seq_write_rsrc_mem(m, &ent, pde_data->res_size); + nubus_seq_write_rsrc_mem(m, &ent, pded->res_size); } else { - unsigned int data = (unsigned int)pde_data->res_ptr; + unsigned int data = (unsigned int)pded->res_ptr; seq_putc(m, data >> 16); seq_putc(m, data >> 8); @@ -142,18 +142,18 @@ void nubus_proc_add_rsrc_mem(struct proc_dir_entry *procdir, unsigned int size) { char name[9]; - struct nubus_proc_pde_data *pde_data; + struct nubus_proc_pde_data *pded; if (!procdir) return; snprintf(name, sizeof(name), "%x", ent->type); if (size) - pde_data = nubus_proc_alloc_pde_data(nubus_dirptr(ent), size); + pded = nubus_proc_alloc_pde_data(nubus_dirptr(ent), size); else - pde_data = NULL; + pded = NULL; proc_create_single_data(name, S_IFREG | 0444, procdir, - nubus_proc_rsrc_show, pde_data); + nubus_proc_rsrc_show, pded); } void nubus_proc_add_rsrc(struct proc_dir_entry *procdir, diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c index cf91cb024be3..1e4a5663d011 100644 --- a/drivers/parisc/led.c +++ b/drivers/parisc/led.c @@ -168,14 +168,14 @@ static int led_proc_show(struct seq_file *m, void *v) static int led_proc_open(struct inode *inode, struct file *file) { - return single_open(file, led_proc_show, PDE_DATA(inode)); + return single_open(file, led_proc_show, pde_data(inode)); } static ssize_t led_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - void *data = PDE_DATA(file_inode(file)); + void *data = pde_data(file_inode(file)); char *cur, lbuf[32]; int d; diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index cb18f8a13ab6..9c7edec64f7e 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -21,14 +21,14 @@ static int proc_initialized; /* = 0 */ static loff_t proc_bus_pci_lseek(struct file *file, loff_t off, int whence) { - struct pci_dev *dev = PDE_DATA(file_inode(file)); + struct pci_dev *dev = pde_data(file_inode(file)); return fixed_size_llseek(file, off, whence, dev->cfg_size); } static ssize_t proc_bus_pci_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct pci_dev *dev = PDE_DATA(file_inode(file)); + struct pci_dev *dev = pde_data(file_inode(file)); unsigned int pos = *ppos; unsigned int cnt, size; @@ -114,7 +114,7 @@ static ssize_t proc_bus_pci_write(struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos) { struct inode *ino = file_inode(file); - struct pci_dev *dev = PDE_DATA(ino); + struct pci_dev *dev = pde_data(ino); int pos = *ppos; int size = dev->cfg_size; int cnt, ret; @@ -196,7 +196,7 @@ struct pci_filp_private { static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct pci_dev *dev = PDE_DATA(file_inode(file)); + struct pci_dev *dev = pde_data(file_inode(file)); #ifdef HAVE_PCI_MMAP struct pci_filp_private *fpriv = file->private_data; #endif /* HAVE_PCI_MMAP */ @@ -244,7 +244,7 @@ static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd, #ifdef HAVE_PCI_MMAP static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma) { - struct pci_dev *dev = PDE_DATA(file_inode(file)); + struct pci_dev *dev = pde_data(file_inode(file)); struct pci_filp_private *fpriv = file->private_data; int i, ret, write_combine = 0, res_bit = IORESOURCE_MEM; diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 82fa6148216c..098180fb1cfc 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -880,14 +880,14 @@ static int dispatch_proc_show(struct seq_file *m, void *v) static int dispatch_proc_open(struct inode *inode, struct file *file) { - return single_open(file, dispatch_proc_show, PDE_DATA(inode)); + return single_open(file, dispatch_proc_show, pde_data(inode)); } static ssize_t dispatch_proc_write(struct file *file, const char __user *userbuf, size_t count, loff_t *pos) { - struct ibm_struct *ibm = PDE_DATA(file_inode(file)); + struct ibm_struct *ibm = pde_data(file_inode(file)); char *kernbuf; int ret; diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 352508d30467..f113dec98e21 100644 --- a/drivers/platform/x86/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c @@ -1368,7 +1368,7 @@ static int lcd_proc_show(struct seq_file *m, void *v) static int lcd_proc_open(struct inode *inode, struct file *file) { - return single_open(file, lcd_proc_show, PDE_DATA(inode)); + return single_open(file, lcd_proc_show, pde_data(inode)); } static int set_lcd_brightness(struct toshiba_acpi_dev *dev, int value) @@ -1404,7 +1404,7 @@ static int set_lcd_status(struct backlight_device *bd) static ssize_t lcd_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file)); + struct toshiba_acpi_dev *dev = pde_data(file_inode(file)); char cmd[42]; size_t len; int levels; @@ -1469,13 +1469,13 @@ static int video_proc_show(struct seq_file *m, void *v) static int video_proc_open(struct inode *inode, struct file *file) { - return single_open(file, video_proc_show, PDE_DATA(inode)); + return single_open(file, video_proc_show, pde_data(inode)); } static ssize_t video_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file)); + struct toshiba_acpi_dev *dev = pde_data(file_inode(file)); char *buffer; char *cmd; int lcd_out = -1, crt_out = -1, tv_out = -1; @@ -1580,13 +1580,13 @@ static int fan_proc_show(struct seq_file *m, void *v) static int fan_proc_open(struct inode *inode, struct file *file) { - return single_open(file, fan_proc_show, PDE_DATA(inode)); + return single_open(file, fan_proc_show, pde_data(inode)); } static ssize_t fan_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file)); + struct toshiba_acpi_dev *dev = pde_data(file_inode(file)); char cmd[42]; size_t len; int value; @@ -1628,13 +1628,13 @@ static int keys_proc_show(struct seq_file *m, void *v) static int keys_proc_open(struct inode *inode, struct file *file) { - return single_open(file, keys_proc_show, PDE_DATA(inode)); + return single_open(file, keys_proc_show, pde_data(inode)); } static ssize_t keys_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file)); + struct toshiba_acpi_dev *dev = pde_data(file_inode(file)); char cmd[42]; size_t len; int value; diff --git a/drivers/pnp/isapnp/proc.c b/drivers/pnp/isapnp/proc.c index 1ae458c02656..55ae72a2818b 100644 --- a/drivers/pnp/isapnp/proc.c +++ b/drivers/pnp/isapnp/proc.c @@ -22,7 +22,7 @@ static loff_t isapnp_proc_bus_lseek(struct file *file, loff_t off, int whence) static ssize_t isapnp_proc_bus_read(struct file *file, char __user * buf, size_t nbytes, loff_t * ppos) { - struct pnp_dev *dev = PDE_DATA(file_inode(file)); + struct pnp_dev *dev = pde_data(file_inode(file)); int pos = *ppos; int cnt, size = 256; diff --git a/drivers/pnp/pnpbios/proc.c b/drivers/pnp/pnpbios/proc.c index a806830e3a40..0f0d819b157f 100644 --- a/drivers/pnp/pnpbios/proc.c +++ b/drivers/pnp/pnpbios/proc.c @@ -173,13 +173,13 @@ static int pnpbios_proc_show(struct seq_file *m, void *v) static int pnpbios_proc_open(struct inode *inode, struct file *file) { - return single_open(file, pnpbios_proc_show, PDE_DATA(inode)); + return single_open(file, pnpbios_proc_show, pde_data(inode)); } static ssize_t pnpbios_proc_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { - void *data = PDE_DATA(file_inode(file)); + void *data = pde_data(file_inode(file)); struct pnp_bios_node *node; int boot = (long)data >> 8; u8 nodenum = (long)data; diff --git a/drivers/scsi/scsi_proc.c b/drivers/scsi/scsi_proc.c index d6982d355739..95aee1ad1383 100644 --- a/drivers/scsi/scsi_proc.c +++ b/drivers/scsi/scsi_proc.c @@ -49,7 +49,7 @@ static DEFINE_MUTEX(global_host_template_mutex); static ssize_t proc_scsi_host_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct Scsi_Host *shost = PDE_DATA(file_inode(file)); + struct Scsi_Host *shost = pde_data(file_inode(file)); ssize_t ret = -ENOMEM; char *page; @@ -79,7 +79,7 @@ static int proc_scsi_show(struct seq_file *m, void *v) static int proc_scsi_host_open(struct inode *inode, struct file *file) { - return single_open_size(file, proc_scsi_show, PDE_DATA(inode), + return single_open_size(file, proc_scsi_show, pde_data(inode), 4 * PAGE_SIZE); } diff --git a/drivers/usb/gadget/function/rndis.c b/drivers/usb/gadget/function/rndis.c index 64de9f1b874c..431d5a7d737e 100644 --- a/drivers/usb/gadget/function/rndis.c +++ b/drivers/usb/gadget/function/rndis.c @@ -1117,7 +1117,7 @@ static int rndis_proc_show(struct seq_file *m, void *v) static ssize_t rndis_proc_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { - rndis_params *p = PDE_DATA(file_inode(file)); + rndis_params *p = pde_data(file_inode(file)); u32 speed = 0; int i, fl_speed = 0; @@ -1161,7 +1161,7 @@ static ssize_t rndis_proc_write(struct file *file, const char __user *buffer, static int rndis_proc_open(struct inode *inode, struct file *file) { - return single_open(file, rndis_proc_show, PDE_DATA(inode)); + return single_open(file, rndis_proc_show, pde_data(inode)); } static const struct proc_ops rndis_proc_ops = { diff --git a/drivers/zorro/proc.c b/drivers/zorro/proc.c index 1c9ae08225d8..f916bf60b312 100644 --- a/drivers/zorro/proc.c +++ b/drivers/zorro/proc.c @@ -30,7 +30,7 @@ proc_bus_zorro_lseek(struct file *file, loff_t off, int whence) static ssize_t proc_bus_zorro_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - struct zorro_dev *z = PDE_DATA(file_inode(file)); + struct zorro_dev *z = pde_data(file_inode(file)); struct ConfigDev cd; loff_t pos = *ppos; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 065a28bfa3f1..e1b863449296 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -227,7 +227,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) __acquires(cell->proc_lock) { - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); rcu_read_lock(); return seq_hlist_start_head_rcu(&cell->proc_volumes, *_pos); @@ -236,7 +236,7 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) static void *afs_proc_cell_volumes_next(struct seq_file *m, void *v, loff_t *_pos) { - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); return seq_hlist_next_rcu(v, &cell->proc_volumes, _pos); } @@ -322,7 +322,7 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) { struct afs_vl_seq_net_private *priv = m->private; struct afs_vlserver_list *vllist; - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); loff_t pos = *_pos; rcu_read_lock(); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cf2fd9fc7d98..9f86dd947032 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2834,7 +2834,7 @@ out: static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) @@ -2845,7 +2845,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; ++*pos; @@ -2857,7 +2857,7 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; int err, buddy_loaded = 0; @@ -2985,7 +2985,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) __acquires(&EXT4_SB(sb)->s_mb_rb_lock) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; read_lock(&EXT4_SB(sb)->s_mb_rb_lock); @@ -2998,7 +2998,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock) static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; ++*pos; @@ -3010,7 +3010,7 @@ static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, lof static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; @@ -3058,7 +3058,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) __releases(&EXT4_SB(sb)->s_mb_rb_lock) { - struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct super_block *sb = pde_data(file_inode(seq->file)); read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0b86a4365b66..f13d548e4a7f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1212,7 +1212,7 @@ static const struct seq_operations jbd2_seq_info_ops = { static int jbd2_seq_info_open(struct inode *inode, struct file *file) { - journal_t *journal = PDE_DATA(inode); + journal_t *journal = pde_data(inode); struct jbd2_stats_proc_session *s; int rc, size; diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 39b823ab2564..e1cfeda397f3 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -138,7 +138,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data); * @parent: The parent directory in which to create. * @ops: The seq_file ops with which to read the file. * @write: The write method with which to 'modify' the file. - * @data: Data for retrieval by PDE_DATA(). + * @data: Data for retrieval by pde_data(). * * Create a network namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a @@ -153,7 +153,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_data); * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling - * PDE_DATA() on the file inode. The network namespace must be accessed by + * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode, @@ -230,7 +230,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single); * @parent: The parent directory in which to create. * @show: The seqfile show method with which to read the file. * @write: The write method with which to 'modify' the file. - * @data: Data for retrieval by PDE_DATA(). + * @data: Data for retrieval by pde_data(). * * Create a network-namespaced proc file in the @parent directory with the * specified @name and @mode that allows reading of a file that displays a @@ -245,7 +245,7 @@ EXPORT_SYMBOL_GPL(proc_create_net_single); * modified by the @write function. @write should return 0 on success. * * The @data value is accessible from the @show and @write functions by calling - * PDE_DATA() on the file inode. The network namespace must be accessed by + * pde_data() on the file inode. The network namespace must be accessed by * calling seq_file_single_net() on the seq_file struct. */ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode, diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index b6e7005cc1b2..81d6e4ec2294 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -120,8 +120,6 @@ static inline void *pde_data(const struct inode *inode) return inode->i_private; } -#define PDE_DATA(i) pde_data(i) - extern void *proc_get_parent_data(const struct inode *); extern void proc_remove(struct proc_dir_entry *); extern void remove_proc_entry(const char *, struct proc_dir_entry *); @@ -202,7 +200,7 @@ proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, static inline void proc_set_size(struct proc_dir_entry *de, loff_t size) {} static inline void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) {} -static inline void *PDE_DATA(const struct inode *inode) {BUG(); return NULL;} +static inline void *pde_data(const struct inode *inode) {BUG(); return NULL;} static inline void *proc_get_parent_data(const struct inode *inode) { BUG(); return NULL; } static inline void proc_remove(struct proc_dir_entry *de) {} diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 72dbb44a4573..88cc16444b43 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -209,7 +209,7 @@ static const struct file_operations __name ## _fops = { \ #define DEFINE_PROC_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ - return single_open(file, __name ## _show, PDE_DATA(inode)); \ + return single_open(file, __name ## _show, pde_data(inode)); \ } \ \ static const struct proc_ops __name ## _proc_ops = { \ diff --git a/ipc/util.c b/ipc/util.c index fa2d86ef3fb8..a2208d0f26b2 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -894,7 +894,7 @@ static int sysvipc_proc_open(struct inode *inode, struct file *file) if (!iter) return -ENOMEM; - iter->iface = PDE_DATA(inode); + iter->iface = pde_data(inode); iter->ns = get_ipc_ns(current->nsproxy->ipc_ns); iter->pid_ns = get_pid_ns(task_active_pid_ns(current)); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index ee595ec09778..623b8136e9af 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -137,7 +137,7 @@ static inline int irq_select_affinity_usr(unsigned int irq) static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { - unsigned int irq = (int)(long)PDE_DATA(file_inode(file)); + unsigned int irq = (int)(long)pde_data(file_inode(file)); cpumask_var_t new_value; int err; @@ -190,12 +190,12 @@ static ssize_t irq_affinity_list_proc_write(struct file *file, static int irq_affinity_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_proc_show, PDE_DATA(inode)); + return single_open(file, irq_affinity_proc_show, pde_data(inode)); } static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) { - return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode)); + return single_open(file, irq_affinity_list_proc_show, pde_data(inode)); } static const struct proc_ops irq_affinity_proc_ops = { @@ -265,7 +265,7 @@ out: static int default_affinity_open(struct inode *inode, struct file *file) { - return single_open(file, default_affinity_show, PDE_DATA(inode)); + return single_open(file, default_affinity_show, pde_data(inode)); } static const struct proc_ops default_affinity_proc_ops = { diff --git a/kernel/resource.c b/kernel/resource.c index 5ad3eba619ba..9c08d6e9eef2 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -99,7 +99,7 @@ enum { MAX_IORES_LEVEL = 5 }; static void *r_start(struct seq_file *m, loff_t *pos) __acquires(resource_lock) { - struct resource *p = PDE_DATA(file_inode(m->file)); + struct resource *p = pde_data(file_inode(m->file)); loff_t l = 0; read_lock(&resource_lock); for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) @@ -115,7 +115,7 @@ static void r_stop(struct seq_file *m, void *v) static int r_show(struct seq_file *m, void *v) { - struct resource *root = PDE_DATA(file_inode(m->file)); + struct resource *root = pde_data(file_inode(m->file)); struct resource *r = v, *p; unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; diff --git a/net/atm/proc.c b/net/atm/proc.c index 4369ffa3302a..9bf736290e48 100644 --- a/net/atm/proc.c +++ b/net/atm/proc.c @@ -108,7 +108,7 @@ out: static inline void *vcc_walk(struct seq_file *seq, loff_t l) { struct vcc_state *state = seq->private; - int family = (uintptr_t)(PDE_DATA(file_inode(seq->file))); + int family = (uintptr_t)(pde_data(file_inode(seq->file))); return __vcc_walk(&state->sk, family, &state->bucket, l) ? state : NULL; @@ -324,7 +324,7 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf, page = get_zeroed_page(GFP_KERNEL); if (!page) return -ENOMEM; - dev = PDE_DATA(file_inode(file)); + dev = pde_data(file_inode(file)); if (!dev->ops->proc_read) length = -EINVAL; else { diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 1661979b6a6e..ee319779781e 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -611,7 +611,7 @@ EXPORT_SYMBOL(bt_sock_wait_ready); static void *bt_seq_start(struct seq_file *seq, loff_t *pos) __acquires(seq->private->l->lock) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); read_lock(&l->lock); return seq_hlist_start_head(&l->head, *pos); @@ -619,7 +619,7 @@ static void *bt_seq_start(struct seq_file *seq, loff_t *pos) static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); return seq_hlist_next(v, &l->head, pos); } @@ -627,14 +627,14 @@ static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void bt_seq_stop(struct seq_file *seq, void *v) __releases(seq->private->l->lock) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); read_unlock(&l->lock); } static int bt_seq_show(struct seq_file *seq, void *v) { - struct bt_sock_list *l = PDE_DATA(file_inode(seq->file)); + struct bt_sock_list *l = pde_data(file_inode(seq->file)); if (v == SEQ_START_TOKEN) { seq_puts(seq, "sk RefCnt Rmem Wmem User Inode Parent"); diff --git a/net/can/bcm.c b/net/can/bcm.c index bc88d901a1c0..95d209b52e6a 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -193,7 +193,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) { char ifname[IFNAMSIZ]; struct net *net = m->private; - struct sock *sk = (struct sock *)PDE_DATA(m->file->f_inode); + struct sock *sk = (struct sock *)pde_data(m->file->f_inode); struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; diff --git a/net/can/proc.c b/net/can/proc.c index b3099f0a3cb8..bbce97825f13 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -305,7 +305,7 @@ static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx, static int can_rcvlist_proc_show(struct seq_file *m, void *v) { /* double cast to prevent GCC warning */ - int idx = (int)(long)PDE_DATA(m->file->f_inode); + int idx = (int)(long)pde_data(m->file->f_inode); struct net_device *dev; struct can_dev_rcv_lists *dev_rcv_lists; struct net *net = m->private; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 213cb7b26b7a..6c2016f7f3d1 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3364,7 +3364,7 @@ EXPORT_SYMBOL(neigh_seq_stop); static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; if (*pos == 0) @@ -3381,7 +3381,7 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { @@ -3401,7 +3401,7 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v) static int neigh_stat_seq_show(struct seq_file *seq, void *v) { - struct neigh_table *tbl = PDE_DATA(file_inode(seq->file)); + struct neigh_table *tbl = pde_data(file_inode(seq->file)); struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 560a5e712dc3..84b62cd7bc57 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -546,7 +546,7 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, static int pgctrl_open(struct inode *inode, struct file *file) { - return single_open(file, pgctrl_show, PDE_DATA(inode)); + return single_open(file, pgctrl_show, pde_data(inode)); } static const struct proc_ops pktgen_proc_ops = { @@ -1811,7 +1811,7 @@ static ssize_t pktgen_if_write(struct file *file, static int pktgen_if_open(struct inode *inode, struct file *file) { - return single_open(file, pktgen_if_show, PDE_DATA(inode)); + return single_open(file, pktgen_if_show, pde_data(inode)); } static const struct proc_ops pktgen_if_proc_ops = { @@ -1948,7 +1948,7 @@ out: static int pktgen_thread_open(struct inode *inode, struct file *file) { - return single_open(file, pktgen_thread_show, PDE_DATA(inode)); + return single_open(file, pktgen_thread_show, pde_data(inode)); } static const struct proc_ops pktgen_thread_proc_ops = { diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index b518f20c9a24..f8e176c77d1c 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -776,7 +776,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - struct clusterip_config *c = PDE_DATA(inode); + struct clusterip_config *c = pde_data(inode); sf->private = c; @@ -788,7 +788,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file) static int clusterip_proc_release(struct inode *inode, struct file *file) { - struct clusterip_config *c = PDE_DATA(inode); + struct clusterip_config *c = pde_data(inode); int ret; ret = seq_release(inode, file); @@ -802,7 +802,7 @@ static int clusterip_proc_release(struct inode *inode, struct file *file) static ssize_t clusterip_proc_write(struct file *file, const char __user *input, size_t size, loff_t *ofs) { - struct clusterip_config *c = PDE_DATA(file_inode(file)); + struct clusterip_config *c = pde_data(file_inode(file)); #define PROC_WRITELEN 10 char buffer[PROC_WRITELEN+1]; unsigned long nodenum; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index a53f256bf9d3..9eb5fc247868 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -971,7 +971,7 @@ struct proto raw_prot = { static struct sock *raw_get_first(struct seq_file *seq) { struct sock *sk; - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE; @@ -987,7 +987,7 @@ found: static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) { - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); do { @@ -1016,7 +1016,7 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) void *raw_seq_start(struct seq_file *seq, loff_t *pos) __acquires(&h->lock) { - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); read_lock(&h->lock); return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; @@ -1039,7 +1039,7 @@ EXPORT_SYMBOL_GPL(raw_seq_next); void raw_seq_stop(struct seq_file *seq, void *v) __releases(&h->lock) { - struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file)); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); read_unlock(&h->lock); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b3f34e366b27..b53476e78c84 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3002,7 +3002,7 @@ static unsigned short seq_file_family(const struct seq_file *seq) #endif /* Iterated from proc fs */ - afinfo = PDE_DATA(file_inode(seq->file)); + afinfo = pde_data(file_inode(seq->file)); return afinfo->family; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 464590ea922e..090360939401 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2960,7 +2960,7 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) if (state->bpf_seq_afinfo) afinfo = state->bpf_seq_afinfo; else - afinfo = PDE_DATA(file_inode(seq->file)); + afinfo = pde_data(file_inode(seq->file)); for (state->bucket = start; state->bucket <= afinfo->udp_table->mask; ++state->bucket) { @@ -2993,7 +2993,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) if (state->bpf_seq_afinfo) afinfo = state->bpf_seq_afinfo; else - afinfo = PDE_DATA(file_inode(seq->file)); + afinfo = pde_data(file_inode(seq->file)); do { sk = sk_next(sk); @@ -3050,7 +3050,7 @@ void udp_seq_stop(struct seq_file *seq, void *v) if (state->bpf_seq_afinfo) afinfo = state->bpf_seq_afinfo; else - afinfo = PDE_DATA(file_inode(seq->file)); + afinfo = pde_data(file_inode(seq->file)); if (state->bucket <= afinfo->udp_table->mask) spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 25524e393349..54a489f16b17 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1517,7 +1517,7 @@ EXPORT_SYMBOL_GPL(xt_unregister_table); #ifdef CONFIG_PROC_FS static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) { - u8 af = (unsigned long)PDE_DATA(file_inode(seq->file)); + u8 af = (unsigned long)pde_data(file_inode(seq->file)); struct net *net = seq_file_net(seq); struct xt_pernet *xt_net; @@ -1529,7 +1529,7 @@ static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - u8 af = (unsigned long)PDE_DATA(file_inode(seq->file)); + u8 af = (unsigned long)pde_data(file_inode(seq->file)); struct net *net = seq_file_net(seq); struct xt_pernet *xt_net; @@ -1540,7 +1540,7 @@ static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void xt_table_seq_stop(struct seq_file *seq, void *v) { - u_int8_t af = (unsigned long)PDE_DATA(file_inode(seq->file)); + u_int8_t af = (unsigned long)pde_data(file_inode(seq->file)); mutex_unlock(&xt[af].mutex); } @@ -1584,7 +1584,7 @@ static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos, [MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC, [MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE, }; - uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file)); + uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; if (ppos != NULL) @@ -1633,7 +1633,7 @@ static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos, static void xt_mttg_seq_stop(struct seq_file *seq, void *v) { - uint8_t nfproto = (unsigned long)PDE_DATA(file_inode(seq->file)); + uint8_t nfproto = (unsigned long)pde_data(file_inode(seq->file)); struct nf_mttg_trav *trav = seq->private; switch (trav->class) { diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 9c5cfd74a0ee..0859b8f76764 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -1052,7 +1052,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { static void *dl_seq_start(struct seq_file *s, loff_t *pos) __acquires(htable->lock) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket; spin_lock_bh(&htable->lock); @@ -1069,7 +1069,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos) static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; *pos = ++(*bucket); @@ -1083,7 +1083,7 @@ static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) static void dl_seq_stop(struct seq_file *s, void *v) __releases(htable->lock) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; if (!IS_ERR(bucket)) @@ -1125,7 +1125,7 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1140,7 +1140,7 @@ static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1155,7 +1155,7 @@ static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { - struct xt_hashlimit_htable *ht = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *ht = pde_data(file_inode(s->file)); spin_lock(&ent->lock); /* recalculate to show accurate numbers */ @@ -1169,7 +1169,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_show_v2(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = (unsigned int *)v; struct dsthash_ent *ent; @@ -1183,7 +1183,7 @@ static int dl_seq_show_v2(struct seq_file *s, void *v) static int dl_seq_show_v1(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; struct dsthash_ent *ent; @@ -1197,7 +1197,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v) static int dl_seq_show(struct seq_file *s, void *v) { - struct xt_hashlimit_htable *htable = PDE_DATA(file_inode(s->file)); + struct xt_hashlimit_htable *htable = pde_data(file_inode(s->file)); unsigned int *bucket = v; struct dsthash_ent *ent; diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 0446307516cd..7ddb9a78e3fc 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -551,7 +551,7 @@ static int recent_seq_open(struct inode *inode, struct file *file) if (st == NULL) return -ENOMEM; - st->table = PDE_DATA(inode); + st->table = pde_data(inode); return 0; } @@ -559,7 +559,7 @@ static ssize_t recent_mt_proc_write(struct file *file, const char __user *input, size_t size, loff_t *loff) { - struct recent_table *t = PDE_DATA(file_inode(file)); + struct recent_table *t = pde_data(file_inode(file)); struct recent_entry *e; char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; const char *c = buf; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index b87565b64928..c2ba9d4cd2c7 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1433,7 +1433,7 @@ static bool use_gss_proxy(struct net *net) static ssize_t write_gssp(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct net *net = PDE_DATA(file_inode(file)); + struct net *net = pde_data(file_inode(file)); char tbuf[20]; unsigned long i; int res; @@ -1461,7 +1461,7 @@ static ssize_t write_gssp(struct file *file, const char __user *buf, static ssize_t read_gssp(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct net *net = PDE_DATA(file_inode(file)); + struct net *net = pde_data(file_inode(file)); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); unsigned long p = *ppos; char tbuf[10]; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 59641803472c..bb1177395b99 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1536,7 +1536,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf, static ssize_t cache_read_procfs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return cache_read(filp, buf, count, ppos, cd); } @@ -1544,14 +1544,14 @@ static ssize_t cache_read_procfs(struct file *filp, char __user *buf, static ssize_t cache_write_procfs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return cache_write(filp, buf, count, ppos, cd); } static __poll_t cache_poll_procfs(struct file *filp, poll_table *wait) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return cache_poll(filp, wait, cd); } @@ -1560,21 +1560,21 @@ static long cache_ioctl_procfs(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return cache_ioctl(inode, filp, cmd, arg, cd); } static int cache_open_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return cache_open(inode, filp, cd); } static int cache_release_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return cache_release(inode, filp, cd); } @@ -1591,14 +1591,14 @@ static const struct proc_ops cache_channel_proc_ops = { static int content_open_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return content_open(inode, filp, cd); } static int content_release_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return content_release(inode, filp, cd); } @@ -1612,14 +1612,14 @@ static const struct proc_ops content_proc_ops = { static int open_flush_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return open_flush(inode, filp, cd); } static int release_flush_procfs(struct inode *inode, struct file *filp) { - struct cache_detail *cd = PDE_DATA(inode); + struct cache_detail *cd = pde_data(inode); return release_flush(inode, filp, cd); } @@ -1627,7 +1627,7 @@ static int release_flush_procfs(struct inode *inode, struct file *filp) static ssize_t read_flush_procfs(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return read_flush(filp, buf, count, ppos, cd); } @@ -1636,7 +1636,7 @@ static ssize_t write_flush_procfs(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { - struct cache_detail *cd = PDE_DATA(file_inode(filp)); + struct cache_detail *cd = pde_data(file_inode(filp)); return write_flush(filp, buf, count, ppos, cd); } diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index c964b48eaaba..52908f9e6eab 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -66,7 +66,7 @@ static int rpc_proc_show(struct seq_file *seq, void *v) { static int rpc_proc_open(struct inode *inode, struct file *file) { - return single_open(file, rpc_proc_show, PDE_DATA(inode)); + return single_open(file, rpc_proc_show, pde_data(inode)); } static const struct proc_ops rpc_proc_ops = { diff --git a/sound/core/info.c b/sound/core/info.c index a451b24199c3..782fba87cc04 100644 --- a/sound/core/info.c +++ b/sound/core/info.c @@ -234,7 +234,7 @@ static int snd_info_entry_mmap(struct file *file, struct vm_area_struct *vma) static int snd_info_entry_open(struct inode *inode, struct file *file) { - struct snd_info_entry *entry = PDE_DATA(inode); + struct snd_info_entry *entry = pde_data(inode); struct snd_info_private_data *data; int mode, err; @@ -365,7 +365,7 @@ static int snd_info_seq_show(struct seq_file *seq, void *p) static int snd_info_text_entry_open(struct inode *inode, struct file *file) { - struct snd_info_entry *entry = PDE_DATA(inode); + struct snd_info_entry *entry = pde_data(inode); struct snd_info_private_data *data; int err; -- cgit v1.2.3 From 6b9b6413700e104934734b72a3be622a76923b98 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Sat, 22 Jan 2022 09:17:10 -0500 Subject: ftrace: Fix assuming build time sort works for s390 To speed up the boot process, as mcount_loc needs to be sorted for ftrace to work properly, sorting it at build time is more efficient than boot up and can save milliseconds of time. Unfortunately, this change broke s390 as it will modify the mcount_loc location after the sorting takes place and will put back the unsorted locations. Since the sorting is skipped at boot up if it is believed that it was sorted at run time, ftrace can crash as its algorithms are dependent on the list being sorted. Add a new config BUILDTIME_MCOUNT_SORT that is set when BUILDTIME_TABLE_SORT but not if S390 is set. Use this config to determine if sorting should take place at boot up. Link: https://lore.kernel.org/all/yt9dee51ctfn.fsf@linux.ibm.com/ Fixes: 72b3942a173c ("scripts: ftrace - move the sort-processing in ftrace_init") Reported-by: Sven Schnelle Tested-by: Heiko Carstens Signed-off-by: Steven Rostedt (Google) --- kernel/trace/Kconfig | 9 ++++++++- kernel/trace/ftrace.c | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f468767bc287..752ed89a293b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -70,6 +70,13 @@ config HAVE_C_RECORDMCOUNT help C version of recordmcount available? +config BUILDTIME_MCOUNT_SORT + bool + default y + depends on BUILDTIME_TABLE_SORT && !S390 + help + Sort the mcount_loc section at build time. + config TRACER_MAX_TRACE bool @@ -918,7 +925,7 @@ config EVENT_TRACE_TEST_SYSCALLS config FTRACE_SORT_STARTUP_TEST bool "Verify compile time sorting of ftrace functions" depends on DYNAMIC_FTRACE - depends on BUILDTIME_TABLE_SORT + depends on BUILDTIME_MCOUNT_SORT help Sorting of the mcount_loc sections that is used to find the where the ftrace knows where to patch functions for tracing diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 403e485bf091..b01e1fa62193 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6429,10 +6429,10 @@ static int ftrace_process_locs(struct module *mod, /* * Sorting mcount in vmlinux at build time depend on - * CONFIG_BUILDTIME_TABLE_SORT, while mcount loc in + * CONFIG_BUILDTIME_MCOUNT_SORT, while mcount loc in * modules can not be sorted at build time. */ - if (!IS_ENABLED(CONFIG_BUILDTIME_TABLE_SORT) || mod) { + if (!IS_ENABLED(CONFIG_BUILDTIME_MCOUNT_SORT) || mod) { sort(start, count, sizeof(*start), ftrace_cmp_ips, NULL); } else { -- cgit v1.2.3