diff options
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/bpf/btf.c | 82 | ||||
| -rw-r--r-- | kernel/bpf/core.c | 34 | ||||
| -rw-r--r-- | kernel/bpf/local_storage.c | 3 | ||||
| -rw-r--r-- | kernel/bpf/queue_stack_maps.c | 16 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 105 | ||||
| -rw-r--r-- | kernel/cpu.c | 15 | ||||
| -rw-r--r-- | kernel/dma/swiotlb.c | 3 | ||||
| -rw-r--r-- | kernel/events/uprobes.c | 14 | ||||
| -rw-r--r-- | kernel/kcov.c | 4 | ||||
| -rw-r--r-- | kernel/ptrace.c | 10 | ||||
| -rw-r--r-- | kernel/sched/core.c | 19 | ||||
| -rw-r--r-- | kernel/sched/psi.c | 30 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 4 | ||||
| -rw-r--r-- | kernel/sched/stats.h | 8 | ||||
| -rw-r--r-- | kernel/stackleak.c | 6 | ||||
| -rw-r--r-- | kernel/trace/bpf_trace.c | 8 | ||||
| -rw-r--r-- | kernel/trace/ftrace.c | 8 | ||||
| -rw-r--r-- | kernel/trace/trace.h | 57 | ||||
| -rw-r--r-- | kernel/trace/trace_events_filter.c | 5 | ||||
| -rw-r--r-- | kernel/trace/trace_events_trigger.c | 6 | ||||
| -rw-r--r-- | kernel/trace/trace_functions_graph.c | 53 | ||||
| -rw-r--r-- | kernel/trace/trace_irqsoff.c | 2 | ||||
| -rw-r--r-- | kernel/trace/trace_sched_wakeup.c | 2 | 
23 files changed, 401 insertions, 93 deletions
| diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ee4c82667d65..4da543d6bea2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5,6 +5,7 @@  #include <uapi/linux/types.h>  #include <linux/seq_file.h>  #include <linux/compiler.h> +#include <linux/ctype.h>  #include <linux/errno.h>  #include <linux/slab.h>  #include <linux/anon_inodes.h> @@ -426,6 +427,30 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset)  		offset < btf->hdr.str_len;  } +/* Only C-style identifier is permitted. This can be relaxed if + * necessary. + */ +static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +{ +	/* offset must be valid */ +	const char *src = &btf->strings[offset]; +	const char *src_limit; + +	if (!isalpha(*src) && *src != '_') +		return false; + +	/* set a limit on identifier length */ +	src_limit = src + KSYM_NAME_LEN; +	src++; +	while (*src && src < src_limit) { +		if (!isalnum(*src) && *src != '_') +			return false; +		src++; +	} + +	return !*src; +} +  static const char *btf_name_by_offset(const struct btf *btf, u32 offset)  {  	if (!offset) @@ -1143,6 +1168,22 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,  		return -EINVAL;  	} +	/* typedef type must have a valid name, and other ref types, +	 * volatile, const, restrict, should have a null name. 
+	 */ +	if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) { +		if (!t->name_off || +		    !btf_name_valid_identifier(env->btf, t->name_off)) { +			btf_verifier_log_type(env, t, "Invalid name"); +			return -EINVAL; +		} +	} else { +		if (t->name_off) { +			btf_verifier_log_type(env, t, "Invalid name"); +			return -EINVAL; +		} +	} +  	btf_verifier_log_type(env, t, NULL);  	return 0; @@ -1300,6 +1341,13 @@ static s32 btf_fwd_check_meta(struct btf_verifier_env *env,  		return -EINVAL;  	} +	/* fwd type must have a valid name */ +	if (!t->name_off || +	    !btf_name_valid_identifier(env->btf, t->name_off)) { +		btf_verifier_log_type(env, t, "Invalid name"); +		return -EINVAL; +	} +  	btf_verifier_log_type(env, t, NULL);  	return 0; @@ -1356,6 +1404,12 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,  		return -EINVAL;  	} +	/* array type should not have a name */ +	if (t->name_off) { +		btf_verifier_log_type(env, t, "Invalid name"); +		return -EINVAL; +	} +  	if (btf_type_vlen(t)) {  		btf_verifier_log_type(env, t, "vlen != 0");  		return -EINVAL; @@ -1532,6 +1586,13 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,  		return -EINVAL;  	} +	/* struct type either no name or a valid one */ +	if (t->name_off && +	    !btf_name_valid_identifier(env->btf, t->name_off)) { +		btf_verifier_log_type(env, t, "Invalid name"); +		return -EINVAL; +	} +  	btf_verifier_log_type(env, t, NULL);  	last_offset = 0; @@ -1543,6 +1604,12 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,  			return -EINVAL;  		} +		/* struct member either no name or a valid one */ +		if (member->name_off && +		    !btf_name_valid_identifier(btf, member->name_off)) { +			btf_verifier_log_member(env, t, member, "Invalid name"); +			return -EINVAL; +		}  		/* A member cannot be in type void */  		if (!member->type || !BTF_TYPE_ID_VALID(member->type)) {  			btf_verifier_log_member(env, t, member, @@ -1730,6 +1797,13 @@ static s32 btf_enum_check_meta(struct 
btf_verifier_env *env,  		return -EINVAL;  	} +	/* enum type either no name or a valid one */ +	if (t->name_off && +	    !btf_name_valid_identifier(env->btf, t->name_off)) { +		btf_verifier_log_type(env, t, "Invalid name"); +		return -EINVAL; +	} +  	btf_verifier_log_type(env, t, NULL);  	for (i = 0; i < nr_enums; i++) { @@ -1739,6 +1813,14 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,  			return -EINVAL;  		} +		/* enum member must have a valid name */ +		if (!enums[i].name_off || +		    !btf_name_valid_identifier(btf, enums[i].name_off)) { +			btf_verifier_log_type(env, t, "Invalid name"); +			return -EINVAL; +		} + +  		btf_verifier_log(env, "\t%s val=%d\n",  				 btf_name_by_offset(btf, enums[i].name_off),  				 enums[i].val); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 1a796e0799ec..b1a3545d0ec8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -672,6 +672,40 @@ void __weak bpf_jit_free(struct bpf_prog *fp)  	bpf_prog_unlock_free(fp);  } +int bpf_jit_get_func_addr(const struct bpf_prog *prog, +			  const struct bpf_insn *insn, bool extra_pass, +			  u64 *func_addr, bool *func_addr_fixed) +{ +	s16 off = insn->off; +	s32 imm = insn->imm; +	u8 *addr; + +	*func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; +	if (!*func_addr_fixed) { +		/* Place-holder address till the last pass has collected +		 * all addresses for JITed subprograms in which case we +		 * can pick them up from prog->aux. +		 */ +		if (!extra_pass) +			addr = NULL; +		else if (prog->aux->func && +			 off >= 0 && off < prog->aux->func_cnt) +			addr = (u8 *)prog->aux->func[off]->bpf_func; +		else +			return -EINVAL; +	} else { +		/* Address of a BPF helper call. Since part of the core +		 * kernel, it's always at a fixed location. __bpf_call_base +		 * and the helper with imm relative to it are both in core +		 * kernel. 
+		 */ +		addr = (u8 *)__bpf_call_base + imm; +	} + +	*func_addr = (unsigned long)addr; +	return 0; +} +  static int bpf_jit_blind_insn(const struct bpf_insn *from,  			      const struct bpf_insn *aux,  			      struct bpf_insn *to_buff) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index c97a8f968638..bed9d48a7ae9 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -139,7 +139,8 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,  		return -ENOENT;  	new = kmalloc_node(sizeof(struct bpf_storage_buffer) + -			   map->value_size, __GFP_ZERO | GFP_USER, +			   map->value_size, +			   __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,  			   map->numa_node);  	if (!new)  		return -ENOMEM; diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 8bbd72d3a121..b384ea9f3254 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -7,6 +7,7 @@  #include <linux/bpf.h>  #include <linux/list.h>  #include <linux/slab.h> +#include <linux/capability.h>  #include "percpu_freelist.h"  #define QUEUE_STACK_CREATE_FLAG_MASK \ @@ -45,8 +46,12 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)  /* Called from syscall */  static int queue_stack_map_alloc_check(union bpf_attr *attr)  { +	if (!capable(CAP_SYS_ADMIN)) +		return -EPERM; +  	/* check sanity of attributes */  	if (attr->max_entries == 0 || attr->key_size != 0 || +	    attr->value_size == 0 ||  	    attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK)  		return -EINVAL; @@ -63,15 +68,10 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)  {  	int ret, numa_node = bpf_map_attr_numa_node(attr);  	struct bpf_queue_stack *qs; -	u32 size, value_size; -	u64 queue_size, cost; - -	size = attr->max_entries + 1; -	value_size = attr->value_size; - -	queue_size = sizeof(*qs) + (u64) value_size * size; +	u64 size, queue_size, cost; -	cost = queue_size; +	size = (u64) attr->max_entries + 1; +	cost = 
queue_size = sizeof(*qs) + size * attr->value_size;  	if (cost >= U32_MAX - PAGE_SIZE)  		return ERR_PTR(-E2BIG); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1971ca325fb4..fc760d00a38c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -175,6 +175,7 @@ struct bpf_verifier_stack_elem {  #define BPF_COMPLEXITY_LIMIT_INSNS	131072  #define BPF_COMPLEXITY_LIMIT_STACK	1024 +#define BPF_COMPLEXITY_LIMIT_STATES	64  #define BPF_MAP_PTR_UNPRIV	1UL  #define BPF_MAP_PTR_POISON	((void *)((0xeB9FUL << 1) +	\ @@ -3751,6 +3752,79 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,  	}  } +/* compute branch direction of the expression "if (reg opcode val) goto target;" + * and return: + *  1 - branch will be taken and "goto target" will be executed + *  0 - branch will not be taken and fall-through to next insn + * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10] + */ +static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) +{ +	if (__is_pointer_value(false, reg)) +		return -1; + +	switch (opcode) { +	case BPF_JEQ: +		if (tnum_is_const(reg->var_off)) +			return !!tnum_equals_const(reg->var_off, val); +		break; +	case BPF_JNE: +		if (tnum_is_const(reg->var_off)) +			return !tnum_equals_const(reg->var_off, val); +		break; +	case BPF_JGT: +		if (reg->umin_value > val) +			return 1; +		else if (reg->umax_value <= val) +			return 0; +		break; +	case BPF_JSGT: +		if (reg->smin_value > (s64)val) +			return 1; +		else if (reg->smax_value < (s64)val) +			return 0; +		break; +	case BPF_JLT: +		if (reg->umax_value < val) +			return 1; +		else if (reg->umin_value >= val) +			return 0; +		break; +	case BPF_JSLT: +		if (reg->smax_value < (s64)val) +			return 1; +		else if (reg->smin_value >= (s64)val) +			return 0; +		break; +	case BPF_JGE: +		if (reg->umin_value >= val) +			return 1; +		else if (reg->umax_value < val) +			return 0; +		break; +	case BPF_JSGE: +		if (reg->smin_value >= 
(s64)val) +			return 1; +		else if (reg->smax_value < (s64)val) +			return 0; +		break; +	case BPF_JLE: +		if (reg->umax_value <= val) +			return 1; +		else if (reg->umin_value > val) +			return 0; +		break; +	case BPF_JSLE: +		if (reg->smax_value <= (s64)val) +			return 1; +		else if (reg->smin_value > (s64)val) +			return 0; +		break; +	} + +	return -1; +} +  /* Adjusts the register min/max values in the case that the dst_reg is the   * variable register that we are working on, and src_reg is a constant or we're   * simply doing a BPF_K check. @@ -4152,21 +4226,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,  	dst_reg = &regs[insn->dst_reg]; -	/* detect if R == 0 where R was initialized to zero earlier */ -	if (BPF_SRC(insn->code) == BPF_K && -	    (opcode == BPF_JEQ || opcode == BPF_JNE) && -	    dst_reg->type == SCALAR_VALUE && -	    tnum_is_const(dst_reg->var_off)) { -		if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) || -		    (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) { -			/* if (imm == imm) goto pc+off; -			 * only follow the goto, ignore fall-through -			 */ +	if (BPF_SRC(insn->code) == BPF_K) { +		int pred = is_branch_taken(dst_reg, insn->imm, opcode); + +		if (pred == 1) { +			 /* only follow the goto, ignore fall-through */  			*insn_idx += insn->off;  			return 0; -		} else { -			/* if (imm != imm) goto pc+off; -			 * only follow fall-through branch, since +		} else if (pred == 0) { +			/* only follow fall-through branch, since  			 * that's where the program will go  			 */  			return 0; @@ -4980,7 +5048,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  	struct bpf_verifier_state_list *new_sl;  	struct bpf_verifier_state_list *sl;  	struct bpf_verifier_state *cur = env->cur_state, *new; -	int i, j, err; +	int i, j, err, states_cnt = 0;  	sl = env->explored_states[insn_idx];  	if (!sl) @@ -5007,8 +5075,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)  
			return 1;  		}  		sl = sl->next; +		states_cnt++;  	} +	if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) +		return 0; +  	/* there were no equivalent states, remember current one.  	 * technically the current state is not proven to be safe yet,  	 * but it will either reach outer most bpf_exit (which means it's safe) @@ -5148,6 +5220,9 @@ static int do_check(struct bpf_verifier_env *env)  			goto process_bpf_exit;  		} +		if (signal_pending(current)) +			return -EAGAIN; +  		if (need_resched())  			cond_resched(); @@ -5650,7 +5725,7 @@ static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len  		return;  	/* NOTE: fake 'exit' subprog should be updated as well. */  	for (i = 0; i <= env->subprog_cnt; i++) { -		if (env->subprog_info[i].start < off) +		if (env->subprog_info[i].start <= off)  			continue;  		env->subprog_info[i].start += len - 1;  	} diff --git a/kernel/cpu.c b/kernel/cpu.c index 3c7f3b4c453c..91d5c38eb7e5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -10,6 +10,7 @@  #include <linux/sched/signal.h>  #include <linux/sched/hotplug.h>  #include <linux/sched/task.h> +#include <linux/sched/smt.h>  #include <linux/unistd.h>  #include <linux/cpu.h>  #include <linux/oom.h> @@ -367,6 +368,12 @@ static void lockdep_release_cpus_lock(void)  #endif	/* CONFIG_HOTPLUG_CPU */ +/* + * Architectures that need SMT-specific errata handling during SMT hotplug + * should override this. + */ +void __weak arch_smt_update(void) { } +  #ifdef CONFIG_HOTPLUG_SMT  enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;  EXPORT_SYMBOL_GPL(cpu_smt_control); @@ -1011,6 +1018,7 @@ out:  	 * concurrent CPU hotplug via cpu_add_remove_lock.  	 
*/  	lockup_detector_cleanup(); +	arch_smt_update();  	return ret;  } @@ -1139,6 +1147,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)  	ret = cpuhp_up_callbacks(cpu, st, target);  out:  	cpus_write_unlock(); +	arch_smt_update();  	return ret;  } @@ -2055,12 +2064,6 @@ static void cpuhp_online_cpu_device(unsigned int cpu)  	kobject_uevent(&dev->kobj, KOBJ_ONLINE);  } -/* - * Architectures that need SMT-specific errata handling during SMT hotplug - * should override this. - */ -void __weak arch_smt_update(void) { }; -  static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)  {  	int cpu, ret = 0; diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 5731daa09a32..045930e32c0e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -679,7 +679,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,  	}  	if (!dev_is_dma_coherent(dev) && -	    (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0) +	    (attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0 && +	    dev_addr != DIRECT_MAPPING_ERROR)  		arch_sync_dma_for_device(dev, phys, size, dir);  	return dev_addr; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 96d4bee83489..abbd8da9ac21 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -572,7 +572,9 @@ static void put_uprobe(struct uprobe *uprobe)  		 * gets called, we don't get a chance to remove uprobe from  		 * delayed_uprobe_list from remove_breakpoint(). Do it here.  		 
*/ +		mutex_lock(&delayed_uprobe_lock);  		delayed_uprobe_remove(uprobe, NULL); +		mutex_unlock(&delayed_uprobe_lock);  		kfree(uprobe);  	}  } @@ -829,7 +831,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,  	BUG_ON((uprobe->offset & ~PAGE_MASK) +  			UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); -	smp_wmb(); /* pairs with rmb() in find_active_uprobe() */ +	smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */  	set_bit(UPROBE_COPY_INSN, &uprobe->flags);   out: @@ -2178,10 +2180,18 @@ static void handle_swbp(struct pt_regs *regs)  	 * After we hit the bp, _unregister + _register can install the  	 * new and not-yet-analyzed uprobe at the same address, restart.  	 */ -	smp_rmb(); /* pairs with wmb() in install_breakpoint() */  	if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))  		goto out; +	/* +	 * Pairs with the smp_wmb() in prepare_uprobe(). +	 * +	 * Guarantees that if we see the UPROBE_COPY_INSN bit set, then +	 * we must also see the stores to &uprobe->arch performed by the +	 * prepare_uprobe() call. 
+	 */ +	smp_rmb(); +  	/* Tracing handlers use ->utask to communicate with fetch methods */  	if (!get_utask())  		goto out; diff --git a/kernel/kcov.c b/kernel/kcov.c index 3ebd09efe72a..97959d7b77e2 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -56,7 +56,7 @@ struct kcov {  	struct task_struct	*t;  }; -static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) +static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)  {  	unsigned int mode; @@ -78,7 +78,7 @@ static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)  	return mode == needed_mode;  } -static unsigned long canonicalize_ip(unsigned long ip) +static notrace unsigned long canonicalize_ip(unsigned long ip)  {  #ifdef CONFIG_RANDOMIZE_BASE  	ip -= kaslr_offset(); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 80b34dffdfb9..c2cee9db5204 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -261,9 +261,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)  static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)  { -	if (mode & PTRACE_MODE_SCHED) -		return false; -  	if (mode & PTRACE_MODE_NOAUDIT)  		return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);  	else @@ -331,16 +328,9 @@ ok:  	     !ptrace_has_cap(mm->user_ns, mode)))  	    return -EPERM; -	if (mode & PTRACE_MODE_SCHED) -		return 0;  	return security_ptrace_access_check(task, mode);  } -bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode) -{ -	return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED); -} -  bool ptrace_may_access(struct task_struct *task, unsigned int mode)  {  	int err; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 091e089063be..6fedf3a98581 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5738,15 +5738,10 @@ int sched_cpu_activate(unsigned int cpu)  #ifdef CONFIG_SCHED_SMT  	/* -	 * The sched_smt_present static key needs to be evaluated on every -	
 * hotplug event because at boot time SMT might be disabled when -	 * the number of booted CPUs is limited. -	 * -	 * If then later a sibling gets hotplugged, then the key would stay -	 * off and SMT scheduling would never be functional. +	 * When going up, increment the number of cores with SMT present.  	 */ -	if (cpumask_weight(cpu_smt_mask(cpu)) > 1) -		static_branch_enable_cpuslocked(&sched_smt_present); +	if (cpumask_weight(cpu_smt_mask(cpu)) == 2) +		static_branch_inc_cpuslocked(&sched_smt_present);  #endif  	set_cpu_active(cpu, true); @@ -5790,6 +5785,14 @@ int sched_cpu_deactivate(unsigned int cpu)  	 */  	synchronize_rcu_mult(call_rcu, call_rcu_sched); +#ifdef CONFIG_SCHED_SMT +	/* +	 * When going down, decrement the number of cores with SMT present. +	 */ +	if (cpumask_weight(cpu_smt_mask(cpu)) == 2) +		static_branch_dec_cpuslocked(&sched_smt_present); +#endif +  	if (!sched_smp_initialized)  		return 0; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 3d7355d7c3e3..fe24de3fbc93 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -136,8 +136,18 @@  static int psi_bug __read_mostly; -bool psi_disabled __read_mostly; -core_param(psi_disabled, psi_disabled, bool, 0644); +DEFINE_STATIC_KEY_FALSE(psi_disabled); + +#ifdef CONFIG_PSI_DEFAULT_DISABLED +bool psi_enable; +#else +bool psi_enable = true; +#endif +static int __init setup_psi(char *str) +{ +	return kstrtobool(str, &psi_enable) == 0; +} +__setup("psi=", setup_psi);  /* Running averages - we need to be higher-res than loadavg */  #define PSI_FREQ	(2*HZ+1)	/* 2 sec intervals */ @@ -169,8 +179,10 @@ static void group_init(struct psi_group *group)  void __init psi_init(void)  { -	if (psi_disabled) +	if (!psi_enable) { +		static_branch_enable(&psi_disabled);  		return; +	}  	psi_period = jiffies_to_nsecs(PSI_FREQ);  	group_init(&psi_system); @@ -549,7 +561,7 @@ void psi_memstall_enter(unsigned long *flags)  	struct rq_flags rf;  	struct rq *rq; -	if (psi_disabled) +	if 
(static_branch_likely(&psi_disabled))  		return;  	*flags = current->flags & PF_MEMSTALL; @@ -579,7 +591,7 @@ void psi_memstall_leave(unsigned long *flags)  	struct rq_flags rf;  	struct rq *rq; -	if (psi_disabled) +	if (static_branch_likely(&psi_disabled))  		return;  	if (*flags) @@ -600,7 +612,7 @@ void psi_memstall_leave(unsigned long *flags)  #ifdef CONFIG_CGROUPS  int psi_cgroup_alloc(struct cgroup *cgroup)  { -	if (psi_disabled) +	if (static_branch_likely(&psi_disabled))  		return 0;  	cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu); @@ -612,7 +624,7 @@ int psi_cgroup_alloc(struct cgroup *cgroup)  void psi_cgroup_free(struct cgroup *cgroup)  { -	if (psi_disabled) +	if (static_branch_likely(&psi_disabled))  		return;  	cancel_delayed_work_sync(&cgroup->psi.clock_work); @@ -637,7 +649,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)  	struct rq_flags rf;  	struct rq *rq; -	if (psi_disabled) { +	if (static_branch_likely(&psi_disabled)) {  		/*  		 * Lame to do this here, but the scheduler cannot be locked  		 * from the outside, so we move cgroups from inside sched/. 
@@ -673,7 +685,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)  {  	int full; -	if (psi_disabled) +	if (static_branch_likely(&psi_disabled))  		return -EOPNOTSUPP;  	update_stats(group); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 618577fc9aa8..4e524ab589c9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -23,6 +23,7 @@  #include <linux/sched/prio.h>  #include <linux/sched/rt.h>  #include <linux/sched/signal.h> +#include <linux/sched/smt.h>  #include <linux/sched/stat.h>  #include <linux/sched/sysctl.h>  #include <linux/sched/task.h> @@ -936,9 +937,6 @@ static inline int cpu_of(struct rq *rq)  #ifdef CONFIG_SCHED_SMT - -extern struct static_key_false sched_smt_present; -  extern void __update_idle_core(struct rq *rq);  static inline void update_idle_core(struct rq *rq) diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 4904c4677000..aa0de240fb41 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -66,7 +66,7 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)  {  	int clear = 0, set = TSK_RUNNING; -	if (psi_disabled) +	if (static_branch_likely(&psi_disabled))  		return;  	if (!wakeup || p->sched_psi_wake_requeue) { @@ -86,7 +86,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)  {  	int clear = TSK_RUNNING, set = 0; -	if (psi_disabled) +	if (static_branch_likely(&psi_disabled))  		return;  	if (!sleep) { @@ -102,7 +102,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)  static inline void psi_ttwu_dequeue(struct task_struct *p)  { -	if (psi_disabled) +	if (static_branch_likely(&psi_disabled))  		return;  	/*  	 * Is the task being migrated during a wakeup? 
Make sure to @@ -128,7 +128,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p)  static inline void psi_task_tick(struct rq *rq)  { -	if (psi_disabled) +	if (static_branch_likely(&psi_disabled))  		return;  	if (unlikely(rq->curr->flags & PF_MEMSTALL)) diff --git a/kernel/stackleak.c b/kernel/stackleak.c index e42892926244..b193a59fc05b 100644 --- a/kernel/stackleak.c +++ b/kernel/stackleak.c @@ -11,6 +11,7 @@   */  #include <linux/stackleak.h> +#include <linux/kprobes.h>  #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE  #include <linux/jump_label.h> @@ -47,7 +48,7 @@ int stack_erasing_sysctl(struct ctl_table *table, int write,  #define skip_erasing()	false  #endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */ -asmlinkage void stackleak_erase(void) +asmlinkage void notrace stackleak_erase(void)  {  	/* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */  	unsigned long kstack_ptr = current->lowest_stack; @@ -101,8 +102,9 @@ asmlinkage void stackleak_erase(void)  	/* Reset the 'lowest_stack' value for the next syscall */  	current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64;  } +NOKPROBE_SYMBOL(stackleak_erase); -void __used stackleak_track_stack(void) +void __used notrace stackleak_track_stack(void)  {  	/*  	 * N.B. 
stackleak_erase() fills the kernel stack with the poison value, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 08fcfe440c63..9864a35c8bb5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -196,11 +196,13 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,  			i++;  		} else if (fmt[i] == 'p' || fmt[i] == 's') {  			mod[fmt_cnt]++; -			i++; -			if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) +			/* disallow any further format extensions */ +			if (fmt[i + 1] != 0 && +			    !isspace(fmt[i + 1]) && +			    !ispunct(fmt[i + 1]))  				return -EINVAL;  			fmt_cnt++; -			if (fmt[i - 1] == 's') { +			if (fmt[i] == 's') {  				if (str_seen)  					/* allow only one '%s' per fmt string */  					return -EINVAL; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f536f601bd46..e23eb9fc77aa 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -817,7 +817,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  static int profile_graph_entry(struct ftrace_graph_ent *trace)  { -	int index = trace->depth; +	int index = current->curr_ret_stack;  	function_profile_call(trace->func, 0, NULL, NULL); @@ -852,7 +852,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)  	if (!fgraph_graph_time) {  		int index; -		index = trace->depth; +		index = current->curr_ret_stack;  		/* Append this call time to the parent time to subtract */  		if (index) @@ -5460,6 +5460,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops)  	if (ops->flags & FTRACE_OPS_FL_ENABLED)  		ftrace_shutdown(ops, 0);  	ops->flags |= FTRACE_OPS_FL_DELETED; +	ftrace_free_filter(ops);  	mutex_unlock(&ftrace_lock);  } @@ -6814,6 +6815,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)  			atomic_set(&t->tracing_graph_pause, 0);  			atomic_set(&t->trace_overrun, 0);  			t->curr_ret_stack = -1; +			t->curr_ret_depth = -1;  		
	/* Make sure the tasks see the -1 first: */  			smp_wmb();  			t->ret_stack = ret_stack_list[start++]; @@ -7038,6 +7040,7 @@ graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)  void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)  {  	t->curr_ret_stack = -1; +	t->curr_ret_depth = -1;  	/*  	 * The idle task has no parent, it either has its own  	 * stack or no stack at all. @@ -7068,6 +7071,7 @@ void ftrace_graph_init_task(struct task_struct *t)  	/* Make sure we do not use the parent ret_stack */  	t->ret_stack = NULL;  	t->curr_ret_stack = -1; +	t->curr_ret_depth = -1;  	if (ftrace_graph_active) {  		struct ftrace_ret_stack *ret_stack; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3b8c0e24ab30..447bd96ee658 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -512,12 +512,44 @@ enum {   * can only be modified by current, we can reuse trace_recursion.   */  	TRACE_IRQ_BIT, + +	/* Set if the function is in the set_graph_function file */ +	TRACE_GRAPH_BIT, + +	/* +	 * In the very unlikely case that an interrupt came in +	 * at a start of graph tracing, and we want to trace +	 * the function in that interrupt, the depth can be greater +	 * than zero, because of the preempted start of a previous +	 * trace. In an even more unlikely case, depth could be 2 +	 * if a softirq interrupted the start of graph tracing, +	 * followed by an interrupt preempting a start of graph +	 * tracing in the softirq, and depth can even be 3 +	 * if an NMI came in at the start of an interrupt function +	 * that preempted a softirq start of a function that +	 * preempted normal context!!!! 
Luckily, it can't be +	 * greater than 3, so the next two bits are a mask +	 * of what the depth is when we set TRACE_GRAPH_BIT +	 */ + +	TRACE_GRAPH_DEPTH_START_BIT, +	TRACE_GRAPH_DEPTH_END_BIT,  };  #define trace_recursion_set(bit)	do { (current)->trace_recursion |= (1<<(bit)); } while (0)  #define trace_recursion_clear(bit)	do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)  #define trace_recursion_test(bit)	((current)->trace_recursion & (1<<(bit))) +#define trace_recursion_depth() \ +	(((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) +#define trace_recursion_set_depth(depth) \ +	do {								\ +		current->trace_recursion &=				\ +			~(3 << TRACE_GRAPH_DEPTH_START_BIT);		\ +		current->trace_recursion |=				\ +			((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT;	\ +	} while (0) +  #define TRACE_CONTEXT_BITS	4  #define TRACE_FTRACE_START	TRACE_FTRACE_BIT @@ -843,8 +875,9 @@ extern void __trace_graph_return(struct trace_array *tr,  extern struct ftrace_hash *ftrace_graph_hash;  extern struct ftrace_hash *ftrace_graph_notrace_hash; -static inline int ftrace_graph_addr(unsigned long addr) +static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)  { +	unsigned long addr = trace->func;  	int ret = 0;  	preempt_disable_notrace(); @@ -855,6 +888,14 @@ static inline int ftrace_graph_addr(unsigned long addr)  	}  	if (ftrace_lookup_ip(ftrace_graph_hash, addr)) { + +		/* +		 * This needs to be cleared on the return functions +		 * when the depth is zero. 
+		 */ +		trace_recursion_set(TRACE_GRAPH_BIT); +		trace_recursion_set_depth(trace->depth); +  		/*  		 * If no irqs are to be traced, but a set_graph_function  		 * is set, and called by an interrupt handler, we still @@ -872,6 +913,13 @@ out:  	return ret;  } +static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) +{ +	if (trace_recursion_test(TRACE_GRAPH_BIT) && +	    trace->depth == trace_recursion_depth()) +		trace_recursion_clear(TRACE_GRAPH_BIT); +} +  static inline int ftrace_graph_notrace_addr(unsigned long addr)  {  	int ret = 0; @@ -885,7 +933,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)  	return ret;  }  #else -static inline int ftrace_graph_addr(unsigned long addr) +static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)  {  	return 1;  } @@ -894,6 +942,8 @@ static inline int ftrace_graph_notrace_addr(unsigned long addr)  {  	return 0;  } +static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) +{ }  #endif /* CONFIG_DYNAMIC_FTRACE */  extern unsigned int fgraph_max_depth; @@ -901,7 +951,8 @@ extern unsigned int fgraph_max_depth;  static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)  {  	/* trace it when it is-nested-in or is a function enabled. 
*/ -	return !(trace->depth || ftrace_graph_addr(trace->func)) || +	return !(trace_recursion_test(TRACE_GRAPH_BIT) || +		 ftrace_graph_addr(trace)) ||  		(trace->depth < 0) ||  		(fgraph_max_depth && trace->depth >= fgraph_max_depth);  } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 84a65173b1e9..5574e862de8d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -570,11 +570,13 @@ predicate_parse(const char *str, int nr_parens, int nr_preds,  		}  	} +	kfree(op_stack); +	kfree(inverts);  	return prog;  out_free:  	kfree(op_stack); -	kfree(prog_stack);  	kfree(inverts); +	kfree(prog_stack);  	return ERR_PTR(ret);  } @@ -1718,6 +1720,7 @@ static int create_filter(struct trace_event_call *call,  	err = process_preds(call, filter_string, *filterp, pe);  	if (err && set_str)  		append_filter_err(pe, *filterp); +	create_filter_finish(pe);  	return err;  } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 2152d1e530cb..cd12ecb66eb9 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -732,8 +732,10 @@ int set_trigger_filter(char *filter_str,  	/* The filter is for the 'trigger' event, not the triggered event */  	ret = create_event_filter(file->event_call, filter_str, false, &filter); -	if (ret) -		goto out; +	/* +	 * If create_event_filter() fails, filter still needs to be freed. +	 * Which the calling code will do with data->filter. 
+	 */   assign:  	tmp = rcu_access_pointer(data->filter); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 169b3c44ee97..086af4f5c3e8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -118,8 +118,8 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration,  		     struct trace_seq *s, u32 flags);  /* Add a function return address to the trace stack on thread info.*/ -int -ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, +static int +ftrace_push_return_trace(unsigned long ret, unsigned long func,  			 unsigned long frame_pointer, unsigned long *retp)  {  	unsigned long long calltime; @@ -177,9 +177,31 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,  #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR  	current->ret_stack[index].retp = retp;  #endif -	*depth = current->curr_ret_stack; +	return 0; +} + +int function_graph_enter(unsigned long ret, unsigned long func, +			 unsigned long frame_pointer, unsigned long *retp) +{ +	struct ftrace_graph_ent trace; + +	trace.func = func; +	trace.depth = ++current->curr_ret_depth; + +	if (ftrace_push_return_trace(ret, func, +				     frame_pointer, retp)) +		goto out; + +	/* Only trace if the calling function expects to */ +	if (!ftrace_graph_entry(&trace)) +		goto out_ret;  	return 0; + out_ret: +	current->curr_ret_stack--; + out: +	current->curr_ret_depth--; +	return -EBUSY;  }  /* Retrieve a function return address to the trace stack on thread info.*/ @@ -241,7 +263,13 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,  	trace->func = current->ret_stack[index].func;  	trace->calltime = current->ret_stack[index].calltime;  	trace->overrun = atomic_read(¤t->trace_overrun); -	trace->depth = index; +	trace->depth = current->curr_ret_depth--; +	/* +	 * We still want to trace interrupts coming in if +	 * max_depth is set to 1. 
Make sure the decrement is +	 * seen before ftrace_graph_return. +	 */ +	barrier();  }  /* @@ -255,6 +283,12 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)  	ftrace_pop_return_trace(&trace, &ret, frame_pointer);  	trace.rettime = trace_clock_local(); +	ftrace_graph_return(&trace); +	/* +	 * The ftrace_graph_return() may still access the current +	 * ret_stack structure, we need to make sure the update of +	 * curr_ret_stack is after that. +	 */  	barrier();  	current->curr_ret_stack--;  	/* @@ -267,13 +301,6 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)  		return ret;  	} -	/* -	 * The trace should run after decrementing the ret counter -	 * in case an interrupt were to come in. We don't want to -	 * lose the interrupt if max_depth is set. -	 */ -	ftrace_graph_return(&trace); -  	if (unlikely(!ret)) {  		ftrace_graph_stop();  		WARN_ON(1); @@ -482,6 +509,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace)  	int cpu;  	int pc; +	ftrace_graph_addr_finish(trace); +  	local_irq_save(flags);  	cpu = raw_smp_processor_id();  	data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -505,6 +534,8 @@ void set_graph_array(struct trace_array *tr)  static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)  { +	ftrace_graph_addr_finish(trace); +  	if (tracing_thresh &&  	    (trace->rettime - trace->calltime < tracing_thresh))  		return; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b7357f9f82a3..98ea6d28df15 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -208,6 +208,8 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)  	unsigned long flags;  	int pc; +	ftrace_graph_addr_finish(trace); +  	if (!func_prolog_dec(tr, &data, &flags))  		return; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index a86b303e6c67..7d04b9890755 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ 
b/kernel/trace/trace_sched_wakeup.c @@ -270,6 +270,8 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace)  	unsigned long flags;  	int pc; +	ftrace_graph_addr_finish(trace); +  	if (!func_prolog_preempt_disable(tr, &data, &pc))  		return; | 
