Diffstat (limited to 'kernel')
102 files changed, 3323 insertions, 2651 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index fbba478ae522..bf770d7556f7 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -229,7 +229,7 @@ config MUTEX_SPIN_ON_OWNER  config RWSEM_SPIN_ON_OWNER         def_bool y -       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW +       depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW  config LOCK_SPIN_ON_OWNER         def_bool y @@ -251,3 +251,10 @@ config ARCH_USE_QUEUED_RWLOCKS  config QUEUED_RWLOCKS  	def_bool y if ARCH_USE_QUEUED_RWLOCKS  	depends on SMP + +config ARCH_HAS_MMIOWB +	bool + +config MMIOWB +	def_bool y if ARCH_HAS_MMIOWB +	depends on SMP diff --git a/kernel/Makefile b/kernel/Makefile index 6c57e78817da..62471e75a2b0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -30,6 +30,7 @@ KCOV_INSTRUMENT_extable.o := n  # Don't self-instrument.  KCOV_INSTRUMENT_kcov.o := n  KASAN_SANITIZE_kcov.o := n +CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)  # cond_syscall is currently not LTO compatible  CFLAGS_sys_ni.o = $(DISABLE_LTO) diff --git a/kernel/acct.c b/kernel/acct.c index addf7732fb56..81f9831a7859 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -227,7 +227,7 @@ static int acct_on(struct filename *pathname)  		filp_close(file, NULL);  		return PTR_ERR(internal);  	} -	err = mnt_want_write(internal); +	err = __mnt_want_write(internal);  	if (err) {  		mntput(internal);  		kfree(acct); @@ -252,7 +252,7 @@ static int acct_on(struct filename *pathname)  	old = xchg(&ns->bacct, &acct->pin);  	mutex_unlock(&acct->lock);  	pin_kill(old); -	mnt_drop_write(mnt); +	__mnt_drop_write(mnt);  	mntput(mnt);  	return 0;  } diff --git a/kernel/async.c b/kernel/async.c index f6bd0d9885e1..12c332e4e13e 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -119,7 +119,7 @@ static void async_run_entry_fn(struct work_struct *work)  	/* 1) run (and print duration) */  	if (initcall_debug && system_state < SYSTEM_RUNNING) { -		pr_debug("calling  %lli_%pF @ %i\n", +		pr_debug("calling  %lli_%pS @ %i\n",  			(long long)entry->cookie,  			entry->func, task_pid_nr(current));  		calltime = ktime_get(); @@ -128,7 +128,7 @@ static void async_run_entry_fn(struct work_struct *work)  	if (initcall_debug && system_state < SYSTEM_RUNNING) {  		rettime = ktime_get();  		delta = ktime_sub(rettime, calltime); -		pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", +		pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n",  			(long long)entry->cookie,  			entry->func,  			(long long)ktime_to_ns(delta) >> 10); diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index 1323360d90e3..a563c8fdad0d 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -48,19 +48,14 @@ static void backtrace_test_irq(void)  #ifdef CONFIG_STACKTRACE  static void backtrace_test_saved(void)  { -	struct stack_trace trace;  	unsigned long entries[8]; +	unsigned int nr_entries;  	pr_info("Testing a saved backtrace.\n");  	pr_info("The following trace is a kernel self test and not a bug!\n"); -	trace.nr_entries = 0; -	trace.max_entries = ARRAY_SIZE(entries); -	trace.entries = entries; -	trace.skip = 0; - -	save_stack_trace(&trace); -	print_stack_trace(&trace, 0); +	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); +	stack_trace_print(entries, nr_entries, 0);  }  #else  static void backtrace_test_saved(void) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ff09d32a8a1b..c605397c79f0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -848,7 +848,6 @@ void __weak 
bpf_jit_free(struct bpf_prog *fp)  	if (fp->jited) {  		struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); -		bpf_jit_binary_unlock_ro(hdr);  		bpf_jit_binary_free(hdr);  		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 4a8f390a2b82..bc53e5b20ddc 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -566,9 +566,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)  	return 0;  } -static void bpf_destroy_inode_deferred(struct rcu_head *head) +static void bpf_free_inode(struct inode *inode)  { -	struct inode *inode = container_of(head, struct inode, i_rcu);  	enum bpf_type type;  	if (S_ISLNK(inode->i_mode)) @@ -578,16 +577,11 @@ static void bpf_destroy_inode_deferred(struct rcu_head *head)  	free_inode_nonrcu(inode);  } -static void bpf_destroy_inode(struct inode *inode) -{ -	call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred); -} -  static const struct super_operations bpf_super_ops = {  	.statfs		= simple_statfs,  	.drop_inode	= generic_delete_inode,  	.show_options	= bpf_show_options, -	.destroy_inode	= bpf_destroy_inode, +	.free_inode	= bpf_free_inode,  };  enum { diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4834c4214e9c..6a1942ed781c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -740,11 +740,10 @@ static inline int nr_cpusets(void)   * Must be called with cpuset_mutex held.   *   * The three key local variables below are: - *    q  - a linked-list queue of cpuset pointers, used to implement a - *	   top-down scan of all cpusets.  This scan loads a pointer - *	   to each cpuset marked is_sched_load_balance into the - *	   array 'csa'.  For our purposes, rebuilding the schedulers - *	   sched domains, we can ignore !is_sched_load_balance cpusets. + *    cp - cpuset pointer, used (together with pos_css) to perform a + *	   top-down scan of all cpusets. For our purposes, rebuilding + *	   the schedulers sched domains, we can ignore !is_sched_load_ + *	   balance cpusets.   
*  csa  - (for CpuSet Array) Array of pointers to all the cpusets   *	   that need to be load balanced, for convenient iterative   *	   access by the subsequent code that finds the best partition, @@ -775,7 +774,7 @@ static inline int nr_cpusets(void)  static int generate_sched_domains(cpumask_var_t **domains,  			struct sched_domain_attr **attributes)  { -	struct cpuset *cp;	/* scans q */ +	struct cpuset *cp;	/* top-down scan of cpusets */  	struct cpuset **csa;	/* array of all cpuset ptrs */  	int csn;		/* how many cpuset ptrs in csa so far */  	int i, j, k;		/* indices for partition finding loops */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 6754f3ecfd94..f2ef10460698 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -9,6 +9,7 @@  #include <linux/notifier.h>  #include <linux/sched/signal.h>  #include <linux/sched/hotplug.h> +#include <linux/sched/isolation.h>  #include <linux/sched/task.h>  #include <linux/sched/smt.h>  #include <linux/unistd.h> @@ -860,6 +861,8 @@ static int take_cpu_down(void *_param)  	/* Give up timekeeping duties */  	tick_handover_do_timer(); +	/* Remove CPU from timer broadcasting */ +	tick_offline_cpu(cpu);  	/* Park the stopper thread */  	stop_machine_park(cpu);  	return 0; @@ -1199,8 +1202,15 @@ int freeze_secondary_cpus(int primary)  	int cpu, error = 0;  	cpu_maps_update_begin(); -	if (!cpu_online(primary)) +	if (primary == -1) {  		primary = cpumask_first(cpu_online_mask); +		if (!housekeeping_cpu(primary, HK_FLAG_TIMER)) +			primary = housekeeping_any_cpu(HK_FLAG_TIMER); +	} else { +		if (!cpu_online(primary)) +			primary = cpumask_first(cpu_online_mask); +	} +  	/*  	 * We take down all of the non-boot CPUs in one shot to avoid races  	 * with the userspace trying to use the CPU hotplug at the same time @@ -2033,19 +2043,6 @@ static const struct attribute_group cpuhp_cpu_root_attr_group = {  #ifdef CONFIG_HOTPLUG_SMT -static const char *smt_states[] = { -	[CPU_SMT_ENABLED]		= "on", -	[CPU_SMT_DISABLED]		= "off", -	[CPU_SMT_FORCE_DISABLED]	= "forceoff", -	[CPU_SMT_NOT_SUPPORTED]		= "notsupported", -}; - -static ssize_t -show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) -{ -	return snprintf(buf, PAGE_SIZE - 2, "%s\n", smt_states[cpu_smt_control]); -} -  static void cpuhp_offline_cpu_device(unsigned int cpu)  {  	struct device *dev = get_cpu_device(cpu); @@ -2116,9 +2113,10 @@ static int cpuhp_smt_enable(void)  	return ret;  } +  static ssize_t -store_smt_control(struct device *dev, struct device_attribute *attr, -		  const char *buf, size_t count) +__store_smt_control(struct device *dev, struct device_attribute *attr, +		    const char *buf, size_t count)  {  	int ctrlval, ret; @@ -2156,14 +2154,44 @@ store_smt_control(struct device *dev, struct device_attribute *attr,  	unlock_device_hotplug();  	return ret ? 
ret : count;  } + +#else /* !CONFIG_HOTPLUG_SMT */ +static ssize_t +__store_smt_control(struct device *dev, struct device_attribute *attr, +		    const char *buf, size_t count) +{ +	return -ENODEV; +} +#endif /* CONFIG_HOTPLUG_SMT */ + +static const char *smt_states[] = { +	[CPU_SMT_ENABLED]		= "on", +	[CPU_SMT_DISABLED]		= "off", +	[CPU_SMT_FORCE_DISABLED]	= "forceoff", +	[CPU_SMT_NOT_SUPPORTED]		= "notsupported", +	[CPU_SMT_NOT_IMPLEMENTED]	= "notimplemented", +}; + +static ssize_t +show_smt_control(struct device *dev, struct device_attribute *attr, char *buf) +{ +	const char *state = smt_states[cpu_smt_control]; + +	return snprintf(buf, PAGE_SIZE - 2, "%s\n", state); +} + +static ssize_t +store_smt_control(struct device *dev, struct device_attribute *attr, +		  const char *buf, size_t count) +{ +	return __store_smt_control(dev, attr, buf, count); +}  static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);  static ssize_t  show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)  { -	bool active = topology_max_smt_threads() > 1; - -	return snprintf(buf, PAGE_SIZE - 2, "%d\n", active); +	return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());  }  static DEVICE_ATTR(active, 0444, show_smt_active, NULL); @@ -2179,21 +2207,17 @@ static const struct attribute_group cpuhp_smt_attr_group = {  	NULL  }; -static int __init cpu_smt_state_init(void) +static int __init cpu_smt_sysfs_init(void)  {  	return sysfs_create_group(&cpu_subsys.dev_root->kobj,  				  &cpuhp_smt_attr_group);  } -#else -static inline int cpu_smt_state_init(void) { return 0; } -#endif -  static int __init cpuhp_sysfs_init(void)  {  	int cpu, ret; -	ret = cpu_smt_state_init(); +	ret = cpu_smt_sysfs_init();  	if (ret)  		return ret; @@ -2214,7 +2238,7 @@ static int __init cpuhp_sysfs_init(void)  	return 0;  }  device_initcall(cpuhp_sysfs_init); -#endif +#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */  /*   * cpu_bit_bitmap[] is a special, "compressed" data structure that @@ -2304,3 +2328,18 @@ void __init boot_cpu_hotplug_init(void)  #endif  	this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);  } + +enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; + +static int __init mitigations_parse_cmdline(char *arg) +{ +	if (!strcmp(arg, "off")) +		cpu_mitigations = CPU_MITIGATIONS_OFF; +	else if (!strcmp(arg, "auto")) +		cpu_mitigations = CPU_MITIGATIONS_AUTO; +	else if (!strcmp(arg, "auto,nosmt")) +		cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; + +	return 0; +} +early_param("mitigations", mitigations_parse_cmdline); diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index a218e43cc382..badd77670d00 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -89,8 +89,8 @@ struct dma_debug_entry {  	int		 sg_mapped_ents;  	enum map_err_types  map_err_type;  #ifdef CONFIG_STACKTRACE -	struct		 stack_trace stacktrace; -	unsigned long	 st_entries[DMA_DEBUG_STACKTRACE_ENTRIES]; +	unsigned int	stack_len; +	unsigned long	stack_entries[DMA_DEBUG_STACKTRACE_ENTRIES];  #endif  }; @@ -174,7 +174,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)  #ifdef CONFIG_STACKTRACE  	if (entry) {  		pr_warning("Mapped at:\n"); -		print_stack_trace(&entry->stacktrace, 0); +		stack_trace_print(entry->stack_entries, entry->stack_len, 0);  	}  #endif  } @@ -704,12 +704,10 @@ static struct dma_debug_entry *dma_entry_alloc(void)  	spin_unlock_irqrestore(&free_entries_lock, flags);  #ifdef CONFIG_STACKTRACE -	entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; -	
entry->stacktrace.entries = entry->st_entries; -	entry->stacktrace.skip = 1; -	save_stack_trace(&entry->stacktrace); +	entry->stack_len = stack_trace_save(entry->stack_entries, +					    ARRAY_SIZE(entry->stack_entries), +					    1);  #endif -  	return entry;  } diff --git a/kernel/events/core.c b/kernel/events/core.c index dc7dead2d2cc..abbd4b3b96c2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2478,6 +2478,16 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,  	perf_pmu_enable(cpuctx->ctx.pmu);  } +void perf_pmu_resched(struct pmu *pmu) +{ +	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); +	struct perf_event_context *task_ctx = cpuctx->task_ctx; + +	perf_ctx_lock(cpuctx, task_ctx); +	ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU); +	perf_ctx_unlock(cpuctx, task_ctx); +} +  /*   * Cross CPU call to install and enable a performance event   * @@ -11917,7 +11927,7 @@ static void __init perf_event_init_all_cpus(void)  	}  } -void perf_swevent_init_cpu(unsigned int cpu) +static void perf_swevent_init_cpu(unsigned int cpu)  {  	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c5cde87329c7..4ca7364c956d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -2028,7 +2028,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)  		if (uc->handler) {  			rc = uc->handler(uc, regs);  			WARN(rc & ~UPROBE_HANDLER_MASK, -				"bad rc=0x%x from %pf()\n", rc, uc->handler); +				"bad rc=0x%x from %ps()\n", rc, uc->handler);  		}  		if (uc->ret_handler) @@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = {  	.priority		= INT_MAX-1,	/* notified after kprobes, kgdb */  }; -static int __init init_uprobes(void) +void __init uprobes_init(void)  {  	int i;  	for (i = 0; i < UPROBES_HASH_SZ; i++)  		mutex_init(&uprobes_mmap_mutex[i]); -	if (percpu_init_rwsem(&dup_mmap_sem)) -		return -ENOMEM; +	BUG_ON(percpu_init_rwsem(&dup_mmap_sem)); -	return register_die_notifier(&uprobe_exception_nb); +	BUG_ON(register_die_notifier(&uprobe_exception_nb));  } -__initcall(init_uprobes); diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 17f75b545f66..feb80712b913 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -210,7 +210,7 @@ static int fei_seq_show(struct seq_file *m, void *v)  {  	struct fei_attr *attr = list_entry(v, struct fei_attr, list); -	seq_printf(m, "%pf\n", attr->kp.addr); +	seq_printf(m, "%ps\n", attr->kp.addr);  	return 0;  } diff --git a/kernel/fork.c b/kernel/fork.c index 9dcd18aa210b..fbe9dfcd8680 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -815,6 +815,7 @@ void __init fork_init(void)  #endif  	lockdep_init_task(&init_task); +	uprobes_init();  }  int __weak arch_dup_task_struct(struct task_struct *dst, @@ -1298,13 +1299,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)  		complete_vfork_done(tsk);  } -/* - * Allocate a new mm structure and copy contents from the - * mm structure of the passed in task structure. +/** + * dup_mm() - duplicates an existing mm structure + * @tsk: the task_struct with which the new mm will be associated. + * @oldmm: the mm to duplicate. + * + * Allocates a new mm structure and duplicates the provided @oldmm structure + * content into it. + * + * Return: the duplicated mm or NULL on failure.   
*/ -static struct mm_struct *dup_mm(struct task_struct *tsk) +static struct mm_struct *dup_mm(struct task_struct *tsk, +				struct mm_struct *oldmm)  { -	struct mm_struct *mm, *oldmm = current->mm; +	struct mm_struct *mm;  	int err;  	mm = allocate_mm(); @@ -1371,7 +1379,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)  	}  	retval = -ENOMEM; -	mm = dup_mm(tsk); +	mm = dup_mm(tsk, current->mm);  	if (!mm)  		goto fail_nomem; @@ -2186,6 +2194,11 @@ struct task_struct *fork_idle(int cpu)  	return task;  } +struct mm_struct *copy_init_mm(void) +{ +	return dup_mm(NULL, &init_mm); +} +  /*   *  Ok, this is the main fork-routine.   * diff --git a/kernel/futex.c b/kernel/futex.c index 9e40cf7be606..6262f1534ac9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1311,13 +1311,15 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,  static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)  { +	int err;  	u32 uninitialized_var(curval);  	if (unlikely(should_fail_futex(true)))  		return -EFAULT; -	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) -		return -EFAULT; +	err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); +	if (unlikely(err)) +		return err;  	/* If user space value changed, let the caller retry */  	return curval != uval ? -EAGAIN : 0; @@ -1502,10 +1504,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_  	if (unlikely(should_fail_futex(true)))  		ret = -EFAULT; -	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { -		ret = -EFAULT; - -	} else if (curval != uval) { +	ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); +	if (!ret && (curval != uval)) {  		/*  		 * If a unconditional UNLOCK_PI operation (user space did not  		 * try the TID->0 transition) raced with a waiter setting the @@ -1700,32 +1700,32 @@ retry_private:  	double_lock_hb(hb1, hb2);  	op_ret = futex_atomic_op_inuser(op, uaddr2);  	if (unlikely(op_ret < 0)) { -  		double_unlock_hb(hb1, hb2); -#ifndef CONFIG_MMU -		/* -		 * we don't get EFAULT from MMU faults if we don't have an MMU, -		 * but we might get them from range checking -		 */ -		ret = op_ret; -		goto out_put_keys; -#endif - -		if (unlikely(op_ret != -EFAULT)) { +		if (!IS_ENABLED(CONFIG_MMU) || +		    unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { +			/* +			 * we don't get EFAULT from MMU faults if we don't have +			 * an MMU, but we might get them from range checking +			 */  			ret = op_ret;  			goto out_put_keys;  		} -		ret = fault_in_user_writeable(uaddr2); -		if (ret) -			goto out_put_keys; +		if (op_ret == -EFAULT) { +			ret = fault_in_user_writeable(uaddr2); +			if (ret) +				goto out_put_keys; +		} -		if (!(flags & FLAGS_SHARED)) +		if (!(flags & FLAGS_SHARED)) { +			cond_resched();  			goto retry_private; +		}  		put_futex_key(&key2);  		put_futex_key(&key1); +		cond_resched();  		goto retry;  	} @@ -2350,7 +2350,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,  	u32 uval, uninitialized_var(curval), newval;  	struct task_struct *oldowner, *newowner;  	u32 newtid; -	int ret; +	int ret, err = 0;  	lockdep_assert_held(q->lock_ptr); @@ -2421,14 +2421,17 @@ retry:  	if (!pi_state->owner)  		newtid |= FUTEX_OWNER_DIED; -	if (get_futex_value_locked(&uval, uaddr)) -		goto handle_fault; +	err = get_futex_value_locked(&uval, uaddr); +	if (err) +		goto handle_err;  	for (;;) {  		newval = (uval & FUTEX_OWNER_DIED) | newtid; -		if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 
-			goto handle_fault; +		err = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); +		if (err) +			goto handle_err; +  		if (curval == uval)  			break;  		uval = curval; @@ -2456,23 +2459,37 @@ retry:  	return 0;  	/* -	 * To handle the page fault we need to drop the locks here. That gives -	 * the other task (either the highest priority waiter itself or the -	 * task which stole the rtmutex) the chance to try the fixup of the -	 * pi_state. So once we are back from handling the fault we need to -	 * check the pi_state after reacquiring the locks and before trying to -	 * do another fixup. When the fixup has been done already we simply -	 * return. +	 * In order to reschedule or handle a page fault, we need to drop the +	 * locks here. In the case of a fault, this gives the other task +	 * (either the highest priority waiter itself or the task which stole +	 * the rtmutex) the chance to try the fixup of the pi_state. So once we +	 * are back from handling the fault we need to check the pi_state after +	 * reacquiring the locks and before trying to do another fixup. When +	 * the fixup has been done already we simply return.  	 *  	 * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely  	 * drop hb->lock since the caller owns the hb -> futex_q relation.  	 * Dropping the pi_mutex->wait_lock requires the state revalidate.  	 */ -handle_fault: +handle_err:  	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);  	spin_unlock(q->lock_ptr); -	ret = fault_in_user_writeable(uaddr); +	switch (err) { +	case -EFAULT: +		ret = fault_in_user_writeable(uaddr); +		break; + +	case -EAGAIN: +		cond_resched(); +		ret = 0; +		break; + +	default: +		WARN_ON_ONCE(1); +		ret = err; +		break; +	}  	spin_lock(q->lock_ptr);  	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); @@ -3041,10 +3058,8 @@ retry:  		 * A unconditional UNLOCK_PI op raced against a waiter  		 * setting the FUTEX_WAITERS bit. Try again.  		 */ -		if (ret == -EAGAIN) { -			put_futex_key(&key); -			goto retry; -		} +		if (ret == -EAGAIN) +			goto pi_retry;  		/*  		 * wake_futex_pi has detected invalid state. Tell user  		 * space. @@ -3059,9 +3074,19 @@ retry:  	 * preserve the WAITERS bit not the OWNER_DIED one. We are the  	 * owner.  	 */ -	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { +	if ((ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))) {  		spin_unlock(&hb->lock); -		goto pi_faulted; +		switch (ret) { +		case -EFAULT: +			goto pi_faulted; + +		case -EAGAIN: +			goto pi_retry; + +		default: +			WARN_ON_ONCE(1); +			goto out_putkey; +		}  	}  	/* @@ -3075,6 +3100,11 @@ out_putkey:  	put_futex_key(&key);  	return ret; +pi_retry: +	put_futex_key(&key); +	cond_resched(); +	goto retry; +  pi_faulted:  	put_futex_key(&key); @@ -3435,6 +3465,7 @@ err_unlock:  static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)  {  	u32 uval, uninitialized_var(nval), mval; +	int err;  	/* Futex address must be 32bit aligned */  	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) @@ -3444,42 +3475,57 @@ retry:  	if (get_user(uval, uaddr))  		return -1; -	if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { -		/* -		 * Ok, this dying thread is truly holding a futex -		 * of interest. Set the OWNER_DIED bit atomically -		 * via cmpxchg, and if the value had FUTEX_WAITERS -		 * set, wake up a waiter (if any). (We have to do a -		 * futex_wake() even if OWNER_DIED is already set - -		 * to handle the rare but possible case of recursive -		 * thread-death.) 
The rest of the cleanup is done in -		 * userspace. -		 */ -		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; -		/* -		 * We are not holding a lock here, but we want to have -		 * the pagefault_disable/enable() protection because -		 * we want to handle the fault gracefully. If the -		 * access fails we try to fault in the futex with R/W -		 * verification via get_user_pages. get_user() above -		 * does not guarantee R/W access. If that fails we -		 * give up and leave the futex locked. -		 */ -		if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { +	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) +		return 0; + +	/* +	 * Ok, this dying thread is truly holding a futex +	 * of interest. Set the OWNER_DIED bit atomically +	 * via cmpxchg, and if the value had FUTEX_WAITERS +	 * set, wake up a waiter (if any). (We have to do a +	 * futex_wake() even if OWNER_DIED is already set - +	 * to handle the rare but possible case of recursive +	 * thread-death.) The rest of the cleanup is done in +	 * userspace. +	 */ +	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + +	/* +	 * We are not holding a lock here, but we want to have +	 * the pagefault_disable/enable() protection because +	 * we want to handle the fault gracefully. If the +	 * access fails we try to fault in the futex with R/W +	 * verification via get_user_pages. get_user() above +	 * does not guarantee R/W access. If that fails we +	 * give up and leave the futex locked. +	 */ +	if ((err = cmpxchg_futex_value_locked(&nval, uaddr, uval, mval))) { +		switch (err) { +		case -EFAULT:  			if (fault_in_user_writeable(uaddr))  				return -1;  			goto retry; -		} -		if (nval != uval) + +		case -EAGAIN: +			cond_resched();  			goto retry; -		/* -		 * Wake robust non-PI futexes here. The wakeup of -		 * PI futexes happens in exit_pi_state(): -		 */ -		if (!pi && (uval & FUTEX_WAITERS)) -			futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); +		default: +			WARN_ON_ONCE(1); +			return err; +		}  	} + +	if (nval != uval) +		goto retry; + +	/* +	 * Wake robust non-PI futexes here. The wakeup of +	 * PI futexes happens in exit_pi_state(): +	 */ +	if (!pi && (uval & FUTEX_WAITERS)) +		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); +  	return 0;  } diff --git a/kernel/iomem.c b/kernel/iomem.c index f7525e14ebc6..93c264444510 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -55,7 +55,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size,   *   * MEMREMAP_WB - matches the default mapping for System RAM on   * the architecture.  This is usually a read-allocate write-back cache. - * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM   * memremap() will bypass establishing a new mapping and instead return   * a pointer into the direct map.   * @@ -86,7 +86,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)  	/* Try all mapping types requested until one returns non-NULL */  	if (flags & MEMREMAP_WB) {  		/* -		 * MEMREMAP_WB is special in that it can be satisifed +		 * MEMREMAP_WB is special in that it can be satisfied  		 * from the direct map.  Some archs depend on the  		 * capability of memremap() to autodetect cases where  		 * the requested range is potentially in System RAM. 
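The futex.c hunks above all follow one pattern: cmpxchg_futex_value_locked() now propagates a real error code instead of every failure being treated as -EFAULT, and each call site (fixup_pi_state_owner(), futex_unlock_pi(), handle_futex_death()) reacts per error — fault the page in and retry on -EFAULT, drop locks, cond_resched() and retry on -EAGAIN, warn and bail on anything else. Below is a minimal, self-contained user-space sketch of that retry shape only, not the kernel code; futex_cmpxchg(), fault_in_word() and yield_cpu() are hypothetical stand-ins for cmpxchg_futex_value_locked(), fault_in_user_writeable() and cond_resched().

/*
 * Minimal user-space sketch of the retry pattern used by the futex.c
 * changes above (compare handle_futex_death() and fixup_pi_state_owner()).
 * Every helper here is a hypothetical stand-in, not a kernel API.
 */
#include <errno.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

/* Pretend futex cmpxchg: a real one may also fail with -EFAULT or -EAGAIN. */
static int futex_cmpxchg(_Atomic unsigned int *uaddr, unsigned int *curval,
			 unsigned int oldval, unsigned int newval)
{
	unsigned int expected = oldval;

	atomic_compare_exchange_strong(uaddr, &expected, newval);
	*curval = expected;		/* report what the word contained */
	return 0;
}

static int fault_in_word(_Atomic unsigned int *uaddr)
{
	(void)uaddr;			/* nothing to fault in for this sketch */
	return 0;
}

static void yield_cpu(void)
{
	sched_yield();
}

/* Set newval iff the word still holds oldval, handling transient errors. */
static int mark_owner_died(_Atomic unsigned int *uaddr, unsigned int oldval,
			   unsigned int newval)
{
	unsigned int curval;
	int err;

retry:
	err = futex_cmpxchg(uaddr, &curval, oldval, newval);
	switch (err) {
	case 0:
		break;
	case -EFAULT:		/* word not writable yet: fault it in, retry */
		if (fault_in_word(uaddr))
			return -EFAULT;
		goto retry;
	case -EAGAIN:		/* transient atomic failure: back off, retry */
		yield_cpu();
		goto retry;
	default:		/* unexpected error: give up */
		return err;
	}

	return curval == oldval ? 0 : -EAGAIN;	/* lost a race, caller retries */
}

int main(void)
{
	_Atomic unsigned int word = 42;
	/* 0x40000000 plays the role of an OWNER_DIED-style bit, for illustration */
	int ret = mark_owner_died(&word, 42, 42 | 0x40000000);

	printf("ret=%d word=0x%x\n", ret, (unsigned int)word);
	return 0;
}

The point of the split is visible in the diff itself: -EAGAIN now means the atomic operation failed transiently, so the caller backs off with cond_resched() and retries rather than pointlessly faulting the page in, which was the only option when every failure looked like -EFAULT.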
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 516c00a5e867..c1eccd4f6520 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -152,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p)  	raw_spin_lock_irq(&desc->lock);  	data = irq_desc_get_irq_data(desc); -	seq_printf(m, "handler:  %pf\n", desc->handle_irq); +	seq_printf(m, "handler:  %ps\n", desc->handle_irq);  	seq_printf(m, "device:   %s\n", desc->dev_name);  	seq_printf(m, "status:   0x%08x\n", desc->status_use_accessors);  	irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states, diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index f808c6a97dcc..f6e5515ee077 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -220,9 +220,8 @@ devm_irq_alloc_generic_chip(struct device *dev, const char *name, int num_ct,  			    irq_flow_handler_t handler)  {  	struct irq_chip_generic *gc; -	unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); -	gc = devm_kzalloc(dev, sz, GFP_KERNEL); +	gc = devm_kzalloc(dev, struct_size(gc, chip_types, num_ct), GFP_KERNEL);  	if (gc)  		irq_init_generic_chip(gc, name, num_ct,  				      irq_base, reg_base, handler); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6df5ddfdb0f8..a4ace611f47f 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags  		res = action->handler(irq, action->dev_id);  		trace_irq_handler_exit(irq, action, res); -		if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", +		if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n",  			      irq, action->handler))  			local_irq_disable(); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1401afa0d58a..78f3ddeb7fe4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -357,8 +357,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)  	desc->affinity_notify = notify;  	raw_spin_unlock_irqrestore(&desc->lock, flags); -	if (old_notify) +	if (old_notify) { +		cancel_work_sync(&old_notify->work);  		kref_put(&old_notify->kref, old_notify->release); +	}  	return 0;  } @@ -779,7 +781,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)  		ret = 0;  		break;  	default: -		pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", +		pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n",  		       flags, irq_desc_get_irq(desc), chip->irq_set_type);  	}  	if (unmask) diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 6d2fa6914b30..2ed97a7c9b2a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)  	 */  	raw_spin_lock_irqsave(&desc->lock, flags);  	for_each_action_of_desc(desc, action) { -		printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); +		printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler);  		if (action->thread_fn) -			printk(KERN_CONT " threaded [<%p>] %pf", +			printk(KERN_CONT " threaded [<%p>] %ps",  					action->thread_fn, action->thread_fn);  		printk(KERN_CONT "\n");  	} diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 1e4cb63a5c82..90c735da15d0 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c @@ -9,6 +9,7 @@  #include <linux/idr.h>  #include <linux/irq.h>  #include <linux/math64.h> +#include <linux/log2.h>  #include <trace/events/irq.h> @@ -18,16 +19,6 @@ 
DEFINE_STATIC_KEY_FALSE(irq_timing_enabled);  DEFINE_PER_CPU(struct irq_timings, irq_timings); -struct irqt_stat { -	u64	next_evt; -	u64	last_ts; -	u64	variance; -	u32	avg; -	u32	nr_samples; -	int	anomalies; -	int	valid; -}; -  static DEFINE_IDR(irqt_stats);  void irq_timings_enable(void) @@ -40,75 +31,360 @@ void irq_timings_disable(void)  	static_branch_disable(&irq_timing_enabled);  } -/** - * irqs_update - update the irq timing statistics with a new timestamp +/* + * The main goal of this algorithm is to predict the next interrupt + * occurrence on the current CPU. + * + * Currently, the interrupt timings are stored in a circular array + * buffer every time there is an interrupt, as a tuple: the interrupt + * number and the associated timestamp when the event occurred <irq, + * timestamp>. + * + * For every interrupt occurring in a short period of time, we can + * measure the elapsed time between the occurrences for the same + * interrupt and we end up with a suite of intervals. The experience + * showed the interrupts are often coming following a periodic + * pattern. + * + * The objective of the algorithm is to find out this periodic pattern + * in a fastest way and use its period to predict the next irq event. + * + * When the next interrupt event is requested, we are in the situation + * where the interrupts are disabled and the circular buffer + * containing the timings is filled with the events which happened + * after the previous next-interrupt-event request. + * + * At this point, we read the circular buffer and we fill the irq + * related statistics structure. After this step, the circular array + * containing the timings is empty because all the values are + * dispatched in their corresponding buffers. + * + * Now for each interrupt, we can predict the next event by using the + * suffix array, log interval and exponential moving average + * + * 1. Suffix array + * + * Suffix array is an array of all the suffixes of a string. It is + * widely used as a data structure for compression, text search, ... + * For instance for the word 'banana', the suffixes will be: 'banana' + * 'anana' 'nana' 'ana' 'na' 'a' + * + * Usually, the suffix array is sorted but for our purpose it is + * not necessary and won't provide any improvement in the context of + * the solved problem where we clearly define the boundaries of the + * search by a max period and min period. + * + * The suffix array will build a suite of intervals of different + * length and will look for the repetition of each suite. If the suite + * is repeating then we have the period because it is the length of + * the suite whatever its position in the buffer. + * + * 2. Log interval + * + * We saw the irq timings allow to compute the interval of the + * occurrences for a specific interrupt. We can reasonibly assume the + * longer is the interval, the higher is the error for the next event + * and we can consider storing those interval values into an array + * where each slot in the array correspond to an interval at the power + * of 2 of the index. For example, index 12 will contain values + * between 2^11 and 2^12. + * + * At the end we have an array of values where at each index defines a + * [2^index - 1, 2 ^ index] interval values allowing to store a large + * number of values inside a small array. + * + * For example, if we have the value 1123, then we store it at + * ilog2(1123) = 10 index value. 
+ * + * Storing those value at the specific index is done by computing an + * exponential moving average for this specific slot. For instance, + * for values 1800, 1123, 1453, ... fall under the same slot (10) and + * the exponential moving average is computed every time a new value + * is stored at this slot. + * + * 3. Exponential Moving Average + * + * The EMA is largely used to track a signal for stocks or as a low + * pass filter. The magic of the formula, is it is very simple and the + * reactivity of the average can be tuned with the factors called + * alpha. + * + * The higher the alphas are, the faster the average respond to the + * signal change. In our case, if a slot in the array is a big + * interval, we can have numbers with a big difference between + * them. The impact of those differences in the average computation + * can be tuned by changing the alpha value. + * + * + *  -- The algorithm -- + * + * We saw the different processing above, now let's see how they are + * used together. + * + * For each interrupt: + *	For each interval: + *		Compute the index = ilog2(interval) + *		Compute a new_ema(buffer[index], interval) + *		Store the index in a circular buffer + * + *	Compute the suffix array of the indexes + * + *	For each suffix: + *		If the suffix is reverse-found 3 times + *			Return suffix + * + *	Return Not found + * + * However we can not have endless suffix array to be build, it won't + * make sense and it will add an extra overhead, so we can restrict + * this to a maximum suffix length of 5 and a minimum suffix length of + * 2. The experience showed 5 is the majority of the maximum pattern + * period found for different devices. + * + * The result is a pattern finding less than 1us for an interrupt.   * - * @irqs: an irqt_stat struct pointer - * @ts: the new timestamp + * Example based on real values:   * - * The statistics are computed online, in other words, the code is - * designed to compute the statistics on a stream of values rather - * than doing multiple passes on the values to compute the average, - * then the variance. The integer division introduces a loss of - * precision but with an acceptable error margin regarding the results - * we would have with the double floating precision: we are dealing - * with nanosec, so big numbers, consequently the mantisse is - * negligeable, especially when converting the time in usec - * afterwards. + * Example 1 : MMC write/read interrupt interval:   * - * The computation happens at idle time. When the CPU is not idle, the - * interrupts' timestamps are stored in the circular buffer, when the - * CPU goes idle and this routine is called, all the buffer's values - * are injected in the statistical model continuying to extend the - * statistics from the previous busy-idle cycle. + *	223947, 1240, 1384, 1386, 1386, + *	217416, 1236, 1384, 1386, 1387, + *	214719, 1241, 1386, 1387, 1384, + *	213696, 1234, 1384, 1386, 1388, + *	219904, 1240, 1385, 1389, 1385, + *	212240, 1240, 1386, 1386, 1386, + *	214415, 1236, 1384, 1386, 1387, + *	214276, 1234, 1384, 1388, ?   * - * The observations showed a device will trigger a burst of periodic - * interrupts followed by one or two peaks of longer time, for - * instance when a SD card device flushes its cache, then the periodic - * intervals occur again. A one second inactivity period resets the - * stats, that gives us the certitude the statistical values won't - * exceed 1x10^9, thus the computation won't overflow. 
+ * For each element, apply ilog2(value)   * - * Basically, the purpose of the algorithm is to watch the periodic - * interrupts and eliminate the peaks. + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, ?   * - * An interrupt is considered periodically stable if the interval of - * its occurences follow the normal distribution, thus the values - * comply with: + * Max period of 5, we take the last (max_period * 3) 15 elements as + * we can be confident if the pattern repeats itself three times it is + * a repeating pattern.   * - *      avg - 3 x stddev < value < avg + 3 x stddev + *	             8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, 8, + *	15, 8, 8, 8, ?   * - * Which can be simplified to: + * Suffixes are:   * - *      -3 x stddev < value - avg < 3 x stddev + *  1) 8, 15, 8, 8, 8  <- max period + *  2) 8, 15, 8, 8 + *  3) 8, 15, 8 + *  4) 8, 15           <- min period   * - *      abs(value - avg) < 3 x stddev + * From there we search the repeating pattern for each suffix.   * - * In order to save a costly square root computation, we use the - * variance. For the record, stddev = sqrt(variance). The equation - * above becomes: + * buffer: 8, 15, 8, 8, 8, 8, 15, 8, 8, 8, 8, 15, 8, 8, 8 + *         |   |  |  |  |  |   |  |  |  |  |   |  |  |  | + *         8, 15, 8, 8, 8  |   |  |  |  |  |   |  |  |  | + *                         8, 15, 8, 8, 8  |   |  |  |  | + *                                         8, 15, 8, 8, 8   * - *      abs(value - avg) < 3 x sqrt(variance) + * When moving the suffix, we found exactly 3 matches.   * - * And finally we square it: + * The first suffix with period 5 is repeating.   * - *      (value - avg) ^ 2 < (3 x sqrt(variance)) ^ 2 + * The next event is (3 * max_period) % suffix_period   * - *      (value - avg) x (value - avg) < 9 x variance + * In this example, the result 0, so the next event is suffix[0] => 8   * - * Statistically speaking, any values out of this interval is - * considered as an anomaly and is discarded. However, a normal - * distribution appears when the number of samples is 30 (it is the - * rule of thumb in statistics, cf. "30 samples" on Internet). When - * there are three consecutive anomalies, the statistics are resetted. 
+ * However, 8 is the index in the array of exponential moving average + * which was calculated on the fly when storing the values, so the + * interval is ema[8] = 1366   * + * + * Example 2: + * + *	4, 3, 5, 100, + *	3, 3, 5, 117, + *	4, 4, 5, 112, + *	4, 3, 4, 110, + *	3, 5, 3, 117, + *	4, 4, 5, 112, + *	4, 3, 4, 110, + *	3, 4, 5, 112, + *	4, 3, 4, 110 + * + * ilog2 + * + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4 + * + * Max period 5: + *	   0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4, + *	0, 0, 0, 4 + * + * Suffixes: + * + *  1) 0, 0, 4, 0, 0 + *  2) 0, 0, 4, 0 + *  3) 0, 0, 4 + *  4) 0, 0 + * + * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 + *         |  |  |  |  |  |  X + *         0, 0, 4, 0, 0, |  X + *                        0, 0 + * + * buffer: 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4, 0, 0, 0, 4 + *         |  |  |  |  |  |  |  |  |  |  |  |  |  |  | + *         0, 0, 4, 0, |  |  |  |  |  |  |  |  |  |  | + *                     0, 0, 4, 0, |  |  |  |  |  |  | + *                                 0, 0, 4, 0, |  |  | + *                                             0  0  4 + * + * Pattern is found 3 times, the remaining is 1 which results from + * (max_period * 3) % suffix_period. This value is the index in the + * suffix arrays. The suffix array for a period 4 has the value 4 + * at index 1. + */ +#define EMA_ALPHA_VAL		64 +#define EMA_ALPHA_SHIFT		7 + +#define PREDICTION_PERIOD_MIN	2 +#define PREDICTION_PERIOD_MAX	5 +#define PREDICTION_FACTOR	4 +#define PREDICTION_MAX		10 /* 2 ^ PREDICTION_MAX useconds */ +#define PREDICTION_BUFFER_SIZE	16 /* slots for EMAs, hardly more than 16 */ + +struct irqt_stat { +	u64	last_ts; +	u64	ema_time[PREDICTION_BUFFER_SIZE]; +	int	timings[IRQ_TIMINGS_SIZE]; +	int	circ_timings[IRQ_TIMINGS_SIZE]; +	int	count; +}; + +/* + * Exponential moving average computation   */ -static void irqs_update(struct irqt_stat *irqs, u64 ts) +static u64 irq_timings_ema_new(u64 value, u64 ema_old) +{ +	s64 diff; + +	if (unlikely(!ema_old)) +		return value; + +	diff = (value - ema_old) * EMA_ALPHA_VAL; +	/* +	 * We can use a s64 type variable to be added with the u64 +	 * ema_old variable as this one will never have its topmost +	 * bit set, it will be always smaller than 2^63 nanosec +	 * interrupt interval (292 years). +	 */ +	return ema_old + (diff >> EMA_ALPHA_SHIFT); +} + +static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) +{ +	int i; + +	/* +	 * The buffer contains the suite of intervals, in a ilog2 +	 * basis, we are looking for a repetition. We point the +	 * beginning of the search three times the length of the +	 * period beginning at the end of the buffer. We do that for +	 * each suffix. +	 */ +	for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) { + +		int *begin = &buffer[len - (i * 3)]; +		int *ptr = begin; + +		/* +		 * We look if the suite with period 'i' repeat +		 * itself. If it is truncated at the end, as it +		 * repeats we can use the period to find out the next +		 * element. 
+		 */ +		while (!memcmp(ptr, begin, i * sizeof(*ptr))) { +			ptr += i; +			if (ptr >= &buffer[len]) +				return begin[((i * 3) % i)]; +		} +	} + +	return -1; +} + +static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now) +{ +	int index, i, period_max, count, start, min = INT_MAX; + +	if ((now - irqs->last_ts) >= NSEC_PER_SEC) { +		irqs->count = irqs->last_ts = 0; +		return U64_MAX; +	} + +	/* +	 * As we want to find three times the repetition, we need a +	 * number of intervals greater or equal to three times the +	 * maximum period, otherwise we truncate the max period. +	 */ +	period_max = irqs->count > (3 * PREDICTION_PERIOD_MAX) ? +		PREDICTION_PERIOD_MAX : irqs->count / 3; + +	/* +	 * If we don't have enough irq timings for this prediction, +	 * just bail out. +	 */ +	if (period_max <= PREDICTION_PERIOD_MIN) +		return U64_MAX; + +	/* +	 * 'count' will depends if the circular buffer wrapped or not +	 */ +	count = irqs->count < IRQ_TIMINGS_SIZE ? +		irqs->count : IRQ_TIMINGS_SIZE; + +	start = irqs->count < IRQ_TIMINGS_SIZE ? +		0 : (irqs->count & IRQ_TIMINGS_MASK); + +	/* +	 * Copy the content of the circular buffer into another buffer +	 * in order to linearize the buffer instead of dealing with +	 * wrapping indexes and shifted array which will be prone to +	 * error and extremelly difficult to debug. +	 */ +	for (i = 0; i < count; i++) { +		int index = (start + i) & IRQ_TIMINGS_MASK; + +		irqs->timings[i] = irqs->circ_timings[index]; +		min = min_t(int, irqs->timings[i], min); +	} + +	index = irq_timings_next_event_index(irqs->timings, count, period_max); +	if (index < 0) +		return irqs->last_ts + irqs->ema_time[min]; + +	return irqs->last_ts + irqs->ema_time[index]; +} + +static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)  {  	u64 old_ts = irqs->last_ts; -	u64 variance = 0;  	u64 interval; -	s64 diff; +	int index;  	/*  	 * The timestamps are absolute time values, we need to compute @@ -135,87 +411,28 @@ static void irqs_update(struct irqt_stat *irqs, u64 ts)  	 * want as we need another timestamp to compute an interval.  	 */  	if (interval >= NSEC_PER_SEC) { -		memset(irqs, 0, sizeof(*irqs)); -		irqs->last_ts = ts; +		irqs->count = 0;  		return;  	}  	/* -	 * Pre-compute the delta with the average as the result is -	 * used several times in this function. -	 */ -	diff = interval - irqs->avg; - -	/* -	 * Increment the number of samples. -	 */ -	irqs->nr_samples++; - -	/* -	 * Online variance divided by the number of elements if there -	 * is more than one sample.  Normally the formula is division -	 * by nr_samples - 1 but we assume the number of element will be -	 * more than 32 and dividing by 32 instead of 31 is enough -	 * precise. -	 */ -	if (likely(irqs->nr_samples > 1)) -		variance = irqs->variance >> IRQ_TIMINGS_SHIFT; - -	/* -	 * The rule of thumb in statistics for the normal distribution -	 * is having at least 30 samples in order to have the model to -	 * apply. Values outside the interval are considered as an -	 * anomaly. -	 */ -	if ((irqs->nr_samples >= 30) && ((diff * diff) > (9 * variance))) { -		/* -		 * After three consecutive anomalies, we reset the -		 * stats as it is no longer stable enough. -		 */ -		if (irqs->anomalies++ >= 3) { -			memset(irqs, 0, sizeof(*irqs)); -			irqs->last_ts = ts; -			return; -		} -	} else { -		/* -		 * The anomalies must be consecutives, so at this -		 * point, we reset the anomalies counter. 
-		 */ -		irqs->anomalies = 0; -	} - -	/* -	 * The interrupt is considered stable enough to try to predict -	 * the next event on it. +	 * Get the index in the ema table for this interrupt. The +	 * PREDICTION_FACTOR increase the interval size for the array +	 * of exponential average.  	 */ -	irqs->valid = 1; +	index = likely(interval) ? +		ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;  	/* -	 * Online average algorithm: -	 * -	 *  new_average = average + ((value - average) / count) -	 * -	 * The variance computation depends on the new average -	 * to be computed here first. -	 * +	 * Store the index as an element of the pattern in another +	 * circular array.  	 */ -	irqs->avg = irqs->avg + (diff >> IRQ_TIMINGS_SHIFT); +	irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; -	/* -	 * Online variance algorithm: -	 * -	 *  new_variance = variance + (value - average) x (value - new_average) -	 * -	 * Warning: irqs->avg is updated with the line above, hence -	 * 'interval - irqs->avg' is no longer equal to 'diff' -	 */ -	irqs->variance = irqs->variance + (diff * (interval - irqs->avg)); +	irqs->ema_time[index] = irq_timings_ema_new(interval, +						    irqs->ema_time[index]); -	/* -	 * Update the next event -	 */ -	irqs->next_evt = ts + irqs->avg; +	irqs->count++;  }  /** @@ -259,6 +476,9 @@ u64 irq_timings_next_event(u64 now)  	 */  	lockdep_assert_irqs_disabled(); +	if (!irqts->count) +		return next_evt; +  	/*  	 * Number of elements in the circular buffer: If it happens it  	 * was flushed before, then the number of elements could be @@ -269,21 +489,19 @@ u64 irq_timings_next_event(u64 now)  	 * type but with the cost of extra computation in the  	 * interrupt handler hot path. We choose efficiency.  	 * -	 * Inject measured irq/timestamp to the statistical model -	 * while decrementing the counter because we consume the data -	 * from our circular buffer. +	 * Inject measured irq/timestamp to the pattern prediction +	 * model while decrementing the counter because we consume the +	 * data from our circular buffer.  	 */ -	for (i = irqts->count & IRQ_TIMINGS_MASK, -		     irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); -	     irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { -		irq = irq_timing_decode(irqts->values[i], &ts); +	i = (irqts->count & IRQ_TIMINGS_MASK) - 1; +	irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); +	for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { +		irq = irq_timing_decode(irqts->values[i], &ts);  		s = idr_find(&irqt_stats, irq); -		if (s) { -			irqs = this_cpu_ptr(s); -			irqs_update(irqs, ts); -		} +		if (s) +			irq_timings_store(irq, this_cpu_ptr(s), ts);  	}  	/* @@ -294,26 +512,12 @@ u64 irq_timings_next_event(u64 now)  		irqs = this_cpu_ptr(s); -		if (!irqs->valid) -			continue; +		ts = __irq_timings_next_event(irqs, i, now); +		if (ts <= now) +			return now; -		if (irqs->next_evt <= now) { -			irq = i; -			next_evt = now; - -			/* -			 * This interrupt mustn't use in the future -			 * until new events occur and update the -			 * statistics. 
-			 */ -			irqs->valid = 0; -			break; -		} - -		if (irqs->next_evt < next_evt) { -			irq = i; -			next_evt = irqs->next_evt; -		} +		if (ts < next_evt) +			next_evt = ts;  	}  	return next_evt; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 6b7cdf17ccf8..73288914ed5e 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -56,61 +56,70 @@ void __weak arch_irq_work_raise(void)  	 */  } -/* - * Enqueue the irq_work @work on @cpu unless it's already pending - * somewhere. - * - * Can be re-enqueued while the callback is still in progress. - */ -bool irq_work_queue_on(struct irq_work *work, int cpu) +/* Enqueue on current CPU, work must already be claimed and preempt disabled */ +static void __irq_work_queue_local(struct irq_work *work)  { -	/* All work should have been flushed before going offline */ -	WARN_ON_ONCE(cpu_is_offline(cpu)); - -#ifdef CONFIG_SMP - -	/* Arch remote IPI send/receive backend aren't NMI safe */ -	WARN_ON_ONCE(in_nmi()); +	/* If the work is "lazy", handle it from next tick if any */ +	if (work->flags & IRQ_WORK_LAZY) { +		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && +		    tick_nohz_tick_stopped()) +			arch_irq_work_raise(); +	} else { +		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) +			arch_irq_work_raise(); +	} +} +/* Enqueue the irq work @work on the current CPU */ +bool irq_work_queue(struct irq_work *work) +{  	/* Only queue if not already pending */  	if (!irq_work_claim(work))  		return false; -	if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) -		arch_send_call_function_single_ipi(cpu); - -#else /* #ifdef CONFIG_SMP */ -	irq_work_queue(work); -#endif /* #else #ifdef CONFIG_SMP */ +	/* Queue the entry and raise the IPI if needed. */ +	preempt_disable(); +	__irq_work_queue_local(work); +	preempt_enable();  	return true;  } +EXPORT_SYMBOL_GPL(irq_work_queue); -/* Enqueue the irq work @work on the current CPU */ -bool irq_work_queue(struct irq_work *work) +/* + * Enqueue the irq_work @work on @cpu unless it's already pending + * somewhere. + * + * Can be re-enqueued while the callback is still in progress. + */ +bool irq_work_queue_on(struct irq_work *work, int cpu)  { +#ifndef CONFIG_SMP +	return irq_work_queue(work); + +#else /* CONFIG_SMP: */ +	/* All work should have been flushed before going offline */ +	WARN_ON_ONCE(cpu_is_offline(cpu)); +  	/* Only queue if not already pending */  	if (!irq_work_claim(work))  		return false; -	/* Queue the entry and raise the IPI if needed. 
*/  	preempt_disable(); - -	/* If the work is "lazy", handle it from next tick if any */ -	if (work->flags & IRQ_WORK_LAZY) { -		if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && -		    tick_nohz_tick_stopped()) -			arch_irq_work_raise(); +	if (cpu != smp_processor_id()) { +		/* Arch remote IPI send/receive backend aren't NMI safe */ +		WARN_ON_ONCE(in_nmi()); +		if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) +			arch_send_call_function_single_ipi(cpu);  	} else { -		if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) -			arch_irq_work_raise(); +		__irq_work_queue_local(work);  	} -  	preempt_enable();  	return true; +#endif /* CONFIG_SMP */  } -EXPORT_SYMBOL_GPL(irq_work_queue); +  bool irq_work_needs_cpu(void)  { diff --git a/kernel/jump_label.c b/kernel/jump_label.c index bad96b476eb6..de6efdecc70d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -202,11 +202,13 @@ void static_key_disable(struct static_key *key)  }  EXPORT_SYMBOL_GPL(static_key_disable); -static void __static_key_slow_dec_cpuslocked(struct static_key *key, -					   unsigned long rate_limit, -					   struct delayed_work *work) +static bool static_key_slow_try_dec(struct static_key *key)  { -	lockdep_assert_cpus_held(); +	int val; + +	val = atomic_fetch_add_unless(&key->enabled, -1, 1); +	if (val == 1) +		return false;  	/*  	 * The negative count check is valid even when a negative @@ -215,63 +217,70 @@ static void __static_key_slow_dec_cpuslocked(struct static_key *key,  	 * returns is unbalanced, because all other static_key_slow_inc()  	 * instances block while the update is in progress.  	 */ -	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { -		WARN(atomic_read(&key->enabled) < 0, -		     "jump label: negative count!\n"); +	WARN(val < 0, "jump label: negative count!\n"); +	return true; +} + +static void __static_key_slow_dec_cpuslocked(struct static_key *key) +{ +	lockdep_assert_cpus_held(); + +	if (static_key_slow_try_dec(key))  		return; -	} -	if (rate_limit) { -		atomic_inc(&key->enabled); -		schedule_delayed_work(work, rate_limit); -	} else { +	jump_label_lock(); +	if (atomic_dec_and_test(&key->enabled))  		jump_label_update(key); -	}  	jump_label_unlock();  } -static void __static_key_slow_dec(struct static_key *key, -				  unsigned long rate_limit, -				  struct delayed_work *work) +static void __static_key_slow_dec(struct static_key *key)  {  	cpus_read_lock(); -	__static_key_slow_dec_cpuslocked(key, rate_limit, work); +	__static_key_slow_dec_cpuslocked(key);  	cpus_read_unlock();  } -static void jump_label_update_timeout(struct work_struct *work) +void jump_label_update_timeout(struct work_struct *work)  {  	struct static_key_deferred *key =  		container_of(work, struct static_key_deferred, work.work); -	__static_key_slow_dec(&key->key, 0, NULL); +	__static_key_slow_dec(&key->key);  } +EXPORT_SYMBOL_GPL(jump_label_update_timeout);  void static_key_slow_dec(struct static_key *key)  {  	STATIC_KEY_CHECK_USE(key); -	__static_key_slow_dec(key, 0, NULL); +	__static_key_slow_dec(key);  }  EXPORT_SYMBOL_GPL(static_key_slow_dec);  void static_key_slow_dec_cpuslocked(struct static_key *key)  {  	STATIC_KEY_CHECK_USE(key); -	__static_key_slow_dec_cpuslocked(key, 0, NULL); +	__static_key_slow_dec_cpuslocked(key);  } -void static_key_slow_dec_deferred(struct static_key_deferred *key) +void __static_key_slow_dec_deferred(struct static_key *key, +				    struct delayed_work *work, +				    unsigned long timeout)  {  	STATIC_KEY_CHECK_USE(key); -	
__static_key_slow_dec(&key->key, key->timeout, &key->work); + +	if (static_key_slow_try_dec(key)) +		return; + +	schedule_delayed_work(work, timeout);  } -EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); +EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); -void static_key_deferred_flush(struct static_key_deferred *key) +void __static_key_deferred_flush(void *key, struct delayed_work *work)  {  	STATIC_KEY_CHECK_USE(key); -	flush_delayed_work(&key->work); +	flush_delayed_work(work);  } -EXPORT_SYMBOL_GPL(static_key_deferred_flush); +EXPORT_SYMBOL_GPL(__static_key_deferred_flush);  void jump_label_rate_limit(struct static_key_deferred *key,  		unsigned long rl) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index d7140447be75..fd5c95ff9251 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1150,7 +1150,7 @@ int kernel_kexec(void)  		error = dpm_suspend_end(PMSG_FREEZE);  		if (error)  			goto Resume_devices; -		error = disable_nonboot_cpus(); +		error = suspend_disable_secondary_cpus();  		if (error)  			goto Enable_cpus;  		local_irq_disable(); @@ -1183,7 +1183,7 @@ int kernel_kexec(void)   Enable_irqs:  		local_irq_enable();   Enable_cpus: -		enable_nonboot_cpus(); +		suspend_enable_secondary_cpus();  		dpm_resume_start(PMSG_RESTORE);   Resume_devices:  		dpm_resume_end(PMSG_RESTORE); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f1d0e00a3971..f7fb8f6a688f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -688,7 +688,6 @@ static int kexec_calculate_store_digests(struct kimage *image)  		goto out_free_desc;  	desc->tfm   = tfm; -	desc->flags = 0;  	ret = crypto_shash_init(desc);  	if (ret < 0) diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 96b4179cee6a..99a5b5f46dc5 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -120,8 +120,8 @@ account_global_scheduler_latency(struct task_struct *tsk,  				break;  			} -			/* 0 and ULONG_MAX entries mean end of backtrace: */ -			if (record == 0 || record == ULONG_MAX) +			/* 0 entry marks end of backtrace: */ +			if (!record)  				break;  		}  		if (same) { @@ -141,20 +141,6 @@ account_global_scheduler_latency(struct task_struct *tsk,  	memcpy(&latency_record[i], lat, sizeof(struct latency_record));  } -/* - * Iterator to store a backtrace into a latency record entry - */ -static inline void store_stacktrace(struct task_struct *tsk, -					struct latency_record *lat) -{ -	struct stack_trace trace; - -	memset(&trace, 0, sizeof(trace)); -	trace.max_entries = LT_BACKTRACEDEPTH; -	trace.entries = &lat->backtrace[0]; -	save_stack_trace_tsk(tsk, &trace); -} -  /**   * __account_scheduler_latency - record an occurred latency   * @tsk - the task struct of the task hitting the latency @@ -191,7 +177,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)  	lat.count = 1;  	lat.time = usecs;  	lat.max = usecs; -	store_stacktrace(tsk, &lat); + +	stack_trace_save_tsk(tsk, lat.backtrace, LT_BACKTRACEDEPTH, 0);  	raw_spin_lock_irqsave(&latency_lock, flags); @@ -210,8 +197,8 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)  				break;  			} -			/* 0 and ULONG_MAX entries mean end of backtrace: */ -			if (record == 0 || record == ULONG_MAX) +			/* 0 entry is end of backtrace */ +			if (!record)  				break;  		}  		if (same) { @@ -252,10 +239,10 @@ static int lstats_show(struct seq_file *m, void *v)  				   lr->count, lr->time, lr->max);  			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {  				unsigned long bt = lr->backtrace[q]; +  				if (!bt)  					
break; -				if (bt == ULONG_MAX) -					break; +  				seq_printf(m, " %ps", (void *)bt);  			}  			seq_puts(m, "\n"); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index eb0ee10a1981..f12c0eabd843 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -426,7 +426,13 @@ static void klp_free_object_dynamic(struct klp_object *obj)  	kfree(obj);  } -static struct klp_object *klp_alloc_object_dynamic(const char *name) +static void klp_init_func_early(struct klp_object *obj, +				struct klp_func *func); +static void klp_init_object_early(struct klp_patch *patch, +				  struct klp_object *obj); + +static struct klp_object *klp_alloc_object_dynamic(const char *name, +						   struct klp_patch *patch)  {  	struct klp_object *obj; @@ -442,7 +448,7 @@ static struct klp_object *klp_alloc_object_dynamic(const char *name)  		}  	} -	INIT_LIST_HEAD(&obj->func_list); +	klp_init_object_early(patch, obj);  	obj->dynamic = true;  	return obj; @@ -471,6 +477,7 @@ static struct klp_func *klp_alloc_func_nop(struct klp_func *old_func,  		}  	} +	klp_init_func_early(obj, func);  	/*  	 * func->new_func is same as func->old_func. These addresses are  	 * set when the object is loaded, see klp_init_object_loaded(). @@ -490,11 +497,9 @@ static int klp_add_object_nops(struct klp_patch *patch,  	obj = klp_find_object(patch, old_obj);  	if (!obj) { -		obj = klp_alloc_object_dynamic(old_obj->name); +		obj = klp_alloc_object_dynamic(old_obj->name, patch);  		if (!obj)  			return -ENOMEM; - -		list_add_tail(&obj->node, &patch->obj_list);  	}  	klp_for_each_func(old_obj, old_func) { @@ -505,8 +510,6 @@ static int klp_add_object_nops(struct klp_patch *patch,  		func = klp_alloc_func_nop(old_func, obj);  		if (!func)  			return -ENOMEM; - -		list_add_tail(&func->node, &obj->func_list);  	}  	return 0; @@ -588,13 +591,7 @@ static void __klp_free_funcs(struct klp_object *obj, bool nops_only)  			continue;  		list_del(&func->node); - -		/* Might be called from klp_init_patch() error path. */ -		if (func->kobj_added) { -			kobject_put(&func->kobj); -		} else if (func->nop) { -			klp_free_func_nop(func); -		} +		kobject_put(&func->kobj);  	}  } @@ -624,13 +621,7 @@ static void __klp_free_objects(struct klp_patch *patch, bool nops_only)  			continue;  		list_del(&obj->node); - -		/* Might be called from klp_init_patch() error path. */ -		if (obj->kobj_added) { -			kobject_put(&obj->kobj); -		} else if (obj->dynamic) { -			klp_free_object_dynamic(obj); -		} +		kobject_put(&obj->kobj);  	}  } @@ -675,10 +666,8 @@ static void klp_free_patch_finish(struct klp_patch *patch)  	 * this is called when the patch gets disabled and it  	 * cannot get enabled again.  	 */ -	if (patch->kobj_added) { -		kobject_put(&patch->kobj); -		wait_for_completion(&patch->finish); -	} +	kobject_put(&patch->kobj); +	wait_for_completion(&patch->finish);  	/* Put the module after the last access to struct klp_patch. */  	if (!patch->forced) @@ -700,8 +689,6 @@ static void klp_free_patch_work_fn(struct work_struct *work)  static int klp_init_func(struct klp_object *obj, struct klp_func *func)  { -	int ret; -  	if (!func->old_name)  		return -EINVAL; @@ -724,13 +711,9 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)  	 * object. If the user selects 0 for old_sympos, then 1 will be used  	 * since a unique symbol will be the first occurrence.  	 */ -	ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, -				   &obj->kobj, "%s,%lu", func->old_name, -				   func->old_sympos ? 
func->old_sympos : 1); -	if (!ret) -		func->kobj_added = true; - -	return ret; +	return kobject_add(&func->kobj, &obj->kobj, "%s,%lu", +			   func->old_name, +			   func->old_sympos ? func->old_sympos : 1);  }  /* Arches may override this to finish any remaining arch-specific tasks */ @@ -801,11 +784,9 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)  	klp_find_object_module(obj);  	name = klp_is_module(obj) ? obj->name : "vmlinux"; -	ret = kobject_init_and_add(&obj->kobj, &klp_ktype_object, -				   &patch->kobj, "%s", name); +	ret = kobject_add(&obj->kobj, &patch->kobj, "%s", name);  	if (ret)  		return ret; -	obj->kobj_added = true;  	klp_for_each_func(obj, func) {  		ret = klp_init_func(obj, func); @@ -819,6 +800,21 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)  	return ret;  } +static void klp_init_func_early(struct klp_object *obj, +				struct klp_func *func) +{ +	kobject_init(&func->kobj, &klp_ktype_func); +	list_add_tail(&func->node, &obj->func_list); +} + +static void klp_init_object_early(struct klp_patch *patch, +				  struct klp_object *obj) +{ +	INIT_LIST_HEAD(&obj->func_list); +	kobject_init(&obj->kobj, &klp_ktype_object); +	list_add_tail(&obj->node, &patch->obj_list); +} +  static int klp_init_patch_early(struct klp_patch *patch)  {  	struct klp_object *obj; @@ -829,7 +825,7 @@ static int klp_init_patch_early(struct klp_patch *patch)  	INIT_LIST_HEAD(&patch->list);  	INIT_LIST_HEAD(&patch->obj_list); -	patch->kobj_added = false; +	kobject_init(&patch->kobj, &klp_ktype_patch);  	patch->enabled = false;  	patch->forced = false;  	INIT_WORK(&patch->free_work, klp_free_patch_work_fn); @@ -839,13 +835,10 @@ static int klp_init_patch_early(struct klp_patch *patch)  		if (!obj->funcs)  			return -EINVAL; -		INIT_LIST_HEAD(&obj->func_list); -		obj->kobj_added = false; -		list_add_tail(&obj->node, &patch->obj_list); +		klp_init_object_early(patch, obj);  		klp_for_each_func_static(obj, func) { -			func->kobj_added = false; -			list_add_tail(&func->node, &obj->func_list); +			klp_init_func_early(obj, func);  		}  	} @@ -860,11 +853,9 @@ static int klp_init_patch(struct klp_patch *patch)  	struct klp_object *obj;  	int ret; -	ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, -				   klp_root_kobj, "%s", patch->mod->name); +	ret = kobject_add(&patch->kobj, klp_root_kobj, "%s", patch->mod->name);  	if (ret)  		return ret; -	patch->kobj_added = true;  	if (patch->replace) {  		ret = klp_add_nops(patch); @@ -926,9 +917,6 @@ static int __klp_enable_patch(struct klp_patch *patch)  	if (WARN_ON(patch->enabled))  		return -EINVAL; -	if (!patch->kobj_added) -		return -EINVAL; -  	pr_notice("enabling patch '%s'\n", patch->mod->name);  	klp_init_transition(patch, KLP_PATCHED); @@ -1003,11 +991,10 @@ int klp_enable_patch(struct klp_patch *patch)  		return -ENODEV;  	if (!klp_have_reliable_stack()) { -		pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); -		return -EOPNOTSUPP; +		pr_warn("This architecture doesn't have support for the livepatch consistency model.\n"); +		pr_warn("The livepatch transition may never complete.\n");  	} -  	mutex_lock(&klp_mutex);  	ret = klp_init_patch_early(patch); diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 9c89ae8b337a..c53370d596be 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -202,15 +202,15 @@ void klp_update_patch_state(struct task_struct *task)   * Determine whether the given stack 
trace includes any references to a   * to-be-patched or to-be-unpatched function.   */ -static int klp_check_stack_func(struct klp_func *func, -				struct stack_trace *trace) +static int klp_check_stack_func(struct klp_func *func, unsigned long *entries, +				unsigned int nr_entries)  {  	unsigned long func_addr, func_size, address;  	struct klp_ops *ops;  	int i; -	for (i = 0; i < trace->nr_entries; i++) { -		address = trace->entries[i]; +	for (i = 0; i < nr_entries; i++) { +		address = entries[i];  		if (klp_target_state == KLP_UNPATCHED) {  			 /* @@ -254,29 +254,25 @@ static int klp_check_stack_func(struct klp_func *func,  static int klp_check_stack(struct task_struct *task, char *err_buf)  {  	static unsigned long entries[MAX_STACK_ENTRIES]; -	struct stack_trace trace;  	struct klp_object *obj;  	struct klp_func *func; -	int ret; +	int ret, nr_entries; -	trace.skip = 0; -	trace.nr_entries = 0; -	trace.max_entries = MAX_STACK_ENTRIES; -	trace.entries = entries; -	ret = save_stack_trace_tsk_reliable(task, &trace); +	ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));  	WARN_ON_ONCE(ret == -ENOSYS); -	if (ret) { +	if (ret < 0) {  		snprintf(err_buf, STACK_ERR_BUF_SIZE,  			 "%s: %s:%d has an unreliable stack\n",  			 __func__, task->comm, task->pid);  		return ret;  	} +	nr_entries = ret;  	klp_for_each_object(klp_transition_patch, obj) {  		if (!obj->patched)  			continue;  		klp_for_each_func(obj, func) { -			ret = klp_check_stack_func(func, &trace); +			ret = klp_check_stack_func(func, entries, nr_entries);  			if (ret) {  				snprintf(err_buf, STACK_ERR_BUF_SIZE,  					 "%s: %s:%d is sleeping on function %s\n", diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 392c7f23af76..6fe2f333aecb 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,7 +3,7 @@  # and is generally not a function of system call inputs.  KCOV_INSTRUMENT		:= n -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o  ifdef CONFIG_FUNCTION_TRACER  CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -25,8 +25,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o -obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o -obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o  obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o  obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o  obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o +obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c new file mode 100644 index 000000000000..fa2c2f951c6b --- /dev/null +++ b/kernel/locking/lock_events.c @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
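
The stacktrace conversions above (latencytop, livepatch) all follow the same shape: the old struct stack_trace bookkeeping is replaced by the flat stack_trace_save*() helpers, which fill a plain unsigned long array and return how many entries were stored; the _reliable variant instead returns a negative errno when the unwinder cannot vouch for the result. A minimal sketch of a caller of the reliable variant, using only helpers visible in this patch (the buffer depth and the surrounding function are illustrative):

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/stacktrace.h>

static int check_task_stack(struct task_struct *task)
{
	unsigned long entries[64];	/* illustrative depth */
	int ret;

	/*
	 * >= 0: number of entries actually saved
	 *  < 0: the stack could not be unwound reliably
	 */
	ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
	if (ret < 0)
		return ret;

	stack_trace_print(entries, ret, 0);
	return 0;
}
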
+ * + * Authors: Waiman Long <waiman.long@hpe.com> + */ + +/* + * Collect locking event counts + */ +#include <linux/debugfs.h> +#include <linux/sched.h> +#include <linux/sched/clock.h> +#include <linux/fs.h> + +#include "lock_events.h" + +#undef  LOCK_EVENT +#define LOCK_EVENT(name)	[LOCKEVENT_ ## name] = #name, + +#define LOCK_EVENTS_DIR		"lock_event_counts" + +/* + * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different + * types of locks will be reported under the <debugfs>/lock_event_counts/ + * directory. See lock_events_list.h for the list of available locking + * events. + * + * Writing to the special ".reset_counts" file will reset all the above + * locking event counts. This is a very slow operation and so should not + * be done frequently. + * + * These event counts are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counts usable even in a production + * environment. + */ +static const char * const lockevent_names[lockevent_num + 1] = { + +#include "lock_events_list.h" + +	[LOCKEVENT_reset_cnts] = ".reset_counts", +}; + +/* + * Per-cpu counts + */ +DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * The lockevent_read() function can be overridden. + */ +ssize_t __weak lockevent_read(struct file *file, char __user *user_buf, +			      size_t count, loff_t *ppos) +{ +	char buf[64]; +	int cpu, id, len; +	u64 sum = 0; + +	/* +	 * Get the counter ID stored in file->f_inode->i_private +	 */ +	id = (long)file_inode(file)->i_private; + +	if (id >= lockevent_num) +		return -EBADF; + +	for_each_possible_cpu(cpu) +		sum += per_cpu(lockevents[id], cpu); +	len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum); + +	return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When idx = reset_cnts, reset all the counts. + */ +static ssize_t lockevent_write(struct file *file, const char __user *user_buf, +			   size_t count, loff_t *ppos) +{ +	int cpu; + +	/* +	 * Get the counter ID stored in file->f_inode->i_private +	 */ +	if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts) +		return count; + +	for_each_possible_cpu(cpu) { +		int i; +		unsigned long *ptr = per_cpu_ptr(lockevents, cpu); + +		for (i = 0 ; i < lockevent_num; i++) +			WRITE_ONCE(ptr[i], 0); +	} +	return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_lockevent = { +	.read = lockevent_read, +	.write = lockevent_write, +	.llseek = default_llseek, +}; + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +#include <asm/paravirt.h> + +static bool __init skip_lockevent(const char *name) +{ +	static int pv_on __initdata = -1; + +	if (pv_on < 0) +		pv_on = !pv_is_native_spin_unlock(); +	/* +	 * Skip PV qspinlock events on bare metal. +	 */ +	if (!pv_on && !memcmp(name, "pv_", 3)) +		return true; +	return false; +} +#else +static inline bool skip_lockevent(const char *name) +{ +	return false; +} +#endif + +/* + * Initialize debugfs for the locking event counts. + */ +static int __init init_lockevent_counts(void) +{ +	struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL); +	int i; + +	if (!d_counts) +		goto out; + +	/* +	 * Create the debugfs files +	 * +	 * As reading from and writing to the stat files can be slow, only +	 * root is allowed to do the read/write to limit impact to system +	 * performance. 
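
lockevent_read() above is declared __weak on purpose: this generic version only sums the raw per-cpu counts, and a lock-type-specific build can link in a strong definition that post-processes some of them (the PV qspinlock code further down does exactly that to turn latency and hop totals into averages). A minimal, self-contained sketch of the weak/strong linkage rule itself, with hypothetical file and function names:

/* default.c -- fallback, used only when no strong definition is linked in */
#include <linux/compiler.h>

int __weak answer(void)
{
	return 0;
}

/* override.c -- strong definition; the linker prefers this one when present */
int answer(void)
{
	return 42;
}
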
+	 */ +	for (i = 0; i < lockevent_num; i++) { +		if (skip_lockevent(lockevent_names[i])) +			continue; +		if (!debugfs_create_file(lockevent_names[i], 0400, d_counts, +					 (void *)(long)i, &fops_lockevent)) +			goto fail_undo; +	} + +	if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200, +				 d_counts, (void *)(long)LOCKEVENT_reset_cnts, +				 &fops_lockevent)) +		goto fail_undo; + +	return 0; +fail_undo: +	debugfs_remove_recursive(d_counts); +out: +	pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR); +	return -ENOMEM; +} +fs_initcall(init_lockevent_counts); diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h new file mode 100644 index 000000000000..feb1acc54611 --- /dev/null +++ b/kernel/locking/lock_events.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <longman@redhat.com> + */ + +#ifndef __LOCKING_LOCK_EVENTS_H +#define __LOCKING_LOCK_EVENTS_H + +enum lock_events { + +#include "lock_events_list.h" + +	lockevent_num,	/* Total number of lock event counts */ +	LOCKEVENT_reset_cnts = lockevent_num, +}; + +#ifdef CONFIG_LOCK_EVENT_COUNTS +/* + * Per-cpu counters + */ +DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); + +/* + * Increment the PV qspinlock statistical counters + */ +static inline void __lockevent_inc(enum lock_events event, bool cond) +{ +	if (cond) +		__this_cpu_inc(lockevents[event]); +} + +#define lockevent_inc(ev)	  __lockevent_inc(LOCKEVENT_ ##ev, true) +#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c) + +static inline void __lockevent_add(enum lock_events event, int inc) +{ +	__this_cpu_add(lockevents[event], inc); +} + +#define lockevent_add(ev, c)	__lockevent_add(LOCKEVENT_ ##ev, c) + +#else  /* CONFIG_LOCK_EVENT_COUNTS */ + +#define lockevent_inc(ev) +#define lockevent_add(ev, c) +#define lockevent_cond_inc(ev, c) + +#endif /* CONFIG_LOCK_EVENT_COUNTS */ +#endif /* __LOCKING_LOCK_EVENTS_H */ diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h new file mode 100644 index 000000000000..ad7668cfc9da --- /dev/null +++ b/kernel/locking/lock_events_list.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <longman@redhat.com> + */ + +#ifndef LOCK_EVENT +#define LOCK_EVENT(name)	LOCKEVENT_ ## name, +#endif + +#ifdef CONFIG_QUEUED_SPINLOCKS +#ifdef CONFIG_PARAVIRT_SPINLOCKS +/* + * Locking events for PV qspinlock. 
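
The header above gives lock implementations three counting primitives: lockevent_inc() for an unconditional count, lockevent_cond_inc() for a count guarded by a condition, and lockevent_add() for bulk increments; all of them compile away when CONFIG_LOCK_EVENT_COUNTS is off. A hedged sketch of how a wakeup path might use them -- the event names are real ones from lock_events_list.h, the surrounding function is made up:

#include "lock_events.h"

static void example_wake_path(int nr_readers, bool woke_writer)
{
	/* count every pass through this path */
	lockevent_inc(rwsem_wake_reader);

	/* count only when the condition actually held */
	lockevent_cond_inc(rwsem_wake_writer, woke_writer);

	/* add an arbitrary amount in one go (shown purely for the form) */
	lockevent_add(rwsem_sleep_reader, nr_readers);
}
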
+ */ +LOCK_EVENT(pv_hash_hops)	/* Average # of hops per hashing operation */ +LOCK_EVENT(pv_kick_unlock)	/* # of vCPU kicks issued at unlock time   */ +LOCK_EVENT(pv_kick_wake)	/* # of vCPU kicks for pv_latency_wake	   */ +LOCK_EVENT(pv_latency_kick)	/* Average latency (ns) of vCPU kick	   */ +LOCK_EVENT(pv_latency_wake)	/* Average latency (ns) of kick-to-wakeup  */ +LOCK_EVENT(pv_lock_stealing)	/* # of lock stealing operations	   */ +LOCK_EVENT(pv_spurious_wakeup)	/* # of spurious wakeups in non-head vCPUs */ +LOCK_EVENT(pv_wait_again)	/* # of wait's after queue head vCPU kick  */ +LOCK_EVENT(pv_wait_early)	/* # of early vCPU wait's		   */ +LOCK_EVENT(pv_wait_head)	/* # of vCPU wait's at the queue head	   */ +LOCK_EVENT(pv_wait_node)	/* # of vCPU wait's at non-head queue node */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +/* + * Locking events for qspinlock + * + * Subtracting lock_use_node[234] from lock_slowpath will give you + * lock_use_node1. + */ +LOCK_EVENT(lock_pending)	/* # of locking ops via pending code	     */ +LOCK_EVENT(lock_slowpath)	/* # of locking ops via MCS lock queue	     */ +LOCK_EVENT(lock_use_node2)	/* # of locking ops that use 2nd percpu node */ +LOCK_EVENT(lock_use_node3)	/* # of locking ops that use 3rd percpu node */ +LOCK_EVENT(lock_use_node4)	/* # of locking ops that use 4th percpu node */ +LOCK_EVENT(lock_no_node)	/* # of locking ops w/o using percpu node    */ +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +/* + * Locking events for rwsem + */ +LOCK_EVENT(rwsem_sleep_reader)	/* # of reader sleeps			*/ +LOCK_EVENT(rwsem_sleep_writer)	/* # of writer sleeps			*/ +LOCK_EVENT(rwsem_wake_reader)	/* # of reader wakeups			*/ +LOCK_EVENT(rwsem_wake_writer)	/* # of writer wakeups			*/ +LOCK_EVENT(rwsem_opt_wlock)	/* # of write locks opt-spin acquired	*/ +LOCK_EVENT(rwsem_opt_fail)	/* # of failed opt-spinnings		*/ +LOCK_EVENT(rwsem_rlock)		/* # of read locks acquired		*/ +LOCK_EVENT(rwsem_rlock_fast)	/* # of fast read locks acquired	*/ +LOCK_EVENT(rwsem_rlock_fail)	/* # of failed read lock acquisitions	*/ +LOCK_EVENT(rwsem_rtrylock)	/* # of read trylock calls		*/ +LOCK_EVENT(rwsem_wlock)		/* # of write locks acquired		*/ +LOCK_EVENT(rwsem_wlock_fail)	/* # of failed write lock acquisitions	*/ +LOCK_EVENT(rwsem_wtrylock)	/* # of write trylock calls		*/ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e221be724fe8..d06190fa5082 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -434,29 +434,14 @@ static void print_lockdep_off(const char *bug_msg)  #endif  } -static int save_trace(struct stack_trace *trace) +static int save_trace(struct lock_trace *trace)  { -	trace->nr_entries = 0; -	trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; -	trace->entries = stack_trace + nr_stack_trace_entries; - -	trace->skip = 3; - -	save_stack_trace(trace); - -	/* -	 * Some daft arches put -1 at the end to indicate its a full trace. 
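
The save_trace() rewrite above no longer keeps a full struct stack_trace per lock dependency; it saves the entries into lockdep's shared stack_trace[] pool and records only where they start and how many there are. The descriptor type lives in a header that is not part of this hunk, but assuming it is just an offset plus a count, reading a trace back is plain pointer arithmetic -- which is what the print_lock_trace() helper added further down does:

/* assumed layout of the descriptor filled in by save_trace() */
struct lock_trace {
	unsigned int	nr_entries;
	unsigned int	offset;		/* index into the shared stack_trace[] pool */
};

static void dump_lock_trace(const struct lock_trace *trace)
{
	unsigned long *entries = stack_trace + trace->offset;

	stack_trace_print(entries, trace->nr_entries, 2);
}
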
-	 * -	 * <rant> this is buggy anyway, since it takes a whole extra entry so a -	 * complete trace that maxes out the entries provided will be reported -	 * as incomplete, friggin useless </rant> -	 */ -	if (trace->nr_entries != 0 && -	    trace->entries[trace->nr_entries-1] == ULONG_MAX) -		trace->nr_entries--; - -	trace->max_entries = trace->nr_entries; +	unsigned long *entries = stack_trace + nr_stack_trace_entries; +	unsigned int max_entries; +	trace->offset = nr_stack_trace_entries; +	max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; +	trace->nr_entries = stack_trace_save(entries, max_entries, 3);  	nr_stack_trace_entries += trace->nr_entries;  	if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { @@ -516,11 +501,11 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)  {  	char c = '.'; -	if (class->usage_mask & lock_flag(bit + 2)) +	if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))  		c = '+';  	if (class->usage_mask & lock_flag(bit)) {  		c = '-'; -		if (class->usage_mask & lock_flag(bit + 2)) +		if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))  			c = '?';  	} @@ -649,6 +634,9 @@ static int static_obj(const void *obj)  		      end   = (unsigned long) &_end,  		      addr  = (unsigned long) obj; +	if (arch_is_kernel_initmem_freed(addr)) +		return 0; +  	/*  	 * static variable?  	 */ @@ -1207,7 +1195,7 @@ static struct lock_list *alloc_list_entry(void)  static int add_lock_to_list(struct lock_class *this,  			    struct lock_class *links_to, struct list_head *head,  			    unsigned long ip, int distance, -			    struct stack_trace *trace) +			    struct lock_trace *trace)  {  	struct lock_list *entry;  	/* @@ -1426,6 +1414,13 @@ static inline int __bfs_backwards(struct lock_list *src_entry,   * checking.   */ +static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +{ +	unsigned long *entries = stack_trace + trace->offset; + +	stack_trace_print(entries, trace->nr_entries, spaces); +} +  /*   * Print a dependency chain entry (this is only done when a deadlock   * has been detected): @@ -1438,8 +1433,7 @@ print_circular_bug_entry(struct lock_list *target, int depth)  	printk("\n-> #%u", depth);  	print_lock_name(target->class);  	printk(KERN_CONT ":\n"); -	print_stack_trace(&target->trace, 6); - +	print_lock_trace(&target->trace, 6);  	return 0;  } @@ -1533,10 +1527,9 @@ static inline int class_equal(struct lock_list *entry, void *data)  }  static noinline int print_circular_bug(struct lock_list *this, -				struct lock_list *target, -				struct held_lock *check_src, -				struct held_lock *check_tgt, -				struct stack_trace *trace) +				       struct lock_list *target, +				       struct held_lock *check_src, +				       struct held_lock *check_tgt)  {  	struct task_struct *curr = current;  	struct lock_list *parent; @@ -1676,19 +1669,25 @@ check_redundant(struct lock_list *root, struct lock_class *target,  }  #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + +static inline int usage_accumulate(struct lock_list *entry, void *mask) +{ +	*(unsigned long *)mask |= entry->class->usage_mask; + +	return 0; +} +  /*   * Forwards and backwards subgraph searching, for the purposes of   * proving that two subgraphs can be connected by a new dependency   * without creating any illegal irq-safe -> irq-unsafe lock dependency.   
*/ -static inline int usage_match(struct lock_list *entry, void *bit) +static inline int usage_match(struct lock_list *entry, void *mask)  { -	return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); +	return entry->class->usage_mask & *(unsigned long *)mask;  } - -  /*   * Find a node in the forwards-direction dependency sub-graph starting   * at @root->class that matches @bit. @@ -1700,14 +1699,14 @@ static inline int usage_match(struct lock_list *entry, void *bit)   * Return <0 on error.   */  static int -find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, +find_usage_forwards(struct lock_list *root, unsigned long usage_mask,  			struct lock_list **target_entry)  {  	int result;  	debug_atomic_inc(nr_find_usage_forwards_checks); -	result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); +	result = __bfs_forwards(root, &usage_mask, usage_match, target_entry);  	return result;  } @@ -1723,14 +1722,14 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,   * Return <0 on error.   */  static int -find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, +find_usage_backwards(struct lock_list *root, unsigned long usage_mask,  			struct lock_list **target_entry)  {  	int result;  	debug_atomic_inc(nr_find_usage_backwards_checks); -	result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); +	result = __bfs_backwards(root, &usage_mask, usage_match, target_entry);  	return result;  } @@ -1752,7 +1751,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)  			len += printk("%*s   %s", depth, "", usage_str[bit]);  			len += printk(KERN_CONT " at:\n"); -			print_stack_trace(class->usage_traces + bit, len); +			print_lock_trace(class->usage_traces + bit, len);  		}  	}  	printk("%*s }\n", depth, ""); @@ -1777,7 +1776,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,  	do {  		print_lock_class_header(entry->class, depth);  		printk("%*s ... acquired at:\n", depth, ""); -		print_stack_trace(&entry->trace, 2); +		print_lock_trace(&entry->trace, 2);  		printk("\n");  		if (depth == 0 && (entry != root)) { @@ -1890,14 +1889,14 @@ print_bad_irq_dependency(struct task_struct *curr,  	print_lock_name(backwards_entry->class);  	pr_warn("\n... which became %s-irq-safe at:\n", irqclass); -	print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); +	print_lock_trace(backwards_entry->class->usage_traces + bit1, 1);  	pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);  	print_lock_name(forwards_entry->class);  	pr_warn("\n... 
which became %s-irq-unsafe at:\n", irqclass);  	pr_warn("..."); -	print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); +	print_lock_trace(forwards_entry->class->usage_traces + bit2, 1);  	pr_warn("\nother info that might help us debug this:\n\n");  	print_irq_lock_scenario(backwards_entry, forwards_entry, @@ -1922,39 +1921,6 @@ print_bad_irq_dependency(struct task_struct *curr,  	return 0;  } -static int -check_usage(struct task_struct *curr, struct held_lock *prev, -	    struct held_lock *next, enum lock_usage_bit bit_backwards, -	    enum lock_usage_bit bit_forwards, const char *irqclass) -{ -	int ret; -	struct lock_list this, that; -	struct lock_list *uninitialized_var(target_entry); -	struct lock_list *uninitialized_var(target_entry1); - -	this.parent = NULL; - -	this.class = hlock_class(prev); -	ret = find_usage_backwards(&this, bit_backwards, &target_entry); -	if (ret < 0) -		return print_bfs_bug(ret); -	if (ret == 1) -		return ret; - -	that.parent = NULL; -	that.class = hlock_class(next); -	ret = find_usage_forwards(&that, bit_forwards, &target_entry1); -	if (ret < 0) -		return print_bfs_bug(ret); -	if (ret == 1) -		return ret; - -	return print_bad_irq_dependency(curr, &this, &that, -			target_entry, target_entry1, -			prev, next, -			bit_backwards, bit_forwards, irqclass); -} -  static const char *state_names[] = {  #define LOCKDEP_STATE(__STATE) \  	__stringify(__STATE), @@ -1971,9 +1937,19 @@ static const char *state_rnames[] = {  static inline const char *state_name(enum lock_usage_bit bit)  { -	return (bit & LOCK_USAGE_READ_MASK) ? state_rnames[bit >> 2] : state_names[bit >> 2]; +	if (bit & LOCK_USAGE_READ_MASK) +		return state_rnames[bit >> LOCK_USAGE_DIR_MASK]; +	else +		return state_names[bit >> LOCK_USAGE_DIR_MASK];  } +/* + * The bit number is encoded like: + * + *  bit0: 0 exclusive, 1 read lock + *  bit1: 0 used in irq, 1 irq enabled + *  bit2-n: state + */  static int exclusive_bit(int new_bit)  {  	int state = new_bit & LOCK_USAGE_STATE_MASK; @@ -1985,45 +1961,160 @@ static int exclusive_bit(int new_bit)  	return state | (dir ^ LOCK_USAGE_DIR_MASK);  } +/* + * Observe that when given a bitmask where each bitnr is encoded as above, a + * right shift of the mask transforms the individual bitnrs as -1 and + * conversely, a left shift transforms into +1 for the individual bitnrs. + * + * So for all bits whose number have LOCK_ENABLED_* set (bitnr1 == 1), we can + * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0) + * instead by subtracting the bit number by 2, or shifting the mask right by 2. + * + * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2. + * + * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is + * all bits set) and recompose with bitnr1 flipped. + */ +static unsigned long invert_dir_mask(unsigned long mask) +{ +	unsigned long excl = 0; + +	/* Invert dir */ +	excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK; +	excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK; + +	return excl; +} + +/* + * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all + * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). + * And then mask out all bitnr0. 
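
Given that encoding, flipping between the irq-safe and irq-unsafe direction of the same state is just an XOR on bit 1 once the read bit has been masked off. A worked example, assuming the usual generated enum layout in which the HARDIRQ bits occupy values 0-3 (LOCK_USED_IN_HARDIRQ = 0, LOCK_USED_IN_HARDIRQ_READ = 1, LOCK_ENABLED_HARDIRQ = 2, LOCK_ENABLED_HARDIRQ_READ = 3) and LOCK_USAGE_READ_MASK/LOCK_USAGE_DIR_MASK are 1 and 2:

/*
 * exclusive_bit(LOCK_USED_IN_HARDIRQ_READ):
 *	state = 1 & ~3 = 0
 *	dir   = 1 & 2  = 0
 *	result = 0 | (0 ^ 2) = 2 = LOCK_ENABLED_HARDIRQ
 *
 * exclusive_bit(LOCK_ENABLED_SOFTIRQ):		(SOFTIRQ bits assumed at 4-7)
 *	state = 6 & ~3 = 4
 *	dir   = 6 & 2  = 2
 *	result = 4 | (2 ^ 2) = 4 = LOCK_USED_IN_SOFTIRQ
 *
 * So "used in hardirq as a reader" pairs off against "taken with hardirqs
 * enabled", and "taken with softirqs enabled" pairs off against "used in
 * softirq" -- exactly the combinations the irq-safety check must reject.
 * The read bit is dropped from the result: the exclusive counterpart is
 * always the write variant.
 */
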
+ */ +static unsigned long exclusive_mask(unsigned long mask) +{ +	unsigned long excl = invert_dir_mask(mask); + +	/* Strip read */ +	excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK; +	excl &= ~LOCKF_IRQ_READ; + +	return excl; +} + +/* + * Retrieve the _possible_ original mask to which @mask is + * exclusive. Ie: this is the opposite of exclusive_mask(). + * Note that 2 possible original bits can match an exclusive + * bit: one has LOCK_USAGE_READ_MASK set, the other has it + * cleared. So both are returned for each exclusive bit. + */ +static unsigned long original_mask(unsigned long mask) +{ +	unsigned long excl = invert_dir_mask(mask); + +	/* Include read in existing usages */ +	excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK; + +	return excl; +} + +/* + * Find the first pair of bit match between an original + * usage mask and an exclusive usage mask. + */ +static int find_exclusive_match(unsigned long mask, +				unsigned long excl_mask, +				enum lock_usage_bit *bitp, +				enum lock_usage_bit *excl_bitp) +{ +	int bit, excl; + +	for_each_set_bit(bit, &mask, LOCK_USED) { +		excl = exclusive_bit(bit); +		if (excl_mask & lock_flag(excl)) { +			*bitp = bit; +			*excl_bitp = excl; +			return 0; +		} +	} +	return -1; +} + +/* + * Prove that the new dependency does not connect a hardirq-safe(-read) + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at <prev>, and the + * forwards-subgraph starting at <next>: + */  static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, -			   struct held_lock *next, enum lock_usage_bit bit) +			   struct held_lock *next)  { +	unsigned long usage_mask = 0, forward_mask, backward_mask; +	enum lock_usage_bit forward_bit = 0, backward_bit = 0; +	struct lock_list *uninitialized_var(target_entry1); +	struct lock_list *uninitialized_var(target_entry); +	struct lock_list this, that; +	int ret; +  	/* -	 * Prove that the new dependency does not connect a hardirq-safe -	 * lock with a hardirq-unsafe lock - to achieve this we search -	 * the backwards-subgraph starting at <prev>, and the -	 * forwards-subgraph starting at <next>: +	 * Step 1: gather all hard/soft IRQs usages backward in an +	 * accumulated usage mask.  	 */ -	if (!check_usage(curr, prev, next, bit, -			   exclusive_bit(bit), state_name(bit))) -		return 0; +	this.parent = NULL; +	this.class = hlock_class(prev); -	bit++; /* _READ */ +	ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); +	if (ret < 0) +		return print_bfs_bug(ret); + +	usage_mask &= LOCKF_USED_IN_IRQ_ALL; +	if (!usage_mask) +		return 1;  	/* -	 * Prove that the new dependency does not connect a hardirq-safe-read -	 * lock with a hardirq-unsafe lock - to achieve this we search -	 * the backwards-subgraph starting at <prev>, and the -	 * forwards-subgraph starting at <next>: +	 * Step 2: find exclusive uses forward that match the previous +	 * backward accumulated mask.  	 
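
The same flip works on whole usage masks, which is what lets check_irq_usage() do one backward walk and one forward walk instead of one pair of walks per state bit. A worked example with the per-bit flags taken as 1 << <usage bit> (the __LOCKF() convention) and the bit positions assumed above:

/*
 * usage_mask        = LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ_READ
 *                   = (1 << 0) | (1 << 5)
 *
 * invert_dir_mask() shifts the USED_IN bits up toward ENABLED:
 *                   -> (1 << 2) | (1 << 7)
 *                   =  LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ_READ
 *
 * exclusive_mask() then folds the READ bits onto their write variants and
 * clears them:
 *                   -> (1 << 2) | (1 << 6)
 *                   =  LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ
 *
 * original_mask() is the reverse mapping but keeps both the read and the
 * write variant, since either one could have produced the match found on
 * the forward search.
 */
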
*/ -	if (!check_usage(curr, prev, next, bit, -			   exclusive_bit(bit), state_name(bit))) -		return 0; +	forward_mask = exclusive_mask(usage_mask); -	return 1; -} +	that.parent = NULL; +	that.class = hlock_class(next); -static int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, -		struct held_lock *next) -{ -#define LOCKDEP_STATE(__STATE)						\ -	if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE))	\ -		return 0; -#include "lockdep_states.h" -#undef LOCKDEP_STATE +	ret = find_usage_forwards(&that, forward_mask, &target_entry1); +	if (ret < 0) +		return print_bfs_bug(ret); +	if (ret == 1) +		return ret; -	return 1; +	/* +	 * Step 3: we found a bad match! Now retrieve a lock from the backward +	 * list whose usage mask matches the exclusive usage mask from the +	 * lock found on the forward list. +	 */ +	backward_mask = original_mask(target_entry1->class->usage_mask); + +	ret = find_usage_backwards(&this, backward_mask, &target_entry); +	if (ret < 0) +		return print_bfs_bug(ret); +	if (DEBUG_LOCKS_WARN_ON(ret == 1)) +		return 1; + +	/* +	 * Step 4: narrow down to a pair of incompatible usage bits +	 * and report it. +	 */ +	ret = find_exclusive_match(target_entry->class->usage_mask, +				   target_entry1->class->usage_mask, +				   &backward_bit, &forward_bit); +	if (DEBUG_LOCKS_WARN_ON(ret == -1)) +		return 1; + +	return print_bad_irq_dependency(curr, &this, &that, +			target_entry, target_entry1, +			prev, next, +			backward_bit, forward_bit, +			state_name(backward_bit));  }  static void inc_chains(void) @@ -2040,9 +2131,8 @@ static void inc_chains(void)  #else -static inline int -check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, -		struct held_lock *next) +static inline int check_irq_usage(struct task_struct *curr, +				  struct held_lock *prev, struct held_lock *next)  {  	return 1;  } @@ -2170,8 +2260,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,   */  static int  check_prev_add(struct task_struct *curr, struct held_lock *prev, -	       struct held_lock *next, int distance, struct stack_trace *trace, -	       int (*save)(struct stack_trace *trace)) +	       struct held_lock *next, int distance, struct lock_trace *trace)  {  	struct lock_list *uninitialized_var(target_entry);  	struct lock_list *entry; @@ -2209,20 +2298,20 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  	this.parent = NULL;  	ret = check_noncircular(&this, hlock_class(prev), &target_entry);  	if (unlikely(!ret)) { -		if (!trace->entries) { +		if (!trace->nr_entries) {  			/* -			 * If @save fails here, the printing might trigger -			 * a WARN but because of the !nr_entries it should -			 * not do bad things. +			 * If save_trace fails here, the printing might +			 * trigger a WARN but because of the !nr_entries it +			 * should not do bad things.  			 
*/ -			save(trace); +			save_trace(trace);  		} -		return print_circular_bug(&this, target_entry, next, prev, trace); +		return print_circular_bug(&this, target_entry, next, prev);  	}  	else if (unlikely(ret < 0))  		return print_bfs_bug(ret); -	if (!check_prev_add_irq(curr, prev, next)) +	if (!check_irq_usage(curr, prev, next))  		return 0;  	/* @@ -2265,7 +2354,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  		return print_bfs_bug(ret); -	if (!trace->entries && !save(trace)) +	if (!trace->nr_entries && !save_trace(trace))  		return 0;  	/* @@ -2297,14 +2386,9 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,  static int  check_prevs_add(struct task_struct *curr, struct held_lock *next)  { +	struct lock_trace trace = { .nr_entries = 0 };  	int depth = curr->lockdep_depth;  	struct held_lock *hlock; -	struct stack_trace trace = { -		.nr_entries = 0, -		.max_entries = 0, -		.entries = NULL, -		.skip = 0, -	};  	/*  	 * Debugging checks. @@ -2330,7 +2414,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)  		 * added:  		 */  		if (hlock->read != 2 && hlock->check) { -			int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); +			int ret = check_prev_add(curr, hlock, next, distance, +						 &trace);  			if (!ret)  				return 0; @@ -2731,6 +2816,10 @@ static inline int validate_chain(struct task_struct *curr,  {  	return 1;  } + +static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) +{ +}  #endif  /* @@ -2784,6 +2873,12 @@ static void check_chain_key(struct task_struct *curr)  #endif  } +static int mark_lock(struct task_struct *curr, struct held_lock *this, +		     enum lock_usage_bit new_bit); + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) + +  static void  print_usage_bug_scenario(struct held_lock *lock)  { @@ -2827,7 +2922,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,  	print_lock(this);  	pr_warn("{%s} state was registered at:\n", usage_str[prev_bit]); -	print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1); +	print_lock_trace(hlock_class(this)->usage_traces + prev_bit, 1);  	print_irqtrace_events(curr);  	pr_warn("\nother info that might help us debug this:\n"); @@ -2853,10 +2948,6 @@ valid_state(struct task_struct *curr, struct held_lock *this,  	return 1;  } -static int mark_lock(struct task_struct *curr, struct held_lock *this, -		     enum lock_usage_bit new_bit); - -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)  /*   * print irq inversion bug: @@ -2936,7 +3027,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,  	root.parent = NULL;  	root.class = hlock_class(this); -	ret = find_usage_forwards(&root, bit, &target_entry); +	ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);  	if (ret < 0)  		return print_bfs_bug(ret);  	if (ret == 1) @@ -2960,7 +3051,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,  	root.parent = NULL;  	root.class = hlock_class(this); -	ret = find_usage_backwards(&root, bit, &target_entry); +	ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);  	if (ret < 0)  		return print_bfs_bug(ret);  	if (ret == 1) @@ -3015,7 +3106,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = {  static inline int state_verbose(enum lock_usage_bit bit,  				struct lock_class *class)  { -	return state_verbose_f[bit >> 2](class); +	return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class);  }  typedef int 
(*check_usage_f)(struct task_struct *, struct held_lock *, @@ -3157,7 +3248,7 @@ void lockdep_hardirqs_on(unsigned long ip)  	/*  	 * See the fine text that goes along with this variable definition.  	 */ -	if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) +	if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled))  		return;  	/* diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index d4c197425f68..150ec3f0c5b5 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -42,13 +42,35 @@ enum {  	__LOCKF(USED)  }; -#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) -#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) +#define LOCKDEP_STATE(__STATE)	LOCKF_ENABLED_##__STATE | +static const unsigned long LOCKF_ENABLED_IRQ = +#include "lockdep_states.h" +	0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE)	LOCKF_USED_IN_##__STATE | +static const unsigned long LOCKF_USED_IN_IRQ = +#include "lockdep_states.h" +	0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE)	LOCKF_ENABLED_##__STATE##_READ | +static const unsigned long LOCKF_ENABLED_IRQ_READ = +#include "lockdep_states.h" +	0; +#undef LOCKDEP_STATE + +#define LOCKDEP_STATE(__STATE)	LOCKF_USED_IN_##__STATE##_READ | +static const unsigned long LOCKF_USED_IN_IRQ_READ = +#include "lockdep_states.h" +	0; +#undef LOCKDEP_STATE + +#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ) +#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ) -#define LOCKF_ENABLED_IRQ_READ \ -		(LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) -#define LOCKF_USED_IN_IRQ_READ \ -		(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) +#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ) +#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ)  /*   * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index ad40a2617063..80a463d31a8d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -829,7 +829,9 @@ static void lock_torture_cleanup(void)  						"End of test: SUCCESS");  	kfree(cxt.lwsa); +	cxt.lwsa = NULL;  	kfree(cxt.lrsa); +	cxt.lrsa = NULL;  end:  	torture_cleanup_end(); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 883cf1b92d90..f17dad99eec8 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -7,6 +7,8 @@  #include <linux/sched.h>  #include <linux/errno.h> +#include "rwsem.h" +  int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,  			const char *name, struct lock_class_key *rwsem_key)  { diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 5e9247dc2515..e14b32c69639 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -395,7 +395,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)  	 * 0,1,0 -> 0,0,1  	 */  	clear_pending_set_locked(lock); -	qstat_inc(qstat_lock_pending, true); +	lockevent_inc(lock_pending);  	return;  	/* @@ -403,7 +403,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)  	 * queuing.  	 */  queue: -	qstat_inc(qstat_lock_slowpath, true); +	lockevent_inc(lock_slowpath);  pv_queue:  	node = this_cpu_ptr(&qnodes[0].mcs);  	idx = node->count++; @@ -419,7 +419,7 @@ pv_queue:  	 * simple enough.  	 
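
The lockdep_internals.h hunk above replaces the hand-written LOCKF_*_IRQ aggregates with ones generated by re-including lockdep_states.h under a freshly redefined LOCKDEP_STATE() macro, so new states are picked up automatically. Assuming the state list currently contains HARDIRQ and SOFTIRQ, the first of those blocks expands to roughly the following (the trailing literal 0 is what terminates the "x |" chain emitted by the macro):

/* preprocessor expansion sketch, assuming lockdep_states.h lists
 * HARDIRQ and SOFTIRQ */
static const unsigned long LOCKF_ENABLED_IRQ =
	LOCKF_ENABLED_HARDIRQ |
	LOCKF_ENABLED_SOFTIRQ |
	0;
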
*/  	if (unlikely(idx >= MAX_NODES)) { -		qstat_inc(qstat_lock_no_node, true); +		lockevent_inc(lock_no_node);  		while (!queued_spin_trylock(lock))  			cpu_relax();  		goto release; @@ -430,7 +430,7 @@ pv_queue:  	/*  	 * Keep counts of non-zero index values:  	 */ -	qstat_inc(qstat_lock_use_node2 + idx - 1, idx); +	lockevent_cond_inc(lock_use_node2 + idx - 1, idx);  	/*  	 * Ensure that we increment the head node->count before initialising diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 8f36c27c1794..89bab079e7a4 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -89,7 +89,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)  		if (!(val & _Q_LOCKED_PENDING_MASK) &&  		   (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { -			qstat_inc(qstat_pv_lock_stealing, true); +			lockevent_inc(pv_lock_stealing);  			return true;  		}  		if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) @@ -219,7 +219,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)  		hopcnt++;  		if (!cmpxchg(&he->lock, NULL, lock)) {  			WRITE_ONCE(he->node, node); -			qstat_hop(hopcnt); +			lockevent_pv_hop(hopcnt);  			return &he->lock;  		}  	} @@ -320,8 +320,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)  		smp_store_mb(pn->state, vcpu_halted);  		if (!READ_ONCE(node->locked)) { -			qstat_inc(qstat_pv_wait_node, true); -			qstat_inc(qstat_pv_wait_early, wait_early); +			lockevent_inc(pv_wait_node); +			lockevent_cond_inc(pv_wait_early, wait_early);  			pv_wait(&pn->state, vcpu_halted);  		} @@ -339,7 +339,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)  		 * So it is better to spin for a while in the hope that the  		 * MCS lock will be released soon.  		 */ -		qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); +		lockevent_cond_inc(pv_spurious_wakeup, +				  !READ_ONCE(node->locked));  	}  	/* @@ -416,7 +417,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)  	/*  	 * Tracking # of slowpath locking operations  	 */ -	qstat_inc(qstat_lock_slowpath, true); +	lockevent_inc(lock_slowpath);  	for (;; waitcnt++) {  		/* @@ -464,8 +465,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)  			}  		}  		WRITE_ONCE(pn->state, vcpu_hashed); -		qstat_inc(qstat_pv_wait_head, true); -		qstat_inc(qstat_pv_wait_again, waitcnt); +		lockevent_inc(pv_wait_head); +		lockevent_cond_inc(pv_wait_again, waitcnt);  		pv_wait(&lock->locked, _Q_SLOW_VAL);  		/* @@ -528,7 +529,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)  	 * vCPU is harmless other than the additional latency in completing  	 * the unlock.  	 */ -	qstat_inc(qstat_pv_kick_unlock, true); +	lockevent_inc(pv_kick_unlock);  	pv_kick(node->cpu);  } diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index d73f85388d5c..54152670ff24 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h @@ -9,262 +9,105 @@   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the   * GNU General Public License for more details.   
* - * Authors: Waiman Long <waiman.long@hpe.com> + * Authors: Waiman Long <longman@redhat.com>   */ -/* - * When queued spinlock statistical counters are enabled, the following - * debugfs files will be created for reporting the counter values: - * - * <debugfs>/qlockstat/ - *   pv_hash_hops	- average # of hops per hashing operation - *   pv_kick_unlock	- # of vCPU kicks issued at unlock time - *   pv_kick_wake	- # of vCPU kicks used for computing pv_latency_wake - *   pv_latency_kick	- average latency (ns) of vCPU kick operation - *   pv_latency_wake	- average latency (ns) from vCPU kick to wakeup - *   pv_lock_stealing	- # of lock stealing operations - *   pv_spurious_wakeup	- # of spurious wakeups in non-head vCPUs - *   pv_wait_again	- # of wait's after a queue head vCPU kick - *   pv_wait_early	- # of early vCPU wait's - *   pv_wait_head	- # of vCPU wait's at the queue head - *   pv_wait_node	- # of vCPU wait's at a non-head queue node - *   lock_pending	- # of locking operations via pending code - *   lock_slowpath	- # of locking operations via MCS lock queue - *   lock_use_node2	- # of locking operations that use 2nd per-CPU node - *   lock_use_node3	- # of locking operations that use 3rd per-CPU node - *   lock_use_node4	- # of locking operations that use 4th per-CPU node - *   lock_no_node	- # of locking operations without using per-CPU node - * - * Subtracting lock_use_node[234] from lock_slowpath will give you - * lock_use_node1. - * - * Writing to the "reset_counters" file will reset all the above counter - * values. - * - * These statistical counters are implemented as per-cpu variables which are - * summed and computed whenever the corresponding debugfs files are read. This - * minimizes added overhead making the counters usable even in a production - * environment. - * - * There may be slight difference between pv_kick_wake and pv_kick_unlock. 
- */ -enum qlock_stats { -	qstat_pv_hash_hops, -	qstat_pv_kick_unlock, -	qstat_pv_kick_wake, -	qstat_pv_latency_kick, -	qstat_pv_latency_wake, -	qstat_pv_lock_stealing, -	qstat_pv_spurious_wakeup, -	qstat_pv_wait_again, -	qstat_pv_wait_early, -	qstat_pv_wait_head, -	qstat_pv_wait_node, -	qstat_lock_pending, -	qstat_lock_slowpath, -	qstat_lock_use_node2, -	qstat_lock_use_node3, -	qstat_lock_use_node4, -	qstat_lock_no_node, -	qstat_num,	/* Total number of statistical counters */ -	qstat_reset_cnts = qstat_num, -}; +#include "lock_events.h" -#ifdef CONFIG_QUEUED_LOCK_STAT +#ifdef CONFIG_LOCK_EVENT_COUNTS +#ifdef CONFIG_PARAVIRT_SPINLOCKS  /* - * Collect pvqspinlock statistics + * Collect pvqspinlock locking event counts   */ -#include <linux/debugfs.h>  #include <linux/sched.h>  #include <linux/sched/clock.h>  #include <linux/fs.h> -static const char * const qstat_names[qstat_num + 1] = { -	[qstat_pv_hash_hops]	   = "pv_hash_hops", -	[qstat_pv_kick_unlock]     = "pv_kick_unlock", -	[qstat_pv_kick_wake]       = "pv_kick_wake", -	[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", -	[qstat_pv_latency_kick]	   = "pv_latency_kick", -	[qstat_pv_latency_wake]    = "pv_latency_wake", -	[qstat_pv_lock_stealing]   = "pv_lock_stealing", -	[qstat_pv_wait_again]      = "pv_wait_again", -	[qstat_pv_wait_early]      = "pv_wait_early", -	[qstat_pv_wait_head]       = "pv_wait_head", -	[qstat_pv_wait_node]       = "pv_wait_node", -	[qstat_lock_pending]       = "lock_pending", -	[qstat_lock_slowpath]      = "lock_slowpath", -	[qstat_lock_use_node2]	   = "lock_use_node2", -	[qstat_lock_use_node3]	   = "lock_use_node3", -	[qstat_lock_use_node4]	   = "lock_use_node4", -	[qstat_lock_no_node]	   = "lock_no_node", -	[qstat_reset_cnts]         = "reset_counters", -}; +#define EVENT_COUNT(ev)	lockevents[LOCKEVENT_ ## ev]  /* - * Per-cpu counters + * PV specific per-cpu counter   */ -static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);  static DEFINE_PER_CPU(u64, pv_kick_time);  /* - * Function to read and return the qlock statistical counter values + * Function to read and return the PV qspinlock counts.   *   * The following counters are handled specially: - * 1. qstat_pv_latency_kick + * 1. pv_latency_kick   *    Average kick latency (ns) = pv_latency_kick/pv_kick_unlock - * 2. qstat_pv_latency_wake + * 2. pv_latency_wake   *    Average wake latency (ns) = pv_latency_wake/pv_kick_wake - * 3. qstat_pv_hash_hops + * 3. 
pv_hash_hops   *    Average hops/hash = pv_hash_hops/pv_kick_unlock   */ -static ssize_t qstat_read(struct file *file, char __user *user_buf, -			  size_t count, loff_t *ppos) +ssize_t lockevent_read(struct file *file, char __user *user_buf, +		       size_t count, loff_t *ppos)  {  	char buf[64]; -	int cpu, counter, len; -	u64 stat = 0, kicks = 0; +	int cpu, id, len; +	u64 sum = 0, kicks = 0;  	/*  	 * Get the counter ID stored in file->f_inode->i_private  	 */ -	counter = (long)file_inode(file)->i_private; +	id = (long)file_inode(file)->i_private; -	if (counter >= qstat_num) +	if (id >= lockevent_num)  		return -EBADF;  	for_each_possible_cpu(cpu) { -		stat += per_cpu(qstats[counter], cpu); +		sum += per_cpu(lockevents[id], cpu);  		/* -		 * Need to sum additional counter for some of them +		 * Need to sum additional counters for some of them  		 */ -		switch (counter) { +		switch (id) { -		case qstat_pv_latency_kick: -		case qstat_pv_hash_hops: -			kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); +		case LOCKEVENT_pv_latency_kick: +		case LOCKEVENT_pv_hash_hops: +			kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu);  			break; -		case qstat_pv_latency_wake: -			kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); +		case LOCKEVENT_pv_latency_wake: +			kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu);  			break;  		}  	} -	if (counter == qstat_pv_hash_hops) { +	if (id == LOCKEVENT_pv_hash_hops) {  		u64 frac = 0;  		if (kicks) { -			frac = 100ULL * do_div(stat, kicks); +			frac = 100ULL * do_div(sum, kicks);  			frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);  		}  		/*  		 * Return a X.XX decimal number  		 */ -		len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); +		len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", +			       sum, frac);  	} else {  		/*  		 * Round to the nearest ns  		 */ -		if ((counter == qstat_pv_latency_kick) || -		    (counter == qstat_pv_latency_wake)) { +		if ((id == LOCKEVENT_pv_latency_kick) || +		    (id == LOCKEVENT_pv_latency_wake)) {  			if (kicks) -				stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); +				sum = DIV_ROUND_CLOSEST_ULL(sum, kicks);  		} -		len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); +		len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);  	}  	return simple_read_from_buffer(user_buf, count, ppos, buf, len);  }  /* - * Function to handle write request - * - * When counter = reset_cnts, reset all the counter values. - * Since the counter updates aren't atomic, the resetting is done twice - * to make sure that the counters are very likely to be all cleared. 
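
For pv_hash_hops the per-cpu counters accumulate total hops, so the read side above divides by the number of unlock-time kicks and prints a two-decimal average. A worked example of that do_div() arithmetic with made-up totals:

/*
 * Suppose sum = 257 total hops and kicks = 100 unlock kicks.
 *
 *	do_div(sum, kicks)	-> sum becomes 2 (quotient), returns 57 (remainder)
 *	frac = 100ULL * 57 = 5700
 *	DIV_ROUND_CLOSEST_ULL(5700, 100) = 57
 *
 * and "%llu.%02llu" prints "2.57": on average each hashing operation
 * needed between two and three hops.
 */
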
- */ -static ssize_t qstat_write(struct file *file, const char __user *user_buf, -			   size_t count, loff_t *ppos) -{ -	int cpu; - -	/* -	 * Get the counter ID stored in file->f_inode->i_private -	 */ -	if ((long)file_inode(file)->i_private != qstat_reset_cnts) -		return count; - -	for_each_possible_cpu(cpu) { -		int i; -		unsigned long *ptr = per_cpu_ptr(qstats, cpu); - -		for (i = 0 ; i < qstat_num; i++) -			WRITE_ONCE(ptr[i], 0); -	} -	return count; -} - -/* - * Debugfs data structures - */ -static const struct file_operations fops_qstat = { -	.read = qstat_read, -	.write = qstat_write, -	.llseek = default_llseek, -}; - -/* - * Initialize debugfs for the qspinlock statistical counters - */ -static int __init init_qspinlock_stat(void) -{ -	struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); -	int i; - -	if (!d_qstat) -		goto out; - -	/* -	 * Create the debugfs files -	 * -	 * As reading from and writing to the stat files can be slow, only -	 * root is allowed to do the read/write to limit impact to system -	 * performance. -	 */ -	for (i = 0; i < qstat_num; i++) -		if (!debugfs_create_file(qstat_names[i], 0400, d_qstat, -					 (void *)(long)i, &fops_qstat)) -			goto fail_undo; - -	if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, -				 (void *)(long)qstat_reset_cnts, &fops_qstat)) -		goto fail_undo; - -	return 0; -fail_undo: -	debugfs_remove_recursive(d_qstat); -out: -	pr_warn("Could not create 'qlockstat' debugfs entries\n"); -	return -ENOMEM; -} -fs_initcall(init_qspinlock_stat); - -/* - * Increment the PV qspinlock statistical counters - */ -static inline void qstat_inc(enum qlock_stats stat, bool cond) -{ -	if (cond) -		this_cpu_inc(qstats[stat]); -} - -/*   * PV hash hop count   */ -static inline void qstat_hop(int hopcnt) +static inline void lockevent_pv_hop(int hopcnt)  { -	this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); +	this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt);  }  /* @@ -276,7 +119,7 @@ static inline void __pv_kick(int cpu)  	per_cpu(pv_kick_time, cpu) = start;  	pv_kick(cpu); -	this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); +	this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start);  }  /* @@ -289,18 +132,19 @@ static inline void __pv_wait(u8 *ptr, u8 val)  	*pkick_time = 0;  	pv_wait(ptr, val);  	if (*pkick_time) { -		this_cpu_add(qstats[qstat_pv_latency_wake], +		this_cpu_add(EVENT_COUNT(pv_latency_wake),  			     sched_clock() - *pkick_time); -		qstat_inc(qstat_pv_kick_wake, true); +		lockevent_inc(pv_kick_wake);  	}  }  #define pv_kick(c)	__pv_kick(c)  #define pv_wait(p, v)	__pv_wait(p, v) -#else /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#else /* CONFIG_LOCK_EVENT_COUNTS */ -static inline void qstat_inc(enum qlock_stats stat, bool cond)	{ } -static inline void qstat_hop(int hopcnt)			{ } +static inline void lockevent_pv_hop(int hopcnt)	{ } -#endif /* CONFIG_QUEUED_LOCK_STAT */ +#endif /* CONFIG_LOCK_EVENT_COUNTS */ diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c deleted file mode 100644 index a7ffb2a96ede..000000000000 --- a/kernel/locking/rwsem-spinlock.c +++ /dev/null @@ -1,339 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* rwsem-spinlock.c: R/W semaphores: contention handling functions for - * generic spinlock implementation - * - * Copyright (c) 2001   David Howells (dhowells@redhat.com). 
- * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> - * - Derived also from comments by Linus - */ -#include <linux/rwsem.h> -#include <linux/sched/signal.h> -#include <linux/sched/debug.h> -#include <linux/export.h> - -enum rwsem_waiter_type { -	RWSEM_WAITING_FOR_WRITE, -	RWSEM_WAITING_FOR_READ -}; - -struct rwsem_waiter { -	struct list_head list; -	struct task_struct *task; -	enum rwsem_waiter_type type; -}; - -int rwsem_is_locked(struct rw_semaphore *sem) -{ -	int ret = 1; -	unsigned long flags; - -	if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { -		ret = (sem->count != 0); -		raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -	} -	return ret; -} -EXPORT_SYMBOL(rwsem_is_locked); - -/* - * initialise the semaphore - */ -void __init_rwsem(struct rw_semaphore *sem, const char *name, -		  struct lock_class_key *key) -{ -#ifdef CONFIG_DEBUG_LOCK_ALLOC -	/* -	 * Make sure we are not reinitializing a held semaphore: -	 */ -	debug_check_no_locks_freed((void *)sem, sizeof(*sem)); -	lockdep_init_map(&sem->dep_map, name, key, 0); -#endif -	sem->count = 0; -	raw_spin_lock_init(&sem->wait_lock); -	INIT_LIST_HEAD(&sem->wait_list); -} -EXPORT_SYMBOL(__init_rwsem); - -/* - * handle the lock release when processes blocked on it that can now run - * - if we come here, then: - *   - the 'active count' _reached_ zero - *   - the 'waiting count' is non-zero - * - the spinlock must be held by the caller - * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if wakewrite is non-zero - */ -static inline struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) -{ -	struct rwsem_waiter *waiter; -	struct task_struct *tsk; -	int woken; - -	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); - -	if (waiter->type == RWSEM_WAITING_FOR_WRITE) { -		if (wakewrite) -			/* Wake up a writer. Note that we do not grant it the -			 * lock - it will have to acquire it when it runs. */ -			wake_up_process(waiter->task); -		goto out; -	} - -	/* grant an infinite number of read locks to the front of the queue */ -	woken = 0; -	do { -		struct list_head *next = waiter->list.next; - -		list_del(&waiter->list); -		tsk = waiter->task; -		/* -		 * Make sure we do not wakeup the next reader before -		 * setting the nil condition to grant the next reader; -		 * otherwise we could miss the wakeup on the other -		 * side and end up sleeping again. See the pairing -		 * in rwsem_down_read_failed(). 
-		 */ -		smp_mb(); -		waiter->task = NULL; -		wake_up_process(tsk); -		put_task_struct(tsk); -		woken++; -		if (next == &sem->wait_list) -			break; -		waiter = list_entry(next, struct rwsem_waiter, list); -	} while (waiter->type != RWSEM_WAITING_FOR_WRITE); - -	sem->count += woken; - - out: -	return sem; -} - -/* - * wake a single writer - */ -static inline struct rw_semaphore * -__rwsem_wake_one_writer(struct rw_semaphore *sem) -{ -	struct rwsem_waiter *waiter; - -	waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); -	wake_up_process(waiter->task); - -	return sem; -} - -/* - * get a read lock on the semaphore - */ -int __sched __down_read_common(struct rw_semaphore *sem, int state) -{ -	struct rwsem_waiter waiter; -	unsigned long flags; - -	raw_spin_lock_irqsave(&sem->wait_lock, flags); - -	if (sem->count >= 0 && list_empty(&sem->wait_list)) { -		/* granted */ -		sem->count++; -		raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -		goto out; -	} - -	/* set up my own style of waitqueue */ -	waiter.task = current; -	waiter.type = RWSEM_WAITING_FOR_READ; -	get_task_struct(current); - -	list_add_tail(&waiter.list, &sem->wait_list); - -	/* wait to be given the lock */ -	for (;;) { -		if (!waiter.task) -			break; -		if (signal_pending_state(state, current)) -			goto out_nolock; -		set_current_state(state); -		raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -		schedule(); -		raw_spin_lock_irqsave(&sem->wait_lock, flags); -	} - -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - out: -	return 0; - -out_nolock: -	/* -	 * We didn't take the lock, so that there is a writer, which -	 * is owner or the first waiter of the sem. If it's a waiter, -	 * it will be woken by current owner. Not need to wake anybody. -	 */ -	list_del(&waiter.list); -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -	return -EINTR; -} - -void __sched __down_read(struct rw_semaphore *sem) -{ -	__down_read_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_read_killable(struct rw_semaphore *sem) -{ -	return __down_read_common(sem, TASK_KILLABLE); -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -int __down_read_trylock(struct rw_semaphore *sem) -{ -	unsigned long flags; -	int ret = 0; - - -	raw_spin_lock_irqsave(&sem->wait_lock, flags); - -	if (sem->count >= 0 && list_empty(&sem->wait_list)) { -		/* granted */ -		sem->count++; -		ret = 1; -	} - -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - -	return ret; -} - -/* - * get a write lock on the semaphore - */ -int __sched __down_write_common(struct rw_semaphore *sem, int state) -{ -	struct rwsem_waiter waiter; -	unsigned long flags; -	int ret = 0; - -	raw_spin_lock_irqsave(&sem->wait_lock, flags); - -	/* set up my own style of waitqueue */ -	waiter.task = current; -	waiter.type = RWSEM_WAITING_FOR_WRITE; -	list_add_tail(&waiter.list, &sem->wait_list); - -	/* wait for someone to release the lock */ -	for (;;) { -		/* -		 * That is the key to support write lock stealing: allows the -		 * task already on CPU to get the lock soon rather than put -		 * itself into sleep and waiting for system woke it or someone -		 * else in the head of the wait list up. 
-		 */ -		if (sem->count == 0) -			break; -		if (signal_pending_state(state, current)) -			goto out_nolock; - -		set_current_state(state); -		raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -		schedule(); -		raw_spin_lock_irqsave(&sem->wait_lock, flags); -	} -	/* got the lock */ -	sem->count = -1; -	list_del(&waiter.list); - -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - -	return ret; - -out_nolock: -	list_del(&waiter.list); -	if (!list_empty(&sem->wait_list) && sem->count >= 0) -		__rwsem_do_wake(sem, 0); -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - -	return -EINTR; -} - -void __sched __down_write(struct rw_semaphore *sem) -{ -	__down_write_common(sem, TASK_UNINTERRUPTIBLE); -} - -int __sched __down_write_killable(struct rw_semaphore *sem) -{ -	return __down_write_common(sem, TASK_KILLABLE); -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -int __down_write_trylock(struct rw_semaphore *sem) -{ -	unsigned long flags; -	int ret = 0; - -	raw_spin_lock_irqsave(&sem->wait_lock, flags); - -	if (sem->count == 0) { -		/* got the lock */ -		sem->count = -1; -		ret = 1; -	} - -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); - -	return ret; -} - -/* - * release a read lock on the semaphore - */ -void __up_read(struct rw_semaphore *sem) -{ -	unsigned long flags; - -	raw_spin_lock_irqsave(&sem->wait_lock, flags); - -	if (--sem->count == 0 && !list_empty(&sem->wait_list)) -		sem = __rwsem_wake_one_writer(sem); - -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * release a write lock on the semaphore - */ -void __up_write(struct rw_semaphore *sem) -{ -	unsigned long flags; - -	raw_spin_lock_irqsave(&sem->wait_lock, flags); - -	sem->count = 0; -	if (!list_empty(&sem->wait_list)) -		sem = __rwsem_do_wake(sem, 1); - -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - -/* - * downgrade a write lock into a read lock - * - just wake up any readers at the front of the queue - */ -void __downgrade_write(struct rw_semaphore *sem) -{ -	unsigned long flags; - -	raw_spin_lock_irqsave(&sem->wait_lock, flags); - -	sem->count = 1; -	if (!list_empty(&sem->wait_list)) -		sem = __rwsem_do_wake(sem, 0); - -	raw_spin_unlock_irqrestore(&sem->wait_lock, flags); -} - diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index fbe96341beee..6b3ee9948bf1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -147,6 +147,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,  			 * will notice the queued writer.  			 */  			wake_q_add(wake_q, waiter->task); +			lockevent_inc(rwsem_wake_writer);  		}  		return; @@ -176,9 +177,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,  			goto try_reader_grant;  		}  		/* -		 * It is not really necessary to set it to reader-owned here, -		 * but it gives the spinners an early indication that the -		 * readers now have the lock. +		 * Set it to reader-owned to give spinners an early +		 * indication that readers now have the lock.  		 
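The wakeup path above also batches its wakeups: __rwsem_mark_wake() only queues tasks on a wake_q while sem->wait_lock is held, and the caller issues the real wakeups after dropping the lock (the slowpaths further down do exactly that with DEFINE_WAKE_Q()/wake_up_q()). A minimal sketch of the pattern using the generic wake_q API; the waiter record and function names here are illustrative only, not taken from the patch:

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

struct demo_waiter {				/* illustrative waiter record */
	struct list_head list;
	struct task_struct *task;
};

static void demo_wake_all(raw_spinlock_t *lock, struct list_head *waiters)
{
	DEFINE_WAKE_Q(wake_q);
	struct demo_waiter *w;

	raw_spin_lock(lock);
	list_for_each_entry(w, waiters, list)
		wake_q_add(&wake_q, w->task);	/* queue only, no wakeup yet */
	raw_spin_unlock(lock);

	wake_up_q(&wake_q);		/* wakeups happen without the lock held */
}

Deferring wake_up_process() until after the spinlock is dropped keeps the hold time short and avoids waking tasks straight into contention on the lock they were just queued under.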
*/  		__rwsem_set_reader_owned(sem, waiter->task);  	} @@ -215,6 +215,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,  	}  	adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; +	lockevent_cond_inc(rwsem_wake_reader, woken);  	if (list_empty(&sem->wait_list)) {  		/* hit end of list above */  		adjustment -= RWSEM_WAITING_BIAS; @@ -225,92 +226,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,  }  /* - * Wait for the read lock to be granted - */ -static inline struct rw_semaphore __sched * -__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) -{ -	long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; -	struct rwsem_waiter waiter; -	DEFINE_WAKE_Q(wake_q); - -	waiter.task = current; -	waiter.type = RWSEM_WAITING_FOR_READ; - -	raw_spin_lock_irq(&sem->wait_lock); -	if (list_empty(&sem->wait_list)) { -		/* -		 * In case the wait queue is empty and the lock isn't owned -		 * by a writer, this reader can exit the slowpath and return -		 * immediately as its RWSEM_ACTIVE_READ_BIAS has already -		 * been set in the count. -		 */ -		if (atomic_long_read(&sem->count) >= 0) { -			raw_spin_unlock_irq(&sem->wait_lock); -			return sem; -		} -		adjustment += RWSEM_WAITING_BIAS; -	} -	list_add_tail(&waiter.list, &sem->wait_list); - -	/* we're now waiting on the lock, but no longer actively locking */ -	count = atomic_long_add_return(adjustment, &sem->count); - -	/* -	 * If there are no active locks, wake the front queued process(es). -	 * -	 * If there are no writers and we are first in the queue, -	 * wake our own waiter to join the existing active readers ! -	 */ -	if (count == RWSEM_WAITING_BIAS || -	    (count > RWSEM_WAITING_BIAS && -	     adjustment != -RWSEM_ACTIVE_READ_BIAS)) -		__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); - -	raw_spin_unlock_irq(&sem->wait_lock); -	wake_up_q(&wake_q); - -	/* wait to be given the lock */ -	while (true) { -		set_current_state(state); -		if (!waiter.task) -			break; -		if (signal_pending_state(state, current)) { -			raw_spin_lock_irq(&sem->wait_lock); -			if (waiter.task) -				goto out_nolock; -			raw_spin_unlock_irq(&sem->wait_lock); -			break; -		} -		schedule(); -	} - -	__set_current_state(TASK_RUNNING); -	return sem; -out_nolock: -	list_del(&waiter.list); -	if (list_empty(&sem->wait_list)) -		atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); -	raw_spin_unlock_irq(&sem->wait_lock); -	__set_current_state(TASK_RUNNING); -	return ERR_PTR(-EINTR); -} - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed(struct rw_semaphore *sem) -{ -	return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed); - -__visible struct rw_semaphore * __sched -rwsem_down_read_failed_killable(struct rw_semaphore *sem) -{ -	return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); -} -EXPORT_SYMBOL(rwsem_down_read_failed_killable); - -/*   * This function must be called with the sem->wait_lock held to prevent   * race conditions between checking the rwsem wait list and setting the   * sem->count accordingly. 
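The lockevent_inc()/lockevent_cond_inc() calls added throughout this file come from the new lock_events.h header (pulled in via rwsem.h further down in the patch); the header itself is not part of this diff. A plausible reading, with the per-CPU layout treated as an assumption rather than a quote of the real header, is a flat per-CPU counter array indexed by event id that compiles away when CONFIG_LOCK_EVENT_COUNTS is off:

#ifdef CONFIG_LOCK_EVENT_COUNTS
/* assumed layout: one unsigned long per event id, per CPU */
DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);

#define lockevent_inc(ev)	this_cpu_inc(lockevents[LOCKEVENT_ ## ev])
#define lockevent_cond_inc(ev, c)				\
	do {							\
		if (c)						\
			lockevent_inc(ev);			\
	} while (0)
#else  /* !CONFIG_LOCK_EVENT_COUNTS */
#define lockevent_inc(ev)
#define lockevent_cond_inc(ev, c)
#endif /* CONFIG_LOCK_EVENT_COUNTS */

On that reading, the EVENT_COUNT(ev) helper seen in the qspinlock statistics conversion earlier in this section would simply name the per-CPU slot, lockevents[LOCKEVENT_ ## ev], which is the argument this_cpu_add() expects.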
@@ -346,21 +261,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)   */  static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)  { -	long old, count = atomic_long_read(&sem->count); - -	while (true) { -		if (!(count == 0 || count == RWSEM_WAITING_BIAS)) -			return false; +	long count = atomic_long_read(&sem->count); -		old = atomic_long_cmpxchg_acquire(&sem->count, count, -				      count + RWSEM_ACTIVE_WRITE_BIAS); -		if (old == count) { +	while (!count || count == RWSEM_WAITING_BIAS) { +		if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, +					count + RWSEM_ACTIVE_WRITE_BIAS)) {  			rwsem_set_owner(sem); +			lockevent_inc(rwsem_opt_wlock);  			return true;  		} - -		count = old;  	} +	return false;  }  static inline bool owner_on_cpu(struct task_struct *owner) @@ -481,6 +392,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)  	osq_unlock(&sem->osq);  done:  	preempt_enable(); +	lockevent_cond_inc(rwsem_opt_fail, !taken);  	return taken;  } @@ -505,6 +417,97 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem)  #endif  /* + * Wait for the read lock to be granted + */ +static inline struct rw_semaphore __sched * +__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) +{ +	long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; +	struct rwsem_waiter waiter; +	DEFINE_WAKE_Q(wake_q); + +	waiter.task = current; +	waiter.type = RWSEM_WAITING_FOR_READ; + +	raw_spin_lock_irq(&sem->wait_lock); +	if (list_empty(&sem->wait_list)) { +		/* +		 * In case the wait queue is empty and the lock isn't owned +		 * by a writer, this reader can exit the slowpath and return +		 * immediately as its RWSEM_ACTIVE_READ_BIAS has already +		 * been set in the count. +		 */ +		if (atomic_long_read(&sem->count) >= 0) { +			raw_spin_unlock_irq(&sem->wait_lock); +			rwsem_set_reader_owned(sem); +			lockevent_inc(rwsem_rlock_fast); +			return sem; +		} +		adjustment += RWSEM_WAITING_BIAS; +	} +	list_add_tail(&waiter.list, &sem->wait_list); + +	/* we're now waiting on the lock, but no longer actively locking */ +	count = atomic_long_add_return(adjustment, &sem->count); + +	/* +	 * If there are no active locks, wake the front queued process(es). +	 * +	 * If there are no writers and we are first in the queue, +	 * wake our own waiter to join the existing active readers ! 
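The rwsem_try_write_lock_unqueued() rewrite earlier in this hunk is the standard cmpxchg-to-try_cmpxchg conversion: on failure, atomic_long_try_cmpxchg_acquire() writes the value it actually found back through its second argument, so the manual reload and the extra "old" variable disappear while the behaviour stays the same. A generic before/after sketch (the predicate and bias are illustrative, not the rwsem code):

#include <linux/atomic.h>

#define DEMO_BIAS	1L			/* illustrative */

static bool can_take(long count)		/* illustrative predicate */
{
	return count == 0;
}

/* before: classic cmpxchg loop, reloading the old value by hand */
static bool demo_acquire_cmpxchg(atomic_long_t *v)
{
	long old, count = atomic_long_read(v);

	for (;;) {
		if (!can_take(count))
			return false;
		old = atomic_long_cmpxchg_acquire(v, count, count + DEMO_BIAS);
		if (old == count)
			return true;
		count = old;		/* lost the race, retry with it */
	}
}

/* after: a failed try_cmpxchg refreshes "count" in place */
static bool demo_acquire_try_cmpxchg(atomic_long_t *v)
{
	long count = atomic_long_read(v);

	while (can_take(count)) {
		if (atomic_long_try_cmpxchg_acquire(v, &count,
						    count + DEMO_BIAS))
			return true;
	}
	return false;
}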
+	 */ +	if (count == RWSEM_WAITING_BIAS || +	    (count > RWSEM_WAITING_BIAS && +	     adjustment != -RWSEM_ACTIVE_READ_BIAS)) +		__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); + +	raw_spin_unlock_irq(&sem->wait_lock); +	wake_up_q(&wake_q); + +	/* wait to be given the lock */ +	while (true) { +		set_current_state(state); +		if (!waiter.task) +			break; +		if (signal_pending_state(state, current)) { +			raw_spin_lock_irq(&sem->wait_lock); +			if (waiter.task) +				goto out_nolock; +			raw_spin_unlock_irq(&sem->wait_lock); +			break; +		} +		schedule(); +		lockevent_inc(rwsem_sleep_reader); +	} + +	__set_current_state(TASK_RUNNING); +	lockevent_inc(rwsem_rlock); +	return sem; +out_nolock: +	list_del(&waiter.list); +	if (list_empty(&sem->wait_list)) +		atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); +	raw_spin_unlock_irq(&sem->wait_lock); +	__set_current_state(TASK_RUNNING); +	lockevent_inc(rwsem_rlock_fail); +	return ERR_PTR(-EINTR); +} + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed(struct rw_semaphore *sem) +{ +	return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed); + +__visible struct rw_semaphore * __sched +rwsem_down_read_failed_killable(struct rw_semaphore *sem) +{ +	return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); +} +EXPORT_SYMBOL(rwsem_down_read_failed_killable); + +/*   * Wait until we successfully acquire the write lock   */  static inline struct rw_semaphore * @@ -580,6 +583,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)  				goto out_nolock;  			schedule(); +			lockevent_inc(rwsem_sleep_writer);  			set_current_state(state);  		} while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); @@ -588,6 +592,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)  	__set_current_state(TASK_RUNNING);  	list_del(&waiter.list);  	raw_spin_unlock_irq(&sem->wait_lock); +	lockevent_inc(rwsem_wlock);  	return ret; @@ -601,6 +606,7 @@ out_nolock:  		__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);  	raw_spin_unlock_irq(&sem->wait_lock);  	wake_up_q(&wake_q); +	lockevent_inc(rwsem_wlock_fail);  	return ERR_PTR(-EINTR);  } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e586f0d03ad3..ccbf18f560ff 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -24,7 +24,6 @@ void __sched down_read(struct rw_semaphore *sem)  	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);  	LOCK_CONTENDED(sem, __down_read_trylock, __down_read); -	rwsem_set_reader_owned(sem);  }  EXPORT_SYMBOL(down_read); @@ -39,7 +38,6 @@ int __sched down_read_killable(struct rw_semaphore *sem)  		return -EINTR;  	} -	rwsem_set_reader_owned(sem);  	return 0;  } @@ -52,10 +50,8 @@ int down_read_trylock(struct rw_semaphore *sem)  {  	int ret = __down_read_trylock(sem); -	if (ret == 1) { +	if (ret == 1)  		rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); -		rwsem_set_reader_owned(sem); -	}  	return ret;  } @@ -70,7 +66,6 @@ void __sched down_write(struct rw_semaphore *sem)  	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);  	LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -	rwsem_set_owner(sem);  }  EXPORT_SYMBOL(down_write); @@ -88,7 +83,6 @@ int __sched down_write_killable(struct rw_semaphore *sem)  		return -EINTR;  	} -	rwsem_set_owner(sem);  	return 0;  } @@ -101,10 +95,8 @@ int down_write_trylock(struct rw_semaphore *sem)  {  	int ret = __down_write_trylock(sem); -	if (ret == 1) { +	if (ret == 1)  		rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); -		
rwsem_set_owner(sem); -	}  	return ret;  } @@ -117,9 +109,7 @@ EXPORT_SYMBOL(down_write_trylock);  void up_read(struct rw_semaphore *sem)  {  	rwsem_release(&sem->dep_map, 1, _RET_IP_); -	DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); -	rwsem_clear_reader_owned(sem);  	__up_read(sem);  } @@ -131,9 +121,7 @@ EXPORT_SYMBOL(up_read);  void up_write(struct rw_semaphore *sem)  {  	rwsem_release(&sem->dep_map, 1, _RET_IP_); -	DEBUG_RWSEMS_WARN_ON(sem->owner != current); -	rwsem_clear_owner(sem);  	__up_write(sem);  } @@ -145,9 +133,7 @@ EXPORT_SYMBOL(up_write);  void downgrade_write(struct rw_semaphore *sem)  {  	lock_downgrade(&sem->dep_map, _RET_IP_); -	DEBUG_RWSEMS_WARN_ON(sem->owner != current); -	rwsem_set_reader_owned(sem);  	__downgrade_write(sem);  } @@ -161,7 +147,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)  	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);  	LOCK_CONTENDED(sem, __down_read_trylock, __down_read); -	rwsem_set_reader_owned(sem);  }  EXPORT_SYMBOL(down_read_nested); @@ -172,7 +157,6 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)  	rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);  	LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -	rwsem_set_owner(sem);  }  EXPORT_SYMBOL(_down_write_nest_lock); @@ -193,7 +177,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)  	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);  	LOCK_CONTENDED(sem, __down_write_trylock, __down_write); -	rwsem_set_owner(sem);  }  EXPORT_SYMBOL(down_write_nested); @@ -208,7 +191,6 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)  		return -EINTR;  	} -	rwsem_set_owner(sem);  	return 0;  } @@ -216,7 +198,8 @@ EXPORT_SYMBOL(down_write_killable_nested);  void up_read_non_owner(struct rw_semaphore *sem)  { -	DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); +	DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), +				sem);  	__up_read(sem);  } diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index bad2bca0268b..64877f5294e3 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -23,15 +23,44 @@   * is involved. Ideally we would like to track all the readers that own   * a rwsem, but the overhead is simply too big.   */ +#include "lock_events.h" +  #define RWSEM_READER_OWNED	(1UL << 0)  #define RWSEM_ANONYMOUSLY_OWNED	(1UL << 1)  #ifdef CONFIG_DEBUG_RWSEMS -# define DEBUG_RWSEMS_WARN_ON(c)	DEBUG_LOCKS_WARN_ON(c) +# define DEBUG_RWSEMS_WARN_ON(c, sem)	do {			\ +	if (!debug_locks_silent &&				\ +	    WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ +		#c, atomic_long_read(&(sem)->count),		\ +		(long)((sem)->owner), (long)current,		\ +		list_empty(&(sem)->wait_list) ? "" : "not "))	\ +			debug_locks_off();			\ +	} while (0) +#else +# define DEBUG_RWSEMS_WARN_ON(c, sem) +#endif + +/* + * R/W semaphores originally for PPC using the stuff in lib/rwsem.c. + * Adapted largely from include/asm-i386/rwsem.h + * by Paul Mackerras <paulus@samba.org>. 
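The reworked DEBUG_RWSEMS_WARN_ON() above takes the semaphore as a second argument so that a tripped assertion can report the lock's count, its owner, the current task and whether the wait list is empty, and then call debug_locks_off() to suppress follow-on lockdep noise. Callers just pass the rwsem next to the condition, as the fastpath helpers added below do; for instance (mirroring a check that appears later in this patch):

static inline void demo_assert_write_owned(struct rw_semaphore *sem)
{
	/* same check the new __up_write()/__downgrade_write() perform */
	DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
}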
+ */ + +/* + * the semaphore definition + */ +#ifdef CONFIG_64BIT +# define RWSEM_ACTIVE_MASK		0xffffffffL  #else -# define DEBUG_RWSEMS_WARN_ON(c) +# define RWSEM_ACTIVE_MASK		0x0000ffffL  #endif +#define RWSEM_ACTIVE_BIAS		0x00000001L +#define RWSEM_WAITING_BIAS		(-RWSEM_ACTIVE_MASK-1) +#define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS +#define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) +  #ifdef CONFIG_RWSEM_SPIN_ON_OWNER  /*   * All writes to owner are protected by WRITE_ONCE() to make sure that @@ -132,3 +161,144 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)  {  }  #endif + +extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); +extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); + +/* + * lock for reading + */ +static inline void __down_read(struct rw_semaphore *sem) +{ +	if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { +		rwsem_down_read_failed(sem); +		DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & +					RWSEM_READER_OWNED), sem); +	} else { +		rwsem_set_reader_owned(sem); +	} +} + +static inline int __down_read_killable(struct rw_semaphore *sem) +{ +	if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { +		if (IS_ERR(rwsem_down_read_failed_killable(sem))) +			return -EINTR; +		DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & +					RWSEM_READER_OWNED), sem); +	} else { +		rwsem_set_reader_owned(sem); +	} +	return 0; +} + +static inline int __down_read_trylock(struct rw_semaphore *sem) +{ +	/* +	 * Optimize for the case when the rwsem is not locked at all. 
+	 */ +	long tmp = RWSEM_UNLOCKED_VALUE; + +	lockevent_inc(rwsem_rtrylock); +	do { +		if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, +					tmp + RWSEM_ACTIVE_READ_BIAS)) { +			rwsem_set_reader_owned(sem); +			return 1; +		} +	} while (tmp >= 0); +	return 0; +} + +/* + * lock for writing + */ +static inline void __down_write(struct rw_semaphore *sem) +{ +	long tmp; + +	tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, +					     &sem->count); +	if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) +		rwsem_down_write_failed(sem); +	rwsem_set_owner(sem); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ +	long tmp; + +	tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, +					     &sem->count); +	if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) +		if (IS_ERR(rwsem_down_write_failed_killable(sem))) +			return -EINTR; +	rwsem_set_owner(sem); +	return 0; +} + +static inline int __down_write_trylock(struct rw_semaphore *sem) +{ +	long tmp; + +	lockevent_inc(rwsem_wtrylock); +	tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, +		      RWSEM_ACTIVE_WRITE_BIAS); +	if (tmp == RWSEM_UNLOCKED_VALUE) { +		rwsem_set_owner(sem); +		return true; +	} +	return false; +} + +/* + * unlock after reading + */ +static inline void __up_read(struct rw_semaphore *sem) +{ +	long tmp; + +	DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), +				sem); +	rwsem_clear_reader_owned(sem); +	tmp = atomic_long_dec_return_release(&sem->count); +	if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) +		rwsem_wake(sem); +} + +/* + * unlock after writing + */ +static inline void __up_write(struct rw_semaphore *sem) +{ +	DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); +	rwsem_clear_owner(sem); +	if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, +						    &sem->count) < 0)) +		rwsem_wake(sem); +} + +/* + * downgrade write lock to read lock + */ +static inline void __downgrade_write(struct rw_semaphore *sem) +{ +	long tmp; + +	/* +	 * When downgrading from exclusive to shared ownership, +	 * anything inside the write-locked region cannot leak +	 * into the read side. In contrast, anything in the +	 * read-locked region is ok to be re-ordered into the +	 * write side. As such, rely on RELEASE semantics. 
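Taken together, the bias constants and the fastpaths above reduce the rwsem state machine to signed arithmetic on sem->count. A few worked 64-bit values (RWSEM_ACTIVE_MASK == 0xffffffff), assuming no concurrent updates:

/*
 * count value             meaning
 * 0x0000000000000000      unlocked (RWSEM_UNLOCKED_VALUE)
 * 0x0000000000000001      one active reader       (+RWSEM_ACTIVE_READ_BIAS)
 * 0x0000000000000003      three active readers
 * 0xffffffff00000000      no active owner, waiters queued   (WAITING_BIAS)
 * 0xffffffff00000001      write-locked        (WAITING_BIAS + ACTIVE_BIAS)
 */

The low 32 bits (RWSEM_ACTIVE_MASK) count the active holders, while WAITING_BIAS drives the count negative whenever a writer or a queued waiter is involved. That is why __down_read() only falls into the slowpath when the incremented count is <= 0, and why __up_read() calls rwsem_wake() exactly when the decremented count is negative with a zero active part. The count alone cannot distinguish every state (WAITING_BIAS + 1 could equally be one active reader plus queued waiters), which is one reason the separate owner word exists.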
+	 */ +	DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); +	tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); +	rwsem_set_reader_owned(sem); +	if (tmp < 0) +		rwsem_downgrade_wake(sem); +} diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 936f3d14dd6b..0ff08380f531 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -22,6 +22,13 @@  #include <linux/debug_locks.h>  #include <linux/export.h> +#ifdef CONFIG_MMIOWB +#ifndef arch_mmiowb_state +DEFINE_PER_CPU(struct mmiowb_state, __mmiowb_state); +EXPORT_PER_CPU_SYMBOL(__mmiowb_state); +#endif +#endif +  /*   * If lockdep is enabled then we use the non-preemption spin-ops   * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 9aa0fccd5d43..399669f7eba8 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -111,6 +111,7 @@ void do_raw_spin_lock(raw_spinlock_t *lock)  {  	debug_spin_lock_before(lock);  	arch_spin_lock(&lock->raw_lock); +	mmiowb_spin_lock();  	debug_spin_lock_after(lock);  } @@ -118,8 +119,10 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)  {  	int ret = arch_spin_trylock(&lock->raw_lock); -	if (ret) +	if (ret) { +		mmiowb_spin_lock();  		debug_spin_lock_after(lock); +	}  #ifndef CONFIG_SMP  	/*  	 * Must not happen on UP: @@ -131,6 +134,7 @@ int do_raw_spin_trylock(raw_spinlock_t *lock)  void do_raw_spin_unlock(raw_spinlock_t *lock)  { +	mmiowb_spin_unlock();  	debug_spin_unlock(lock);  	arch_spin_unlock(&lock->raw_lock);  } diff --git a/kernel/module.c b/kernel/module.c index 0b9aa8ab89f0..a9020bdd4cf6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex);  EXPORT_SYMBOL_GPL(module_mutex);  static LIST_HEAD(modules); +/* Work queue for freeing init sections in success case */ +static struct work_struct init_free_wq; +static struct llist_head init_free_list; +  #ifdef CONFIG_MODULES_TREE_LOOKUP  /* @@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init)  	if (!rodata_enabled)  		return; +	set_vm_flush_reset_perms(mod->core_layout.base); +	set_vm_flush_reset_perms(mod->init_layout.base);  	frob_text(&mod->core_layout, set_memory_ro); +	frob_text(&mod->core_layout, set_memory_x); +  	frob_rodata(&mod->core_layout, set_memory_ro); +  	frob_text(&mod->init_layout, set_memory_ro); +	frob_text(&mod->init_layout, set_memory_x); +  	frob_rodata(&mod->init_layout, set_memory_ro);  	if (after_init) @@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod)  	frob_writable_data(&mod->init_layout, set_memory_nx);  } -static void module_disable_nx(const struct module *mod) -{ -	frob_rodata(&mod->core_layout, set_memory_x); -	frob_ro_after_init(&mod->core_layout, set_memory_x); -	frob_writable_data(&mod->core_layout, set_memory_x); -	frob_rodata(&mod->init_layout, set_memory_x); -	frob_writable_data(&mod->init_layout, set_memory_x); -} -  /* Iterate through all modules and set each module's text as RW */  void set_all_modules_text_rw(void)  { @@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void)  	}  	mutex_unlock(&module_mutex);  } - -static void disable_ro_nx(const struct module_layout *layout) -{ -	if (rodata_enabled) { -		frob_text(layout, set_memory_rw); -		frob_rodata(layout, set_memory_rw); -		frob_ro_after_init(layout, set_memory_rw); -	} -	frob_rodata(layout, set_memory_x); -	frob_ro_after_init(layout, set_memory_x); -	
frob_writable_data(layout, set_memory_x); -} -  #else -static void disable_ro_nx(const struct module_layout *layout) { }  static void module_enable_nx(const struct module *mod) { } -static void module_disable_nx(const struct module *mod) { }  #endif  #ifdef CONFIG_LIVEPATCH @@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod)  void __weak module_memfree(void *module_region)  { +	/* +	 * This memory may be RO, and freeing RO memory in an interrupt is not +	 * supported by vmalloc. +	 */ +	WARN_ON(in_interrupt());  	vfree(module_region);  } @@ -2166,7 +2158,6 @@ static void free_module(struct module *mod)  	mutex_unlock(&module_mutex);  	/* This may be empty, but that's OK */ -	disable_ro_nx(&mod->init_layout);  	module_arch_freeing_init(mod);  	module_memfree(mod->init_layout.base);  	kfree(mod->args); @@ -2176,7 +2167,6 @@ static void free_module(struct module *mod)  	lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);  	/* Finally, free the core (containing the module structure) */ -	disable_ro_nx(&mod->core_layout);  	module_memfree(mod->core_layout.base);  } @@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod)  /* For freeing module_init on success, in case kallsyms traversing */  struct mod_initfree { -	struct rcu_head rcu; +	struct llist_node node;  	void *module_init;  }; -static void do_free_init(struct rcu_head *head) +static void do_free_init(struct work_struct *w)  { -	struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); -	module_memfree(m->module_init); -	kfree(m); +	struct llist_node *pos, *n, *list; +	struct mod_initfree *initfree; + +	list = llist_del_all(&init_free_list); + +	synchronize_rcu(); + +	llist_for_each_safe(pos, n, list) { +		initfree = container_of(pos, struct mod_initfree, node); +		module_memfree(initfree->module_init); +		kfree(initfree); +	}  } +static int __init modules_wq_init(void) +{ +	INIT_WORK(&init_free_wq, do_free_init); +	init_llist_head(&init_free_list); +	return 0; +} +module_init(modules_wq_init); +  /*   * This is where the real work happens.   * @@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod)  #endif  	module_enable_ro(mod, true);  	mod_tree_remove_init(mod); -	disable_ro_nx(&mod->init_layout);  	module_arch_freeing_init(mod);  	mod->init_layout.base = NULL;  	mod->init_layout.size = 0; @@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod)  	 * We want to free module_init, but be aware that kallsyms may be  	 * walking this with preempt disabled.  In all the failure paths, we  	 * call synchronize_rcu(), but we don't want to slow down the success -	 * path, so use actual RCU here. +	 * path. module_memfree() cannot be called in an interrupt, so do the +	 * work and call synchronize_rcu() in a work queue. +	 *  	 * Note that module_alloc() on most architectures creates W+X page  	 * mappings which won't be cleaned up until do_free_init() runs.  
Any  	 * code such as mark_rodata_ro() which depends on those mappings to  	 * be cleaned up needs to sync with the queued work - ie  	 * rcu_barrier()  	 */ -	call_rcu(&freeinit->rcu, do_free_init); +	if (llist_add(&freeinit->node, &init_free_list)) +		schedule_work(&init_free_wq); +  	mutex_unlock(&module_mutex);  	wake_up_all(&module_wq); @@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs,  	module_bug_cleanup(mod);  	mutex_unlock(&module_mutex); -	/* we can't deallocate the module until we clear memory protection */ -	module_disable_ro(mod); -	module_disable_nx(mod); -   ddebug_cleanup:  	ftrace_release_mod(mod);  	dynamic_debug_remove(mod, info->debug); diff --git a/kernel/panic.c b/kernel/panic.c index 0ae0d7332f12..c1fcaad337b7 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -318,12 +318,7 @@ void panic(const char *fmt, ...)  	}  #endif  #if defined(CONFIG_S390) -	{ -		unsigned long caller; - -		caller = (unsigned long)__builtin_return_address(0); -		disabled_wait(caller); -	} +	disabled_wait();  #endif  	pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);  	local_irq_enable(); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index f8fe57d1022e..9bbaaab14b36 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -114,6 +114,15 @@ config PM_SLEEP_SMP  	depends on PM_SLEEP  	select HOTPLUG_CPU +config PM_SLEEP_SMP_NONZERO_CPU +	def_bool y +	depends on PM_SLEEP_SMP +	depends on ARCH_SUSPEND_NONZERO_CPU +	---help--- +	If an arch can suspend (for suspend, hibernate, kexec, etc) on a +	non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This +	will allow nohz_full mask to include CPU0. +  config PM_AUTOSLEEP  	bool "Opportunistic sleep"  	depends on PM_SLEEP diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index abef759de7c8..c8c272df7154 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -14,7 +14,6 @@  #include <linux/export.h>  #include <linux/suspend.h> -#include <linux/syscalls.h>  #include <linux/reboot.h>  #include <linux/string.h>  #include <linux/device.h> @@ -281,7 +280,7 @@ static int create_image(int platform_mode)  	if (error || hibernation_test(TEST_PLATFORM))  		goto Platform_finish; -	error = disable_nonboot_cpus(); +	error = suspend_disable_secondary_cpus();  	if (error || hibernation_test(TEST_CPUS))  		goto Enable_cpus; @@ -323,7 +322,7 @@ static int create_image(int platform_mode)  	local_irq_enable();   Enable_cpus: -	enable_nonboot_cpus(); +	suspend_enable_secondary_cpus();   Platform_finish:  	platform_finish(platform_mode); @@ -417,7 +416,7 @@ int hibernation_snapshot(int platform_mode)  int __weak hibernate_resume_nonboot_cpu_disable(void)  { -	return disable_nonboot_cpus(); +	return suspend_disable_secondary_cpus();  }  /** @@ -486,7 +485,7 @@ static int resume_target_kernel(bool platform_mode)  	local_irq_enable();   Enable_cpus: -	enable_nonboot_cpus(); +	suspend_enable_secondary_cpus();   Cleanup:  	platform_restore_cleanup(platform_mode); @@ -564,7 +563,7 @@ int hibernation_platform_enter(void)  	if (error)  		goto Platform_finish; -	error = disable_nonboot_cpus(); +	error = suspend_disable_secondary_cpus();  	if (error)  		goto Enable_cpus; @@ -586,7 +585,7 @@ int hibernation_platform_enter(void)  	local_irq_enable();   Enable_cpus: -	enable_nonboot_cpus(); +	suspend_enable_secondary_cpus();   Platform_finish:  	hibernation_ops->finish(); @@ -709,9 +708,7 @@ int hibernate(void)  		goto Exit;  	} -	pr_info("Syncing filesystems ... 
\n"); -	ksys_sync(); -	pr_info("done.\n"); +	ksys_sync_helper();  	error = freeze_processes();  	if (error) diff --git a/kernel/power/main.c b/kernel/power/main.c index 98e76cad128b..4f43e724f6eb 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -16,6 +16,7 @@  #include <linux/debugfs.h>  #include <linux/seq_file.h>  #include <linux/suspend.h> +#include <linux/syscalls.h>  #include "power.h" @@ -51,6 +52,19 @@ void unlock_system_sleep(void)  }  EXPORT_SYMBOL_GPL(unlock_system_sleep); +void ksys_sync_helper(void) +{ +	ktime_t start; +	long elapsed_msecs; + +	start = ktime_get(); +	ksys_sync(); +	elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); +	pr_info("Filesystems sync: %ld.%03ld seconds\n", +		elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); +} +EXPORT_SYMBOL_GPL(ksys_sync_helper); +  /* Routines for PM-transition notifications */  static BLOCKING_NOTIFIER_HEAD(pm_chain_head); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index f08a1e4ee1d4..bc9558ab1e5b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src)   * safe_copy_page - Copy a page in a safe way.   *   * Check if the page we are going to copy is marked as present in the kernel - * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set - * and in that case kernel_page_present() always returns 'true'). + * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or + * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present() + * always returns 'true'.   */  static void safe_copy_page(void *dst, struct page *s_page)  { diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 0bd595a0b610..ef908c134b34 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -17,7 +17,6 @@  #include <linux/console.h>  #include <linux/cpu.h>  #include <linux/cpuidle.h> -#include <linux/syscalls.h>  #include <linux/gfp.h>  #include <linux/io.h>  #include <linux/kernel.h> @@ -428,7 +427,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  	if (suspend_test(TEST_PLATFORM))  		goto Platform_wake; -	error = disable_nonboot_cpus(); +	error = suspend_disable_secondary_cpus();  	if (error || suspend_test(TEST_CPUS))  		goto Enable_cpus; @@ -458,7 +457,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  	BUG_ON(irqs_disabled());   Enable_cpus: -	enable_nonboot_cpus(); +	suspend_enable_secondary_cpus();   Platform_wake:  	platform_resume_noirq(state); @@ -568,13 +567,11 @@ static int enter_state(suspend_state_t state)  	if (state == PM_SUSPEND_TO_IDLE)  		s2idle_begin(); -#ifndef CONFIG_SUSPEND_SKIP_SYNC -	trace_suspend_resume(TPS("sync_filesystems"), 0, true); -	pr_info("Syncing filesystems ... 
"); -	ksys_sync(); -	pr_cont("done.\n"); -	trace_suspend_resume(TPS("sync_filesystems"), 0, false); -#endif +	if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) { +		trace_suspend_resume(TPS("sync_filesystems"), 0, true); +		ksys_sync_helper(); +		trace_suspend_resume(TPS("sync_filesystems"), 0, false); +	}  	pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);  	pm_suspend_clear_flags(); diff --git a/kernel/power/user.c b/kernel/power/user.c index 2d8b60a3c86b..cb24e840a3e6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -10,7 +10,6 @@   */  #include <linux/suspend.h> -#include <linux/syscalls.h>  #include <linux/reboot.h>  #include <linux/string.h>  #include <linux/device.h> @@ -228,9 +227,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,  		if (data->frozen)  			break; -		printk("Syncing filesystems ... "); -		ksys_sync(); -		printk("done.\n"); +		ksys_sync_helper();  		error = freeze_processes();  		if (error) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index acee72c0b24b..4b58c907b4b7 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -233,6 +233,7 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)  #ifdef CONFIG_RCU_STALL_COMMON  extern int rcu_cpu_stall_suppress; +extern int rcu_cpu_stall_timeout;  int rcu_jiffies_till_stall_check(void);  #define rcu_ftrace_dump_stall_suppress() \ diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index c29761152874..7a6890b23c5f 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -494,6 +494,10 @@ rcu_perf_cleanup(void)  	if (torture_cleanup_begin())  		return; +	if (!cur_ops) { +		torture_cleanup_end(); +		return; +	}  	if (reader_tasks) {  		for (i = 0; i < nrealreaders; i++) @@ -614,6 +618,7 @@ rcu_perf_init(void)  		pr_cont("\n");  		WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST));  		firsterr = -EINVAL; +		cur_ops = NULL;  		goto unwind;  	}  	if (cur_ops->init) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f14d1b18a74f..efaa5b3f4d3f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -299,7 +299,6 @@ struct rcu_torture_ops {  	int irq_capable;  	int can_boost;  	int extendables; -	int ext_irq_conflict;  	const char *name;  }; @@ -592,12 +591,7 @@ static void srcu_torture_init(void)  static void srcu_torture_cleanup(void)  { -	static DEFINE_TORTURE_RANDOM(rand); - -	if (torture_random(&rand) & 0x800) -		cleanup_srcu_struct(&srcu_ctld); -	else -		cleanup_srcu_struct_quiesced(&srcu_ctld); +	cleanup_srcu_struct(&srcu_ctld);  	srcu_ctlp = &srcu_ctl; /* In case of a later rcutorture run. */  } @@ -1160,7 +1154,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)  	unsigned long randmask2 = randmask1 >> 3;  	WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); -	/* Most of the time lots of bits, half the time only one bit. */ +	/* Mostly only one bit (need preemption!), sometimes lots of bits. */  	if (!(randmask1 & 0x7))  		mask = mask & randmask2;  	else @@ -1170,10 +1164,6 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)  	    ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) ||  	     (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH))))  		mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; -	if ((mask & RCUTORTURE_RDR_IRQ) && -	    !(mask & cur_ops->ext_irq_conflict) && -	    (oldmask & cur_ops->ext_irq_conflict)) -		mask |= cur_ops->ext_irq_conflict; /* Or if readers object. 
*/  	return mask ?: RCUTORTURE_RDR_RCU;  } @@ -1848,7 +1838,7 @@ static int rcutorture_oom_notify(struct notifier_block *self,  	WARN(1, "%s invoked upon OOM during forward-progress testing.\n",  	     __func__);  	rcu_torture_fwd_cb_hist(); -	rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat) / 2)); +	rcu_fwd_progress_check(1 + (jiffies - READ_ONCE(rcu_fwd_startat)) / 2);  	WRITE_ONCE(rcu_fwd_emergency_stop, true);  	smp_mb(); /* Emergency stop before free and wait to avoid hangs. */  	pr_info("%s: Freed %lu RCU callbacks.\n", @@ -2094,6 +2084,10 @@ rcu_torture_cleanup(void)  			cur_ops->cb_barrier();  		return;  	} +	if (!cur_ops) { +		torture_cleanup_end(); +		return; +	}  	rcu_torture_barrier_cleanup();  	torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); @@ -2267,6 +2261,7 @@ rcu_torture_init(void)  		pr_cont("\n");  		WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST));  		firsterr = -EINVAL; +		cur_ops = NULL;  		goto unwind;  	}  	if (cur_ops->fqs == NULL && fqs_duration != 0) { diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 5d4a39a6505a..44d6606b8325 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -76,19 +76,16 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);   * Must invoke this after you are finished using a given srcu_struct that   * was initialized via init_srcu_struct(), else you leak memory.   */ -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) +void cleanup_srcu_struct(struct srcu_struct *ssp)  {  	WARN_ON(ssp->srcu_lock_nesting[0] || ssp->srcu_lock_nesting[1]); -	if (quiesced) -		WARN_ON(work_pending(&ssp->srcu_work)); -	else -		flush_work(&ssp->srcu_work); +	flush_work(&ssp->srcu_work);  	WARN_ON(ssp->srcu_gp_running);  	WARN_ON(ssp->srcu_gp_waiting);  	WARN_ON(ssp->srcu_cb_head);  	WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);  } -EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct);  /*   * Removes the count for the old reader from the appropriate element of diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index a60b8ba9e1ac..9b761e546de8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -360,8 +360,14 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)  	return SRCU_INTERVAL;  } -/* Helper for cleanup_srcu_struct() and cleanup_srcu_struct_quiesced(). */ -void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced) +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @ssp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *ssp)  {  	int cpu; @@ -369,24 +375,14 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)  		return; /* Just leak it! */  	if (WARN_ON(srcu_readers_active(ssp)))  		return; /* Just leak it! */ -	if (quiesced) { -		if (WARN_ON(delayed_work_pending(&ssp->work))) -			return; /* Just leak it! */ -	} else { -		flush_delayed_work(&ssp->work); -	} +	flush_delayed_work(&ssp->work);  	for_each_possible_cpu(cpu) {  		struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); -		if (quiesced) { -			if (WARN_ON(timer_pending(&sdp->delay_work))) -				return; /* Just leak it! */ -			if (WARN_ON(work_pending(&sdp->work))) -				return; /* Just leak it! 
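With the quiesced variant gone, Tiny and Tree SRCU converge on a single teardown path: cleanup_srcu_struct() now flushes SRCU's own deferred work itself and warns if the caller still has callbacks queued, rather than offering a separate "already quiesced" entry point. The usual lifecycle for a dynamically initialized srcu_struct then looks like the sketch below (names are illustrative; the srcu_barrier() call matters only if call_srcu() was used):

#include <linux/srcu.h>

static struct srcu_struct demo_srcu;		/* illustrative */

static int demo_init(void)
{
	return init_srcu_struct(&demo_srcu);
}

static void demo_exit(void)
{
	/* wait for any call_srcu() callbacks still in flight */
	srcu_barrier(&demo_srcu);
	/* single teardown path: flushes SRCU's own workqueue items too */
	cleanup_srcu_struct(&demo_srcu);
}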
*/ -		} else { -			del_timer_sync(&sdp->delay_work); -			flush_work(&sdp->work); -		} +		del_timer_sync(&sdp->delay_work); +		flush_work(&sdp->work); +		if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist))) +			return; /* Forgot srcu_barrier(), so just leak it! */  	}  	if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||  	    WARN_ON(srcu_readers_active(ssp))) { @@ -397,7 +393,7 @@ void _cleanup_srcu_struct(struct srcu_struct *ssp, bool quiesced)  	free_percpu(ssp->sda);  	ssp->sda = NULL;  } -EXPORT_SYMBOL_GPL(_cleanup_srcu_struct); +EXPORT_SYMBOL_GPL(cleanup_srcu_struct);  /*   * Counts the new reader in the appropriate per-CPU element of the diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 911bd9076d43..477b4eb44af5 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -52,7 +52,7 @@ void rcu_qs(void)  	local_irq_save(flags);  	if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) {  		rcu_ctrlblk.donetail = rcu_ctrlblk.curtail; -		raise_softirq(RCU_SOFTIRQ); +		raise_softirq_irqoff(RCU_SOFTIRQ);  	}  	local_irq_restore(flags);  } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acd6ccf56faf..b4d88a594785 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -102,11 +102,6 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;  /* Number of rcu_nodes at specified level. */  int num_rcu_lvl[] = NUM_RCU_LVL_INIT;  int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ -/* panic() on RCU Stall sysctl. */ -int sysctl_panic_on_rcu_stall __read_mostly; -/* Commandeer a sysrq key to dump RCU's tree. */ -static bool sysrq_rcu; -module_param(sysrq_rcu, bool, 0444);  /*   * The rcu_scheduler_active variable is initialized to the value @@ -149,7 +144,7 @@ static void sync_sched_exp_online_cleanup(int cpu);  /* rcuc/rcub kthread realtime priority */  static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0; -module_param(kthread_prio, int, 0644); +module_param(kthread_prio, int, 0444);  /* Delay in jiffies for grace-period initialization delays, debug only. */ @@ -406,7 +401,7 @@ static bool rcu_kick_kthreads;   */  static ulong jiffies_till_sched_qs = ULONG_MAX;  module_param(jiffies_till_sched_qs, ulong, 0444); -static ulong jiffies_to_sched_qs; /* Adjusted version of above if not default */ +static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */  module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */  /* @@ -424,6 +419,7 @@ static void adjust_jiffies_till_sched_qs(void)  		WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);  		return;  	} +	/* Otherwise, set to third fqs scan, but bound below on large system. */  	j = READ_ONCE(jiffies_till_first_fqs) +  		      2 * READ_ONCE(jiffies_till_next_fqs);  	if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV) @@ -513,74 +509,6 @@ static const char *gp_state_getname(short gs)  }  /* - * Show the state of the grace-period kthreads. - */ -void show_rcu_gp_kthreads(void) -{ -	int cpu; -	unsigned long j; -	unsigned long ja; -	unsigned long jr; -	unsigned long jw; -	struct rcu_data *rdp; -	struct rcu_node *rnp; - -	j = jiffies; -	ja = j - READ_ONCE(rcu_state.gp_activity); -	jr = j - READ_ONCE(rcu_state.gp_req_activity); -	jw = j - READ_ONCE(rcu_state.gp_wake_time); -	pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", -		rcu_state.name, gp_state_getname(rcu_state.gp_state), -		rcu_state.gp_state, -		rcu_state.gp_kthread ? 
rcu_state.gp_kthread->state : 0x1ffffL, -		ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), -		(long)READ_ONCE(rcu_state.gp_seq), -		(long)READ_ONCE(rcu_get_root()->gp_seq_needed), -		READ_ONCE(rcu_state.gp_flags)); -	rcu_for_each_node_breadth_first(rnp) { -		if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) -			continue; -		pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", -			rnp->grplo, rnp->grphi, (long)rnp->gp_seq, -			(long)rnp->gp_seq_needed); -		if (!rcu_is_leaf_node(rnp)) -			continue; -		for_each_leaf_node_possible_cpu(rnp, cpu) { -			rdp = per_cpu_ptr(&rcu_data, cpu); -			if (rdp->gpwrap || -			    ULONG_CMP_GE(rcu_state.gp_seq, -					 rdp->gp_seq_needed)) -				continue; -			pr_info("\tcpu %d ->gp_seq_needed %ld\n", -				cpu, (long)rdp->gp_seq_needed); -		} -	} -	/* sched_show_task(rcu_state.gp_kthread); */ -} -EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); - -/* Dump grace-period-request information due to commandeered sysrq. */ -static void sysrq_show_rcu(int key) -{ -	show_rcu_gp_kthreads(); -} - -static struct sysrq_key_op sysrq_rcudump_op = { -	.handler = sysrq_show_rcu, -	.help_msg = "show-rcu(y)", -	.action_msg = "Show RCU tree", -	.enable_mask = SYSRQ_ENABLE_DUMP, -}; - -static int __init rcu_sysrq_init(void) -{ -	if (sysrq_rcu) -		return register_sysrq_key('y', &sysrq_rcudump_op); -	return 0; -} -early_initcall(rcu_sysrq_init); - -/*   * Send along grace-period-related data for rcutorture diagnostics.   */  void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, @@ -1034,27 +962,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)  }  /* - * Handler for the irq_work request posted when a grace period has - * gone on for too long, but not yet long enough for an RCU CPU - * stall warning.  Set state appropriately, but just complain if - * there is unexpected state on entry. - */ -static void rcu_iw_handler(struct irq_work *iwp) -{ -	struct rcu_data *rdp; -	struct rcu_node *rnp; - -	rdp = container_of(iwp, struct rcu_data, rcu_iw); -	rnp = rdp->mynode; -	raw_spin_lock_rcu_node(rnp); -	if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { -		rdp->rcu_iw_gp_seq = rnp->gp_seq; -		rdp->rcu_iw_pending = false; -	} -	raw_spin_unlock_rcu_node(rnp); -} - -/*   * Return true if the specified CPU has passed through a quiescent   * state by virtue of being in or having passed through an dynticks   * idle state since the last call to dyntick_save_progress_counter() @@ -1167,295 +1074,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)  	return 0;  } -static void record_gp_stall_check_time(void) -{ -	unsigned long j = jiffies; -	unsigned long j1; - -	rcu_state.gp_start = j; -	j1 = rcu_jiffies_till_stall_check(); -	/* Record ->gp_start before ->jiffies_stall. */ -	smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ -	rcu_state.jiffies_resched = j + j1 / 2; -	rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); -} - -/* - * Complain about starvation of grace-period kthread. - */ -static void rcu_check_gp_kthread_starvation(void) -{ -	struct task_struct *gpk = rcu_state.gp_kthread; -	unsigned long j; - -	j = jiffies - READ_ONCE(rcu_state.gp_activity); -	if (j > 2 * HZ) { -		pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", -		       rcu_state.name, j, -		       (long)rcu_seq_current(&rcu_state.gp_seq), -		       READ_ONCE(rcu_state.gp_flags), -		       gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, -		       gpk ? gpk->state : ~0, gpk ? 
task_cpu(gpk) : -1); -		if (gpk) { -			pr_err("RCU grace-period kthread stack dump:\n"); -			sched_show_task(gpk); -			wake_up_process(gpk); -		} -	} -} - -/* - * Dump stacks of all tasks running on stalled CPUs.  First try using - * NMIs, but fall back to manual remote stack tracing on architectures - * that don't support NMI-based stack dumps.  The NMI-triggered stack - * traces are more accurate because they are printed by the target CPU. - */ -static void rcu_dump_cpu_stacks(void) -{ -	int cpu; -	unsigned long flags; -	struct rcu_node *rnp; - -	rcu_for_each_leaf_node(rnp) { -		raw_spin_lock_irqsave_rcu_node(rnp, flags); -		for_each_leaf_node_possible_cpu(rnp, cpu) -			if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) -				if (!trigger_single_cpu_backtrace(cpu)) -					dump_cpu_task(cpu); -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -	} -} - -/* - * If too much time has passed in the current grace period, and if - * so configured, go kick the relevant kthreads. - */ -static void rcu_stall_kick_kthreads(void) -{ -	unsigned long j; - -	if (!rcu_kick_kthreads) -		return; -	j = READ_ONCE(rcu_state.jiffies_kick_kthreads); -	if (time_after(jiffies, j) && rcu_state.gp_kthread && -	    (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { -		WARN_ONCE(1, "Kicking %s grace-period kthread\n", -			  rcu_state.name); -		rcu_ftrace_dump(DUMP_ALL); -		wake_up_process(rcu_state.gp_kthread); -		WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); -	} -} - -static void panic_on_rcu_stall(void) -{ -	if (sysctl_panic_on_rcu_stall) -		panic("RCU Stall\n"); -} - -static void print_other_cpu_stall(unsigned long gp_seq) -{ -	int cpu; -	unsigned long flags; -	unsigned long gpa; -	unsigned long j; -	int ndetected = 0; -	struct rcu_node *rnp = rcu_get_root(); -	long totqlen = 0; - -	/* Kick and suppress, if so configured. */ -	rcu_stall_kick_kthreads(); -	if (rcu_cpu_stall_suppress) -		return; - -	/* -	 * OK, time to rat on our buddy... -	 * See Documentation/RCU/stallwarn.txt for info on how to debug -	 * RCU CPU stall warnings. -	 */ -	pr_err("INFO: %s detected stalls on CPUs/tasks:", rcu_state.name); -	print_cpu_stall_info_begin(); -	rcu_for_each_leaf_node(rnp) { -		raw_spin_lock_irqsave_rcu_node(rnp, flags); -		ndetected += rcu_print_task_stall(rnp); -		if (rnp->qsmask != 0) { -			for_each_leaf_node_possible_cpu(rnp, cpu) -				if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { -					print_cpu_stall_info(cpu); -					ndetected++; -				} -		} -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -	} - -	print_cpu_stall_info_end(); -	for_each_possible_cpu(cpu) -		totqlen += rcu_get_n_cbs_cpu(cpu); -	pr_cont("(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", -	       smp_processor_id(), (long)(jiffies - rcu_state.gp_start), -	       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); -	if (ndetected) { -		rcu_dump_cpu_stacks(); - -		/* Complain about tasks blocking the grace period. */ -		rcu_print_detail_task_stall(); -	} else { -		if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { -			pr_err("INFO: Stall ended before state dump start\n"); -		} else { -			j = jiffies; -			gpa = READ_ONCE(rcu_state.gp_activity); -			pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", -			       rcu_state.name, j - gpa, j, gpa, -			       READ_ONCE(jiffies_till_next_fqs), -			       rcu_get_root()->qsmask); -			/* In this case, the current CPU might be at fault. */ -			sched_show_task(current); -		} -	} -	/* Rewrite if needed in case of slow consoles. 
*/ -	if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) -		WRITE_ONCE(rcu_state.jiffies_stall, -			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - -	rcu_check_gp_kthread_starvation(); - -	panic_on_rcu_stall(); - -	rcu_force_quiescent_state();  /* Kick them all. */ -} - -static void print_cpu_stall(void) -{ -	int cpu; -	unsigned long flags; -	struct rcu_data *rdp = this_cpu_ptr(&rcu_data); -	struct rcu_node *rnp = rcu_get_root(); -	long totqlen = 0; - -	/* Kick and suppress, if so configured. */ -	rcu_stall_kick_kthreads(); -	if (rcu_cpu_stall_suppress) -		return; - -	/* -	 * OK, time to rat on ourselves... -	 * See Documentation/RCU/stallwarn.txt for info on how to debug -	 * RCU CPU stall warnings. -	 */ -	pr_err("INFO: %s self-detected stall on CPU", rcu_state.name); -	print_cpu_stall_info_begin(); -	raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); -	print_cpu_stall_info(smp_processor_id()); -	raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); -	print_cpu_stall_info_end(); -	for_each_possible_cpu(cpu) -		totqlen += rcu_get_n_cbs_cpu(cpu); -	pr_cont(" (t=%lu jiffies g=%ld q=%lu)\n", -		jiffies - rcu_state.gp_start, -		(long)rcu_seq_current(&rcu_state.gp_seq), totqlen); - -	rcu_check_gp_kthread_starvation(); - -	rcu_dump_cpu_stacks(); - -	raw_spin_lock_irqsave_rcu_node(rnp, flags); -	/* Rewrite if needed in case of slow consoles. */ -	if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) -		WRITE_ONCE(rcu_state.jiffies_stall, -			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3); -	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - -	panic_on_rcu_stall(); - -	/* -	 * Attempt to revive the RCU machinery by forcing a context switch. -	 * -	 * A context switch would normally allow the RCU state machine to make -	 * progress and it could be we're stuck in kernel space without context -	 * switches for an entirely unreasonable amount of time. -	 */ -	set_tsk_need_resched(current); -	set_preempt_need_resched(); -} - -static void check_cpu_stall(struct rcu_data *rdp) -{ -	unsigned long gs1; -	unsigned long gs2; -	unsigned long gps; -	unsigned long j; -	unsigned long jn; -	unsigned long js; -	struct rcu_node *rnp; - -	if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || -	    !rcu_gp_in_progress()) -		return; -	rcu_stall_kick_kthreads(); -	j = jiffies; - -	/* -	 * Lots of memory barriers to reject false positives. -	 * -	 * The idea is to pick up rcu_state.gp_seq, then -	 * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally -	 * another copy of rcu_state.gp_seq.  These values are updated in -	 * the opposite order with memory barriers (or equivalent) during -	 * grace-period initialization and cleanup.  Now, a false positive -	 * can occur if we get an new value of rcu_state.gp_start and a old -	 * value of rcu_state.jiffies_stall.  But given the memory barriers, -	 * the only way that this can happen is if one grace period ends -	 * and another starts between these two fetches.  This is detected -	 * by comparing the second fetch of rcu_state.gp_seq with the -	 * previous fetch from rcu_state.gp_seq. -	 * -	 * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, -	 * and rcu_state.gp_start suffice to forestall false positives. -	 */ -	gs1 = READ_ONCE(rcu_state.gp_seq); -	smp_rmb(); /* Pick up ->gp_seq first... */ -	js = READ_ONCE(rcu_state.jiffies_stall); -	smp_rmb(); /* ...then ->jiffies_stall before the rest... */ -	gps = READ_ONCE(rcu_state.gp_start); -	smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. 
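The barrier sequence spelled out above is a hand-rolled version of the usual sequence-style consistency check: sample a version, read the data it guards, then re-sample the version and throw the result away if it moved (the same idea the seqlock API packages up as read_seqbegin()/read_seqretry()). Stripped of the RCU specifics, with illustrative names:

static unsigned long version;		/* illustrative guarded pair */
static unsigned long guarded_data;

static bool demo_read_consistent(unsigned long *out)
{
	unsigned long v1, v2, data;

	v1 = READ_ONCE(version);
	smp_rmb();			/* version before data           */
	data = READ_ONCE(guarded_data);
	smp_rmb();			/* data before version re-check  */
	v2 = READ_ONCE(version);
	if (v1 != v2)
		return false;		/* an update raced in: discard   */
	*out = data;
	return true;
}

In check_cpu_stall() the version is rcu_state.gp_seq and the guarded data are jiffies_stall and gp_start; if the two gp_seq samples differ, a grace-period boundary intervened and the fetched stall deadline may be stale, so the check simply returns.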
*/ -	gs2 = READ_ONCE(rcu_state.gp_seq); -	if (gs1 != gs2 || -	    ULONG_CMP_LT(j, js) || -	    ULONG_CMP_GE(gps, js)) -		return; /* No stall or GP completed since entering function. */ -	rnp = rdp->mynode; -	jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; -	if (rcu_gp_in_progress() && -	    (READ_ONCE(rnp->qsmask) & rdp->grpmask) && -	    cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - -		/* We haven't checked in, so go dump stack. */ -		print_cpu_stall(); - -	} else if (rcu_gp_in_progress() && -		   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && -		   cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - -		/* They had a few time units to dump stack, so complain. */ -		print_other_cpu_stall(gs2); -	} -} - -/** - * rcu_cpu_stall_reset - prevent further stall warnings in current grace period - * - * Set the stall-warning timeout way off into the future, thus preventing - * any RCU CPU stall-warning messages from appearing in the current set of - * RCU grace periods. - * - * The caller must disable hard irqs. - */ -void rcu_cpu_stall_reset(void) -{ -	WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); -} -  /* Trace-event wrapper function for trace_rcu_future_grace_period.  */  static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,  			      unsigned long gp_seq_req, const char *s) @@ -1585,7 +1203,7 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)  static void rcu_gp_kthread_wake(void)  {  	if ((current == rcu_state.gp_kthread && -	     !in_interrupt() && !in_serving_softirq()) || +	     !in_irq() && !in_serving_softirq()) ||  	    !READ_ONCE(rcu_state.gp_flags) ||  	    !rcu_state.gp_kthread)  		return; @@ -2295,11 +1913,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)  		return;  	}  	mask = rdp->grpmask; +	rdp->core_needs_qs = false;  	if ((rnp->qsmask & mask) == 0) {  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  	} else { -		rdp->core_needs_qs = false; -  		/*  		 * This GP can't end until cpu checks in, so all of our  		 * callbacks can be processed during the next GP. @@ -2548,11 +2165,11 @@ void rcu_sched_clock_irq(int user)  }  /* - * Scan the leaf rcu_node structures, processing dyntick state for any that - * have not yet encountered a quiescent state, using the function specified. - * Also initiate boosting for any threads blocked on the root rcu_node. - * - * The caller must have suppressed start of new grace periods. + * Scan the leaf rcu_node structures.  For each structure on which all + * CPUs have reported a quiescent state and on which there are tasks + * blocking the current grace period, initiate RCU priority boosting. + * Otherwise, invoke the specified function to check dyntick state for + * each CPU that has not yet reported a quiescent state.   */  static void force_qs_rnp(int (*f)(struct rcu_data *rdp))  { @@ -2635,101 +2252,6 @@ void rcu_force_quiescent_state(void)  }  EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); -/* - * This function checks for grace-period requests that fail to motivate - * RCU to come out of its idle mode. - */ -void -rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, -			 const unsigned long gpssdelay) -{ -	unsigned long flags; -	unsigned long j; -	struct rcu_node *rnp_root = rcu_get_root(); -	static atomic_t warned = ATOMIC_INIT(0); - -	if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || -	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) -		return; -	j = jiffies; /* Expensive access, and in common case don't get here. 
*/ -	if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || -	    time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || -	    atomic_read(&warned)) -		return; - -	raw_spin_lock_irqsave_rcu_node(rnp, flags); -	j = jiffies; -	if (rcu_gp_in_progress() || -	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || -	    time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || -	    time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || -	    atomic_read(&warned)) { -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -		return; -	} -	/* Hold onto the leaf lock to make others see warned==1. */ - -	if (rnp_root != rnp) -		raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ -	j = jiffies; -	if (rcu_gp_in_progress() || -	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || -	    time_before(j, rcu_state.gp_req_activity + gpssdelay) || -	    time_before(j, rcu_state.gp_activity + gpssdelay) || -	    atomic_xchg(&warned, 1)) { -		raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -		return; -	} -	WARN_ON(1); -	if (rnp_root != rnp) -		raw_spin_unlock_rcu_node(rnp_root); -	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -	show_rcu_gp_kthreads(); -} - -/* - * Do a forward-progress check for rcutorture.  This is normally invoked - * due to an OOM event.  The argument "j" gives the time period during - * which rcutorture would like progress to have been made. - */ -void rcu_fwd_progress_check(unsigned long j) -{ -	unsigned long cbs; -	int cpu; -	unsigned long max_cbs = 0; -	int max_cpu = -1; -	struct rcu_data *rdp; - -	if (rcu_gp_in_progress()) { -		pr_info("%s: GP age %lu jiffies\n", -			__func__, jiffies - rcu_state.gp_start); -		show_rcu_gp_kthreads(); -	} else { -		pr_info("%s: Last GP end %lu jiffies ago\n", -			__func__, jiffies - rcu_state.gp_end); -		preempt_disable(); -		rdp = this_cpu_ptr(&rcu_data); -		rcu_check_gp_start_stall(rdp->mynode, rdp, j); -		preempt_enable(); -	} -	for_each_possible_cpu(cpu) { -		cbs = rcu_get_n_cbs_cpu(cpu); -		if (!cbs) -			continue; -		if (max_cpu < 0) -			pr_info("%s: callbacks", __func__); -		pr_cont(" %d: %lu", cpu, cbs); -		if (cbs <= max_cbs) -			continue; -		max_cbs = cbs; -		max_cpu = cpu; -	} -	if (max_cpu >= 0) -		pr_cont("\n"); -} -EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); -  /* Perform RCU core processing work for the current CPU.  */  static __latent_entropy void rcu_core(struct softirq_action *unused)  { @@ -2870,7 +2392,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)  		 * Use rcu:rcu_callback trace event to find the previous  		 * time callback was passed to __call_rcu().  		 */ -		WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n", +		WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n",  			  head, head->func);  		WRITE_ONCE(head->func, rcu_leak_callback);  		return; @@ -3559,13 +3081,11 @@ static int rcu_pm_notify(struct notifier_block *self,  	switch (action) {  	case PM_HIBERNATION_PREPARE:  	case PM_SUSPEND_PREPARE: -		if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ -			rcu_expedite_gp(); +		rcu_expedite_gp();  		break;  	case PM_POST_HIBERNATION:  	case PM_POST_SUSPEND: -		if (nr_cpu_ids <= 256) /* Expediting bad for large systems. 
*/ -			rcu_unexpedite_gp(); +		rcu_unexpedite_gp();  		break;  	default:  		break; @@ -3742,8 +3262,7 @@ static void __init rcu_init_geometry(void)  		jiffies_till_first_fqs = d;  	if (jiffies_till_next_fqs == ULONG_MAX)  		jiffies_till_next_fqs = d; -	if (jiffies_till_sched_qs == ULONG_MAX) -		adjust_jiffies_till_sched_qs(); +	adjust_jiffies_till_sched_qs();  	/* If the compile-time values are accurate, just leave. */  	if (rcu_fanout_leaf == RCU_FANOUT_LEAF && @@ -3858,5 +3377,6 @@ void __init rcu_init(void)  	srcu_init();  } +#include "tree_stall.h"  #include "tree_exp.h"  #include "tree_plugin.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bb4f995f2d3f..e253d11af3c4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -393,15 +393,13 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name;  int rcu_dynticks_snap(struct rcu_data *rdp); -/* Forward declarations for rcutree_plugin.h */ +/* Forward declarations for tree_plugin.h */  static void rcu_bootup_announce(void);  static void rcu_qs(void);  static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);  #ifdef CONFIG_HOTPLUG_CPU  static bool rcu_preempt_has_tasks(struct rcu_node *rnp);  #endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_print_detail_task_stall(void); -static int rcu_print_task_stall(struct rcu_node *rnp);  static int rcu_print_task_exp_stall(struct rcu_node *rnp);  static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);  static void rcu_flavor_sched_clock_irq(int user); @@ -418,9 +416,6 @@ static void rcu_prepare_for_idle(void);  static bool rcu_preempt_has_tasks(struct rcu_node *rnp);  static bool rcu_preempt_need_deferred_qs(struct task_struct *t);  static void rcu_preempt_deferred_qs(struct task_struct *t); -static void print_cpu_stall_info_begin(void); -static void print_cpu_stall_info(int cpu); -static void print_cpu_stall_info_end(void);  static void zero_cpu_stall_ticks(struct rcu_data *rdp);  static bool rcu_nocb_cpu_needs_barrier(int cpu);  static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); @@ -445,3 +440,10 @@ static void rcu_bind_gp_kthread(void);  static bool rcu_nohz_full_cpu(void);  static void rcu_dynticks_task_enter(void);  static void rcu_dynticks_task_exit(void); + +/* Forward declarations for tree_stall.h */ +static void record_gp_stall_check_time(void); +static void rcu_iw_handler(struct irq_work *iwp); +static void check_cpu_stall(struct rcu_data *rdp); +static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, +				     const unsigned long gpssdelay); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 4c2a0189e748..9c990df880d1 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -10,6 +10,7 @@  #include <linux/lockdep.h>  static void rcu_exp_handler(void *unused); +static int rcu_print_task_exp_stall(struct rcu_node *rnp);  /*   * Record the start of an expedited grace period. 
@@ -633,7 +634,7 @@ static void rcu_exp_handler(void *unused)  		raw_spin_lock_irqsave_rcu_node(rnp, flags);  		if (rnp->expmask & rdp->grpmask) {  			rdp->deferred_qs = true; -			WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true); +			t->rcu_read_unlock_special.b.exp_hint = true;  		}  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  		return; @@ -648,7 +649,7 @@ static void rcu_exp_handler(void *unused)  	 *  	 * If the CPU is fully enabled (or if some buggy RCU-preempt  	 * read-side critical section is being used from idle), just -	 * invoke rcu_preempt_defer_qs() to immediately report the +	 * invoke rcu_preempt_deferred_qs() to immediately report the  	 * quiescent state.  We cannot use rcu_read_unlock_special()  	 * because we are in an interrupt handler, which will cause that  	 * function to take an early exit without doing anything. @@ -670,6 +671,27 @@ static void sync_sched_exp_online_cleanup(int cpu)  {  } +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each that is blocking the current + * expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ +	struct task_struct *t; +	int ndetected = 0; + +	if (!rnp->exp_tasks) +		return 0; +	t = list_entry(rnp->exp_tasks->prev, +		       struct task_struct, rcu_node_entry); +	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { +		pr_cont(" P%d", t->pid); +		ndetected++; +	} +	return ndetected; +} +  #else /* #ifdef CONFIG_PREEMPT_RCU */  /* Invoked on each online non-idle CPU for expedited quiescent state. */ @@ -709,6 +731,16 @@ static void sync_sched_exp_online_cleanup(int cpu)  	WARN_ON_ONCE(ret);  } +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections that are + * blocking the current expedited grace period. + */ +static int rcu_print_task_exp_stall(struct rcu_node *rnp) +{ +	return 0; +} +  #endif /* #else #ifdef CONFIG_PREEMPT_RCU */  /** diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 97dba50f6fb2..1102765f91fd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -285,7 +285,7 @@ static void rcu_qs(void)  				       TPS("cpuqs"));  		__this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);  		barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */ -		current->rcu_read_unlock_special.b.need_qs = false; +		WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false);  	}  } @@ -643,100 +643,6 @@ static void rcu_read_unlock_special(struct task_struct *t)  }  /* - * Dump detailed information for all tasks blocking the current RCU - * grace period on the specified rcu_node structure. - */ -static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) -{ -	unsigned long flags; -	struct task_struct *t; - -	raw_spin_lock_irqsave_rcu_node(rnp, flags); -	if (!rcu_preempt_blocked_readers_cgp(rnp)) { -		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -		return; -	} -	t = list_entry(rnp->gp_tasks->prev, -		       struct task_struct, rcu_node_entry); -	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { -		/* -		 * We could be printing a lot while holding a spinlock. -		 * Avoid triggering hard lockup. -		 */ -		touch_nmi_watchdog(); -		sched_show_task(t); -	} -	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -} - -/* - * Dump detailed information for all tasks blocking the current RCU - * grace period. 
- */ -static void rcu_print_detail_task_stall(void) -{ -	struct rcu_node *rnp = rcu_get_root(); - -	rcu_print_detail_task_stall_rnp(rnp); -	rcu_for_each_leaf_node(rnp) -		rcu_print_detail_task_stall_rnp(rnp); -} - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ -	pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", -	       rnp->level, rnp->grplo, rnp->grphi); -} - -static void rcu_print_task_stall_end(void) -{ -	pr_cont("\n"); -} - -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each. - */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ -	struct task_struct *t; -	int ndetected = 0; - -	if (!rcu_preempt_blocked_readers_cgp(rnp)) -		return 0; -	rcu_print_task_stall_begin(rnp); -	t = list_entry(rnp->gp_tasks->prev, -		       struct task_struct, rcu_node_entry); -	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { -		pr_cont(" P%d", t->pid); -		ndetected++; -	} -	rcu_print_task_stall_end(); -	return ndetected; -} - -/* - * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each that is blocking the current - * expedited grace period. - */ -static int rcu_print_task_exp_stall(struct rcu_node *rnp) -{ -	struct task_struct *t; -	int ndetected = 0; - -	if (!rnp->exp_tasks) -		return 0; -	t = list_entry(rnp->exp_tasks->prev, -		       struct task_struct, rcu_node_entry); -	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { -		pr_cont(" P%d", t->pid); -		ndetected++; -	} -	return ndetected; -} - -/*   * Check that the list of blocked tasks for the newly completed grace   * period is in fact empty.  It is a serious bug to complete a grace   * period that still has RCU readers blocked!  This function must be @@ -804,19 +710,25 @@ static void rcu_flavor_sched_clock_irq(int user)  /*   * Check for a task exiting while in a preemptible-RCU read-side - * critical section, clean up if so.  No need to issue warnings, - * as debug_check_no_locks_held() already does this if lockdep - * is enabled. + * critical section, clean up if so.  No need to issue warnings, as + * debug_check_no_locks_held() already does this if lockdep is enabled. + * Besides, if this function does anything other than just immediately + * return, there was a bug of some sort.  Spewing warnings from this + * function is like as not to simply obscure important prior warnings.   */  void exit_rcu(void)  {  	struct task_struct *t = current; -	if (likely(list_empty(&current->rcu_node_entry))) +	if (unlikely(!list_empty(&current->rcu_node_entry))) { +		t->rcu_read_lock_nesting = 1; +		barrier(); +		WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true); +	} else if (unlikely(t->rcu_read_lock_nesting)) { +		t->rcu_read_lock_nesting = 1; +	} else {  		return; -	t->rcu_read_lock_nesting = 1; -	barrier(); -	t->rcu_read_unlock_special.b.blocked = true; +	}  	__rcu_read_unlock();  	rcu_preempt_deferred_qs(current);  } @@ -980,33 +892,6 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t)  static void rcu_preempt_deferred_qs(struct task_struct *t) { }  /* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. - */ -static void rcu_print_detail_task_stall(void) -{ -} - -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections. 
- */ -static int rcu_print_task_stall(struct rcu_node *rnp) -{ -	return 0; -} - -/* - * Because preemptible RCU does not exist, we never have to check for - * tasks blocked within RCU read-side critical sections that are - * blocking the current expedited grace period. - */ -static int rcu_print_task_exp_stall(struct rcu_node *rnp) -{ -	return 0; -} - -/*   * Because there is no preemptible RCU, there can be no readers blocked,   * so there is no need to check for blocked tasks.  So check only for   * bogus qsmask values. @@ -1185,8 +1070,6 @@ static int rcu_boost_kthread(void *arg)  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)  	__releases(rnp->lock)  { -	struct task_struct *t; -  	raw_lockdep_assert_held_rcu_node(rnp);  	if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -1200,9 +1083,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)  		if (rnp->exp_tasks == NULL)  			rnp->boost_tasks = rnp->gp_tasks;  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); -		t = rnp->boost_kthread_task; -		if (t) -			rcu_wake_cond(t, rnp->boost_kthread_status); +		rcu_wake_cond(rnp->boost_kthread_task, +			      rnp->boost_kthread_status);  	} else {  		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);  	} @@ -1649,98 +1531,6 @@ static void rcu_cleanup_after_idle(void)  #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#ifdef CONFIG_RCU_FAST_NO_HZ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ -	struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - -	sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", -		rdp->last_accelerate & 0xffff, jiffies & 0xffff, -		".l"[rdp->all_lazy], -		".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], -		".D"[!rdp->tick_nohz_enabled_snap]); -} - -#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ -	*cp = '\0'; -} - -#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ - -/* Initiate the stall-info list. */ -static void print_cpu_stall_info_begin(void) -{ -	pr_cont("\n"); -} - -/* - * Print out diagnostic information for the specified stalled CPU. - * - * If the specified CPU is aware of the current RCU grace period, then - * print the number of scheduling clock interrupts the CPU has taken - * during the time that it has been aware.  Otherwise, print the number - * of RCU grace periods that this CPU is ignorant of, for example, "1" - * if the CPU was aware of the previous grace period. - * - * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. - */ -static void print_cpu_stall_info(int cpu) -{ -	unsigned long delta; -	char fast_no_hz[72]; -	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); -	char *ticks_title; -	unsigned long ticks_value; - -	/* -	 * We could be printing a lot while holding a spinlock.  Avoid -	 * triggering hard lockup. 
-	 */ -	touch_nmi_watchdog(); - -	ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); -	if (ticks_value) { -		ticks_title = "GPs behind"; -	} else { -		ticks_title = "ticks this GP"; -		ticks_value = rdp->ticks_this_gp; -	} -	print_cpu_stall_fast_no_hz(fast_no_hz, cpu); -	delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); -	pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", -	       cpu, -	       "O."[!!cpu_online(cpu)], -	       "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], -	       "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], -	       !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : -			rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : -				"!."[!delta], -	       ticks_value, ticks_title, -	       rcu_dynticks_snap(rdp) & 0xfff, -	       rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, -	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), -	       READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, -	       fast_no_hz); -} - -/* Terminate the stall-info list. */ -static void print_cpu_stall_info_end(void) -{ -	pr_err("\t"); -} - -/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ -static void zero_cpu_stall_ticks(struct rcu_data *rdp) -{ -	rdp->ticks_this_gp = 0; -	rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); -	WRITE_ONCE(rdp->last_fqs_resched, jiffies); -} -  #ifdef CONFIG_RCU_NOCB_CPU  /* @@ -1766,11 +1556,22 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp)   */ -/* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ +/* + * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. + * The string after the "rcu_nocbs=" is either "all" for all CPUs, or a + * comma-separated list of CPUs and/or CPU ranges.  If an invalid list is + * given, a warning is emitted and all CPUs are offloaded. + */  static int __init rcu_nocb_setup(char *str)  {  	alloc_bootmem_cpumask_var(&rcu_nocb_mask); -	cpulist_parse(str, rcu_nocb_mask); +	if (!strcasecmp(str, "all")) +		cpumask_setall(rcu_nocb_mask); +	else +		if (cpulist_parse(str, rcu_nocb_mask)) { +			pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); +			cpumask_setall(rcu_nocb_mask); +		}  	return 1;  }  __setup("rcu_nocbs=", rcu_nocb_setup); diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h new file mode 100644 index 000000000000..f65a73a97323 --- /dev/null +++ b/kernel/rcu/tree_stall.h @@ -0,0 +1,709 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * RCU CPU stall warnings for normal RCU grace periods + * + * Copyright IBM Corporation, 2019 + * + * Author: Paul E. McKenney <paulmck@linux.ibm.com> + */ + +////////////////////////////////////////////////////////////////////////////// +// +// Controlling CPU stall warnings, including delay calculation. + +/* panic() on RCU Stall sysctl. */ +int sysctl_panic_on_rcu_stall __read_mostly; + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA	       (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA	       0 +#endif + +/* Limit-check stall timeouts specified at boottime and runtime. */ +int rcu_jiffies_till_stall_check(void) +{ +	int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); + +	/* +	 * Limit check must be consistent with the Kconfig limits +	 * for CONFIG_RCU_CPU_STALL_TIMEOUT. 
+	 */ +	if (till_stall_check < 3) { +		WRITE_ONCE(rcu_cpu_stall_timeout, 3); +		till_stall_check = 3; +	} else if (till_stall_check > 300) { +		WRITE_ONCE(rcu_cpu_stall_timeout, 300); +		till_stall_check = 300; +	} +	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} +EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); + +/* Don't do RCU CPU stall warnings during long sysrq printouts. */ +void rcu_sysrq_start(void) +{ +	if (!rcu_cpu_stall_suppress) +		rcu_cpu_stall_suppress = 2; +} + +void rcu_sysrq_end(void) +{ +	if (rcu_cpu_stall_suppress == 2) +		rcu_cpu_stall_suppress = 0; +} + +/* Don't print RCU CPU stall warnings during a kernel panic. */ +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ +	rcu_cpu_stall_suppress = 1; +	return NOTIFY_DONE; +} + +static struct notifier_block rcu_panic_block = { +	.notifier_call = rcu_panic, +}; + +static int __init check_cpu_stall_init(void) +{ +	atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); +	return 0; +} +early_initcall(check_cpu_stall_init); + +/* If so specified via sysctl, panic, yielding cleaner stall-warning output. */ +static void panic_on_rcu_stall(void) +{ +	if (sysctl_panic_on_rcu_stall) +		panic("RCU Stall\n"); +} + +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ +	WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2); +} + +////////////////////////////////////////////////////////////////////////////// +// +// Interaction with RCU grace periods + +/* Start of new grace period, so record stall time (and forcing times). */ +static void record_gp_stall_check_time(void) +{ +	unsigned long j = jiffies; +	unsigned long j1; + +	rcu_state.gp_start = j; +	j1 = rcu_jiffies_till_stall_check(); +	/* Record ->gp_start before ->jiffies_stall. */ +	smp_store_release(&rcu_state.jiffies_stall, j + j1); /* ^^^ */ +	rcu_state.jiffies_resched = j + j1 / 2; +	rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); +} + +/* Zero ->ticks_this_gp and snapshot the number of RCU softirq handlers. */ +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ +	rdp->ticks_this_gp = 0; +	rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); +	WRITE_ONCE(rdp->last_fqs_resched, jiffies); +} + +/* + * If too much time has passed in the current grace period, and if + * so configured, go kick the relevant kthreads. + */ +static void rcu_stall_kick_kthreads(void) +{ +	unsigned long j; + +	if (!rcu_kick_kthreads) +		return; +	j = READ_ONCE(rcu_state.jiffies_kick_kthreads); +	if (time_after(jiffies, j) && rcu_state.gp_kthread && +	    (rcu_gp_in_progress() || READ_ONCE(rcu_state.gp_flags))) { +		WARN_ONCE(1, "Kicking %s grace-period kthread\n", +			  rcu_state.name); +		rcu_ftrace_dump(DUMP_ALL); +		wake_up_process(rcu_state.gp_kthread); +		WRITE_ONCE(rcu_state.jiffies_kick_kthreads, j + HZ); +	} +} + +/* + * Handler for the irq_work request posted about halfway into the RCU CPU + * stall timeout, and used to detect excessive irq disabling.  Set state + * appropriately, but just complain if there is unexpected state on entry. 
+ */ +static void rcu_iw_handler(struct irq_work *iwp) +{ +	struct rcu_data *rdp; +	struct rcu_node *rnp; + +	rdp = container_of(iwp, struct rcu_data, rcu_iw); +	rnp = rdp->mynode; +	raw_spin_lock_rcu_node(rnp); +	if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) { +		rdp->rcu_iw_gp_seq = rnp->gp_seq; +		rdp->rcu_iw_pending = false; +	} +	raw_spin_unlock_rcu_node(rnp); +} + +////////////////////////////////////////////////////////////////////////////// +// +// Printing RCU CPU stall warnings + +#ifdef CONFIG_PREEMPT + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period on the specified rcu_node structure. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ +	unsigned long flags; +	struct task_struct *t; + +	raw_spin_lock_irqsave_rcu_node(rnp, flags); +	if (!rcu_preempt_blocked_readers_cgp(rnp)) { +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +		return; +	} +	t = list_entry(rnp->gp_tasks->prev, +		       struct task_struct, rcu_node_entry); +	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { +		/* +		 * We could be printing a lot while holding a spinlock. +		 * Avoid triggering hard lockup. +		 */ +		touch_nmi_watchdog(); +		sched_show_task(t); +	} +	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each. + */ +static int rcu_print_task_stall(struct rcu_node *rnp) +{ +	struct task_struct *t; +	int ndetected = 0; + +	if (!rcu_preempt_blocked_readers_cgp(rnp)) +		return 0; +	pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", +	       rnp->level, rnp->grplo, rnp->grphi); +	t = list_entry(rnp->gp_tasks->prev, +		       struct task_struct, rcu_node_entry); +	list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { +		pr_cont(" P%d", t->pid); +		ndetected++; +	} +	pr_cont("\n"); +	return ndetected; +} + +#else /* #ifdef CONFIG_PREEMPT */ + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ +} + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static int rcu_print_task_stall(struct rcu_node *rnp) +{ +	return 0; +} +#endif /* #else #ifdef CONFIG_PREEMPT */ + +/* + * Dump stacks of all tasks running on stalled CPUs.  First try using + * NMIs, but fall back to manual remote stack tracing on architectures + * that don't support NMI-based stack dumps.  The NMI-triggered stack + * traces are more accurate because they are printed by the target CPU. 
+ */ +static void rcu_dump_cpu_stacks(void) +{ +	int cpu; +	unsigned long flags; +	struct rcu_node *rnp; + +	rcu_for_each_leaf_node(rnp) { +		raw_spin_lock_irqsave_rcu_node(rnp, flags); +		for_each_leaf_node_possible_cpu(rnp, cpu) +			if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) +				if (!trigger_single_cpu_backtrace(cpu)) +					dump_cpu_task(cpu); +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +	} +} + +#ifdef CONFIG_RCU_FAST_NO_HZ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ +	struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + +	sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", +		rdp->last_accelerate & 0xffff, jiffies & 0xffff, +		".l"[rdp->all_lazy], +		".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], +		".D"[!!rdp->tick_nohz_enabled_snap]); +} + +#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ +	*cp = '\0'; +} + +#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ + +/* + * Print out diagnostic information for the specified stalled CPU. + * + * If the specified CPU is aware of the current RCU grace period, then + * print the number of scheduling clock interrupts the CPU has taken + * during the time that it has been aware.  Otherwise, print the number + * of RCU grace periods that this CPU is ignorant of, for example, "1" + * if the CPU was aware of the previous grace period. + * + * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + */ +static void print_cpu_stall_info(int cpu) +{ +	unsigned long delta; +	char fast_no_hz[72]; +	struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); +	char *ticks_title; +	unsigned long ticks_value; + +	/* +	 * We could be printing a lot while holding a spinlock.  Avoid +	 * triggering hard lockup. +	 */ +	touch_nmi_watchdog(); + +	ticks_value = rcu_seq_ctr(rcu_state.gp_seq - rdp->gp_seq); +	if (ticks_value) { +		ticks_title = "GPs behind"; +	} else { +		ticks_title = "ticks this GP"; +		ticks_value = rdp->ticks_this_gp; +	} +	print_cpu_stall_fast_no_hz(fast_no_hz, cpu); +	delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); +	pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", +	       cpu, +	       "O."[!!cpu_online(cpu)], +	       "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], +	       "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], +	       !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' : +			rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' : +				"!."[!delta], +	       ticks_value, ticks_title, +	       rcu_dynticks_snap(rdp) & 0xfff, +	       rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, +	       rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), +	       READ_ONCE(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, +	       fast_no_hz); +} + +/* Complain about starvation of grace-period kthread.  */ +static void rcu_check_gp_kthread_starvation(void) +{ +	struct task_struct *gpk = rcu_state.gp_kthread; +	unsigned long j; + +	j = jiffies - READ_ONCE(rcu_state.gp_activity); +	if (j > 2 * HZ) { +		pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", +		       rcu_state.name, j, +		       (long)rcu_seq_current(&rcu_state.gp_seq), +		       READ_ONCE(rcu_state.gp_flags), +		       gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, +		       gpk ? gpk->state : ~0, gpk ? 
task_cpu(gpk) : -1); +		if (gpk) { +			pr_err("RCU grace-period kthread stack dump:\n"); +			sched_show_task(gpk); +			wake_up_process(gpk); +		} +	} +} + +static void print_other_cpu_stall(unsigned long gp_seq) +{ +	int cpu; +	unsigned long flags; +	unsigned long gpa; +	unsigned long j; +	int ndetected = 0; +	struct rcu_node *rnp; +	long totqlen = 0; + +	/* Kick and suppress, if so configured. */ +	rcu_stall_kick_kthreads(); +	if (rcu_cpu_stall_suppress) +		return; + +	/* +	 * OK, time to rat on our buddy... +	 * See Documentation/RCU/stallwarn.txt for info on how to debug +	 * RCU CPU stall warnings. +	 */ +	pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); +	rcu_for_each_leaf_node(rnp) { +		raw_spin_lock_irqsave_rcu_node(rnp, flags); +		ndetected += rcu_print_task_stall(rnp); +		if (rnp->qsmask != 0) { +			for_each_leaf_node_possible_cpu(rnp, cpu) +				if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { +					print_cpu_stall_info(cpu); +					ndetected++; +				} +		} +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +	} + +	for_each_possible_cpu(cpu) +		totqlen += rcu_get_n_cbs_cpu(cpu); +	pr_cont("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu)\n", +	       smp_processor_id(), (long)(jiffies - rcu_state.gp_start), +	       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); +	if (ndetected) { +		rcu_dump_cpu_stacks(); + +		/* Complain about tasks blocking the grace period. */ +		rcu_for_each_leaf_node(rnp) +			rcu_print_detail_task_stall_rnp(rnp); +	} else { +		if (rcu_seq_current(&rcu_state.gp_seq) != gp_seq) { +			pr_err("INFO: Stall ended before state dump start\n"); +		} else { +			j = jiffies; +			gpa = READ_ONCE(rcu_state.gp_activity); +			pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", +			       rcu_state.name, j - gpa, j, gpa, +			       READ_ONCE(jiffies_till_next_fqs), +			       rcu_get_root()->qsmask); +			/* In this case, the current CPU might be at fault. */ +			sched_show_task(current); +		} +	} +	/* Rewrite if needed in case of slow consoles. */ +	if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) +		WRITE_ONCE(rcu_state.jiffies_stall, +			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + +	rcu_check_gp_kthread_starvation(); + +	panic_on_rcu_stall(); + +	rcu_force_quiescent_state();  /* Kick them all. */ +} + +static void print_cpu_stall(void) +{ +	int cpu; +	unsigned long flags; +	struct rcu_data *rdp = this_cpu_ptr(&rcu_data); +	struct rcu_node *rnp = rcu_get_root(); +	long totqlen = 0; + +	/* Kick and suppress, if so configured. */ +	rcu_stall_kick_kthreads(); +	if (rcu_cpu_stall_suppress) +		return; + +	/* +	 * OK, time to rat on ourselves... +	 * See Documentation/RCU/stallwarn.txt for info on how to debug +	 * RCU CPU stall warnings. +	 */ +	pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name); +	raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags); +	print_cpu_stall_info(smp_processor_id()); +	raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags); +	for_each_possible_cpu(cpu) +		totqlen += rcu_get_n_cbs_cpu(cpu); +	pr_cont("\t(t=%lu jiffies g=%ld q=%lu)\n", +		jiffies - rcu_state.gp_start, +		(long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + +	rcu_check_gp_kthread_starvation(); + +	rcu_dump_cpu_stacks(); + +	raw_spin_lock_irqsave_rcu_node(rnp, flags); +	/* Rewrite if needed in case of slow consoles. 
*/ +	if (ULONG_CMP_GE(jiffies, READ_ONCE(rcu_state.jiffies_stall))) +		WRITE_ONCE(rcu_state.jiffies_stall, +			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3); +	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + +	panic_on_rcu_stall(); + +	/* +	 * Attempt to revive the RCU machinery by forcing a context switch. +	 * +	 * A context switch would normally allow the RCU state machine to make +	 * progress and it could be we're stuck in kernel space without context +	 * switches for an entirely unreasonable amount of time. +	 */ +	set_tsk_need_resched(current); +	set_preempt_need_resched(); +} + +static void check_cpu_stall(struct rcu_data *rdp) +{ +	unsigned long gs1; +	unsigned long gs2; +	unsigned long gps; +	unsigned long j; +	unsigned long jn; +	unsigned long js; +	struct rcu_node *rnp; + +	if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) || +	    !rcu_gp_in_progress()) +		return; +	rcu_stall_kick_kthreads(); +	j = jiffies; + +	/* +	 * Lots of memory barriers to reject false positives. +	 * +	 * The idea is to pick up rcu_state.gp_seq, then +	 * rcu_state.jiffies_stall, then rcu_state.gp_start, and finally +	 * another copy of rcu_state.gp_seq.  These values are updated in +	 * the opposite order with memory barriers (or equivalent) during +	 * grace-period initialization and cleanup.  Now, a false positive +	 * can occur if we get an new value of rcu_state.gp_start and a old +	 * value of rcu_state.jiffies_stall.  But given the memory barriers, +	 * the only way that this can happen is if one grace period ends +	 * and another starts between these two fetches.  This is detected +	 * by comparing the second fetch of rcu_state.gp_seq with the +	 * previous fetch from rcu_state.gp_seq. +	 * +	 * Given this check, comparisons of jiffies, rcu_state.jiffies_stall, +	 * and rcu_state.gp_start suffice to forestall false positives. +	 */ +	gs1 = READ_ONCE(rcu_state.gp_seq); +	smp_rmb(); /* Pick up ->gp_seq first... */ +	js = READ_ONCE(rcu_state.jiffies_stall); +	smp_rmb(); /* ...then ->jiffies_stall before the rest... */ +	gps = READ_ONCE(rcu_state.gp_start); +	smp_rmb(); /* ...and finally ->gp_start before ->gp_seq again. */ +	gs2 = READ_ONCE(rcu_state.gp_seq); +	if (gs1 != gs2 || +	    ULONG_CMP_LT(j, js) || +	    ULONG_CMP_GE(gps, js)) +		return; /* No stall or GP completed since entering function. */ +	rnp = rdp->mynode; +	jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; +	if (rcu_gp_in_progress() && +	    (READ_ONCE(rnp->qsmask) & rdp->grpmask) && +	    cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + +		/* We haven't checked in, so go dump stack. */ +		print_cpu_stall(); + +	} else if (rcu_gp_in_progress() && +		   ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && +		   cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { + +		/* They had a few time units to dump stack, so complain. */ +		print_other_cpu_stall(gs2); +	} +} + +////////////////////////////////////////////////////////////////////////////// +// +// RCU forward-progress mechanisms, including of callback invocation. + + +/* + * Show the state of the grace-period kthreads. 
+ */ +void show_rcu_gp_kthreads(void) +{ +	int cpu; +	unsigned long j; +	unsigned long ja; +	unsigned long jr; +	unsigned long jw; +	struct rcu_data *rdp; +	struct rcu_node *rnp; + +	j = jiffies; +	ja = j - READ_ONCE(rcu_state.gp_activity); +	jr = j - READ_ONCE(rcu_state.gp_req_activity); +	jw = j - READ_ONCE(rcu_state.gp_wake_time); +	pr_info("%s: wait state: %s(%d) ->state: %#lx delta ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_flags %#x\n", +		rcu_state.name, gp_state_getname(rcu_state.gp_state), +		rcu_state.gp_state, +		rcu_state.gp_kthread ? rcu_state.gp_kthread->state : 0x1ffffL, +		ja, jr, jw, (long)READ_ONCE(rcu_state.gp_wake_seq), +		(long)READ_ONCE(rcu_state.gp_seq), +		(long)READ_ONCE(rcu_get_root()->gp_seq_needed), +		READ_ONCE(rcu_state.gp_flags)); +	rcu_for_each_node_breadth_first(rnp) { +		if (ULONG_CMP_GE(rcu_state.gp_seq, rnp->gp_seq_needed)) +			continue; +		pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld\n", +			rnp->grplo, rnp->grphi, (long)rnp->gp_seq, +			(long)rnp->gp_seq_needed); +		if (!rcu_is_leaf_node(rnp)) +			continue; +		for_each_leaf_node_possible_cpu(rnp, cpu) { +			rdp = per_cpu_ptr(&rcu_data, cpu); +			if (rdp->gpwrap || +			    ULONG_CMP_GE(rcu_state.gp_seq, +					 rdp->gp_seq_needed)) +				continue; +			pr_info("\tcpu %d ->gp_seq_needed %ld\n", +				cpu, (long)rdp->gp_seq_needed); +		} +	} +	/* sched_show_task(rcu_state.gp_kthread); */ +} +EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); + +/* + * This function checks for grace-period requests that fail to motivate + * RCU to come out of its idle mode. + */ +static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, +				     const unsigned long gpssdelay) +{ +	unsigned long flags; +	unsigned long j; +	struct rcu_node *rnp_root = rcu_get_root(); +	static atomic_t warned = ATOMIC_INIT(0); + +	if (!IS_ENABLED(CONFIG_PROVE_RCU) || rcu_gp_in_progress() || +	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed)) +		return; +	j = jiffies; /* Expensive access, and in common case don't get here. */ +	if (time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || +	    time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || +	    atomic_read(&warned)) +		return; + +	raw_spin_lock_irqsave_rcu_node(rnp, flags); +	j = jiffies; +	if (rcu_gp_in_progress() || +	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || +	    time_before(j, READ_ONCE(rcu_state.gp_req_activity) + gpssdelay) || +	    time_before(j, READ_ONCE(rcu_state.gp_activity) + gpssdelay) || +	    atomic_read(&warned)) { +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +		return; +	} +	/* Hold onto the leaf lock to make others see warned==1. */ + +	if (rnp_root != rnp) +		raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */ +	j = jiffies; +	if (rcu_gp_in_progress() || +	    ULONG_CMP_GE(rnp_root->gp_seq, rnp_root->gp_seq_needed) || +	    time_before(j, rcu_state.gp_req_activity + gpssdelay) || +	    time_before(j, rcu_state.gp_activity + gpssdelay) || +	    atomic_xchg(&warned, 1)) { +		raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ +		raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +		return; +	} +	WARN_ON(1); +	if (rnp_root != rnp) +		raw_spin_unlock_rcu_node(rnp_root); +	raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +	show_rcu_gp_kthreads(); +} + +/* + * Do a forward-progress check for rcutorture.  This is normally invoked + * due to an OOM event.  
The argument "j" gives the time period during + * which rcutorture would like progress to have been made. + */ +void rcu_fwd_progress_check(unsigned long j) +{ +	unsigned long cbs; +	int cpu; +	unsigned long max_cbs = 0; +	int max_cpu = -1; +	struct rcu_data *rdp; + +	if (rcu_gp_in_progress()) { +		pr_info("%s: GP age %lu jiffies\n", +			__func__, jiffies - rcu_state.gp_start); +		show_rcu_gp_kthreads(); +	} else { +		pr_info("%s: Last GP end %lu jiffies ago\n", +			__func__, jiffies - rcu_state.gp_end); +		preempt_disable(); +		rdp = this_cpu_ptr(&rcu_data); +		rcu_check_gp_start_stall(rdp->mynode, rdp, j); +		preempt_enable(); +	} +	for_each_possible_cpu(cpu) { +		cbs = rcu_get_n_cbs_cpu(cpu); +		if (!cbs) +			continue; +		if (max_cpu < 0) +			pr_info("%s: callbacks", __func__); +		pr_cont(" %d: %lu", cpu, cbs); +		if (cbs <= max_cbs) +			continue; +		max_cbs = cbs; +		max_cpu = cpu; +	} +	if (max_cpu >= 0) +		pr_cont("\n"); +} +EXPORT_SYMBOL_GPL(rcu_fwd_progress_check); + +/* Commandeer a sysrq key to dump RCU's tree. */ +static bool sysrq_rcu; +module_param(sysrq_rcu, bool, 0444); + +/* Dump grace-period-request information due to commandeered sysrq. */ +static void sysrq_show_rcu(int key) +{ +	show_rcu_gp_kthreads(); +} + +static struct sysrq_key_op sysrq_rcudump_op = { +	.handler = sysrq_show_rcu, +	.help_msg = "show-rcu(y)", +	.action_msg = "Show RCU tree", +	.enable_mask = SYSRQ_ENABLE_DUMP, +}; + +static int __init rcu_sysrq_init(void) +{ +	if (sysrq_rcu) +		return register_sysrq_key('y', &sysrq_rcudump_op); +	return 0; +} +early_initcall(rcu_sysrq_init); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index cbaa976c5945..c3bf44ba42e5 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -424,68 +424,11 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);  #endif  #ifdef CONFIG_RCU_STALL_COMMON - -#ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA	       (5 * HZ) -#else -#define RCU_STALL_DELAY_DELTA	       0 -#endif -  int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */  EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); -static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; -  module_param(rcu_cpu_stall_suppress, int, 0644); +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;  module_param(rcu_cpu_stall_timeout, int, 0644); - -int rcu_jiffies_till_stall_check(void) -{ -	int till_stall_check = READ_ONCE(rcu_cpu_stall_timeout); - -	/* -	 * Limit check must be consistent with the Kconfig limits -	 * for CONFIG_RCU_CPU_STALL_TIMEOUT. 
-	 */ -	if (till_stall_check < 3) { -		WRITE_ONCE(rcu_cpu_stall_timeout, 3); -		till_stall_check = 3; -	} else if (till_stall_check > 300) { -		WRITE_ONCE(rcu_cpu_stall_timeout, 300); -		till_stall_check = 300; -	} -	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; -} -EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); - -void rcu_sysrq_start(void) -{ -	if (!rcu_cpu_stall_suppress) -		rcu_cpu_stall_suppress = 2; -} - -void rcu_sysrq_end(void) -{ -	if (rcu_cpu_stall_suppress == 2) -		rcu_cpu_stall_suppress = 0; -} - -static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) -{ -	rcu_cpu_stall_suppress = 1; -	return NOTIFY_DONE; -} - -static struct notifier_block rcu_panic_block = { -	.notifier_call = rcu_panic, -}; - -static int __init check_cpu_stall_init(void) -{ -	atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); -	return 0; -} -early_initcall(check_cpu_stall_init); -  #endif /* #ifdef CONFIG_RCU_STALL_COMMON */  #ifdef CONFIG_TASKS_RCU diff --git a/kernel/resource.c b/kernel/resource.c index 92190f62ebc5..8c15f846e8ef 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -520,21 +520,20 @@ EXPORT_SYMBOL_GPL(page_is_ram);  int region_intersects(resource_size_t start, size_t size, unsigned long flags,  		      unsigned long desc)  { -	resource_size_t end = start + size - 1; +	struct resource res;  	int type = 0; int other = 0;  	struct resource *p; +	res.start = start; +	res.end = start + size - 1; +  	read_lock(&resource_lock);  	for (p = iomem_resource.child; p ; p = p->sibling) {  		bool is_type = (((p->flags & flags) == flags) &&  				((desc == IORES_DESC_NONE) ||  				 (desc == p->desc))); -		if (start >= p->start && start <= p->end) -			is_type ? type++ : other++; -		if (end >= p->start && end <= p->end) -			is_type ? type++ : other++; -		if (p->start >= start && p->end <= end) +		if (resource_overlaps(p, &res))  			is_type ? type++ : other++;  	}  	read_unlock(&resource_lock); diff --git a/kernel/rseq.c b/kernel/rseq.c index 25e9a7b60eba..9424ee90589e 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -254,8 +254,7 @@ static int rseq_ip_fixup(struct pt_regs *regs)   * - signal delivery,   * and return to user-space.   * - * This is how we can ensure that the entire rseq critical section, - * consisting of both the C part and the assembly instruction sequence, + * This is how we can ensure that the entire rseq critical section   * will issue the commit instruction only if executed atomically with   * respect to other threads scheduled on the same CPU, and with respect   * to signal handlers. @@ -314,7 +313,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,  		/* Unregister rseq for current thread. */  		if (current->rseq != rseq || !current->rseq)  			return -EINVAL; -		if (current->rseq_len != rseq_len) +		if (rseq_len != sizeof(*rseq))  			return -EINVAL;  		if (current->rseq_sig != sig)  			return -EPERM; @@ -322,7 +321,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,  		if (ret)  			return ret;  		current->rseq = NULL; -		current->rseq_len = 0;  		current->rseq_sig = 0;  		return 0;  	} @@ -336,7 +334,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,  		 * the provided address differs from the prior  		 * one.  		 
*/ -		if (current->rseq != rseq || current->rseq_len != rseq_len) +		if (current->rseq != rseq || rseq_len != sizeof(*rseq))  			return -EINVAL;  		if (current->rseq_sig != sig)  			return -EPERM; @@ -354,7 +352,6 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,  	if (!access_ok(rseq, rseq_len))  		return -EFAULT;  	current->rseq = rseq; -	current->rseq_len = rseq_len;  	current->rseq_sig = sig;  	/*  	 * If rseq was previously inactive, and has just been diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4778c48a7fda..102dfcf0a29a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -792,10 +792,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)  		rq->nr_uninterruptible--;  	enqueue_task(rq, p, flags); + +	p->on_rq = TASK_ON_RQ_QUEUED;  }  void deactivate_task(struct rq *rq, struct task_struct *p, int flags)  { +	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING; +  	if (task_contributes_to_load(p))  		rq->nr_uninterruptible++; @@ -920,7 +924,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)  }  /* - * Per-CPU kthreads are allowed to run on !actie && online CPUs, see + * Per-CPU kthreads are allowed to run on !active && online CPUs, see   * __set_cpus_allowed_ptr() and select_fallback_rq().   */  static inline bool is_cpu_allowed(struct task_struct *p, int cpu) @@ -1151,7 +1155,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,  		/* Need help from migration thread: drop lock and wait. */  		task_rq_unlock(rq, p, &rf);  		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -		tlb_migrate_finish(p->mm);  		return 0;  	} else if (task_on_rq_queued(p)) {  		/* @@ -1237,11 +1240,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)  		rq_pin_lock(src_rq, &srf);  		rq_pin_lock(dst_rq, &drf); -		p->on_rq = TASK_ON_RQ_MIGRATING;  		deactivate_task(src_rq, p, 0);  		set_task_cpu(p, cpu);  		activate_task(dst_rq, p, 0); -		p->on_rq = TASK_ON_RQ_QUEUED;  		check_preempt_curr(dst_rq, p, 0);  		rq_unpin_lock(dst_rq, &drf); @@ -1681,16 +1682,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)  		__schedstat_inc(p->se.statistics.nr_wakeups_sync);  } -static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) -{ -	activate_task(rq, p, en_flags); -	p->on_rq = TASK_ON_RQ_QUEUED; - -	/* If a worker is waking up, notify the workqueue: */ -	if (p->flags & PF_WQ_WORKER) -		wq_worker_waking_up(p, cpu_of(rq)); -} -  /*   * Mark the task runnable and perform wakeup-preemption.   */ @@ -1742,7 +1733,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,  		en_flags |= ENQUEUE_MIGRATED;  #endif -	ttwu_activate(rq, p, en_flags); +	activate_task(rq, p, en_flags);  	ttwu_do_wakeup(rq, p, wake_flags, rf);  } @@ -2107,56 +2098,6 @@ out:  }  /** - * try_to_wake_up_local - try to wake up a local task with rq lock held - * @p: the thread to be awakened - * @rf: request-queue flags for pinning - * - * Put @p on the run-queue if it's not already there. The caller must - * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. 
- */ -static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) -{ -	struct rq *rq = task_rq(p); - -	if (WARN_ON_ONCE(rq != this_rq()) || -	    WARN_ON_ONCE(p == current)) -		return; - -	lockdep_assert_held(&rq->lock); - -	if (!raw_spin_trylock(&p->pi_lock)) { -		/* -		 * This is OK, because current is on_cpu, which avoids it being -		 * picked for load-balance and preemption/IRQs are still -		 * disabled avoiding further scheduler activity on it and we've -		 * not yet picked a replacement task. -		 */ -		rq_unlock(rq, rf); -		raw_spin_lock(&p->pi_lock); -		rq_relock(rq, rf); -	} - -	if (!(p->state & TASK_NORMAL)) -		goto out; - -	trace_sched_waking(p); - -	if (!task_on_rq_queued(p)) { -		if (p->in_iowait) { -			delayacct_blkio_end(p); -			atomic_dec(&rq->nr_iowait); -		} -		ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); -	} - -	ttwu_do_wakeup(rq, p, 0, rf); -	ttwu_stat(p, smp_processor_id(), 0); -out: -	raw_spin_unlock(&p->pi_lock); -} - -/**   * wake_up_process - Wake up a specific process   * @p: The process to be woken up.   * @@ -2467,7 +2408,6 @@ void wake_up_new_task(struct task_struct *p)  	post_init_entity_util_avg(p);  	activate_task(rq, p, ENQUEUE_NOCLOCK); -	p->on_rq = TASK_ON_RQ_QUEUED;  	trace_sched_wakeup_new(p);  	check_preempt_curr(rq, p, WF_FORK);  #ifdef CONFIG_SMP @@ -3466,25 +3406,11 @@ static void __sched notrace __schedule(bool preempt)  			prev->state = TASK_RUNNING;  		} else {  			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); -			prev->on_rq = 0;  			if (prev->in_iowait) {  				atomic_inc(&rq->nr_iowait);  				delayacct_blkio_start();  			} - -			/* -			 * If a worker went to sleep, notify and ask workqueue -			 * whether it wants to wake up a task to maintain -			 * concurrency. -			 */ -			if (prev->flags & PF_WQ_WORKER) { -				struct task_struct *to_wakeup; - -				to_wakeup = wq_worker_sleeping(prev); -				if (to_wakeup) -					try_to_wake_up_local(to_wakeup, &rf); -			}  		}  		switch_count = &prev->nvcsw;  	} @@ -3544,6 +3470,20 @@ static inline void sched_submit_work(struct task_struct *tsk)  {  	if (!tsk->state || tsk_is_pi_blocked(tsk))  		return; + +	/* +	 * If a worker went to sleep, notify and ask workqueue whether +	 * it wants to wake up a task to maintain concurrency. +	 * As this function is called inside the schedule() context, +	 * we disable preemption to avoid it calling schedule() again +	 * in the possible wakeup of a kworker. +	 */ +	if (tsk->flags & PF_WQ_WORKER) { +		preempt_disable(); +		wq_worker_sleeping(tsk); +		preempt_enable_no_resched(); +	} +  	/*  	 * If we are going to sleep and we have plugged IO queued,  	 * make sure to submit it to avoid deadlocks. 
@@ -3552,6 +3492,12 @@ static inline void sched_submit_work(struct task_struct *tsk)  		blk_schedule_flush_plug(tsk);  } +static void sched_update_worker(struct task_struct *tsk) +{ +	if (tsk->flags & PF_WQ_WORKER) +		wq_worker_running(tsk); +} +  asmlinkage __visible void __sched schedule(void)  {  	struct task_struct *tsk = current; @@ -3562,6 +3508,7 @@ asmlinkage __visible void __sched schedule(void)  		__schedule(false);  		sched_preempt_enable_no_resched();  	} while (need_resched()); +	sched_update_worker(tsk);  }  EXPORT_SYMBOL(schedule); @@ -5918,7 +5865,7 @@ void __init sched_init_smp(void)  static int __init migration_init(void)  { -	sched_rq_cpu_starting(smp_processor_id()); +	sched_cpu_starting(smp_processor_id());  	return 0;  }  early_initcall(migration_init); @@ -6559,6 +6506,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,  				struct cftype *cftype, u64 shareval)  { +	if (shareval > scale_load_down(ULONG_MAX)) +		shareval = MAX_SHARES;  	return sched_group_set_shares(css_tg(css), scale_load(shareval));  } @@ -6574,7 +6523,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,  static DEFINE_MUTEX(cfs_constraints_mutex);  const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ -const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ +static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */  static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); @@ -6654,20 +6603,22 @@ out_unlock:  	return ret;  } -int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)  {  	u64 quota, period;  	period = ktime_to_ns(tg->cfs_bandwidth.period);  	if (cfs_quota_us < 0)  		quota = RUNTIME_INF; -	else +	else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)  		quota = (u64)cfs_quota_us * NSEC_PER_USEC; +	else +		return -EINVAL;  	return tg_set_cfs_bandwidth(tg, period, quota);  } -long tg_get_cfs_quota(struct task_group *tg) +static long tg_get_cfs_quota(struct task_group *tg)  {  	u64 quota_us; @@ -6680,17 +6631,20 @@ long tg_get_cfs_quota(struct task_group *tg)  	return quota_us;  } -int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)  {  	u64 quota, period; +	if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) +		return -EINVAL; +  	period = (u64)cfs_period_us * NSEC_PER_USEC;  	quota = tg->cfs_bandwidth.quota;  	return tg_set_cfs_bandwidth(tg, period, quota);  } -long tg_get_cfs_period(struct task_group *tg) +static long tg_get_cfs_period(struct task_group *tg)  {  	u64 cfs_period_us; diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 835671f0f917..b5dcd1d83c7f 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -7,7 +7,7 @@   */  #include "sched.h" -DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); +DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);  /**   * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer. 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3638d2377e3c..5403479073b0 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -13,6 +13,8 @@  #include <linux/sched/cpufreq.h>  #include <trace/events/power.h> +#define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8) +  struct sugov_tunables {  	struct gov_attr_set	attr_set;  	unsigned int		rate_limit_us; @@ -51,7 +53,6 @@ struct sugov_cpu {  	u64			last_update;  	unsigned long		bw_dl; -	unsigned long		min;  	unsigned long		max;  	/* The field below is for single-CPU policies only: */ @@ -291,8 +292,8 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)   *   * The IO wait boost of a task is disabled after a tick since the last update   * of a CPU. If a new IO wait boost is requested after more then a tick, then - * we enable the boost starting from the minimum frequency, which improves - * energy efficiency by ignoring sporadic wakeups from IO. + * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy + * efficiency by ignoring sporadic wakeups from IO.   */  static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,  			       bool set_iowait_boost) @@ -303,7 +304,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,  	if (delta_ns <= TICK_NSEC)  		return false; -	sg_cpu->iowait_boost = set_iowait_boost ? sg_cpu->min : 0; +	sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;  	sg_cpu->iowait_boost_pending = set_iowait_boost;  	return true; @@ -317,8 +318,9 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,   *   * Each time a task wakes up after an IO operation, the CPU utilization can be   * boosted to a certain utilization which doubles at each "frequent and - * successive" wakeup from IO, ranging from the utilization of the minimum - * OPP to the utilization of the maximum OPP. + * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization + * of the maximum OPP. + *   * To keep doubling, an IO boost has to be requested at least once per tick,   * otherwise we restart from the utilization of the minimum OPP.   */ @@ -349,7 +351,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,  	}  	/* First wakeup after IO: start with minimum boost */ -	sg_cpu->iowait_boost = sg_cpu->min; +	sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;  }  /** @@ -389,7 +391,7 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,  		 * No boost pending; reduce the boost value.  		 
*/  		sg_cpu->iowait_boost >>= 1; -		if (sg_cpu->iowait_boost < sg_cpu->min) { +		if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {  			sg_cpu->iowait_boost = 0;  			return util;  		} @@ -827,9 +829,6 @@ static int sugov_start(struct cpufreq_policy *policy)  		memset(sg_cpu, 0, sizeof(*sg_cpu));  		sg_cpu->cpu			= cpu;  		sg_cpu->sg_policy		= sg_policy; -		sg_cpu->min			= -			(SCHED_CAPACITY_SCALE * policy->cpuinfo.min_freq) / -			policy->cpuinfo.max_freq;  	}  	for_each_cpu(cpu, policy->cpus) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8039d62ae36e..678bfb9bd87f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -702,7 +702,7 @@ do {									\  static const char *sched_tunable_scaling_names[] = {  	"none", -	"logaritmic", +	"logarithmic",  	"linear"  }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 35f3ea375084..f35930f5e528 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2597,7 +2597,7 @@ out:  /*   * Drive the periodic memory faults..   */ -void task_tick_numa(struct rq *rq, struct task_struct *curr) +static void task_tick_numa(struct rq *rq, struct task_struct *curr)  {  	struct callback_head *work = &curr->numa_work;  	u64 period, now; @@ -3571,7 +3571,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)   * Synchronize entity load avg of dequeued entity without locking   * the previous rq.   */ -void sync_entity_load_avg(struct sched_entity *se) +static void sync_entity_load_avg(struct sched_entity *se)  {  	struct cfs_rq *cfs_rq = cfs_rq_of(se);  	u64 last_update_time; @@ -3584,7 +3584,7 @@ void sync_entity_load_avg(struct sched_entity *se)   * Task first catches up with cfs_rq, and then subtract   * itself from the cfs_rq (task must be off the queue now).   */ -void remove_entity_load_avg(struct sched_entity *se) +static void remove_entity_load_avg(struct sched_entity *se)  {  	struct cfs_rq *cfs_rq = cfs_rq_of(se);  	unsigned long flags; @@ -5145,7 +5145,6 @@ static inline void hrtick_update(struct rq *rq)  #ifdef CONFIG_SMP  static inline unsigned long cpu_util(int cpu); -static unsigned long capacity_of(int cpu);  static inline bool cpu_overutilized(int cpu)  { @@ -7521,7 +7520,6 @@ static void detach_task(struct task_struct *p, struct lb_env *env)  {  	lockdep_assert_held(&env->src_rq->lock); -	p->on_rq = TASK_ON_RQ_MIGRATING;  	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);  	set_task_cpu(p, env->dst_cpu);  } @@ -7657,7 +7655,6 @@ static void attach_task(struct rq *rq, struct task_struct *p)  	BUG_ON(task_rq(p) != rq);  	activate_task(rq, p, ENQUEUE_NOCLOCK); -	p->on_rq = TASK_ON_RQ_QUEUED;  	check_preempt_curr(rq, p, 0);  } @@ -9551,22 +9548,26 @@ static inline int on_null_domain(struct rq *rq)   * - When one of the busy CPUs notice that there may be an idle rebalancing   *   needed, they will kick the idle load balancer, which then does idle   *   load balancing for all the idle CPUs. + * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set + *   anywhere yet.   */  static inline int find_new_ilb(void)  { -	int ilb = cpumask_first(nohz.idle_cpus_mask); +	int ilb; -	if (ilb < nr_cpu_ids && idle_cpu(ilb)) -		return ilb; +	for_each_cpu_and(ilb, nohz.idle_cpus_mask, +			      housekeeping_cpumask(HK_FLAG_MISC)) { +		if (idle_cpu(ilb)) +			return ilb; +	}  	return nr_cpu_ids;  }  /* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick the - * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle - * CPU (if there is one). 
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick any + * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).   */  static void kick_ilb(unsigned int flags)  { diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b02d148e7672..687302051a27 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -65,6 +65,7 @@ void __init housekeeping_init(void)  static int __init housekeeping_setup(char *str, enum hk_flags flags)  {  	cpumask_var_t non_housekeeping_mask; +	cpumask_var_t tmp;  	int err;  	alloc_bootmem_cpumask_var(&non_housekeeping_mask); @@ -75,16 +76,23 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)  		return 0;  	} +	alloc_bootmem_cpumask_var(&tmp);  	if (!housekeeping_flags) {  		alloc_bootmem_cpumask_var(&housekeeping_mask);  		cpumask_andnot(housekeeping_mask,  			       cpu_possible_mask, non_housekeeping_mask); -		if (cpumask_empty(housekeeping_mask)) + +		cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); +		if (cpumask_empty(tmp)) { +			pr_warn("Housekeeping: must include one present CPU, " +				"using boot CPU:%d\n", smp_processor_id());  			__cpumask_set_cpu(smp_processor_id(), housekeeping_mask); +			__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask); +		}  	} else { -		cpumask_var_t tmp; - -		alloc_bootmem_cpumask_var(&tmp); +		cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask); +		if (cpumask_empty(tmp)) +			__cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);  		cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);  		if (!cpumask_equal(tmp, housekeeping_mask)) {  			pr_warn("Housekeeping: nohz_full= must match isolcpus=\n"); @@ -92,8 +100,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)  			free_bootmem_cpumask_var(non_housekeeping_mask);  			return 0;  		} -		free_bootmem_cpumask_var(tmp);  	} +	free_bootmem_cpumask_var(tmp);  	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {  		if (IS_ENABLED(CONFIG_NO_HZ_FULL)) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 90fa23d36565..1e6b909dca36 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2555,6 +2555,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)  	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;  	if (rt_runtime_us < 0)  		rt_runtime = RUNTIME_INF; +	else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC) +		return -EINVAL;  	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);  } @@ -2575,6 +2577,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)  {  	u64 rt_runtime, rt_period; +	if (rt_period_us > U64_MAX / NSEC_PER_USEC) +		return -EINVAL; +  	rt_period = rt_period_us * NSEC_PER_USEC;  	rt_runtime = tg->rt_bandwidth.rt_runtime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index efa686eeff26..b52ed1ada0be 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -780,7 +780,7 @@ struct root_domain {  	 * NULL-terminated list of performance domains intersecting with the  	 * CPUs of the rd. Protected by RCU.  	 
*/ -	struct perf_domain	*pd; +	struct perf_domain __rcu *pd;  };  extern struct root_domain def_root_domain; @@ -869,8 +869,8 @@ struct rq {  	atomic_t		nr_iowait;  #ifdef CONFIG_SMP -	struct root_domain	*rd; -	struct sched_domain	*sd; +	struct root_domain		*rd; +	struct sched_domain __rcu	*sd;  	unsigned long		cpu_capacity;  	unsigned long		cpu_capacity_orig; @@ -1324,13 +1324,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)  	return sd;  } -DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);  DECLARE_PER_CPU(int, sd_llc_size);  DECLARE_PER_CPU(int, sd_llc_id); -DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); -DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); +DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);  extern struct static_key_false sched_asym_cpucapacity;  struct sched_group_capacity { @@ -2185,7 +2185,7 @@ static inline u64 irq_time_read(int cpu)  #endif /* CONFIG_IRQ_TIME_ACCOUNTING */  #ifdef CONFIG_CPU_FREQ -DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); +DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);  /**   * cpufreq_update_util - Take a note about CPU utilization changes. diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ab7f371a3a17..f53f89df837d 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -615,13 +615,13 @@ static void destroy_sched_domains(struct sched_domain *sd)   * the cpumask of the domain), this allows us to quickly tell if   * two CPUs are in the same cache domain, see cpus_share_cache().   
*/ -DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);  DEFINE_PER_CPU(int, sd_llc_size);  DEFINE_PER_CPU(int, sd_llc_id); -DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing); -DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);  DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);  static void update_top_cache_domain(int cpu) @@ -1059,6 +1059,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)  	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);  	struct sched_domain *child = sd->child;  	struct sched_group *sg; +	bool already_visited;  	if (child)  		cpu = cpumask_first(sched_domain_span(child)); @@ -1066,9 +1067,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)  	sg = *per_cpu_ptr(sdd->sg, cpu);  	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); -	/* For claim_allocations: */ -	atomic_inc(&sg->ref); -	atomic_inc(&sg->sgc->ref); +	/* Increase refcounts for claim_allocations: */ +	already_visited = atomic_inc_return(&sg->ref) > 1; +	/* sgc visits should follow a similar trend as sg */ +	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1)); + +	/* If we have already visited that group, it's already initialized. */ +	if (already_visited) +		return sg;  	if (child) {  		cpumask_copy(sched_group_span(sg), sched_domain_span(child)); @@ -1087,8 +1093,8 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)  /*   * build_sched_groups will build a circular linked list of the groups - * covered by the given span, and will set each group's ->cpumask correctly, - * and ->cpu_capacity to 0. + * covered by the given span, will set each group's ->cpumask correctly, + * and will initialize their ->sgc.   *   * Assumes the sched_domain tree is fully constructed   */ @@ -2075,9 +2081,8 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)  }  /* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. - * For now this just excludes isolated CPUs, but could be used to - * exclude other special cases in the future. + * Set up scheduler domains and groups.  For now this just excludes isolated + * CPUs, but could be used to exclude other special cases in the future.   */  int sched_init_domains(const struct cpumask *cpu_map)  { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 3582eeb59893..a635ecba6fe2 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -331,7 +331,7 @@ static int is_ancestor(struct seccomp_filter *parent,   * Expects sighand and cred_guard_mutex locks to be held.   *   * Returns 0 on success, -ve on error, or the pid of a thread which was - * either not in the correct seccomp mode or it did not have an ancestral + * either not in the correct seccomp mode or did not have an ancestral   * seccomp filter.   
*/  static inline pid_t seccomp_can_sync_threads(void) diff --git a/kernel/softirq.c b/kernel/softirq.c index 10277429ed84..2c3382378d94 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -573,57 +573,6 @@ void tasklet_kill(struct tasklet_struct *t)  }  EXPORT_SYMBOL(tasklet_kill); -/* - * tasklet_hrtimer - */ - -/* - * The trampoline is called when the hrtimer expires. It schedules a tasklet - * to run __tasklet_hrtimer_trampoline() which in turn will call the intended - * hrtimer callback, but from softirq context. - */ -static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) -{ -	struct tasklet_hrtimer *ttimer = -		container_of(timer, struct tasklet_hrtimer, timer); - -	tasklet_hi_schedule(&ttimer->tasklet); -	return HRTIMER_NORESTART; -} - -/* - * Helper function which calls the hrtimer callback from - * tasklet/softirq context - */ -static void __tasklet_hrtimer_trampoline(unsigned long data) -{ -	struct tasklet_hrtimer *ttimer = (void *)data; -	enum hrtimer_restart restart; - -	restart = ttimer->function(&ttimer->timer); -	if (restart != HRTIMER_NORESTART) -		hrtimer_restart(&ttimer->timer); -} - -/** - * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks - * @ttimer:	 tasklet_hrtimer which is initialized - * @function:	 hrtimer callback function which gets called from softirq context - * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) - * @mode:	 hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) - */ -void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, -			  enum hrtimer_restart (*function)(struct hrtimer *), -			  clockid_t which_clock, enum hrtimer_mode mode) -{ -	hrtimer_init(&ttimer->timer, which_clock, mode); -	ttimer->timer.function = __hrtimer_tasklet_trampoline; -	tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, -		     (unsigned long)ttimer); -	ttimer->function = function; -} -EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); -  void __init softirq_init(void)  {  	int cpu; diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index f8edee9c792d..27bafc1e271e 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -5,41 +5,56 @@   *   *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>   */ +#include <linux/sched/task_stack.h> +#include <linux/sched/debug.h>  #include <linux/sched.h>  #include <linux/kernel.h>  #include <linux/export.h>  #include <linux/kallsyms.h>  #include <linux/stacktrace.h> -void print_stack_trace(struct stack_trace *trace, int spaces) +/** + * stack_trace_print - Print the entries in the stack trace + * @entries:	Pointer to storage array + * @nr_entries:	Number of entries in the storage array + * @spaces:	Number of leading spaces to print + */ +void stack_trace_print(unsigned long *entries, unsigned int nr_entries, +		       int spaces)  { -	int i; +	unsigned int i; -	if (WARN_ON(!trace->entries)) +	if (WARN_ON(!entries))  		return; -	for (i = 0; i < trace->nr_entries; i++) -		printk("%*c%pS\n", 1 + spaces, ' ', (void *)trace->entries[i]); +	for (i = 0; i < nr_entries; i++) +		printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]);  } -EXPORT_SYMBOL_GPL(print_stack_trace); +EXPORT_SYMBOL_GPL(stack_trace_print); -int snprint_stack_trace(char *buf, size_t size, -			struct stack_trace *trace, int spaces) +/** + * stack_trace_snprint - Print the entries in the stack trace into a buffer + * @buf:	Pointer to the print buffer + * @size:	Size of the print buffer + * @entries:	Pointer to storage array + * @nr_entries:	Number of entries in the storage array + * @spaces:	
Number of leading spaces to print + * + * Return: Number of bytes printed. + */ +int stack_trace_snprint(char *buf, size_t size, unsigned long *entries, +			unsigned int nr_entries, int spaces)  { -	int i; -	int generated; -	int total = 0; +	unsigned int generated, i, total = 0; -	if (WARN_ON(!trace->entries)) +	if (WARN_ON(!entries))  		return 0; -	for (i = 0; i < trace->nr_entries; i++) { +	for (i = 0; i < nr_entries && size; i++) {  		generated = snprintf(buf, size, "%*c%pS\n", 1 + spaces, ' ', -				     (void *)trace->entries[i]); +				     (void *)entries[i]);  		total += generated; - -		/* Assume that generated isn't a negative number */  		if (generated >= size) {  			buf += size;  			size = 0; @@ -51,7 +66,176 @@ int snprint_stack_trace(char *buf, size_t size,  	return total;  } -EXPORT_SYMBOL_GPL(snprint_stack_trace); +EXPORT_SYMBOL_GPL(stack_trace_snprint); + +#ifdef CONFIG_ARCH_STACKWALK + +struct stacktrace_cookie { +	unsigned long	*store; +	unsigned int	size; +	unsigned int	skip; +	unsigned int	len; +}; + +static bool stack_trace_consume_entry(void *cookie, unsigned long addr, +				      bool reliable) +{ +	struct stacktrace_cookie *c = cookie; + +	if (c->len >= c->size) +		return false; + +	if (c->skip > 0) { +		c->skip--; +		return true; +	} +	c->store[c->len++] = addr; +	return c->len < c->size; +} + +static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr, +					      bool reliable) +{ +	if (in_sched_functions(addr)) +		return true; +	return stack_trace_consume_entry(cookie, addr, reliable); +} + +/** + * stack_trace_save - Save a stack trace into a storage array + * @store:	Pointer to storage array + * @size:	Size of the storage array + * @skipnr:	Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. + */ +unsigned int stack_trace_save(unsigned long *store, unsigned int size, +			      unsigned int skipnr) +{ +	stack_trace_consume_fn consume_entry = stack_trace_consume_entry; +	struct stacktrace_cookie c = { +		.store	= store, +		.size	= size, +		.skip	= skipnr + 1, +	}; + +	arch_stack_walk(consume_entry, &c, current, NULL); +	return c.len; +} +EXPORT_SYMBOL_GPL(stack_trace_save); + +/** + * stack_trace_save_tsk - Save a task stack trace into a storage array + * @task:	The task to examine + * @store:	Pointer to storage array + * @size:	Size of the storage array + * @skipnr:	Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. + */ +unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, +				  unsigned int size, unsigned int skipnr) +{ +	stack_trace_consume_fn consume_entry = stack_trace_consume_entry_nosched; +	struct stacktrace_cookie c = { +		.store	= store, +		.size	= size, +		.skip	= skipnr + 1, +	}; + +	if (!try_get_task_stack(tsk)) +		return 0; + +	arch_stack_walk(consume_entry, &c, tsk, NULL); +	put_task_stack(tsk); +	return c.len; +} + +/** + * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array + * @regs:	Pointer to pt_regs to examine + * @store:	Pointer to storage array + * @size:	Size of the storage array + * @skipnr:	Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored. 
+ */ +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, +				   unsigned int size, unsigned int skipnr) +{ +	stack_trace_consume_fn consume_entry = stack_trace_consume_entry; +	struct stacktrace_cookie c = { +		.store	= store, +		.size	= size, +		.skip	= skipnr, +	}; + +	arch_stack_walk(consume_entry, &c, current, regs); +	return c.len; +} + +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE +/** + * stack_trace_save_tsk_reliable - Save task stack with verification + * @tsk:	Pointer to the task to examine + * @store:	Pointer to storage array + * @size:	Size of the storage array + * + * Return:	An error if it detects any unreliable features of the + *		stack. Otherwise it guarantees that the stack trace is + *		reliable and returns the number of entries stored. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, +				  unsigned int size) +{ +	stack_trace_consume_fn consume_entry = stack_trace_consume_entry; +	struct stacktrace_cookie c = { +		.store	= store, +		.size	= size, +	}; +	int ret; + +	/* +	 * If the task doesn't have a stack (e.g., a zombie), the stack is +	 * "reliably" empty. +	 */ +	if (!try_get_task_stack(tsk)) +		return 0; + +	ret = arch_stack_walk_reliable(consume_entry, &c, tsk); +	put_task_stack(tsk); +	return ret; +} +#endif + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +/** + * stack_trace_save_user - Save a user space stack trace into a storage array + * @store:	Pointer to storage array + * @size:	Size of the storage array + * + * Return: Number of trace entries stored. + */ +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) +{ +	stack_trace_consume_fn consume_entry = stack_trace_consume_entry; +	struct stacktrace_cookie c = { +		.store	= store, +		.size	= size, +	}; + +	/* Trace user stack if not a kernel thread */ +	if (!current->mm) +		return 0; + +	arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); +	return c.len; +} +#endif + +#else /* CONFIG_ARCH_STACKWALK */  /*   * Architectures that do not implement save_stack_trace_*() @@ -77,3 +261,118 @@ save_stack_trace_tsk_reliable(struct task_struct *tsk,  	WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n");  	return -ENOSYS;  } + +/** + * stack_trace_save - Save a stack trace into a storage array + * @store:	Pointer to storage array + * @size:	Size of the storage array + * @skipnr:	Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save(unsigned long *store, unsigned int size, +			      unsigned int skipnr) +{ +	struct stack_trace trace = { +		.entries	= store, +		.max_entries	= size, +		.skip		= skipnr + 1, +	}; + +	save_stack_trace(&trace); +	return trace.nr_entries; +} +EXPORT_SYMBOL_GPL(stack_trace_save); + +/** + * stack_trace_save_tsk - Save a task stack trace into a storage array + * @task:	The task to examine + * @store:	Pointer to storage array + * @size:	Size of the storage array + * @skipnr:	Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_tsk(struct task_struct *task, +				  unsigned long *store, unsigned int size, +				  unsigned int skipnr) +{ +	struct stack_trace trace = { +		.entries	= store, +		.max_entries	= size, +		.skip		= skipnr + 1, +	}; + +	save_stack_trace_tsk(task, &trace); +	return trace.nr_entries; +} + +/** + * 
stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array + * @regs:	Pointer to pt_regs to examine + * @store:	Pointer to storage array + * @size:	Size of the storage array + * @skipnr:	Number of entries to skip at the start of the stack trace + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store, +				   unsigned int size, unsigned int skipnr) +{ +	struct stack_trace trace = { +		.entries	= store, +		.max_entries	= size, +		.skip		= skipnr, +	}; + +	save_stack_trace_regs(regs, &trace); +	return trace.nr_entries; +} + +#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE +/** + * stack_trace_save_tsk_reliable - Save task stack with verification + * @tsk:	Pointer to the task to examine + * @store:	Pointer to storage array + * @size:	Size of the storage array + * + * Return:	An error if it detects any unreliable features of the + *		stack. Otherwise it guarantees that the stack trace is + *		reliable and returns the number of entries stored. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. + */ +int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, +				  unsigned int size) +{ +	struct stack_trace trace = { +		.entries	= store, +		.max_entries	= size, +	}; +	int ret = save_stack_trace_tsk_reliable(tsk, &trace); + +	return ret ? ret : trace.nr_entries; +} +#endif + +#ifdef CONFIG_USER_STACKTRACE_SUPPORT +/** + * stack_trace_save_user - Save a user space stack trace into a storage array + * @store:	Pointer to storage array + * @size:	Size of the storage array + * + * Return: Number of trace entries stored + */ +unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) +{ +	struct stack_trace trace = { +		.entries	= store, +		.max_entries	= size, +	}; + +	save_stack_trace_user(&trace); +	return trace.nr_entries; +} +#endif /* CONFIG_USER_STACKTRACE_SUPPORT */ + +#endif /* !CONFIG_ARCH_STACKWALK */ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 067cb83f37ea..7231fb5953fc 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -513,7 +513,7 @@ repeat:  		}  		preempt_count_dec();  		WARN_ONCE(preempt_count(), -			  "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); +			  "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);  		goto repeat;  	}  } diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5e77662dd2d9..f5490222e134 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -611,6 +611,22 @@ void clockevents_resume(void)  }  #ifdef CONFIG_HOTPLUG_CPU + +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +/** + * tick_offline_cpu - Take CPU out of the broadcast mechanism + * @cpu:	The outgoing CPU + * + * Called on the outgoing CPU after it took itself offline. 
+ */ +void tick_offline_cpu(unsigned int cpu) +{ +	raw_spin_lock(&clockevents_lock); +	tick_broadcast_offline(cpu); +	raw_spin_unlock(&clockevents_lock); +} +# endif +  /**   * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu   */ @@ -621,8 +637,6 @@ void tick_cleanup_dead_cpu(int cpu)  	raw_spin_lock_irqsave(&clockevents_lock, flags); -	tick_shutdown_broadcast_oneshot(cpu); -	tick_shutdown_broadcast(cpu);  	tick_shutdown(cpu);  	/*  	 * Unregister the clock event devices which were diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index ac9c03dd6c7d..d23b434c2ca7 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -63,7 +63,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock);  #if (BITS_PER_LONG < 64)  u64 get_jiffies_64(void)  { -	unsigned long seq; +	unsigned int seq;  	u64 ret;  	do { diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 930113b9799a..142b07619918 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -94,7 +94,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)  unsigned long long notrace sched_clock(void)  {  	u64 cyc, res; -	unsigned long seq; +	unsigned int seq;  	struct clock_read_data *rd;  	do { @@ -231,7 +231,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)  	if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))  		enable_sched_clock_irqtime(); -	pr_debug("Registered %pF as sched_clock source\n", read); +	pr_debug("Registered %pS as sched_clock source\n", read);  }  void __init generic_sched_clock_init(void) @@ -267,7 +267,7 @@ void __init generic_sched_clock_init(void)   */  static u64 notrace suspended_sched_clock_read(void)  { -	unsigned long seq = raw_read_seqcount(&cd.seq); +	unsigned int seq = raw_read_seqcount(&cd.seq);  	return cd.read_data[seq & 1].epoch_cyc;  } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ee834d4fb814..e51778c312f1 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -36,10 +36,16 @@ static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(tick_broadcast_lock);  static void tick_broadcast_setup_oneshot(struct clock_event_device *bc);  static void tick_broadcast_clear_oneshot(int cpu);  static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); +# ifdef CONFIG_HOTPLUG_CPU +static void tick_broadcast_oneshot_offline(unsigned int cpu); +# endif  #else  static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); }  static inline void tick_broadcast_clear_oneshot(int cpu) { }  static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } +# ifdef CONFIG_HOTPLUG_CPU +static inline void tick_broadcast_oneshot_offline(unsigned int cpu) { } +# endif  #endif  /* @@ -433,27 +439,29 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)  }  #ifdef CONFIG_HOTPLUG_CPU -/* - * Remove a CPU from broadcasting - */ -void tick_shutdown_broadcast(unsigned int cpu) +static void tick_shutdown_broadcast(void)  { -	struct clock_event_device *bc; -	unsigned long flags; - -	raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - -	bc = tick_broadcast_device.evtdev; -	cpumask_clear_cpu(cpu, tick_broadcast_mask); -	cpumask_clear_cpu(cpu, tick_broadcast_on); +	struct clock_event_device *bc = tick_broadcast_device.evtdev;  	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {  		if (bc && cpumask_empty(tick_broadcast_mask))  			clockevents_shutdown(bc);  	} +} -	
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +/* + * Remove a CPU from broadcasting + */ +void tick_broadcast_offline(unsigned int cpu) +{ +	raw_spin_lock(&tick_broadcast_lock); +	cpumask_clear_cpu(cpu, tick_broadcast_mask); +	cpumask_clear_cpu(cpu, tick_broadcast_on); +	tick_broadcast_oneshot_offline(cpu); +	tick_shutdown_broadcast(); +	raw_spin_unlock(&tick_broadcast_lock);  } +  #endif  void tick_suspend_broadcast(void) @@ -801,13 +809,13 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state)  			 * either the CPU handling the broadcast  			 * interrupt or we got woken by something else.  			 * -			 * We are not longer in the broadcast mask, so +			 * We are no longer in the broadcast mask, so  			 * if the cpu local expiry time is already  			 * reached, we would reprogram the cpu local  			 * timer with an already expired event.  			 *  			 * This can lead to a ping-pong when we return -			 * to idle and therefor rearm the broadcast +			 * to idle and therefore rearm the broadcast  			 * timer before the cpu local timer was able  			 * to fire. This happens because the forced  			 * reprogramming makes sure that the event @@ -950,14 +958,10 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)  }  /* - * Remove a dead CPU from broadcasting + * Remove a dying CPU from broadcasting   */ -void tick_shutdown_broadcast_oneshot(unsigned int cpu) +static void tick_broadcast_oneshot_offline(unsigned int cpu)  { -	unsigned long flags; - -	raw_spin_lock_irqsave(&tick_broadcast_lock, flags); -  	/*  	 * Clear the broadcast masks for the dead cpu, but do not stop  	 * the broadcast device! @@ -965,8 +969,6 @@ void tick_shutdown_broadcast_oneshot(unsigned int cpu)  	cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);  	cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);  	cpumask_clear_cpu(cpu, tick_broadcast_force_mask); - -	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);  }  #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index df401463a191..59225b484e4e 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -46,6 +46,14 @@ ktime_t tick_period;   *    procedure also covers cpu hotplug.   */  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; +#ifdef CONFIG_NO_HZ_FULL +/* + * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns + * tick_do_timer_cpu and it should be taken over by an eligible secondary + * when one comes online. 
+ */ +static int tick_do_timer_boot_cpu __read_mostly = -1; +#endif  /*   * Debugging: see timer_list.c @@ -149,7 +157,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)  	    !tick_broadcast_oneshot_active()) {  		clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);  	} else { -		unsigned long seq; +		unsigned int seq;  		ktime_t next;  		do { @@ -167,6 +175,26 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)  	}  } +#ifdef CONFIG_NO_HZ_FULL +static void giveup_do_timer(void *info) +{ +	int cpu = *(unsigned int *)info; + +	WARN_ON(tick_do_timer_cpu != smp_processor_id()); + +	tick_do_timer_cpu = cpu; +} + +static void tick_take_do_timer_from_boot(void) +{ +	int cpu = smp_processor_id(); +	int from = tick_do_timer_boot_cpu; + +	if (from >= 0 && from != cpu) +		smp_call_function_single(from, giveup_do_timer, &cpu, 1); +} +#endif +  /*   * Setup the tick device   */ @@ -186,12 +214,26 @@ static void tick_setup_device(struct tick_device *td,  		 * this cpu:  		 */  		if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { -			if (!tick_nohz_full_cpu(cpu)) -				tick_do_timer_cpu = cpu; -			else -				tick_do_timer_cpu = TICK_DO_TIMER_NONE; +			tick_do_timer_cpu = cpu; +  			tick_next_period = ktime_get();  			tick_period = NSEC_PER_SEC / HZ; +#ifdef CONFIG_NO_HZ_FULL +			/* +			 * The boot CPU may be nohz_full, in which case set +			 * tick_do_timer_boot_cpu so the first housekeeping +			 * secondary that comes up will take do_timer from +			 * us. +			 */ +			if (tick_nohz_full_cpu(cpu)) +				tick_do_timer_boot_cpu = cpu; + +		} else if (tick_do_timer_boot_cpu != -1 && +						!tick_nohz_full_cpu(cpu)) { +			tick_take_do_timer_from_boot(); +			tick_do_timer_boot_cpu = -1; +			WARN_ON(tick_do_timer_cpu != cpu); +#endif  		}  		/* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e277284c2831..7b2496136729 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -64,7 +64,6 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);  extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);  extern void tick_install_broadcast_device(struct clock_event_device *dev);  extern int tick_is_broadcast_device(struct clock_event_device *dev); -extern void tick_shutdown_broadcast(unsigned int cpu);  extern void tick_suspend_broadcast(void);  extern void tick_resume_broadcast(void);  extern bool tick_resume_check_broadcast(void); @@ -78,7 +77,6 @@ static inline void tick_install_broadcast_device(struct clock_event_device *dev)  static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; }  static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; }  static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } -static inline void tick_shutdown_broadcast(unsigned int cpu) { }  static inline void tick_suspend_broadcast(void) { }  static inline void tick_resume_broadcast(void) { }  static inline bool tick_resume_check_broadcast(void) { return false; } @@ -128,19 +126,23 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; }  /* Functions related to oneshot broadcasting */  #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT)  extern void tick_broadcast_switch_to_oneshot(void); -extern void tick_shutdown_broadcast_oneshot(unsigned int cpu);  extern int tick_broadcast_oneshot_active(void);  extern void tick_check_oneshot_broadcast_this_cpu(void);  
bool tick_broadcast_oneshot_available(void);  extern struct cpumask *tick_get_broadcast_oneshot_mask(void);  #else /* !(BROADCAST && ONESHOT): */  static inline void tick_broadcast_switch_to_oneshot(void) { } -static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { }  static inline int tick_broadcast_oneshot_active(void) { return 0; }  static inline void tick_check_oneshot_broadcast_this_cpu(void) { }  static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); }  #endif /* !(BROADCAST && ONESHOT) */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_HOTPLUG_CPU) +extern void tick_broadcast_offline(unsigned int cpu); +#else +static inline void tick_broadcast_offline(unsigned int cpu) { } +#endif +  /* NO_HZ_FULL internal */  #ifdef CONFIG_NO_HZ_FULL  extern void tick_nohz_init(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6fa52cd6df0b..f4ee1a3428ae 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -121,10 +121,16 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)  	 * into a long sleep. If two CPUs happen to assign themselves to  	 * this duty, then the jiffies update is still serialized by  	 * jiffies_lock. +	 * +	 * If nohz_full is enabled, this should not happen because the +	 * tick_do_timer_cpu never relinquishes.  	 */ -	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) -	    && !tick_nohz_full_cpu(cpu)) +	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) { +#ifdef CONFIG_NO_HZ_FULL +		WARN_ON(tick_nohz_full_running); +#endif  		tick_do_timer_cpu = cpu; +	}  #endif  	/* Check, if the jiffies need an update */ @@ -395,8 +401,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)  static int tick_nohz_cpu_down(unsigned int cpu)  {  	/* -	 * The boot CPU handles housekeeping duty (unbound timers, -	 * workqueues, timekeeping, ...) on behalf of full dynticks +	 * The tick_do_timer_cpu CPU handles housekeeping duty (unbound +	 * timers, workqueues, timekeeping, ...) on behalf of full dynticks  	 * CPUs. It must remain online when nohz full is enabled.  	 
*/  	if (tick_nohz_full_running && tick_do_timer_cpu == cpu) @@ -423,12 +429,15 @@ void __init tick_nohz_init(void)  		return;  	} -	cpu = smp_processor_id(); +	if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) && +			!IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) { +		cpu = smp_processor_id(); -	if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { -		pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", -			cpu); -		cpumask_clear_cpu(cpu, tick_nohz_full_mask); +		if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { +			pr_warn("NO_HZ: Clearing %d from nohz_full range " +				"for timekeeping\n", cpu); +			cpumask_clear_cpu(cpu, tick_nohz_full_mask); +		}  	}  	for_each_cpu(cpu, tick_nohz_full_mask) @@ -645,7 +654,8 @@ static inline bool local_timer_softirq_pending(void)  static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)  {  	u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; -	unsigned long seq, basejiff; +	unsigned long basejiff; +	unsigned int seq;  	/* Read jiffies and the time when jiffies were updated last */  	do { @@ -904,8 +914,13 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)  		/*  		 * Boot safety: make sure the timekeeping duty has been  		 * assigned before entering dyntick-idle mode, +		 * tick_do_timer_cpu is TICK_DO_TIMER_BOOT  		 */ -		if (tick_do_timer_cpu == TICK_DO_TIMER_NONE) +		if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT)) +			return false; + +		/* Should not happen for nohz-full */ +		if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))  			return false;  	} @@ -1023,6 +1038,18 @@ bool tick_nohz_idle_got_tick(void)  }  /** + * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer + * or the tick, whichever expires first. Note that, if the tick has been + * stopped, it returns the next hrtimer. + * + * Called from power state control code with interrupts disabled + */ +ktime_t tick_nohz_get_next_hrtimer(void) +{ +	return __this_cpu_read(tick_cpu_device.evtdev)->next_event; +} + +/**   * tick_nohz_get_sleep_length - return the expected length of the current sleep   * @delta_next: duration until the next event if the tick cannot be stopped   * diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 6de959a854b2..4fb06527cf64 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -24,12 +24,19 @@ enum tick_nohz_mode {   * struct tick_sched - sched tick emulation and no idle tick control/stats   * @sched_timer:	hrtimer to schedule the periodic tick in high   *			resolution mode + * @check_clocks:	Notification mechanism about clocksource changes + * @nohz_mode:		Mode - one state of tick_nohz_mode + * @inidle:		Indicator that the CPU is in the tick idle mode + * @tick_stopped:	Indicator that the idle tick has been stopped + * @idle_active:	Indicator that the CPU is actively in the tick idle mode; + *			it is reset during irq handling phases. + * @do_timer_lst:	CPU was the last one doing do_timer before going idle + * @got_idle_tick:	Tick timer function has run with @inidle set   * @last_tick:		Store the last tick expiry time when the tick   *			timer is modified for nohz sleeps. This is necessary   *			to resume the tick timer operation in the timeline   *			when the CPU returns from nohz sleep.   * @next_tick:		Next tick to be fired when in dynticks mode. 
- * @tick_stopped:	Indicator that the idle tick has been stopped   * @idle_jiffies:	jiffies at the entry to idle for idle time accounting   * @idle_calls:		Total number of idle calls   * @idle_sleeps:	Number of idle calls, where the sched tick was stopped @@ -40,8 +47,8 @@ enum tick_nohz_mode {   * @iowait_sleeptime:	Sum of the time slept in idle with sched tick stopped, with IO outstanding   * @timer_expires:	Anticipated timer expiration time (in case sched tick is stopped)   * @timer_expires_base:	Base time clock monotonic for @timer_expires - * @do_timer_lst:	CPU was the last one doing do_timer before going idle - * @got_idle_tick:	Tick timer function has run with @inidle set + * @next_timer:		Expiry time of next expiring timer for debugging purpose only + * @tick_dep_mask:	Tick dependency mask - is set, if someone needs the tick   */  struct tick_sched {  	struct hrtimer			sched_timer; diff --git a/kernel/time/time.c b/kernel/time/time.c index c3f756f8534b..86656bbac232 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -171,7 +171,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz  	static int firsttime = 1;  	int error = 0; -	if (tv && !timespec64_valid(tv)) +	if (tv && !timespec64_valid_settod(tv))  		return -EINVAL;  	error = security_settime64(tv, tz); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f986e1918d12..5716e28bfa3c 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -720,7 +720,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)  void ktime_get_real_ts64(struct timespec64 *ts)  {  	struct timekeeper *tk = &tk_core.timekeeper; -	unsigned long seq; +	unsigned int seq;  	u64 nsecs;  	WARN_ON(timekeeping_suspended); @@ -829,7 +829,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);  ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)  {  	ktime_t *offset = offsets[offs]; -	unsigned long seq; +	unsigned int seq;  	ktime_t tconv;  	do { @@ -960,7 +960,7 @@ time64_t __ktime_get_real_seconds(void)  void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)  {  	struct timekeeper *tk = &tk_core.timekeeper; -	unsigned long seq; +	unsigned int seq;  	ktime_t base_raw;  	ktime_t base_real;  	u64 nsec_raw; @@ -1122,7 +1122,7 @@ int get_device_system_crosststamp(int (*get_time_fn)  	ktime_t base_real, base_raw;  	u64 nsec_real, nsec_raw;  	u8 cs_was_changed_seq; -	unsigned long seq; +	unsigned int seq;  	bool do_interp;  	int ret; @@ -1221,7 +1221,7 @@ int do_settimeofday64(const struct timespec64 *ts)  	unsigned long flags;  	int ret = 0; -	if (!timespec64_valid_strict(ts)) +	if (!timespec64_valid_settod(ts))  		return -EINVAL;  	raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -1278,7 +1278,7 @@ static int timekeeping_inject_offset(const struct timespec64 *ts)  	/* Make sure the proposed value is valid */  	tmp = timespec64_add(tk_xtime(tk), *ts);  	if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 || -	    !timespec64_valid_strict(&tmp)) { +	    !timespec64_valid_settod(&tmp)) {  		ret = -EINVAL;  		goto error;  	} @@ -1409,7 +1409,7 @@ int timekeeping_notify(struct clocksource *clock)  void ktime_get_raw_ts64(struct timespec64 *ts)  {  	struct timekeeper *tk = &tk_core.timekeeper; -	unsigned long seq; +	unsigned int seq;  	u64 nsecs;  	do { @@ -1431,7 +1431,7 @@ EXPORT_SYMBOL(ktime_get_raw_ts64);  int timekeeping_valid_for_hres(void)  {  	struct timekeeper *tk = &tk_core.timekeeper; -	unsigned long seq; +	unsigned int seq;  	int ret;  	do { @@ 
-1450,7 +1450,7 @@ int timekeeping_valid_for_hres(void)  u64 timekeeping_max_deferment(void)  {  	struct timekeeper *tk = &tk_core.timekeeper; -	unsigned long seq; +	unsigned int seq;  	u64 ret;  	do { @@ -1527,7 +1527,7 @@ void __init timekeeping_init(void)  	unsigned long flags;  	read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); -	if (timespec64_valid_strict(&wall_time) && +	if (timespec64_valid_settod(&wall_time) &&  	    timespec64_to_ns(&wall_time) > 0) {  		persistent_clock_exists = true;  	} else if (timespec64_to_ns(&wall_time) != 0) { @@ -2150,7 +2150,7 @@ EXPORT_SYMBOL_GPL(getboottime64);  void ktime_get_coarse_real_ts64(struct timespec64 *ts)  {  	struct timekeeper *tk = &tk_core.timekeeper; -	unsigned long seq; +	unsigned int seq;  	do {  		seq = read_seqcount_begin(&tk_core.seq); @@ -2164,7 +2164,7 @@ void ktime_get_coarse_ts64(struct timespec64 *ts)  {  	struct timekeeper *tk = &tk_core.timekeeper;  	struct timespec64 now, mono; -	unsigned long seq; +	unsigned int seq;  	do {  		seq = read_seqcount_begin(&tk_core.seq); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2fce056f8a49..343c7ba33b1c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -536,6 +536,8 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,  	hlist_add_head(&timer->entry, base->vectors + idx);  	__set_bit(idx, base->pending_map);  	timer_set_idx(timer, idx); + +	trace_timer_start(timer, timer->expires, timer->flags);  }  static void @@ -757,13 +759,6 @@ static inline void debug_init(struct timer_list *timer)  	trace_timer_init(timer);  } -static inline void -debug_activate(struct timer_list *timer, unsigned long expires) -{ -	debug_timer_activate(timer); -	trace_timer_start(timer, expires, timer->flags); -} -  static inline void debug_deactivate(struct timer_list *timer)  {  	debug_timer_deactivate(timer); @@ -1037,7 +1032,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option  		}  	} -	debug_activate(timer, expires); +	debug_timer_activate(timer);  	timer->expires = expires;  	/* @@ -1171,7 +1166,7 @@ void add_timer_on(struct timer_list *timer, int cpu)  	}  	forward_timer_base(base); -	debug_activate(timer, timer->expires); +	debug_timer_activate(timer);  	internal_add_timer(base, timer);  	raw_spin_unlock_irqrestore(&base->lock, flags);  } @@ -1298,7 +1293,9 @@ int del_timer_sync(struct timer_list *timer)  EXPORT_SYMBOL(del_timer_sync);  #endif -static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *)) +static void call_timer_fn(struct timer_list *timer, +			  void (*fn)(struct timer_list *), +			  unsigned long baseclk)  {  	int count = preempt_count(); @@ -1321,14 +1318,14 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list  	 */  	lock_map_acquire(&lockdep_map); -	trace_timer_expire_entry(timer); +	trace_timer_expire_entry(timer, baseclk);  	fn(timer);  	trace_timer_expire_exit(timer);  	lock_map_release(&lockdep_map);  	if (count != preempt_count()) { -		WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", +		WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",  			  fn, count, preempt_count());  		/*  		 * Restore the preempt count. That gives us a decent @@ -1342,6 +1339,13 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list  static void expire_timers(struct timer_base *base, struct hlist_head *head)  { +	/* +	 * This value is required only for tracing. 
base->clk was +	 * incremented directly before expire_timers was called. But expiry +	 * is related to the old base->clk value. +	 */ +	unsigned long baseclk = base->clk - 1; +  	while (!hlist_empty(head)) {  		struct timer_list *timer;  		void (*fn)(struct timer_list *); @@ -1355,11 +1359,11 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)  		if (timer->flags & TIMER_IRQSAFE) {  			raw_spin_unlock(&base->lock); -			call_timer_fn(timer, fn); +			call_timer_fn(timer, fn, baseclk);  			raw_spin_lock(&base->lock);  		} else {  			raw_spin_unlock_irq(&base->lock); -			call_timer_fn(timer, fn); +			call_timer_fn(timer, fn, baseclk);  			raw_spin_lock_irq(&base->lock);  		}  	} diff --git a/kernel/torture.c b/kernel/torture.c index 8faa1a9aaeb9..17b2be9bde12 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -88,6 +88,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,  	if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu))  		return false; +	if (num_online_cpus() <= 1) +		return false;  /* Can't offline the last CPU. */  	if (verbose > 1)  		pr_alert("%s" TORTURE_FLAG diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d64c00afceb5..94b0e37d90ef 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,6 +14,8 @@  #include <linux/syscalls.h>  #include <linux/error-injection.h> +#include <asm/tlb.h> +  #include "trace_probe.h"  #include "trace.h" @@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,  	 * access_ok() should prevent writing to non-user memory, but in  	 * some situations (nommu, temporary switch, etc) access_ok() does  	 * not provide enough validation, hence the check on KERNEL_DS. +	 * +	 * nmi_uaccess_okay() ensures the probe is not run in an interim +	 * state, when the task or mm are switched. This is specifically +	 * required to prevent the use of temporary mm.  	 
*/  	if (unlikely(in_interrupt() || @@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,  		return -EPERM;  	if (unlikely(uaccess_kernel()))  		return -EPERM; +	if (unlikely(!nmi_uaccess_okay())) +		return -EPERM;  	if (!access_ok(unsafe_ptr, size))  		return -EPERM; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ca1ee656d6d8..ec439999f387 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -159,6 +159,8 @@ static union trace_eval_map_item *trace_eval_maps;  #endif /* CONFIG_TRACE_EVAL_MAP_FILE */  static int tracing_set_tracer(struct trace_array *tr, const char *buf); +static void ftrace_trace_userstack(struct ring_buffer *buffer, +				   unsigned long flags, int pc);  #define MAX_TRACER_SIZE		100  static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; @@ -2752,12 +2754,21 @@ trace_function(struct trace_array *tr,  #ifdef CONFIG_STACKTRACE -#define FTRACE_STACK_MAX_ENTRIES (PAGE_SIZE / sizeof(unsigned long)) +/* Allow 4 levels of nesting: normal, softirq, irq, NMI */ +#define FTRACE_KSTACK_NESTING	4 + +#define FTRACE_KSTACK_ENTRIES	(PAGE_SIZE / FTRACE_KSTACK_NESTING) +  struct ftrace_stack { -	unsigned long		calls[FTRACE_STACK_MAX_ENTRIES]; +	unsigned long		calls[FTRACE_KSTACK_ENTRIES]; +}; + + +struct ftrace_stacks { +	struct ftrace_stack	stacks[FTRACE_KSTACK_NESTING];  }; -static DEFINE_PER_CPU(struct ftrace_stack, ftrace_stack); +static DEFINE_PER_CPU(struct ftrace_stacks, ftrace_stacks);  static DEFINE_PER_CPU(int, ftrace_stack_reserve);  static void __ftrace_trace_stack(struct ring_buffer *buffer, @@ -2766,13 +2777,10 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,  {  	struct trace_event_call *call = &event_kernel_stack;  	struct ring_buffer_event *event; +	unsigned int size, nr_entries; +	struct ftrace_stack *fstack;  	struct stack_entry *entry; -	struct stack_trace trace; -	int use_stack; -	int size = FTRACE_STACK_ENTRIES; - -	trace.nr_entries	= 0; -	trace.skip		= skip; +	int stackidx;  	/*  	 * Add one, for this function and the call to save_stack_trace() @@ -2780,7 +2788,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,  	 */  #ifndef CONFIG_UNWINDER_ORC  	if (!regs) -		trace.skip++; +		skip++;  #endif  	/* @@ -2791,53 +2799,40 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,  	 */  	preempt_disable_notrace(); -	use_stack = __this_cpu_inc_return(ftrace_stack_reserve); +	stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; + +	/* This should never happen. If it does, yell once and skip */ +	if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING)) +		goto out; +  	/* -	 * We don't need any atomic variables, just a barrier. -	 * If an interrupt comes in, we don't care, because it would -	 * have exited and put the counter back to what we want. -	 * We just need a barrier to keep gcc from moving things -	 * around. +	 * The above __this_cpu_inc_return() is 'atomic' cpu local. An +	 * interrupt will either see the value pre increment or post +	 * increment. If the interrupt happens pre increment it will have +	 * restored the counter when it returns.  We just need a barrier to +	 * keep gcc from moving things around.  	 
*/  	barrier(); -	if (use_stack == 1) { -		trace.entries		= this_cpu_ptr(ftrace_stack.calls); -		trace.max_entries	= FTRACE_STACK_MAX_ENTRIES; -		if (regs) -			save_stack_trace_regs(regs, &trace); -		else -			save_stack_trace(&trace); - -		if (trace.nr_entries > size) -			size = trace.nr_entries; -	} else -		/* From now on, use_stack is a boolean */ -		use_stack = 0; +	fstack = this_cpu_ptr(ftrace_stacks.stacks) + stackidx; +	size = ARRAY_SIZE(fstack->calls); -	size *= sizeof(unsigned long); +	if (regs) { +		nr_entries = stack_trace_save_regs(regs, fstack->calls, +						   size, skip); +	} else { +		nr_entries = stack_trace_save(fstack->calls, size, skip); +	} +	size = nr_entries * sizeof(unsigned long);  	event = __trace_buffer_lock_reserve(buffer, TRACE_STACK,  					    sizeof(*entry) + size, flags, pc);  	if (!event)  		goto out;  	entry = ring_buffer_event_data(event); -	memset(&entry->caller, 0, size); - -	if (use_stack) -		memcpy(&entry->caller, trace.entries, -		       trace.nr_entries * sizeof(unsigned long)); -	else { -		trace.max_entries	= FTRACE_STACK_ENTRIES; -		trace.entries		= entry->caller; -		if (regs) -			save_stack_trace_regs(regs, &trace); -		else -			save_stack_trace(&trace); -	} - -	entry->size = trace.nr_entries; +	memcpy(&entry->caller, fstack->calls, size); +	entry->size = nr_entries;  	if (!call_filter_check_discard(call, entry, buffer, event))  		__buffer_unlock_commit(buffer, event); @@ -2907,15 +2902,15 @@ void trace_dump_stack(int skip)  }  EXPORT_SYMBOL_GPL(trace_dump_stack); +#ifdef CONFIG_USER_STACKTRACE_SUPPORT  static DEFINE_PER_CPU(int, user_stack_count); -void +static void  ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)  {  	struct trace_event_call *call = &event_user_stack;  	struct ring_buffer_event *event;  	struct userstack_entry *entry; -	struct stack_trace trace;  	if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE))  		return; @@ -2946,12 +2941,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)  	entry->tgid		= current->tgid;  	memset(&entry->caller, 0, sizeof(entry->caller)); -	trace.nr_entries	= 0; -	trace.max_entries	= FTRACE_STACK_ENTRIES; -	trace.skip		= 0; -	trace.entries		= entry->caller; - -	save_stack_trace_user(&trace); +	stack_trace_save_user(entry->caller, FTRACE_STACK_ENTRIES);  	if (!call_filter_check_discard(call, entry, buffer, event))  		__buffer_unlock_commit(buffer, event); @@ -2960,13 +2950,12 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)   out:  	preempt_enable();  } - -#ifdef UNUSED -static void __trace_userstack(struct trace_array *tr, unsigned long flags) +#else /* CONFIG_USER_STACKTRACE_SUPPORT */ +static void ftrace_trace_userstack(struct ring_buffer *buffer, +				   unsigned long flags, int pc)  { -	ftrace_trace_userstack(tr, flags, preempt_count());  } -#endif /* UNUSED */ +#endif /* !CONFIG_USER_STACKTRACE_SUPPORT */  #endif /* CONFIG_STACKTRACE */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d80cee49e0eb..639047b259d7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -782,17 +782,9 @@ void update_max_tr_single(struct trace_array *tr,  #endif /* CONFIG_TRACER_MAX_TRACE */  #ifdef CONFIG_STACKTRACE -void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, -			    int pc); -  void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,  		   int pc);  #else -static inline void ftrace_trace_userstack(struct ring_buffer *buffer, -					  unsigned 
long flags, int pc) -{ -} -  static inline void __trace_stack(struct trace_array *tr, unsigned long flags,  				 int skip, int pc)  { diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4ad967453b6f..3ea65cdff30d 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -205,6 +205,8 @@ void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)  void ftrace_likely_update(struct ftrace_likely_data *f, int val,  			  int expect, int is_constant)  { +	unsigned long flags = user_access_save(); +  	/* A constant is always correct */  	if (is_constant) {  		f->constant++; @@ -223,6 +225,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,  		f->data.correct++;  	else  		f->data.incorrect++; + +	user_access_restore(flags);  }  EXPORT_SYMBOL(ftrace_likely_update); diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 795aa2038377..a1d20421f4b0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -5186,7 +5186,6 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,  	u64 var_ref_vals[TRACING_MAP_VARS_MAX];  	char compound_key[HIST_KEY_SIZE_MAX];  	struct tracing_map_elt *elt = NULL; -	struct stack_trace stacktrace;  	struct hist_field *key_field;  	u64 field_contents;  	void *key = NULL; @@ -5198,14 +5197,9 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec,  		key_field = hist_data->fields[i];  		if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { -			stacktrace.max_entries = HIST_STACKTRACE_DEPTH; -			stacktrace.entries = entries; -			stacktrace.nr_entries = 0; -			stacktrace.skip = HIST_STACKTRACE_SKIP; - -			memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE); -			save_stack_trace(&stacktrace); - +			memset(entries, 0, HIST_STACKTRACE_SIZE); +			stack_trace_save(entries, HIST_STACKTRACE_DEPTH, +					 HIST_STACKTRACE_SKIP);  			key = entries;  		} else {  			field_contents = key_field->fn(key_field, elt, rbe, rec); @@ -5246,7 +5240,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m,  	unsigned int i;  	for (i = 0; i < max_entries; i++) { -		if (stacktrace_entries[i] == ULONG_MAX) +		if (!stacktrace_entries[i])  			return;  		seq_printf(m, "%*c", 1 + spaces, ' '); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index eec648a0d673..5d16f73898db 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -18,44 +18,32 @@  #include "trace.h" -static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = -	 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; -unsigned stack_trace_index[STACK_TRACE_ENTRIES]; +#define STACK_TRACE_ENTRIES 500 -/* - * Reserve one entry for the passed in ip. This will allow - * us to remove most or all of the stack size overhead - * added by the stack tracer itself. 
- */ -struct stack_trace stack_trace_max = { -	.max_entries		= STACK_TRACE_ENTRIES - 1, -	.entries		= &stack_dump_trace[0], -}; +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES]; +static unsigned stack_trace_index[STACK_TRACE_ENTRIES]; -unsigned long stack_trace_max_size; -arch_spinlock_t stack_trace_max_lock = +static unsigned int stack_trace_nr_entries; +static unsigned long stack_trace_max_size; +static arch_spinlock_t stack_trace_max_lock =  	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;  DEFINE_PER_CPU(int, disable_stack_tracer);  static DEFINE_MUTEX(stack_sysctl_mutex);  int stack_tracer_enabled; -static int last_stack_tracer_enabled; -void stack_trace_print(void) +static void print_max_stack(void)  {  	long i;  	int size;  	pr_emerg("        Depth    Size   Location    (%d entries)\n"  			   "        -----    ----   --------\n", -			   stack_trace_max.nr_entries); +			   stack_trace_nr_entries); -	for (i = 0; i < stack_trace_max.nr_entries; i++) { -		if (stack_dump_trace[i] == ULONG_MAX) -			break; -		if (i+1 == stack_trace_max.nr_entries || -				stack_dump_trace[i+1] == ULONG_MAX) +	for (i = 0; i < stack_trace_nr_entries; i++) { +		if (i + 1 == stack_trace_nr_entries)  			size = stack_trace_index[i];  		else  			size = stack_trace_index[i] - stack_trace_index[i+1]; @@ -65,16 +53,7 @@ void stack_trace_print(void)  	}  } -/* - * When arch-specific code overrides this function, the following - * data should be filled up, assuming stack_trace_max_lock is held to - * prevent concurrent updates. - *     stack_trace_index[] - *     stack_trace_max - *     stack_trace_max_size - */ -void __weak -check_stack(unsigned long ip, unsigned long *stack) +static void check_stack(unsigned long ip, unsigned long *stack)  {  	unsigned long this_size, flags; unsigned long *p, *top, *start;  	static int tracer_frame; @@ -110,13 +89,12 @@ check_stack(unsigned long ip, unsigned long *stack)  	stack_trace_max_size = this_size; -	stack_trace_max.nr_entries = 0; -	stack_trace_max.skip = 0; - -	save_stack_trace(&stack_trace_max); +	stack_trace_nr_entries = stack_trace_save(stack_dump_trace, +					       ARRAY_SIZE(stack_dump_trace) - 1, +					       0);  	/* Skip over the overhead of the stack tracer itself */ -	for (i = 0; i < stack_trace_max.nr_entries; i++) { +	for (i = 0; i < stack_trace_nr_entries; i++) {  		if (stack_dump_trace[i] == ip)  			break;  	} @@ -125,7 +103,7 @@ check_stack(unsigned long ip, unsigned long *stack)  	 * Some archs may not have the passed in ip in the dump.  	 * If that happens, we need to show everything.  	 */ -	if (i == stack_trace_max.nr_entries) +	if (i == stack_trace_nr_entries)  		i = 0;  	/* @@ -143,15 +121,13 @@ check_stack(unsigned long ip, unsigned long *stack)  	 * loop will only happen once. This code only takes place  	 * on a new max, so it is far from a fast path.  	 */ -	while (i < stack_trace_max.nr_entries) { +	while (i < stack_trace_nr_entries) {  		int found = 0;  		stack_trace_index[x] = this_size;  		p = start; -		for (; p < top && i < stack_trace_max.nr_entries; p++) { -			if (stack_dump_trace[i] == ULONG_MAX) -				break; +		for (; p < top && i < stack_trace_nr_entries; p++) {  			/*  			 * The READ_ONCE_NOCHECK is used to let KASAN know that  			 * this is not a stack-out-of-bounds error. 
@@ -182,12 +158,10 @@ check_stack(unsigned long ip, unsigned long *stack)  			i++;  	} -	stack_trace_max.nr_entries = x; -	for (; x < i; x++) -		stack_dump_trace[x] = ULONG_MAX; +	stack_trace_nr_entries = x;  	if (task_stack_end_corrupted(current)) { -		stack_trace_print(); +		print_max_stack();  		BUG();  	} @@ -286,7 +260,7 @@ __next(struct seq_file *m, loff_t *pos)  {  	long n = *pos - 1; -	if (n >= stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX) +	if (n >= stack_trace_nr_entries)  		return NULL;  	m->private = (void *)n; @@ -350,7 +324,7 @@ static int t_show(struct seq_file *m, void *v)  		seq_printf(m, "        Depth    Size   Location"  			   "    (%d entries)\n"  			   "        -----    ----   --------\n", -			   stack_trace_max.nr_entries); +			   stack_trace_nr_entries);  		if (!stack_tracer_enabled && !stack_trace_max_size)  			print_disabled(m); @@ -360,12 +334,10 @@ static int t_show(struct seq_file *m, void *v)  	i = *(long *)v; -	if (i >= stack_trace_max.nr_entries || -	    stack_dump_trace[i] == ULONG_MAX) +	if (i >= stack_trace_nr_entries)  		return 0; -	if (i+1 == stack_trace_max.nr_entries || -	    stack_dump_trace[i+1] == ULONG_MAX) +	if (i + 1 == stack_trace_nr_entries)  		size = stack_trace_index[i];  	else  		size = stack_trace_index[i] - stack_trace_index[i+1]; @@ -422,23 +394,21 @@ stack_trace_sysctl(struct ctl_table *table, int write,  		   void __user *buffer, size_t *lenp,  		   loff_t *ppos)  { +	int was_enabled;  	int ret;  	mutex_lock(&stack_sysctl_mutex); +	was_enabled = !!stack_tracer_enabled;  	ret = proc_dointvec(table, write, buffer, lenp, ppos); -	if (ret || !write || -	    (last_stack_tracer_enabled == !!stack_tracer_enabled)) +	if (ret || !write || (was_enabled == !!stack_tracer_enabled))  		goto out; -	last_stack_tracer_enabled = !!stack_tracer_enabled; -  	if (stack_tracer_enabled)  		register_ftrace_function(&trace_ops);  	else  		unregister_ftrace_function(&trace_ops); -   out:  	mutex_unlock(&stack_sysctl_mutex);  	return ret; @@ -454,7 +424,6 @@ static __init int enable_stacktrace(char *str)  		strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);  	stack_tracer_enabled = 1; -	last_stack_tracer_enabled = 1;  	return 1;  }  __setup("stacktrace", enable_stacktrace); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6a5787233113..7f9e7b9306fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -590,7 +590,7 @@ static void lockup_detector_reconfigure(void)   * Create the watchdog thread infrastructure and configure the detector(s).   *   * The threads are not unparked as watchdog_allowed_mask is empty.  When - * the threads are sucessfully initialized, take the proper locks and + * the threads are successfully initialized, take the proper locks and   * unpark the threads in the watchdog_cpumask if the watchdog is enabled.   */  static __init void lockup_detector_setup(void) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ddee541ea97a..faf7622246da 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -841,43 +841,32 @@ static void wake_up_worker(struct worker_pool *pool)  }  /** - * wq_worker_waking_up - a worker is waking up + * wq_worker_running - a worker is running again   * @task: task waking up - * @cpu: CPU @task is waking up to   * - * This function is called during try_to_wake_up() when a worker is - * being awoken. 
- * - * CONTEXT: - * spin_lock_irq(rq->lock) + * This function is called when a worker returns from schedule()   */ -void wq_worker_waking_up(struct task_struct *task, int cpu) +void wq_worker_running(struct task_struct *task)  {  	struct worker *worker = kthread_data(task); -	if (!(worker->flags & WORKER_NOT_RUNNING)) { -		WARN_ON_ONCE(worker->pool->cpu != cpu); +	if (!worker->sleeping) +		return; +	if (!(worker->flags & WORKER_NOT_RUNNING))  		atomic_inc(&worker->pool->nr_running); -	} +	worker->sleeping = 0;  }  /**   * wq_worker_sleeping - a worker is going to sleep   * @task: task going to sleep   * - * This function is called during schedule() when a busy worker is - * going to sleep.  Worker on the same cpu can be woken up by - * returning pointer to its task. - * - * CONTEXT: - * spin_lock_irq(rq->lock) - * - * Return: - * Worker task on @cpu to wake up, %NULL if none. + * This function is called from schedule() when a busy worker is + * going to sleep.   */ -struct task_struct *wq_worker_sleeping(struct task_struct *task) +void wq_worker_sleeping(struct task_struct *task)  { -	struct worker *worker = kthread_data(task), *to_wakeup = NULL; +	struct worker *next, *worker = kthread_data(task);  	struct worker_pool *pool;  	/* @@ -886,13 +875,15 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)  	 * checking NOT_RUNNING.  	 */  	if (worker->flags & WORKER_NOT_RUNNING) -		return NULL; +		return;  	pool = worker->pool; -	/* this can only happen on the local cpu */ -	if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) -		return NULL; +	if (WARN_ON_ONCE(worker->sleeping)) +		return; + +	worker->sleeping = 1; +	spin_lock_irq(&pool->lock);  	/*  	 * The counterpart of the following dec_and_test, implied mb, @@ -906,9 +897,12 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)  	 * lock is safe.  	 */  	if (atomic_dec_and_test(&pool->nr_running) && -	    !list_empty(&pool->worklist)) -		to_wakeup = first_idle_worker(pool); -	return to_wakeup ? 
to_wakeup->task : NULL; +	    !list_empty(&pool->worklist)) { +		next = first_idle_worker(pool); +		if (next) +			wake_up_process(next->task); +	} +	spin_unlock_irq(&pool->lock);  }  /** @@ -2277,7 +2271,7 @@ __acquires(&pool->lock)  	if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {  		pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" -		       "     last function: %pf\n", +		       "     last function: %ps\n",  		       current->comm, preempt_count(), task_pid_nr(current),  		       worker->current_func);  		debug_show_held_locks(current); @@ -2596,11 +2590,11 @@ static void check_flush_dependency(struct workqueue_struct *target_wq,  	worker = current_wq_worker();  	WARN_ONCE(current->flags & PF_MEMALLOC, -		  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", +		  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",  		  current->pid, current->comm, target_wq->name, target_func);  	WARN_ONCE(worker && ((worker->current_pwq->wq->flags &  			      (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), -		  "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", +		  "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",  		  worker->current_pwq->wq->name, worker->current_func,  		  target_wq->name, target_func);  } @@ -4587,7 +4581,7 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)  	probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);  	if (fn || name[0] || desc[0]) { -		printk("%sWorkqueue: %s %pf", log_lvl, name, fn); +		printk("%sWorkqueue: %s %ps", log_lvl, name, fn);  		if (strcmp(name, desc))  			pr_cont(" (%s)", desc);  		pr_cont("\n"); @@ -4612,7 +4606,7 @@ static void pr_cont_work(bool comma, struct work_struct *work)  		pr_cont("%s BAR(%d)", comma ? "," : "",  			task_pid_nr(barr->task));  	} else { -		pr_cont("%s %pf", comma ? "," : "", work->func); +		pr_cont("%s %ps", comma ? "," : "", work->func);  	}  } @@ -4644,7 +4638,7 @@ static void show_pwq(struct pool_workqueue *pwq)  			if (worker->current_pwq != pwq)  				continue; -			pr_cont("%s %d%s:%pf", comma ? "," : "", +			pr_cont("%s %d%s:%ps", comma ? "," : "",  				task_pid_nr(worker->task),  				worker == pwq->wq->rescuer ? "(RESCUER)" : "",  				worker->current_func); @@ -4929,7 +4923,7 @@ static void rebind_workers(struct worker_pool *pool)  		 *  		 * WRITE_ONCE() is necessary because @worker->flags may be  		 * tested without holding any lock in -		 * wq_worker_waking_up().  Without it, NOT_RUNNING test may +		 * wq_worker_running().  Without it, NOT_RUNNING test may  		 * fail incorrectly leading to premature concurrency  		 * management operations.  		 */ diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index cb68b03ca89a..498de0e909a4 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -44,6 +44,7 @@ struct worker {  	unsigned long		last_active;	/* L: last active timestamp */  	unsigned int		flags;		/* X: flags */  	int			id;		/* I: worker id */ +	int			sleeping;	/* None */  	/*  	 * Opaque string set with work_set_desc().  Printed out with task @@ -72,8 +73,8 @@ static inline struct worker *current_wq_worker(void)   * Scheduler hooks for concurrency managed workqueue.  Only to be used from   * sched/ and workqueue.c.   
*/ -void wq_worker_waking_up(struct task_struct *task, int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task); +void wq_worker_running(struct task_struct *task); +void wq_worker_sleeping(struct task_struct *task);  work_func_t wq_worker_last_func(struct task_struct *task);  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
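
The trace_branch.c hunk wraps the branch-profiler bookkeeping in user_access_save()/user_access_restore(): ftrace_likely_update() can be reached from code running inside a user_access_begin() section, and the instrumentation has to hand the user-access state (x86 EFLAGS.AC under SMAP, for instance) back unchanged. A minimal sketch of the pairing, assuming only <linux/uaccess.h>; demo_instrumented_helper() is a made-up name:

    #include <linux/uaccess.h>

    static void demo_instrumented_helper(void)
    {
            /* Capture whatever user-access state the caller has enabled. */
            unsigned long flags = user_access_save();

            /* ... bookkeeping that must not disturb that state ... */

            /* Hand the state back exactly as it was. */
            user_access_restore(flags);
    }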
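
The trace_events_hist.c and trace_stack.c conversions follow the same pattern: drop the struct stack_trace descriptor and call stack_trace_save(), which fills a plain unsigned long array and returns the number of entries stored, so callers stop scanning for a ULONG_MAX terminator (the histogram key printer now bails at the first zero entry of its memset() buffer instead). A minimal caller-side sketch, assuming a kernel that provides these helpers in <linux/stacktrace.h>; demo_dump_stack() and DEMO_DEPTH are illustrative:

    #include <linux/kernel.h>
    #include <linux/stacktrace.h>

    #define DEMO_DEPTH 16

    static void demo_dump_stack(void)
    {
            unsigned long entries[DEMO_DEPTH];
            unsigned int nr_entries;

            /* Store at most DEMO_DEPTH entries, skip none; returns the count. */
            nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);

            /* Print them with no extra indentation. */
            stack_trace_print(entries, nr_entries, 0);
    }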
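
The stack tracer's reporting is otherwise unchanged: stack_trace_index[i] still holds the stack depth at entry i, and both print_max_stack() and t_show() charge frame i with the difference to the next entry, the last frame getting its full depth. A small sketch of that computation (demo_frame_size() is illustrative):

    /* Per-frame size as derived in print_max_stack()/t_show(). */
    static unsigned int demo_frame_size(const unsigned int *index,
                                        unsigned int nr_entries,
                                        unsigned int i)
    {
            if (i + 1 == nr_entries)
                    return index[i];
            return index[i] - index[i + 1];
    }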
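
The workqueue.c rework stops routing the "wake another worker" decision through the scheduler: wq_worker_sleeping() now takes pool->lock itself and calls wake_up_process() on the next idle worker, and the new worker->sleeping flag pairs it with wq_worker_running() so nr_running is adjusted only once per sleep/wake cycle. Roughly, the scheduler side ends up bracketing schedule() with the two hooks as below; this is a conceptual sketch only, the real call sites live in kernel/sched/core.c and are not part of this diff:

    #include <linux/sched.h>

    /* Prototypes come from kernel/workqueue_internal.h (see hunk above). */
    void wq_worker_sleeping(struct task_struct *task);
    void wq_worker_running(struct task_struct *task);

    static void demo_worker_schedule(struct task_struct *tsk)
    {
            if (tsk->flags & PF_WQ_WORKER)
                    wq_worker_sleeping(tsk);  /* may wake an idle worker under pool->lock */

            schedule();

            if (tsk->flags & PF_WQ_WORKER)
                    wq_worker_running(tsk);   /* clears worker->sleeping, restores nr_running */
    }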
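
The scattered %pf/%pF to %ps/%pS changes are a pure format-string conversion: the printk core resolves function descriptors for %ps/%pS as well these days, which makes the function-pointer-specific specifiers redundant and lets them be retired treewide. Lowercase prints the symbol name only, uppercase adds offset and size. A trivial sketch (demo_report_handler() is a made-up helper):

    #include <linux/printk.h>
    #include <linux/workqueue.h>

    static void demo_report_handler(work_func_t fn)
    {
            /* %ps: symbol name only; %pS: name plus offset/size. */
            pr_info("handler %ps (%pS)\n", fn, fn);
    }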
