From 4f311afc2035f7b33d97e69ffaa2e6cd67fca16d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 10 Jun 2020 12:14:09 +0200
Subject: sched/core: Fix CONFIG_GCC_PLUGIN_RANDSTRUCT build fail

This is a temporary build fix; the proper cleanup needs more work.

Reported-by: Guenter Roeck
Reported-by: Eric Biggers
Suggested-by: Eric Biggers
Suggested-by: Kees Cook
Fixes: a148866489fb ("sched: Replace rq::wake_list")
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b62e6aaf28f0..224b5de568e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -654,8 +654,10 @@ struct task_struct {
 	unsigned int		ptrace;
 
 #ifdef CONFIG_SMP
-	struct llist_node	wake_entry;
-	unsigned int		wake_entry_type;
+	struct {
+		struct llist_node	wake_entry;
+		unsigned int		wake_entry_type;
+	};
 	int			on_cpu;
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	/* Current CPU: */
--
cgit v1.2.3

From 8c4890d1c3358fb8023d46e1e554c41d54f02878 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 22 Jun 2020 12:01:25 +0200
Subject: smp, irq_work: Continue smp_call_function*() and irq_work*() integration

Instead of relying on BUG_ON() to ensure the various data structures
line up, use a bunch of horrible unions to make it all automatic.

Much of the union magic is to ensure irq_work and smp_call_function do
not (yet) see the members of their respective data structures change
name.

Suggested-by: Linus Torvalds
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Ingo Molnar
Reviewed-by: Frederic Weisbecker
Link: https://lkml.kernel.org/r/20200622100825.844455025@infradead.org
---
 include/linux/irq_work.h  | 26 ++++++-------------
 include/linux/sched.h     |  5 +---
 include/linux/smp.h       | 23 ++++++-----------
 include/linux/smp_types.h | 66 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/core.c       |  6 ++---
 kernel/smp.c              | 18 -------------
 6 files changed, 86 insertions(+), 58 deletions(-)
 create mode 100644 include/linux/smp_types.h

(limited to 'include/linux/sched.h')

diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 2735da5f839e..30823780c192 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -2,7 +2,7 @@
 #ifndef _LINUX_IRQ_WORK_H
 #define _LINUX_IRQ_WORK_H
 
-#include <linux/llist.h>
+#include <linux/smp_types.h>
 
 /*
  * An entry can be in one of four states:
@@ -13,24 +13,14 @@
  * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
  */
 
-/* flags share CSD_FLAG_ space */
-
-#define IRQ_WORK_PENDING	BIT(0)
-#define IRQ_WORK_BUSY		BIT(1)
-
-/* Doesn't want IPI, wait for tick: */
-#define IRQ_WORK_LAZY		BIT(2)
-/* Run hard IRQ context, even on RT */
-#define IRQ_WORK_HARD_IRQ	BIT(3)
-
-#define IRQ_WORK_CLAIMED	(IRQ_WORK_PENDING | IRQ_WORK_BUSY)
-
-/*
- * structure shares layout with single_call_data_t.
- */
 struct irq_work {
-	struct llist_node llnode;
-	atomic_t flags;
+	union {
+		struct __call_single_node node;
+		struct {
+			struct llist_node llnode;
+			atomic_t flags;
+		};
+	};
 	void (*func)(struct irq_work *);
 };
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 224b5de568e7..692e327d7455 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -654,11 +654,8 @@ struct task_struct {
 	unsigned int		ptrace;
 
 #ifdef CONFIG_SMP
-	struct {
-		struct llist_node	wake_entry;
-		unsigned int		wake_entry_type;
-	};
 	int			on_cpu;
+	struct __call_single_node	wake_entry;
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	/* Current CPU: */
 	unsigned int		cpu;
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 7ee202ad21a6..80d557ef8a11 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -12,29 +12,22 @@
 #include <linux/list.h>
 #include <linux/cpumask.h>
 #include <linux/init.h>
-#include <linux/llist.h>
+#include <linux/smp_types.h>
 
 typedef void (*smp_call_func_t)(void *info);
 typedef bool (*smp_cond_func_t)(int cpu, void *info);
 
-enum {
-	CSD_FLAG_LOCK		= 0x01,
-
-	/* IRQ_WORK_flags */
-
-	CSD_TYPE_ASYNC		= 0x00,
-	CSD_TYPE_SYNC		= 0x10,
-	CSD_TYPE_IRQ_WORK	= 0x20,
-	CSD_TYPE_TTWU		= 0x30,
-	CSD_FLAG_TYPE_MASK	= 0xF0,
-};
-
 /*
  * structure shares (partial) layout with struct irq_work
  */
 struct __call_single_data {
-	struct llist_node llist;
-	unsigned int flags;
+	union {
+		struct __call_single_node node;
+		struct {
+			struct llist_node llist;
+			unsigned int flags;
+		};
+	};
 	smp_call_func_t func;
 	void *info;
 };
diff --git a/include/linux/smp_types.h b/include/linux/smp_types.h
new file mode 100644
index 000000000000..364b3ae3e41d
--- /dev/null
+++ b/include/linux/smp_types.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_SMP_TYPES_H
+#define __LINUX_SMP_TYPES_H
+
+#include <linux/llist.h>
+
+enum {
+	CSD_FLAG_LOCK		= 0x01,
+
+	IRQ_WORK_PENDING	= 0x01,
+	IRQ_WORK_BUSY		= 0x02,
+	IRQ_WORK_LAZY		= 0x04, /* No IPI, wait for tick */
+	IRQ_WORK_HARD_IRQ	= 0x08, /* IRQ context on PREEMPT_RT */
+
+	IRQ_WORK_CLAIMED	= (IRQ_WORK_PENDING | IRQ_WORK_BUSY),
+
+	CSD_TYPE_ASYNC		= 0x00,
+	CSD_TYPE_SYNC		= 0x10,
+	CSD_TYPE_IRQ_WORK	= 0x20,
+	CSD_TYPE_TTWU		= 0x30,
+
+	CSD_FLAG_TYPE_MASK	= 0xF0,
+};
+
+/*
+ * struct __call_single_node is the primary type on
+ * smp.c:call_single_queue.
+ *
+ * flush_smp_call_function_queue() only reads the type from
+ * __call_single_node::u_flags as a regular load; the above
+ * (anonymous) enum defines all the bits of this word.
+ *
+ * Other bits are not modified until the type is known.
+ *
+ * CSD_TYPE_SYNC/ASYNC:
+ *	struct {
+ *		struct llist_node node;
+ *		unsigned int flags;
+ *		smp_call_func_t func;
+ *		void *info;
+ *	};
+ *
+ * CSD_TYPE_IRQ_WORK:
+ *	struct {
+ *		struct llist_node node;
+ *		atomic_t flags;
+ *		void (*func)(struct irq_work *);
+ *	};
+ *
+ * CSD_TYPE_TTWU:
+ *	struct {
+ *		struct llist_node node;
+ *		unsigned int flags;
+ *	};
+ *
+ */
+
+struct __call_single_node {
+	struct llist_node	llist;
+	union {
+		unsigned int	u_flags;
+		atomic_t	a_flags;
+	};
+};
+
+#endif /* __LINUX_SMP_TYPES_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f778067de277..ca5db40392d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2293,7 +2293,7 @@ void sched_ttwu_pending(void *arg)
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
 
-	llist_for_each_entry_safe(p, t, llist, wake_entry) {
+	llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
 		if (WARN_ON_ONCE(p->on_cpu))
 			smp_cond_load_acquire(&p->on_cpu, !VAL);
 
@@ -2329,7 +2329,7 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags
 	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
 
 	WRITE_ONCE(rq->ttwu_pending, 1);
-	__smp_call_single_queue(cpu, &p->wake_entry);
+	__smp_call_single_queue(cpu, &p->wake_entry.llist);
 }
 
 void wake_up_if_idle(int cpu)
@@ -2786,7 +2786,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif
 	init_numa_balancing(clone_flags, p);
 #ifdef CONFIG_SMP
-	p->wake_entry_type = CSD_TYPE_TTWU;
+	p->wake_entry.u_flags = CSD_TYPE_TTWU;
 #endif
 }
diff --git a/kernel/smp.c b/kernel/smp.c
index 472c2b274c65..aa17eedff5be 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -669,24 +669,6 @@ void __init smp_init(void)
 {
 	int num_nodes, num_cpus;
 
-	/*
-	 * Ensure struct irq_work layout matches so that
-	 * flush_smp_call_function_queue() can do horrible things.
-	 */
-	BUILD_BUG_ON(offsetof(struct irq_work, llnode) !=
-		     offsetof(struct __call_single_data, llist));
-	BUILD_BUG_ON(offsetof(struct irq_work, func) !=
-		     offsetof(struct __call_single_data, func));
-	BUILD_BUG_ON(offsetof(struct irq_work, flags) !=
-		     offsetof(struct __call_single_data, flags));
-
-	/*
-	 * Assert the CSD_TYPE_TTWU layout is similar enough
-	 * for task_struct to be on the @call_single_queue.
-	 */
-	BUILD_BUG_ON(offsetof(struct task_struct, wake_entry_type) - offsetof(struct task_struct, wake_entry) !=
-		     offsetof(struct __call_single_data, flags) - offsetof(struct __call_single_data, llist));
-
 	idle_threads_init();
 	cpuhp_threads_init();
--
cgit v1.2.3

From dbfb089d360b1cc623c51a2c7cf9b99eff78e0e7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 3 Jul 2020 12:40:33 +0200
Subject: sched: Fix loadavg accounting race

The recent commit:

  c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")

moved these lines in ttwu():

	p->sched_contributes_to_load = !!task_contributes_to_load(p);
	p->state = TASK_WAKING;

up before:

	smp_cond_load_acquire(&p->on_cpu, !VAL);

into the 'p->on_rq == 0' block, with the thinking that once we hit
schedule() the current task cannot change its ->state anymore. And
while this is true, it is both incorrect and flawed.

It is incorrect in that we need at least an ACQUIRE on 'p->on_rq == 0'
to keep weak hardware from re-ordering things for us. This can fairly
easily be achieved by relying on the control-dependency already in
place.
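(For illustration only, not part of the patch: the control-dependency
upgrade idiom relied on here looks roughly like the sketch below;
smp_acquire__after_ctrl_dep() is the kernel's helper for exactly this
upgrade, and the patch further down uses it in try_to_wake_up().)

	if (!READ_ONCE(p->on_rq)) {
		/*
		 * Control dependency: later stores cannot be issued
		 * before the p->on_rq load resolves. The barrier also
		 * orders later loads, which together yields the
		 * required ACQUIRE on 'p->on_rq == 0'.
		 */
		smp_acquire__after_ctrl_dep();
	}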
The second problem, which is the flaw in the original argument, is that
while schedule() will not change prev->state, it will read it a number
of times (arguably too many times since it's marked volatile). The
previous condition 'p->on_cpu == 0' was sufficient because that
indicates schedule() has completed, and will no longer read
prev->state. So now the trick is to make the same hold for the (much)
earlier 'prev->on_rq == 0' case.

Furthermore, in order to make the ordering stick, the 'prev->on_rq = 0'
assignment needs to be a RELEASE, but adding additional ordering to
schedule() is an unwelcome proposition at the best of times, doubly so
for mere accounting.

Luckily we can push the prev->state load up before rq->lock, with the
only caveat that we then have to re-read the state after. However, we
know that if it changed, we no longer have to worry about the blocking
path. This gives us the required ordering: if we block, we did the
prev->state load before an (effective) smp_mb(), and the p->on_rq
store need not change.

With this we end up with the effective ordering:

	LOAD p->state		LOAD-ACQUIRE p->on_rq == 0
	MB
	STORE p->on_rq, 0	STORE p->state, TASK_WAKING

which ensures the TASK_WAKING store happens after the prev->state
load, and all is well again.

Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
Reported-by: Dave Jones
Reported-by: Paul Gortmaker
Signed-off-by: Peter Zijlstra (Intel)
Tested-by: Dave Jones
Tested-by: Paul Gortmaker
Link: https://lkml.kernel.org/r/20200707102957.GN117543@hirez.programming.kicks-ass.net
---
 include/linux/sched.h |  4 ---
 kernel/sched/core.c   | 67 +++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 51 insertions(+), 20 deletions(-)

(limited to 'include/linux/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 692e327d7455..683372943093 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -114,10 +114,6 @@ struct task_group;
 
 #define task_is_stopped_or_traced(task)	((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
 
-#define task_contributes_to_load(task)	((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
-					 (task->flags & PF_FROZEN) == 0 && \
-					 (task->state & TASK_NOLOAD) == 0)
-
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ca5db40392d4..950ac45d5480 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1311,9 +1311,6 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
-	if (task_contributes_to_load(p))
-		rq->nr_uninterruptible--;
-
 	enqueue_task(rq, p, flags);
 
 	p->on_rq = TASK_ON_RQ_QUEUED;
@@ -1323,9 +1320,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
 	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
 
-	if (task_contributes_to_load(p))
-		rq->nr_uninterruptible++;
-
 	dequeue_task(rq, p, flags);
 }
 
@@ -2236,10 +2230,10 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 
 	lockdep_assert_held(&rq->lock);
 
-#ifdef CONFIG_SMP
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible--;
 
+#ifdef CONFIG_SMP
 	if (wake_flags & WF_MIGRATED)
 		en_flags |= ENQUEUE_MIGRATED;
 #endif
@@ -2583,7 +2577,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
	 */
 	smp_rmb();
 
-	if (p->on_rq && ttwu_remote(p, wake_flags))
+	if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags))
 		goto unlock;
 
 	if (p->in_iowait) {
@@ -2592,9 +2586,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	}
 
 #ifdef CONFIG_SMP
-	p->sched_contributes_to_load = !!task_contributes_to_load(p);
-	p->state = TASK_WAKING;
-
 	/*
 	 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
	 * possible to, falsely, observe p->on_cpu == 0.
@@ -2613,8 +2604,20 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 *
 	 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
 	 * __schedule(). See the comment for smp_mb__after_spinlock().
+	 *
+	 * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
+	 * schedule()'s deactivate_task() has 'happened' and p will no longer
+	 * care about its own p->state. See the comment in __schedule().
 	 */
-	smp_rmb();
+	smp_acquire__after_ctrl_dep();
+
+	/*
+	 * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
+	 * == 0), which means we need to do an enqueue, change p->state to
+	 * TASK_WAKING such that we can unlock p->pi_lock before doing the
+	 * enqueue, such as ttwu_queue_wakelist().
+	 */
+	p->state = TASK_WAKING;
 
 	/*
 	 * If the owning (remote) CPU is still in the middle of schedule() with
@@ -4097,6 +4100,7 @@ static void __sched notrace __schedule(bool preempt)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
+	unsigned long prev_state;
 	struct rq_flags rf;
 	struct rq *rq;
 	int cpu;
@@ -4113,12 +4117,22 @@ static void __sched notrace __schedule(bool preempt)
 	local_irq_disable();
 	rcu_note_context_switch(preempt);
 
+	/* See deactivate_task() below. */
+	prev_state = prev->state;
+
 	/*
 	 * Make sure that signal_pending_state()->signal_pending() below
 	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
-	 * done by the caller to avoid the race with signal_wake_up().
+	 * done by the caller to avoid the race with signal_wake_up():
+	 *
+	 * __set_current_state(@state)		signal_wake_up()
+	 * schedule()				  set_tsk_thread_flag(p, TIF_SIGPENDING)
+	 *					  wake_up_state(p, state)
+	 *   LOCK rq->lock			    LOCK p->pi_lock
+	 *   smp_mb__after_spinlock()		    smp_mb__after_spinlock()
+	 *     if (signal_pending_state())	    if (p->state & @state)
 	 *
-	 * The membarrier system call requires a full memory barrier
+	 * Also, the membarrier system call requires a full memory barrier
 	 * after coming from user-space, before storing to rq->curr.
 	 */
 	rq_lock(rq, &rf);
@@ -4129,10 +4143,31 @@ static void __sched notrace __schedule(bool preempt)
 	update_rq_clock(rq);
 
 	switch_count = &prev->nivcsw;
-	if (!preempt && prev->state) {
-		if (signal_pending_state(prev->state, prev)) {
+
+	/*
+	 * We must re-load prev->state in case ttwu_remote() changed it
+	 * before we acquired rq->lock.
+	 */
+	if (!preempt && prev_state && prev_state == prev->state) {
+		if (signal_pending_state(prev_state, prev)) {
 			prev->state = TASK_RUNNING;
 		} else {
+			prev->sched_contributes_to_load =
+				(prev_state & TASK_UNINTERRUPTIBLE) &&
+				!(prev_state & TASK_NOLOAD) &&
+				!(prev->flags & PF_FROZEN);
+
+			if (prev->sched_contributes_to_load)
+				rq->nr_uninterruptible++;
+
+			/*
+			 * __schedule()			ttwu()
+			 *   prev_state = prev->state;	  if (READ_ONCE(p->on_rq) && ...)
+			 *   LOCK rq->lock		    goto out;
+			 *   smp_mb__after_spinlock();	  smp_acquire__after_ctrl_dep();
+			 *   p->on_rq = 0;		  p->state = TASK_WAKING;
+			 *
+			 * After this, schedule() must not care about p->state any more.
+			 */
 			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
 
 			if (prev->in_iowait) {
--
cgit v1.2.3
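As a stand-alone illustration of the union trick in the smp_types.h
patch above: the sketch below uses userspace stand-ins for the kernel's
llist_node and atomic_t so it builds with any C11 compiler, and its
_Static_asserts play the role of the BUILD_BUG_ON()s that smp_init()
no longer needs.

	#include <stddef.h>

	/* Userspace stand-ins for the kernel types used above. */
	struct llist_node { struct llist_node *next; };
	typedef struct { int counter; } atomic_t;
	typedef void (*smp_call_func_t)(void *info);

	struct __call_single_node {
		struct llist_node	llist;
		union {
			unsigned int	u_flags;
			atomic_t	a_flags;
		};
	};

	struct irq_work {
		union {
			struct __call_single_node node;
			struct {	/* legacy member names */
				struct llist_node llnode;
				atomic_t flags;
			};
		};
		void (*func)(struct irq_work *);
	};

	struct __call_single_data {
		union {
			struct __call_single_node node;
			struct {	/* legacy member names */
				struct llist_node llist;
				unsigned int flags;
			};
		};
		smp_call_func_t func;
		void *info;
	};

	/*
	 * Every variant shares a union starting at offset 0, so the
	 * layouts line up by construction; these checks mirror the
	 * BUILD_BUG_ON()s the patch deletes from smp_init().
	 */
	_Static_assert(offsetof(struct irq_work, llnode) ==
		       offsetof(struct __call_single_data, llist), "llist");
	_Static_assert(offsetof(struct irq_work, flags) ==
		       offsetof(struct __call_single_data, flags), "flags");
	_Static_assert(offsetof(struct irq_work, func) ==
		       offsetof(struct __call_single_data, func), "func");

Because the shared head is now a single type, a layout change in
struct __call_single_node propagates to all users at compile time
instead of tripping a runtime check.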