From 99bae5f94185c2cc65701e95c54e31e2f4345b88 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 12 Jun 2014 14:31:31 +0800 Subject: cgroup: fix broken css_has_online_children() After running: # mount -t cgroup cpu xxx /cgroup && mkdir /cgroup/sub && \ rmdir /cgroup/sub && umount /cgroup I found the cgroup root still existed: # cat /proc/cgroups #subsys_name hierarchy num_cgroups enabled cpuset 0 1 1 cpu 1 1 1 ... It turned out css_has_online_children() is broken. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7868fc3c0bc5..d9a8be911f5b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3328,7 +3328,7 @@ bool css_has_online_children(struct cgroup_subsys_state *css) rcu_read_lock(); css_for_each_child(child, css) { - if (css->flags & CSS_ONLINE) { + if (child->flags & CSS_ONLINE) { ret = true; break; } -- cgit v1.2.3 From 546a9d8519ed137b2804a3f5a3659003039dd49c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Jun 2014 14:57:10 -0700 Subject: rcu: Export debug_init_rcu_head() and debug_rcu_head_free() Currently, call_rcu() relies on implicit allocation and initialization for the debug-objects handling of RCU callbacks. If you hammer the kernel hard enough with Sasha's modified version of trinity, you can end up with the sl*b allocators recursing into themselves via this implicit call_rcu() allocation. This commit therefore exports the debug_init_rcu_head() and debug_rcu_head_free() functions, which permits the allocators to allocate and pre-initialize the debug-objects information, so that there is no longer any need for call_rcu() to do that initialization, which in turn prevents the recursion into the memory allocators. Reported-by: Sasha Levin Suggested-by: Thomas Gleixner Signed-off-by: Paul E. 
McKenney Acked-by: Thomas Gleixner Looks-good-to: Christoph Lameter --- include/linux/rcupdate.h | 10 ++++++++++ kernel/rcu/update.c | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 5a75d19aa661..13bbfbde41b9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -358,9 +358,19 @@ void wait_rcu_gp(call_rcu_func_t crf); * initialization. */ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +void init_rcu_head(struct rcu_head *head); +void destroy_rcu_head(struct rcu_head *head); void init_rcu_head_on_stack(struct rcu_head *head); void destroy_rcu_head_on_stack(struct rcu_head *head); #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static inline void init_rcu_head(struct rcu_head *head) +{ +} + +static inline void destroy_rcu_head(struct rcu_head *head) +{ +} + static inline void init_rcu_head_on_stack(struct rcu_head *head) { } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index a2aeb4df0f60..0fb691e63ce6 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -200,12 +200,12 @@ void wait_rcu_gp(call_rcu_func_t crf) EXPORT_SYMBOL_GPL(wait_rcu_gp); #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD -static inline void debug_init_rcu_head(struct rcu_head *head) +void init_rcu_head(struct rcu_head *head) { debug_object_init(head, &rcuhead_debug_descr); } -static inline void debug_rcu_head_free(struct rcu_head *head) +void destroy_rcu_head(struct rcu_head *head) { debug_object_free(head, &rcuhead_debug_descr); } -- cgit v1.2.3 From 4a81e8328d3791a4f99bf5b436d050f6dc5ffea3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 20 Jun 2014 16:49:01 -0700 Subject: rcu: Reduce overhead of cond_resched() checks for RCU Commit ac1bea85781e (Make cond_resched() report RCU quiescent states) fixed a problem where a CPU looping in the kernel with but one runnable task would give RCU CPU stall warnings, even if the in-kernel loop contained cond_resched() calls. 
Unfortunately, in so doing, it introduced performance regressions in Anton Blanchard's will-it-scale "open1" test. The problem appears to be not so much the increased cond_resched() path length as an increase in the rate at which grace periods complete, which increased per-update grace-period overhead. This commit takes a different approach to fixing this bug, mainly by moving the RCU-visible quiescent state from cond_resched() to rcu_note_context_switch(), and by further reducing the check to a simple non-zero test of a single per-CPU variable. However, this approach requires that the force-quiescent-state processing send resched IPIs to the offending CPUs. These will be sent only once the grace period has reached an age specified by the boot/sysfs parameter rcutree.jiffies_till_sched_qs, or once the grace period reaches an age halfway to the point at which RCU CPU stall warnings will be emitted, whichever comes first. Reported-by: Dave Hansen Signed-off-by: Paul E. McKenney Cc: Andi Kleen Cc: Christoph Lameter Cc: Mike Galbraith Cc: Eric Dumazet Reviewed-by: Josh Triplett [ paulmck: Made rcu_momentary_dyntick_idle() as suggested by the ktest build robot. Also fixed smp_mb() comment as noted by Oleg Nesterov. ] Merge with e552592e (Reduce overhead of cond_resched() checks for RCU) Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 6 ++ include/linux/rcupdate.h | 36 ---------- kernel/rcu/tree.c | 140 ++++++++++++++++++++++++++++-------- kernel/rcu/tree.h | 6 +- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/update.c | 18 ----- kernel/sched/core.c | 7 +- 7 files changed, 125 insertions(+), 90 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6eaa9cdb7094..910c3829f81d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2785,6 +2785,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. 
leaf rcu_node structure. Useful for very large systems. + rcutree.jiffies_till_sched_qs= [KNL] + Set required age in jiffies for a + given grace period before RCU starts + soliciting quiescent-state help from + rcu_note_context_switch(). + rcutree.jiffies_till_first_fqs= [KNL] Set delay from grace-period initialization to first attempt to force quiescent states. diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 13bbfbde41b9..6a94cc8b1ca0 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -44,7 +44,6 @@ #include #include #include -#include #include extern int rcu_expedited; /* for sysctl */ @@ -299,41 +298,6 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev, bool __rcu_is_watching(void); #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */ -/* - * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. - */ - -#define RCU_COND_RESCHED_LIM 256 /* ms vs. 100s of ms. */ -DECLARE_PER_CPU(int, rcu_cond_resched_count); -void rcu_resched(void); - -/* - * Is it time to report RCU quiescent states? - * - * Note unsynchronized access to rcu_cond_resched_count. Yes, we might - * increment some random CPU's count, and possibly also load the result from - * yet another CPU's count. We might even clobber some other CPU's attempt - * to zero its counter. This is all OK because the goal is not precision, - * but rather reasonable amortization of rcu_note_context_switch() overhead - * and extremely high probability of avoiding RCU CPU stall warnings. - * Note that this function has to be preempted in just the wrong place, - * many thousands of times in a row, for anything bad to happen. - */ -static inline bool rcu_should_resched(void) -{ - return raw_cpu_inc_return(rcu_cond_resched_count) >= - RCU_COND_RESCHED_LIM; -} - -/* - * Report quiscent states to RCU if it is time to do so. 
- */ -static inline void rcu_cond_resched(void) -{ - if (unlikely(rcu_should_resched())) - rcu_resched(); -} - /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f1ba77363fbb..625d0b0cd75a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu) rdp->passed_quiesce = 1; } +static DEFINE_PER_CPU(int, rcu_sched_qs_mask); + +static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { + .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, + .dynticks = ATOMIC_INIT(1), +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, + .dynticks_idle = ATOMIC_INIT(1), +#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ +}; + +/* + * Let the RCU core know that this CPU has gone through the scheduler, + * which is a quiescent state. This is called when the need for a + * quiescent state is urgent, so we burn an atomic operation and full + * memory barriers to let the RCU core know about it, regardless of what + * this CPU might (or might not) do in the near future. + * + * We inform the RCU core by emulating a zero-duration dyntick-idle + * period, which we in turn do by incrementing the ->dynticks counter + * by two. + */ +static void rcu_momentary_dyntick_idle(void) +{ + unsigned long flags; + struct rcu_data *rdp; + struct rcu_dynticks *rdtp; + int resched_mask; + struct rcu_state *rsp; + + local_irq_save(flags); + + /* + * Yes, we can lose flag-setting operations. This is OK, because + * the flag will be set again after some delay. + */ + resched_mask = raw_cpu_read(rcu_sched_qs_mask); + raw_cpu_write(rcu_sched_qs_mask, 0); + + /* Find the flavor that needs a quiescent state. */ + for_each_rcu_flavor(rsp) { + rdp = raw_cpu_ptr(rsp->rda); + if (!(resched_mask & rsp->flavor_mask)) + continue; + smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. 
*/ + if (ACCESS_ONCE(rdp->mynode->completed) != + ACCESS_ONCE(rdp->cond_resched_completed)) + continue; + + /* + * Pretend to be momentarily idle for the quiescent state. + * This allows the grace-period kthread to record the + * quiescent state, with no need for this CPU to do anything + * further. + */ + rdtp = this_cpu_ptr(&rcu_dynticks); + smp_mb__before_atomic(); /* Earlier stuff before QS. */ + atomic_add(2, &rdtp->dynticks); /* QS. */ + smp_mb__after_atomic(); /* Later stuff after QS. */ + break; + } + local_irq_restore(flags); +} + /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. @@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu) trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(cpu); rcu_preempt_note_context_switch(cpu); + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + rcu_momentary_dyntick_idle(); trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); -static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, - .dynticks = ATOMIC_INIT(1), -#ifdef CONFIG_NO_HZ_FULL_SYSIDLE - .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, - .dynticks_idle = ATOMIC_INIT(1), -#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ -}; - static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static long qhimark = 10000; /* If this many pending, ignore blimit. */ static long qlowmark = 100; /* Once only this many pending, use blimit. */ @@ -243,6 +300,13 @@ static ulong jiffies_till_next_fqs = ULONG_MAX; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +/* + * How long the grace period must be before we start recruiting + * quiescent-state help from rcu_note_context_switch(). 
+ */ +static ulong jiffies_till_sched_qs = HZ / 20; +module_param(jiffies_till_sched_qs, ulong, 0644); + static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, @@ -853,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { unsigned int curr; + int *rcrmp; unsigned int snap; curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); @@ -893,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, } /* - * There is a possibility that a CPU in adaptive-ticks state - * might run in the kernel with the scheduling-clock tick disabled - * for an extended time period. Invoke rcu_kick_nohz_cpu() to - * force the CPU to restart the scheduling-clock tick in this - * CPU is in this state. - */ - rcu_kick_nohz_cpu(rdp->cpu); - - /* - * Alternatively, the CPU might be running in the kernel - * for an extended period of time without a quiescent state. - * Attempt to force the CPU through the scheduler to gain the - * needed quiescent state, but only if the grace period has gone - * on for an uncommonly long time. If there are many stuck CPUs, - * we will beat on the first one until it gets unstuck, then move - * to the next. Only do this for the primary flavor of RCU. + * A CPU running for an extended time within the kernel can + * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, + * even context-switching back and forth between a pair of + * in-kernel CPU-bound tasks cannot advance grace periods. + * So if the grace period is old enough, make the CPU pay attention. + * Note that the unsynchronized assignments to the per-CPU + * rcu_sched_qs_mask variable are safe. Yes, setting of + * bits can be lost, but they will be set again on the next + * force-quiescent-state pass. 
So lost bit sets do not result + * in incorrect behavior, merely in a grace period lasting + * a few jiffies longer than it might otherwise. Because + * there are at most four threads involved, and because the + * updates are only once every few jiffies, the probability of + * lossage (and thus of slight grace-period extension) is + * quite low. + * + * Note that if the jiffies_till_sched_qs boot/sysfs parameter + * is set too high, we override with half of the RCU CPU stall + * warning delay. */ - if (rdp->rsp == rcu_state_p && + rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); + if (ULONG_CMP_GE(jiffies, + rdp->rsp->gp_start + jiffies_till_sched_qs) || ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { - rdp->rsp->jiffies_resched += 5; - resched_cpu(rdp->cpu); + if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { + ACCESS_ONCE(rdp->cond_resched_completed) = + ACCESS_ONCE(rdp->mynode->completed); + smp_mb(); /* ->cond_resched_completed before *rcrmp. */ + ACCESS_ONCE(*rcrmp) = + ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask; + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + rdp->rsp->jiffies_resched += 5; /* Enable beating. */ + } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { + /* Time to beat on that CPU again! */ + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ + } } return 0; @@ -3491,6 +3572,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ + static u8 fl_mask = 0x1; int cpustride = 1; int i; int j; @@ -3509,6 +3591,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, for (i = 1; i < rcu_num_lvls; i++) rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; rcu_init_levelspread(rsp); + rsp->flavor_mask = fl_mask; + fl_mask <<= 1; /* Initialize the elements themselves, starting from the leaves. 
*/ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index bf2c1e669691..0f69a79c5b7d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -307,6 +307,9 @@ struct rcu_data { /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ unsigned long offline_fqs; /* Kicked due to being offline. */ + unsigned long cond_resched_completed; + /* Grace period that needs help */ + /* from cond_resched(). */ /* 5) __rcu_pending() statistics. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ @@ -392,6 +395,7 @@ struct rcu_state { struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ + u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ void (*func)(struct rcu_head *head)); @@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); static void do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); -static void rcu_kick_nohz_cpu(int cpu); +static void __maybe_unused rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index cbc2c45265e2..02ac0fb186b8 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) * if an adaptive-ticks CPU is failing to respond to the current grace * period and has not be idle from an RCU perspective, kick it. 
*/ -static void rcu_kick_nohz_cpu(int cpu) +static void __maybe_unused rcu_kick_nohz_cpu(int cpu) { #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(cpu)) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 0fb691e63ce6..bc7883570530 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void) early_initcall(check_cpu_stall_init); #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ - -/* - * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings. - */ - -DEFINE_PER_CPU(int, rcu_cond_resched_count); - -/* - * Report a set of RCU quiescent states, for use by cond_resched() - * and friends. Out of line due to being called infrequently. - */ -void rcu_resched(void) -{ - preempt_disable(); - __this_cpu_write(rcu_cond_resched_count, 0); - rcu_note_context_switch(smp_processor_id()); - preempt_enable(); -} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3bdf01b494fe..bc1638b33449 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4147,7 +4147,6 @@ static void __cond_resched(void) int __sched _cond_resched(void) { - rcu_cond_resched(); if (should_resched()) { __cond_resched(); return 1; @@ -4166,18 +4165,15 @@ EXPORT_SYMBOL(_cond_resched); */ int __cond_resched_lock(spinlock_t *lock) { - bool need_rcu_resched = rcu_should_resched(); int resched = should_resched(); int ret = 0; lockdep_assert_held(lock); - if (spin_needbreak(lock) || resched || need_rcu_resched) { + if (spin_needbreak(lock) || resched) { spin_unlock(lock); if (resched) __cond_resched(); - else if (unlikely(need_rcu_resched)) - rcu_resched(); else cpu_relax(); ret = 1; @@ -4191,7 +4187,6 @@ int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); - rcu_cond_resched(); /* BH disabled OK, just recording QSes. 
*/ if (should_resched()) { local_bh_enable(); __cond_resched(); -- cgit v1.2.3 From bddbceb688c6d0decaabc7884fede319d02f96c8 Mon Sep 17 00:00:00 2001 From: Maxime Bizon Date: Mon, 23 Jun 2014 16:35:35 +0200 Subject: workqueue: fix dev_set_uevent_suppress() imbalance Uevents are suppressed during attributes registration, but never restored, so kobject_uevent() does nothing. Signed-off-by: Maxime Bizon Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org Fixes: 226223ab3c4118ddd10688cc2c131135848371ab --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6203d2900877..6f5f9c7323f4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3284,6 +3284,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) } } + dev_set_uevent_suppress(&wq_dev->dev, false); kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); return 0; } -- cgit v1.2.3 From 391acf970d21219a2a5446282d3b20eace0c0d7a Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 25 Jun 2014 09:57:18 +0800 Subject: cpuset,mempolicy: fix sleeping function called from invalid context When runing with the kernel(3.15-rc7+), the follow bug occurs: [ 9969.258987] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:586 [ 9969.359906] in_atomic(): 1, irqs_disabled(): 0, pid: 160655, name: python [ 9969.441175] INFO: lockdep is turned off. 
[ 9969.488184] CPU: 26 PID: 160655 Comm: python Tainted: G A 3.15.0-rc7+ #85 [ 9969.581032] Hardware name: FUJITSU-SV PRIMEQUEST 1800E/SB, BIOS PRIMEQUEST 1000 Series BIOS Version 1.39 11/16/2012 [ 9969.706052] ffffffff81a20e60 ffff8803e941fbd0 ffffffff8162f523 ffff8803e941fd18 [ 9969.795323] ffff8803e941fbe0 ffffffff8109995a ffff8803e941fc58 ffffffff81633e6c [ 9969.884710] ffffffff811ba5dc ffff880405c6b480 ffff88041fdd90a0 0000000000002000 [ 9969.974071] Call Trace: [ 9970.003403] [] dump_stack+0x4d/0x66 [ 9970.065074] [] __might_sleep+0xfa/0x130 [ 9970.130743] [] mutex_lock_nested+0x3c/0x4f0 [ 9970.200638] [] ? kmem_cache_alloc+0x1bc/0x210 [ 9970.272610] [] cpuset_mems_allowed+0x27/0x140 [ 9970.344584] [] ? __mpol_dup+0x63/0x150 [ 9970.409282] [] __mpol_dup+0xe5/0x150 [ 9970.471897] [] ? __mpol_dup+0x63/0x150 [ 9970.536585] [] ? copy_process.part.23+0x606/0x1d40 [ 9970.613763] [] ? trace_hardirqs_on+0xd/0x10 [ 9970.683660] [] ? monotonic_to_bootbased+0x2f/0x50 [ 9970.759795] [] copy_process.part.23+0x670/0x1d40 [ 9970.834885] [] do_fork+0xd8/0x380 [ 9970.894375] [] ? __audit_syscall_entry+0x9c/0xf0 [ 9970.969470] [] SyS_clone+0x16/0x20 [ 9971.030011] [] stub_clone+0x69/0x90 [ 9971.091573] [] ? system_call_fastpath+0x16/0x1b The cause is that cpuset_mems_allowed() tries to take mutex_lock(&callback_mutex) under the rcu_read_lock() (which is held in __mpol_dup()). In cpuset_mems_allowed(), the access to cpuset is already under rcu_read_lock(), so in __mpol_dup() we can reduce the rcu_read_lock() protection region so that it protects only the access to cpuset in current_cpuset_is_being_rebound(). This avoids the bug. This patch is a temporary solution that just addresses the bug mentioned above; it does not fix the long-standing issue about cpuset.mems rebinding on fork(): "When the forker's task_struct is duplicated (which includes ->mems_allowed) and it races with an update to cpuset_being_rebound in update_tasks_nodemask() then the task's mems_allowed doesn't get updated. 
And the child task's mems_allowed can be wrong if the cpuset's nodemask changes before the child has been added to the cgroup's tasklist." Signed-off-by: Gu Zheng Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: stable --- kernel/cpuset.c | 8 +++++++- mm/mempolicy.c | 2 -- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f6b33c696224..d3df02e76643 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1181,7 +1181,13 @@ done: int current_cpuset_is_being_rebound(void) { - return task_cs(current) == cpuset_being_rebound; + int ret; + + rcu_read_lock(); + ret = task_cs(current) == cpuset_being_rebound; + rcu_read_unlock(); + + return ret; } static int update_relax_domain_level(struct cpuset *cs, s64 val) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 284974230459..9a3783ccff67 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2145,7 +2145,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) } else *new = *old; - rcu_read_lock(); if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); if (new->flags & MPOL_F_REBINDING) @@ -2153,7 +2152,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) else mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); } - rcu_read_unlock(); atomic_set(&new->refcnt, 1); return new; } -- cgit v1.2.3 From 970317aa48c6ef66cd023c039c2650c897bad927 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 30 Jun 2014 11:49:58 +0800 Subject: cgroup: fix mount failure in a corner case # cat test.sh #! /bin/bash mount -t cgroup -o cpu xxx /cgroup umount /cgroup mount -t cgroup -o cpu,cpuacct xxx /cgroup umount /cgroup # ./test.sh mount: xxx already mounted or /cgroup busy mount: according to mtab, xxx is already mounted on /cgroup It's because the cgroupfs_root of the first mount was under destruction asynchronously. Fix this by delaying and then retrying mount for this case. v3: - put the refcnt immediately after getting it. 
(Tejun) v2: - use percpu_ref_tryget_live() rather that introducing percpu_ref_alive(). (Tejun) - adjust comment. tj: Updated the comment a bit. Cc: # 3.15 Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d9a8be911f5b..64068667be84 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1648,10 +1648,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct cgroup_subsys *ss; struct cgroup_root *root; struct cgroup_sb_opts opts; struct dentry *dentry; int ret; + int i; bool new_sb; /* @@ -1677,6 +1679,27 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } + /* + * Destruction of cgroup root is asynchronous, so subsystems may + * still be dying after the previous unmount. Let's drain the + * dying subsystems. We just need to ensure that the ones + * unmounted previously finish dying and don't care about new ones + * starting. Testing ref liveliness is good enough. + */ + for_each_subsys(ss, i) { + if (!(opts.subsys_mask & (1 << i)) || + ss->root == &cgrp_dfl_root) + continue; + + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { + mutex_unlock(&cgroup_mutex); + msleep(10); + ret = restart_syscall(); + goto out_free; + } + cgroup_put(&ss->root->cgrp); + } + for_each_root(root) { bool name_match = false; -- cgit v1.2.3 From 3a32bd72d77058d768dbb38183ad517f720dd1bc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 30 Jun 2014 11:50:59 +0800 Subject: cgroup: fix a race between cgroup_mount() and cgroup_kill_sb() We've converted cgroup to kernfs so cgroup won't be intertwined with vfs objects and locking, but there are dark areas. 
Run two instances of this script concurrently: for ((; ;)) { mount -t cgroup -o cpuacct xxx /cgroup umount /cgroup } After a while, I saw two mount processes were stuck at retrying, because they were waiting for a subsystem to become free, but the root associated with this subsystem never got freed. This can happen, if thread A is in the process of killing superblock but hasn't called percpu_ref_kill(), and at this time thread B is mounting the same cgroup root and finds the root in the root list and performs percpu_ref_try_get(). To fix this, we try to increase both the refcnt of the superblock and the percpu refcnt of cgroup root. v2: - we should try to get both the superblock refcnt and cgroup_root refcnt, because cgroup_root may have no superblock assosiated with it. - adjust/add comments. tj: Updated comments. Renamed @sb to @pinned_sb. Cc: # 3.15 Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 64068667be84..70776aec2562 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1648,6 +1648,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct super_block *pinned_sb = NULL; struct cgroup_subsys *ss; struct cgroup_root *root; struct cgroup_sb_opts opts; @@ -1740,15 +1741,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, } /* - * A root's lifetime is governed by its root cgroup. - * tryget_live failure indicate that the root is being - * destroyed. Wait for destruction to complete so that the - * subsystems are free. We can use wait_queue for the wait - * but this path is super cold. Let's just sleep for a bit - * and retry. + * We want to reuse @root whose lifetime is governed by its + * ->cgrp. Let's check whether @root is alive and keep it + * that way. 
As cgroup_kill_sb() can happen anytime, we + * want to block it by pinning the sb so that @root doesn't + * get killed before mount is complete. + * + * With the sb pinned, tryget_live can reliably indicate + * whether @root can be reused. If it's being killed, + * drain it. We can use wait_queue for the wait but this + * path is super cold. Let's just sleep a bit and retry. */ - if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { + pinned_sb = kernfs_pin_sb(root->kf_root, NULL); + if (IS_ERR(pinned_sb) || + !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { mutex_unlock(&cgroup_mutex); + if (!IS_ERR_OR_NULL(pinned_sb)) + deactivate_super(pinned_sb); msleep(10); ret = restart_syscall(); goto out_free; @@ -1793,6 +1802,16 @@ out_free: CGROUP_SUPER_MAGIC, &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); + + /* + * If @pinned_sb, we're reusing an existing root and holding an + * extra ref on its sb. Mount is complete. Put the extra ref. + */ + if (pinned_sb) { + WARN_ON(new_sb); + deactivate_super(pinned_sb); + } + return dentry; } -- cgit v1.2.3 From 76bb5ab8f6e3e7bebdcefec4146ff305e7d0b465 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Jun 2014 15:47:32 -0400 Subject: cpuset: break kernfs active protection in cpuset_write_resmask() Writing to either "cpuset.cpus" or "cpuset.mems" file flushes cpuset_hotplug_work so that cpu or memory hotunplug doesn't end up migrating tasks off a cpuset after new resources are added to it. As cpuset_hotplug_work calls into cgroup core via cgroup_transfer_tasks(), this flushing adds the dependency to cgroup core locking from cpuset_write_resmak(). 
This used to be okay because cgroup interface files were protected by a different mutex; however, 8353da1f91f1 ("cgroup: remove cgroup_tree_mutex") simplified the cgroup core locking and this dependency became a deadlock hazard - cgroup file removal performed under cgroup core lock tries to drain an ongoing file operation which is trying to flush cpuset_hotplug_work blocked on the same cgroup core lock. The locking simplification was done because kernfs added a much easier way to deal with circular dependencies involving kernfs active protection. Let's use the same strategy in cpuset and break active protection in cpuset_write_resmask(). While it isn't the prettiest, this is a very rare, likely unique, situation which also goes away on the unified hierarchy. The commands to trigger the deadlock warning without the patch and the lockdep output follow. localhost:/ # mount -t cgroup -o cpuset xxx /cpuset localhost:/ # mkdir /cpuset/tmp localhost:/ # echo 1 > /cpuset/tmp/cpuset.cpus localhost:/ # echo 0 > cpuset/tmp/cpuset.mems localhost:/ # echo $$ > /cpuset/tmp/tasks localhost:/ # echo 0 > /sys/devices/system/cpu/cpu1/online ====================================================== [ INFO: possible circular locking dependency detected ] 3.16.0-rc1-0.1-default+ #7 Not tainted ------------------------------------------------------- kworker/1:0/32649 is trying to acquire lock: (cgroup_mutex){+.+.+.}, at: [] cgroup_transfer_tasks+0x37/0x150 but task is already holding lock: (cpuset_hotplug_work){+.+...}, at: [] process_one_work+0x192/0x520 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (cpuset_hotplug_work){+.+...}: ... -> #1 (s_active#175){++++.+}: ... -> #0 (cgroup_mutex){+.+.+.}: ... 
other info that might help us debug this: Chain exists of: cgroup_mutex --> s_active#175 --> cpuset_hotplug_work Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(cpuset_hotplug_work); lock(s_active#175); lock(cpuset_hotplug_work); lock(cgroup_mutex); *** DEADLOCK *** 2 locks held by kworker/1:0/32649: #0: ("events"){.+.+.+}, at: [] process_one_work+0x192/0x520 #1: (cpuset_hotplug_work){+.+...}, at: [] process_one_work+0x192/0x520 stack backtrace: CPU: 1 PID: 32649 Comm: kworker/1:0 Not tainted 3.16.0-rc1-0.1-default+ #7 ... Call Trace: [] dump_stack+0x72/0x8a [] print_circular_bug+0x10f/0x120 [] check_prev_add+0x43e/0x4b0 [] validate_chain+0x656/0x7c0 [] __lock_acquire+0x382/0x660 [] lock_acquire+0xf9/0x170 [] mutex_lock_nested+0x6f/0x380 [] cgroup_transfer_tasks+0x37/0x150 [] hotplug_update_tasks_insane+0x110/0x1d0 [] cpuset_hotplug_update_tasks+0x13d/0x180 [] cpuset_hotplug_workfn+0x18c/0x630 [] process_one_work+0x254/0x520 [] worker_thread+0x13d/0x3d0 [] kthread+0xf8/0x100 [] ret_from_fork+0x7c/0xb0 Signed-off-by: Tejun Heo Reported-by: Li Zefan Tested-by: Li Zefan --- kernel/cpuset.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d3df02e76643..116a4164720a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1623,7 +1623,17 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, * resources, wait for the previously scheduled operations before * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. + * + * cpuset_hotplug_work calls back into cgroup core via + * cgroup_transfer_tasks() and waiting for it from a cgroupfs + * operation like this one can lead to a deadlock through kernfs + * active_ref protection. Let's break the protection. Losing the + * protection is okay as we check whether @cs is online after + * grabbing cpuset_mutex anyway. This only happens on the legacy + * hierarchies. 
*/ + css_get(&cs->css); + kernfs_break_active_protection(of->kn); flush_work(&cpuset_hotplug_work); mutex_lock(&cpuset_mutex); @@ -1651,6 +1661,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, free_trial_cpuset(trialcs); out_unlock: mutex_unlock(&cpuset_mutex); + kernfs_unbreak_active_protection(of->kn); + css_put(&cs->css); return retval ?: nbytes; } -- cgit v1.2.3 From 1f9a7268c67f0290837aada443d28fd953ddca90 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 24 Jun 2014 10:20:25 +0200 Subject: perf: Do not allow optimized switch for non-cloned events The context check in perf_event_context_sched_out allows non-cloned context to be part of the optimized schedule out switch. This could move non-cloned context into another workload child. Once this child exits, the context is closed and leaves all original (parent) events in closed state. Any other new cloned event will have closed state and not measure anything. And probably causing other odd bugs. Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra Cc: Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: Paul Mackerras Cc: Corey Ashford Cc: David Ahern Cc: Jiri Olsa Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1403598026-2310-2-git-send-email-jolsa@kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a33d9a2bcbd7..b0c95f0f06fd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2320,7 +2320,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, next_parent = rcu_dereference(next_ctx->parent_ctx); /* If neither context have a parent context; they cannot be clones. 
*/ - if (!parent && !next_parent) + if (!parent || !next_parent) goto unlock; if (next_parent == ctx || next_ctx == parent || next_parent == parent) { -- cgit v1.2.3 From 5a6024f1604eef119cf3a6fa413fe0261a81a8f3 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu Date: Mon, 7 Jul 2014 09:56:48 -0400 Subject: workqueue: zero cpumask of wq_numa_possible_cpumask on init When hot-adding and onlining CPU, kernel panic occurs, showing following call trace. BUG: unable to handle kernel paging request at 0000000000001d08 IP: [] __alloc_pages_nodemask+0x9d/0xb10 PGD 0 Oops: 0000 [#1] SMP ... Call Trace: [] ? cpumask_next_and+0x35/0x50 [] ? find_busiest_group+0x113/0x8f0 [] ? deactivate_slab+0x349/0x3c0 [] new_slab+0x91/0x300 [] __slab_alloc+0x2bb/0x482 [] ? copy_process.part.25+0xfc/0x14c0 [] ? load_balance+0x218/0x890 [] ? sched_clock+0x9/0x10 [] ? trace_clock_local+0x9/0x10 [] kmem_cache_alloc_node+0x8c/0x200 [] copy_process.part.25+0xfc/0x14c0 [] ? trace_buffer_unlock_commit+0x4d/0x60 [] ? kthread_create_on_node+0x140/0x140 [] do_fork+0xbc/0x360 [] kernel_thread+0x26/0x30 [] kthreadd+0x2c2/0x300 [] ? kthread_create_on_cpu+0x60/0x60 [] ret_from_fork+0x7c/0xb0 [] ? kthread_create_on_cpu+0x60/0x60 In my investigation, I found the root cause is wq_numa_possible_cpumask. All entries of wq_numa_possible_cpumask is allocated by alloc_cpumask_var_node(). And these entries are used without initializing. So these entries have wrong value. When hot-adding and onlining CPU, wq_update_unbound_numa() is called. wq_update_unbound_numa() calls alloc_unbound_pwq(). And alloc_unbound_pwq() calls get_unbound_pool(). 
In get_unbound_pool(), worker_pool->node is set as follow: 3592 /* if cpumask is contained inside a NUMA node, we belong to that node */ 3593 if (wq_numa_enabled) { 3594 for_each_node(node) { 3595 if (cpumask_subset(pool->attrs->cpumask, 3596 wq_numa_possible_cpumask[node])) { 3597 pool->node = node; 3598 break; 3599 } 3600 } 3601 } But wq_numa_possible_cpumask[node] does not have correct cpumask. So, wrong node is selected. As a result, kernel panic occurs. By this patch, all entries of wq_numa_possible_cpumask are allocated by zalloc_cpumask_var_node to initialize them. And the panic disappeared. Signed-off-by: Yasuaki Ishimatsu Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org Fixes: bce903809ab3 ("workqueue: add wq_numa_tbl_len and wq_numa_possible_cpumask[]") --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6f5f9c7323f4..35974ac69600 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4880,7 +4880,7 @@ static void __init wq_numa_init(void) BUG_ON(!tbl); for_each_node(node) - BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, + BUG_ON(!zalloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node_online(node) ? node : NUMA_NO_NODE)); for_each_possible_cpu(cpu) { -- cgit v1.2.3 From 16927776ae757d0d132bdbfabbfe2c498342bd59 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 7 Jul 2014 14:06:11 -0700 Subject: alarmtimer: Fix bug where relative alarm timers were treated as absolute Sharvil noticed with the posix timer_settime interface, using the CLOCK_REALTIME_ALARM or CLOCK_BOOTTIME_ALARM clockid, if the users tried to specify a relative time timer, it would incorrectly be treated as absolute regardless of the state of the flags argument. This patch corrects this, properly checking the absolute/relative flag, as well as adds further error checking that no invalid flag bits are set. 
Reported-by: Sharvil Nanavati Signed-off-by: John Stultz Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Prarit Bhargava Cc: Sharvil Nanavati Cc: stable #3.0+ Link: http://lkml.kernel.org/r/1404767171-6902-1-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/alarmtimer.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 88c9c65a430d..fe75444ae7ec 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -585,9 +585,14 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, struct itimerspec *new_setting, struct itimerspec *old_setting) { + ktime_t exp; + if (!rtcdev) return -ENOTSUPP; + if (flags & ~TIMER_ABSTIME) + return -EINVAL; + if (old_setting) alarm_timer_get(timr, old_setting); @@ -597,8 +602,16 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, /* start the timer */ timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); - alarm_start(&timr->it.alarm.alarmtimer, - timespec_to_ktime(new_setting->it_value)); + exp = timespec_to_ktime(new_setting->it_value); + /* Convert (if necessary) to absolute time */ + if (flags != TIMER_ABSTIME) { + ktime_t now; + + now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime(); + exp = ktime_add(now, exp); + } + + alarm_start(&timr->it.alarm.alarmtimer, exp); return 0; } @@ -730,6 +743,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; + if (flags & ~TIMER_ABSTIME) + return -EINVAL; + if (!capable(CAP_WAKE_ALARM)) return -EPERM; -- cgit v1.2.3 From 2448e3493cb3874baa90725c87869455ebf11cd2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 11 Jul 2014 21:06:38 +0200 Subject: tracing: instance_rmdir() leaks ftrace_event_file->filter instance_rmdir() path destroys the event files but forgets to free file->filter. 
Change remove_event_file_dir() to free_event_filter(). Link: http://lkml.kernel.org/p/20140711190638.GA19517@redhat.com Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Srikar Dronamraju Cc: Tom Zanussi Cc: "zhangwei(Jovi)" Cc: stable@vger.kernel.org # 3.11+ Fixes: f6a84bdc75b5 "tracing: Introduce remove_event_file_dir()" Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f99e0b3bca8c..2de53628689f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -470,6 +470,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) list_del(&file->list); remove_subsystem(file->system); + free_event_filter(file->filter); kmem_cache_free(file_cachep, file); } -- cgit v1.2.3 From 4320f6b1d9db4ca912c5eb6ecb328b2e090e1586 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 15 Jul 2014 08:51:27 +0200 Subject: PM / sleep: Fix request_firmware() error at resume The commit [247bc037: PM / Sleep: Mitigate race between the freezer and request_firmware()] introduced the finer state control, but it also leads to a new bug; for example, a bug report regarding the firmware loading of intel BT device at suspend/resume: https://bugzilla.novell.com/show_bug.cgi?id=873790 The root cause seems to be a small window between the process resume and the clear of usermodehelper lock. The request_firmware() function checks the UMH lock and gives up when it's in UMH_DISABLE state. This is for avoiding the invalid f/w loading during suspend/resume phase. The problem is, however, that usermodehelper_enable() is called at the end of thaw_processes(). Thus, a thawed process in between can kick off the f/w loader code path (in this case, via btusb_setup_intel()) even before the call of usermodehelper_enable(). 
Then usermodehelper_read_trylock() returns an error and request_firmware() spews WARN_ON() in the end. This oneliner patch fixes the issue just by setting to UMH_FREEZING state again before restarting tasks, so that the call of request_firmware() will be blocked until the end of this function instead of returning an error. Fixes: 247bc0374254 (PM / Sleep: Mitigate race between the freezer and request_firmware()) Link: https://bugzilla.novell.com/show_bug.cgi?id=873790 Cc: 3.4+ # 3.4+ Signed-off-by: Takashi Iwai Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 0ca8d83e2369..4ee194eb524b 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -186,6 +186,7 @@ void thaw_processes(void) printk("Restarting tasks ... "); + __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); read_lock(&tasklist_lock); -- cgit v1.2.3 From 653a3538f865d26350727df25397bee2bacde773 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 15 Jul 2014 14:26:13 +0200 Subject: PM / sleep: fix freeze_ops NULL pointer dereferences This patch fixes a NULL pointer dereference issue introduced by commit 1f0b63866fc1 (ACPI / PM: Hold ACPI scan lock over the "freeze" sleep state). Fixes: 1f0b63866fc1 (ACPI / PM: Hold ACPI scan lock over the "freeze" sleep state) Link: http://marc.info/?l=linux-pm&m=140541182017443&w=2 Reported-and-tested-by: Alexander Stein Signed-off-by: Zhang Rui Signed-off-by: Rafael J. 
Wysocki --- kernel/power/suspend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4dd8822f732a..ed35a4790afe 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -306,7 +306,7 @@ int suspend_devices_and_enter(suspend_state_t state) error = suspend_ops->begin(state); if (error) goto Close; - } else if (state == PM_SUSPEND_FREEZE && freeze_ops->begin) { + } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) { error = freeze_ops->begin(); if (error) goto Close; @@ -335,7 +335,7 @@ int suspend_devices_and_enter(suspend_state_t state) Close: if (need_suspend_ops(state) && suspend_ops->end) suspend_ops->end(); - else if (state == PM_SUSPEND_FREEZE && freeze_ops->end) + else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) freeze_ops->end(); return error; -- cgit v1.2.3 From 8abfb8727f4a724d31f9ccfd8013fbd16d539445 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Thu, 18 Jul 2013 16:31:05 +0800 Subject: tracing: Add ftrace_trace_stack into __trace_puts/__trace_bputs Currently trace option stacktrace is not applicable for trace_printk with constant string argument, the reason is in __trace_puts/__trace_bputs ftrace_trace_stack is missing. In contrast, when using trace_printk with non constant string argument (will call into __trace_printk/__trace_bprintk), then trace option stacktrace is workable, this inconsistent result will confuse users a lot.
Link: http://lkml.kernel.org/p/51E7A7C9.9040401@huawei.com Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f243444a3772..a6ffc8918dda 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -466,6 +466,9 @@ int __trace_puts(unsigned long ip, const char *str, int size) struct print_entry *entry; unsigned long irq_flags; int alloc; + int pc; + + pc = preempt_count(); if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; @@ -475,7 +478,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, - irq_flags, preempt_count()); + irq_flags, pc); if (!event) return 0; @@ -492,6 +495,7 @@ int __trace_puts(unsigned long ip, const char *str, int size) entry->buf[size] = '\0'; __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, irq_flags, 4, pc); return size; } @@ -509,6 +513,9 @@ int __trace_bputs(unsigned long ip, const char *str) struct bputs_entry *entry; unsigned long irq_flags; int size = sizeof(struct bputs_entry); + int pc; + + pc = preempt_count(); if (unlikely(tracing_selftest_running || tracing_disabled)) return 0; @@ -516,7 +523,7 @@ int __trace_bputs(unsigned long ip, const char *str) local_save_flags(irq_flags); buffer = global_trace.trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, - irq_flags, preempt_count()); + irq_flags, pc); if (!event) return 0; @@ -525,6 +532,7 @@ int __trace_bputs(unsigned long ip, const char *str) entry->str = str; __buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, irq_flags, 4, pc); return 1; } -- cgit v1.2.3 From 5f8bf2d263a20b986225ae1ed7d6759dc4b93af9 Mon Sep 17 00:00:00 2001 From: 
"Steven Rostedt (Red Hat)" Date: Tue, 15 Jul 2014 11:05:12 -0400 Subject: tracing: Fix graph tracer with stack tracer on other archs Running my ftrace tests on PowerPC, it failed the test that checks if function_graph tracer is affected by the stack tracer. It was. Looking into this, I found that the update_function_graph_func() must be called even if the trampoline function is not changed. This is because archs like PowerPC do not support ftrace_ops being passed by assembly and instead use a helper function (what the trampoline function points to). Since this function is not changed even when multiple ftrace_ops are added to the code, the test that falls out before calling update_function_graph_func() will miss that the update must still be done. Call update_function_graph_func() for all calls to update_ftrace_function(). Cc: stable@vger.kernel.org # 3.3+ Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b372e3ed675..ac9d1dad630b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -265,12 +265,12 @@ static void update_ftrace_function(void) func = ftrace_ops_list_func; } + update_function_graph_func(); + /* If there's no change, then do nothing more here */ if (ftrace_trace_function == func) return; - update_function_graph_func(); - /* * If we are using the list function, it doesn't care * about the function_trace_ops. -- cgit v1.2.3 From f0160a5a2912267c02cfe692eac955c360de5fdf Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Thu, 18 Jul 2013 16:31:18 +0800 Subject: tracing: Add TRACE_ITER_PRINTK flag check in __trace_puts/__trace_bputs The TRACE_ITER_PRINTK check in __trace_puts/__trace_bputs is missing, so add it, to be consistent with __trace_printk/__trace_bprintk. Those functions are all called by the same function: trace_printk().
Link: http://lkml.kernel.org/p/51E7A7D6.8090900@huawei.com Cc: stable@vger.kernel.org # 3.11+ Signed-off-by: zhangwei(Jovi) Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a6ffc8918dda..bda9621638cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -468,6 +468,9 @@ int __trace_puts(unsigned long ip, const char *str, int size) int alloc; int pc; + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + pc = preempt_count(); if (unlikely(tracing_selftest_running || tracing_disabled)) @@ -515,6 +518,9 @@ int __trace_bputs(unsigned long ip, const char *str) int size = sizeof(struct bputs_entry); int pc; + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + pc = preempt_count(); if (unlikely(tracing_selftest_running || tracing_disabled)) -- cgit v1.2.3 From 97b8ee845393701edc06e27ccec2876ff9596019 Mon Sep 17 00:00:00 2001 From: Martin Lau Date: Mon, 9 Jun 2014 23:06:42 -0700 Subject: ring-buffer: Fix polling on trace_pipe ring_buffer_poll_wait() should always put the poll_table to its wait_queue even there is immediate data available. Otherwise, the following epoll and read sequence will eventually hang forever: 1. Put some data to make the trace_pipe ring_buffer read ready first 2. epoll_ctl(efd, EPOLL_CTL_ADD, trace_pipe_fd, ee) 3. epoll_wait() 4. read(trace_pipe_fd) till EAGAIN 5. Add some more data to the trace_pipe ring_buffer 6. epoll_wait() -> this epoll_wait() will block forever ~ During the epoll_ctl(efd, EPOLL_CTL_ADD,...) call in step 2, ring_buffer_poll_wait() returns immediately without adding poll_table, which has poll_table->_qproc pointing to ep_poll_callback(), to its wait_queue. ~ During the epoll_wait() call in step 3 and step 6, ring_buffer_poll_wait() cannot add ep_poll_callback() to its wait_queue because the poll_table->_qproc is NULL and it is how epoll works. 
~ When there is new data available in step 6, ring_buffer does not know it has to call ep_poll_callback() because it is not in its wait queue. Hence, it blocks forever. Other poll implementations seem to call poll_wait() unconditionally as the very first thing to do. For example, tcp_poll() in tcp.c. Link: http://lkml.kernel.org/p/20140610060637.GA14045@devbig242.prn2.facebook.com Cc: stable@vger.kernel.org # 2.6.27 Fixes: 2a2cc8f7c4d0 "ftrace: allow the event pipe to be polled" Reviewed-by: Chris Mason Signed-off-by: Martin Lau Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7c56c3d06943..ff7027199a9a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -616,10 +616,6 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, struct ring_buffer_per_cpu *cpu_buffer; struct rb_irq_work *work; - if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || - (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) - return POLLIN | POLLRDNORM; - if (cpu == RING_BUFFER_ALL_CPUS) work = &buffer->irq_work; else { -- cgit v1.2.3 From 1903d50cba54261a6562a476c05085f3d7a54097 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Jul 2014 17:27:27 +0200 Subject: perf: Revert ("perf: Always destroy groups on exit") Vince reported that commit 15a2d4de0eab5 ("perf: Always destroy groups on exit") causes a regression with grouped events. In particular his read_group_attached.c test fails. https://github.com/deater/perf_event_tests/blob/master/tests/bugs/read_group_attached.c Because of the context switch optimization in perf_event_context_sched_out() the 'original' event may end up in the child process and when that exits the change in the patch in question destroys the actual grouping. Therefore revert that change and only destroy inherited groups.
Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-zedy3uktcp753q8fw8dagx7a@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b0c95f0f06fd..c46b02bfe179 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7458,7 +7458,19 @@ __perf_event_exit_task(struct perf_event *child_event, struct perf_event_context *child_ctx, struct task_struct *child) { - perf_remove_from_context(child_event, true); + /* + * Do not destroy the 'original' grouping; because of the context + * switch optimization the original events could've ended up in a + * random child task. + * + * If we were to destroy the original group, all group related + * operations would cease to function properly after this random + * child dies. + * + * Do destroy all inherited groups, we don't care about those + * and being thorough is better. 
+ */ + perf_remove_from_context(child_event, !!child_event->parent); /* * It can happen that the parent exits first, and has events -- cgit v1.2.3 From 4a1c0f262f88e2676fda80a6bf80e7dbccae1dcb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Jun 2014 16:12:42 +0200 Subject: perf: Fix lockdep warning on process exit Sasha Levin reported: > While fuzzing with trinity inside a KVM tools guest running the latest -next > kernel I've stumbled on the following spew: > > ====================================================== > [ INFO: possible circular locking dependency detected ] > 3.15.0-next-20140613-sasha-00026-g6dd125d-dirty #654 Not tainted > ------------------------------------------------------- > trinity-c578/9725 is trying to acquire lock: > (&(&pool->lock)->rlock){-.-...}, at: __queue_work (kernel/workqueue.c:1346) > > but task is already holding lock: > (&ctx->lock){-.....}, at: perf_event_exit_task (kernel/events/core.c:7471 kernel/events/core.c:7533) > > which lock already depends on the new lock. 
> 1 lock held by trinity-c578/9725: > #0: (&ctx->lock){-.....}, at: perf_event_exit_task (kernel/events/core.c:7471 kernel/events/core.c:7533) > > Call Trace: > dump_stack (lib/dump_stack.c:52) > print_circular_bug (kernel/locking/lockdep.c:1216) > __lock_acquire (kernel/locking/lockdep.c:1840 kernel/locking/lockdep.c:1945 kernel/locking/lockdep.c:2131 kernel/locking/lockdep.c:3182) > lock_acquire (./arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602) > _raw_spin_lock (include/linux/spinlock_api_smp.h:143 kernel/locking/spinlock.c:151) > __queue_work (kernel/workqueue.c:1346) > queue_work_on (kernel/workqueue.c:1424) > free_object (lib/debugobjects.c:209) > __debug_check_no_obj_freed (lib/debugobjects.c:715) > debug_check_no_obj_freed (lib/debugobjects.c:727) > kmem_cache_free (mm/slub.c:2683 mm/slub.c:2711) > free_task (kernel/fork.c:221) > __put_task_struct (kernel/fork.c:250) > put_ctx (include/linux/sched.h:1855 kernel/events/core.c:898) > perf_event_exit_task (kernel/events/core.c:907 kernel/events/core.c:7478 kernel/events/core.c:7533) > do_exit (kernel/exit.c:766) > do_group_exit (kernel/exit.c:884) > get_signal_to_deliver (kernel/signal.c:2347) > do_signal (arch/x86/kernel/signal.c:698) > do_notify_resume (arch/x86/kernel/signal.c:751) > int_signal (arch/x86/kernel/entry_64.S:600) Urgh.. so the only way I can make that happen is through: perf_event_exit_task_context() raw_spin_lock(&child_ctx->lock); unclone_ctx(child_ctx) put_ctx(ctx->parent_ctx); raw_spin_unlock_irqrestore(&child_ctx->lock); And we can avoid this by doing the change below. I can't immediately see how this changed recently, but given that you say it's easy to reproduce, lets fix this. 
Reported-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: Tejun Heo Cc: Dave Jones Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20140623141242.GB19860@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index c46b02bfe179..6b17ac1b0c2a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7486,7 +7486,7 @@ __perf_event_exit_task(struct perf_event *child_event, static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { struct perf_event *child_event, *next; - struct perf_event_context *child_ctx; + struct perf_event_context *child_ctx, *parent_ctx; unsigned long flags; if (likely(!child->perf_event_ctxp[ctxn])) { @@ -7511,6 +7511,15 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) raw_spin_lock(&child_ctx->lock); task_ctx_sched_out(child_ctx); child->perf_event_ctxp[ctxn] = NULL; + + /* + * In order to avoid freeing: child_ctx->parent_ctx->task + * under perf_event_context::lock, grab another reference. + */ + parent_ctx = child_ctx->parent_ctx; + if (parent_ctx) + get_ctx(parent_ctx); + /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all @@ -7520,6 +7529,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) update_context_time(child_ctx); raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + /* + * Now that we no longer hold perf_event_context::lock, drop + * our extra child_ctx->parent_ctx reference. + */ + if (parent_ctx) + put_ctx(parent_ctx); + /* * Report the task dead after unscheduling the events so that we * won't get any samples after PERF_RECORD_EXIT. 
We can however still -- cgit v1.2.3 From 37e9562453b813d2ea527bd9531fef2c3c592847 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 4 Jul 2014 20:49:32 -0700 Subject: locking/rwsem: Allow conservative optimistic spinning when readers have lock Commit 4fc828e24cd9 ("locking/rwsem: Support optimistic spinning") introduced a major performance regression for workloads such as xfs_repair which mix read and write locking of the mmap_sem across many threads. The result was xfs_repair ran 5x slower on 3.16-rc2 than on 3.15 and used 20x more system CPU time. Perf profiles indicate in some workloads that significant time can be spent spinning on !owner. This is because we don't set the lock owner when reader(s) obtain the rwsem. In this patch, we'll modify rwsem_can_spin_on_owner() such that we'll return false if there is no lock owner. The rationale is that if we just entered the slowpath, yet there is no lock owner, then there is a possibility that a reader has the lock. To be conservative, we'll avoid spinning in these situations. This patch reduced the total run time of the xfs_repair workload from about 4 minutes 24 seconds down to approximately 1 minute 26 seconds, back to close to the same performance as on 3.15. Retesting of AIM7, which were some of the workloads used to test the original optimistic spinning code, confirmed that we still get big performance gains with optimistic spinning, even with this additional regression fix. Davidlohr found that while the 'custom' workload took a performance hit of ~-14% to throughput for >300 users with this additional patch, the overall gain with optimistic spinning is still ~+45%. The 'disk' workload even improved by ~+15% at >1000 users.
Tested-by: Dave Chinner Acked-by: Davidlohr Bueso Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Tim Chen Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1404532172.2572.30.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index dacc32142fcc..c40c7d28661d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -285,10 +285,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; - bool on_cpu = true; + bool on_cpu = false; if (need_resched()) - return 0; + return false; rcu_read_lock(); owner = ACCESS_ONCE(sem->owner); @@ -297,9 +297,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) rcu_read_unlock(); /* - * If sem->owner is not set, the rwsem owner may have - * just acquired it and not set the owner yet or the rwsem - * has been released. + * If sem->owner is not set, yet we have just recently entered the + * slowpath, then there is a possibility reader(s) may have the lock. + * To be safe, avoid spinning in these situations. */ return on_cpu; } -- cgit v1.2.3 From 046a619d8e9746fa4c0e29e8c6b78e16efc008a8 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 14 Jul 2014 10:27:48 -0700 Subject: locking/spinlocks/mcs: Rename optimistic_spin_queue() to optimistic_spin_node() Currently, the per-cpu nodes structure for the cancellable MCS spinlock is named "optimistic_spin_queue". However, in a follow up patch in the series we will be introducing a new structure that serves as the new "handle" for the lock. It would make more sense if that structure is named "optimistic_spin_queue". 
Additionally, since the current use of the "optimistic_spin_queue" structure are "nodes", it might be better if we rename them to "node" anyway. This preparatory patch renames all current "optimistic_spin_queue" to "optimistic_spin_node". Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Scott Norton Cc: "Paul E. McKenney" Cc: Dave Chinner Cc: Waiman Long Cc: Davidlohr Bueso Cc: Rik van Riel Cc: Andrew Morton Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Tim Chen Cc: Konrad Rzeszutek Wilk Cc: Aswin Chandramouleeswaran Cc: Linus Torvalds Cc: Chris Mason Cc: Heiko Carstens Cc: Josef Bacik Link: http://lkml.kernel.org/r/1405358872-3732-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 4 ++-- include/linux/rwsem.h | 4 ++-- kernel/locking/mcs_spinlock.c | 24 ++++++++++++------------ kernel/locking/mcs_spinlock.h | 8 ++++---- 4 files changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 11692dea18aa..885f3f56a77f 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -46,7 +46,7 @@ * - detects multi-task circular deadlocks and prints out all affected * locks and tasks (and only those tasks) */ -struct optimistic_spin_queue; +struct optimistic_spin_node; struct mutex { /* 1: unlocked, 0: locked, negative: locked, possible waiters */ atomic_t count; @@ -56,7 +56,7 @@ struct mutex { struct task_struct *owner; #endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - struct optimistic_spin_queue *osq; /* Spinner MCS lock */ + struct optimistic_spin_node *osq; /* Spinner MCS lock */ #endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8d79708146aa..ba3f108ddea1 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -16,7 +16,7 @@ #include -struct optimistic_spin_queue; +struct optimistic_spin_node; struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK @@ -33,7 +33,7 @@ struct 
rw_semaphore { * if the owner is running on the cpu. */ struct task_struct *owner; - struct optimistic_spin_queue *osq; /* spinner MCS lock */ + struct optimistic_spin_node *osq; /* spinner MCS lock */ #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c index 838dc9e00669..e9866f70e828 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/mcs_spinlock.c @@ -14,18 +14,18 @@ * called from interrupt context and we have preemption disabled while * spinning. */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); /* * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. * Can return NULL in case we were the last queued and we updated @lock instead. */ -static inline struct optimistic_spin_queue * -osq_wait_next(struct optimistic_spin_queue **lock, - struct optimistic_spin_queue *node, - struct optimistic_spin_queue *prev) +static inline struct optimistic_spin_node * +osq_wait_next(struct optimistic_spin_node **lock, + struct optimistic_spin_node *node, + struct optimistic_spin_node *prev) { - struct optimistic_spin_queue *next = NULL; + struct optimistic_spin_node *next = NULL; for (;;) { if (*lock == node && cmpxchg(lock, node, prev) == node) { @@ -59,10 +59,10 @@ osq_wait_next(struct optimistic_spin_queue **lock, return next; } -bool osq_lock(struct optimistic_spin_queue **lock) +bool osq_lock(struct optimistic_spin_node **lock) { - struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); - struct optimistic_spin_queue *prev, *next; + struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_node *prev, *next; node->locked = 0; node->next = NULL; @@ -149,10 +149,10 @@ unqueue: return false; } -void osq_unlock(struct optimistic_spin_queue **lock) +void osq_unlock(struct optimistic_spin_node **lock) { - struct 
optimistic_spin_queue *node = this_cpu_ptr(&osq_node); - struct optimistic_spin_queue *next; + struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_node *next; /* * Fast path for the uncontended case. diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index a2dbac4aca6b..c99dc0052f49 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -118,12 +118,12 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) * mutex_lock()/rwsem_down_{read,write}() etc. */ -struct optimistic_spin_queue { - struct optimistic_spin_queue *next, *prev; +struct optimistic_spin_node { + struct optimistic_spin_node *next, *prev; int locked; /* 1 if lock acquired */ }; -extern bool osq_lock(struct optimistic_spin_queue **lock); -extern void osq_unlock(struct optimistic_spin_queue **lock); +extern bool osq_lock(struct optimistic_spin_node **lock); +extern void osq_unlock(struct optimistic_spin_node **lock); #endif /* __LINUX_MCS_SPINLOCK_H */ -- cgit v1.2.3 From 90631822c5d307b5410500806e8ac3e63928aa3e Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 14 Jul 2014 10:27:49 -0700 Subject: locking/spinlocks/mcs: Convert osq lock to atomic_t to reduce overhead The cancellable MCS spinlock is currently used to queue threads that are doing optimistic spinning. It uses per-cpu nodes, where a thread obtaining the lock would access and queue the local node corresponding to the CPU that it's running on. Currently, the cancellable MCS lock is implemented by using pointers to these nodes. In this patch, instead of operating on pointers to the per-cpu nodes, we store the CPU numbers in which the per-cpu nodes correspond to in atomic_t. A similar concept is used with the qspinlock. By operating on the CPU # of the nodes using atomic_t instead of pointers to those nodes, this can reduce the overhead of the cancellable MCS spinlock by 32 bits (on 64 bit systems). 
Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Scott Norton Cc: "Paul E. McKenney" Cc: Dave Chinner Cc: Waiman Long Cc: Davidlohr Bueso Cc: Rik van Riel Cc: Andrew Morton Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Tim Chen Cc: Konrad Rzeszutek Wilk Cc: Aswin Chandramouleeswaran Cc: Linus Torvalds Cc: Chris Mason Cc: Heiko Carstens Cc: Josef Bacik Link: http://lkml.kernel.org/r/1405358872-3732-3-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 4 ++-- include/linux/osq_lock.h | 19 ++++++++++++++++++ include/linux/rwsem.h | 7 +++---- kernel/locking/mcs_spinlock.c | 46 ++++++++++++++++++++++++++++++++++++------- kernel/locking/mcs_spinlock.h | 5 +++-- kernel/locking/mutex.c | 2 +- kernel/locking/rwsem-xadd.c | 2 +- 7 files changed, 68 insertions(+), 17 deletions(-) create mode 100644 include/linux/osq_lock.h (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 885f3f56a77f..42aa9b9ecd5f 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -17,6 +17,7 @@ #include #include #include +#include /* * Simple, straightforward mutexes with strict semantics: @@ -46,7 +47,6 @@ * - detects multi-task circular deadlocks and prints out all affected * locks and tasks (and only those tasks) */ -struct optimistic_spin_node; struct mutex { /* 1: unlocked, 0: locked, negative: locked, possible waiters */ atomic_t count; @@ -56,7 +56,7 @@ struct mutex { struct task_struct *owner; #endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - struct optimistic_spin_node *osq; /* Spinner MCS lock */ + struct optimistic_spin_queue osq; /* Spinner MCS lock */ #endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h new file mode 100644 index 000000000000..b001682bf7cb --- /dev/null +++ b/include/linux/osq_lock.h @@ -0,0 +1,19 @@ +#ifndef __LINUX_OSQ_LOCK_H +#define __LINUX_OSQ_LOCK_H + +/* + * An MCS like lock especially tailored for optimistic spinning 
for sleeping + * lock implementations (mutex, rwsem, etc). + */ + +#define OSQ_UNLOCKED_VAL (0) + +struct optimistic_spin_queue { + /* + * Stores an encoded value of the CPU # of the tail node in the queue. + * If the queue is empty, then it's set to OSQ_UNLOCKED_VAL. + */ + atomic_t tail; +}; + +#endif diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index ba3f108ddea1..9fdcdd03507d 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -13,10 +13,9 @@ #include #include #include - #include +#include -struct optimistic_spin_node; struct rw_semaphore; #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK @@ -33,7 +32,7 @@ struct rw_semaphore { * if the owner is running on the cpu. */ struct task_struct *owner; - struct optimistic_spin_node *osq; /* spinner MCS lock */ + struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; @@ -70,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ LIST_HEAD_INIT((name).wait_list), \ NULL, /* owner */ \ - NULL /* mcs lock */ \ + { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */ \ __RWSEM_DEP_MAP_INIT(name) } #else #define __RWSEM_INITIALIZER(name) \ diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c index e9866f70e828..32fc16c0a545 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/mcs_spinlock.c @@ -16,19 +16,45 @@ */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); +/* + * We use the value 0 to represent "no CPU", thus the encoded value + * will be the CPU number incremented by 1. + */ +static inline int encode_cpu(int cpu_nr) +{ + return cpu_nr + 1; +} + +static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) +{ + int cpu_nr = encoded_cpu_val - 1; + + return per_cpu_ptr(&osq_node, cpu_nr); +} + /* * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. 
* Can return NULL in case we were the last queued and we updated @lock instead. */ static inline struct optimistic_spin_node * -osq_wait_next(struct optimistic_spin_node **lock, +osq_wait_next(struct optimistic_spin_queue *lock, struct optimistic_spin_node *node, struct optimistic_spin_node *prev) { struct optimistic_spin_node *next = NULL; + int curr = encode_cpu(smp_processor_id()); + int old; + + /* + * If there is a prev node in queue, then the 'old' value will be + * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if + * we're currently last in queue, then the queue will then become empty. + */ + old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; for (;;) { - if (*lock == node && cmpxchg(lock, node, prev) == node) { + if (atomic_read(&lock->tail) == curr && + atomic_cmpxchg(&lock->tail, curr, old) == curr) { /* * We were the last queued, we moved @lock back. @prev * will now observe @lock and will complete its @@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_node **lock, return next; } -bool osq_lock(struct optimistic_spin_node **lock) +bool osq_lock(struct optimistic_spin_queue *lock) { struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); struct optimistic_spin_node *prev, *next; + int curr = encode_cpu(smp_processor_id()); + int old; node->locked = 0; node->next = NULL; + node->cpu = curr; - node->prev = prev = xchg(lock, node); - if (likely(prev == NULL)) + old = atomic_xchg(&lock->tail, curr); + if (old == OSQ_UNLOCKED_VAL) return true; + prev = decode_cpu(old); + node->prev = prev; ACCESS_ONCE(prev->next) = node; /* @@ -149,15 +180,16 @@ unqueue: return false; } -void osq_unlock(struct optimistic_spin_node **lock) +void osq_unlock(struct optimistic_spin_queue *lock) { struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); struct optimistic_spin_node *next; + int curr = encode_cpu(smp_processor_id()); /* * Fast path for the uncontended case. 
*/ - if (likely(cmpxchg(lock, node, NULL) == node)) + if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) return; /* diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index c99dc0052f49..74356dc0ce29 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -121,9 +121,10 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) struct optimistic_spin_node { struct optimistic_spin_node *next, *prev; int locked; /* 1 if lock acquired */ + int cpu; /* encoded CPU # value */ }; -extern bool osq_lock(struct optimistic_spin_node **lock); -extern void osq_unlock(struct optimistic_spin_node **lock); +extern bool osq_lock(struct optimistic_spin_queue *lock); +extern void osq_unlock(struct optimistic_spin_queue *lock); #endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index bc73d33c6760..d9b313906caa 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - lock->osq = NULL; + atomic_set(&lock->osq.tail, OSQ_UNLOCKED_VAL); #endif debug_mutex_init(lock, name, key); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index c40c7d28661d..b77a6230bbf6 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -84,7 +84,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, INIT_LIST_HEAD(&sem->wait_list); #ifdef CONFIG_SMP sem->owner = NULL; - sem->osq = NULL; + atomic_set(&sem->osq.tail, OSQ_UNLOCKED_VAL); #endif } -- cgit v1.2.3 From 4d9d951e6b5df85ccfca2c5bd8b4f5c71d256b65 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 14 Jul 2014 10:27:50 -0700 Subject: locking/spinlocks/mcs: Introduce and use init macro and function for osq locks Currently, we initialize the osq lock by directly setting 
the lock's values. It would be preferable if we use an init macro to do the initialization like we do with other locks. This patch introduces and uses a macro and function for initializing the osq lock. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Scott Norton Cc: "Paul E. McKenney" Cc: Dave Chinner Cc: Waiman Long Cc: Davidlohr Bueso Cc: Rik van Riel Cc: Andrew Morton Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Tim Chen Cc: Konrad Rzeszutek Wilk Cc: Aswin Chandramouleeswaran Cc: Linus Torvalds Cc: Chris Mason Cc: Josef Bacik Link: http://lkml.kernel.org/r/1405358872-3732-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- include/linux/osq_lock.h | 8 ++++++++ include/linux/rwsem.h | 2 +- kernel/locking/mutex.c | 2 +- kernel/locking/rwsem-xadd.c | 2 +- 4 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h index b001682bf7cb..90230d5811c5 100644 --- a/include/linux/osq_lock.h +++ b/include/linux/osq_lock.h @@ -16,4 +16,12 @@ struct optimistic_spin_queue { atomic_t tail; }; +/* Init macro and function. 
*/ +#define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } + +static inline void osq_lock_init(struct optimistic_spin_queue *lock) +{ + atomic_set(&lock->tail, OSQ_UNLOCKED_VAL); +} + #endif diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 9fdcdd03507d..25cd9aa2f3d7 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -69,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ LIST_HEAD_INIT((name).wait_list), \ NULL, /* owner */ \ - { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */ \ + OSQ_LOCK_UNLOCKED /* osq */ \ __RWSEM_DEP_MAP_INIT(name) } #else #define __RWSEM_INITIALIZER(name) \ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index d9b313906caa..acca2c1a3c5e 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - atomic_set(&lock->osq.tail, OSQ_UNLOCKED_VAL); + osq_lock_init(&lock->osq); #endif debug_mutex_init(lock, name, key); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index b77a6230bbf6..7190592c2645 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -84,7 +84,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, INIT_LIST_HEAD(&sem->wait_list); #ifdef CONFIG_SMP sem->owner = NULL; - atomic_set(&sem->osq.tail, OSQ_UNLOCKED_VAL); + osq_lock_init(&sem->osq); #endif } -- cgit v1.2.3 From 33ecd2083a9560fbc1ef1b1279ef3ecb4c012a4f Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 14 Jul 2014 10:27:51 -0700 Subject: locking/spinlocks/mcs: Micro-optimize osq_unlock() In the unlock function of the cancellable MCS spinlock, the first thing we do is to retrieve the current CPU's osq node.
However, due to the changes made in the previous patch, in the common case where the lock is not contended, we wouldn't need to access the current CPU's osq node anymore. This patch optimizes this by only retrieving this CPU's osq node after we attempt the initial cmpxchg to unlock the osq and find that it's contended. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Scott Norton Cc: "Paul E. McKenney" Cc: Dave Chinner Cc: Waiman Long Cc: Davidlohr Bueso Cc: Rik van Riel Cc: Andrew Morton Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Tim Chen Cc: Konrad Rzeszutek Wilk Cc: Aswin Chandramouleeswaran Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1405358872-3732-5-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mcs_spinlock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c index 32fc16c0a545..be9ee1559fca 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/mcs_spinlock.c @@ -182,8 +182,7 @@ unqueue: void osq_unlock(struct optimistic_spin_queue *lock) { - struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); - struct optimistic_spin_node *next; + struct optimistic_spin_node *node, *next; int curr = encode_cpu(smp_processor_id()); /* @@ -195,6 +194,7 @@ void osq_unlock(struct optimistic_spin_queue *lock) /* * Second most likely case. */ + node = this_cpu_ptr(&osq_node); next = xchg(&node->next, NULL); if (next) { ACCESS_ONCE(next->locked) = 1; -- cgit v1.2.3 From b0ab99e7736af88b8ac1b7ae50ea287fffa2badc Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 14 Jun 2014 15:00:09 +0200 Subject: sched: Fix possible divide by zero in avg_atom() calculation proc_sched_show_task() does: if (nr_switches) do_div(avg_atom, nr_switches); nr_switches is unsigned long and do_div truncates it to 32 bits, which means it can test non-zero on e.g. x86-64 and be truncated to zero for division.
Fix the problem by using div64_ul() instead. As a side effect calculations of avg_atom for big nr_switches are now correct. Signed-off-by: Mateusz Guzik Signed-off-by: Peter Zijlstra Cc: stable@vger.kernel.org Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1402750809-31991-1-git-send-email-mguzik@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 695f9773bb60..627b3c34b821 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) avg_atom = p->se.sum_exec_runtime; if (nr_switches) - do_div(avg_atom, nr_switches); + avg_atom = div64_ul(avg_atom, nr_switches); else avg_atom = -1LL; -- cgit v1.2.3 From 13b9a962a2594ee880c5d50d7f70964da1d4fe5a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Jul 2014 14:54:55 +0200 Subject: locking/rwsem: Rename 'activity' to 'count' There are two definitions of struct rw_semaphore, one in linux/rwsem.h and one in linux/rwsem-spinlock.h. For some reason they have different names for the initial field. This makes it impossible to use C99 named initialization for __RWSEM_INITIALIZER() -- or we have to duplicate that entire thing along with the structure definitions. The simpler patch is renaming the rwsem-spinlock variant to match the regular rwsem. This allows us to switch to C99 named initialization. 
Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/n/tip-bmrZolsbGmautmzrerog27io@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/rwsem-spinlock.h | 8 ++++---- kernel/locking/rwsem-spinlock.c | 28 ++++++++++++++-------------- 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index d5b13bc07a0b..561e8615528d 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -15,13 +15,13 @@ #ifdef __KERNEL__ /* * the rw-semaphore definition - * - if activity is 0 then there are no active readers or writers - * - if activity is +ve then that is the number of active readers - * - if activity is -1 then there is one active writer + * - if count is 0 then there are no active readers or writers + * - if count is +ve then that is the number of active readers + * - if count is -1 then there is one active writer * - if wait_list is not empty, then there are processes waiting for the semaphore */ struct rw_semaphore { - __s32 activity; + __s32 count; raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 9be8a9144978..2c93571162cb 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem) unsigned long flags; if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) { - ret = (sem->activity != 0); + ret = (sem->count != 0); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); } return ret; @@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); #endif - sem->activity = 0; + sem->count = 0; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); } @@ -95,7 +95,7 @@ __rwsem_do_wake(struct 
rw_semaphore *sem, int wakewrite) waiter = list_entry(next, struct rwsem_waiter, list); } while (waiter->type != RWSEM_WAITING_FOR_WRITE); - sem->activity += woken; + sem->count += woken; out: return sem; @@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + if (sem->count >= 0 && list_empty(&sem->wait_list)) { /* granted */ - sem->activity++; + sem->count++; raw_spin_unlock_irqrestore(&sem->wait_lock, flags); goto out; } @@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->activity >= 0 && list_empty(&sem->wait_list)) { + if (sem->count >= 0 && list_empty(&sem->wait_list)) { /* granted */ - sem->activity++; + sem->count++; ret = 1; } @@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) * itself into sleep and waiting for system woke it or someone * else in the head of the wait list up. 
*/ - if (sem->activity == 0) + if (sem->count == 0) break; set_task_state(tsk, TASK_UNINTERRUPTIBLE); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass) raw_spin_lock_irqsave(&sem->wait_lock, flags); } /* got the lock */ - sem->activity = -1; + sem->count = -1; list_del(&waiter.list); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (sem->activity == 0) { + if (sem->count == 0) { /* got the lock */ - sem->activity = -1; + sem->count = -1; ret = 1; } @@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - if (--sem->activity == 0 && !list_empty(&sem->wait_list)) + if (--sem->count == 0 && !list_empty(&sem->wait_list)) sem = __rwsem_wake_one_writer(sem); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); @@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - sem->activity = 0; + sem->count = 0; if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 1); @@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem) raw_spin_lock_irqsave(&sem->wait_lock, flags); - sem->activity = 1; + sem->count = 1; if (!list_empty(&sem->wait_list)) sem = __rwsem_do_wake(sem, 0); -- cgit v1.2.3 From 4badad352a6bb202ec68afa7a574c0bb961e5ebc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 Jun 2014 19:53:16 +0200 Subject: locking/mutex: Disable optimistic spinning on some architectures The optimistic spin code assumes regular stores and cmpxchg() play nice; this is found to not be true for at least: parisc, sparc32, tile32, metag-lock1, arc-!llsc and hexagon. There is further wreckage, but this in particular seemed easy to trigger, so blacklist this. Opt in for known good archs. 
Signed-off-by: Peter Zijlstra Reported-by: Mikulas Patocka Cc: David Miller Cc: Chris Metcalf Cc: James Bottomley Cc: Vineet Gupta Cc: Jason Low Cc: Waiman Long Cc: "James E.J. Bottomley" Cc: Paul McKenney Cc: John David Anglin Cc: James Hogan Cc: Linus Torvalds Cc: Davidlohr Bueso Cc: stable@vger.kernel.org Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Russell King Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/20140606175316.GV13930@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/arm/Kconfig | 1 + arch/arm64/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/sparc/Kconfig | 1 + arch/x86/Kconfig | 1 + kernel/Kconfig.locks | 5 ++++- 6 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 245058b3b0ef..88acf8bc1490 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -6,6 +6,7 @@ config ARM select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_MIGHT_HAVE_PC_PARPORT + select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a474de346be6..839f48c26ef0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -4,6 +4,7 @@ config ARM64 select ARCH_HAS_OPP select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_COMPAT_IPC_PARSE_VERSION select ARCH_WANT_FRAME_POINTERS diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index fefe7c8bf05f..80b94b0add1f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -145,6 +145,7 @@ config PPC select HAVE_IRQ_EXIT_ON_IRQ_STACK select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select 
HAVE_ARCH_AUDITSYSCALL + select ARCH_SUPPORTS_ATOMIC_RMW config GENERIC_CSUM def_bool CPU_LITTLE_ENDIAN diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 29f2e988c56a..407c87d9879a 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -78,6 +78,7 @@ config SPARC64 select HAVE_C_RECORDMCOUNT select NO_BOOTMEM select HAVE_ARCH_AUDITSYSCALL + select ARCH_SUPPORTS_ATOMIC_RMW config ARCH_DEFCONFIG string diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a8f749ef0fdc..d24887b645dc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -131,6 +131,7 @@ config X86 select HAVE_CC_STACKPROTECTOR select GENERIC_CPU_AUTOPROBE select HAVE_ARCH_AUDITSYSCALL + select ARCH_SUPPORTS_ATOMIC_RMW config INSTRUCTION_DECODER def_bool y diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 35536d9c0964..81907941d921 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -220,9 +220,12 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE endif +config ARCH_SUPPORTS_ATOMIC_RMW + bool + config MUTEX_SPIN_ON_OWNER def_bool y - depends on SMP && !DEBUG_MUTEXES + depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW config ARCH_USE_QUEUE_RWLOCK bool -- cgit v1.2.3 From 5db6c6fefb1ca0e81e3bd6dd8998bf51c453d823 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 11 Jul 2014 14:00:06 -0700 Subject: locking/rwsem: Add CONFIG_RWSEM_SPIN_ON_OWNER Just like with mutexes (CONFIG_MUTEX_SPIN_ON_OWNER), encapsulate the dependencies for rwsem optimistic spinning. No logical changes here as it continues to depend on both SMP and the XADD algorithm variant. Signed-off-by: Davidlohr Bueso Acked-by: Jason Low [ Also make it depend on ARCH_SUPPORTS_ATOMIC_RMW. 
] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1405112406-13052-2-git-send-email-davidlohr@hp.com Cc: aswin@hp.com Cc: Chris Mason Cc: Davidlohr Bueso Cc: Josef Bacik Cc: Linus Torvalds Cc: Waiman Long Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- include/linux/rwsem.h | 6 ++++-- kernel/Kconfig.locks | 4 ++++ kernel/locking/rwsem-xadd.c | 4 ++-- kernel/locking/rwsem.c | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 716807f0eb2d..035d3c57fc8a 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -14,7 +14,9 @@ #include #include #include +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include +#endif struct rw_semaphore; @@ -26,7 +28,7 @@ struct rw_semaphore { long count; struct list_head wait_list; raw_spinlock_t wait_lock; -#ifdef CONFIG_SMP +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* spinner MCS lock */ /* * Write owner. Used as a speculative check to see @@ -63,7 +65,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) # define __RWSEM_DEP_MAP_INIT(lockname) #endif -#if defined(CONFIG_SMP) && !defined(CONFIG_RWSEM_GENERIC_SPINLOCK) +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER #define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED, .owner = NULL #else #define __RWSEM_OPT_INIT(lockname) diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 81907941d921..76768ee812b2 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -227,6 +227,10 @@ config MUTEX_SPIN_ON_OWNER def_bool y depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW +config RWSEM_SPIN_ON_OWNER + def_bool y + depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW + config ARCH_USE_QUEUE_RWLOCK bool diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 7190592c2645..a2391ac135c8 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -82,7 +82,7 @@ void 
__init_rwsem(struct rw_semaphore *sem, const char *name, sem->count = RWSEM_UNLOCKED_VALUE; raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); -#ifdef CONFIG_SMP +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER sem->owner = NULL; osq_lock_init(&sem->osq); #endif @@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) return false; } -#ifdef CONFIG_SMP +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * Try to acquire write lock before the writer has been put on wait queue. */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 42f806de49d4..e2d3bc7f03b4 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -12,7 +12,7 @@ #include -#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM) +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER static inline void rwsem_set_owner(struct rw_semaphore *sem) { sem->owner = current; -- cgit v1.2.3 From d81b4253b0f0f1e7b7e03b0cd0f80cab18bc4d7b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 17 Jul 2014 11:44:11 +0000 Subject: kprobes: Fix "Failed to find blacklist" probing errors on ia64 and ppc64 On ia64 and ppc64, function pointers do not point to the entry address of the function, but to the address of a function descriptor (which contains the entry address and misc data). Since the kprobes code passes the function pointer stored by NOKPROBE_SYMBOL() to kallsyms_lookup_size_offset() for initializing its blacklist, it fails and reports many errors, such as: Failed to find blacklist 0001013168300000 Failed to find blacklist 0001013000f0a000 [...] To fix this bug, use arch_deref_entry_point() to get the function entry address for kallsyms_lookup_size_offset() instead of the raw function pointer. Suzuki also pointed out that blacklist entries should be updated as well. Reported-by: Tony Luck Fixed-by: Suzuki K.
Poulose Tested-by: Tony Luck Tested-by: Michael Ellerman Signed-off-by: Masami Hiramatsu Acked-by: Michael Ellerman (for powerpc) Acked-by: Benjamin Herrenschmidt Cc: Jeremy Fitzhardinge Cc: sparse@chrisli.org Cc: Paul Mackerras Cc: akataria@vmware.com Cc: anil.s.keshavamurthy@intel.com Cc: Fenghua Yu Cc: Arnd Bergmann Cc: Rusty Russell Cc: Chris Wright Cc: yrl.pp-manager.tt@hitachi.com Cc: Kevin Hao Cc: Ananth N Mavinakayanahalli Cc: rdunlap@infradead.org Cc: dl9pf@gmx.de Cc: Linus Torvalds Cc: David S. Miller Cc: linux-ia64@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20140717114411.13401.2632.stgit@kbuild-fedora.novalocal Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3214289df5a7..734e9a7d280b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2037,19 +2037,23 @@ static int __init populate_kprobe_blacklist(unsigned long *start, { unsigned long *iter; struct kprobe_blacklist_entry *ent; - unsigned long offset = 0, size = 0; + unsigned long entry, offset = 0, size = 0; for (iter = start; iter < end; iter++) { - if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) { - pr_err("Failed to find blacklist %p\n", (void *)*iter); + entry = arch_deref_entry_point((void *)*iter); + + if (!kernel_text_address(entry) || + !kallsyms_lookup_size_offset(entry, &size, &offset)) { + pr_err("Failed to find blacklist at %p\n", + (void *)entry); continue; } ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) return -ENOMEM; - ent->start_addr = *iter; - ent->end_addr = *iter + size; + ent->start_addr = entry; + ent->end_addr = entry + size; INIT_LIST_HEAD(&ent->list); list_add_tail(&ent->list, &kprobe_blacklist); } -- cgit v1.2.3 From 58d4e21e50ff3cc57910a8abc20d7e14375d2f61 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 18 Jul 2014 11:43:01 -0700 Subject: tracing: Fix wraparound problems in 
"uptime" trace clock The "uptime" trace clock added in: commit 8aacf017b065a805d27467843490c976835eb4a5 tracing: Add "uptime" trace clock that uses jiffies has wraparound problems when the system has been up more than 1 hour 11 minutes and 34 seconds. It converts jiffies to nanoseconds using: (u64)jiffies_to_usecs(jiffy) * 1000ULL but since jiffies_to_usecs() only returns a 32-bit value, it truncates at 2^32 microseconds. An additional problem on 32-bit systems is that the argument is "unsigned long", so fixing the return value only helps until 2^32 jiffies (49.7 days on a HZ=1000 system). Avoid these problems by using jiffies_64 as our basis, and not converting to nanoseconds (we do convert to clock_t because user facing API must not be dependent on internal kernel HZ values). Link: http://lkml.kernel.org/p/99d63c5bfe9b320a3b428d773825a37095bf6a51.1405708254.git.tony.luck@intel.com Cc: stable@vger.kernel.org # 3.10+ Fixes: 8aacf017b065 "tracing: Add "uptime" trace clock that uses jiffies" Signed-off-by: Tony Luck Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- kernel/trace/trace_clock.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bda9621638cc..291397e66669 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -823,7 +823,7 @@ static struct { { trace_clock_local, "local", 1 }, { trace_clock_global, "global", 1 }, { trace_clock_counter, "counter", 0 }, - { trace_clock_jiffies, "uptime", 1 }, + { trace_clock_jiffies, "uptime", 0 }, { trace_clock, "perf", 1 }, ARCH_TRACE_CLOCKS }; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 26dc348332b7..57b67b1f24d1 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -59,13 +59,14 @@ u64 notrace trace_clock(void) /* * trace_jiffy_clock(): Simply use jiffies as a clock counter. 
+ * Note that this use of jiffies_64 is not completely safe on + * 32-bit systems. But the window is tiny, and the effect if + * we are affected is that we will have an obviously bogus + * timestamp on a trace event - i.e. not life threatening. */ u64 notrace trace_clock_jiffies(void) { - u64 jiffy = jiffies - INITIAL_JIFFIES; - - /* Return nsecs */ - return (u64)jiffies_to_usecs(jiffy) * 1000ULL; + return jiffies_64_to_clock_t(jiffies_64 - INITIAL_JIFFIES); } /* -- cgit v1.2.3 From f723aa1817dd8f4fe005aab52ba70c8ab0ef9457 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 23 Jul 2014 21:03:50 -0700 Subject: sched_clock: Avoid corrupting hrtimer tree during suspend During suspend we call sched_clock_poll() to update the epoch and accumulated time and reprogram the sched_clock_timer to fire before the next wrap-around time. Unfortunately, sched_clock_poll() doesn't restart the timer, instead it relies on the hrtimer layer to do that and during suspend we aren't calling that function from the hrtimer layer. Instead, we're reprogramming the expires time while the hrtimer is enqueued, which can cause the hrtimer tree to be corrupted. Furthermore, we restart the timer during suspend but we update the epoch during resume which seems counter-intuitive. Let's fix this by saving the accumulated state and canceling the timer during suspend. On resume we can update the epoch and restart the timer similar to what we would do if we were starting the clock for the first time. 
Fixes: a08ca5d1089d "sched_clock: Use an hrtimer instead of timer" Signed-off-by: Stephen Boyd Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1406174630-23458-1-git-send-email-john.stultz@linaro.org Cc: Ingo Molnar Cc: stable Signed-off-by: Thomas Gleixner --- kernel/time/sched_clock.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 445106d2c729..01d2d15aa662 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -191,7 +191,8 @@ void __init sched_clock_postinit(void) static int sched_clock_suspend(void) { - sched_clock_poll(&sched_clock_timer); + update_sched_clock(); + hrtimer_cancel(&sched_clock_timer); cd.suspended = true; return 0; } @@ -199,6 +200,7 @@ static int sched_clock_suspend(void) static void sched_clock_resume(void) { cd.epoch_cyc = read_sched_clock(); + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); cd.suspended = false; } -- cgit v1.2.3 From 8f1d26d0e59b9676587c54578f976709b625d6e9 Mon Sep 17 00:00:00 2001 From: Atsushi Kumagai Date: Wed, 30 Jul 2014 16:08:39 -0700 Subject: kexec: export free_huge_page to VMCOREINFO PG_head_mask was added to VMCOREINFO to filter huge pages in b3acc56bfe1 ("kexec: save PG_head_mask in VMCOREINFO"), but makedumpfile still needs another symbol to filter *hugetlbfs* pages. If a user wants to filter user pages, makedumpfile tries to exclude them by checking whether the page is anonymous, but hugetlbfs pages aren't anonymous even though they are also user pages. We know it's possible to detect them in the same way as PageHuge(), so we need the start address of free_huge_page(): int PageHuge(struct page *page) { if (!PageCompound(page)) return 0; page = compound_head(page); return get_compound_page_dtor(page) == free_huge_page; } For that reason, this patch makes free_huge_page() public so that it can be exported to VMCOREINFO.
Signed-off-by: Atsushi Kumagai Acked-by: Baoquan He Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + kernel/kexec.c | 2 ++ mm/hugetlb.c | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 255cd5cc0754..a23c096b3080 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -80,6 +80,7 @@ int dequeue_hwpoisoned_huge_page(struct page *page); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); bool is_hugepage_active(struct page *page); +void free_huge_page(struct page *page); #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); diff --git a/kernel/kexec.c b/kernel/kexec.c index 369f41a94124..23a088fec3c0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -1619,6 +1620,7 @@ static int __init crash_save_vmcoreinfo_init(void) #endif VMCOREINFO_NUMBER(PG_head_mask); VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); + VMCOREINFO_SYMBOL(free_huge_page); arch_crash_save_vmcoreinfo(); update_vmcoreinfo_note(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9221c02ed9e2..7a0a73d2fcff 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -856,7 +856,7 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -static void free_huge_page(struct page *page) +void free_huge_page(struct page *page) { /* * Can't pass hstate in here because it is called from the -- cgit v1.2.3 From e0198b290dcd8313bdf313a0d083033d5c01d761 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 30 Jul 2014 16:08:42 -0700 Subject: Josh has moved My IBM email addresses haven't worked for years; also map some old-but-functional forwarding addresses to my canonical address. Update my GPG key fingerprint; I moved to 4096R a long time ago. 
Update description. Signed-off-by: Josh Triplett Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .mailmap | 5 +++++ CREDITS | 7 ++++--- MAINTAINERS | 2 +- kernel/rcu/rcutorture.c | 4 ++-- 4 files changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/.mailmap b/.mailmap index df1baba43a64..1ad68731fb47 100644 --- a/.mailmap +++ b/.mailmap @@ -62,6 +62,11 @@ Jeff Garzik Jens Axboe Jens Osterkamp John Stultz + + + + + Juha Yrjola Juha Yrjola Juha Yrjola diff --git a/CREDITS b/CREDITS index 28ee1514b9de..a80b66718f66 100644 --- a/CREDITS +++ b/CREDITS @@ -3511,10 +3511,11 @@ S: MacGregor A.C.T 2615 S: Australia N: Josh Triplett -E: josh@freedesktop.org -P: 1024D/D0FE7AFB B24A 65C9 1D71 2AC2 DE87 CA26 189B 9946 D0FE 7AFB -D: rcutorture maintainer +E: josh@joshtriplett.org +P: 4096R/8AFF873D 758E 5042 E397 4BA3 3A9C 1E67 0ED9 A3DF 8AFF 873D +D: RCU and rcutorture D: lock annotations, finding and fixing lock bugs +D: kernel tinification N: Winfried Trümper E: winni@xpilot.org diff --git a/MAINTAINERS b/MAINTAINERS index 86efa7e213c2..95990dd2678c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7424,7 +7424,7 @@ S: Orphan F: drivers/net/wireless/ray* RCUTORTURE MODULE -M: Josh Triplett +M: Josh Triplett M: "Paul E. McKenney" L: linux-kernel@vger.kernel.org S: Supported diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7fa34f86e5ba..948a7693748e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -18,7 +18,7 @@ * Copyright (C) IBM Corporation, 2005, 2006 * * Authors: Paul E. McKenney - * Josh Triplett + * Josh Triplett * * See also: Documentation/RCU/torture.txt */ @@ -51,7 +51,7 @@ #include MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); +MODULE_AUTHOR("Paul E. 
McKenney and Josh Triplett "); torture_param(int, fqs_duration, 0, -- cgit v1.2.3 From 3a1122d26c62d4e8c61ef9a0eaba6e21c0862c77 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 30 Jul 2014 19:05:55 -0700 Subject: kexec: fix build error when hugetlbfs is disabled free_huge_page() is undefined without CONFIG_HUGETLBFS and there's no need to filter PageHuge() pages in such a configuration either, so avoid exporting the symbol to fix a build error: In file included from kernel/kexec.c:14:0: kernel/kexec.c: In function 'crash_save_vmcoreinfo_init': kernel/kexec.c:1623:20: error: 'free_huge_page' undeclared (first use in this function) VMCOREINFO_SYMBOL(free_huge_page); ^ Introduced by commit 8f1d26d0e59b ("kexec: export free_huge_page to VMCOREINFO") Reported-by: kbuild test robot Acked-by: Olof Johansson Cc: Atsushi Kumagai Cc: Baoquan He Cc: Vivek Goyal Cc: Andrew Morton Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 23a088fec3c0..4b8f0c925884 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1620,7 +1620,9 @@ static int __init crash_save_vmcoreinfo_init(void) #endif VMCOREINFO_NUMBER(PG_head_mask); VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_HUGETLBFS VMCOREINFO_SYMBOL(free_huge_page); +#endif arch_crash_save_vmcoreinfo(); update_vmcoreinfo_note(); -- cgit v1.2.3 From 504d58745c9ca28d33572e2d8a9990b43e06075d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 1 Aug 2014 12:20:02 +0200 Subject: timer: Fix lock inversion between hrtimer_bases.lock and scheduler locks clockevents_increase_min_delta() calls printk() from under hrtimer_bases.lock. That causes lock inversion on scheduler locks because printk() can call into the scheduler.
Lockdep puts it as: ====================================================== [ INFO: possible circular locking dependency detected ] 3.15.0-rc8-06195-g939f04b #2 Not tainted ------------------------------------------------------- trinity-main/74 is trying to acquire lock: (&port_lock_key){-.....}, at: [<811c60be>] serial8250_console_write+0x8c/0x10c but task is already holding lock: (hrtimer_bases.lock){-.-...}, at: [<8103caeb>] hrtimer_try_to_cancel+0x13/0x66 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #5 (hrtimer_bases.lock){-.-...}: [<8104a942>] lock_acquire+0x92/0x101 [<8142f11d>] _raw_spin_lock_irqsave+0x2e/0x3e [<8103c918>] __hrtimer_start_range_ns+0x1c/0x197 [<8107ec20>] perf_swevent_start_hrtimer.part.41+0x7a/0x85 [<81080792>] task_clock_event_start+0x3a/0x3f [<810807a4>] task_clock_event_add+0xd/0x14 [<8108259a>] event_sched_in+0xb6/0x17a [<810826a2>] group_sched_in+0x44/0x122 [<81082885>] ctx_sched_in.isra.67+0x105/0x11f [<810828e6>] perf_event_sched_in.isra.70+0x47/0x4b [<81082bf6>] __perf_install_in_context+0x8b/0xa3 [<8107eb8e>] remote_function+0x12/0x2a [<8105f5af>] smp_call_function_single+0x2d/0x53 [<8107e17d>] task_function_call+0x30/0x36 [<8107fb82>] perf_install_in_context+0x87/0xbb [<810852c9>] SYSC_perf_event_open+0x5c6/0x701 [<810856f9>] SyS_perf_event_open+0x17/0x19 [<8142f8ee>] syscall_call+0x7/0xb -> #4 (&ctx->lock){......}: [<8104a942>] lock_acquire+0x92/0x101 [<8142f04c>] _raw_spin_lock+0x21/0x30 [<81081df3>] __perf_event_task_sched_out+0x1dc/0x34f [<8142cacc>] __schedule+0x4c6/0x4cb [<8142cae0>] schedule+0xf/0x11 [<8142f9a6>] work_resched+0x5/0x30 -> #3 (&rq->lock){-.-.-.}: [<8104a942>] lock_acquire+0x92/0x101 [<8142f04c>] _raw_spin_lock+0x21/0x30 [<81040873>] __task_rq_lock+0x33/0x3a [<8104184c>] wake_up_new_task+0x25/0xc2 [<8102474b>] do_fork+0x15c/0x2a0 [<810248a9>] kernel_thread+0x1a/0x1f [<814232a2>] rest_init+0x1a/0x10e [<817af949>] start_kernel+0x303/0x308 [<817af2ab>] 
i386_start_kernel+0x79/0x7d -> #2 (&p->pi_lock){-.-...}: [<8104a942>] lock_acquire+0x92/0x101 [<8142f11d>] _raw_spin_lock_irqsave+0x2e/0x3e [<810413dd>] try_to_wake_up+0x1d/0xd6 [<810414cd>] default_wake_function+0xb/0xd [<810461f3>] __wake_up_common+0x39/0x59 [<81046346>] __wake_up+0x29/0x3b [<811b8733>] tty_wakeup+0x49/0x51 [<811c3568>] uart_write_wakeup+0x17/0x19 [<811c5dc1>] serial8250_tx_chars+0xbc/0xfb [<811c5f28>] serial8250_handle_irq+0x54/0x6a [<811c5f57>] serial8250_default_handle_irq+0x19/0x1c [<811c56d8>] serial8250_interrupt+0x38/0x9e [<810510e7>] handle_irq_event_percpu+0x5f/0x1e2 [<81051296>] handle_irq_event+0x2c/0x43 [<81052cee>] handle_level_irq+0x57/0x80 [<81002a72>] handle_irq+0x46/0x5c [<810027df>] do_IRQ+0x32/0x89 [<8143036e>] common_interrupt+0x2e/0x33 [<8142f23c>] _raw_spin_unlock_irqrestore+0x3f/0x49 [<811c25a4>] uart_start+0x2d/0x32 [<811c2c04>] uart_write+0xc7/0xd6 [<811bc6f6>] n_tty_write+0xb8/0x35e [<811b9beb>] tty_write+0x163/0x1e4 [<811b9cd9>] redirected_tty_write+0x6d/0x75 [<810b6ed6>] vfs_write+0x75/0xb0 [<810b7265>] SyS_write+0x44/0x77 [<8142f8ee>] syscall_call+0x7/0xb -> #1 (&tty->write_wait){-.....}: [<8104a942>] lock_acquire+0x92/0x101 [<8142f11d>] _raw_spin_lock_irqsave+0x2e/0x3e [<81046332>] __wake_up+0x15/0x3b [<811b8733>] tty_wakeup+0x49/0x51 [<811c3568>] uart_write_wakeup+0x17/0x19 [<811c5dc1>] serial8250_tx_chars+0xbc/0xfb [<811c5f28>] serial8250_handle_irq+0x54/0x6a [<811c5f57>] serial8250_default_handle_irq+0x19/0x1c [<811c56d8>] serial8250_interrupt+0x38/0x9e [<810510e7>] handle_irq_event_percpu+0x5f/0x1e2 [<81051296>] handle_irq_event+0x2c/0x43 [<81052cee>] handle_level_irq+0x57/0x80 [<81002a72>] handle_irq+0x46/0x5c [<810027df>] do_IRQ+0x32/0x89 [<8143036e>] common_interrupt+0x2e/0x33 [<8142f23c>] _raw_spin_unlock_irqrestore+0x3f/0x49 [<811c25a4>] uart_start+0x2d/0x32 [<811c2c04>] uart_write+0xc7/0xd6 [<811bc6f6>] n_tty_write+0xb8/0x35e [<811b9beb>] tty_write+0x163/0x1e4 [<811b9cd9>] redirected_tty_write+0x6d/0x75 
[<810b6ed6>] vfs_write+0x75/0xb0 [<810b7265>] SyS_write+0x44/0x77 [<8142f8ee>] syscall_call+0x7/0xb -> #0 (&port_lock_key){-.....}: [<8104a62d>] __lock_acquire+0x9ea/0xc6d [<8104a942>] lock_acquire+0x92/0x101 [<8142f11d>] _raw_spin_lock_irqsave+0x2e/0x3e [<811c60be>] serial8250_console_write+0x8c/0x10c [<8104e402>] call_console_drivers.constprop.31+0x87/0x118 [<8104f5d5>] console_unlock+0x1d7/0x398 [<8104fb70>] vprintk_emit+0x3da/0x3e4 [<81425f76>] printk+0x17/0x19 [<8105bfa0>] clockevents_program_min_delta+0x104/0x116 [<8105c548>] clockevents_program_event+0xe7/0xf3 [<8105cc1c>] tick_program_event+0x1e/0x23 [<8103c43c>] hrtimer_force_reprogram+0x88/0x8f [<8103c49e>] __remove_hrtimer+0x5b/0x79 [<8103cb21>] hrtimer_try_to_cancel+0x49/0x66 [<8103cb4b>] hrtimer_cancel+0xd/0x18 [<8107f102>] perf_swevent_cancel_hrtimer.part.60+0x2b/0x30 [<81080705>] task_clock_event_stop+0x20/0x64 [<81080756>] task_clock_event_del+0xd/0xf [<81081350>] event_sched_out+0xab/0x11e [<810813e0>] group_sched_out+0x1d/0x66 [<81081682>] ctx_sched_out+0xaf/0xbf [<81081e04>] __perf_event_task_sched_out+0x1ed/0x34f [<8142cacc>] __schedule+0x4c6/0x4cb [<8142cae0>] schedule+0xf/0x11 [<8142f9a6>] work_resched+0x5/0x30 other info that might help us debug this: Chain exists of: &port_lock_key --> &ctx->lock --> hrtimer_bases.lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(hrtimer_bases.lock); lock(&ctx->lock); lock(hrtimer_bases.lock); lock(&port_lock_key); *** DEADLOCK *** 4 locks held by trinity-main/74: #0: (&rq->lock){-.-.-.}, at: [<8142c6f3>] __schedule+0xed/0x4cb #1: (&ctx->lock){......}, at: [<81081df3>] __perf_event_task_sched_out+0x1dc/0x34f #2: (hrtimer_bases.lock){-.-...}, at: [<8103caeb>] hrtimer_try_to_cancel+0x13/0x66 #3: (console_lock){+.+...}, at: [<8104fb5d>] vprintk_emit+0x3c7/0x3e4 stack backtrace: CPU: 0 PID: 74 Comm: trinity-main Not tainted 3.15.0-rc8-06195-g939f04b #2 00000000 81c3a310 8b995c14 81426f69 8b995c44 81425a99 8161f671 8161f570 8161f538 8161f559 
8161f538 8b995c78 8b142bb0 00000004 8b142fdc 8b142bb0 8b995ca8 8104a62d 8b142fac 000016f2 81c3a310 00000001 00000001 00000003 Call Trace: [<81426f69>] dump_stack+0x16/0x18 [<81425a99>] print_circular_bug+0x18f/0x19c [<8104a62d>] __lock_acquire+0x9ea/0xc6d [<8104a942>] lock_acquire+0x92/0x101 [<811c60be>] ? serial8250_console_write+0x8c/0x10c [<811c6032>] ? wait_for_xmitr+0x76/0x76 [<8142f11d>] _raw_spin_lock_irqsave+0x2e/0x3e [<811c60be>] ? serial8250_console_write+0x8c/0x10c [<811c60be>] serial8250_console_write+0x8c/0x10c [<8104af87>] ? lock_release+0x191/0x223 [<811c6032>] ? wait_for_xmitr+0x76/0x76 [<8104e402>] call_console_drivers.constprop.31+0x87/0x118 [<8104f5d5>] console_unlock+0x1d7/0x398 [<8104fb70>] vprintk_emit+0x3da/0x3e4 [<81425f76>] printk+0x17/0x19 [<8105bfa0>] clockevents_program_min_delta+0x104/0x116 [<8105cc1c>] tick_program_event+0x1e/0x23 [<8103c43c>] hrtimer_force_reprogram+0x88/0x8f [<8103c49e>] __remove_hrtimer+0x5b/0x79 [<8103cb21>] hrtimer_try_to_cancel+0x49/0x66 [<8103cb4b>] hrtimer_cancel+0xd/0x18 [<8107f102>] perf_swevent_cancel_hrtimer.part.60+0x2b/0x30 [<81080705>] task_clock_event_stop+0x20/0x64 [<81080756>] task_clock_event_del+0xd/0xf [<81081350>] event_sched_out+0xab/0x11e [<810813e0>] group_sched_out+0x1d/0x66 [<81081682>] ctx_sched_out+0xaf/0xbf [<81081e04>] __perf_event_task_sched_out+0x1ed/0x34f [<8104416d>] ? __dequeue_entity+0x23/0x27 [<81044505>] ? pick_next_task_fair+0xb1/0x120 [<8142cacc>] __schedule+0x4c6/0x4cb [<81047574>] ? trace_hardirqs_off_caller+0xd7/0x108 [<810475b0>] ? trace_hardirqs_off+0xb/0xd [<81056346>] ? rcu_irq_exit+0x64/0x77 Fix the problem by using printk_deferred() which does not call into the scheduler. 
Reported-by: Fengguang Wu Signed-off-by: Jan Kara Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index ad362c260ef4..9c94c19f1305 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -146,7 +146,8 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev) { /* Nothing to do if we already reached the limit */ if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { - printk(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); + printk_deferred(KERN_WARNING + "CE: Reprogramming failure. Giving up\n"); dev->next_event.tv64 = KTIME_MAX; return -ETIME; } @@ -159,9 +160,10 @@ static int clockevents_increase_min_delta(struct clock_event_device *dev) if (dev->min_delta_ns > MIN_DELTA_LIMIT) dev->min_delta_ns = MIN_DELTA_LIMIT; - printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", - dev->name ? dev->name : "?", - (unsigned long long) dev->min_delta_ns); + printk_deferred(KERN_WARNING + "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", + (unsigned long long) dev->min_delta_ns); return 0; } -- cgit v1.2.3