From de9b8f5dcbd94bfb1d249907a635f1fb1968e19c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 13 Aug 2015 23:09:29 +0200
Subject: sched: Fix crash trying to dequeue/enqueue the idle thread

Sasha reports that his virtual machine tries to schedule the idle thread
since commit 6c37067e2786 ("sched: Change the sched_class::set_cpus_allowed()
calling context").

The backtrace shows this happening from idle_thread_get()->init_idle(),
which is the _second_ init_idle() invocation on that task_struct, the first
being done through idle_init()->fork_idle(). (this code is insane...)

Because we call init_idle() twice in a row, its ->sched_class ==
&idle_sched_class and ->on_rq == TASK_ON_RQ_QUEUED. This means
do_set_cpus_allowed() thinks we're queued and will call dequeue_task(),
which is implemented with BUG() for the idle class, seeing how dequeueing
the idle task is a daft thing.

Aside from the whole insanity of calling init_idle() _twice_, change the
code to call set_cpus_allowed_common() instead, as this is 'obviously'
before the idle task ever gets to run.

Reported-by: Sasha Levin
Tested-by: Sasha Levin
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Fixes: 6c37067e2786 ("sched: Change the sched_class::set_cpus_allowed() calling context")
Signed-off-by: Ingo Molnar
---
 kernel/sched/core.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 97d276ff1edb..f0d043ec0182 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4927,7 +4927,15 @@ void init_idle(struct task_struct *idle, int cpu)
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
 
-	do_set_cpus_allowed(idle, cpumask_of(cpu));
+#ifdef CONFIG_SMP
+	/*
+	 * Its possible that init_idle() gets called multiple times on a task,
+	 * in that case do_set_cpus_allowed() will not do the right thing.
+	 *
+	 * And since this is boot we can forgo the serialization.
+	 */
+	set_cpus_allowed_common(idle, cpumask_of(cpu));
+#endif
 	/*
 	 * We're having a chicken and egg problem, even though we are
 	 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -4944,7 +4952,7 @@ void init_idle(struct task_struct *idle, int cpu)
 
 	rq->curr = rq->idle = idle;
 	idle->on_rq = TASK_ON_RQ_QUEUED;
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 	idle->on_cpu = 1;
 #endif
 	raw_spin_unlock(&rq->lock);
@@ -4959,7 +4967,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	idle->sched_class = &idle_sched_class;
 	ftrace_graph_init_idle_task(idle, cpu);
 	vtime_init_idle(idle, cpu);
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
 }
--
cgit v1.2.3
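For context on why the common helper is safe here: in the ~4.3-era scheduler,
do_set_cpus_allowed() goes through the dequeue/enqueue path that trips the idle
class BUG(), while set_cpus_allowed_common() only updates the cpumask
bookkeeping. The following is a simplified sketch of the two helpers
(paraphrased from kernel/sched/core.c of that era, not a verbatim quote; the
real do_set_cpus_allowed() also handles currently-running tasks):

/* Simplified sketch, not the exact kernel code. */
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	struct rq *rq = task_rq(p);
	bool queued = task_on_rq_queued(p);	/* true after the first init_idle() */

	if (queued)
		dequeue_task(rq, p, 0);		/* idle_sched_class: BUG() */

	p->sched_class->set_cpus_allowed(p, new_mask);

	if (queued)
		enqueue_task(rq, p, 0);		/* idle_sched_class: BUG() */
}

void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
	/* pure bookkeeping, no dequeue/enqueue, hence safe on the idle task */
	cpumask_copy(&p->cpus_allowed, new_mask);
	p->nr_cpus_allowed = cpumask_weight(new_mask);
}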
From f55fc2a57cc9ca3b1bb4fb8eb25b6e1989e5b993 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 9 Sep 2015 19:06:33 +0200
Subject: perf: Restructure perf syscall point of no return

The exclusive_event_installable() stuff only works because it is mutually
exclusive with the grouping bits.

Rework the code such that there is a sane place to error out before we
go do things we cannot undo.

Signed-off-by: Peter Zijlstra (Intel)
Cc: Alexander Shishkin
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 49 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index f548f69c4299..39679f749500 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8297,13 +8297,30 @@ SYSCALL_DEFINE5(perf_event_open,
 
 	if (move_group) {
 		gctx = group_leader->ctx;
+		mutex_lock_double(&gctx->mutex, &ctx->mutex);
+	} else {
+		mutex_lock(&ctx->mutex);
+	}
+
+	/*
+	 * Must be under the same ctx::mutex as perf_install_in_context(),
+	 * because we need to serialize with concurrent event creation.
+	 */
+	if (!exclusive_event_installable(event, ctx)) {
+		/* exclusive and group stuff are assumed mutually exclusive */
+		WARN_ON_ONCE(move_group);
+
+		err = -EBUSY;
+		goto err_locked;
+	}
+	WARN_ON_ONCE(ctx->parent_ctx);
+
+	if (move_group) {
 		/*
 		 * See perf_event_ctx_lock() for comments on the details
 		 * of swizzling perf_event::ctx.
 		 */
-		mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
 		perf_remove_from_context(group_leader, false);
 
 		list_for_each_entry(sibling, &group_leader->sibling_list,
@@ -8311,13 +8328,7 @@ SYSCALL_DEFINE5(perf_event_open,
 			perf_remove_from_context(sibling, false);
 			put_ctx(gctx);
 		}
-	} else {
-		mutex_lock(&ctx->mutex);
-	}
-
-	WARN_ON_ONCE(ctx->parent_ctx);
 
-	if (move_group) {
 		/*
 		 * Wait for everybody to stop referencing the events through
 		 * the old lists, before installing it on new lists.
@@ -8349,22 +8360,20 @@ SYSCALL_DEFINE5(perf_event_open,
 		perf_event__state_init(group_leader);
 		perf_install_in_context(ctx, group_leader, group_leader->cpu);
 		get_ctx(ctx);
-	}
 
-	if (!exclusive_event_installable(event, ctx)) {
-		err = -EBUSY;
-		mutex_unlock(&ctx->mutex);
-		fput(event_file);
-		goto err_context;
+		/*
+		 * Now that all events are installed in @ctx, nothing
+		 * references @gctx anymore, so drop the last reference we have
+		 * on it.
+		 */
+		put_ctx(gctx);
 	}
 
 	perf_install_in_context(ctx, event, event->cpu);
 	perf_unpin_context(ctx);
 
-	if (move_group) {
+	if (move_group)
 		mutex_unlock(&gctx->mutex);
-		put_ctx(gctx);
-	}
 	mutex_unlock(&ctx->mutex);
 
 	put_online_cpus();
@@ -8391,6 +8400,12 @@ SYSCALL_DEFINE5(perf_event_open,
 	fd_install(event_fd, event_file);
 	return event_fd;
 
+err_locked:
+	if (move_group)
+		mutex_unlock(&gctx->mutex);
+	mutex_unlock(&ctx->mutex);
+/* err_file: */
+	fput(event_file);
 err_context:
 	perf_unpin_context(ctx);
 	put_ctx(ctx);
--
cgit v1.2.3
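As an aid to reading the scattered hunks above, the resulting
perf_event_open() control flow looks roughly like this (an outline distilled
from the diff, not verbatim kernel code):

/*
 * perf_event_open() after this patch, roughly:
 *
 *	lock ctx->mutex (and gctx->mutex via mutex_lock_double() for move_group);
 *	if (!exclusive_event_installable(event, ctx))
 *		goto err_locked;		// still undoable: nothing moved yet
 *
 *	if (move_group) {
 *		remove group_leader and siblings from gctx;	// point of no return
 *		wait for everybody to stop referencing the old lists;
 *		install leader and siblings into ctx;
 *		put_ctx(gctx);
 *	}
 *	perf_install_in_context(ctx, event, event->cpu);
 *	unlock; fd_install(); return event_fd;
 *
 * err_locked:
 *	unlock both mutexes, fput(event_file), fall through to err_context.
 */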
From a723968c0ed36db676478c3d26078f13484fe01c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 9 Sep 2015 19:06:33 +0200
Subject: perf: Fix u16 overflows

Vince reported that it's possible to overflow the various size fields and
end up with weird state if you stick too many events in a group.

Put a lid on this by requiring the fixed record size not to exceed 16k.
This is still a fair number of events (a silly number, really) and leaves
plenty of room for callchains and DWARF stack data while also avoiding
overflowing the u16 variables.

Reported-by: Vince Weaver
Signed-off-by: Peter Zijlstra (Intel)
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 50 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 39679f749500..dbb5329b6a3a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1243,11 +1243,7 @@ static inline void perf_event__state_init(struct perf_event *event)
 					      PERF_EVENT_STATE_INACTIVE;
 }
 
-/*
- * Called at perf_event creation and when events are attached/detached from a
- * group.
- */
-static void perf_event__read_size(struct perf_event *event)
+static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
 {
 	int entry = sizeof(u64); /* value */
 	int size = 0;
@@ -1263,7 +1259,7 @@ static void perf_event__read_size(struct perf_event *event)
 		entry += sizeof(u64);
 
 	if (event->attr.read_format & PERF_FORMAT_GROUP) {
-		nr += event->group_leader->nr_siblings;
+		nr += nr_siblings;
 		size += sizeof(u64);
 	}
 
@@ -1271,14 +1267,11 @@ static void perf_event__read_size(struct perf_event *event)
 	event->read_size = size;
 }
 
-static void perf_event__header_size(struct perf_event *event)
+static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 {
 	struct perf_sample_data *data;
-	u64 sample_type = event->attr.sample_type;
 	u16 size = 0;
 
-	perf_event__read_size(event);
-
 	if (sample_type & PERF_SAMPLE_IP)
 		size += sizeof(data->ip);
 
@@ -1303,6 +1296,17 @@ static void perf_event__header_size(struct perf_event *event)
 	event->header_size = size;
 }
 
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__header_size(struct perf_event *event)
+{
+	__perf_event_read_size(event,
+			       event->group_leader->nr_siblings);
+	__perf_event_header_size(event, event->attr.sample_type);
+}
+
 static void perf_event__id_header_size(struct perf_event *event)
 {
 	struct perf_sample_data *data;
@@ -1330,6 +1334,27 @@ static void perf_event__id_header_size(struct perf_event *event)
 	event->id_header_size = size;
 }
 
+static bool perf_event_validate_size(struct perf_event *event)
+{
+	/*
+	 * The values computed here will be over-written when we actually
+	 * attach the event.
+	 */
+	__perf_event_read_size(event, event->group_leader->nr_siblings + 1);
+	__perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
+	perf_event__id_header_size(event);
+
+	/*
+	 * Sum the lot; should not exceed the 64k limit we have on records.
+	 * Conservative limit to allow for callchains and other variable fields.
+	 */
+	if (event->read_size + event->header_size +
+	    event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
+		return false;
+
+	return true;
+}
+
 static void perf_group_attach(struct perf_event *event)
 {
 	struct perf_event *group_leader = event->group_leader, *pos;
@@ -8302,6 +8327,11 @@ SYSCALL_DEFINE5(perf_event_open,
 		mutex_lock(&ctx->mutex);
 	}
 
+	if (!perf_event_validate_size(event)) {
+		err = -E2BIG;
+		goto err_locked;
+	}
+
 	/*
 	 * Must be under the same ctx::mutex as perf_install_in_context(),
 	 * because we need to serialize with concurrent event creation.
--
cgit v1.2.3
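To make the overflow concrete: per __perf_event_read_size() above, with
PERF_FORMAT_GROUP | PERF_FORMAT_ID each group member contributes 16 bytes to
read_size, so a few thousand siblings are enough to wrap a u16. Below is a
small stand-alone illustration of the arithmetic only (editor's sketch in
plain C; read_size() here is a local helper, not a kernel API):

#include <stdint.h>
#include <stdio.h>

/* Mirrors __perf_event_read_size() for PERF_FORMAT_GROUP | PERF_FORMAT_ID:
 * one u64 for nr, plus a (value, id) pair of u64s per group member. */
static unsigned int read_size(unsigned int nr_siblings)
{
	unsigned int entry = 8 + 8;		/* value + id */

	return 8 + (nr_siblings + 1) * entry;	/* nr + leader + siblings */
}

int main(void)
{
	for (unsigned int n = 1000; n <= 5000; n += 1000) {
		unsigned int size = read_size(n);
		uint16_t as_u16 = (uint16_t)size;	/* what a u16 field would hold */

		printf("%4u siblings: read_size=%6u  as u16=%5u%s\n",
		       n, size, as_u16,
		       size > 0xffff ? "  <- wrapped" : "");
	}
	return 0;
}

With the new perf_event_validate_size() check, such a group is rejected with
-E2BIG long before the u16 wrap (or even the 64k record limit) is reached.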
From f73e22ab450140830005581c2c7ec389791a1b8d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 9 Sep 2015 20:48:22 +0200
Subject: perf: Fix races in computing the header sizes

There are two races with the current code:

 - Another event can join the group and compute a larger header_size
   concurrently; if the smaller store wins we'll have an incorrect
   header_size set.

 - We compute the header_size after the event becomes active, therefore
   it's possible to use the size before it's computed.

Remedy the first by moving the computation inside the ctx::mutex lock,
and the second by placing it _before_ perf_install_in_context().

Signed-off-by: Peter Zijlstra (Intel)
Cc: Arnaldo Carvalho de Melo
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar
---
 kernel/events/core.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index dbb5329b6a3a..b11756f9b6dc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8399,6 +8399,15 @@ SYSCALL_DEFINE5(perf_event_open,
 		put_ctx(gctx);
 	}
 
+	/*
+	 * Precalculate sample_data sizes; do while holding ctx::mutex such
+	 * that we're serialized against further additions and before
+	 * perf_install_in_context() which is the point the event is active and
+	 * can use these values.
+	 */
+	perf_event__header_size(event);
+	perf_event__id_header_size(event);
+
 	perf_install_in_context(ctx, event, event->cpu);
 	perf_unpin_context(ctx);
 
@@ -8414,12 +8423,6 @@ SYSCALL_DEFINE5(perf_event_open,
 	list_add_tail(&event->owner_entry, &current->perf_event_list);
 	mutex_unlock(&current->perf_event_mutex);
 
-	/*
-	 * Precalculate sample_data sizes
-	 */
-	perf_event__header_size(event);
-	perf_event__id_header_size(event);
-
 	/*
 	 * Drop the reference on the group_event after placing the
 	 * new event on the sibling_list. This ensures destruction
--
cgit v1.2.3

From 19a5ecde086a6a5287978b12ae948fa691b197b7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Sun, 20 Sep 2015 21:01:22 -0700
Subject: rcu: Suppress lockdep false positive for rcp->exp_funnel_mutex

In kernels built with CONFIG_PREEMPT=y, synchronize_rcu_expedited()
invokes synchronize_sched_expedited() while holding RCU-preempt's root
rcu_node structure's ->exp_funnel_mutex, which is acquired after the
rcu_data structure's ->exp_funnel_mutex. The first thing that
synchronize_sched_expedited() will do is acquire RCU-sched's rcu_data
structure's ->exp_funnel_mutex. There is no danger of an actual deadlock
because the locking order is always from RCU-preempt's expedited mutexes
to those of RCU-sched. Unfortunately, lockdep considers both rcu_data
structures' ->exp_funnel_mutex to be in the same lock class and therefore
reports a deadlock cycle.

This commit silences this false positive by placing RCU-sched's rcu_data
structures' ->exp_funnel_mutex locks into their own lock class.

Reported-by: Sasha Levin
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9f75f25cc5d9..775d36cc0050 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3868,6 +3868,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
 static void __init
 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 {
+	static struct lock_class_key rcu_exp_sched_rdp_class;
 	unsigned long flags;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
@@ -3883,6 +3884,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 	mutex_init(&rdp->exp_funnel_mutex);
 	rcu_boot_init_nocb_percpu_data(rdp);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
+	if (rsp == &rcu_sched_state)
+		lockdep_set_class_and_name(&rdp->exp_funnel_mutex,
+					   &rcu_exp_sched_rdp_class,
+					   "rcu_data_exp_sched");
 }
 
 /*
--
cgit v1.2.3

From 21199f27b430576552b26210b3194a363d7f05cd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 16 Sep 2015 16:10:40 +0200
Subject: locking/lockdep: Fix hlock->pin_count reset on lock stack rebuilds

Various people reported hitting the "unpinning an unpinned lock"
warning. As it turns out there are two places where we take a lock out
of the middle of the held-lock stack, and in those cases it would fail
to preserve the pin_count when rebuilding the lock stack.

Reported-by: Sasha Levin
Reported-by: Tim Spriggs
Tested-by: Sasha Levin
Signed-off-by: Peter Zijlstra (Intel)
Cc: Andrew Morton
Cc: Linus Torvalds
Cc: Paul E. McKenney
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: davej@codemonkey.org.uk
Link: http://lkml.kernel.org/r/20150916141040.GA11639@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar
---
 kernel/locking/lockdep.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 8acfbf773e06..4e49cc4c9952 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3068,7 +3068,7 @@ static int __lock_is_held(struct lockdep_map *lock);
 static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 			  int trylock, int read, int check, int hardirqs_off,
 			  struct lockdep_map *nest_lock, unsigned long ip,
-			  int references)
+			  int references, int pin_count)
 {
 	struct task_struct *curr = current;
 	struct lock_class *class = NULL;
@@ -3157,7 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	hlock->waittime_stamp = 0;
 	hlock->holdtime_stamp = lockstat_clock();
 #endif
-	hlock->pin_count = 0;
+	hlock->pin_count = pin_count;
 
 	if (check && !mark_irqflags(curr, hlock))
 		return 0;
@@ -3343,7 +3343,7 @@ found_it:
 			hlock_class(hlock)->subclass, hlock->trylock,
 				hlock->read, hlock->check, hlock->hardirqs_off,
 				hlock->nest_lock, hlock->acquire_ip,
-				hlock->references))
+				hlock->references, hlock->pin_count))
 			return 0;
 	}
 
@@ -3433,7 +3433,7 @@ found_it:
 			hlock_class(hlock)->subclass, hlock->trylock,
 				hlock->read, hlock->check, hlock->hardirqs_off,
 				hlock->nest_lock, hlock->acquire_ip,
-				hlock->references))
+				hlock->references, hlock->pin_count))
 			return 0;
 	}
 
@@ -3583,7 +3583,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	current->lockdep_recursion = 1;
 	trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
 	__lock_acquire(lock, subclass, trylock, read, check,
-		       irqs_disabled_flags(flags), nest_lock, ip, 0);
+		       irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
--
cgit v1.2.3
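For background on the warning itself: hlock->pin_count backs the
lockdep_pin_lock()/lockdep_unpin_lock() annotations (used, for example, on the
scheduler's rq->lock), and "unpinning an unpinned lock" fires when an unpin
finds pin_count == 0 -- which is exactly what happened once rebuilt held-lock
entries lost their count. A rough usage sketch follows (simplified, with names
as in the ~4.3 lockdep; not a verbatim kernel quote):

	raw_spin_lock(&rq->lock);
	lockdep_pin_lock(&rq->lock);	/* bumps the held lock's pin_count */

	/* ... code that relies on rq->lock not being dropped behind its back ... */

	lockdep_unpin_lock(&rq->lock);	/* warns if pin_count is already 0 */
	raw_spin_unlock(&rq->lock);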