From b00c52dae6d9ee8d0f2407118ef6544ae5524781 Mon Sep 17 00:00:00 2001 From: Wenwei Tao Date: Fri, 13 May 2016 22:59:20 +0800 Subject: cgroup: remove redundant cleanup in css_create When create css failed, before call css_free_rcu_fn, we remove the css id and exit the percpu_ref, but we will do these again in css_free_work_fn, so they are redundant. Especially the css id, that would cause problem if we remove it twice, since it may be assigned to another css after the first remove. tj: This was broken by two commits updating the free path without synchronizing the creation failure path. This can be easily triggered by trying to create more than 64k memory cgroups. Signed-off-by: Wenwei Tao Signed-off-by: Tejun Heo Cc: Vladimir Davydov Fixes: 9a1049da9bd2 ("percpu-refcount: require percpu_ref to be exited explicitly") Fixes: 01e586598b22 ("cgroup: release css->id after css_free") Cc: stable@vger.kernel.org # v3.17+ --- kernel/cgroup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 86cb5c6e8932..789b84f973c9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5150,7 +5150,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL); if (err < 0) - goto err_free_percpu_ref; + goto err_free_css; css->id = err; /* @css is ready to be brought online now, make it visible */ @@ -5174,9 +5174,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err_list_del: list_del_rcu(&css->sibling); - cgroup_idr_remove(&ss->css_idr, css->id); -err_free_percpu_ref: - percpu_ref_exit(&css->refcnt); err_free_css: call_rcu(&css->rcu_head, css_free_rcu_fn); return ERR_PTR(err); -- cgit v1.2.3 From b7fa30c9cc48c4f55663420472505d3b4f6e1705 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Jun 2016 15:07:50 +0200 Subject: sched/fair: Fix post_init_entity_util_avg() serialization Chris Wilson reported a divide by 0 at: post_init_entity_util_avg(): > 725 if (cfs_rq->avg.util_avg != 0) { > 726 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; > -> 727 sa->util_avg /= (cfs_rq->avg.load_avg + 1); > 728 > 729 if (sa->util_avg > cap) > 730 sa->util_avg = cap; > 731 } else { Which given the lack of serialization, and the code generated from update_cfs_rq_load_avg() is entirely possible: if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); removed_load = 1; } turns into: ffffffff81087064: 49 8b 85 98 00 00 00 mov 0x98(%r13),%rax ffffffff8108706b: 48 85 c0 test %rax,%rax ffffffff8108706e: 74 40 je ffffffff810870b0 ffffffff81087070: 4c 89 f8 mov %r15,%rax ffffffff81087073: 49 87 85 98 00 00 00 xchg %rax,0x98(%r13) ffffffff8108707a: 49 29 45 70 sub %rax,0x70(%r13) ffffffff8108707e: 4c 89 f9 mov %r15,%rcx ffffffff81087081: bb 01 00 00 00 mov $0x1,%ebx ffffffff81087086: 49 83 7d 70 00 cmpq $0x0,0x70(%r13) ffffffff8108708b: 49 0f 49 4d 70 cmovns 0x70(%r13),%rcx Which you'll note ends up with 'sa->load_avg - r' in memory at ffffffff8108707a. By calling post_init_entity_util_avg() under rq->lock we're sure to be fully serialized against PELT updates and cannot observe intermediate state like this. Reported-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Cc: Andrey Ryabinin Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Cc: bsegall@google.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: steve.muckle@linaro.org Fixes: 2b8c41daba32 ("sched/fair: Initiate a new task's util avg to a bounded value") Link: http://lkml.kernel.org/r/20160609130750.GQ30909@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +-- kernel/sched/fair.c | 8 +++++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 017d5394f5dc..13d0896aff87 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2535,10 +2535,9 @@ void wake_up_new_task(struct task_struct *p) */ set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif - /* Post initialize new task's util average when its cfs_rq is set */ + rq = __task_rq_lock(p, &rf); post_init_entity_util_avg(&p->se); - rq = __task_rq_lock(p, &rf); activate_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 218f8e83db73..4e33ad12bb68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8496,8 +8496,9 @@ void free_fair_sched_group(struct task_group *tg) int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_rq *cfs_rq; struct sched_entity *se; + struct cfs_rq *cfs_rq; + struct rq *rq; int i; tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8512,6 +8513,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(i) { + rq = cpu_rq(i); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) @@ -8525,7 +8528,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); + + raw_spin_lock_irq(&rq->lock); post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); } return 1; -- cgit v1.2.3 From eda8dca519269c92a0771668b3d5678792de7b78 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 13 Jun 2016 02:32:09 -0500 Subject: sched/debug: Fix deadlock when enabling sched events I see a hang when enabling sched events: echo 1 > /sys/kernel/debug/tracing/events/sched/enable The printk buffer shows: BUG: spinlock recursion on CPU#1, swapper/1/0 lock: 0xffff88007d5d8c00, .magic: dead4ead, .owner: swapper/1/0, .owner_cpu: 1 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc2+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014 ... Call Trace: [] dump_stack+0x85/0xc2 [] spin_dump+0x78/0xc0 [] do_raw_spin_lock+0x11a/0x150 [] _raw_spin_lock+0x61/0x80 [] ? try_to_wake_up+0x256/0x4e0 [] try_to_wake_up+0x256/0x4e0 [] ? _raw_spin_unlock_irqrestore+0x4a/0x80 [] wake_up_process+0x15/0x20 [] insert_work+0x84/0xc0 [] __queue_work+0x18f/0x660 [] queue_work_on+0x46/0x90 [] drm_fb_helper_dirty.isra.11+0xcb/0xe0 [drm_kms_helper] [] drm_fb_helper_sys_imageblit+0x30/0x40 [drm_kms_helper] [] soft_cursor+0x1ad/0x230 [] bit_cursor+0x649/0x680 [] ? update_attr.isra.2+0x90/0x90 [] fbcon_cursor+0x14a/0x1c0 [] hide_cursor+0x28/0x90 [] vt_console_print+0x3bf/0x3f0 [] call_console_drivers.constprop.24+0x183/0x200 [] console_unlock+0x3d4/0x610 [] vprintk_emit+0x3c5/0x610 [] vprintk_default+0x29/0x40 [] printk+0x57/0x73 [] enqueue_entity+0xc2e/0xc70 [] enqueue_task_fair+0x59/0xab0 [] ? kvm_sched_clock_read+0x9/0x20 [] ? sched_clock+0x9/0x10 [] activate_task+0x5c/0xa0 [] ttwu_do_activate+0x54/0xb0 [] sched_ttwu_pending+0x7a/0xb0 [] scheduler_ipi+0x61/0x170 [] smp_trace_reschedule_interrupt+0x4f/0x2a0 [] trace_reschedule_interrupt+0x96/0xa0 [] ? native_safe_halt+0x6/0x10 [] ? trace_hardirqs_on+0xd/0x10 [] default_idle+0x20/0x1a0 [] arch_cpu_idle+0xf/0x20 [] default_idle_call+0x2f/0x50 [] cpu_startup_entry+0x37e/0x450 [] start_secondary+0x160/0x1a0 Note the hang only occurs when echoing the above from a physical serial console, not from an ssh session. The bug is caused by a deadlock where the task is trying to grab the rq lock twice because printk()'s aren't safe in sched code. Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Matt Fleming Cc: Mel Gorman Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default") Link: http://lkml.kernel.org/r/20160613073209.gdvdybiruljbkn3p@treble Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4e33ad12bb68..a2348deab7a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3246,7 +3246,7 @@ static inline void check_schedstat_required(void) trace_sched_stat_iowait_enabled() || trace_sched_stat_blocked_enabled() || trace_sched_stat_runtime_enabled()) { - pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " + printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " "stat_blocked and stat_runtime require the " "kernel parameter schedstats=enabled or " "kernel.sched_schedstats=1\n"); -- cgit v1.2.3 From 57675cb976eff977aefb428e68e4e0236d48a9ff Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 9 Jun 2016 15:20:05 +0300 Subject: kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w Lengthy output of sysrq-w may take a lot of time on slow serial console. Currently we reset NMI-watchdog on the current CPU to avoid spurious lockup messages. Sometimes this doesn't work since softlockup watchdog might trigger on another CPU which is waiting for an IPI to proceed. We reset softlockup watchdogs on all CPUs, but we do this only after listing all tasks, and this may be too late on a busy system. So, reset watchdogs CPUs earlier, in for_each_process_thread() loop. Signed-off-by: Andrey Ryabinin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Link: http://lkml.kernel.org/r/1465474805-14641-1-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13d0896aff87..4135ac83cb65 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5147,14 +5147,16 @@ void show_state_filter(unsigned long state_filter) /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: + * Also, reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI. */ touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); } - touch_all_softlockup_watchdogs(); - #ifdef CONFIG_SCHED_DEBUG if (!state_filter) sysrq_sched_debug_show(); -- cgit v1.2.3 From 19de99f70b87fcc3338da52a89c439b088cbff71 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 15 Jun 2016 18:25:38 -0700 Subject: bpf: fix matching of data/data_end in verifier The ctx structure passed into bpf programs is different depending on bpf program type. The verifier incorrectly marked ctx->data and ctx->data_end access based on ctx offset only. That caused loads in tracing programs int bpf_prog(struct pt_regs *ctx) { .. ctx->ax .. } to be incorrectly marked as PTR_TO_PACKET which later caused verifier to reject the program that was actually valid in tracing context. Fix this by doing program type specific matching of ctx offsets. Fixes: 969bf05eb3ce ("bpf: direct packet access") Reported-by: Sasha Goldshtein Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 28 +++++++++++++++++++++++++++- kernel/bpf/verifier.c | 41 +++++++---------------------------------- kernel/trace/bpf_trace.c | 6 ++++-- net/core/filter.c | 16 ++++++++++++++-- 4 files changed, 52 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8ee27b8afe81..8269cafc6eb1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -111,6 +111,31 @@ enum bpf_access_type { BPF_WRITE = 2 }; +/* types of values stored in eBPF registers */ +enum bpf_reg_type { + NOT_INIT = 0, /* nothing was written into register */ + UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ + PTR_TO_CTX, /* reg points to bpf_context */ + CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ + PTR_TO_MAP_VALUE, /* reg points to map element value */ + PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ + FRAME_PTR, /* reg == frame_pointer */ + PTR_TO_STACK, /* reg == frame_pointer + imm */ + CONST_IMM, /* constant integer value */ + + /* PTR_TO_PACKET represents: + * skb->data + * skb->data + imm + * skb->data + (u16) var + * skb->data + (u16) var + imm + * if (range > 0) then [ptr, ptr + range - off) is safe to access + * if (id > 0) means that some 'var' was added + * if (off > 0) menas that 'imm' was added + */ + PTR_TO_PACKET, + PTR_TO_PACKET_END, /* skb->data + headlen */ +}; + struct bpf_prog; struct bpf_verifier_ops { @@ -120,7 +145,8 @@ struct bpf_verifier_ops { /* return true if 'size' wide access at offset 'off' within bpf_context * with 'type' (read or write) is allowed */ - bool (*is_valid_access)(int off, int size, enum bpf_access_type type); + bool (*is_valid_access)(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type); u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 668e07903c8f..eec9f90ba030 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -126,31 +126,6 @@ * are set to NOT_INIT to indicate that they are no longer readable. */ -/* types of values stored in eBPF registers */ -enum bpf_reg_type { - NOT_INIT = 0, /* nothing was written into register */ - UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ - PTR_TO_CTX, /* reg points to bpf_context */ - CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ - PTR_TO_MAP_VALUE, /* reg points to map element value */ - PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ - FRAME_PTR, /* reg == frame_pointer */ - PTR_TO_STACK, /* reg == frame_pointer + imm */ - CONST_IMM, /* constant integer value */ - - /* PTR_TO_PACKET represents: - * skb->data - * skb->data + imm - * skb->data + (u16) var - * skb->data + (u16) var + imm - * if (range > 0) then [ptr, ptr + range - off) is safe to access - * if (id > 0) means that some 'var' was added - * if (off > 0) menas that 'imm' was added - */ - PTR_TO_PACKET, - PTR_TO_PACKET_END, /* skb->data + headlen */ -}; - struct reg_state { enum bpf_reg_type type; union { @@ -695,10 +670,10 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields */ static int check_ctx_access(struct verifier_env *env, int off, int size, - enum bpf_access_type t) + enum bpf_access_type t, enum bpf_reg_type *reg_type) { if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t)) { + env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) { /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -798,21 +773,19 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off, mark_reg_unknown_value(state->regs, value_regno); } else if (reg->type == PTR_TO_CTX) { + enum bpf_reg_type reg_type = UNKNOWN_VALUE; + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose("R%d leaks addr into ctx\n", value_regno); return -EACCES; } - err = check_ctx_access(env, off, size, t); + err = check_ctx_access(env, off, size, t, ®_type); if (!err && t == BPF_READ && value_regno >= 0) { mark_reg_unknown_value(state->regs, value_regno); - if (off == offsetof(struct __sk_buff, data) && - env->allow_ptr_leaks) + if (env->allow_ptr_leaks) /* note that reg.[id|off|range] == 0 */ - state->regs[value_regno].type = PTR_TO_PACKET; - else if (off == offsetof(struct __sk_buff, data_end) && - env->allow_ptr_leaks) - state->regs[value_regno].type = PTR_TO_PACKET_END; + state->regs[value_regno].type = reg_type; } } else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 720b7bb01d43..e7af6cb9d5cf 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -349,7 +349,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ -static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type) { /* check bounds */ if (off < 0 || off >= sizeof(struct pt_regs)) @@ -427,7 +428,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) } } -static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type) +static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + enum bpf_reg_type *reg_type) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) return false; diff --git a/net/core/filter.c b/net/core/filter.c index 68adb5f52110..c4b330c85c02 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2085,7 +2085,8 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type) } static bool sk_filter_is_valid_access(int off, int size, - enum bpf_access_type type) + enum bpf_access_type type, + enum bpf_reg_type *reg_type) { switch (off) { case offsetof(struct __sk_buff, tc_classid): @@ -2108,7 +2109,8 @@ static bool sk_filter_is_valid_access(int off, int size, } static bool tc_cls_act_is_valid_access(int off, int size, - enum bpf_access_type type) + enum bpf_access_type type, + enum bpf_reg_type *reg_type) { if (type == BPF_WRITE) { switch (off) { @@ -2123,6 +2125,16 @@ static bool tc_cls_act_is_valid_access(int off, int size, return false; } } + + switch (off) { + case offsetof(struct __sk_buff, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct __sk_buff, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + return __is_valid_access(off, size, type); } -- cgit v1.2.3 From ad572d174787daa59e24b8b5c83028c09cdb5ddb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 15 Jun 2016 18:25:39 -0700 Subject: bpf, trace: check event type in bpf_perf_event_read similar to bpf_perf_event_output() the bpf_perf_event_read() helper needs to check the type of the perf_event before reading the counter. Fixes: a43eec304259 ("bpf: introduce bpf_perf_event_output() helper") Reported-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e7af6cb9d5cf..26f603da7e26 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -209,6 +209,10 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) event->pmu->count) return -EINVAL; + if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && + event->attr.type != PERF_TYPE_RAW)) + return -EINVAL; + /* * we don't know if the function is run successfully by the * return value. It can be judged in other places, such as -- cgit v1.2.3 From 8fa3b8d689a54d6d04ff7803c724fb7aca6ce98e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 May 2016 15:42:13 -0400 Subject: cgroup: set css->id to -1 during init If percpu_ref initialization fails during css_create(), the free path can end up trying to free css->id of zero. As ID 0 is unused, it doesn't cause a critical breakage but it does trigger a warning message. Fix it by setting css->id to -1 from init_and_link_css(). Signed-off-by: Tejun Heo Cc: Wenwei Tao Fixes: 01e586598b22 ("cgroup: release css->id after css_free") Cc: stable@vger.kernel.org # v4.0+ Signed-off-by: Tejun Heo --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 789b84f973c9..688eb0cd1851 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5063,6 +5063,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, memset(css, 0, sizeof(*css)); css->cgroup = cgrp; css->ss = ss; + css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; -- cgit v1.2.3 From 8974189222159154c55f24ddad33e3613960521a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2016 10:50:40 +0200 Subject: sched/fair: Fix cfs_rq avg tracking underflow As per commit: b7fa30c9cc48 ("sched/fair: Fix post_init_entity_util_avg() serialization") > the code generated from update_cfs_rq_load_avg(): > > if (atomic_long_read(&cfs_rq->removed_load_avg)) { > s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); > sa->load_avg = max_t(long, sa->load_avg - r, 0); > sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); > removed_load = 1; > } > > turns into: > > ffffffff81087064: 49 8b 85 98 00 00 00 mov 0x98(%r13),%rax > ffffffff8108706b: 48 85 c0 test %rax,%rax > ffffffff8108706e: 74 40 je ffffffff810870b0 > ffffffff81087070: 4c 89 f8 mov %r15,%rax > ffffffff81087073: 49 87 85 98 00 00 00 xchg %rax,0x98(%r13) > ffffffff8108707a: 49 29 45 70 sub %rax,0x70(%r13) > ffffffff8108707e: 4c 89 f9 mov %r15,%rcx > ffffffff81087081: bb 01 00 00 00 mov $0x1,%ebx > ffffffff81087086: 49 83 7d 70 00 cmpq $0x0,0x70(%r13) > ffffffff8108708b: 49 0f 49 4d 70 cmovns 0x70(%r13),%rcx > > Which you'll note ends up with sa->load_avg -= r in memory at > ffffffff8108707a. So I _should_ have looked at other unserialized users of ->load_avg, but alas. Luckily nikbor reported a similar /0 from task_h_load() which instantly triggered recollection of this here problem. Aside from the intermediate value hitting memory and causing problems, there's another problem: the underflow detection relies on the signed bit. This reduces the effective width of the variables, IOW its effectively the same as having these variables be of signed type. This patch changes to a different means of unsigned underflow detection to not rely on the signed bit. This allows the variables to use the 'full' unsigned range. And it does so with explicit LOAD - STORE to ensure any intermediate value will never be visible in memory, allowing these unserialized loads. Note: GCC generates crap code for this, might warrant a look later. Note2: I say 'full' above, if we end up at U*_MAX we'll still explode; maybe we should do clamping on add too. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrey Ryabinin Cc: Chris Wilson Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Cc: bsegall@google.com Cc: kernel@kyup.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: steve.muckle@linaro.org Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking") Link: http://lkml.kernel.org/r/20160617091948.GJ30927@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a2348deab7a3..2ae68f0e3bf5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2904,6 +2904,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) } } +/* + * Unsigned subtract and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define sub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(*ptr) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + res = var - val; \ + if (res > var) \ + res = 0; \ + WRITE_ONCE(*ptr, res); \ +} while (0) + /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) @@ -2913,15 +2930,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) if (atomic_long_read(&cfs_rq->removed_load_avg)) { s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); - sa->load_avg = max_t(long, sa->load_avg - r, 0); - sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); + sub_positive(&sa->load_avg, r); + sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); removed_load = 1; } if (atomic_long_read(&cfs_rq->removed_util_avg)) { long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); - sa->util_avg = max_t(long, sa->util_avg - r, 0); - sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); + sub_positive(&sa->util_avg, r); + sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); removed_util = 1; } @@ -2994,10 +3011,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s &se->avg, se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL); - cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); - cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0); - cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); - cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0); + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); + sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); cfs_rq_util_change(cfs_rq); } -- cgit v1.2.3 From 70c8217acd4383e069fe1898bbad36ea4fcdbdcc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 17 Jun 2016 16:10:42 -0400 Subject: tracing: Handle NULL formats in hold_module_trace_bprintk_format() If a task uses a non constant string for the format parameter in trace_printk(), then the trace_printk_fmt variable is set to NULL. This variable is then saved in the __trace_printk_fmt section. The function hold_module_trace_bprintk_format() checks to see if duplicate formats are used by modules, and reuses them if so (saves them to the list if it is new). But this function calls lookup_format() that does a strcmp() to the value (which is now NULL) and can cause a kernel oops. This wasn't an issue till 3debb0a9ddb ("tracing: Fix trace_printk() to print when not using bprintk()") which added "__used" to the trace_printk_fmt variable, and before that, the kernel simply optimized it out (no NULL value was saved). The fix is simply to handle the NULL pointer in lookup_format() and have the caller ignore the value if it was NULL. Link: http://lkml.kernel.org/r/1464769870-18344-1-git-send-email-zhengjun.xing@intel.com Reported-by: xingzhen Acked-by: Namhyung Kim Fixes: 3debb0a9ddb ("tracing: Fix trace_printk() to print when not using bprintk()") Cc: stable@vger.kernel.org # v3.5+ Signed-off-by: Steven Rostedt --- kernel/trace/trace_printk.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index f96f0383f6c6..ad1d6164e946 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -36,6 +36,10 @@ struct trace_bprintk_fmt { static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) { struct trace_bprintk_fmt *pos; + + if (!fmt) + return ERR_PTR(-EINVAL); + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { if (!strcmp(pos->fmt, fmt)) return pos; @@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) for (iter = start; iter < end; iter++) { struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); if (tb_fmt) { - *iter = tb_fmt->fmt; + if (!IS_ERR(tb_fmt)) + *iter = tb_fmt->fmt; continue; } -- cgit v1.2.3 From 6720a305df74ca30bcc10fc316881641b6ff0c80 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 23 Jun 2016 12:11:17 -0700 Subject: locking: avoid passing around 'thread_info' in mutex debugging code None of the code actually wants a thread_info, it all wants a task_struct, and it's just converting back and forth between the two ("ti->task" to get the task_struct from the thread_info, and "task_thread_info(task)" to go the other way). No semantic change. Acked-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/locking/mutex-debug.c | 12 ++++++------ kernel/locking/mutex-debug.h | 4 ++-- kernel/locking/mutex.c | 6 +++--- kernel/locking/mutex.h | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 3ef3736002d8..9c951fade415 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter) } void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); /* Mark the current thread as blocked on the lock: */ - ti->task->blocked_on = waiter; + task->blocked_on = waiter; } void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti) + struct task_struct *task) { DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); - DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); - DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); - ti->task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(waiter->task != task); + DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); + task->blocked_on = NULL; list_del_init(&waiter->list); waiter->task = NULL; diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..d06ae3bb46c5 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -20,9 +20,9 @@ extern void debug_mutex_wake_waiter(struct mutex *lock, extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); extern void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, - struct thread_info *ti); + struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 79d2d765a75f..a70b90db3909 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -537,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto skip_wait; debug_mutex_lock_common(lock, &waiter); - debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); + debug_mutex_add_waiter(lock, &waiter, task); /* add waiting tasks to the end of the waitqueue (FIFO): */ list_add_tail(&waiter.list, &lock->wait_list); @@ -584,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_remove_waiter(lock, &waiter, task); /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); @@ -605,7 +605,7 @@ skip_wait: return 0; err: - mutex_remove_waiter(lock, &waiter, task_thread_info(task)); + mutex_remove_waiter(lock, &waiter, task); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..a68bae5e852a 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -13,7 +13,7 @@ do { spin_lock(lock); (void)(flags); } while (0) #define spin_unlock_mutex(lock, flags) \ do { spin_unlock(lock); (void)(flags); } while (0) -#define mutex_remove_waiter(lock, waiter, ti) \ +#define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER -- cgit v1.2.3 From 82d6489d0fed2ec8a8c48c19e8d8a04ac8e5bb26 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 22 Jun 2016 17:28:41 -0300 Subject: cgroup: Disable IRQs while holding css_set_lock While testing the deadline scheduler + cgroup setup I hit this warning. [ 132.612935] ------------[ cut here ]------------ [ 132.612951] WARNING: CPU: 5 PID: 0 at kernel/softirq.c:150 __local_bh_enable_ip+0x6b/0x80 [ 132.612952] Modules linked in: (a ton of modules...) [ 132.612981] CPU: 5 PID: 0 Comm: swapper/5 Not tainted 4.7.0-rc2 #2 [ 132.612981] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.2-20150714_191134- 04/01/2014 [ 132.612982] 0000000000000086 45c8bb5effdd088b ffff88013fd43da0 ffffffff813d229e [ 132.612984] 0000000000000000 0000000000000000 ffff88013fd43de0 ffffffff810a652b [ 132.612985] 00000096811387b5 0000000000000200 ffff8800bab29d80 ffff880034c54c00 [ 132.612986] Call Trace: [ 132.612987] [] dump_stack+0x63/0x85 [ 132.612994] [] __warn+0xcb/0xf0 [ 132.612997] [] ? push_dl_task.part.32+0x170/0x170 [ 132.612999] [] warn_slowpath_null+0x1d/0x20 [ 132.613000] [] __local_bh_enable_ip+0x6b/0x80 [ 132.613008] [] _raw_write_unlock_bh+0x1a/0x20 [ 132.613010] [] _raw_spin_unlock_bh+0xe/0x10 [ 132.613015] [] put_css_set+0x5c/0x60 [ 132.613016] [] cgroup_free+0x7f/0xa0 [ 132.613017] [] __put_task_struct+0x42/0x140 [ 132.613018] [] dl_task_timer+0xca/0x250 [ 132.613027] [] ? push_dl_task.part.32+0x170/0x170 [ 132.613030] [] __hrtimer_run_queues+0xee/0x270 [ 132.613031] [] hrtimer_interrupt+0xa8/0x190 [ 132.613034] [] local_apic_timer_interrupt+0x38/0x60 [ 132.613035] [] smp_apic_timer_interrupt+0x3d/0x50 [ 132.613037] [] apic_timer_interrupt+0x8c/0xa0 [ 132.613038] [] ? native_safe_halt+0x6/0x10 [ 132.613043] [] default_idle+0x1e/0xd0 [ 132.613044] [] arch_cpu_idle+0xf/0x20 [ 132.613046] [] default_idle_call+0x2a/0x40 [ 132.613047] [] cpu_startup_entry+0x2e7/0x340 [ 132.613048] [] start_secondary+0x155/0x190 [ 132.613049] ---[ end trace f91934d162ce9977 ]--- The warn is the spin_(lock|unlock)_bh(&css_set_lock) in the interrupt context. Converting the spin_lock_bh to spin_lock_irq(save) to avoid this problem - and other problems of sharing a spinlock with an interrupt. Cc: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: Juri Lelli Cc: Steven Rostedt Cc: cgroups@vger.kernel.org Cc: stable@vger.kernel.org # 4.5+ Cc: linux-kernel@vger.kernel.org Reviewed-by: Rik van Riel Reviewed-by: "Luis Claudio R. Goncalves" Signed-off-by: Daniel Bristot de Oliveira Acked-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 142 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 74 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 688eb0cd1851..75c0ff00aca6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -837,6 +837,8 @@ static void put_css_set_locked(struct css_set *cset) static void put_css_set(struct css_set *cset) { + unsigned long flags; + /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an @@ -845,9 +847,9 @@ static void put_css_set(struct css_set *cset) if (atomic_add_unless(&cset->refcount, -1, 1)) return; - spin_lock_bh(&css_set_lock); + spin_lock_irqsave(&css_set_lock, flags); put_css_set_locked(cset); - spin_unlock_bh(&css_set_lock); + spin_unlock_irqrestore(&css_set_lock, flags); } /* @@ -1070,11 +1072,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, /* First see if we already have a cgroup group that matches * the desired set */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); if (cset) return cset; @@ -1102,7 +1104,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -1128,7 +1130,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, css_get(css); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return cset; } @@ -1192,7 +1194,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) * Release all the links from cset_links to this hierarchy's * root cgroup */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); @@ -1200,7 +1202,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) kfree(link); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); if (!list_empty(&root->root_list)) { list_del(&root->root_list); @@ -1600,11 +1602,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) ss->root = dst_root; css->cgroup = dcgrp; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) list_move_tail(&cset->e_cset_node[ss->id], &dcgrp->e_csets[ss->id]); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; @@ -1640,10 +1642,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, if (!buf) return -ENOMEM; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot); len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); if (len >= PATH_MAX) len = -ERANGE; @@ -1897,7 +1899,7 @@ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); if (use_task_css_set_links) goto out_unlock; @@ -1922,8 +1924,12 @@ static void cgroup_enable_task_cg_lists(void) * entry won't be deleted though the process has exited. * Do it while holding siglock so that we don't end up * racing against cgroup_exit(). + * + * Interrupts were already disabled while acquiring + * the css_set_lock, so we do not need to disable it + * again when acquiring the sighand->siglock here. */ - spin_lock_irq(&p->sighand->siglock); + spin_lock(&p->sighand->siglock); if (!(p->flags & PF_EXITING)) { struct css_set *cset = task_css_set(p); @@ -1932,11 +1938,11 @@ static void cgroup_enable_task_cg_lists(void) list_add_tail(&p->cg_list, &cset->tasks); get_css_set(cset); } - spin_unlock_irq(&p->sighand->siglock); + spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); read_unlock(&tasklist_lock); out_unlock: - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } static void init_cgroup_housekeeping(struct cgroup *cgrp) @@ -2043,13 +2049,13 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) * Link the root cgroup in this hierarchy into all the css_set * objects. */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); hash_for_each(css_set_table, i, cset, hlist) { link_css_set(&tmp_links, cset, root_cgrp); if (css_set_populated(cset)) cgroup_update_populated(root_cgrp, true); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->self.children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); @@ -2256,11 +2262,11 @@ out_mount: struct cgroup *cgrp; mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cgrp = cset_cgroup_from_root(ns->root_cset, root); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); @@ -2337,11 +2343,11 @@ char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, char *ret; mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); return ret; @@ -2369,7 +2375,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) char *path = NULL; mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); @@ -2382,7 +2388,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) path = buf; } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); return path; } @@ -2557,7 +2563,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, * the new cgroup. There are no failure cases after here, so this * is the commit point. */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(cset, &tset->src_csets, mg_node) { list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) { struct css_set *from_cset = task_css_set(task); @@ -2568,7 +2574,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, put_css_set_locked(from_cset); } } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* * Migration is committed, all target tasks are now on dst_csets. @@ -2597,13 +2603,13 @@ out_cancel_attach: } } while_each_subsys_mask(); out_release_tset: - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) { list_splice_tail_init(&cset->mg_tasks, &cset->tasks); list_del_init(&cset->mg_node); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return ret; } @@ -2634,7 +2640,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) lockdep_assert_held(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; @@ -2642,7 +2648,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } /** @@ -2783,7 +2789,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { @@ -2792,7 +2798,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, break; } while_each_thread(leader, task); rcu_read_unlock(); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return cgroup_taskset_migrate(&tset, root); } @@ -2816,7 +2822,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, return -EBUSY; /* look up all src csets */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); rcu_read_lock(); task = leader; do { @@ -2826,7 +2832,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, break; } while_each_thread(leader, task); rcu_read_unlock(); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* prepare dst csets and commit */ ret = cgroup_migrate_prepare_dst(&preloaded_csets); @@ -2859,9 +2865,9 @@ static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *cgrp; struct inode *inode; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); while (!cgroup_is_descendant(dst_cgrp, cgrp)) cgrp = cgroup_parent(cgrp); @@ -2962,9 +2968,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (root == &cgrp_dfl_root) continue; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); from_cgrp = task_cgroup_from_root(from, root); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) @@ -3080,7 +3086,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) percpu_down_write(&cgroup_threadgroup_rwsem); /* look up all csses currently attached to @cgrp's subtree */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; @@ -3088,14 +3094,14 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) cgroup_migrate_add_src(link->cset, dsct, &preloaded_csets); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_finish; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) { struct task_struct *task, *ntask; @@ -3107,7 +3113,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list) cgroup_taskset_add(task, &tset); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); ret = cgroup_taskset_migrate(&tset, cgrp->root); out_finish: @@ -3908,10 +3914,10 @@ static int cgroup_task_count(const struct cgroup *cgrp) int count = 0; struct cgrp_cset_link *link; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += atomic_read(&link->cset->refcount); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return count; } @@ -4249,7 +4255,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, memset(it, 0, sizeof(*it)); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); it->ss = css->ss; @@ -4262,7 +4268,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, css_task_iter_advance_css_set(it); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } /** @@ -4280,7 +4286,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) it->cur_task = NULL; } - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, @@ -4289,7 +4295,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) css_task_iter_advance(it); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return it->cur_task; } @@ -4303,10 +4309,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) void css_task_iter_end(struct css_task_iter *it) { if (it->cur_cset) { - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_del(&it->iters_node); put_css_set_locked(it->cur_cset); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } if (it->cur_task) @@ -4338,10 +4344,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) mutex_lock(&cgroup_mutex); /* all tasks in @from are being moved, all csets are source */ - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) cgroup_migrate_add_src(link->cset, to, &preloaded_csets); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) @@ -5449,10 +5455,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ cgrp->self.flags &= ~CSS_ONLINE; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) link->cset->dead = true; - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) @@ -5723,7 +5729,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, goto out; mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); for_each_root(root) { struct cgroup_subsys *ss; @@ -5776,7 +5782,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, retval = 0; out_unlock: - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); kfree(buf); out: @@ -5921,13 +5927,13 @@ void cgroup_post_fork(struct task_struct *child) if (use_task_css_set_links) { struct css_set *cset; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cset = task_css_set(current); if (list_empty(&child->cg_list)) { get_css_set(cset); css_set_move_task(child, NULL, cset, false); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } /* @@ -5972,9 +5978,9 @@ void cgroup_exit(struct task_struct *tsk) cset = task_css_set(tsk); if (!list_empty(&tsk->cg_list)) { - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); } @@ -6042,9 +6048,9 @@ static void cgroup_release_agent(struct work_struct *work) if (!pathbuf || !agentbuf) goto out; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); if (!path) goto out; @@ -6304,12 +6310,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, return ERR_PTR(-EPERM); mutex_lock(&cgroup_mutex); - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); cset = task_css_set(current); get_css_set(cset); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); new_ns = alloc_cgroup_ns(); @@ -6433,7 +6439,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) if (!name_buf) return -ENOMEM; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); rcu_read_lock(); cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { @@ -6444,7 +6450,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) c->root->hierarchy_id, name_buf); } rcu_read_unlock(); - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); kfree(name_buf); return 0; } @@ -6455,7 +6461,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - spin_lock_bh(&css_set_lock); + spin_lock_irq(&css_set_lock); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; @@ -6478,7 +6484,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) overflow: seq_puts(seq, " ...\n"); } - spin_unlock_bh(&css_set_lock); + spin_unlock_irq(&css_set_lock); return 0; } -- cgit v1.2.3 From 4c5ea0a9cd02d6aa8adc86e100b2a4cff8d614ff Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 21 Jun 2016 18:52:17 +0200 Subject: locking/static_key: Fix concurrent static_key_slow_inc() The following scenario is possible: CPU 1 CPU 2 static_key_slow_inc() atomic_inc_not_zero() -> key.enabled == 0, no increment jump_label_lock() atomic_inc_return() -> key.enabled == 1 now static_key_slow_inc() atomic_inc_not_zero() -> key.enabled == 1, inc to 2 return ** static key is wrong! jump_label_update() jump_label_unlock() Testing the static key at the point marked by (**) will follow the wrong path for jumps that have not been patched yet. This can actually happen when creating many KVM virtual machines with userspace LAPIC emulation; just run several copies of the following program: #include #include #include #include int main(void) { for (;;) { int kvmfd = open("/dev/kvm", O_RDONLY); int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0); close(ioctl(vmfd, KVM_CREATE_VCPU, 1)); close(vmfd); close(kvmfd); } return 0; } Every KVM_CREATE_VCPU ioctl will attempt a static_key_slow_inc() call. The static key's purpose is to skip NULL pointer checks and indeed one of the processes eventually dereferences NULL. As explained in the commit that introduced the bug: 706249c222f6 ("locking/static_keys: Rework update logic") jump_label_update() needs key.enabled to be true. The solution adopted here is to temporarily make key.enabled == -1, and use go down the slow path when key.enabled <= 0. Reported-by: Dmitry Vyukov Signed-off-by: Paolo Bonzini Signed-off-by: Peter Zijlstra (Intel) Cc: # v4.3+ Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 706249c222f6 ("locking/static_keys: Rework update logic") Link: http://lkml.kernel.org/r/1466527937-69798-1-git-send-email-pbonzini@redhat.com [ Small stylistic edits to the changelog and the code. ] Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 16 +++++++++++++--- kernel/jump_label.c | 36 +++++++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 0536524bb9eb..68904469fba1 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -117,13 +117,18 @@ struct module; #include +#ifdef HAVE_JUMP_LABEL + static inline int static_key_count(struct static_key *key) { - return atomic_read(&key->enabled); + /* + * -1 means the first static_key_slow_inc() is in progress. + * static_key_enabled() must return true, so return 1 here. + */ + int n = atomic_read(&key->enabled); + return n >= 0 ? n : 1; } -#ifdef HAVE_JUMP_LABEL - #define JUMP_TYPE_FALSE 0UL #define JUMP_TYPE_TRUE 1UL #define JUMP_TYPE_MASK 1UL @@ -162,6 +167,11 @@ extern void jump_label_apply_nops(struct module *mod); #else /* !HAVE_JUMP_LABEL */ +static inline int static_key_count(struct static_key *key) +{ + return atomic_read(&key->enabled); +} + static __always_inline void jump_label_init(void) { static_key_initialized = true; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 05254eeb4b4e..4b353e0be121 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key); void static_key_slow_inc(struct static_key *key) { + int v, v1; + STATIC_KEY_CHECK_USE(); - if (atomic_inc_not_zero(&key->enabled)) - return; + + /* + * Careful if we get concurrent static_key_slow_inc() calls; + * later calls must wait for the first one to _finish_ the + * jump_label_update() process. At the same time, however, + * the jump_label_update() call below wants to see + * static_key_enabled(&key) for jumps to be updated properly. + * + * So give a special meaning to negative key->enabled: it sends + * static_key_slow_inc() down the slow path, and it is non-zero + * so it counts as "enabled" in jump_label_update(). Note that + * atomic_inc_unless_negative() checks >= 0, so roll our own. + */ + for (v = atomic_read(&key->enabled); v > 0; v = v1) { + v1 = atomic_cmpxchg(&key->enabled, v, v + 1); + if (likely(v1 == v)) + return; + } jump_label_lock(); - if (atomic_inc_return(&key->enabled) == 1) + if (atomic_read(&key->enabled) == 0) { + atomic_set(&key->enabled, -1); jump_label_update(key); + atomic_set(&key->enabled, 1); + } else { + atomic_inc(&key->enabled); + } jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc); static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { + /* + * The negative count check is valid even when a negative + * key->enabled is in use by static_key_slow_inc(); a + * __static_key_slow_dec() before the first static_key_slow_inc() + * returns is unbalanced, because all other static_key_slow_inc() + * instances block while the update is in progress. + */ if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { WARN(atomic_read(&key->enabled) < 0, "jump label: negative count!\n"); -- cgit v1.2.3 From 094f469172e00d6ab0a3130b0e01c83b3cf3a98d Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 16 Jun 2016 15:57:01 +0300 Subject: sched/fair: Initialize throttle_count for new task-groups lazily Cgroup created inside throttled group must inherit current throttle_count. Broken throttle_count allows to nominate throttled entries as a next buddy, later this leads to null pointer dereference in pick_next_task_fair(). This patch initialize cfs_rq->throttle_count at first enqueue: laziness allows to skip locking all rq at group creation. Lazy approach also allows to skip full sub-tree scan at throttling hierarchy (not in this patch). Signed-off-by: Konstantin Khlebnikov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Link: http://lkml.kernel.org/r/146608182119.21870.8439834428248129633.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 20 ++++++++++++++++++++ kernel/sched/sched.h | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2ae68f0e3bf5..8c5d8c0c8827 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4202,6 +4202,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; + /* Synchronize hierarchical throttle counter: */ + if (unlikely(!cfs_rq->throttle_uptodate)) { + struct rq *rq = rq_of(cfs_rq); + struct cfs_rq *pcfs_rq; + struct task_group *tg; + + cfs_rq->throttle_uptodate = 1; + + /* Get closest up-to-date node, because leaves go first: */ + for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { + pcfs_rq = tg->cfs_rq[cpu_of(rq)]; + if (pcfs_rq->throttle_uptodate) + break; + } + if (tg) { + cfs_rq->throttle_count = pcfs_rq->throttle_count; + cfs_rq->throttled_clock_task = rq_clock_task(rq); + } + } + /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 72f1f3087b04..7cbeb92a1cb9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -437,7 +437,7 @@ struct cfs_rq { u64 throttled_clock, throttled_clock_task; u64 throttled_clock_task_time; - int throttled, throttle_count; + int throttled, throttle_count, throttle_uptodate; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.2.3 From 754bd598be9bbc953bc709a9e8ed7f3188bfb9d7 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 16 Jun 2016 15:57:15 +0300 Subject: sched/fair: Do not announce throttled next buddy in dequeue_task_fair() Hierarchy could be already throttled at this point. Throttled next buddy could trigger a NULL pointer dereference in pick_next_task_fair(). Signed-off-by: Konstantin Khlebnikov Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/146608183552.21905.15924473394414832071.stgit@buzz Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8c5d8c0c8827..bdcbeea90c95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4537,15 +4537,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ - if (task_sleep && parent_entity(se)) - set_next_buddy(parent_entity(se)); - - /* avoid re-evaluating load for this entity */ - se = parent_entity(se); + if (task_sleep && se && !throttled_hierarchy(cfs_rq)) + set_next_buddy(se); break; } flags |= DEQUEUE_SLEEP; -- cgit v1.2.3 From feb245e304f343cf5e4f9123db36354144dce8a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Jun 2016 15:35:04 -0400 Subject: sched/core: Allow kthreads to fall back to online && !active cpus During CPU hotplug, CPU_ONLINE callbacks are run while the CPU is online but not active. A CPU_ONLINE callback may create or bind a kthread so that its cpus_allowed mask only allows the CPU which is being brought online. The kthread may start executing before the CPU is made active and can end up in select_fallback_rq(). In such cases, the expected behavior is selecting the CPU which is coming online; however, because select_fallback_rq() only chooses from active CPUs, it determines that the task doesn't have any viable CPU in its allowed mask and ends up overriding it to cpu_possible_mask. CPU_ONLINE callbacks should be able to put kthreads on the CPU which is coming online. Update select_fallback_rq() so that it follows cpu_online() rather than cpu_active() for kthreads. Reported-by: Gautham R Shenoy Tested-by: Gautham R. Shenoy Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Cc: Abdul Haleem Cc: Aneesh Kumar Cc: Linus Torvalds Cc: Michael Ellerman Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-team@fb.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20160616193504.GB3262@mtj.duckdns.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4135ac83cb65..51d7105f529a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) for (;;) { /* Any allowed, online CPU? */ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { - if (!cpu_active(dest_cpu)) + if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) + continue; + if (!cpu_online(dest_cpu)) continue; goto out; } -- cgit v1.2.3 From b235beea9e996a4d36fed6cfef4801a3e7d7a9a5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 24 Jun 2016 15:09:37 -0700 Subject: Clarify naming of thread info/stack allocators We've had the thread info allocated together with the thread stack for most architectures for a long time (since the thread_info was split off from the task struct), but that is about to change. But the patches that move the thread info to be off-stack (and a part of the task struct instead) made it clear how confused the allocator and freeing functions are. Because the common case was that we share an allocation with the thread stack and the thread_info, the two pointers were identical. That identity then meant that we would have things like ti = alloc_thread_info_node(tsk, node); ... tsk->stack = ti; which certainly _worked_ (since stack and thread_info have the same value), but is rather confusing: why are we assigning a thread_info to the stack? And if we move the thread_info away, the "confusing" code just gets to be entirely bogus. So remove all this confusion, and make it clear that we are doing the stack allocation by renaming and clarifying the function names to be about the stack. The fact that the thread_info then shares the allocation is an implementation detail, and not really about the allocation itself. This is a pure renaming and type fix: we pass in the same pointer, it's just that we clarify what the pointer means. The ia64 code that actually only has one single allocation (for all of task_struct, thread_info and kernel thread stack) now looks a bit odd, but since "tsk->stack" is actually not even used there, that oddity doesn't matter. It would be a separate thing to clean that up, I intentionally left the ia64 changes as a pure brute-force renaming and type change. Acked-by: Andy Lutomirski Signed-off-by: Linus Torvalds --- arch/Kconfig | 4 +-- arch/ia64/Kconfig | 2 +- arch/ia64/include/asm/thread_info.h | 8 +++--- arch/mn10300/include/asm/thread_info.h | 2 +- arch/mn10300/kernel/kgdb.c | 3 +- arch/tile/include/asm/thread_info.h | 2 +- arch/tile/kernel/process.c | 3 +- include/linux/sched.h | 2 +- init/main.c | 4 +-- kernel/fork.c | 50 +++++++++++++++++----------------- 10 files changed, 41 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index e9734796531f..15996290fed4 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -226,8 +226,8 @@ config ARCH_INIT_TASK config ARCH_TASK_STRUCT_ALLOCATOR bool -# Select if arch has its private alloc_thread_info() function -config ARCH_THREAD_INFO_ALLOCATOR +# Select if arch has its private alloc_thread_stack() function +config ARCH_THREAD_STACK_ALLOCATOR bool # Select if arch wants to size task_struct dynamically via arch_task_struct_size: diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index f80758cb7157..e109ee95e919 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -45,7 +45,7 @@ config IA64 select GENERIC_SMP_IDLE_THREAD select ARCH_INIT_TASK select ARCH_TASK_STRUCT_ALLOCATOR - select ARCH_THREAD_INFO_ALLOCATOR + select ARCH_THREAD_STACK_ALLOCATOR select ARCH_CLOCKSOURCE_DATA select GENERIC_TIME_VSYSCALL_OLD select SYSCTL_ARCH_UNALIGN_NO_WARN diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index aa995b67c3f5..d1212b84fb83 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -48,15 +48,15 @@ struct thread_info { #ifndef ASM_OFFSETS_C /* how to get the thread information struct from C */ #define current_thread_info() ((struct thread_info *) ((char *) current + IA64_TASK_SIZE)) -#define alloc_thread_info_node(tsk, node) \ - ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) +#define alloc_thread_stack_node(tsk, node) \ + ((unsigned long *) ((char *) (tsk) + IA64_TASK_SIZE)) #define task_thread_info(tsk) ((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE)) #else #define current_thread_info() ((struct thread_info *) 0) -#define alloc_thread_info_node(tsk, node) ((struct thread_info *) 0) +#define alloc_thread_stack_node(tsk, node) ((unsigned long *) 0) #define task_thread_info(tsk) ((struct thread_info *) 0) #endif -#define free_thread_info(ti) /* nothing */ +#define free_thread_stack(ti) /* nothing */ #define task_stack_page(tsk) ((void *)(tsk)) #define __HAVE_THREAD_FUNCTIONS diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index 4861a78c7160..f5f90bbf019d 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -115,7 +115,7 @@ static inline unsigned long current_stack_pointer(void) } #ifndef CONFIG_KGDB -void arch_release_thread_info(struct thread_info *ti); +void arch_release_thread_stack(unsigned long *stack); #endif #define get_thread_info(ti) get_task_struct((ti)->task) #define put_thread_info(ti) put_task_struct((ti)->task) diff --git a/arch/mn10300/kernel/kgdb.c b/arch/mn10300/kernel/kgdb.c index 99770823451a..2d7986c386fe 100644 --- a/arch/mn10300/kernel/kgdb.c +++ b/arch/mn10300/kernel/kgdb.c @@ -397,8 +397,9 @@ static bool kgdb_arch_undo_singlestep(struct pt_regs *regs) * single-step state is cleared. At this point the breakpoints should have * been removed by __switch_to(). */ -void arch_release_thread_info(struct thread_info *ti) +void arch_release_thread_stack(unsigned long *stack) { + struct thread_info *ti = (void *)stack; if (kgdb_sstep_thread == ti) { kgdb_sstep_thread = NULL; diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index 4b7cef9e94e0..c1467ac59ce6 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -78,7 +78,7 @@ struct thread_info { #ifndef __ASSEMBLY__ -void arch_release_thread_info(struct thread_info *info); +void arch_release_thread_stack(unsigned long *stack); /* How to get the thread information struct from C. */ register unsigned long stack_pointer __asm__("sp"); diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 6b705ccc9cc1..a465d8372edd 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -73,8 +73,9 @@ void arch_cpu_idle(void) /* * Release a thread_info structure */ -void arch_release_thread_info(struct thread_info *info) +void arch_release_thread_stack(unsigned long *stack) { + struct thread_info *info = (void *)stack; struct single_step_state *step_state = info->step_state; if (step_state) { diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e42ada26345..253538f29ade 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3007,7 +3007,7 @@ static inline int object_is_on_stack(void *obj) return (obj >= stack) && (obj < (stack + THREAD_SIZE)); } -extern void thread_info_cache_init(void); +extern void thread_stack_cache_init(void); #ifdef CONFIG_DEBUG_STACK_USAGE static inline unsigned long stack_not_used(struct task_struct *p) diff --git a/init/main.c b/init/main.c index 4c17fda5c2ff..826fd57fa3aa 100644 --- a/init/main.c +++ b/init/main.c @@ -453,7 +453,7 @@ void __init __weak smp_setup_processor_id(void) } # if THREAD_SIZE >= PAGE_SIZE -void __init __weak thread_info_cache_init(void) +void __init __weak thread_stack_cache_init(void) { } #endif @@ -627,7 +627,7 @@ asmlinkage __visible void __init start_kernel(void) /* Should be run before the first non-init thread is created */ init_espfix_bsp(); #endif - thread_info_cache_init(); + thread_stack_cache_init(); cred_init(); fork_init(); proc_caches_init(); diff --git a/kernel/fork.c b/kernel/fork.c index 5c2c355aa97f..37b9439b8c07 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -148,18 +148,18 @@ static inline void free_task_struct(struct task_struct *tsk) } #endif -void __weak arch_release_thread_info(struct thread_info *ti) +void __weak arch_release_thread_stack(unsigned long *stack) { } -#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR +#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, @@ -172,33 +172,33 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, return page ? page_address(page) : NULL; } -static inline void free_thread_info(struct thread_info *ti) +static inline void free_thread_stack(unsigned long *stack) { - struct page *page = virt_to_page(ti); + struct page *page = virt_to_page(stack); memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, -(1 << THREAD_SIZE_ORDER)); __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else -static struct kmem_cache *thread_info_cache; +static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk, int node) { - return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); + return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_info(struct thread_info *ti) +static void free_stack(unsigned long *stack) { - kmem_cache_free(thread_info_cache, ti); + kmem_cache_free(thread_stack_cache, stack); } -void thread_info_cache_init(void) +void thread_stack_cache_init(void) { - thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, + thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, THREAD_SIZE, 0, NULL); - BUG_ON(thread_info_cache == NULL); + BUG_ON(thread_stack_cache == NULL); } # endif #endif @@ -221,9 +221,9 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(struct thread_info *ti, int account) +static void account_kernel_stack(unsigned long *stack, int account) { - struct zone *zone = page_zone(virt_to_page(ti)); + struct zone *zone = page_zone(virt_to_page(stack)); mod_zone_page_state(zone, NR_KERNEL_STACK, account); } @@ -231,8 +231,8 @@ static void account_kernel_stack(struct thread_info *ti, int account) void free_task(struct task_struct *tsk) { account_kernel_stack(tsk->stack, -1); - arch_release_thread_info(tsk->stack); - free_thread_info(tsk->stack); + arch_release_thread_stack(tsk->stack); + free_thread_stack(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -343,7 +343,7 @@ void set_task_stack_end_magic(struct task_struct *tsk) static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - struct thread_info *ti; + unsigned long *stack; int err; if (node == NUMA_NO_NODE) @@ -352,15 +352,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!tsk) return NULL; - ti = alloc_thread_info_node(tsk, node); - if (!ti) + stack = alloc_thread_stack_node(tsk, node); + if (!stack) goto free_tsk; err = arch_dup_task_struct(tsk, orig); if (err) - goto free_ti; + goto free_stack; - tsk->stack = ti; + tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -392,14 +392,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(ti, 1); + account_kernel_stack(stack, 1); kcov_task_init(tsk); return tsk; -free_ti: - free_thread_info(ti); +free_stack: + free_thread_stack(stack); free_tsk: free_task_struct(tsk); return NULL; -- cgit v1.2.3 From 74070542099c66d87aebeacd7b54dc0e8b6a73f9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 24 Jun 2016 14:50:16 -0700 Subject: oom, suspend: fix oom_reaper vs. oom_killer_disable race Tetsuo has reported the following potential oom_killer_disable vs. oom_reaper race: (1) freeze_processes() starts freezing user space threads. (2) Somebody (maybe a kenrel thread) calls out_of_memory(). (3) The OOM killer calls mark_oom_victim() on a user space thread P1 which is already in __refrigerator(). (4) oom_killer_disable() sets oom_killer_disabled = true. (5) P1 leaves __refrigerator() and enters do_exit(). (6) The OOM reaper calls exit_oom_victim(P1) before P1 can call exit_oom_victim(P1). (7) oom_killer_disable() returns while P1 not yet finished (8) P1 perform IO/interfere with the freezer. This situation is unfortunate. We cannot move oom_killer_disable after all the freezable kernel threads are frozen because the oom victim might depend on some of those kthreads to make a forward progress to exit so we could deadlock. It is also far from trivial to teach the oom_reaper to not call exit_oom_victim() because then we would lose a guarantee of the OOM killer and oom_killer_disable forward progress because exit_mm->mmput might block and never call exit_oom_victim. It seems the easiest way forward is to workaround this race by calling try_to_freeze_tasks again after oom_killer_disable. This will make sure that all the tasks are frozen or it bails out. Fixes: 449d777d7ad6 ("mm, oom_reaper: clear TIF_MEMDIE for all tasks queued for oom_reaper") Link: http://lkml.kernel.org/r/1466597634-16199-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index df058bed53ce..0c2ee9761d57 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -146,6 +146,18 @@ int freeze_processes(void) if (!error && !oom_killer_disable()) error = -EBUSY; + /* + * There is a hard to fix race between oom_reaper kernel thread + * and oom_killer_disable. oom_reaper calls exit_oom_victim + * before the victim reaches exit_mm so try to freeze all the tasks + * again and catch such a left over task. + */ + if (!error) { + pr_info("Double checking all user space processes after OOM killer disable... "); + error = try_to_freeze_tasks(true); + pr_cont("\n"); + } + if (error) thaw_processes(); return error; -- cgit v1.2.3 From 9521d39976db20f8ef9b56af66661482a17d5364 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sat, 25 Jun 2016 21:53:30 +1000 Subject: Fix build break in fork.c when THREAD_SIZE < PAGE_SIZE Commit b235beea9e99 ("Clarify naming of thread info/stack allocators") breaks the build on some powerpc configs, where THREAD_SIZE < PAGE_SIZE: kernel/fork.c:235:2: error: implicit declaration of function 'free_thread_stack' kernel/fork.c:355:8: error: assignment from incompatible pointer type stack = alloc_thread_stack_node(tsk, node); ^ Fix it by renaming free_stack() to free_thread_stack(), and updating the return type of alloc_thread_stack_node(). Fixes: b235beea9e99 ("Clarify naming of thread info/stack allocators") Signed-off-by: Michael Ellerman Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 37b9439b8c07..4a7ec0c6c88c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -183,13 +183,13 @@ static inline void free_thread_stack(unsigned long *stack) # else static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_stack(unsigned long *stack) +static void free_thread_stack(unsigned long *stack) { kmem_cache_free(thread_stack_cache, stack); } -- cgit v1.2.3 From 76a658c20efd541a62838d9ff68ce94170d7a549 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 28 Jun 2016 12:06:58 -0400 Subject: audit: move calcs after alloc and check when logging set loginuid Move the calculations of values after the allocation in case the allocation fails. This avoids wasting effort in the rare case that it fails, but more importantly saves us extra logic to release the tty ref. Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- kernel/auditsc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 71e14d836e69..33dafa70229d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1985,14 +1985,15 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, if (!audit_enabled) return; + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + if (!ab) + return; + uid = from_kuid(&init_user_ns, task_uid(current)); oldloginuid = from_kuid(&init_user_ns, koldloginuid); loginuid = from_kuid(&init_user_ns, kloginuid), tty = audit_get_tty(current); - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (!ab) - return; audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); audit_log_task_context(ab); audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", -- cgit v1.2.3 From 3f5be2da8565c1cce5655bb0948fcc957c6eb6c6 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 28 Jun 2016 12:07:50 -0400 Subject: audit: move audit_get_tty to reduce scope and kabi changes The only users of audit_get_tty and audit_put_tty are internal to audit, so move it out of include/linux/audit.h to kernel.h and create a proper function rather than inlining it. This also reduces kABI changes. Suggested-by: Paul Moore Signed-off-by: Richard Guy Briggs [PM: line wrapped description] Signed-off-by: Paul Moore --- include/linux/audit.h | 24 ------------------------ kernel/audit.c | 17 +++++++++++++++++ kernel/audit.h | 4 ++++ kernel/auditsc.c | 1 - 4 files changed, 21 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 32cdafb312d8..b40ed5df5542 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -26,7 +26,6 @@ #include #include #include -#include #define AUDIT_INO_UNSET ((unsigned long)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) @@ -344,23 +343,6 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk) return tsk->sessionid; } -static inline struct tty_struct *audit_get_tty(struct task_struct *tsk) -{ - struct tty_struct *tty = NULL; - unsigned long flags; - - spin_lock_irqsave(&tsk->sighand->siglock, flags); - if (tsk->signal) - tty = tty_kref_get(tsk->signal->tty); - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); - return tty; -} - -static inline void audit_put_tty(struct tty_struct *tty) -{ - tty_kref_put(tty); -} - extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); extern void __audit_bprm(struct linux_binprm *bprm); @@ -518,12 +500,6 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return -1; } -static inline struct tty_struct *audit_get_tty(struct task_struct *tsk) -{ - return NULL; -} -static inline void audit_put_tty(struct tty_struct *tty) -{ } static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, diff --git a/kernel/audit.c b/kernel/audit.c index 384374a1d232..d5971010e44a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1866,6 +1866,23 @@ out_null: audit_log_format(ab, " exe=(null)"); } +struct tty_struct *audit_get_tty(struct task_struct *tsk) +{ + struct tty_struct *tty = NULL; + unsigned long flags; + + spin_lock_irqsave(&tsk->sighand->siglock, flags); + if (tsk->signal) + tty = tty_kref_get(tsk->signal->tty); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + return tty; +} + +void audit_put_tty(struct tty_struct *tty) +{ + tty_kref_put(tty); +} + void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { const struct cred *cred; diff --git a/kernel/audit.h b/kernel/audit.h index cbbe6bb6496e..a492f4c4e710 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -23,6 +23,7 @@ #include #include #include +#include /* AUDIT_NAMES is the number of slots we reserve in the audit_context * for saving names from getname(). If we get more names we will allocate @@ -262,6 +263,9 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); extern void audit_log_d_path_exe(struct audit_buffer *ab, struct mm_struct *mm); +extern struct tty_struct *audit_get_tty(struct task_struct *tsk); +extern void audit_put_tty(struct tty_struct *tty); + /* audit watch functions */ #ifdef CONFIG_AUDIT_WATCH extern void audit_put_watch(struct audit_watch *watch); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 33dafa70229d..60a354eed2fa 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -63,7 +63,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From ceb56070359b7329b5678b5d95a376fcb24767be Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 27 Jun 2016 21:38:11 +0200 Subject: bpf, perf: delay release of BPF prog after grace period Commit dead9f29ddcc ("perf: Fix race in BPF program unregister") moved destruction of BPF program from free_event_rcu() callback to __free_event(), which is problematic if used with tail calls: if prog A is attached as trace event directly, but at the same time present in a tail call map used by another trace event program elsewhere, then we need to delay destruction via RCU grace period since it can still be in use by the program doing the tail call (the prog first needs to be dropped from the tail call map, then trace event with prog A attached destroyed, so we get immediate destruction). Fixes: dead9f29ddcc ("perf: Fix race in BPF program unregister") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Cc: Jann Horn Signed-off-by: David S. Miller --- include/linux/bpf.h | 4 ++++ kernel/events/core.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8269cafc6eb1..0de4de6dd43e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -264,6 +264,10 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd) static inline void bpf_prog_put(struct bpf_prog *prog) { } + +static inline void bpf_prog_put_rcu(struct bpf_prog *prog) +{ +} #endif /* CONFIG_BPF_SYSCALL */ /* verifier prototypes for helper functions called from eBPF programs */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 274450efea90..d00c47b67a97 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7531,7 +7531,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event) prog = event->tp_event->prog; if (prog) { event->tp_event->prog = NULL; - bpf_prog_put(prog); + bpf_prog_put_rcu(prog); } } -- cgit v1.2.3