From fe9161db2e6053da21e4649d77bbefaf3030b11d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 1 Feb 2012 22:16:36 +0100 Subject: PM / Hibernate: Thaw kernel threads in SNAPSHOT_CREATE_IMAGE ioctl path In the SNAPSHOT_CREATE_IMAGE ioctl, if the call to hibernation_snapshot() fails, the frozen tasks are not thawed. And in the case of success, if we happen to exit due to a successful freezer test, all tasks (including those of userspace) are thawed, whereas actually we should have thawed only the kernel threads at that point. Fix both these issues. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki Cc: stable@vger.kernel.org --- kernel/power/user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index e5a21a857302..3e100075b13c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -249,13 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (!error) { + if (error) { + thaw_kernel_threads(); + } else { error = put_user(in_suspend, (int __user *)arg); if (!error && !freezer_test_done) data->ready = 1; if (freezer_test_done) { freezer_test_done = false; - thaw_processes(); + thaw_kernel_threads(); } } break; -- cgit v1.2.3 From 55ca6140e9bb307efc97a9301a4f501de02a6fd6 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 3 Feb 2012 15:37:16 -0800 Subject: kprobes: fix a memory leak in function pre_handler_kretprobe() In function pre_handler_kretprobe(), the allocated kretprobe_instance object will get leaked if the entry_handler callback returns non-zero. This may cause all the preallocated kretprobe_instance objects exhausted. This issue can be reproduced by changing samples/kprobes/kretprobe_example.c to probe "mutex_unlock". And the fix is straightforward: just put the allocated kretprobe_instance object back onto the free_instances list. [akpm@linux-foundation.org: use raw_spin_lock/unlock] Signed-off-by: Jiang Liu Acked-by: Jim Keniston Acked-by: Ananth N Mavinakayanahalli Cc: Masami Hiramatsu Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 29f5b65bee29..9788c0ec6f43 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1673,8 +1673,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, ri->rp = rp; ri->task = current; - if (rp->entry_handler && rp->entry_handler(ri, regs)) + if (rp->entry_handler && rp->entry_handler(ri, regs)) { + raw_spin_lock_irqsave(&rp->lock, flags); + hlist_add_head(&ri->hlist, &rp->free_instances); + raw_spin_unlock_irqrestore(&rp->lock, flags); return 0; + } arch_prepare_kretprobe(ri, regs); -- cgit v1.2.3 From 379e0be812ab8a2a351e784b0c987788f5123090 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Fri, 3 Feb 2012 22:22:41 +0100 Subject: PM / Freezer: Thaw only kernel threads if freezing of kernel threads fails If freezing of kernel threads fails, we are expected to automatically thaw tasks in the error recovery path. However, at times, we encounter situations in which we would like the automatic error recovery path to thaw only the kernel threads, because we want to be able to do some more cleanup before we thaw userspace. 
Something like: error = freeze_kernel_threads(); if (error) { /* Do some cleanup */ /* Only then thaw userspace tasks*/ thaw_processes(); } An example of such a situation is where we freeze/thaw filesystems during suspend/hibernation. There, if freezing of kernel threads fails, we would like to thaw the frozen filesystems before thawing the userspace tasks. So, modify freeze_kernel_threads() to thaw only kernel threads in case of freezing failure. And change suspend_freeze_processes() accordingly. (At the same time, let us also get rid of the rather cryptic usage of the conditional operator (:?) in that function.) [rjw: In fact, this patch fixes a regression introduced during the 3.3 merge window, because without it thaw_processes() may be called before swsusp_free() in some situations and that may lead to massive memory allocation failures.] Signed-off-by: Srivatsa S. Bhat Acked-by: Tejun Heo Acked-by: Nigel Cunningham Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 24 ++++++++++++++++++++++-- kernel/power/process.c | 7 +++++-- 2 files changed, 27 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 0c4defe6d3b8..21724eee5206 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -231,8 +231,28 @@ extern int pm_test_level; #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) { - int error = freeze_processes(); - return error ? : freeze_kernel_threads(); + int error; + + error = freeze_processes(); + + /* + * freeze_processes() automatically thaws every task if freezing + * fails. So we need not do anything extra upon error. + */ + if (error) + goto Finish; + + error = freeze_kernel_threads(); + + /* + * freeze_kernel_threads() thaws only kernel threads upon freezing + * failure. So we have to thaw the userspace tasks ourselves. + */ + if (error) + thaw_processes(); + + Finish: + return error; } static inline void suspend_thaw_processes(void) diff --git a/kernel/power/process.c b/kernel/power/process.c index eeca00311f39..7e426459e60a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -143,7 +143,10 @@ int freeze_processes(void) /** * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. * - * On success, returns 0. On failure, -errno and system is fully thawed. + * On success, returns 0. On failure, -errno and only the kernel threads are + * thawed, so as to give a chance to the caller to do additional cleanups + * (if any) before thawing the userspace tasks. So, it is the responsibility + * of the caller to thaw the userspace tasks, when the time is right. */ int freeze_kernel_threads(void) { @@ -159,7 +162,7 @@ int freeze_kernel_threads(void) BUG_ON(in_atomic()); if (error) - thaw_processes(); + thaw_kernel_threads(); return error; } -- cgit v1.2.3 From 11a3122f6cf2d988a77eb8883d0fc49cd013a6d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Feb 2012 07:51:30 +0100 Subject: block: strip out locking optimization in put_io_context() put_io_context() performed a complex trylock dancing to avoid deferring ioc release to workqueue. It was also broken on UP because trylock was always assumed to succeed which resulted in unbalanced preemption count. While there are ways to fix the UP breakage, even the most pathological microbench (forced ioc allocation and tight fork/exit loop) fails to show any appreciable performance benefit of the optimization. Strip it out. 
If there turns out to be workloads which are affected by this change, simpler optimization from the discussion thread can be applied later. Signed-off-by: Tejun Heo LKML-Reference: <1328514611.21268.66.camel@sli10-conroe> Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-core.c | 2 +- block/blk-ioc.c | 92 ++++++----------------------------------------- block/cfq-iosched.c | 2 +- fs/ioprio.c | 2 +- include/linux/blkdev.h | 3 -- include/linux/iocontext.h | 5 ++- kernel/fork.c | 2 +- 8 files changed, 18 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fa8f26309444..75642a352a8f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1659,7 +1659,7 @@ static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_cgroup_changed(ioc); - put_io_context(ioc, NULL); + put_io_context(ioc); } } } diff --git a/block/blk-core.c b/block/blk-core.c index 636702575118..532b3a21b383 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -642,7 +642,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) if (rq->cmd_flags & REQ_ELVPRIV) { elv_put_request(q, rq); if (rq->elv.icq) - put_io_context(rq->elv.icq->ioc, q); + put_io_context(rq->elv.icq->ioc); } mempool_free(rq, q->rq.rq_pool); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 7490b6da2453..9884fd7427fe 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -29,21 +29,6 @@ void get_io_context(struct io_context *ioc) } EXPORT_SYMBOL(get_io_context); -/* - * Releasing ioc may nest into another put_io_context() leading to nested - * fast path release. As the ioc's can't be the same, this is okay but - * makes lockdep whine. Keep track of nesting and use it as subclass. - */ -#ifdef CONFIG_LOCKDEP -#define ioc_release_depth(q) ((q) ? (q)->ioc_release_depth : 0) -#define ioc_release_depth_inc(q) (q)->ioc_release_depth++ -#define ioc_release_depth_dec(q) (q)->ioc_release_depth-- -#else -#define ioc_release_depth(q) 0 -#define ioc_release_depth_inc(q) do { } while (0) -#define ioc_release_depth_dec(q) do { } while (0) -#endif - static void icq_free_icq_rcu(struct rcu_head *head) { struct io_cq *icq = container_of(head, struct io_cq, __rcu_head); @@ -75,11 +60,8 @@ static void ioc_exit_icq(struct io_cq *icq) if (rcu_dereference_raw(ioc->icq_hint) == icq) rcu_assign_pointer(ioc->icq_hint, NULL); - if (et->ops.elevator_exit_icq_fn) { - ioc_release_depth_inc(q); + if (et->ops.elevator_exit_icq_fn) et->ops.elevator_exit_icq_fn(icq); - ioc_release_depth_dec(q); - } /* * @icq->q might have gone away by the time RCU callback runs @@ -149,81 +131,29 @@ static void ioc_release_fn(struct work_struct *work) /** * put_io_context - put a reference of io_context * @ioc: io_context to put - * @locked_q: request_queue the caller is holding queue_lock of (hint) * * Decrement reference count of @ioc and release it if the count reaches - * zero. If the caller is holding queue_lock of a queue, it can indicate - * that with @locked_q. This is an optimization hint and the caller is - * allowed to pass in %NULL even when it's holding a queue_lock. + * zero. 
*/ -void put_io_context(struct io_context *ioc, struct request_queue *locked_q) +void put_io_context(struct io_context *ioc) { - struct request_queue *last_q = locked_q; unsigned long flags; if (ioc == NULL) return; BUG_ON(atomic_long_read(&ioc->refcount) <= 0); - if (locked_q) - lockdep_assert_held(locked_q->queue_lock); - - if (!atomic_long_dec_and_test(&ioc->refcount)) - return; /* - * Destroy @ioc. This is a bit messy because icq's are chained - * from both ioc and queue, and ioc->lock nests inside queue_lock. - * The inner ioc->lock should be held to walk our icq_list and then - * for each icq the outer matching queue_lock should be grabbed. - * ie. We need to do reverse-order double lock dancing. - * - * Another twist is that we are often called with one of the - * matching queue_locks held as indicated by @locked_q, which - * prevents performing double-lock dance for other queues. - * - * So, we do it in two stages. The fast path uses the queue_lock - * the caller is holding and, if other queues need to be accessed, - * uses trylock to avoid introducing locking dependency. This can - * handle most cases, especially if @ioc was performing IO on only - * single device. - * - * If trylock doesn't cut it, we defer to @ioc->release_work which - * can do all the double-locking dancing. + * Releasing ioc requires reverse order double locking and we may + * already be holding a queue_lock. Do it asynchronously from wq. */ - spin_lock_irqsave_nested(&ioc->lock, flags, - ioc_release_depth(locked_q)); - - while (!hlist_empty(&ioc->icq_list)) { - struct io_cq *icq = hlist_entry(ioc->icq_list.first, - struct io_cq, ioc_node); - struct request_queue *this_q = icq->q; - - if (this_q != last_q) { - if (last_q && last_q != locked_q) - spin_unlock(last_q->queue_lock); - last_q = NULL; - - /* spin_trylock() always successes in UP case */ - if (this_q != locked_q && - !spin_trylock(this_q->queue_lock)) - break; - last_q = this_q; - continue; - } - ioc_exit_icq(icq); + if (atomic_long_dec_and_test(&ioc->refcount)) { + spin_lock_irqsave(&ioc->lock, flags); + if (!hlist_empty(&ioc->icq_list)) + schedule_work(&ioc->release_work); + spin_unlock_irqrestore(&ioc->lock, flags); } - - if (last_q && last_q != locked_q) - spin_unlock(last_q->queue_lock); - - spin_unlock_irqrestore(&ioc->lock, flags); - - /* if no icq is left, we're done; otherwise, kick release_work */ - if (hlist_empty(&ioc->icq_list)) - kmem_cache_free(iocontext_cachep, ioc); - else - schedule_work(&ioc->release_work); } EXPORT_SYMBOL(put_io_context); @@ -238,7 +168,7 @@ void exit_io_context(struct task_struct *task) task_unlock(task); atomic_dec(&ioc->nr_tasks); - put_io_context(ioc, NULL); + put_io_context(ioc); } /** diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index da21c24dbed3..5684df6848bc 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1794,7 +1794,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->active_queue = NULL; if (cfqd->active_cic) { - put_io_context(cfqd->active_cic->icq.ioc, cfqd->queue); + put_io_context(cfqd->active_cic->icq.ioc); cfqd->active_cic = NULL; } } diff --git a/fs/ioprio.c b/fs/ioprio.c index f84b380d65e5..0f1b9515213b 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -51,7 +51,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_ioprio_changed(ioc, ioprio); - put_io_context(ioc, NULL); + put_io_context(ioc); } return err; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h 
index 6c6a1f008065..606cf339bb56 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -399,9 +399,6 @@ struct request_queue { /* Throttle data */ struct throtl_data *td; #endif -#ifdef CONFIG_LOCKDEP - int ioc_release_depth; -#endif }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 7e1371c4bccf..119773eebe31 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -133,7 +133,7 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc) struct task_struct; #ifdef CONFIG_BLOCK -void put_io_context(struct io_context *ioc, struct request_queue *locked_q); +void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); @@ -141,8 +141,7 @@ void ioc_ioprio_changed(struct io_context *ioc, int ioprio); void ioc_cgroup_changed(struct io_context *ioc); #else struct io_context; -static inline void put_io_context(struct io_context *ioc, - struct request_queue *locked_q) { } +static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } #endif diff --git a/kernel/fork.c b/kernel/fork.c index 051f090d40c1..c574aefa8d1b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -890,7 +890,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc, NULL); + put_io_context(new_ioc); } #endif return 0; -- cgit v1.2.3 From f39d47ff819ed52a2afbdbecbe35f23f7755f58d Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 7 Feb 2012 14:39:57 +0100 Subject: perf: Fix double start/stop in x86_pmu_start() The following patch fixes a bug introduced by the following commit: e050e3f0a71b ("perf: Fix broken interrupt rate throttling") The patch caused the following warning to pop up depending on the sampling frequency adjustments: ------------[ cut here ]------------ WARNING: at arch/x86/kernel/cpu/perf_event.c:995 x86_pmu_start+0x79/0xd4() It was caused by the following call sequence: perf_adjust_freq_unthr_context.part() { stop() if (delta > 0) { perf_adjust_period() { if (period > 8*...) { stop() ... start() } } } start() } Which caused a double start and a double stop, thus triggering the assert in x86_pmu_start(). The patch fixes the problem by avoiding the double calls. We pass a new argument to perf_adjust_period() to indicate whether or not the event is already stopped. We can't just remove the start/stop from that function because it's called from __perf_event_overflow where the event needs to be reloaded via a stop/start back-toback call. The patch reintroduces the assertion in x86_pmu_start() which was removed by commit: 84f2b9b ("perf: Remove deprecated WARN_ON_ONCE()") In this second version, we've added calls to disable/enable PMU during unthrottling or frequency adjustment based on bug report of spurious NMI interrupts from Eric Dumazet. 
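The ordering problem is easier to see in a stripped-down model. The sketch below is only a hedged userspace illustration: pmu_stop(), pmu_start(), adjust_period() and the need_stop_start flag are invented names standing in for the real x86_pmu_start()/perf_adjust_period() machinery, and the warnings play the role of the WARN_ON_ONCE() assertion.

  #include <stdbool.h>
  #include <stdio.h>

  static bool stopped;              /* the event starts out running */

  static void pmu_stop(void)
  {
          if (stopped)
                  printf("WARN: stop while already stopped\n");
          stopped = true;
  }

  static void pmu_start(void)
  {
          if (!stopped)
                  printf("WARN: start while already running\n");
          stopped = false;
  }

  /*
   * 'need_stop_start' plays the role of the new argument: the caller
   * says whether the event still has to be stopped and restarted
   * around the period change.
   */
  static void adjust_period(bool need_stop_start)
  {
          if (need_stop_start)
                  pmu_stop();
          /* ... recompute and reprogram the sample period here ... */
          if (need_stop_start)
                  pmu_start();
  }

  int main(void)
  {
          /* Fixed unthrottle path: caller stops, tells adjust_period() so. */
          pmu_stop();
          adjust_period(false);
          pmu_start();

          /* Overflow path: event still running, adjust_period() handles it. */
          adjust_period(true);

          /*
           * Old behaviour for comparison: unconditional stop/start inside
           * adjust_period() after the caller already stopped the event
           * produces a double stop and a double start.
           */
          pmu_stop();
          adjust_period(true);
          pmu_start();
          return 0;
  }

Running the model prints the two warnings only for the last, unfixed sequence, which mirrors the call chain quoted in the changelog.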
Reported-and-tested-by: Eric Dumazet Signed-off-by: Stephane Eranian Acked-by: Peter Zijlstra Cc: markus@trippelsdorf.de Cc: paulus@samba.org Link: http://lkml.kernel.org/r/20120207133956.GA4932@quad [ Minor edits to the changelog and to the code ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +++ kernel/events/core.c | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2a30e5ae6acf..5adce1040b11 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -986,6 +986,9 @@ static void x86_pmu_start(struct perf_event *event, int flags) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx = event->hw.idx; + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + if (WARN_ON_ONCE(idx == -1)) return; diff --git a/kernel/events/core.c b/kernel/events/core.c index ba36013cfb21..1b5c081d8b9f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2303,7 +2303,7 @@ do { \ static DEFINE_PER_CPU(int, perf_throttled_count); static DEFINE_PER_CPU(u64, perf_throttled_seq); -static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; @@ -2322,9 +2322,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { - event->pmu->stop(event, PERF_EF_UPDATE); + if (disable) + event->pmu->stop(event, PERF_EF_UPDATE); + local64_set(&hwc->period_left, 0); - event->pmu->start(event, PERF_EF_RELOAD); + + if (disable) + event->pmu->start(event, PERF_EF_RELOAD); } } @@ -2350,6 +2354,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, return; raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) @@ -2381,13 +2386,17 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, /* * restart the event * reload only if value has changed + * we have stopped the event so tell that + * to perf_adjust_period() to avoid stopping it + * twice. */ if (delta > 0) - perf_adjust_period(event, period, delta); + perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); } + perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } @@ -4562,7 +4571,7 @@ static int __perf_event_overflow(struct perf_event *event, hwc->freq_time_stamp = now; if (delta > 0 && delta < 2*TICK_NSEC) - perf_adjust_period(event, delta, hwc->last_period); + perf_adjust_period(event, delta, hwc->last_period, true); } /* -- cgit v1.2.3 From f6302f1bcd75a042df69866d98b8d775a668f8f1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 10 Feb 2012 09:03:58 +0100 Subject: relay: prevent integer overflow in relay_open() "subbuf_size" and "n_subbufs" come from the user and they need to be capped to prevent an integer overflow. 
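As a rough illustration of why the division-based guard works, here is a hedged, self-contained userspace sketch; sizes_are_sane() and the sample values are invented for the example and are not part of the relay API.

  #include <limits.h>
  #include <stdio.h>

  static int sizes_are_sane(unsigned int subbuf_size, unsigned int n_subbufs)
  {
          if (!subbuf_size || !n_subbufs)
                  return 0;
          /* would subbuf_size * n_subbufs overflow an unsigned int? */
          if (subbuf_size > UINT_MAX / n_subbufs)
                  return 0;
          return 1;
  }

  int main(void)
  {
          /* 0x10000 * 0x10000 == 2^32 wraps to 0 in 32-bit arithmetic */
          printf("%d\n", sizes_are_sane(0x10000, 0x10000));  /* 0: rejected */
          printf("%d\n", sizes_are_sane(4096, 8));           /* 1: accepted */
          return 0;
  }

Checking with a division before ever multiplying is the standard way to reject user-controlled sizes whose product would wrap, which is exactly what the two new checks in relay_create_buf() and relay_open() do.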
Signed-off-by: Dan Carpenter Cc: stable@kernel.org Signed-off-by: Jens Axboe --- kernel/relay.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 4335e1d7ee2d..ab56a1764d4d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -164,10 +164,14 @@ depopulate: */ static struct rchan_buf *relay_create_buf(struct rchan *chan) { - struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); - if (!buf) + struct rchan_buf *buf; + + if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) return NULL; + buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); + if (!buf) + return NULL; buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); if (!buf->padding) goto free_buf; @@ -574,6 +578,8 @@ struct rchan *relay_open(const char *base_filename, if (!(subbuf_size && n_subbufs)) return NULL; + if (subbuf_size > UINT_MAX / n_subbufs) + return NULL; chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); if (!chan) -- cgit v1.2.3 From 10f296cbfe3b93188c41463fd7a53808ebdbcbe3 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 1 Feb 2012 10:33:11 +0800 Subject: module: make module param bint handle nul value Allow bint param accept nul values, just do same as bool param. Signed-off-by: Dave Young Cc: Rusty Russell Signed-off-by: Rusty Russell --- kernel/params.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 32ee04308285..4bc965d8a1fe 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -97,7 +97,8 @@ static int parse_one(char *param, for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { /* No one handled NULL, so do it here. */ - if (!val && params[i].ops->set != param_set_bool) + if (!val && params[i].ops->set != param_set_bool + && params[i].ops->set != param_set_bint) return -EINVAL; pr_debug("They are equal! Calling %p\n", params[i].ops->set); -- cgit v1.2.3 From 074b85175a43a23fdbde60f55feea636e0bf0f85 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 8 Feb 2012 12:39:07 -0800 Subject: vfs: fix panic in __d_lookup() with high dentry hashtable counts When the number of dentry cache hash table entries gets too high (2147483648 entries), as happens by default on a 16TB system, use of a signed integer in the dcache_init() initialization loop prevents the dentry_hashtable from getting initialized, causing a panic in __d_lookup(). Fix this in dcache_init() and similar areas. Signed-off-by: Dimitri Sivanich Acked-by: David S. Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++---- fs/inode.c | 8 ++++---- kernel/pid.c | 4 ++-- mm/page_alloc.c | 1 + net/ipv4/tcp.c | 5 +++-- 5 files changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/dcache.c b/fs/dcache.c index 16a53cc2cc02..fe19ac13f75f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2968,7 +2968,7 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. 
@@ -2986,13 +2986,13 @@ static void __init dcache_init_early(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - int loop; + unsigned int loop; /* * A constructor could be added for stable state like the lists, @@ -3016,7 +3016,7 @@ static void __init dcache_init(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } diff --git a/fs/inode.c b/fs/inode.c index fb10d86ffad7..d3ebdbe723d0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1651,7 +1651,7 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -1669,13 +1669,13 @@ void __init inode_init_early(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - int loop; + unsigned int loop; /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", @@ -1699,7 +1699,7 @@ void __init inode_init(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } diff --git a/kernel/pid.c b/kernel/pid.c index ce8e00deaccb..9f08dfabaf13 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -543,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - int i, pidhash_size; + unsigned int i, pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, HASH_EARLY | HASH_SMALL, &pidhash_shift, NULL, 4096); - pidhash_size = 1 << pidhash_shift; + pidhash_size = 1U << pidhash_shift; for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d2186ecb36f7..a13ded1938f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5236,6 +5236,7 @@ void *__init alloc_large_system_hash(const char *tablename, max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; do_div(max, bucketsize); } + max = min(max, 0x80000000ULL); if (numentries > max) numentries = max; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 37755ccc0e96..22ef5f9fd2ff 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3240,7 +3240,8 @@ void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long limit; - int i, max_share, cnt; + int max_share, cnt; + unsigned int i; unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3283,7 +3284,7 @@ void __init tcp_init(void) &tcp_hashinfo.bhash_size, NULL, 64 * 1024); - tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; + tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); -- cgit v1.2.3 From ac5637611150281f398bb7a47e3fcb69a09e7803 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Feb 2012 17:58:03 +0100 Subject: genirq: Unmask oneshot irqs when thread was not woken When the primary handler of an interrupt which is marked IRQ_ONESHOT returns IRQ_HANDLED or IRQ_NONE, then the interrupt thread is not 
woken and the unmask logic of the interrupt line is never invoked. This keeps the interrupt masked forever. This was not noticed as most IRQ_ONESHOT users wake the thread unconditionally (usually because they cannot access the underlying device from hard interrupt context). Though this behaviour was nowhere documented and not necessarily intentional. Some drivers can avoid the thread wakeup in certain cases and run into the situation where the interrupt line is kept masked. Handle it gracefully. Reported-and-tested-by: Lothar Wassmann Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f7c543a801d9..b742edc0bdd4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -330,6 +330,24 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_simple_irq); +/* + * Called unconditionally from handle_level_irq() and only for oneshot + * interrupts from handle_fasteoi_irq() + */ +static void cond_unmask_irq(struct irq_desc *desc) +{ + /* + * We need to unmask in the following cases: + * - Standard level irq (IRQF_ONESHOT is not set) + * - Oneshot irq which did not wake the thread (caused by a + * spurious interrupt or a primary handler handling it + * completely). + */ + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) + unmask_irq(desc); +} + /** * handle_level_irq - Level type irq handler * @irq: the interrupt number @@ -362,8 +380,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) handle_irq_event(desc); - if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) - unmask_irq(desc); + cond_unmask_irq(desc); + out_unlock: raw_spin_unlock(&desc->lock); } @@ -417,6 +435,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) preflow_handler(desc); handle_irq_event(desc); + if (desc->istate & IRQS_ONESHOT) + cond_unmask_irq(desc); + out_eoi: desc->irq_data.chip->irq_eoi(&desc->irq_data); out_unlock: -- cgit v1.2.3 From b4bc724e82e80478cba5fe9825b62e71ddf78757 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Feb 2012 11:57:52 +0100 Subject: genirq: Handle pending irqs in irq_startup() An interrupt might be pending when irq_startup() is called, but the startup code does not invoke the resend logic. In some cases this prevents the device from issuing another interrupt which renders the device non functional. Call the resend function in irq_startup() to keep things going.
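A hedged toy model of the failure mode may help; struct line, startup() and replay() are invented for the illustration and do not correspond to the genirq API, but they capture why a latched event must be replayed when the line is started up.

  #include <stdbool.h>
  #include <stdio.h>

  struct line {
          bool enabled;
          bool pending;   /* an edge arrived while the line was shut down */
  };

  static void replay(struct line *l)
  {
          if (l->pending) {
                  l->pending = false;
                  printf("delivered the latched interrupt\n");
          }
  }

  static void startup(struct line *l, bool resend)
  {
          l->enabled = true;
          if (resend)
                  replay(l);      /* what the fix adds to irq_startup() */
  }

  int main(void)
  {
          struct line l = { .enabled = false, .pending = true };

          startup(&l, false);     /* old code: the latched edge is never seen */
          printf("still pending: %d\n", l.pending);       /* prints 1 */

          startup(&l, true);      /* fixed code: it is replayed on startup */
          printf("still pending: %d\n", l.pending);       /* prints 0 */
          return 0;
  }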
Reported-and-tested-by: Russell King Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 4 ++-- kernel/irq/chip.c | 17 ++++++++++------- kernel/irq/internals.h | 2 +- kernel/irq/manage.c | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 342d8f44e401..0119b9d467ae 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -53,7 +53,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - irq_startup(desc); + irq_startup(desc, false); } raw_spin_unlock_irq(&desc->lock); } @@ -70,7 +70,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc)) + if (irq_startup(desc, false)) desc->istate |= IRQS_PENDING; } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b742edc0bdd4..fb7db75ee0c8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -157,19 +157,22 @@ static void irq_state_set_masked(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); } -int irq_startup(struct irq_desc *desc) +int irq_startup(struct irq_desc *desc, bool resend) { + int ret = 0; + irq_state_clr_disabled(desc); desc->depth = 0; if (desc->irq_data.chip->irq_startup) { - int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); + ret = desc->irq_data.chip->irq_startup(&desc->irq_data); irq_state_clr_masked(desc); - return ret; + } else { + irq_enable(desc); } - - irq_enable(desc); - return 0; + if (resend) + check_irq_resend(desc, desc->irq_data.irq); + return ret; } void irq_shutdown(struct irq_desc *desc) @@ -646,7 +649,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); - irq_startup(desc); + irq_startup(desc, true); } out: irq_put_desc_busunlock(desc, flags); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b7952316016a..40378ff877e7 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -67,7 +67,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); -extern int irq_startup(struct irq_desc *desc); +extern int irq_startup(struct irq_desc *desc, bool resend); extern void irq_shutdown(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a9a9dbe49fea..32313c084442 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1027,7 +1027,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->istate |= IRQS_ONESHOT; if (irq_settings_can_autoenable(desc)) - irq_startup(desc); + irq_startup(desc, true); else /* Undo nested disables: */ desc->depth = 1; -- cgit v1.2.3 From 1fd36adcd98c14d2fd97f545293c488775cb2823 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Feb 2012 17:49:54 +0000 Subject: Replace the fd_sets in struct fdtable with an array of unsigned longs Replace the fd_sets in struct fdtable with an array of unsigned longs and then use the standard non-atomic bit operations rather than the FD_* 
macros. This: (1) Removes the abuses of struct fd_set: (a) Since we don't want to allocate a full fd_set the vast majority of the time, we actually, in effect, just allocate a just-big-enough array of unsigned longs and cast it to an fd_set type - so why bother with the fd_set at all? (b) Some places outside of the core fdtable handling code (such as SELinux) want to look inside the array of unsigned longs hidden inside the fd_set struct for more efficient iteration over the entire set. (2) Eliminates the use of FD_*() macros in the kernel completely. (3) Permits the __FD_*() macros to be deleted entirely where not exposed to userspace. Signed-off-by: David Howells Link: http://lkml.kernel.org/r/20120216174954.23314.48147.stgit@warthog.procyon.org.uk Signed-off-by: H. Peter Anvin Cc: Al Viro --- fs/exec.c | 4 ++-- fs/file.c | 46 ++++++++++++++++++++++------------------------ fs/select.c | 2 +- include/linux/fdtable.h | 28 ++++++++++------------------ kernel/exit.c | 2 +- security/selinux/hooks.c | 2 +- 6 files changed, 37 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 22cc38d9e79f..cfd5e3047bd8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1026,10 +1026,10 @@ static void flush_old_files(struct files_struct * files) fdt = files_fdtable(files); if (i >= fdt->max_fds) break; - set = fdt->close_on_exec->fds_bits[j]; + set = fdt->close_on_exec[j]; if (!set) continue; - fdt->close_on_exec->fds_bits[j] = 0; + fdt->close_on_exec[j] = 0; spin_unlock(&files->file_lock); for ( ; set ; i++,set >>= 1) { if (set & 1) { diff --git a/fs/file.c b/fs/file.c index 114fea0a2cec..2d479dd8484e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -40,7 +40,7 @@ int sysctl_nr_open_max = 1024 * 1024; /* raised later */ */ static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); -static void *alloc_fdmem(unsigned int size) +static void *alloc_fdmem(size_t size) { /* * Very large allocations can stress page reclaim, so fall back to @@ -142,7 +142,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) static struct fdtable * alloc_fdtable(unsigned int nr) { struct fdtable *fdt; - char *data; + void *data; /* * Figure out how many fds we actually want to support in this fdtable. 
@@ -172,14 +172,15 @@ static struct fdtable * alloc_fdtable(unsigned int nr) data = alloc_fdmem(nr * sizeof(struct file *)); if (!data) goto out_fdt; - fdt->fd = (struct file **)data; - data = alloc_fdmem(max_t(unsigned int, + fdt->fd = data; + + data = alloc_fdmem(max_t(size_t, 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); if (!data) goto out_arr; - fdt->open_fds = (fd_set *)data; - data += nr / BITS_PER_BYTE; - fdt->close_on_exec = (fd_set *)data; + fdt->open_fds = data; + data += nr / BITS_PER_LONG; + fdt->close_on_exec = data; fdt->next = NULL; return fdt; @@ -275,11 +276,11 @@ static int count_open_files(struct fdtable *fdt) int i; /* Find the last open fd */ - for (i = size/(8*sizeof(long)); i > 0; ) { - if (fdt->open_fds->fds_bits[--i]) + for (i = size / BITS_PER_LONG; i > 0; ) { + if (fdt->open_fds[--i]) break; } - i = (i+1) * 8 * sizeof(long); + i = (i + 1) * BITS_PER_LONG; return i; } @@ -306,8 +307,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; - new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; - new_fdt->open_fds = (fd_set *)&newf->open_fds_init; + new_fdt->close_on_exec = newf->close_on_exec_init; + new_fdt->open_fds = newf->open_fds_init; new_fdt->fd = &newf->fd_array[0]; new_fdt->next = NULL; @@ -350,10 +351,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) old_fds = old_fdt->fd; new_fds = new_fdt->fd; - memcpy(new_fdt->open_fds->fds_bits, - old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, - old_fdt->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8); + memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; @@ -379,11 +378,11 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) memset(new_fds, 0, size); if (new_fdt->max_fds > open_files) { - int left = (new_fdt->max_fds-open_files)/8; - int start = open_files / (8 * sizeof(unsigned long)); + int left = (new_fdt->max_fds - open_files) / 8; + int start = open_files / BITS_PER_LONG; - memset(&new_fdt->open_fds->fds_bits[start], 0, left); - memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); + memset(&new_fdt->open_fds[start], 0, left); + memset(&new_fdt->close_on_exec[start], 0, left); } rcu_assign_pointer(newf->fdt, new_fdt); @@ -419,8 +418,8 @@ struct files_struct init_files = { .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], - .close_on_exec = (fd_set *)&init_files.close_on_exec_init, - .open_fds = (fd_set *)&init_files.open_fds_init, + .close_on_exec = init_files.close_on_exec_init, + .open_fds = init_files.open_fds_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), }; @@ -443,8 +442,7 @@ repeat: fd = files->next_fd; if (fd < fdt->max_fds) - fd = find_next_zero_bit(fdt->open_fds->fds_bits, - fdt->max_fds, fd); + fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); error = expand_files(files, fd); if (error < 0) diff --git a/fs/select.c b/fs/select.c index d33418fdc858..2e7fbe8a092c 100644 --- a/fs/select.c +++ b/fs/select.c @@ -348,7 +348,7 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds) set = ~(~0UL << (n & (__NFDBITS-1))); n /= __NFDBITS; fdt = files_fdtable(current->files); - open_fds = fdt->open_fds->fds_bits+n; + open_fds = fdt->open_fds + n; max = 0; if (set) { set &= BITS(fds, n); diff --git a/include/linux/fdtable.h 
b/include/linux/fdtable.h index 7675da2c18f7..158a41eed314 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -21,51 +21,43 @@ */ #define NR_OPEN_DEFAULT BITS_PER_LONG -/* - * The embedded_fd_set is a small fd_set, - * suitable for most tasks (which open <= BITS_PER_LONG files) - */ -struct embedded_fd_set { - unsigned long fds_bits[1]; -}; - struct fdtable { unsigned int max_fds; struct file __rcu **fd; /* current fd array */ - fd_set *close_on_exec; - fd_set *open_fds; + unsigned long *close_on_exec; + unsigned long *open_fds; struct rcu_head rcu; struct fdtable *next; }; static inline void __set_close_on_exec(int fd, struct fdtable *fdt) { - FD_SET(fd, fdt->close_on_exec); + __set_bit(fd, fdt->close_on_exec); } static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) { - FD_CLR(fd, fdt->close_on_exec); + __clear_bit(fd, fdt->close_on_exec); } static inline bool close_on_exec(int fd, const struct fdtable *fdt) { - return FD_ISSET(fd, fdt->close_on_exec); + return test_bit(fd, fdt->close_on_exec); } static inline void __set_open_fd(int fd, struct fdtable *fdt) { - FD_SET(fd, fdt->open_fds); + __set_bit(fd, fdt->open_fds); } static inline void __clear_open_fd(int fd, struct fdtable *fdt) { - FD_CLR(fd, fdt->open_fds); + __clear_bit(fd, fdt->open_fds); } static inline bool fd_is_open(int fd, const struct fdtable *fdt) { - return FD_ISSET(fd, fdt->open_fds); + return test_bit(fd, fdt->open_fds); } /* @@ -83,8 +75,8 @@ struct files_struct { */ spinlock_t file_lock ____cacheline_aligned_in_smp; int next_fd; - struct embedded_fd_set close_on_exec_init; - struct embedded_fd_set open_fds_init; + unsigned long close_on_exec_init[1]; + unsigned long open_fds_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; diff --git a/kernel/exit.c b/kernel/exit.c index 4b4042f9bc6a..4db020015f14 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -473,7 +473,7 @@ static void close_files(struct files_struct * files) i = j * __NFDBITS; if (i >= fdt->max_fds) break; - set = fdt->open_fds->fds_bits[j++]; + set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 6a3683e28426..421c990a20b2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2145,7 +2145,7 @@ static inline void flush_unauthorized_files(const struct cred *cred, fdt = files_fdtable(files); if (i >= fdt->max_fds) break; - set = fdt->open_fds->fds_bits[j]; + set = fdt->open_fds[j]; if (!set) continue; spin_unlock(&files->file_lock); -- cgit v1.2.3 From 6684ba202b5ab2f36d574c72fe50c207d99b3e35 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 17:38:00 -0800 Subject: compat: Add helper functions to read/write struct timeval, timespec Add helper functions to read and write struct timeval and struct timespec from userspace. We already had helper functions for reading and writing struct compat_timespec; add a set of functions to do the same with struct timeval, and add a second suite of functions which can be sensitive to COMPAT_USE_64BIT_TIME and access either 32- or 64-bit time structures. This also exports these helper functions to modules. Rename the existing inlines for converting between struct compat_timeval and native struct timespec so we can have a saner naming convention for the exported functions. Suggested-by: Linus Torvalds Signed-off-by: H. 
Peter Anvin --- include/linux/compat.h | 16 ++++++++++++ kernel/compat.c | 68 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 76 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 1be91c048249..a82e452bbdb9 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -87,10 +87,26 @@ typedef struct { compat_sigset_word sig[_COMPAT_NSIG_WORDS]; } compat_sigset_t; +/* + * These functions operate strictly on struct compat_time* + */ extern int get_compat_timespec(struct timespec *, const struct compat_timespec __user *); extern int put_compat_timespec(const struct timespec *, struct compat_timespec __user *); +extern int get_compat_timeval(struct timeval *, + const struct compat_timeval __user *); +extern int put_compat_timeval(const struct timeval *, + struct compat_timeval __user *); +/* + * These functions operate on 32- or 64-bit specs depending on + * COMPAT_USE_64BIT_TIME, hence the void user pointer arguments and the + * naming as compat_get/put_ rather than get/put_compat_. + */ +extern int compat_get_timespec(struct timespec *, const void __user *); +extern int compat_put_timespec(const struct timespec *, void __user *); +extern int compat_get_timeval(struct timeval *, const void __user *); +extern int compat_put_timeval(const struct timeval *, void __user *); struct compat_iovec { compat_uptr_t iov_base; diff --git a/kernel/compat.c b/kernel/compat.c index f346cedfe24d..74ff8498809a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -31,11 +31,10 @@ #include /* - * Note that the native side is already converted to a timespec, because - * that's what we want anyway. + * Get/set struct timeval with struct timespec on the native side */ -static int compat_get_timeval(struct timespec *o, - struct compat_timeval __user *i) +static int compat_get_timeval_convert(struct timespec *o, + struct compat_timeval __user *i) { long usec; @@ -46,8 +45,8 @@ static int compat_get_timeval(struct timespec *o, return 0; } -static int compat_put_timeval(struct compat_timeval __user *o, - struct timeval *i) +static int compat_put_timeval_convert(struct compat_timeval __user *o, + struct timeval *i) { return (put_user(i->tv_sec, &o->tv_sec) || put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; @@ -117,7 +116,7 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, if (tv) { struct timeval ktv; do_gettimeofday(&ktv); - if (compat_put_timeval(tv, &ktv)) + if (compat_put_timeval_convert(tv, &ktv)) return -EFAULT; } if (tz) { @@ -135,7 +134,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, struct timezone ktz; if (tv) { - if (compat_get_timeval(&kts, tv)) + if (compat_get_timeval_convert(&kts, tv)) return -EFAULT; } if (tz) { @@ -146,12 +145,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); } +int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) +{ + return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || + __get_user(tv->tv_sec, &ctv->tv_sec) || + __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(get_compat_timeval); + +int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) +{ + return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || + __put_user(tv->tv_sec, &ctv->tv_sec) || + __put_user(tv->tv_usec, &ctv->tv_usec)) ? 
-EFAULT : 0; +} +EXPORT_SYMBOL_GPL(put_compat_timeval); + int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } +EXPORT_SYMBOL_GPL(get_compat_timespec); int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) { @@ -161,6 +177,42 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user } EXPORT_SYMBOL_GPL(put_compat_timespec); +int compat_get_timeval(struct timeval *tv, const void __user *utv) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; + else + return get_compat_timeval(tv, utv); +} +EXPORT_SYMBOL_GPL(compat_get_timeval); + +int compat_put_timeval(const struct timeval *tv, void __user *utv) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; + else + return put_compat_timeval(tv, utv); +} +EXPORT_SYMBOL_GPL(compat_put_timeval); + +int compat_get_timespec(struct timespec *ts, const void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; + else + return get_compat_timespec(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_get_timespec); + +int compat_put_timespec(const struct timespec *ts, void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; + else + return put_compat_timespec(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_put_timespec); + static long compat_nanosleep_restart(struct restart_block *restart) { struct compat_timespec __user *rmtp; -- cgit v1.2.3 From 8c79a045fd590a26e81e75f5d8d4ec5c7d23e565 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Jan 2012 14:51:37 +0100 Subject: sched/events: Revert trace_sched_stat_sleeptime() Commit 1ac9bc69 ("sched/tracing: Add a new tracepoint for sleeptime") added a new sched:sched_stat_sleeptime tracepoint. It's broken: the first sample we get on a task might be bad because of a stale sleep_start value that wasn't reset at the last task switch because the tracepoint was not active. It also breaks the existing schedstat samples due to the side effects of: - se->statistics.sleep_start = 0; ... - se->statistics.block_start = 0; Nor do I see means to fix it without adding overhead to the scheduler fast path, which I'm not willing to for the sake of redundant instrumentation. Most importantly, sleep time information can already be constructed by tracing context switches and wakeups, and taking the timestamp difference between the schedule-out, the wakeup and the schedule-in. 
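For illustration, a minimal sketch of that reconstruction; the timestamps are made-up nanosecond values, not real trace data, and the arithmetic is the whole point.

  #include <stdio.h>

  int main(void)
  {
          unsigned long long sched_out = 1000000;  /* task left the CPU     */
          unsigned long long wakeup    = 4000000;  /* task became runnable  */
          unsigned long long sched_in  = 4500000;  /* task got the CPU back */

          printf("sleep time:    %llu ns\n", wakeup - sched_out);
          printf("runqueue wait: %llu ns\n", sched_in - wakeup);
          printf("total off-CPU: %llu ns\n", sched_in - sched_out);
          return 0;
  }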
Signed-off-by: Peter Zijlstra Cc: Andrew Vagin Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-pc4c9qhl8q6vg3bs4j6k0rbd@git.kernel.org Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 50 -------------------------------------------- kernel/sched/core.c | 1 - kernel/sched/fair.c | 2 ++ 3 files changed, 2 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 6ba596b07a72..e33ed1bfa113 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -370,56 +370,6 @@ TRACE_EVENT(sched_stat_runtime, (unsigned long long)__entry->vruntime) ); -#ifdef CREATE_TRACE_POINTS -static inline u64 trace_get_sleeptime(struct task_struct *tsk) -{ -#ifdef CONFIG_SCHEDSTATS - u64 block, sleep; - - block = tsk->se.statistics.block_start; - sleep = tsk->se.statistics.sleep_start; - tsk->se.statistics.block_start = 0; - tsk->se.statistics.sleep_start = 0; - - return block ? block : sleep ? sleep : 0; -#else - return 0; -#endif -} -#endif - -/* - * Tracepoint for accounting sleeptime (time the task is sleeping - * or waiting for I/O). - */ -TRACE_EVENT(sched_stat_sleeptime, - - TP_PROTO(struct task_struct *tsk, u64 now), - - TP_ARGS(tsk, now), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( u64, sleeptime ) - ), - - TP_fast_assign( - memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); - __entry->pid = tsk->pid; - __entry->sleeptime = trace_get_sleeptime(tsk); - __entry->sleeptime = __entry->sleeptime ? - now - __entry->sleeptime : 0; - ) - TP_perf_assign( - __perf_count(__entry->sleeptime); - ), - - TP_printk("comm=%s pid=%d sleeptime=%Lu [ns]", - __entry->comm, __entry->pid, - (unsigned long long)__entry->sleeptime) -); - /* * Tracepoint for showing priority inheritance modifying a tasks * priority. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..b342f57879e6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1932,7 +1932,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); - trace_sched_stat_sleeptime(current, rq->clock); fire_sched_in_preempt_notifiers(current); if (mm) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414fc669d..aca16b843b7e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1003,6 +1003,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.sleep_max)) se->statistics.sleep_max = delta; + se->statistics.sleep_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { @@ -1019,6 +1020,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.block_max)) se->statistics.block_max = delta; + se->statistics.block_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { -- cgit v1.2.3 From d80e731ecab420ddcb79ee9d0ac427acbc187b4b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 24 Feb 2012 20:07:11 +0100 Subject: epoll: introduce POLLFREE to flush ->signalfd_wqh before kfree() This patch is intentionally incomplete to simplify the review. It ignores ep_unregister_pollwait() which plays with the same wqh. See the next change. epoll assumes that the EPOLL_CTL_ADD'ed file controls everything f_op->poll() needs. 
In particular it assumes that the wait queue can't go away until eventpoll_release(). This is not true in case of signalfd, the task which does EPOLL_CTL_ADD uses its ->sighand which is not connected to the file. This patch adds the special event, POLLFREE, currently only for epoll. It expects that init_poll_funcptr()'ed hook should do the necessary cleanup. Perhaps it should be defined as EPOLLFREE in eventpoll. __cleanup_sighand() is changed to do wake_up_poll(POLLFREE) if ->signalfd_wqh is not empty, we add the new signalfd_cleanup() helper. ep_poll_callback(POLLFREE) simply does list_del_init(task_list). This make this poll entry inconsistent, but we don't care. If you share epoll fd which contains our sigfd with another process you should blame yourself. signalfd is "really special". I simply do not know how we can define the "right" semantics if it used with epoll. The main problem is, epoll calls signalfd_poll() once to establish the connection with the wait queue, after that signalfd_poll(NULL) returns the different/inconsistent results depending on who does EPOLL_CTL_MOD/signalfd_read/etc. IOW: apart from sigmask, signalfd has nothing to do with the file, it works with the current thread. In short: this patch is the hack which tries to fix the symptoms. It also assumes that nobody can take tasklist_lock under epoll locks, this seems to be true. Note: - we do not have wake_up_all_poll() but wake_up_poll() is fine, poll/epoll doesn't use WQ_FLAG_EXCLUSIVE. - signalfd_cleanup() uses POLLHUP along with POLLFREE, we need a couple of simple changes in eventpoll.c to make sure it can't be "lost". Reported-by: Maxime Bizon Cc: Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++++ fs/signalfd.c | 11 +++++++++++ include/asm-generic/poll.h | 2 ++ include/linux/signalfd.h | 5 ++++- kernel/fork.c | 5 ++++- 5 files changed, 25 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index aabdfc38cf24..34bbfc6dd8dc 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -842,6 +842,10 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; + /* the caller holds eppoll_entry->whead->lock */ + if ((unsigned long)key & POLLFREE) + list_del_init(&wait->task_list); + spin_lock_irqsave(&ep->lock, flags); /* diff --git a/fs/signalfd.c b/fs/signalfd.c index 492465b451dd..79c1eea98a3a 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -30,6 +30,17 @@ #include #include +void signalfd_cleanup(struct sighand_struct *sighand) +{ + wait_queue_head_t *wqh = &sighand->signalfd_wqh; + + if (likely(!waitqueue_active(wqh))) + return; + + /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + wake_up_poll(wqh, POLLHUP | POLLFREE); +} + struct signalfd_ctx { sigset_t sigmask; }; diff --git a/include/asm-generic/poll.h b/include/asm-generic/poll.h index 44bce836d350..9ce7f44aebd2 100644 --- a/include/asm-generic/poll.h +++ b/include/asm-generic/poll.h @@ -28,6 +28,8 @@ #define POLLRDHUP 0x2000 #endif +#define POLLFREE 0x4000 /* currently only for epoll */ + struct pollfd { int fd; short events; diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h index 3ff4961da9b5..247399b2979a 100644 --- a/include/linux/signalfd.h +++ b/include/linux/signalfd.h @@ -61,13 +61,16 @@ static inline void signalfd_notify(struct task_struct *tsk, int sig) wake_up(&tsk->sighand->signalfd_wqh); } +extern void signalfd_cleanup(struct 
sighand_struct *sighand); + #else /* CONFIG_SIGNALFD */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { } +static inline void signalfd_cleanup(struct sighand_struct *sighand) { } + #endif /* CONFIG_SIGNALFD */ #endif /* __KERNEL__ */ #endif /* _LINUX_SIGNALFD_H */ - diff --git a/kernel/fork.c b/kernel/fork.c index b77fd559c78e..e2cd3e2a5ae8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -935,8 +936,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_sighand(struct sighand_struct *sighand) { - if (atomic_dec_and_test(&sighand->count)) + if (atomic_dec_and_test(&sighand->count)) { + signalfd_cleanup(sighand); kmem_cache_free(sighand_cachep, sighand); + } } -- cgit v1.2.3 From 8f2f748b0656257153bcf0941df8d6060acc5ca6 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 23 Feb 2012 15:27:15 +0530 Subject: CPU hotplug, cpusets, suspend: Don't touch cpusets during suspend/resume Currently, during CPU hotplug, the cpuset callbacks modify the cpusets to reflect the state of the system, and this handling is asymmetric. That is, upon CPU offline, that CPU is removed from all cpusets. However when it comes back online, it is put back only to the root cpuset. This gives rise to a significant problem during suspend/resume. During suspend, we offline all non-boot cpus and during resume we online them back. Which means, after a resume, all cpusets (except the root cpuset) will be restricted to just one single CPU (the boot cpu). But the whole point of suspend/resume is to restore the system to a state which is as close as possible to how it was before suspend. So to fix this, don't touch cpusets during suspend/resume. That is, modify the cpuset-related CPU hotplug callback to just ignore CPU hotplug when it is initiated as part of the suspend/resume sequence. Reported-by: Prashanth Nageshappa Signed-off-by: Srivatsa S. Bhat Cc: Linus Torvalds Cc: Andrew Morton Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/4F460D7B.1020703@linux.vnet.ibm.com Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b342f57879e6..33a0676ea744 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6728,7 +6728,7 @@ int __init sched_create_sysfs_power_savings_entries(struct device *dev) static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_ONLINE: case CPU_DOWN_FAILED: cpuset_update_active_cpus(); @@ -6741,7 +6741,7 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_DOWN_PREPARE: cpuset_update_active_cpus(); return NOTIFY_OK; -- cgit v1.2.3 From 30ce2f7eef095d1b8d070740f1948629814fe3c7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 28 Feb 2012 10:19:38 +0900 Subject: perf/hwbp: Fix a possible memory leak If kzalloc() for TYPE_DATA failed on a given cpu, previous chunk of TYPE_INST will be leaked. Fix it. Thanks to Peter Zijlstra for suggesting this better solution. It should work as long as the initial value of the region is all 0's and that's the case of static (per-cpu) memory allocation. 
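A hedged, self-contained model of that unwind pattern is sketched below; the array sizes, chunk[][] and failed_cpu are invented for the illustration. The point is that the "stop" check must run only after the failing element's partial allocations have been freed, which is what moving the break below the free loop achieves.

  #include <stdio.h>
  #include <stdlib.h>

  #define NCPU    4
  #define NTYPE   2

  static void *chunk[NCPU][NTYPE];        /* zero-initialized */

  int main(void)
  {
          int cpu, type, failed_cpu = -1;

          for (cpu = 0; cpu < NCPU; cpu++) {
                  for (type = 0; type < NTYPE; type++) {
                          /* simulate ENOMEM for cpu 2, second type */
                          chunk[cpu][type] = (cpu == 2 && type == 1)
                                             ? NULL : malloc(64);
                          if (!chunk[cpu][type]) {
                                  failed_cpu = cpu;
                                  goto err_alloc;
                          }
                  }
          }
          return 0;       /* success: a real program would free these too */

  err_alloc:
          for (cpu = 0; cpu < NCPU; cpu++) {
                  for (type = 0; type < NTYPE; type++)
                          free(chunk[cpu][type]); /* free(NULL) is a no-op */
                  if (cpu == failed_cpu)
                          break;  /* stop only after the failing cpu's chunks */
          }
          return 1;
  }

Breaking before the inner free loop, as the old code effectively did, would leak the chunks already allocated for the failing element.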
Signed-off-by: Namhyung Kim Acked-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1330391978-28070-1-git-send-email-namhyung.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b7971d6f38bf..ee706ce44aa0 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -651,10 +651,10 @@ int __init init_hw_breakpoint(void) err_alloc: for_each_possible_cpu(err_cpu) { - if (err_cpu == cpu) - break; for (i = 0; i < TYPE_MAX; i++) kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + if (err_cpu == cpu) + break; } return -ENOMEM; -- cgit v1.2.3