From 8d9df9f0844ed87541453a3ef91bfc9f487053b7 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Mon, 16 Aug 2010 09:54:28 +0200 Subject: workqueue: free rescuer on destroy_workqueue wq->rescuer is not freed when wq is destroyed, leads a memory leak then. This patch also remove a redundant line. Signed-off-by: Xiaotian Feng Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2994a0e3a61c..1001b6e3fcbd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2782,7 +2782,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, if (IS_ERR(rescuer->task)) goto err; - wq->rescuer = rescuer; rescuer->task->flags |= PF_THREAD_BOUND; wake_up_process(rescuer->task); } @@ -2848,6 +2847,7 @@ void destroy_workqueue(struct workqueue_struct *wq) if (wq->flags & WQ_RESCUER) { kthread_stop(wq->rescuer->task); free_mayday_mask(wq->mayday_mask); + kfree(wq->rescuer); } free_cwqs(wq); -- cgit v1.2.3 From 1495cc9df4e81f5a8fa9b0b8f1034b14d24b7d8c Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 17 Aug 2010 21:15:46 -0700 Subject: Input: sysrq - drop tty argument from sysrq ops handlers Noone is using tty argument so let's get rid of it. Acked-by: Alan Cox Acked-by: Jason Wessel Acked-by: Greg Kroah-Hartman Signed-off-by: Dmitry Torokhov --- arch/arm/kernel/etm.c | 2 +- arch/powerpc/xmon/xmon.c | 5 ++--- arch/sparc/kernel/process_64.c | 2 +- drivers/char/sysrq.c | 42 ++++++++++++++++++++--------------------- drivers/gpu/drm/drm_fb_helper.c | 2 +- drivers/net/ibm_newemac/debug.c | 2 +- include/linux/sysrq.h | 6 +++++- kernel/debug/debug_core.c | 2 +- kernel/power/poweroff.c | 2 +- 9 files changed, 34 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/etm.c b/arch/arm/kernel/etm.c index 56418f98cd01..33c7077174db 100644 --- a/arch/arm/kernel/etm.c +++ b/arch/arm/kernel/etm.c @@ -230,7 +230,7 @@ static void etm_dump(void) etb_lock(t); } -static void sysrq_etm_dump(int key, struct tty_struct *tty) +static void sysrq_etm_dump(int key) { dev_dbg(tracer.dev, "Dumping ETB buffer\n"); etm_dump(); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 0554445200bf..d17d04cfb2cd 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2880,15 +2880,14 @@ static void xmon_init(int enable) } #ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_xmon(int key, struct tty_struct *tty) +static void sysrq_handle_xmon(int key) { /* ensure xmon is enabled */ xmon_init(1); debugger(get_irq_regs()); } -static struct sysrq_key_op sysrq_xmon_op = -{ +static struct sysrq_key_op sysrq_xmon_op = { .handler = sysrq_handle_xmon, .help_msg = "Xmon", .action_msg = "Entering xmon", diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index dbe81a368b45..25b01b43b40d 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -303,7 +303,7 @@ void arch_trigger_all_cpu_backtrace(void) #ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_globreg(int key, struct tty_struct *tty) +static void sysrq_handle_globreg(int key) { arch_trigger_all_cpu_backtrace(); } diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 878ac0c2cc68..a892a3c249dd 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -76,7 +76,7 @@ static int __init sysrq_always_enabled_setup(char *str) __setup("sysrq_always_enabled", sysrq_always_enabled_setup); -static void sysrq_handle_loglevel(int key, struct tty_struct *tty) +static void sysrq_handle_loglevel(int key) { int i; @@ -93,7 +93,7 @@ static struct sysrq_key_op sysrq_loglevel_op = { }; #ifdef CONFIG_VT -static void sysrq_handle_SAK(int key, struct tty_struct *tty) +static void sysrq_handle_SAK(int key) { struct work_struct *SAK_work = &vc_cons[fg_console].SAK_work; schedule_work(SAK_work); @@ -109,7 +109,7 @@ static struct sysrq_key_op sysrq_SAK_op = { #endif #ifdef CONFIG_VT -static void sysrq_handle_unraw(int key, struct tty_struct *tty) +static void sysrq_handle_unraw(int key) { struct kbd_struct *kbd = &kbd_table[fg_console]; @@ -126,7 +126,7 @@ static struct sysrq_key_op sysrq_unraw_op = { #define sysrq_unraw_op (*(struct sysrq_key_op *)NULL) #endif /* CONFIG_VT */ -static void sysrq_handle_crash(int key, struct tty_struct *tty) +static void sysrq_handle_crash(int key) { char *killer = NULL; @@ -141,7 +141,7 @@ static struct sysrq_key_op sysrq_crash_op = { .enable_mask = SYSRQ_ENABLE_DUMP, }; -static void sysrq_handle_reboot(int key, struct tty_struct *tty) +static void sysrq_handle_reboot(int key) { lockdep_off(); local_irq_enable(); @@ -154,7 +154,7 @@ static struct sysrq_key_op sysrq_reboot_op = { .enable_mask = SYSRQ_ENABLE_BOOT, }; -static void sysrq_handle_sync(int key, struct tty_struct *tty) +static void sysrq_handle_sync(int key) { emergency_sync(); } @@ -165,7 +165,7 @@ static struct sysrq_key_op sysrq_sync_op = { .enable_mask = SYSRQ_ENABLE_SYNC, }; -static void sysrq_handle_show_timers(int key, struct tty_struct *tty) +static void sysrq_handle_show_timers(int key) { sysrq_timer_list_show(); } @@ -176,7 +176,7 @@ static struct sysrq_key_op sysrq_show_timers_op = { .action_msg = "Show clockevent devices & pending hrtimers (no others)", }; -static void sysrq_handle_mountro(int key, struct tty_struct *tty) +static void sysrq_handle_mountro(int key) { emergency_remount(); } @@ -188,7 +188,7 @@ static struct sysrq_key_op sysrq_mountro_op = { }; #ifdef CONFIG_LOCKDEP -static void sysrq_handle_showlocks(int key, struct tty_struct *tty) +static void sysrq_handle_showlocks(int key) { debug_show_all_locks(); } @@ -226,7 +226,7 @@ static void sysrq_showregs_othercpus(struct work_struct *dummy) static DECLARE_WORK(sysrq_showallcpus, sysrq_showregs_othercpus); -static void sysrq_handle_showallcpus(int key, struct tty_struct *tty) +static void sysrq_handle_showallcpus(int key) { /* * Fall back to the workqueue based printing if the @@ -252,7 +252,7 @@ static struct sysrq_key_op sysrq_showallcpus_op = { }; #endif -static void sysrq_handle_showregs(int key, struct tty_struct *tty) +static void sysrq_handle_showregs(int key) { struct pt_regs *regs = get_irq_regs(); if (regs) @@ -266,7 +266,7 @@ static struct sysrq_key_op sysrq_showregs_op = { .enable_mask = SYSRQ_ENABLE_DUMP, }; -static void sysrq_handle_showstate(int key, struct tty_struct *tty) +static void sysrq_handle_showstate(int key) { show_state(); } @@ -277,7 +277,7 @@ static struct sysrq_key_op sysrq_showstate_op = { .enable_mask = SYSRQ_ENABLE_DUMP, }; -static void sysrq_handle_showstate_blocked(int key, struct tty_struct *tty) +static void sysrq_handle_showstate_blocked(int key) { show_state_filter(TASK_UNINTERRUPTIBLE); } @@ -291,7 +291,7 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = { #ifdef CONFIG_TRACING #include -static void sysrq_ftrace_dump(int key, struct tty_struct *tty) +static void sysrq_ftrace_dump(int key) { ftrace_dump(DUMP_ALL); } @@ -305,7 +305,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = { #define sysrq_ftrace_dump_op (*(struct sysrq_key_op *)NULL) #endif -static void sysrq_handle_showmem(int key, struct tty_struct *tty) +static void sysrq_handle_showmem(int key) { show_mem(); } @@ -330,7 +330,7 @@ static void send_sig_all(int sig) } } -static void sysrq_handle_term(int key, struct tty_struct *tty) +static void sysrq_handle_term(int key) { send_sig_all(SIGTERM); console_loglevel = 8; @@ -349,7 +349,7 @@ static void moom_callback(struct work_struct *ignored) static DECLARE_WORK(moom_work, moom_callback); -static void sysrq_handle_moom(int key, struct tty_struct *tty) +static void sysrq_handle_moom(int key) { schedule_work(&moom_work); } @@ -361,7 +361,7 @@ static struct sysrq_key_op sysrq_moom_op = { }; #ifdef CONFIG_BLOCK -static void sysrq_handle_thaw(int key, struct tty_struct *tty) +static void sysrq_handle_thaw(int key) { emergency_thaw_all(); } @@ -373,7 +373,7 @@ static struct sysrq_key_op sysrq_thaw_op = { }; #endif -static void sysrq_handle_kill(int key, struct tty_struct *tty) +static void sysrq_handle_kill(int key) { send_sig_all(SIGKILL); console_loglevel = 8; @@ -385,7 +385,7 @@ static struct sysrq_key_op sysrq_kill_op = { .enable_mask = SYSRQ_ENABLE_SIGNAL, }; -static void sysrq_handle_unrt(int key, struct tty_struct *tty) +static void sysrq_handle_unrt(int key) { normalize_rt_tasks(); } @@ -520,7 +520,7 @@ void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) if (!check_mask || sysrq_on_mask(op_p->enable_mask)) { printk("%s\n", op_p->action_msg); console_loglevel = orig_log_level; - op_p->handler(key, tty); + op_p->handler(key); } else { printk("This sysrq operation is disabled.\n"); } diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index de82e201d682..5efd6d6742ec 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -369,7 +369,7 @@ static void drm_fb_helper_restore_work_fn(struct work_struct *ignored) } static DECLARE_WORK(drm_fb_helper_restore_work, drm_fb_helper_restore_work_fn); -static void drm_fb_helper_sysrq(int dummy1, struct tty_struct *dummy3) +static void drm_fb_helper_sysrq(int dummy1) { schedule_work(&drm_fb_helper_restore_work); } diff --git a/drivers/net/ibm_newemac/debug.c b/drivers/net/ibm_newemac/debug.c index 3995fafc1e08..8c6c1e2a8750 100644 --- a/drivers/net/ibm_newemac/debug.c +++ b/drivers/net/ibm_newemac/debug.c @@ -238,7 +238,7 @@ void emac_dbg_dump_all(void) } #if defined(CONFIG_MAGIC_SYSRQ) -static void emac_sysrq_handler(int key, struct tty_struct *tty) +static void emac_sysrq_handler(int key) { emac_dbg_dump_all(); } diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 609e8ca5f534..4ee650315119 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -31,7 +31,7 @@ struct tty_struct; #define SYSRQ_ENABLE_RTNICE 0x0100 struct sysrq_key_op { - void (*handler)(int, struct tty_struct *); + void (*handler)(int); char *help_msg; char *action_msg; int enable_mask; @@ -58,6 +58,10 @@ static inline void handle_sysrq(int key, struct tty_struct *tty) { } +static inline void __handle_sysrq(int key, struct tty_struct *tty, int check_mask); +{ +} + static inline int register_sysrq_key(int key, struct sysrq_key_op *op) { return -EINVAL; diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 3c2d4972d235..de407c78178d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -741,7 +741,7 @@ static struct console kgdbcons = { }; #ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_dbg(int key, struct tty_struct *tty) +static void sysrq_handle_dbg(int key) { if (!dbg_io_ops) { printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index e8b337006276..d52359374e85 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -24,7 +24,7 @@ static void do_poweroff(struct work_struct *dummy) static DECLARE_WORK(poweroff_work, do_poweroff); -static void handle_poweroff(int key, struct tty_struct *tty) +static void handle_poweroff(int key) { /* run sysrq poweroff on boot cpu */ schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); -- cgit v1.2.3 From 861d034ee814917a83bd5de4b26e3b8336ddeeb8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Aug 2010 13:31:43 +0200 Subject: sched: Fix rq->clock synchronization when migrating tasks sched_fork() -- we do task placement in ->task_fork_fair() ensure we update_rq_clock() so we work with current time. We leave the vruntime in relative state, so the time delay until wake_up_new_task() doesn't matter. wake_up_new_task() -- Since task_fork_fair() left p->vruntime in relative state we can safely migrate, the activate_task() on the remote rq will call update_rq_clock() and causes the clock to be synced (enough). Tested-by: Jack Daniel Tested-by: Philby John Signed-off-by: Peter Zijlstra LKML-Reference: <1281002322.1923.1708.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 806d1b227a21..ab661ebc4895 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3752,6 +3752,8 @@ static void task_fork_fair(struct task_struct *p) raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + if (unlikely(task_cpu(p) != this_cpu)) __set_task_cpu(p, this_cpu); -- cgit v1.2.3 From f335397d177c906256ee1bba28e8c49e8ec63817 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 17 Aug 2010 21:15:47 -0700 Subject: Input: sysrq - drop tty argument form handle_sysrq() Sysrq operations do not accept tty argument anymore so no need to pass it to us. [Stephen Rothwell : fix build breakage in drm code caused by sysrq using bool but not including linux/types.h] [Sachin Sant : fix build breakage in s390 keyboadr driver] Acked-by: Alan Cox Acked-by: Jason Wessel Acked-by: Greg Kroah-Hartman Signed-off-by: Dmitry Torokhov --- arch/ia64/hp/sim/simserial.c | 2 +- arch/um/drivers/mconsole_kern.c | 2 +- drivers/char/hangcheck-timer.c | 2 +- drivers/char/hvc_console.c | 2 +- drivers/char/hvsi.c | 2 +- drivers/char/sysrq.c | 11 +++++------ drivers/s390/char/ctrlchar.c | 4 +--- drivers/s390/char/keyboard.c | 2 +- drivers/serial/sn_console.c | 2 +- drivers/usb/serial/generic.c | 2 +- drivers/xen/manage.c | 2 +- include/linux/serial_core.h | 2 +- include/linux/sysrq.h | 12 +++++------- kernel/debug/kdb/kdb_main.c | 2 +- 14 files changed, 22 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c index 2bef5261d96d..1e8d71ad93ef 100644 --- a/arch/ia64/hp/sim/simserial.c +++ b/arch/ia64/hp/sim/simserial.c @@ -149,7 +149,7 @@ static void receive_chars(struct tty_struct *tty) ch = ia64_ssc(0, 0, 0, 0, SSC_GETCHAR); while (!ch); - handle_sysrq(ch, NULL); + handle_sysrq(ch); } #endif seen_esc = 0; diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index de317d0c3294..ebc680717e59 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -690,7 +690,7 @@ static void with_console(struct mc_request *req, void (*proc)(void *), static void sysrq_proc(void *arg) { char *op = arg; - handle_sysrq(*op, NULL); + handle_sysrq(*op); } void mconsole_sysrq(struct mc_request *req) diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c index e0249722d25f..f953c96efc86 100644 --- a/drivers/char/hangcheck-timer.c +++ b/drivers/char/hangcheck-timer.c @@ -159,7 +159,7 @@ static void hangcheck_fire(unsigned long data) if (hangcheck_dump_tasks) { printk(KERN_CRIT "Hangcheck: Task state:\n"); #ifdef CONFIG_MAGIC_SYSRQ - handle_sysrq('t', NULL); + handle_sysrq('t'); #endif /* CONFIG_MAGIC_SYSRQ */ } if (hangcheck_reboot) { diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c index fa27d1676ee5..3afd62e856eb 100644 --- a/drivers/char/hvc_console.c +++ b/drivers/char/hvc_console.c @@ -651,7 +651,7 @@ int hvc_poll(struct hvc_struct *hp) if (sysrq_pressed) continue; } else if (sysrq_pressed) { - handle_sysrq(buf[i], tty); + handle_sysrq(buf[i]); sysrq_pressed = 0; continue; } diff --git a/drivers/char/hvsi.c b/drivers/char/hvsi.c index 1f4b6de65a2d..a2bc885ce60a 100644 --- a/drivers/char/hvsi.c +++ b/drivers/char/hvsi.c @@ -403,7 +403,7 @@ static void hvsi_insert_chars(struct hvsi_struct *hp, const char *buf, int len) hp->sysrq = 1; continue; } else if (hp->sysrq) { - handle_sysrq(c, hp->tty); + handle_sysrq(c); hp->sysrq = 0; continue; } diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index a892a3c249dd..ef31bb81e843 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -493,7 +492,7 @@ static void __sysrq_put_key_op(int key, struct sysrq_key_op *op_p) sysrq_key_table[i] = op_p; } -void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) +void __handle_sysrq(int key, bool check_mask) { struct sysrq_key_op *op_p; int orig_log_level; @@ -545,10 +544,10 @@ void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) spin_unlock_irqrestore(&sysrq_key_table_lock, flags); } -void handle_sysrq(int key, struct tty_struct *tty) +void handle_sysrq(int key) { if (sysrq_on()) - __handle_sysrq(key, tty, 1); + __handle_sysrq(key, true); } EXPORT_SYMBOL(handle_sysrq); @@ -597,7 +596,7 @@ static bool sysrq_filter(struct input_handle *handle, unsigned int type, default: if (sysrq_down && value && value != 2) - __handle_sysrq(sysrq_xlate[code], NULL, 1); + __handle_sysrq(sysrq_xlate[code], true); break; } @@ -765,7 +764,7 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, if (get_user(c, buf)) return -EFAULT; - __handle_sysrq(c, NULL, 0); + __handle_sysrq(c, false); } return count; diff --git a/drivers/s390/char/ctrlchar.c b/drivers/s390/char/ctrlchar.c index c6cbcb3f925e..0e9a309b9669 100644 --- a/drivers/s390/char/ctrlchar.c +++ b/drivers/s390/char/ctrlchar.c @@ -16,12 +16,11 @@ #ifdef CONFIG_MAGIC_SYSRQ static int ctrlchar_sysrq_key; -static struct tty_struct *sysrq_tty; static void ctrlchar_handle_sysrq(struct work_struct *work) { - handle_sysrq(ctrlchar_sysrq_key, sysrq_tty); + handle_sysrq(ctrlchar_sysrq_key); } static DECLARE_WORK(ctrlchar_work, ctrlchar_handle_sysrq); @@ -54,7 +53,6 @@ ctrlchar_handle(const unsigned char *buf, int len, struct tty_struct *tty) /* racy */ if (len == 3 && buf[1] == '-') { ctrlchar_sysrq_key = buf[2]; - sysrq_tty = tty; schedule_work(&ctrlchar_work); return CTRLCHAR_SYSRQ; } diff --git a/drivers/s390/char/keyboard.c b/drivers/s390/char/keyboard.c index 18d9a497863b..8cd58e412b5e 100644 --- a/drivers/s390/char/keyboard.c +++ b/drivers/s390/char/keyboard.c @@ -305,7 +305,7 @@ kbd_keycode(struct kbd_data *kbd, unsigned int keycode) if (kbd->sysrq) { if (kbd->sysrq == K(KT_LATIN, '-')) { kbd->sysrq = 0; - handle_sysrq(value, kbd->tty); + handle_sysrq(value); return; } if (value == '-') { diff --git a/drivers/serial/sn_console.c b/drivers/serial/sn_console.c index 7e5e5efea4e2..cff9a306660f 100644 --- a/drivers/serial/sn_console.c +++ b/drivers/serial/sn_console.c @@ -492,7 +492,7 @@ sn_receive_chars(struct sn_cons_port *port, unsigned long flags) sysrq_requested = 0; if (ch && time_before(jiffies, sysrq_timeout)) { spin_unlock_irqrestore(&port->sc_port.lock, flags); - handle_sysrq(ch, NULL); + handle_sysrq(ch); spin_lock_irqsave(&port->sc_port.lock, flags); /* ignore actual sysrq command char */ continue; diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index ca92f67747cc..1e846cc3c7a4 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -453,7 +453,7 @@ int usb_serial_handle_sysrq_char(struct tty_struct *tty, { if (port->sysrq && port->port.console) { if (ch && time_before(jiffies, port->sysrq)) { - handle_sysrq(ch, tty); + handle_sysrq(ch); port->sysrq = 0; return 1; } diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 1799bd890315..ef9c7db52077 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -237,7 +237,7 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec, goto again; if (sysrq_key != '\0') - handle_sysrq(sysrq_key, NULL); + handle_sysrq(sysrq_key); } static struct xenbus_watch sysrq_watch = { diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 3c2ad99fed34..64458a9a8938 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -465,7 +465,7 @@ uart_handle_sysrq_char(struct uart_port *port, unsigned int ch) #ifdef SUPPORT_SYSRQ if (port->sysrq) { if (ch && time_before(jiffies, port->sysrq)) { - handle_sysrq(ch, port->state->port.tty); + handle_sysrq(ch); port->sysrq = 0; return 1; } diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 4ee650315119..387fa7d05c98 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -15,9 +15,7 @@ #define _LINUX_SYSRQ_H #include - -struct pt_regs; -struct tty_struct; +#include /* Possible values of bitmask for enabling sysrq functions */ /* 0x0001 is reserved for enable everything */ @@ -44,8 +42,8 @@ struct sysrq_key_op { * are available -- else NULL's). */ -void handle_sysrq(int key, struct tty_struct *tty); -void __handle_sysrq(int key, struct tty_struct *tty, int check_mask); +void handle_sysrq(int key); +void __handle_sysrq(int key, bool check_mask); int register_sysrq_key(int key, struct sysrq_key_op *op); int unregister_sysrq_key(int key, struct sysrq_key_op *op); struct sysrq_key_op *__sysrq_get_key_op(int key); @@ -54,11 +52,11 @@ int sysrq_toggle_support(int enable_mask); #else -static inline void handle_sysrq(int key, struct tty_struct *tty) +static inline void handle_sysrq(int key) { } -static inline void __handle_sysrq(int key, struct tty_struct *tty, int check_mask); +static inline void __handle_sysrq(int key, bool check_mask) { } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 28b844118bbd..caf057a3de0e 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1929,7 +1929,7 @@ static int kdb_sr(int argc, const char **argv) if (argc != 1) return KDB_ARGCOUNT; kdb_trap_printk++; - __handle_sysrq(*argv[1], NULL, 0); + __handle_sysrq(*argv[1], false); kdb_trap_printk--; return 0; -- cgit v1.2.3 From c6db67cda735d8ace5f19c3831240e1408679790 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Aug 2010 11:49:15 +0200 Subject: watchdog: Don't throttle the watchdog Stephane reported that when the machine locks up, the regular ticks, which are responsible to resetting the throttle count, stop too. Hence the NMI watchdog can end up being throttled before it reports on the locked up state, and we end up being sad.. Cure this by having the watchdog overflow reset its own throttle count. Reported-by: Stephane Eranian Tested-by: Stephane Eranian Cc: Don Zickus Cc: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1282215916.1926.4696.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 613bc1f04610..0d53c8e853b1 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -206,6 +206,9 @@ void watchdog_overflow_callback(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { + /* Ensure the watchdog never gets throttled */ + event->hw.interrupts = 0; + if (__get_cpu_var(watchdog_nmi_touch) == true) { __get_cpu_var(watchdog_nmi_touch) = false; return; -- cgit v1.2.3 From 9d0f4dcc5c4d1c5dd01172172684a45b5f49d740 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 18 Aug 2010 15:00:27 -0700 Subject: mutex: Improve the scalability of optimistic spinning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a scalability issue for current implementation of optimistic mutex spin in the kernel. It is found on a 8 node 64 core Nehalem-EX system (HT mode). The intention of the optimistic mutex spin is to busy wait and spin on a mutex if the owner of the mutex is running, in the hope that the mutex will be released soon and be acquired, without the thread trying to acquire mutex going to sleep. However, when we have a large number of threads, contending for the mutex, we could have the mutex grabbed by other thread, and then another ……, and we will keep spinning, wasting cpu cycles and adding to the contention. One possible fix is to quit spinning and put the current thread on wait-list if mutex lock switch to a new owner while we spin, indicating heavy contention (see the patch included). I did some testing on a 8 socket Nehalem-EX system with a total of 64 cores. Using Ingo's test-mutex program that creates/delete files with 256 threads (http://lkml.org/lkml/2006/1/8/50) , I see the following speed up after putting in the mutex spin fix: ./mutex-test V 256 10 Ops/sec 2.6.34 62864 With fix 197200 Repeating the test with Aim7 fserver workload, again there is a speed up with the fix: Jobs/min 2.6.34 91657 With fix 149325 To look at the impact on the distribution of mutex acquisition time, I collected the mutex acquisition time on Aim7 fserver workload with some instrumentation. The average acquisition time is reduced by 48% and number of contentions reduced by 32%. #contentions Time to acquire mutex (cycles) 2.6.34 72973 44765791 With fix 49210 23067129 The histogram of mutex acquisition time is listed below. The acquisition time is in 2^bin cycles. We see that without the fix, the acquisition time is mostly around 2^26 cycles. With the fix, we the distribution get spread out a lot more towards the lower cycles, starting from 2^13. However, there is an increase of the tail distribution with the fix at 2^28 and 2^29 cycles. It seems a small price to pay for the reduced average acquisition time and also getting the cpu to do useful work. Mutex acquisition time distribution (acq time = 2^bin cycles): 2.6.34 With Fix bin #occurrence % #occurrence % 11 2 0.00% 120 0.24% 12 10 0.01% 790 1.61% 13 14 0.02% 2058 4.18% 14 86 0.12% 3378 6.86% 15 393 0.54% 4831 9.82% 16 710 0.97% 4893 9.94% 17 815 1.12% 4667 9.48% 18 790 1.08% 5147 10.46% 19 580 0.80% 6250 12.70% 20 429 0.59% 6870 13.96% 21 311 0.43% 1809 3.68% 22 255 0.35% 2305 4.68% 23 317 0.44% 916 1.86% 24 610 0.84% 233 0.47% 25 3128 4.29% 95 0.19% 26 63902 87.69% 122 0.25% 27 619 0.85% 286 0.58% 28 0 0.00% 3536 7.19% 29 0 0.00% 903 1.83% 30 0 0.00% 0 0.00% I've done similar experiments with 2.6.35 kernel on smaller boxes as well. One is on a dual-socket Westmere box (12 cores total, with HT). Another experiment is on an old dual-socket Core 2 box (4 cores total, no HT) On the 12-core Westmere box, I see a 250% increase for Ingo's mutex-test program with my mutex patch but no significant difference in aim7's fserver workload. On the 4-core Core 2 box, I see the difference with the patch for both mutex-test and aim7 fserver are negligible. So far, it seems like the patch has not caused regression on smaller systems. Signed-off-by: Tim Chen Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Cc: Frederic Weisbecker Cc: # .35.x LKML-Reference: <1282168827.9542.72.camel@schen9-DESK> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 41541d79e3c8..09b574e7f4df 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3865,8 +3865,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) /* * Owner changed, break to re-assess state. */ - if (lock->owner != owner) + if (lock->owner != owner) { + /* + * If the lock has switched to a different owner, + * we likely have heavy contention. Return 0 to quit + * optimistic spinning and not contend further: + */ + if (lock->owner) + return 0; break; + } /* * Is that owner really running on that cpu? -- cgit v1.2.3 From 06bd6ebffae36d3b105677598c48e8bd0a10b205 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 22 Aug 2010 23:19:42 +0900 Subject: workqueue: annotate lock context change Some of internal functions called within gcwq->lock context releases and regrabs the lock but were missing proper annotations. Add it. Signed-off-by: Namhyung Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1001b6e3fcbd..7415f27a8aa7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1485,6 +1485,8 @@ static void gcwq_mayday_timeout(unsigned long __gcwq) * otherwise. */ static bool maybe_create_worker(struct global_cwq *gcwq) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) { if (!need_to_create_worker(gcwq)) return false; @@ -1722,6 +1724,8 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) * spin_lock_irq(gcwq->lock) which is released and regrabbed. */ static void process_one_work(struct worker *worker, struct work_struct *work) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) { struct cpu_workqueue_struct *cwq = get_work_cwq(work); struct global_cwq *gcwq = cwq->gcwq; @@ -3230,6 +3234,8 @@ static int __cpuinit trustee_thread(void *__gcwq) * multiple times. To be used by cpu_callback. */ static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) { if (!(gcwq->trustee_state == state || gcwq->trustee_state == TRUSTEE_DONE)) { -- cgit v1.2.3 From 972fa1c5316d18c8297123e08e9b6930ca34f888 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 22 Aug 2010 23:19:43 +0900 Subject: workqueue: mark lock acquisition on worker_maybe_bind_and_lock() worker_maybe_bind_and_lock() actually grabs gcwq->lock but was missing proper annotation. Add it. So this patch will remove following sparse warnings: kernel/workqueue.c:1214:13: warning: context imbalance in 'worker_maybe_bind_and_lock' - wrong count at exit arch/x86/include/asm/irqflags.h:44:9: warning: context imbalance in 'worker_rebind_fn' - unexpected unlock kernel/workqueue.c:1991:17: warning: context imbalance in 'rescuer_thread' - unexpected unlock Signed-off-by: Namhyung Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7415f27a8aa7..cc3456f96c56 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1212,6 +1212,7 @@ static void worker_leave_idle(struct worker *worker) * bound), %false if offline. */ static bool worker_maybe_bind_and_lock(struct worker *worker) +__acquires(&gcwq->lock) { struct global_cwq *gcwq = worker->gcwq; struct task_struct *task = worker->task; -- cgit v1.2.3 From e41e704bc4f49057fc68b643108366e6e6781aa3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Aug 2010 14:22:47 +0200 Subject: workqueue: improve destroy_workqueue() debuggability Now that the worklist is global, having works pending after wq destruction can easily lead to oops and destroy_workqueue() have several BUG_ON()s to catch these cases. Unfortunately, BUG_ON() doesn't tell much about how the work became pending after the final flush_workqueue(). This patch adds WQ_DYING which is set before the final flush begins. If a work is requested to be queued on a dying workqueue, WARN_ON_ONCE() is triggered and the request is ignored. This clearly indicates which caller is trying to queue a work on a dying workqueue and keeps the system working in most cases. Locking rule comment is updated such that the 'I' rule includes modifying the field from destruction path. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 ++ kernel/workqueue.c | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 4f9d277bcd9a..c959666eafca 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -241,6 +241,8 @@ enum { WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ + WQ_DYING = 1 << 6, /* internal: workqueue is dying */ + WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cc3456f96c56..362b50d092e2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -87,7 +87,8 @@ enum { /* * Structure fields follow one of the following exclusion rules. * - * I: Set during initialization and read-only afterwards. + * I: Modifiable by initialization/destruction paths and read-only for + * everyone else. * * P: Preemption protected. Disabling preemption is enough and should * only be modified and accessed from the local cpu. @@ -944,6 +945,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, debug_work_activate(work); + if (WARN_ON_ONCE(wq->flags & WQ_DYING)) + return; + /* determine gcwq to use */ if (!(wq->flags & WQ_UNBOUND)) { struct global_cwq *last_gcwq; @@ -2828,6 +2832,7 @@ void destroy_workqueue(struct workqueue_struct *wq) { unsigned int cpu; + wq->flags |= WQ_DYING; flush_workqueue(wq); /* -- cgit v1.2.3 From bac1e74dba9755128748b872a0f304dad4d198c6 Mon Sep 17 00:00:00 2001 From: David Alan Gilbert Date: Tue, 24 Aug 2010 20:22:18 +0200 Subject: PM QoS: Fix kzalloc() parameters swapped in pm_qos_power_open() sparse spotted that the kzalloc() in pm_qos_power_open() in the current Linus' git tree had its parameters swapped. Fix this. Signed-off-by: David Alan Gilbert Acked-by: mark gross Signed-off-by: Rafael J. Wysocki --- kernel/pm_qos_params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 996a4dec5f96..9da439b419aa 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -348,7 +348,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { - struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req)); + struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; -- cgit v1.2.3 From 8a2e8e5dec7e29c56a46ba176c664ab6a3d04118 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Aug 2010 10:33:56 +0200 Subject: workqueue: fix cwq->nr_active underflow cwq->nr_active is used to keep track of how many work items are active for the cpu workqueue, where 'active' is defined as either pending on global worklist or executing. This is used to implement the max_active limit and workqueue freezing. If a work item is queued after nr_active has already reached max_active, the work item doesn't increment nr_active and is put on the delayed queue and gets activated later as previous active work items retire. try_to_grab_pending() which is used in the cancellation path unconditionally decremented nr_active whether the work item being cancelled is currently active or delayed, so cancelling a delayed work item makes nr_active underflow. This breaks max_active enforcement and triggers BUG_ON() in destroy_workqueue() later on. This patch fixes this bug by adding a flag WORK_STRUCT_DELAYED, which is set while a work item in on the delayed list and making try_to_grab_pending() decrement nr_active iff the work item is currently active. The addition of the flag enlarges cwq alignment to 256 bytes which is getting a bit too large. It's scheduled to be reduced back to 128 bytes by merging WORK_STRUCT_PENDING and WORK_STRUCT_CWQ in the next devel cycle. Signed-off-by: Tejun Heo Reported-by: Johannes Berg --- include/linux/workqueue.h | 16 +++++++++------- kernel/workqueue.c | 30 ++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index c959666eafca..f11100f96482 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -25,18 +25,20 @@ typedef void (*work_func_t)(struct work_struct *work); enum { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ - WORK_STRUCT_CWQ_BIT = 1, /* data points to cwq */ - WORK_STRUCT_LINKED_BIT = 2, /* next work is linked to this one */ + WORK_STRUCT_DELAYED_BIT = 1, /* work item is delayed */ + WORK_STRUCT_CWQ_BIT = 2, /* data points to cwq */ + WORK_STRUCT_LINKED_BIT = 3, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK - WORK_STRUCT_STATIC_BIT = 3, /* static initializer (debugobjects) */ - WORK_STRUCT_COLOR_SHIFT = 4, /* color for workqueue flushing */ + WORK_STRUCT_STATIC_BIT = 4, /* static initializer (debugobjects) */ + WORK_STRUCT_COLOR_SHIFT = 5, /* color for workqueue flushing */ #else - WORK_STRUCT_COLOR_SHIFT = 3, /* color for workqueue flushing */ + WORK_STRUCT_COLOR_SHIFT = 4, /* color for workqueue flushing */ #endif WORK_STRUCT_COLOR_BITS = 4, WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, + WORK_STRUCT_DELAYED = 1 << WORK_STRUCT_DELAYED_BIT, WORK_STRUCT_CWQ = 1 << WORK_STRUCT_CWQ_BIT, WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -59,8 +61,8 @@ enum { /* * Reserve 7 bits off of cwq pointer w/ debugobjects turned - * off. This makes cwqs aligned to 128 bytes which isn't too - * excessive while allowing 15 workqueue flush colors. + * off. This makes cwqs aligned to 256 bytes and allows 15 + * workqueue flush colors. */ WORK_STRUCT_FLAG_BITS = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 362b50d092e2..a2dccfca03ba 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -941,6 +941,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct global_cwq *gcwq; struct cpu_workqueue_struct *cwq; struct list_head *worklist; + unsigned int work_flags; unsigned long flags; debug_work_activate(work); @@ -990,14 +991,17 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, BUG_ON(!list_empty(&work->entry)); cwq->nr_in_flight[cwq->work_color]++; + work_flags = work_color_to_flags(cwq->work_color); if (likely(cwq->nr_active < cwq->max_active)) { cwq->nr_active++; worklist = gcwq_determine_ins_pos(gcwq, cwq); - } else + } else { + work_flags |= WORK_STRUCT_DELAYED; worklist = &cwq->delayed_works; + } - insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color)); + insert_work(cwq, work, worklist, work_flags); spin_unlock_irqrestore(&gcwq->lock, flags); } @@ -1666,6 +1670,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); move_linked_works(work, pos, NULL); + __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); cwq->nr_active++; } @@ -1673,6 +1678,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight * @cwq: cwq of interest * @color: color of work which left the queue + * @delayed: for a delayed work * * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its cwq and handle workqueue flushing. @@ -1680,19 +1686,22 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) * CONTEXT: * spin_lock_irq(gcwq->lock). */ -static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) +static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, + bool delayed) { /* ignore uncolored works */ if (color == WORK_NO_COLOR) return; cwq->nr_in_flight[color]--; - cwq->nr_active--; - if (!list_empty(&cwq->delayed_works)) { - /* one down, submit a delayed one */ - if (cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); + if (!delayed) { + cwq->nr_active--; + if (!list_empty(&cwq->delayed_works)) { + /* one down, submit a delayed one */ + if (cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } } /* is flush in progress and are we at the flushing tip? */ @@ -1823,7 +1832,7 @@ __acquires(&gcwq->lock) hlist_del_init(&worker->hentry); worker->current_work = NULL; worker->current_cwq = NULL; - cwq_dec_nr_in_flight(cwq, work_color); + cwq_dec_nr_in_flight(cwq, work_color, false); } /** @@ -2388,7 +2397,8 @@ static int try_to_grab_pending(struct work_struct *work) debug_work_deactivate(work); list_del_init(&work->entry); cwq_dec_nr_in_flight(get_work_cwq(work), - get_work_color(work)); + get_work_color(work), + *work_data_bits(work) & WORK_STRUCT_DELAYED); ret = 1; } } -- cgit v1.2.3 From 151772dbfad4dbe81721e40f9b3d588ea77bb7aa Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 25 Aug 2010 11:32:38 +1000 Subject: tracing/trace_stack: Fix stack trace on ppc64 save_stack_trace() stores the instruction pointer, not the function descriptor. On ppc64 the trace stack code currently dereferences the instruction pointer and shows 8 bytes of instructions in our backtraces: # cat /sys/kernel/debug/tracing/stack_trace Depth Size Location (26 entries) ----- ---- -------- 0) 5424 112 0x6000000048000004 1) 5312 160 0x60000000ebad01b0 2) 5152 160 0x2c23000041c20030 3) 4992 240 0x600000007c781b79 4) 4752 160 0xe84100284800000c 5) 4592 192 0x600000002fa30000 6) 4400 256 0x7f1800347b7407e0 7) 4144 208 0xe89f0108f87f0070 8) 3936 272 0xe84100282fa30000 Since we aren't dealing with function descriptors, use %pS instead of %pF to fix it: # cat /sys/kernel/debug/tracing/stack_trace Depth Size Location (26 entries) ----- ---- -------- 0) 5424 112 ftrace_call+0x4/0x8 1) 5312 160 .current_io_context+0x28/0x74 2) 5152 160 .get_io_context+0x48/0xa0 3) 4992 240 .cfq_set_request+0x94/0x4c4 4) 4752 160 .elv_set_request+0x60/0x84 5) 4592 192 .get_request+0x2d4/0x468 6) 4400 256 .get_request_wait+0x7c/0x258 7) 4144 208 .__make_request+0x49c/0x610 8) 3936 272 .generic_make_request+0x390/0x434 Signed-off-by: Anton Blanchard Cc: rostedt@goodmis.org Cc: fweisbec@gmail.com LKML-Reference: <20100825013238.GE28360@kryten> Signed-off-by: Ingo Molnar --- kernel/trace/trace_stack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 056468eae7cf..a6b7e0e0f3eb 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -249,7 +249,7 @@ static int trace_lookup_stack(struct seq_file *m, long i) { unsigned long addr = stack_dump_trace[i]; - return seq_printf(m, "%pF\n", (void *)addr); + return seq_printf(m, "%pS\n", (void *)addr); } static void print_disabled(struct seq_file *m) -- cgit v1.2.3 From 25cc69ec34a563e943e85b3b68a79a8aac7f076d Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Thu, 26 Aug 2010 20:18:43 +0200 Subject: PM QoS: Fix inline documentation. Fix the pm_qos_add_request() kerneldoc comment that doesn't reflect the behavior of the function after the last PM QoS update. Signed-off-by: Saravana Kannan Acked-by: mark gross Signed-off-by: Rafael J. Wysocki --- kernel/pm_qos_params.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 9da439b419aa..b7e4c362361b 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -212,15 +212,17 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active); /** * pm_qos_add_request - inserts new qos request into the list - * @pm_qos_class: identifies which list of qos request to us + * @dep: pointer to a preallocated handle + * @pm_qos_class: identifies which list of qos request to use * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters, and returns the pm_qos_request list - * element as a handle for use in updating and removal. Call needs to save - * this handle for later use. + * for the pm_qos_class of parameters and initializes the pm_qos_request_list + * handle. Caller needs to save this handle for later use in updates and + * removal. */ + void pm_qos_add_request(struct pm_qos_request_list *dep, int pm_qos_class, s32 value) { -- cgit v1.2.3 From fa66f07aa1f0950e1dc78b7ab39728b3f8aa77a1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 26 Aug 2010 16:40:01 +0200 Subject: perf_events: Fix time tracking for events with pid != -1 and cpu != -1 Per-thread events with a cpu filter, i.e., cpu != -1, were not reporting correct timings when the thread never ran on the monitored cpu. The time enabled was reported as a negative value. This patch fixes the problem by updating tstamp_stopped, tstamp_running in event_sched_out() for events with filters and which are marked as INACTIVE. The function group_sched_out() is modified to systematically call into event_sched_out() to avoid duplicating the timing adjustment code twice. With the patch, I now get: $ task_cpu -i -e unhalted_core_cycles,unhalted_core_cycles noploop 2 noploop for 2 seconds CPU0 0 unhalted_core_cycles (ena=1,991,136,594, run=0) CPU0 0 unhalted_core_cycles (ena=1,991,136,594, run=0) CPU1 0 unhalted_core_cycles (ena=1,991,136,594, run=0) CPU1 0 unhalted_core_cycles (ena=1,991,136,594, run=0) CPU2 0 unhalted_core_cycles (ena=1,991,136,594, run=0) CPU2 0 unhalted_core_cycles (ena=1,991,136,594, run=0) CPU3 4,747,990,931 unhalted_core_cycles (ena=1,991,136,594, run=1,991,136,594) CPU3 4,747,990,931 unhalted_core_cycles (ena=1,991,136,594, run=1,991,136,594) Signed-off-by: Stephane Eranian Acked-by: Peter Zijlstra Cc: paulus@samba.org Cc: davem@davemloft.net Cc: fweisbec@gmail.com Cc: perfmon2-devel@lists.sf.net Cc: eranian@google.com LKML-Reference: <4c76802d.aae9d80a.115d.70fe@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 403d1804b198..657555a5f30f 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -402,11 +402,31 @@ static void perf_group_detach(struct perf_event *event) } } +static inline int +event_filter_match(struct perf_event *event) +{ + return event->cpu == -1 || event->cpu == smp_processor_id(); +} + static void event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { + u64 delta; + /* + * An event which could not be activated because of + * filter mismatch still needs to have its timings + * maintained, otherwise bogus information is return + * via read() for time_enabled, time_running: + */ + if (event->state == PERF_EVENT_STATE_INACTIVE + && !event_filter_match(event)) { + delta = ctx->time - event->tstamp_stopped; + event->tstamp_running += delta; + event->tstamp_stopped = ctx->time; + } + if (event->state != PERF_EVENT_STATE_ACTIVE) return; @@ -432,9 +452,7 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; - - if (group_event->state != PERF_EVENT_STATE_ACTIVE) - return; + int state = group_event->state; event_sched_out(group_event, cpuctx, ctx); @@ -444,7 +462,7 @@ group_sched_out(struct perf_event *group_event, list_for_each_entry(event, &group_event->sibling_list, group_entry) event_sched_out(event, cpuctx, ctx); - if (group_event->attr.exclusive) + if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) cpuctx->exclusive = 0; } -- cgit v1.2.3 From 477a3c33d1efa0342a74bd02da2e049191993e2c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 31 Aug 2010 10:54:35 +0200 Subject: workqueue: fix GCWQ_DISASSOCIATED initialization init_workqueues() incorrectly marks workqueues for all possible CPUs associated. Combined with mayday_mask initialization bug, this can make rescuers keep trying to bind to an offline gcwq indefinitely. Fix init_workqueues() such that only online CPUs have their gcwqs have GCWQ_DISASSOCIATED cleared. Signed-off-by: Tejun Heo Reported-by: CAI Qian --- kernel/workqueue.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a2dccfca03ba..c8183b235d16 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3558,8 +3558,7 @@ static int __init init_workqueues(void) spin_lock_init(&gcwq->lock); INIT_LIST_HEAD(&gcwq->worklist); gcwq->cpu = cpu; - if (cpu == WORK_CPU_UNBOUND) - gcwq->flags |= GCWQ_DISASSOCIATED; + gcwq->flags |= GCWQ_DISASSOCIATED; INIT_LIST_HEAD(&gcwq->idle_list); for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) @@ -3583,6 +3582,8 @@ static int __init init_workqueues(void) struct global_cwq *gcwq = get_gcwq(cpu); struct worker *worker; + if (cpu != WORK_CPU_UNBOUND) + gcwq->flags &= ~GCWQ_DISASSOCIATED; worker = create_worker(gcwq, true); BUG_ON(!worker); spin_lock_irq(&gcwq->lock); -- cgit v1.2.3 From 9c37547ab62f88aac3e1e3c2065b611f811de9b5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 31 Aug 2010 11:18:34 +0200 Subject: workqueue: use zalloc_cpumask_var() for gcwq->mayday_mask alloc_mayday_mask() was using alloc_cpumask_var() making gcwq->mayday_mask contain garbage after initialization on CONFIG_CPUMASK_OFFSTACK=y configurations. This combined with the previously fixed GCWQ_DISASSOCIATED initialization bug could make rescuers fall into infinite loop trying to bind to an offline cpu. Signed-off-by: Tejun Heo Reported-by: CAI Qian --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c8183b235d16..785542976b00 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -196,7 +196,7 @@ typedef cpumask_var_t mayday_mask_t; cpumask_test_and_set_cpu((cpu), (mask)) #define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) #define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) -#define alloc_mayday_mask(maskp, gfp) alloc_cpumask_var((maskp), (gfp)) +#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp)) #define free_mayday_mask(mask) free_cpumask_var((mask)) #else typedef unsigned long mayday_mask_t; -- cgit v1.2.3 From 3aaba20f26f58843e8f20611e5c0b1c06954310f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 23 Aug 2010 16:50:12 +0800 Subject: tracing: Fix a race in function profile While we are reading trace_stat/functionX and someone just disabled function_profile at that time, we can trigger this: divide error: 0000 [#1] PREEMPT SMP ... EIP is at function_stat_show+0x90/0x230 ... This fix just takes the ftrace_profile_lock and checks if rec->counter is 0. If it's 0, we know the profile buffer has been reset. Signed-off-by: Li Zefan Cc: stable@kernel.org LKML-Reference: <4C723644.4040708@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0d88ce9b9fb8..7cb1f45a1de1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -381,12 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v) { struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; + int ret = 0; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - static DEFINE_MUTEX(mutex); static struct trace_seq s; unsigned long long avg; unsigned long long stddev; #endif + mutex_lock(&ftrace_profile_lock); + + /* we raced with function_profile_reset() */ + if (unlikely(rec->counter == 0)) { + ret = -EBUSY; + goto out; + } kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); seq_printf(m, " %-30.30s %10lu", str, rec->counter); @@ -408,7 +415,6 @@ static int function_stat_show(struct seq_file *m, void *v) do_div(stddev, (rec->counter - 1) * 1000); } - mutex_lock(&mutex); trace_seq_init(&s); trace_print_graph_duration(rec->time, &s); trace_seq_puts(&s, " "); @@ -416,11 +422,12 @@ static int function_stat_show(struct seq_file *m, void *v) trace_seq_puts(&s, " "); trace_print_graph_duration(stddev, &s); trace_print_seq(m, &s); - mutex_unlock(&mutex); #endif seq_putc(m, '\n'); +out: + mutex_unlock(&ftrace_profile_lock); - return 0; + return ret; } static void ftrace_profile_reset(struct ftrace_profile_stat *stat) -- cgit v1.2.3 From 950eaaca681c44aab87a46225c9e44f902c080aa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Aug 2010 17:00:18 -0700 Subject: pid: make setpgid() system call use RCU read-side critical section [ 23.584719] [ 23.584720] =================================================== [ 23.585059] [ INFO: suspicious rcu_dereference_check() usage. ] [ 23.585176] --------------------------------------------------- [ 23.585176] kernel/pid.c:419 invoked rcu_dereference_check() without protection! [ 23.585176] [ 23.585176] other info that might help us debug this: [ 23.585176] [ 23.585176] [ 23.585176] rcu_scheduler_active = 1, debug_locks = 1 [ 23.585176] 1 lock held by rc.sysinit/728: [ 23.585176] #0: (tasklist_lock){.+.+..}, at: [] sys_setpgid+0x5f/0x193 [ 23.585176] [ 23.585176] stack backtrace: [ 23.585176] Pid: 728, comm: rc.sysinit Not tainted 2.6.36-rc2 #2 [ 23.585176] Call Trace: [ 23.585176] [] lockdep_rcu_dereference+0x99/0xa2 [ 23.585176] [] find_task_by_pid_ns+0x50/0x6a [ 23.585176] [] find_task_by_vpid+0x1d/0x1f [ 23.585176] [] sys_setpgid+0x67/0x193 [ 23.585176] [] system_call_fastpath+0x16/0x1b [ 24.959669] type=1400 audit(1282938522.956:4): avc: denied { module_request } for pid=766 comm="hwclock" kmod="char-major-10-135" scontext=system_u:system_r:hwclock_t:s0 tcontext=system_u:system_r:kernel_t:s0 tclas It turns out that the setpgid() system call fails to enter an RCU read-side critical section before doing a PID-to-task_struct translation. This commit therefore does rcu_read_lock() before the translation, and also does rcu_read_unlock() after the last use of the returned pointer. Reported-by: Andrew Morton Signed-off-by: Paul E. McKenney Acked-by: David Howells --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index e9ad44489828..7f5a0cd296a9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -931,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) pgid = pid; if (pgid < 0) return -EINVAL; + rcu_read_lock(); /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM @@ -984,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) out: /* All paths lead to here, thus we are safe. -DaveM */ write_unlock_irq(&tasklist_lock); + rcu_read_unlock(); return err; } -- cgit v1.2.3 From 68d3f1d810500e8b975bdf0b20dd83d060076b4b Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Tue, 31 Aug 2010 23:00:07 -0400 Subject: lockup_detector: Sync touch_*_watchdog back to old semantics During my rewrite, the semantics of touch_nmi_watchdog and touch_softlockup_watchdog changed enough to break some drivers (mostly over preemptable regions). These are cases where long delays on one CPU (due to print_delay for example) can cause long delays on other CPUs - so we must 'touch' the nmi_watchdog flag of those other CPUs as well. This change brings those touch_*_watchdog() functions back in line with to how they used to work. Signed-off-by: Don Zickus Acked-by: Cyrill Gorcunov Cc: peterz@infradead.org Cc: fweisbec@gmail.com LKML-Reference: <1283310009-22168-2-git-send-email-dzickus@redhat.com> Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 0d53c8e853b1..7f9c3c52ecc1 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -122,7 +122,7 @@ static void __touch_watchdog(void) void touch_softlockup_watchdog(void) { - __get_cpu_var(watchdog_touch_ts) = 0; + __raw_get_cpu_var(watchdog_touch_ts) = 0; } EXPORT_SYMBOL(touch_softlockup_watchdog); @@ -142,7 +142,14 @@ void touch_all_softlockup_watchdogs(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR void touch_nmi_watchdog(void) { - __get_cpu_var(watchdog_nmi_touch) = true; + if (watchdog_enabled) { + unsigned cpu; + + for_each_present_cpu(cpu) { + if (per_cpu(watchdog_nmi_touch, cpu) != true) + per_cpu(watchdog_nmi_touch, cpu) = true; + } + } touch_softlockup_watchdog(); } EXPORT_SYMBOL(touch_nmi_watchdog); @@ -433,6 +440,9 @@ static int watchdog_enable(int cpu) wake_up_process(p); } + /* if any cpu succeeds, watchdog is considered enabled for the system */ + watchdog_enabled = 1; + return 0; } @@ -455,9 +465,6 @@ static void watchdog_disable(int cpu) per_cpu(softlockup_watchdog, cpu) = NULL; kthread_stop(p); } - - /* if any cpu succeeds, watchdog is considered enabled for the system */ - watchdog_enabled = 1; } static void watchdog_enable_all_cpus(void) -- cgit v1.2.3 From ef5dc121d5a0bb1fa477c5395277259f07d318a3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 2 Sep 2010 15:48:16 -0700 Subject: mutex: Fix annotations to include it in kernel-locking docbook Fix kernel-doc notation in linux/mutex.h and kernel/mutex.c, then add these 2 files to the kernel-locking docbook as the Mutex API reference chapter. Add one API function to mutex-design.txt and correct a typo in that file. Signed-off-by: Randy Dunlap Cc: Rusty Russell LKML-Reference: <20100902154816.6cc2f9ad.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- Documentation/DocBook/kernel-locking.tmpl | 6 ++++++ Documentation/mutex-design.txt | 3 ++- include/linux/mutex.h | 8 ++++++++ kernel/mutex.c | 23 +++++++---------------- 4 files changed, 23 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl index 0b1a3f97f285..a0d479d1e1dd 100644 --- a/Documentation/DocBook/kernel-locking.tmpl +++ b/Documentation/DocBook/kernel-locking.tmpl @@ -1961,6 +1961,12 @@ machines due to caching. + + Mutex API reference +!Iinclude/linux/mutex.h +!Ekernel/mutex.c + + Further reading diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt index c91ccc0720fa..38c10fd7f411 100644 --- a/Documentation/mutex-design.txt +++ b/Documentation/mutex-design.txt @@ -9,7 +9,7 @@ firstly, there's nothing wrong with semaphores. But if the simpler mutex semantics are sufficient for your code, then there are a couple of advantages of mutexes: - - 'struct mutex' is smaller on most architectures: .e.g on x86, + - 'struct mutex' is smaller on most architectures: E.g. on x86, 'struct semaphore' is 20 bytes, 'struct mutex' is 16 bytes. A smaller structure size means less RAM footprint, and better CPU-cache utilization. @@ -136,3 +136,4 @@ the APIs of 'struct mutex' have been streamlined: void mutex_lock_nested(struct mutex *lock, unsigned int subclass); int mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass); + int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 878cab4f5fcc..f363bc8fdc74 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -78,6 +78,14 @@ struct mutex_waiter { # include #else # define __DEBUG_MUTEX_INITIALIZER(lockname) +/** + * mutex_init - initialize the mutex + * @mutex: the mutex to be initialized + * + * Initialize the mutex to unlocked state. + * + * It is not allowed to initialize an already locked mutex. + */ # define mutex_init(mutex) \ do { \ static struct lock_class_key __key; \ diff --git a/kernel/mutex.c b/kernel/mutex.c index 4c0b7b3e6d2e..200407c1502f 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -36,15 +36,6 @@ # include #endif -/*** - * mutex_init - initialize the mutex - * @lock: the mutex to be initialized - * @key: the lock_class_key for the class; used by mutex lock debugging - * - * Initialize the mutex to unlocked state. - * - * It is not allowed to initialize an already locked mutex. - */ void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -68,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init); static __used noinline void __sched __mutex_lock_slowpath(atomic_t *lock_count); -/*** +/** * mutex_lock - acquire the mutex * @lock: the mutex to be acquired * @@ -105,7 +96,7 @@ EXPORT_SYMBOL(mutex_lock); static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); -/*** +/** * mutex_unlock - release the mutex * @lock: the mutex to be released * @@ -364,8 +355,8 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count); static noinline int __sched __mutex_lock_interruptible_slowpath(atomic_t *lock_count); -/*** - * mutex_lock_interruptible - acquire the mutex, interruptable +/** + * mutex_lock_interruptible - acquire the mutex, interruptible * @lock: the mutex to be acquired * * Lock the mutex like mutex_lock(), and return 0 if the mutex has @@ -456,15 +447,15 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) return prev == 1; } -/*** - * mutex_trylock - try acquire the mutex, without waiting +/** + * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired * * Try to acquire the mutex atomically. Returns 1 if the mutex * has been acquired successfully, and 0 on contention. * * NOTE: this function follows the spin_trylock() convention, so - * it is negated to the down_trylock() return values! Be careful + * it is negated from the down_trylock() return values! Be careful * about this when converting semaphore users to mutexes. * * This function must not be used in interrupt context. The -- cgit v1.2.3 From b3bd3de66f60df4c9a2076e2886a622458929056 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 10 Aug 2010 14:17:51 -0700 Subject: gcc-4.6: kernel/*: Fix unused but set warnings No real bugs I believe, just some dead code. Signed-off-by: Andi Kleen Cc: Peter Zijlstra Cc: andi@firstfloor.org Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/debug/kdb/kdb_bp.c | 2 -- kernel/hrtimer.c | 3 +-- kernel/sched_fair.c | 3 +-- kernel/sysctl.c | 5 +---- kernel/trace/ring_buffer.c | 2 -- 5 files changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 75bd9b3ebbb7..20059ef4459a 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -274,7 +274,6 @@ static int kdb_bp(int argc, const char **argv) int i, bpno; kdb_bp_t *bp, *bp_check; int diag; - int free; char *symname = NULL; long offset = 0ul; int nextarg; @@ -305,7 +304,6 @@ static int kdb_bp(int argc, const char **argv) /* * Find an empty bp structure to allocate */ - free = KDB_MAXBPT; for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { if (bp->bp_free) break; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ce669174f355..1decafbb6b1a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1091,11 +1091,10 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { - struct hrtimer_clock_base *base; unsigned long flags; ktime_t rem; - base = lock_hrtimer_base(timer, &flags); + lock_hrtimer_base(timer, &flags); rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ab661ebc4895..134f7edb30c6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1313,7 +1313,7 @@ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int load_idx) { - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - this = group; } else if (avg_load < min_load) { min_load = avg_load; idlest = group; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ca38e8e3e907..f88552c6d227 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1713,10 +1713,7 @@ static __init int sysctl_init(void) { sysctl_set_parent(NULL, root_table); #ifdef CONFIG_SYSCTL_SYSCALL_CHECK - { - int err; - err = sysctl_check_table(current->nsproxy, root_table); - } + sysctl_check_table(current->nsproxy, root_table); #endif return 0; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 19cccc3c3028..492197e2f86c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2985,13 +2985,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) static void rb_advance_iter(struct ring_buffer_iter *iter) { - struct ring_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; unsigned length; cpu_buffer = iter->cpu_buffer; - buffer = cpu_buffer->buffer; /* * Check if we are at the end of the buffer. -- cgit v1.2.3 From 9c55cb12c1c172e2d51e85fbb5a4796ca86b77e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 8 Sep 2010 11:20:37 -0400 Subject: tracing: Do not allow llseek to set_ftrace_filter Reading the file set_ftrace_filter does three things. 1) shows whether or not filters are set for the function tracer 2) shows what functions are set for the function tracer 3) shows what triggers are set on any functions 3 is independent from 1 and 2. The way this file currently works is that it is a state machine, and as you read it, it may change state. But this assumption breaks when you use lseek() on the file. The state machine gets out of sync and the t_show() may use the wrong pointer and cause a kernel oops. Luckily, this will only kill the app that does the lseek, but the app dies while holding a mutex. This prevents anyone else from using the set_ftrace_filter file (or any other function tracing file for that matter). A real fix for this is to rewrite the code, but that is too much for a -rc release or stable. This patch simply disables llseek on the set_ftrace_filter() file for now, and we can do the proper fix for the next major release. Reported-by: Robert Swiecki Cc: Chris Wright Cc: Tavis Ormandy Cc: Eugene Teo Cc: vendor-sec@lst.de Cc: Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7cb1f45a1de1..83a16e9ee518 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2416,7 +2416,7 @@ static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_regex_lseek, + .llseek = no_llseek, .release = ftrace_filter_release, }; -- cgit v1.2.3 From 61a527362234ac3352a91ac67c50c6f7cd248eb1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 27 Aug 2010 20:38:46 +0900 Subject: tracing/kprobe: Fix a memory leak in error case Fix a memory leak which happens when a field name conflicts with others. In error case, free_trace_probe() will free all arguments until nr_args, so this increments nr_args the begining of the loop instead of the end. Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mathieu Desnoyers LKML-Reference: <20100827113846.22882.12670.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8b27c9849b42..0116c038b0bc 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -992,6 +992,9 @@ static int create_trace_probe(int argc, char **argv) /* parse arguments */ ret = 0; for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { + /* Increment count for freeing args in error case */ + tp->nr_args++; + /* Parse argument name */ arg = strchr(argv[i], '='); if (arg) @@ -1021,11 +1024,8 @@ static int create_trace_probe(int argc, char **argv) ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); if (ret) { pr_info("Parse error at argument%d. (%d)\n", i, ret); - kfree(tp->args[i].name); goto error; } - - tp->nr_args++; } ret = register_trace_probe(tp); -- cgit v1.2.3 From aba91595cfcebd193425e20aabc407531526a1c5 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 27 Aug 2010 20:39:06 +0900 Subject: tracing/kprobes: Fix handling of argument names Set "argN" name for each argument automatically if it has no specified name. Since dynamic trace event(kprobe_events) accepts special characters for its argument, its format can show those special characters (e.g. '$', '%', '+'). However, perf can't parse those format because of the character (especially '%') mess up the format. This sets "argX" name for those arguments if user omitted the argument names. E.g. # echo 'p do_fork %ax IP=%ip $stack' > tracing/kprobe_events # cat tracing/kprobe_events p:kprobes/p_do_fork_0 do_fork arg1=%ax IP=%ip arg3=$stack Reported-by: Srikar Dronamraju Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mathieu Desnoyers LKML-Reference: <20100827113906.22882.59312.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0116c038b0bc..a39251ef1a7b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -997,15 +997,18 @@ static int create_trace_probe(int argc, char **argv) /* Parse argument name */ arg = strchr(argv[i], '='); - if (arg) + if (arg) { *arg++ = '\0'; - else + tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); + } else { arg = argv[i]; + /* If argument name is omitted, set "argN" */ + snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1); + tp->args[i].name = kstrdup(buf, GFP_KERNEL); + } - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); if (!tp->args[i].name) { - pr_info("Failed to allocate argument%d name '%s'.\n", - i, argv[i]); + pr_info("Failed to allocate argument[%d] name.\n", i); ret = -ENOMEM; goto error; } @@ -1014,7 +1017,7 @@ static int create_trace_probe(int argc, char **argv) *tmp = '_'; /* convert : to _ */ if (conflict_field_name(tp->args[i].name, tp->args, i)) { - pr_info("Argument%d name '%s' conflicts with " + pr_info("Argument[%d] name '%s' conflicts with " "another field.\n", i, argv[i]); ret = -EINVAL; goto error; @@ -1023,7 +1026,7 @@ static int create_trace_probe(int argc, char **argv) /* Parse fetch argument */ ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); if (ret) { - pr_info("Parse error at argument%d. (%d)\n", i, ret); + pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; } } -- cgit v1.2.3 From da34634fd39958725310d2c30c9b4543945f968b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 27 Aug 2010 20:39:12 +0900 Subject: tracing/kprobe: Fix handling of C-unlike argument names Check the argument name whether it is invalid (not C-like symbol name). This makes event format simple. Reported-by: Srikar Dronamraju Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mathieu Desnoyers LKML-Reference: <20100827113912.22882.62313.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a39251ef1a7b..544301d29dee 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -514,8 +514,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs); -/* Check the name is good for event/group */ -static int check_event_name(const char *name) +/* Check the name is good for event/group/fields */ +static int is_good_name(const char *name) { if (!isalpha(*name) && *name != '_') return 0; @@ -557,7 +557,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, else tp->rp.kp.pre_handler = kprobe_dispatcher; - if (!event || !check_event_name(event)) { + if (!event || !is_good_name(event)) { ret = -EINVAL; goto error; } @@ -567,7 +567,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, if (!tp->call.name) goto error; - if (!group || !check_event_name(group)) { + if (!group || !is_good_name(group)) { ret = -EINVAL; goto error; } @@ -883,7 +883,7 @@ static int create_trace_probe(int argc, char **argv) int i, ret = 0; int is_return = 0, is_delete = 0; char *symbol = NULL, *event = NULL, *group = NULL; - char *arg, *tmp; + char *arg; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -1012,9 +1012,13 @@ static int create_trace_probe(int argc, char **argv) ret = -ENOMEM; goto error; } - tmp = strchr(tp->args[i].name, ':'); - if (tmp) - *tmp = '_'; /* convert : to _ */ + + if (!is_good_name(tp->args[i].name)) { + pr_info("Invalid argument[%d] name: %s\n", + i, tp->args[i].name); + ret = -EINVAL; + goto error; + } if (conflict_field_name(tp->args[i].name, tp->args, i)) { pr_info("Argument[%d] name '%s' conflicts with " -- cgit v1.2.3 From 9cb627d5f38830ca19aa0dca52d1d3a633018bf7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 1 Sep 2010 12:58:43 +0200 Subject: perf, trace: Fix module leak Commit 1c024eca (perf, trace: Optimize tracepoints by using per-tracepoint-per-cpu hlist to track events) caused a module refcount leak. Reported-And-Tested-by: Avi Kivity Signed-off-by: Peter Zijlstra LKML-Reference: <4C7E1F12.8030304@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 000e6e85b445..31cc4cb0dbf2 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -91,6 +91,8 @@ int perf_trace_init(struct perf_event *p_event) tp_event->class && tp_event->class->reg && try_module_get(tp_event->mod)) { ret = perf_trace_event_init(tp_event, p_event); + if (ret) + module_put(tp_event->mod); break; } } @@ -146,6 +148,7 @@ void perf_trace_destroy(struct perf_event *p_event) } } out: + module_put(tp_event->mod); mutex_unlock(&event_mutex); } -- cgit v1.2.3 From 5e11637e2c929e34dcc0fbbfb48bdb638937701a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Jun 2010 13:35:08 +0200 Subject: perf: Fix CPU hotplug Since we have UP_PREPARE, we should also have UP_CANCELED. Signed-off-by: Peter Zijlstra Cc: paulus LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 657555a5f30f..db5b56064687 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5761,15 +5761,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: + case CPU_DOWN_FAILED: perf_event_init_cpu(cpu); break; + case CPU_UP_CANCELED: case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: perf_event_exit_cpu(cpu); break; -- cgit v1.2.3 From da2b71edd8a7db44fe1746261410a981f3e03632 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 23 Aug 2010 13:42:51 -0700 Subject: sched: Move sched_avg_update() to update_cpu_load() Currently sched_avg_update() (which updates rt_avg stats in the rq) is getting called from scale_rt_power() (in the load balance context) which doesn't take rq->lock. Fix it by moving the sched_avg_update() to more appropriate update_cpu_load() where the CFS load gets updated as well. Signed-off-by: Suresh Siddha Signed-off-by: Peter Zijlstra LKML-Reference: <1282596171.2694.3.camel@sbsiddha-MOBL3> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ kernel/sched_fair.c | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 09b574e7f4df..ed09d4f2a69c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1294,6 +1294,10 @@ static void resched_task(struct task_struct *p) static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } + +static void sched_avg_update(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ #if BITS_PER_LONG == 32 @@ -3182,6 +3186,8 @@ static void update_cpu_load(struct rq *this_rq) this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; } + + sched_avg_update(this_rq); } static void update_cpu_load_active(struct rq *this_rq) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ab661ebc4895..f53ec7550056 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2268,8 +2268,6 @@ unsigned long scale_rt_power(int cpu) struct rq *rq = cpu_rq(cpu); u64 total, available; - sched_avg_update(rq); - total = sched_avg_period() + (rq->clock - rq->age_stamp); available = total - rq->rt_avg; -- cgit v1.2.3 From 85a0fdfd0f967507f3903e8419bc7e408f5a59de Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Thu, 9 Sep 2010 16:37:35 -0700 Subject: gcov: fix null-pointer dereference for certain module types The gcov-kernel infrastructure expects that each object file is loaded only once. This may not be true, e.g. when loading multiple kernel modules which are linked to the same object file. As a result, loading such kernel modules will result in incorrect gcov results while unloading will cause a null-pointer dereference. This patch fixes these problems by changing the gcov-kernel infrastructure so that multiple profiling data sets can be associated with one debugfs entry. It applies to 2.6.36-rc1. Signed-off-by: Peter Oberparleiter Reported-by: Werner Spies Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/fs.c | 244 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 180 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index ef3c3f88a7a3..f83972b16564 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -33,10 +33,11 @@ * @children: child nodes * @all: list head for list of all nodes * @parent: parent node - * @info: associated profiling data structure if not a directory - * @ghost: when an object file containing profiling data is unloaded we keep a - * copy of the profiling data here to allow collecting coverage data - * for cleanup code. Such a node is called a "ghost". + * @loaded_info: array of pointers to profiling data sets for loaded object + * files. + * @num_loaded: number of profiling data sets for loaded object files. + * @unloaded_info: accumulated copy of profiling data sets for unloaded + * object files. Used only when gcov_persist=1. * @dentry: main debugfs entry, either a directory or data file * @links: associated symbolic links * @name: data file basename @@ -51,10 +52,11 @@ struct gcov_node { struct list_head children; struct list_head all; struct gcov_node *parent; - struct gcov_info *info; - struct gcov_info *ghost; + struct gcov_info **loaded_info; + struct gcov_info *unloaded_info; struct dentry *dentry; struct dentry **links; + int num_loaded; char name[0]; }; @@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = { }; /* - * Return the profiling data set for a given node. This can either be the - * original profiling data structure or a duplicate (also called "ghost") - * in case the associated object file has been unloaded. + * Return a profiling data set associated with the given node. This is + * either a data set for a loaded object file or a data set copy in case + * all associated object files have been unloaded. */ static struct gcov_info *get_node_info(struct gcov_node *node) { - if (node->info) - return node->info; + if (node->num_loaded > 0) + return node->loaded_info[0]; - return node->ghost; + return node->unloaded_info; +} + +/* + * Return a newly allocated profiling data set which contains the sum of + * all profiling data associated with the given node. + */ +static struct gcov_info *get_accumulated_info(struct gcov_node *node) +{ + struct gcov_info *info; + int i = 0; + + if (node->unloaded_info) + info = gcov_info_dup(node->unloaded_info); + else + info = gcov_info_dup(node->loaded_info[i++]); + if (!info) + return NULL; + for (; i < node->num_loaded; i++) + gcov_info_add(info, node->loaded_info[i]); + + return info; } /* @@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file) mutex_lock(&node_lock); /* * Read from a profiling data copy to minimize reference tracking - * complexity and concurrent access. + * complexity and concurrent access and to keep accumulating multiple + * profiling data sets associated with one node simple. */ - info = gcov_info_dup(get_node_info(node)); + info = get_accumulated_info(node); if (!info) goto out_unlock; iter = gcov_iter_new(info); @@ -225,12 +249,25 @@ static struct gcov_node *get_node_by_name(const char *name) return NULL; } +/* + * Reset all profiling data associated with the specified node. + */ +static void reset_node(struct gcov_node *node) +{ + int i; + + if (node->unloaded_info) + gcov_info_reset(node->unloaded_info); + for (i = 0; i < node->num_loaded; i++) + gcov_info_reset(node->loaded_info[i]); +} + static void remove_node(struct gcov_node *node); /* * write() implementation for gcov data files. Reset profiling data for the - * associated file. If the object file has been unloaded (i.e. this is - * a "ghost" node), remove the debug fs node as well. + * corresponding file. If all associated object files have been unloaded, + * remove the debug fs node as well. */ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, size_t len, loff_t *pos) @@ -245,10 +282,10 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, node = get_node_by_name(info->filename); if (node) { /* Reset counts or remove node for unloaded modules. */ - if (node->ghost) + if (node->num_loaded == 0) remove_node(node); else - gcov_info_reset(node->info); + reset_node(node); } /* Reset counts for open file. */ gcov_info_reset(info); @@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info, INIT_LIST_HEAD(&node->list); INIT_LIST_HEAD(&node->children); INIT_LIST_HEAD(&node->all); - node->info = info; + if (node->loaded_info) { + node->loaded_info[0] = info; + node->num_loaded = 1; + } node->parent = parent; if (name) strcpy(node->name, name); @@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent, struct gcov_node *node; node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); - if (!node) { - pr_warning("out of memory\n"); - return NULL; + if (!node) + goto err_nomem; + if (info) { + node->loaded_info = kcalloc(1, sizeof(struct gcov_info *), + GFP_KERNEL); + if (!node->loaded_info) + goto err_nomem; } init_node(node, info, name, parent); /* Differentiate between gcov data file nodes and directory nodes. */ @@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent, list_add(&node->all, &all_head); return node; + +err_nomem: + kfree(node); + pr_warning("out of memory\n"); + return NULL; } /* Remove symbolic links associated with node. */ @@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node) list_del(&node->all); debugfs_remove(node->dentry); remove_links(node); - if (node->ghost) - gcov_info_free(node->ghost); + kfree(node->loaded_info); + if (node->unloaded_info) + gcov_info_free(node->unloaded_info); kfree(node); } @@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent, /* * write() implementation for reset file. Reset all profiling data to zero - * and remove ghost nodes. + * and remove nodes for which all associated object files are unloaded. */ static ssize_t reset_write(struct file *file, const char __user *addr, size_t len, loff_t *pos) @@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr, mutex_lock(&node_lock); restart: list_for_each_entry(node, &all_head, all) { - if (node->info) - gcov_info_reset(node->info); + if (node->num_loaded > 0) + reset_node(node); else if (list_empty(&node->children)) { remove_node(node); /* Several nodes may have gone - restart loop. */ @@ -564,37 +614,115 @@ err_remove: } /* - * The profiling data set associated with this node is being unloaded. Store a - * copy of the profiling data and turn this node into a "ghost". + * Associate a profiling data set with an existing node. Needs to be called + * with node_lock held. */ -static int ghost_node(struct gcov_node *node) +static void add_info(struct gcov_node *node, struct gcov_info *info) { - node->ghost = gcov_info_dup(node->info); - if (!node->ghost) { - pr_warning("could not save data for '%s' (out of memory)\n", - node->info->filename); - return -ENOMEM; + struct gcov_info **loaded_info; + int num = node->num_loaded; + + /* + * Prepare new array. This is done first to simplify cleanup in + * case the new data set is incompatible, the node only contains + * unloaded data sets and there's not enough memory for the array. + */ + loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); + if (!loaded_info) { + pr_warning("could not add '%s' (out of memory)\n", + info->filename); + return; + } + memcpy(loaded_info, node->loaded_info, + num * sizeof(struct gcov_info *)); + loaded_info[num] = info; + /* Check if the new data set is compatible. */ + if (num == 0) { + /* + * A module was unloaded, modified and reloaded. The new + * data set replaces the copy of the last one. + */ + if (!gcov_info_is_compatible(node->unloaded_info, info)) { + pr_warning("discarding saved data for %s " + "(incompatible version)\n", info->filename); + gcov_info_free(node->unloaded_info); + node->unloaded_info = NULL; + } + } else { + /* + * Two different versions of the same object file are loaded. + * The initial one takes precedence. + */ + if (!gcov_info_is_compatible(node->loaded_info[0], info)) { + pr_warning("could not add '%s' (incompatible " + "version)\n", info->filename); + kfree(loaded_info); + return; + } } - node->info = NULL; + /* Overwrite previous array. */ + kfree(node->loaded_info); + node->loaded_info = loaded_info; + node->num_loaded = num + 1; +} - return 0; +/* + * Return the index of a profiling data set associated with a node. + */ +static int get_info_index(struct gcov_node *node, struct gcov_info *info) +{ + int i; + + for (i = 0; i < node->num_loaded; i++) { + if (node->loaded_info[i] == info) + return i; + } + return -ENOENT; } /* - * Profiling data for this node has been loaded again. Add profiling data - * from previous instantiation and turn this node into a regular node. + * Save the data of a profiling data set which is being unloaded. */ -static void revive_node(struct gcov_node *node, struct gcov_info *info) +static void save_info(struct gcov_node *node, struct gcov_info *info) { - if (gcov_info_is_compatible(node->ghost, info)) - gcov_info_add(info, node->ghost); + if (node->unloaded_info) + gcov_info_add(node->unloaded_info, info); else { - pr_warning("discarding saved data for '%s' (version changed)\n", + node->unloaded_info = gcov_info_dup(info); + if (!node->unloaded_info) { + pr_warning("could not save data for '%s' " + "(out of memory)\n", info->filename); + } + } +} + +/* + * Disassociate a profiling data set from a node. Needs to be called with + * node_lock held. + */ +static void remove_info(struct gcov_node *node, struct gcov_info *info) +{ + int i; + + i = get_info_index(node, info); + if (i < 0) { + pr_warning("could not remove '%s' (not found)\n", info->filename); + return; } - gcov_info_free(node->ghost); - node->ghost = NULL; - node->info = info; + if (gcov_persist) + save_info(node, info); + /* Shrink array. */ + node->loaded_info[i] = node->loaded_info[node->num_loaded - 1]; + node->num_loaded--; + if (node->num_loaded > 0) + return; + /* Last loaded data set was removed. */ + kfree(node->loaded_info); + node->loaded_info = NULL; + node->num_loaded = 0; + if (!node->unloaded_info) + remove_node(node); } /* @@ -609,30 +737,18 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) node = get_node_by_name(info->filename); switch (action) { case GCOV_ADD: - /* Add new node or revive ghost. */ - if (!node) { + if (node) + add_info(node, info); + else add_node(info); - break; - } - if (gcov_persist) - revive_node(node, info); - else { - pr_warning("could not add '%s' (already exists)\n", - info->filename); - } break; case GCOV_REMOVE: - /* Remove node or turn into ghost. */ - if (!node) { + if (node) + remove_info(node, info); + else { pr_warning("could not remove '%s' (not found)\n", info->filename); - break; } - if (gcov_persist) { - if (!ghost_node(node)) - break; - } - remove_node(node); break; } mutex_unlock(&node_lock); -- cgit v1.2.3 From 31583bb0cf6cc40f2a468a4d2f3b9cbefd24f891 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 9 Sep 2010 16:37:37 -0700 Subject: cgroups: fix API thinko Add cgroup_attach_task_all() The existing cgroup_attach_task_current_cg() API is called by a thread to attach another thread to all of its cgroups; this is unsuitable for cases where a privileged task wants to attach itself to the cgroups of a less privileged one, since the call must be made from the context of the target task. This patch adds a more generic cgroup_attach_task_all() API that allows both the source task and to-be-moved task to be specified. cgroup_attach_task_current_cg() becomes a specialization of the more generic new function. [menage@google.com: rewrote changelog] [akpm@linux-foundation.org: address reviewer comments] Signed-off-by: Michael S. Tsirkin Tested-by: Alex Williamson Acked-by: Paul Menage Cc: Li Zefan Cc: Ben Blum Cc: Sridhar Samudrala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 12 +++++++++++- kernel/cgroup.c | 13 +++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ed3e92e41c6e..0c991023ee47 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -578,7 +578,12 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task(struct cgroup *, struct task_struct *); -int cgroup_attach_task_current_cg(struct task_struct *); +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); + +static inline int cgroup_attach_task_current_cg(struct task_struct *tsk) +{ + return cgroup_attach_task_all(current, tsk); +} /* * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works @@ -636,6 +641,11 @@ static inline int cgroupstats_build(struct cgroupstats *stats, } /* No cgroups - nothing to do */ +static inline int cgroup_attach_task_all(struct task_struct *from, + struct task_struct *t) +{ + return 0; +} static inline int cgroup_attach_task_current_cg(struct task_struct *t) { return 0; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 192f88c5b0f9..c9483d8f6140 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1791,19 +1791,20 @@ out: } /** - * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task * @tsk: the task to be attached */ -int cgroup_attach_task_current_cg(struct task_struct *tsk) +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { struct cgroupfs_root *root; - struct cgroup *cur_cg; int retval = 0; cgroup_lock(); for_each_active_root(root) { - cur_cg = task_cgroup_from_root(current, root); - retval = cgroup_attach_task(cur_cg, tsk); + struct cgroup *from_cg = task_cgroup_from_root(from, root); + + retval = cgroup_attach_task(from_cg, tsk); if (retval) break; } @@ -1811,7 +1812,7 @@ int cgroup_attach_task_current_cg(struct task_struct *tsk) return retval; } -EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg); +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /* * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex -- cgit v1.2.3 From 1c24de60e50fb19b94d94225458da17c720f0729 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 9 Sep 2010 16:37:59 -0700 Subject: kernel/groups.c: fix integer overflow in groups_search gid_t is a unsigned int. If group_info contains a gid greater than MAX_INT, groups_search() function may look on the wrong side of the search tree. This solves some unfair "permission denied" problems. Signed-off-by: Jerome Marchand Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/groups.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/groups.c b/kernel/groups.c index 53b1916c9492..253dc0f35cf4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -143,10 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp) right = group_info->ngroups; while (left < right) { unsigned int mid = (left+right)/2; - int cmp = grp - GROUP_AT(group_info, mid); - if (cmp > 0) + if (grp > GROUP_AT(group_info, mid)) left = mid + 1; - else if (cmp < 0) + else if (grp < GROUP_AT(group_info, mid)) right = mid; else return 1; -- cgit v1.2.3 From 910321ea817a202ff70fac666e37e2c8e2f88823 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 9 Sep 2010 16:38:07 -0700 Subject: swap: revert special hibernation allocation Please revert 2.6.36-rc commit d2997b1042ec150616c1963b5e5e919ffd0b0ebf "hibernation: freeze swap at hibernation". It complicated matters by adding a second swap allocation path, just for hibernation; without in any way fixing the issue that it was intended to address - page reclaim after fixing the hibernation image might free swap from a page already imaged as swapcache, letting its swap be reallocated to store a different page of the image: resulting in data corruption if the imaged page were freed as clean then swapped back in. Pages freed to si->swap_map were still in danger of being reallocated by the alternative allocation path. I guess it inadvertently fixed slow SSD swap allocation for hibernation, as reported by Nigel Cunningham: by missing out the discards that occur on the usual swap allocation path; but that was unintentional, and needs a separate fix. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: "Rafael J. Wysocki" Cc: Ondrej Zary Cc: Andrea Gelmini Cc: Balbir Singh Cc: Andrea Arcangeli Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 8 +---- kernel/power/hibernate.c | 1 - kernel/power/snapshot.c | 1 - kernel/power/swap.c | 6 ++-- mm/swapfile.c | 94 ++++++++++++------------------------------------ 5 files changed, 26 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fee51a11b73..bf4eb62506db 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -315,6 +315,7 @@ extern long nr_swap_pages; extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); +extern swp_entry_t get_swap_page_of_type(int); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); @@ -331,13 +332,6 @@ extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; -#ifdef CONFIG_HIBERNATION -void hibernation_freeze_swap(void); -void hibernation_thaw_swap(void); -swp_entry_t get_swap_for_hibernation(int type); -void swap_free_for_hibernation(swp_entry_t val); -#endif - /* linux/mm/thrash.c */ extern struct mm_struct *swap_token_mm; extern void grab_swap_token(struct mm_struct *); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c77963938bca..8dc31e02ae12 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -338,7 +338,6 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - hibernation_freeze_swap(); saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_FREEZE); if (error) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5e7edfb05e66..f6cd6faf84fd 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1086,7 +1086,6 @@ void swsusp_free(void) buffer = NULL; alloc_normal = 0; alloc_highmem = 0; - hibernation_thaw_swap(); } /* Helper functions used for the shrinking of memory. */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 5d0059eed3e4..e6a5bdf61a37 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -136,10 +136,10 @@ sector_t alloc_swapdev_block(int swap) { unsigned long offset; - offset = swp_offset(get_swap_for_hibernation(swap)); + offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { if (swsusp_extents_insert(offset)) - swap_free_for_hibernation(swp_entry(swap, offset)); + swap_free(swp_entry(swap, offset)); else return swapdev_block(swap, offset); } @@ -163,7 +163,7 @@ void free_all_swap_pages(int swap) ext = container_of(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); for (offset = ext->start; offset <= ext->end; offset++) - swap_free_for_hibernation(swp_entry(swap, offset)); + swap_free(swp_entry(swap, offset)); kfree(ext); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 1f3f9c59a73a..f08d165871b3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -47,8 +47,6 @@ long nr_swap_pages; long total_swap_pages; static int least_priority; -static bool swap_for_hibernation; - static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; @@ -453,8 +451,6 @@ swp_entry_t get_swap_page(void) spin_lock(&swap_lock); if (nr_swap_pages <= 0) goto noswap; - if (swap_for_hibernation) - goto noswap; nr_swap_pages--; for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { @@ -487,6 +483,28 @@ noswap: return (swp_entry_t) {0}; } +/* The only caller of this function is now susupend routine */ +swp_entry_t get_swap_page_of_type(int type) +{ + struct swap_info_struct *si; + pgoff_t offset; + + spin_lock(&swap_lock); + si = swap_info[type]; + if (si && (si->flags & SWP_WRITEOK)) { + nr_swap_pages--; + /* This is called for allocating swap entry, not cache */ + offset = scan_swap_map(si, 1); + if (offset) { + spin_unlock(&swap_lock); + return swp_entry(type, offset); + } + nr_swap_pages++; + } + spin_unlock(&swap_lock); + return (swp_entry_t) {0}; +} + static struct swap_info_struct *swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; @@ -746,74 +764,6 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) #endif #ifdef CONFIG_HIBERNATION - -static pgoff_t hibernation_offset[MAX_SWAPFILES]; -/* - * Once hibernation starts to use swap, we freeze swap_map[]. Otherwise, - * saved swap_map[] image to the disk will be an incomplete because it's - * changing without synchronization with hibernation snap shot. - * At resume, we just make swap_for_hibernation=false. We can forget - * used maps easily. - */ -void hibernation_freeze_swap(void) -{ - int i; - - spin_lock(&swap_lock); - - printk(KERN_INFO "PM: Freeze Swap\n"); - swap_for_hibernation = true; - for (i = 0; i < MAX_SWAPFILES; i++) - hibernation_offset[i] = 1; - spin_unlock(&swap_lock); -} - -void hibernation_thaw_swap(void) -{ - spin_lock(&swap_lock); - if (swap_for_hibernation) { - printk(KERN_INFO "PM: Thaw Swap\n"); - swap_for_hibernation = false; - } - spin_unlock(&swap_lock); -} - -/* - * Because updateing swap_map[] can make not-saved-status-change, - * we use our own easy allocator. - * Please see kernel/power/swap.c, Used swaps are recorded into - * RB-tree. - */ -swp_entry_t get_swap_for_hibernation(int type) -{ - pgoff_t off; - swp_entry_t val = {0}; - struct swap_info_struct *si; - - spin_lock(&swap_lock); - - si = swap_info[type]; - if (!si || !(si->flags & SWP_WRITEOK)) - goto done; - - for (off = hibernation_offset[type]; off < si->max; ++off) { - if (!si->swap_map[off]) - break; - } - if (off < si->max) { - val = swp_entry(type, off); - hibernation_offset[type] = off + 1; - } -done: - spin_unlock(&swap_lock); - return val; -} - -void swap_free_for_hibernation(swp_entry_t ent) -{ - /* Nothing to do */ -} - /* * Find the swap type that corresponds to given device (if any). * -- cgit v1.2.3 From df09162550fbb53354f0c88e85b5d0e6129ee9cc Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Thu, 9 Sep 2010 16:34:59 -0700 Subject: tracing: t_start: reset FTRACE_ITER_HASH in case of seek/pread Be sure to avoid entering t_show() with FTRACE_ITER_HASH set without having properly started the iterator to iterate the hash. This case is degenerate and, as discovered by Robert Swiecki, can cause t_hash_show() to misuse a pointer. This causes a NULL ptr deref with possible security implications. Tracked as CVE-2010-3079. Cc: Robert Swiecki Cc: Eugene Teo Cc: Signed-off-by: Chris Wright Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 83a16e9ee518..fa7ece649fe1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1510,6 +1510,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; + /* reset in case of seek/pread */ + iter->flags &= ~FTRACE_ITER_HASH; return iter; } -- cgit v1.2.3 From 27c379f7f89a4d558c685b5d89b5ba2fe79ae701 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 10 Sep 2010 13:47:29 +0200 Subject: generic-ipi: Fix deadlock in __smp_call_function_single Just got my 6 way machine to a state where cpu 0 is in an endless loop within __smp_call_function_single. All other cpus are idle. The call trace on cpu 0 looks like this: __smp_call_function_single scheduler_tick update_process_times tick_sched_timer __run_hrtimer hrtimer_interrupt clock_comparator_work do_extint ext_int_handler ----> timer irq cpu_idle __smp_call_function_single() got called from nohz_balancer_kick() (inlined) with the remote cpu being 1, wait being 0 and the per cpu variable remote_sched_softirq_cb (call_single_data) of the current cpu (0). Then it loops forever when it tries to grab the lock of the call_single_data, since it is already locked and enqueued on cpu 0. My theory how this could have happened: for some reason the scheduler decided to call __smp_call_function_single() on it's own cpu, and sends an IPI to itself. The interrupt stays pending since IRQs are disabled. If then the hypervisor schedules the cpu away it might happen that upon rescheduling both the IPI and the timer IRQ are pending. If then interrupts are enabled again it depends which one gets scheduled first. If the timer interrupt gets delivered first we end up with the local deadlock as seen in the calltrace above. Let's make __smp_call_function_single() check if the target cpu is the current cpu and execute the function immediately just like smp_call_function_single does. That should prevent at least the scenario described here. It might also be that the scheduler is not supposed to call __smp_call_function_single with the remote cpu being the current cpu, but that is a different issue. Signed-off-by: Heiko Carstens Acked-by: Peter Zijlstra Acked-by: Jens Axboe Cc: Venkatesh Pallipadi Cc: Suresh Siddha LKML-Reference: <20100910114729.GB2827@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/smp.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 75c970c715d3..ed6aacfcb7ef 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -365,9 +365,10 @@ call: EXPORT_SYMBOL_GPL(smp_call_function_any); /** - * __smp_call_function_single(): Run a function on another CPU + * __smp_call_function_single(): Run a function on a specific CPU * @cpu: The CPU to run on. * @data: Pre-allocated and setup data structure + * @wait: If true, wait until function has completed on specified CPU. * * Like smp_call_function_single(), but allow caller to pass in a * pre-allocated data structure. Useful for embedding @data inside @@ -376,8 +377,10 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); void __smp_call_function_single(int cpu, struct call_single_data *data, int wait) { - csd_lock(data); + unsigned int this_cpu; + unsigned long flags; + this_cpu = get_cpu(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can @@ -387,7 +390,15 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() && !oops_in_progress); - generic_exec_single(cpu, data, wait); + if (cpu == this_cpu) { + local_irq_save(flags); + data->func(data->info); + local_irq_restore(flags); + } else { + csd_lock(data); + generic_exec_single(cpu, data, wait); + } + put_cpu(); } /** -- cgit v1.2.3 From 0109c2c48d062a04685638926a35ed20153fedc8 Mon Sep 17 00:00:00 2001 From: mark gross Date: Thu, 9 Sep 2010 23:20:09 +0200 Subject: PM QoS: Correct pr_debug() misuse and improve parameter checks Correct some pr_debug() misuse and add a stronger parameter check to pm_qos_write() for the ASCII hex value case. Thanks to Dan Carpenter for pointing out the problem! Signed-off-by: mark gross Signed-off-by: Rafael J. Wysocki --- kernel/pm_qos_params.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index b7e4c362361b..645e541a45f6 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -389,10 +389,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, } else if (count == 11) { /* len('0x12345678/0') */ if (copy_from_user(ascii_value, buf, 11)) return -EFAULT; + if (strlen(ascii_value) != 10) + return -EINVAL; x = sscanf(ascii_value, "%x", &value); if (x != 1) return -EINVAL; - pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value); + pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); } else return -EINVAL; -- cgit v1.2.3 From 6715045ddc7472a22be5e49d4047d2d89b391f45 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 11 Sep 2010 20:58:27 +0200 Subject: PM / Hibernate: Avoid hitting OOM during preallocation of memory There is a problem in hibernate_preallocate_memory() that it calls preallocate_image_memory() with an argument that may be greater than the total number of available non-highmem memory pages. If that's the case, the OOM condition is guaranteed to trigger, which in turn can cause significant slowdown to occur during hibernation. To avoid that, make preallocate_image_memory() adjust its argument before calling preallocate_image_pages(), so that the total number of saveable non-highem pages left is not less than the minimum size of a hibernation image. Change hibernate_preallocate_memory() to try to allocate from highmem if the number of pages allocated by preallocate_image_memory() is too low. Modify free_unnecessary_pages() to take all possible memory allocation patterns into account. Reported-by: KOSAKI Motohiro Signed-off-by: Rafael J. Wysocki Tested-by: M. Vefa Bicakci --- kernel/power/snapshot.c | 85 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5e7edfb05e66..5209b39e6982 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1122,9 +1122,19 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) return nr_alloc; } -static unsigned long preallocate_image_memory(unsigned long nr_pages) +static unsigned long preallocate_image_memory(unsigned long nr_pages, + unsigned long avail_normal) { - return preallocate_image_pages(nr_pages, GFP_IMAGE); + unsigned long alloc; + + if (avail_normal <= alloc_normal) + return 0; + + alloc = avail_normal - alloc_normal; + if (nr_pages < alloc) + alloc = nr_pages; + + return preallocate_image_pages(alloc, GFP_IMAGE); } #ifdef CONFIG_HIGHMEM @@ -1170,15 +1180,22 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, */ static void free_unnecessary_pages(void) { - unsigned long save_highmem, to_free_normal, to_free_highmem; + unsigned long save, to_free_normal, to_free_highmem; - to_free_normal = alloc_normal - count_data_pages(); - save_highmem = count_highmem_pages(); - if (alloc_highmem > save_highmem) { - to_free_highmem = alloc_highmem - save_highmem; + save = count_data_pages(); + if (alloc_normal >= save) { + to_free_normal = alloc_normal - save; + save = 0; + } else { + to_free_normal = 0; + save -= alloc_normal; + } + save += count_highmem_pages(); + if (alloc_highmem >= save) { + to_free_highmem = alloc_highmem - save; } else { to_free_highmem = 0; - to_free_normal -= save_highmem - alloc_highmem; + to_free_normal -= save - alloc_highmem; } memory_bm_position_reset(©_bm); @@ -1259,7 +1276,7 @@ int hibernate_preallocate_memory(void) { struct zone *zone; unsigned long saveable, size, max_size, count, highmem, pages = 0; - unsigned long alloc, save_highmem, pages_highmem; + unsigned long alloc, save_highmem, pages_highmem, avail_normal; struct timeval start, stop; int error; @@ -1296,6 +1313,7 @@ int hibernate_preallocate_memory(void) else count += zone_page_state(zone, NR_FREE_PAGES); } + avail_normal = count; count += highmem; count -= totalreserve_pages; @@ -1310,12 +1328,21 @@ int hibernate_preallocate_memory(void) */ if (size >= saveable) { pages = preallocate_image_highmem(save_highmem); - pages += preallocate_image_memory(saveable - pages); + pages += preallocate_image_memory(saveable - pages, avail_normal); goto out; } /* Estimate the minimum size of the image. */ pages = minimum_image_size(saveable); + /* + * To avoid excessive pressure on the normal zone, leave room in it to + * accommodate an image of the minimum size (unless it's already too + * small, in which case don't preallocate pages from it at all). + */ + if (avail_normal > pages) + avail_normal -= pages; + else + avail_normal = 0; if (size < pages) size = min_t(unsigned long, pages, max_size); @@ -1336,16 +1363,34 @@ int hibernate_preallocate_memory(void) */ pages_highmem = preallocate_image_highmem(highmem / 2); alloc = (count - max_size) - pages_highmem; - pages = preallocate_image_memory(alloc); - if (pages < alloc) - goto err_out; - size = max_size - size; - alloc = size; - size = preallocate_highmem_fraction(size, highmem, count); - pages_highmem += size; - alloc -= size; - pages += preallocate_image_memory(alloc); - pages += pages_highmem; + pages = preallocate_image_memory(alloc, avail_normal); + if (pages < alloc) { + /* We have exhausted non-highmem pages, try highmem. */ + alloc -= pages; + pages += pages_highmem; + pages_highmem = preallocate_image_highmem(alloc); + if (pages_highmem < alloc) + goto err_out; + pages += pages_highmem; + /* + * size is the desired number of saveable pages to leave in + * memory, so try to preallocate (all memory - size) pages. + */ + alloc = (count - pages) - size; + pages += preallocate_image_highmem(alloc); + } else { + /* + * There are approximately max_size saveable pages at this point + * and we want to reduce this number down to size. + */ + alloc = max_size - size; + size = preallocate_highmem_fraction(alloc, highmem, count); + pages_highmem += size; + alloc -= size; + size = preallocate_image_memory(alloc, avail_normal); + pages_highmem += preallocate_image_highmem(alloc - size); + pages += pages_highmem + size; + } /* * We only need as many page frames for the image as there are saveable -- cgit v1.2.3 From c54fce6eff197d9c57c97afbf6c9722ce434fc8f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 10 Sep 2010 16:51:36 +0200 Subject: workqueue: add documentation Update copyright notice and add Documentation/workqueue.txt. Randy Dunlap, Dave Chinner: misc fixes. Signed-off-by: Tejun Heo Reviewed-By: Florian Mickler Cc: Ingo Molnar Cc: Christoph Lameter Cc: Randy Dunlap Cc: Dave Chinner --- Documentation/workqueue.txt | 380 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/workqueue.h | 4 + kernel/workqueue.c | 27 ++-- 3 files changed, 401 insertions(+), 10 deletions(-) create mode 100644 Documentation/workqueue.txt (limited to 'kernel') diff --git a/Documentation/workqueue.txt b/Documentation/workqueue.txt new file mode 100644 index 000000000000..e4498a2872c3 --- /dev/null +++ b/Documentation/workqueue.txt @@ -0,0 +1,380 @@ + +Concurrency Managed Workqueue (cmwq) + +September, 2010 Tejun Heo + Florian Mickler + +CONTENTS + +1. Introduction +2. Why cmwq? +3. The Design +4. Application Programming Interface (API) +5. Example Execution Scenarios +6. Guidelines + + +1. Introduction + +There are many cases where an asynchronous process execution context +is needed and the workqueue (wq) API is the most commonly used +mechanism for such cases. + +When such an asynchronous execution context is needed, a work item +describing which function to execute is put on a queue. An +independent thread serves as the asynchronous execution context. The +queue is called workqueue and the thread is called worker. + +While there are work items on the workqueue the worker executes the +functions associated with the work items one after the other. When +there is no work item left on the workqueue the worker becomes idle. +When a new work item gets queued, the worker begins executing again. + + +2. Why cmwq? + +In the original wq implementation, a multi threaded (MT) wq had one +worker thread per CPU and a single threaded (ST) wq had one worker +thread system-wide. A single MT wq needed to keep around the same +number of workers as the number of CPUs. The kernel grew a lot of MT +wq users over the years and with the number of CPU cores continuously +rising, some systems saturated the default 32k PID space just booting +up. + +Although MT wq wasted a lot of resource, the level of concurrency +provided was unsatisfactory. The limitation was common to both ST and +MT wq albeit less severe on MT. Each wq maintained its own separate +worker pool. A MT wq could provide only one execution context per CPU +while a ST wq one for the whole system. Work items had to compete for +those very limited execution contexts leading to various problems +including proneness to deadlocks around the single execution context. + +The tension between the provided level of concurrency and resource +usage also forced its users to make unnecessary tradeoffs like libata +choosing to use ST wq for polling PIOs and accepting an unnecessary +limitation that no two polling PIOs can progress at the same time. As +MT wq don't provide much better concurrency, users which require +higher level of concurrency, like async or fscache, had to implement +their own thread pool. + +Concurrency Managed Workqueue (cmwq) is a reimplementation of wq with +focus on the following goals. + +* Maintain compatibility with the original workqueue API. + +* Use per-CPU unified worker pools shared by all wq to provide + flexible level of concurrency on demand without wasting a lot of + resource. + +* Automatically regulate worker pool and level of concurrency so that + the API users don't need to worry about such details. + + +3. The Design + +In order to ease the asynchronous execution of functions a new +abstraction, the work item, is introduced. + +A work item is a simple struct that holds a pointer to the function +that is to be executed asynchronously. Whenever a driver or subsystem +wants a function to be executed asynchronously it has to set up a work +item pointing to that function and queue that work item on a +workqueue. + +Special purpose threads, called worker threads, execute the functions +off of the queue, one after the other. If no work is queued, the +worker threads become idle. These worker threads are managed in so +called thread-pools. + +The cmwq design differentiates between the user-facing workqueues that +subsystems and drivers queue work items on and the backend mechanism +which manages thread-pool and processes the queued work items. + +The backend is called gcwq. There is one gcwq for each possible CPU +and one gcwq to serve work items queued on unbound workqueues. + +Subsystems and drivers can create and queue work items through special +workqueue API functions as they see fit. They can influence some +aspects of the way the work items are executed by setting flags on the +workqueue they are putting the work item on. These flags include +things like CPU locality, reentrancy, concurrency limits and more. To +get a detailed overview refer to the API description of +alloc_workqueue() below. + +When a work item is queued to a workqueue, the target gcwq is +determined according to the queue parameters and workqueue attributes +and appended on the shared worklist of the gcwq. For example, unless +specifically overridden, a work item of a bound workqueue will be +queued on the worklist of exactly that gcwq that is associated to the +CPU the issuer is running on. + +For any worker pool implementation, managing the concurrency level +(how many execution contexts are active) is an important issue. cmwq +tries to keep the concurrency at a minimal but sufficient level. +Minimal to save resources and sufficient in that the system is used at +its full capacity. + +Each gcwq bound to an actual CPU implements concurrency management by +hooking into the scheduler. The gcwq is notified whenever an active +worker wakes up or sleeps and keeps track of the number of the +currently runnable workers. Generally, work items are not expected to +hog a CPU and consume many cycles. That means maintaining just enough +concurrency to prevent work processing from stalling should be +optimal. As long as there are one or more runnable workers on the +CPU, the gcwq doesn't start execution of a new work, but, when the +last running worker goes to sleep, it immediately schedules a new +worker so that the CPU doesn't sit idle while there are pending work +items. This allows using a minimal number of workers without losing +execution bandwidth. + +Keeping idle workers around doesn't cost other than the memory space +for kthreads, so cmwq holds onto idle ones for a while before killing +them. + +For an unbound wq, the above concurrency management doesn't apply and +the gcwq for the pseudo unbound CPU tries to start executing all work +items as soon as possible. The responsibility of regulating +concurrency level is on the users. There is also a flag to mark a +bound wq to ignore the concurrency management. Please refer to the +API section for details. + +Forward progress guarantee relies on that workers can be created when +more execution contexts are necessary, which in turn is guaranteed +through the use of rescue workers. All work items which might be used +on code paths that handle memory reclaim are required to be queued on +wq's that have a rescue-worker reserved for execution under memory +pressure. Else it is possible that the thread-pool deadlocks waiting +for execution contexts to free up. + + +4. Application Programming Interface (API) + +alloc_workqueue() allocates a wq. The original create_*workqueue() +functions are deprecated and scheduled for removal. alloc_workqueue() +takes three arguments - @name, @flags and @max_active. @name is the +name of the wq and also used as the name of the rescuer thread if +there is one. + +A wq no longer manages execution resources but serves as a domain for +forward progress guarantee, flush and work item attributes. @flags +and @max_active control how work items are assigned execution +resources, scheduled and executed. + +@flags: + + WQ_NON_REENTRANT + + By default, a wq guarantees non-reentrance only on the same + CPU. A work item may not be executed concurrently on the same + CPU by multiple workers but is allowed to be executed + concurrently on multiple CPUs. This flag makes sure + non-reentrance is enforced across all CPUs. Work items queued + to a non-reentrant wq are guaranteed to be executed by at most + one worker system-wide at any given time. + + WQ_UNBOUND + + Work items queued to an unbound wq are served by a special + gcwq which hosts workers which are not bound to any specific + CPU. This makes the wq behave as a simple execution context + provider without concurrency management. The unbound gcwq + tries to start execution of work items as soon as possible. + Unbound wq sacrifices locality but is useful for the following + cases. + + * Wide fluctuation in the concurrency level requirement is + expected and using bound wq may end up creating large number + of mostly unused workers across different CPUs as the issuer + hops through different CPUs. + + * Long running CPU intensive workloads which can be better + managed by the system scheduler. + + WQ_FREEZEABLE + + A freezeable wq participates in the freeze phase of the system + suspend operations. Work items on the wq are drained and no + new work item starts execution until thawed. + + WQ_RESCUER + + All wq which might be used in the memory reclaim paths _MUST_ + have this flag set. This reserves one worker exclusively for + the execution of this wq under memory pressure. + + WQ_HIGHPRI + + Work items of a highpri wq are queued at the head of the + worklist of the target gcwq and start execution regardless of + the current concurrency level. In other words, highpri work + items will always start execution as soon as execution + resource is available. + + Ordering among highpri work items is preserved - a highpri + work item queued after another highpri work item will start + execution after the earlier highpri work item starts. + + Although highpri work items are not held back by other + runnable work items, they still contribute to the concurrency + level. Highpri work items in runnable state will prevent + non-highpri work items from starting execution. + + This flag is meaningless for unbound wq. + + WQ_CPU_INTENSIVE + + Work items of a CPU intensive wq do not contribute to the + concurrency level. In other words, runnable CPU intensive + work items will not prevent other work items from starting + execution. This is useful for bound work items which are + expected to hog CPU cycles so that their execution is + regulated by the system scheduler. + + Although CPU intensive work items don't contribute to the + concurrency level, start of their executions is still + regulated by the concurrency management and runnable + non-CPU-intensive work items can delay execution of CPU + intensive work items. + + This flag is meaningless for unbound wq. + + WQ_HIGHPRI | WQ_CPU_INTENSIVE + + This combination makes the wq avoid interaction with + concurrency management completely and behave as a simple + per-CPU execution context provider. Work items queued on a + highpri CPU-intensive wq start execution as soon as resources + are available and don't affect execution of other work items. + +@max_active: + +@max_active determines the maximum number of execution contexts per +CPU which can be assigned to the work items of a wq. For example, +with @max_active of 16, at most 16 work items of the wq can be +executing at the same time per CPU. + +Currently, for a bound wq, the maximum limit for @max_active is 512 +and the default value used when 0 is specified is 256. For an unbound +wq, the limit is higher of 512 and 4 * num_possible_cpus(). These +values are chosen sufficiently high such that they are not the +limiting factor while providing protection in runaway cases. + +The number of active work items of a wq is usually regulated by the +users of the wq, more specifically, by how many work items the users +may queue at the same time. Unless there is a specific need for +throttling the number of active work items, specifying '0' is +recommended. + +Some users depend on the strict execution ordering of ST wq. The +combination of @max_active of 1 and WQ_UNBOUND is used to achieve this +behavior. Work items on such wq are always queued to the unbound gcwq +and only one work item can be active at any given time thus achieving +the same ordering property as ST wq. + + +5. Example Execution Scenarios + +The following example execution scenarios try to illustrate how cmwq +behave under different configurations. + + Work items w0, w1, w2 are queued to a bound wq q0 on the same CPU. + w0 burns CPU for 5ms then sleeps for 10ms then burns CPU for 5ms + again before finishing. w1 and w2 burn CPU for 5ms then sleep for + 10ms. + +Ignoring all other tasks, works and processing overhead, and assuming +simple FIFO scheduling, the following is one highly simplified version +of possible sequences of events with the original wq. + + TIME IN MSECS EVENT + 0 w0 starts and burns CPU + 5 w0 sleeps + 15 w0 wakes up and burns CPU + 20 w0 finishes + 20 w1 starts and burns CPU + 25 w1 sleeps + 35 w1 wakes up and finishes + 35 w2 starts and burns CPU + 40 w2 sleeps + 50 w2 wakes up and finishes + +And with cmwq with @max_active >= 3, + + TIME IN MSECS EVENT + 0 w0 starts and burns CPU + 5 w0 sleeps + 5 w1 starts and burns CPU + 10 w1 sleeps + 10 w2 starts and burns CPU + 15 w2 sleeps + 15 w0 wakes up and burns CPU + 20 w0 finishes + 20 w1 wakes up and finishes + 25 w2 wakes up and finishes + +If @max_active == 2, + + TIME IN MSECS EVENT + 0 w0 starts and burns CPU + 5 w0 sleeps + 5 w1 starts and burns CPU + 10 w1 sleeps + 15 w0 wakes up and burns CPU + 20 w0 finishes + 20 w1 wakes up and finishes + 20 w2 starts and burns CPU + 25 w2 sleeps + 35 w2 wakes up and finishes + +Now, let's assume w1 and w2 are queued to a different wq q1 which has +WQ_HIGHPRI set, + + TIME IN MSECS EVENT + 0 w1 and w2 start and burn CPU + 5 w1 sleeps + 10 w2 sleeps + 10 w0 starts and burns CPU + 15 w0 sleeps + 15 w1 wakes up and finishes + 20 w2 wakes up and finishes + 25 w0 wakes up and burns CPU + 30 w0 finishes + +If q1 has WQ_CPU_INTENSIVE set, + + TIME IN MSECS EVENT + 0 w0 starts and burns CPU + 5 w0 sleeps + 5 w1 and w2 start and burn CPU + 10 w1 sleeps + 15 w2 sleeps + 15 w0 wakes up and burns CPU + 20 w0 finishes + 20 w1 wakes up and finishes + 25 w2 wakes up and finishes + + +6. Guidelines + +* Do not forget to use WQ_RESCUER if a wq may process work items which + are used during memory reclaim. Each wq with WQ_RESCUER set has one + rescuer thread reserved for it. If there is dependency among + multiple work items used during memory reclaim, they should be + queued to separate wq each with WQ_RESCUER. + +* Unless strict ordering is required, there is no need to use ST wq. + +* Unless there is a specific need, using 0 for @max_active is + recommended. In most use cases, concurrency level usually stays + well under the default limit. + +* A wq serves as a domain for forward progress guarantee (WQ_RESCUER), + flush and work item attributes. Work items which are not involved + in memory reclaim and don't need to be flushed as a part of a group + of work items, and don't require any special attribute, can use one + of the system wq. There is no difference in execution + characteristics between using a dedicated wq and a system wq. + +* Unless work items are expected to consume a huge amount of CPU + cycles, using a bound wq is usually beneficial due to the increased + level of locality in wq operations and work item execution. diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index f11100f96482..25e02c941bac 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -235,6 +235,10 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } #define work_clear_pending(work) \ clear_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) +/* + * Workqueue flags and constants. For details, please refer to + * Documentation/workqueue.txt. + */ enum { WQ_NON_REENTRANT = 1 << 0, /* guarantee non-reentrance */ WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 727f24e563ae..f77afd939229 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1,19 +1,26 @@ /* - * linux/kernel/workqueue.c + * kernel/workqueue.c - generic async execution with shared worker pool * - * Generic mechanism for defining kernel helper threads for running - * arbitrary tasks in process context. + * Copyright (C) 2002 Ingo Molnar * - * Started by Ingo Molnar, Copyright (C) 2002 + * Derived from the taskqueue/keventd code by: + * David Woodhouse + * Andrew Morton + * Kai Petzke + * Theodore Ts'o * - * Derived from the taskqueue/keventd code by: + * Made to use alloc_percpu by Christoph Lameter. * - * David Woodhouse - * Andrew Morton - * Kai Petzke - * Theodore Ts'o + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo * - * Made to use alloc_percpu by Christoph Lameter. + * This is the generic async execution mechanism. Work items as are + * executed in process context. The worker pool is shared and + * automatically managed. There is one worker pool for each CPU and + * one extra for works which are better served by workers which are + * not bound to any specific CPU. + * + * Please read Documentation/workqueue.txt for details. */ #include -- cgit v1.2.3 From 0bf377bbb0bea6130f35613491887cc622e42a8b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 12 Sep 2010 08:14:52 +0200 Subject: sched: Improve latencies under load by decreasing minimum scheduling granularity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mathieu reported bad latencies with make -j10 kind of kbuild workloads - which is mostly caused by us scheduling with a too coarse granularity. Reduce the minimum granularity some more, to make sure we can meet the latency target. I got the following results (make -j10 kbuild load, average of 3 runs): vanilla: maximum latency: 38278.9 µs average latency: 7730.1 µs patched: maximum latency: 22702.1 µs average latency: 6684.8 µs Mathieu also measured it: | | * wakeup-latency.c (SIGEV_THREAD) with make -j10 | | - Mainline 2.6.35.2 kernel | | maximum latency: 45762.1 µs | average latency: 7348.6 µs | | - With only Peter's smaller min_gran (shown below): | | maximum latency: 29100.6 µs | average latency: 6684.1 µs | Reported-by: Mathieu Desnoyers Reported-by: Linus Torvalds Acked-by: Mathieu Desnoyers Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9b5b4f86b742..a171138a9402 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -54,13 +54,13 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling * Minimal preemption granularity for CPU-bound tasks: * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 2000000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; +unsigned int sysctl_sched_min_granularity = 750000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -static unsigned int sched_nr_latency = 3; +static unsigned int sched_nr_latency = 8; /* * After fork, child runs first. If set to 0 (default) then -- cgit v1.2.3 From c41d68a513c71e35a14f66d71782d27a79a81ea6 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 7 Sep 2010 16:16:18 -0700 Subject: compat: Make compat_alloc_user_space() incorporate the access_ok() compat_alloc_user_space() expects the caller to independently call access_ok() to verify the returned area. A missing call could introduce problems on some architectures. This patch incorporates the access_ok() check into compat_alloc_user_space() and also adds a sanity check on the length. The existing compat_alloc_user_space() implementations are renamed arch_compat_alloc_user_space() and are used as part of the implementation of the new global function. This patch assumes NULL will cause __get_user()/__put_user() to either fail or access userspace on all architectures. This should be followed by checking the return value of compat_access_user_space() for NULL in the callers, at which time the access_ok() in the callers can also be removed. Reported-by: Ben Hawkes Signed-off-by: H. Peter Anvin Acked-by: Benjamin Herrenschmidt Acked-by: Chris Metcalf Acked-by: David S. Miller Acked-by: Ingo Molnar Acked-by: Thomas Gleixner Acked-by: Tony Luck Cc: Andrew Morton Cc: Arnd Bergmann Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Heiko Carstens Cc: Helge Deller Cc: James Bottomley Cc: Kyle McMartin Cc: Martin Schwidefsky Cc: Paul Mackerras Cc: Ralf Baechle Cc: --- arch/ia64/include/asm/compat.h | 2 +- arch/mips/include/asm/compat.h | 2 +- arch/parisc/include/asm/compat.h | 2 +- arch/powerpc/include/asm/compat.h | 2 +- arch/s390/include/asm/compat.h | 2 +- arch/sparc/include/asm/compat.h | 2 +- arch/tile/include/asm/compat.h | 2 +- arch/x86/include/asm/compat.h | 2 +- include/linux/compat.h | 3 +++ kernel/compat.c | 21 +++++++++++++++++++++ 10 files changed, 32 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/include/asm/compat.h b/arch/ia64/include/asm/compat.h index f90edc85b509..9301a2821615 100644 --- a/arch/ia64/include/asm/compat.h +++ b/arch/ia64/include/asm/compat.h @@ -199,7 +199,7 @@ ptr_to_compat(void __user *uptr) } static __inline__ void __user * -compat_alloc_user_space (long len) +arch_compat_alloc_user_space (long len) { struct pt_regs *regs = task_pt_regs(current); return (void __user *) (((regs->r12 & 0xffffffff) & -16) - len); diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index 613f6912dfc1..dbc51065df5b 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -145,7 +145,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr) return (u32)(unsigned long)uptr; } -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { struct pt_regs *regs = (struct pt_regs *) ((unsigned long) current_thread_info() + THREAD_SIZE - 32) - 1; diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index 02b77baa5da6..efa0b60c63fe 100644 --- a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -147,7 +147,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr) return (u32)(unsigned long)uptr; } -static __inline__ void __user *compat_alloc_user_space(long len) +static __inline__ void __user *arch_compat_alloc_user_space(long len) { struct pt_regs *regs = ¤t->thread.regs; return (void __user *)regs->gr[30]; diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 396d21a80058..a11d4eac4f97 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -134,7 +134,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr) return (u32)(unsigned long)uptr; } -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { struct pt_regs *regs = current->thread.regs; unsigned long usp = regs->gpr[1]; diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 104f2007f097..a875c2f542e1 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -181,7 +181,7 @@ static inline int is_compat_task(void) #endif -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { unsigned long stack; diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index 5016f76ea98a..6f57325bb883 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -167,7 +167,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr) return (u32)(unsigned long)uptr; } -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { struct pt_regs *regs = current_thread_info()->kregs; unsigned long usp = regs->u_regs[UREG_I6]; diff --git a/arch/tile/include/asm/compat.h b/arch/tile/include/asm/compat.h index 5a34da6cdd79..345d81ce44bb 100644 --- a/arch/tile/include/asm/compat.h +++ b/arch/tile/include/asm/compat.h @@ -195,7 +195,7 @@ static inline unsigned long ptr_to_compat_reg(void __user *uptr) return (long)(int)(long __force)uptr; } -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { struct pt_regs *regs = task_pt_regs(current); return (void __user *)regs->sp - len; diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 306160e58b48..1d9cd27c2920 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -205,7 +205,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr) return (u32)(unsigned long)uptr; } -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { struct pt_regs *regs = task_pt_regs(current); return (void __user *)regs->sp - len; diff --git a/include/linux/compat.h b/include/linux/compat.h index 9ddc8780e8db..5778b559d59c 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -360,5 +360,8 @@ extern ssize_t compat_rw_copy_check_uvector(int type, const struct compat_iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, struct iovec **ret_pointer); + +extern void __user *compat_alloc_user_space(unsigned long len); + #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ diff --git a/kernel/compat.c b/kernel/compat.c index e167efce8423..c9e2ec0b34a8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1126,3 +1126,24 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info) return 0; } + +/* + * Allocate user-space memory for the duration of a single system call, + * in order to marshall parameters inside a compat thunk. + */ +void __user *compat_alloc_user_space(unsigned long len) +{ + void __user *ptr; + + /* If len would occupy more than half of the entire compat space... */ + if (unlikely(len > (((compat_uptr_t)~0) >> 1))) + return NULL; + + ptr = arch_compat_alloc_user_space(len); + + if (unlikely(!access_ok(VERIFY_WRITE, ptr, len))) + return NULL; + + return ptr; +} +EXPORT_SYMBOL_GPL(compat_alloc_user_space); -- cgit v1.2.3 From e75e863dd5c7d96b91ebbd241da5328fc38a78cc Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 14 Sep 2010 16:35:14 +0200 Subject: sched: Fix user time incorrectly accounted as system time on 32-bit We have 32-bit variable overflow possibility when multiply in task_times() and thread_group_times() functions. When the overflow happens then the scaled utime value becomes erroneously small and the scaled stime becomes i erroneously big. Reported here: https://bugzilla.redhat.com/show_bug.cgi?id=633037 https://bugzilla.kernel.org/show_bug.cgi?id=16559 Reported-by: Michael Chapman Reported-by: Ciriaco Garcia de Celis Signed-off-by: Stanislaw Gruszka Signed-off-by: Peter Zijlstra Cc: Hidetoshi Seto Cc: # 2.6.32.19+ (partially) and 2.6.33+ LKML-Reference: <20100914143513.GB8415@redhat.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ed09d4f2a69c..dc85ceb90832 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3513,9 +3513,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) rtime = nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { - u64 temp; + u64 temp = rtime; - temp = (u64)(rtime * utime); + temp *= utime; do_div(temp, total); utime = (cputime_t)temp; } else @@ -3546,9 +3546,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) rtime = nsecs_to_cputime(cputime.sum_exec_runtime); if (total) { - u64 temp; + u64 temp = rtime; - temp = (u64)(rtime * cputime.utime); + temp *= cputime.utime; do_div(temp, total); utime = (cputime_t)temp; } else -- cgit v1.2.3 From 068e35eee9ef98eb4cab55181977e24995d273be Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Mon, 13 Sep 2010 13:01:18 -0700 Subject: hw breakpoints: Fix pid namespace bug Hardware breakpoints can't be registered within pid namespaces because tsk->pid is passed rather than the pid in the current namespace. (See https://bugzilla.kernel.org/show_bug.cgi?id=17281 ) This is a quick fix demonstrating the problem but is not the best method of solving the problem since passing pids internally is not the best way to avoid pid namespace bugs. Subsequent patches will show a better solution. Much thanks to Frederic Weisbecker for doing the bulk of the work finding this bug. Reported-by: Robin Green Signed-off-by: Matt Helsley Signed-off-by: Peter Zijlstra Cc: Prasad Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Will Deacon Cc: Mahesh Salgaonkar Cc: 2.6.33-2.6.35 LKML-Reference: Signed-off-by: Ingo Molnar Signed-off-by: Frederic Weisbecker --- kernel/hw_breakpoint.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index d71a987fd2bf..c7c2aed9e2dc 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -433,7 +433,8 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, struct task_struct *tsk) { - return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); + return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk), + triggered); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); -- cgit v1.2.3 From f6c3f1686e7ec1dd8725a9a3dcb857dfd0c7a5bf Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 13 Sep 2010 11:02:21 -0700 Subject: sched: Fix nohz balance kick There's a situation where the nohz balancer will try to wake itself: cpu-x is idle which is also ilb_cpu got a scheduler tick during idle and the nohz_kick_needed() in trigger_load_balance() checks for rq_x->nr_running which might not be zero (because of someone waking a task on this rq etc) and this leads to the situation of the cpu-x sending a kick to itself. And this can cause a lockup. Avoid this by not marking ourself eligible for kicking. Signed-off-by: Suresh Siddha Signed-off-by: Peter Zijlstra LKML-Reference: <1284400941.2684.19.camel@sbsiddha-MOBL3.sc.intel.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a171138a9402..db3f674ca49d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3630,7 +3630,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) if (time_before(now, nohz.next_balance)) return 0; - if (!rq->nr_running) + if (rq->idle_at_tick) return 0; first_pick_cpu = atomic_read(&nohz.first_pick_cpu); -- cgit v1.2.3 From a247c3a97a0216b18a46243eda26081f1928ec37 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 22 Sep 2010 13:05:12 -0700 Subject: rmap: fix walk during fork The below bug in fork led to the rmap walk finding the parent huge-pmd twice instead of just once, because the anon_vma_chain objects of the child vma still point to the vma->vm_mm of the parent. The patch fixes it by making the rmap walk accurate during fork. It's not a big deal normally but it worth being accurate considering the cost is the same. Signed-off-by: Andrea Arcangeli Acked-by: Johannes Weiner Acked-by: Rik van Riel Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b7e9d60a675d..c445f8cc408d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -356,10 +356,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); + tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~VM_LOCKED; - tmp->vm_mm = mm; tmp->vm_next = tmp->vm_prev = NULL; file = tmp->vm_file; if (file) { -- cgit v1.2.3 From 399f1e30ac17b77d383444aff480c7390f5adf2a Mon Sep 17 00:00:00 2001 From: "Ira W. Snyder" Date: Thu, 30 Sep 2010 15:15:27 -0700 Subject: kfifo: fix scatterlist usage The kfifo_dma family of functions use sg_mark_end() on the last element in their scatterlist. This forces use of a fresh scatterlist for each DMA operation, which makes recycling a single scatterlist impossible. Change the behavior of the kfifo_dma functions to match the usage of the dma_map_sg function. This means that users must respect the returned nents value. The sample code is updated to reflect the change. This bug is trivial to cause: call kfifo_dma_in_prepare() such that it prepares a scatterlist with a single entry comprising the whole fifo. This is the case when you map the entirety of a newly created empty fifo. This causes the setup_sgl() function to mark the first scatterlist entry as the end of the chain, no matter what comes after it. Afterwards, add and remove some data from the fifo such that another call to kfifo_dma_in_prepare() will create two scatterlist entries. It returns nents=2. However, due to the previous sg_mark_end() call, sg_is_last() will now return true for the first scatterlist element. This causes the sample code to print a single scatterlist element when it should print two. By removing the call to sg_mark_end(), we make the API as similar as possible to the DMA mapping API. All users are required to respect the returned nents. Signed-off-by: Ira W. Snyder Cc: Stefani Seibold Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kfifo.c | 2 -- samples/kfifo/dma-example.c | 17 +++++++++-------- 2 files changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 6b5580c57644..01a0700e873f 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -365,8 +365,6 @@ static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, n = setup_sgl_buf(sgl, fifo->data + off, nents, l); n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); - if (n) - sg_mark_end(sgl + n - 1); return n; } diff --git a/samples/kfifo/dma-example.c b/samples/kfifo/dma-example.c index ee03a4f0b64f..06473791c08a 100644 --- a/samples/kfifo/dma-example.c +++ b/samples/kfifo/dma-example.c @@ -24,6 +24,7 @@ static int __init example_init(void) { int i; unsigned int ret; + unsigned int nents; struct scatterlist sg[10]; printk(KERN_INFO "DMA fifo test start\n"); @@ -61,9 +62,9 @@ static int __init example_init(void) * byte at the beginning, after the kfifo_skip(). */ sg_init_table(sg, ARRAY_SIZE(sg)); - ret = kfifo_dma_in_prepare(&fifo, sg, ARRAY_SIZE(sg), FIFO_SIZE); - printk(KERN_INFO "DMA sgl entries: %d\n", ret); - if (!ret) { + nents = kfifo_dma_in_prepare(&fifo, sg, ARRAY_SIZE(sg), FIFO_SIZE); + printk(KERN_INFO "DMA sgl entries: %d\n", nents); + if (!nents) { /* fifo is full and no sgl was created */ printk(KERN_WARNING "error kfifo_dma_in_prepare\n"); return -EIO; @@ -71,7 +72,7 @@ static int __init example_init(void) /* receive data */ printk(KERN_INFO "scatterlist for receive:\n"); - for (i = 0; i < ARRAY_SIZE(sg); i++) { + for (i = 0; i < nents; i++) { printk(KERN_INFO "sg[%d] -> " "page_link 0x%.8lx offset 0x%.8x length 0x%.8x\n", @@ -91,16 +92,16 @@ static int __init example_init(void) kfifo_dma_in_finish(&fifo, ret); /* Prepare to transmit data, example: 8 bytes */ - ret = kfifo_dma_out_prepare(&fifo, sg, ARRAY_SIZE(sg), 8); - printk(KERN_INFO "DMA sgl entries: %d\n", ret); - if (!ret) { + nents = kfifo_dma_out_prepare(&fifo, sg, ARRAY_SIZE(sg), 8); + printk(KERN_INFO "DMA sgl entries: %d\n", nents); + if (!nents) { /* no data was available and no sgl was created */ printk(KERN_WARNING "error kfifo_dma_out_prepare\n"); return -EIO; } printk(KERN_INFO "scatterlist for transmit:\n"); - for (i = 0; i < ARRAY_SIZE(sg); i++) { + for (i = 0; i < nents; i++) { printk(KERN_INFO "sg[%d] -> " "page_link 0x%.8lx offset 0x%.8x length 0x%.8x\n", -- cgit v1.2.3 From 5336377d6225959624146629ce3fc88ee8ecda3d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 5 Oct 2010 11:29:27 -0700 Subject: modules: Fix module_bug_list list corruption race With all the recent module loading cleanups, we've minimized the code that sits under module_mutex, fixing various deadlocks and making it possible to do most of the module loading in parallel. However, that whole conversion totally missed the rather obscure code that adds a new module to the list for BUG() handling. That code was doubly obscure because (a) the code itself lives in lib/bugs.c (for dubious reasons) and (b) it gets called from the architecture-specific "module_finalize()" rather than from generic code. Calling it from arch-specific code makes no sense what-so-ever to begin with, and is now actively wrong since that code isn't protected by the module loading lock any more. So this commit moves the "module_bug_{finalize,cleanup}()" calls away from the arch-specific code, and into the generic code - and in the process protects it with the module_mutex so that the list operations are now safe. Future fixups: - move the module list handling code into kernel/module.c where it belongs. - get rid of 'module_bug_list' and just use the regular list of modules (called 'modules' - imagine that) that we already create and maintain for other reasons. Reported-and-tested-by: Thomas Gleixner Cc: Rusty Russell Cc: Adrian Bunk Cc: Andrew Morton Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- arch/avr32/kernel/module.c | 3 +-- arch/h8300/kernel/module.c | 3 +-- arch/mn10300/kernel/module.c | 3 +-- arch/parisc/kernel/module.c | 3 +-- arch/powerpc/kernel/module.c | 5 ----- arch/s390/kernel/module.c | 3 +-- arch/sh/kernel/module.c | 2 -- arch/x86/kernel/module.c | 3 +-- include/linux/module.h | 5 ++--- kernel/module.c | 4 ++++ lib/bug.c | 6 ++---- 11 files changed, 14 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/arch/avr32/kernel/module.c b/arch/avr32/kernel/module.c index 98f94d041d9c..a727f54d64d6 100644 --- a/arch/avr32/kernel/module.c +++ b/arch/avr32/kernel/module.c @@ -314,10 +314,9 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, vfree(module->arch.syminfo); module->arch.syminfo = NULL; - return module_bug_finalize(hdr, sechdrs, module); + return 0; } void module_arch_cleanup(struct module *module) { - module_bug_cleanup(module); } diff --git a/arch/h8300/kernel/module.c b/arch/h8300/kernel/module.c index 0865e291c20d..db4953dc4e1b 100644 --- a/arch/h8300/kernel/module.c +++ b/arch/h8300/kernel/module.c @@ -112,10 +112,9 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - return module_bug_finalize(hdr, sechdrs, me); + return 0; } void module_arch_cleanup(struct module *mod) { - module_bug_cleanup(mod); } diff --git a/arch/mn10300/kernel/module.c b/arch/mn10300/kernel/module.c index 6aea7fd76993..196a111e2e29 100644 --- a/arch/mn10300/kernel/module.c +++ b/arch/mn10300/kernel/module.c @@ -206,7 +206,7 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - return module_bug_finalize(hdr, sechdrs, me); + return 0; } /* @@ -214,5 +214,4 @@ int module_finalize(const Elf_Ehdr *hdr, */ void module_arch_cleanup(struct module *mod) { - module_bug_cleanup(mod); } diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index 159a2b81e90c..6e81bb596e5b 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -941,11 +941,10 @@ int module_finalize(const Elf_Ehdr *hdr, nsyms = newptr - (Elf_Sym *)symhdr->sh_addr; DEBUGP("NEW num_symtab %lu\n", nsyms); symhdr->sh_size = nsyms * sizeof(Elf_Sym); - return module_bug_finalize(hdr, sechdrs, me); + return 0; } void module_arch_cleanup(struct module *mod) { deregister_unwind_table(mod); - module_bug_cleanup(mod); } diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index 477c663e0140..4ef93ae2235f 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -65,10 +65,6 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sect; int err; - err = module_bug_finalize(hdr, sechdrs, me); - if (err) - return err; - /* Apply feature fixups */ sect = find_section(hdr, sechdrs, "__ftr_fixup"); if (sect != NULL) @@ -101,5 +97,4 @@ int module_finalize(const Elf_Ehdr *hdr, void module_arch_cleanup(struct module *mod) { - module_bug_cleanup(mod); } diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 22cfd634c355..f7167ee4604c 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -407,10 +407,9 @@ int module_finalize(const Elf_Ehdr *hdr, { vfree(me->arch.syminfo); me->arch.syminfo = NULL; - return module_bug_finalize(hdr, sechdrs, me); + return 0; } void module_arch_cleanup(struct module *mod) { - module_bug_cleanup(mod); } diff --git a/arch/sh/kernel/module.c b/arch/sh/kernel/module.c index 43adddfe4c04..ae0be697a89e 100644 --- a/arch/sh/kernel/module.c +++ b/arch/sh/kernel/module.c @@ -149,13 +149,11 @@ int module_finalize(const Elf_Ehdr *hdr, int ret = 0; ret |= module_dwarf_finalize(hdr, sechdrs, me); - ret |= module_bug_finalize(hdr, sechdrs, me); return ret; } void module_arch_cleanup(struct module *mod) { - module_bug_cleanup(mod); module_dwarf_cleanup(mod); } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index e0bc186d7501..1c355c550960 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -239,11 +239,10 @@ int module_finalize(const Elf_Ehdr *hdr, apply_paravirt(pseg, pseg + para->sh_size); } - return module_bug_finalize(hdr, sechdrs, me); + return 0; } void module_arch_cleanup(struct module *mod) { alternatives_smp_module_del(mod); - module_bug_cleanup(mod); } diff --git a/include/linux/module.h b/include/linux/module.h index 8a6b9fdc7ffa..aace066bad8f 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -686,17 +686,16 @@ extern int module_sysfs_initialized; #ifdef CONFIG_GENERIC_BUG -int module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, +void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, struct module *); void module_bug_cleanup(struct module *); #else /* !CONFIG_GENERIC_BUG */ -static inline int module_bug_finalize(const Elf_Ehdr *hdr, +static inline void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod) { - return 0; } static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ diff --git a/kernel/module.c b/kernel/module.c index d0b5f8db11b4..ccd641991842 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1537,6 +1537,7 @@ static int __unlink_module(void *_mod) { struct module *mod = _mod; list_del(&mod->list); + module_bug_cleanup(mod); return 0; } @@ -2625,6 +2626,7 @@ static struct module *load_module(void __user *umod, if (err < 0) goto ddebug; + module_bug_finalize(info.hdr, info.sechdrs, mod); list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -2650,6 +2652,8 @@ static struct module *load_module(void __user *umod, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + module_bug_cleanup(mod); + ddebug: if (!mod->taints) dynamic_debug_remove(info.debug); diff --git a/lib/bug.c b/lib/bug.c index 7cdfad88128f..19552096d16b 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -72,8 +72,8 @@ static const struct bug_entry *module_find_bug(unsigned long bugaddr) return NULL; } -int module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, - struct module *mod) +void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, + struct module *mod) { char *secstrings; unsigned int i; @@ -97,8 +97,6 @@ int module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, * could potentially lead to deadlock and thus be counter-productive. */ list_add(&mod->bug_list, &module_bug_list); - - return 0; } void module_bug_cleanup(struct module *mod) -- cgit v1.2.3 From a337fdac7a5622d1e6547f4b476c14dfe5a2c892 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Sep 2010 20:32:19 +0200 Subject: HWPOISON: Copy si_addr_lsb to user The original hwpoison code added a new siginfo field si_addr_lsb to pass the granuality of the fault address to user space. Unfortunately this field was never copied to user space. Fix this here. I added explicit checks for the MCEERR codes to avoid having to patch all potential callers to initialize the field. Signed-off-by: Andi Kleen --- kernel/signal.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index bded65187780..919562c3d6b7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2214,6 +2214,14 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_addr, &to->si_addr); #ifdef __ARCH_SI_TRAPNO err |= __put_user(from->si_trapno, &to->si_trapno); +#endif +#ifdef BUS_MCEERR_AO + /* + * Other callers might not initialize the si_lsb field, + * so check explicitely for the right codes here. + */ + if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) + err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); #endif break; case __SI_CHLD: -- cgit v1.2.3 From 27b3d80a7b6adcf069b5e869e4efcc3a79f88a91 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Oct 2010 12:59:29 -0700 Subject: sysctl: fix min/max handling in __do_proc_doulongvec_minmax() When proc_doulongvec_minmax() is used with an array of longs, and no min/max check requested (.extra1 or .extra2 being NULL), we dereference a NULL pointer for the second element of the array. Noticed while doing some changes in network stack for the "16TB problem" Fix is to not change min & max pointers in __do_proc_doulongvec_minmax(), so that all elements of the vector share an unique min/max limit, like proc_dointvec_minmax(). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Eric Dumazet Cc: "Eric W. Biederman" Cc: Americo Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f88552c6d227..3a45c224770f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2485,7 +2485,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int kbuf[left] = 0; } - for (; left && vleft--; i++, min++, max++, first=0) { + for (; left && vleft--; i++, first = 0) { unsigned long val; if (write) { -- cgit v1.2.3 From ad0cf3478de8677f720ee06393b3147819568d6a Mon Sep 17 00:00:00 2001 From: John Blackwood Date: Tue, 28 Sep 2010 18:03:11 -0400 Subject: perf: Fix incorrect copy_from_user() usage perf events: repair incorrect use of copy_from_user This makes the perf_event_period() return 0 instead of -EFAULT on success. Signed-off-by: John Blackwood Signed-off-by: Joe Korty Acked-by: Peter Zijlstra LKML-Reference: <20100928220311.GA18145@tsunami.ccur.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index db5b56064687..b98bed3d8182 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2202,15 +2202,13 @@ static void perf_event_for_each(struct perf_event *event, static int perf_event_period(struct perf_event *event, u64 __user *arg) { struct perf_event_context *ctx = event->ctx; - unsigned long size; int ret = 0; u64 value; if (!event->attr.sample_period) return -EINVAL; - size = copy_from_user(&value, arg, sizeof(value)); - if (size != sizeof(value)) + if (copy_from_user(&value, arg, sizeof(value))) return -EFAULT; if (!value) -- cgit v1.2.3 From d01343244abdedd18303d0323b518ed9cdcb1988 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 12 Oct 2010 12:06:43 -0400 Subject: ring-buffer: Fix typo of time extends per page Time stamps for the ring buffer are created by the difference between two events. Each page of the ring buffer holds a full 64 bit timestamp. Each event has a 27 bit delta stamp from the last event. The unit of time is nanoseconds, so 27 bits can hold ~134 milliseconds. If two events happen more than 134 milliseconds apart, a time extend is inserted to add more bits for the delta. The time extend has 59 bits, which is good for ~18 years. Currently the time extend is committed separately from the event. If an event is discarded before it is committed, due to filtering, the time extend still exists. If all events are being filtered, then after ~134 milliseconds a new time extend will be added to the buffer. This can only happen till the end of the page. Since each page holds a full timestamp, there is no reason to add a time extend to the beginning of a page. Time extends can only fill a page that has actual data at the beginning, so there is no fear that time extends will fill more than a page without any data. When reading an event, a loop is made to skip over time extends since they are only used to maintain the time stamp and are never given to the caller. As a paranoid check to prevent the loop running forever, with the knowledge that time extends may only fill a page, a check is made that tests the iteration of the loop, and if the iteration is more than the number of time extends that can fit in a page a warning is printed and the ring buffer is disabled (all of ftrace is also disabled with it). There is another event type that is called a TIMESTAMP which can hold 64 bits of data in the theoretical case that two events happen 18 years apart. This code has not been implemented, but the name of this event exists, as well as the structure for it. The size of a TIMESTAMP is 16 bytes, where as a time extend is only 8 bytes. The macro used to calculate how many time extends can fit on a page used the TIMESTAMP size instead of the time extend size cutting the amount in half. The following test case can easily trigger the warning since we only need to have half the page filled with time extends to trigger the warning: # cd /sys/kernel/debug/tracing/ # echo function > current_tracer # echo 'common_pid < 0' > events/ftrace/function/filter # echo > trace # echo 1 > trace_marker # sleep 120 # cat trace Enabling the function tracer and then setting the filter to only trace functions where the process id is negative (no events), then clearing the trace buffer to ensure that we have nothing in the buffer, then write to trace_marker to add an event to the beginning of a page, sleep for 2 minutes (only 35 seconds is probably needed, but this guarantees the bug), and then finally reading the trace which will trigger the bug. This patch fixes the typo and prevents the false positive of that warning. Reported-by: Hans J. Koch Tested-by: Hans J. Koch Cc: Thomas Gleixner Cc: Stable Kernel Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 492197e2f86c..bca96377fd4e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -405,7 +405,7 @@ static inline int test_time_stamp(u64 delta) #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) /* Max number of timestamps that can fit on a page */ -#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) +#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND) int ring_buffer_print_page_header(struct trace_seq *s) { -- cgit v1.2.3