From 6892b75e60557a48c01d57ba320419a9e2ce9846 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 13 Feb 2008 14:02:36 +0100
Subject: sched: make early bootup sched_clock() use safer

do not call sched_clock() too early. Not only might rq->idle
not be set up - but pure per-cpu data might not be accessible
either.

this solves an ia64 early bootup hang with CONFIG_PRINTK_TIME=y.

Tested-by: Tony Luck <tony.luck@gmail.com>
Acked-by: Tony Luck <tony.luck@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b387a8de26a5..7286ccb01082 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -668,6 +668,8 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
  */
 unsigned int sysctl_sched_rt_period = 1000000;
 
+static __read_mostly int scheduler_running;
+
 /*
  * part of the period that we allow rt tasks to run in us.
  * default: 0.95s
@@ -689,14 +691,16 @@ unsigned long long cpu_clock(int cpu)
 	unsigned long flags;
 	struct rq *rq;
 
-	local_irq_save(flags);
-	rq = cpu_rq(cpu);
 	/*
 	 * Only call sched_clock() if the scheduler has already been
 	 * initialized (some code might call cpu_clock() very early):
 	 */
-	if (rq->idle)
-		update_rq_clock(rq);
+	if (unlikely(!scheduler_running))
+		return 0;
+
+	local_irq_save(flags);
+	rq = cpu_rq(cpu);
+	update_rq_clock(rq);
 	now = rq->clock;
 	local_irq_restore(flags);
 
@@ -7284,6 +7288,8 @@ void __init sched_init(void)
 	 * During early bootup we pretend to be a normal task:
 	 */
 	current->sched_class = &fair_sched_class;
+
+	scheduler_running = 1;
 }
 
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-- 
cgit v1.2.3


From 70eee74b70c1a8485ec5f2bafa13dbc66fab6e02 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Fri, 22 Feb 2008 13:25:53 +0530
Subject: sched: remove duplicate code from sched_fair.c

pick_task_entity() duplicates existing code. This functionality can be
easily obtained using rb_last(). Avoid code duplication by using rb_last().

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6c091d6e159d..7abad50d935f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -202,16 +202,13 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 
 static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
-	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
-	struct sched_entity *se = NULL;
-	struct rb_node *parent;
-
-	while (*link) {
-		parent = *link;
-		se = rb_entry(parent, struct sched_entity, run_node);
-		link = &parent->rb_right;
-	}
+	struct rb_node *last;
+	struct sched_entity *se;
 
+	last = rb_last(&cfs_rq->tasks_timeline);
+	if (!last)
+		return NULL;
+	se = rb_entry(last, struct sched_entity, run_node);
 	return se;
 }
 
-- 
cgit v1.2.3


From 7eee3e677d6e2e9007afcd7d79b0715525aa552e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 22 Feb 2008 10:32:21 +0100
Subject: sched: clean up __pick_last_entity() a bit

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 7abad50d935f..c8e6492c5925 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -202,14 +202,12 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 
 static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
-	struct rb_node *last;
-	struct sched_entity *se;
+	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
 
-	last = rb_last(&cfs_rq->tasks_timeline);
 	if (!last)
 		return NULL;
-	se = rb_entry(last, struct sched_entity, run_node);
-	return se;
+
+	return rb_entry(last, struct sched_entity, run_node);
 }
 
 /**************************************************************
-- 
cgit v1.2.3


From 67ca7bde2e9d3516b5ae0188330ad1059ac03f38 Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Fri, 15 Feb 2008 09:56:36 -0800
Subject: sched: fix signedness warnings in sched.c

Unsigned long values are always assigned to switch_count,
make it unsigned long.

kernel/sched.c:3897:15: warning: incorrect type in assignment (different signedness)
kernel/sched.c:3897:15:    expected long *switch_count
kernel/sched.c:3897:15:    got unsigned long *<noident>
kernel/sched.c:3921:16: warning: incorrect type in assignment (different signedness)
kernel/sched.c:3921:16:    expected long *switch_count
kernel/sched.c:3921:16:    got unsigned long *<noident>

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7286ccb01082..f06950c8a6ce 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3889,7 +3889,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
 asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
-	long *switch_count;
+	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
-- 
cgit v1.2.3


From 1481197b50114d7212d659d41cb97f31a8934883 Mon Sep 17 00:00:00 2001
From: Dale Farnsworth <dale@farnsworth.org>
Date: Mon, 25 Feb 2008 23:03:02 +0100
Subject: Subject: lockdep: include all lock classes in all_lock_classes Add
 each lock class to the all_lock_classes list when it is first registered.

Previously, lock classes were added to all_lock_classes when
the lock class was first used.  Since one of the uses of the
list is to find unused locks, this didn't work well.

Signed-off-by: Dale Farnsworth <dale@farnsworth.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3574379f4d62..81a4e4a3f087 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -779,6 +779,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	 * parallel walking of the hash-list safe:
 	 */
 	list_add_tail_rcu(&class->hash_entry, hash_head);
+	/*
+	 * Add it to the global list of classes:
+	 */
+	list_add_tail_rcu(&class->lock_entry, &all_lock_classes);
 
 	if (verbose(class)) {
 		graph_unlock();
@@ -2282,10 +2286,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
 			return 0;
 		break;
 	case LOCK_USED:
-		/*
-		 * Add it to the global list of classes:
-		 */
-		list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes);
 		debug_atomic_dec(&nr_unused_locks);
 		break;
 	default:
-- 
cgit v1.2.3


From cf3680b90c7842cf91ed857ac4528f4e057da366 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 14 Feb 2008 10:32:07 +0900
Subject: printk: fix possible printk overrun

printk recursion detection prepends message to printk_buf and offsets
printk_buf when actual message is printed but it forgets to trim buffer
length accordingly. This can result in overrun in extreme cases. Fix it.

[ mingo@elte.hu:

  bug was introduced by me via:

   commit 32a76006683f7b28ae3cc491da37716e002f198e
   Author: Ingo Molnar <mingo@elte.hu>
   Date:   Fri Jan 25 21:07:58 2008 +0100

       printk: make printk more robust by not allowing recursion
]

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index bee36100f110..9adc2a473e6e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -666,7 +666,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	}
 	/* Emit the output into the temporary buffer */
 	printed_len += vscnprintf(printk_buf + printed_len,
-				  sizeof(printk_buf), fmt, args);
+				  sizeof(printk_buf) - printed_len, fmt, args);
 
 	/*
 	 * Copy the output into log_buf.  If the caller didn't provide
-- 
cgit v1.2.3


From 2232c2d8e0a6a31061dec311f3d1cf7624bc14f1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 29 Feb 2008 18:46:50 +0100
Subject: rcu: add support for dynamic ticks and preempt rcu

The PREEMPT-RCU can get stuck if a CPU goes idle and NO_HZ is set. The
idle CPU will not progress the RCU through its grace period and a
synchronize_rcu my get stuck. Without this patch I have a box that will
not boot when PREEMPT_RCU and NO_HZ are set. That same box boots fine
with this patch.

This patch comes from the -rt kernel where it has been tested for
several months.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hardirq.h    |  10 ++
 include/linux/rcuclassic.h |   3 +
 include/linux/rcupreempt.h |  22 +++++
 kernel/rcupreempt.c        | 224 ++++++++++++++++++++++++++++++++++++++++++++-
 kernel/softirq.c           |   1 +
 kernel/time/tick-sched.c   |   3 +
 6 files changed, 259 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 2961ec788046..49829988bfa0 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -109,6 +109,14 @@ static inline void account_system_vtime(struct task_struct *tsk)
 }
 #endif
 
+#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
+extern void rcu_irq_enter(void);
+extern void rcu_irq_exit(void);
+#else
+# define rcu_irq_enter() do { } while (0)
+# define rcu_irq_exit() do { } while (0)
+#endif /* CONFIG_PREEMPT_RCU */
+
 /*
  * It is safe to do non-atomic ops on ->hardirq_context,
  * because NMI handlers may not preempt and the ops are
@@ -117,6 +125,7 @@ static inline void account_system_vtime(struct task_struct *tsk)
  */
 #define __irq_enter()					\
 	do {						\
+		rcu_irq_enter();			\
 		account_system_vtime(current);		\
 		add_preempt_count(HARDIRQ_OFFSET);	\
 		trace_hardirq_enter();			\
@@ -135,6 +144,7 @@ extern void irq_enter(void);
 		trace_hardirq_exit();			\
 		account_system_vtime(current);		\
 		sub_preempt_count(HARDIRQ_OFFSET);	\
+		rcu_irq_exit();				\
 	} while (0)
 
 /*
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 4d6624260b4c..b3dccd68629e 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -160,5 +160,8 @@ extern void rcu_restart_cpu(int cpu);
 extern long rcu_batches_completed(void);
 extern long rcu_batches_completed_bh(void);
 
+#define rcu_enter_nohz()	do { } while (0)
+#define rcu_exit_nohz()		do { } while (0)
+
 #endif /* __KERNEL__ */
 #endif /* __LINUX_RCUCLASSIC_H */
diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
index 60c2a033b19e..01152ed532c8 100644
--- a/include/linux/rcupreempt.h
+++ b/include/linux/rcupreempt.h
@@ -82,5 +82,27 @@ extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu);
 
 struct softirq_action;
 
+#ifdef CONFIG_NO_HZ
+DECLARE_PER_CPU(long, dynticks_progress_counter);
+
+static inline void rcu_enter_nohz(void)
+{
+	__get_cpu_var(dynticks_progress_counter)++;
+	WARN_ON(__get_cpu_var(dynticks_progress_counter) & 0x1);
+	mb();
+}
+
+static inline void rcu_exit_nohz(void)
+{
+	mb();
+	__get_cpu_var(dynticks_progress_counter)++;
+	WARN_ON(!(__get_cpu_var(dynticks_progress_counter) & 0x1));
+}
+
+#else /* CONFIG_NO_HZ */
+#define rcu_enter_nohz()	do { } while (0)
+#define rcu_exit_nohz()		do { } while (0)
+#endif /* CONFIG_NO_HZ */
+
 #endif /* __KERNEL__ */
 #endif /* __LINUX_RCUPREEMPT_H */
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 987cfb7ade89..c7c52096df48 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -23,6 +23,10 @@
  *		to Suparna Bhattacharya for pushing me completely away
  *		from atomic instructions on the read side.
  *
+ *  - Added handling of Dynamic Ticks
+ *      Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
+ *                     - Steven Rostedt <srostedt@redhat.com>
+ *
  * Papers:  http://www.rdrop.com/users/paulmck/RCU
  *
  * Design Document: http://lwn.net/Articles/253651/
@@ -409,6 +413,212 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
 	}
 }
 
+#ifdef CONFIG_NO_HZ
+
+DEFINE_PER_CPU(long, dynticks_progress_counter) = 1;
+static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
+static DEFINE_PER_CPU(int, rcu_update_flag);
+
+/**
+ * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
+ *
+ * If the CPU was idle with dynamic ticks active, this updates the
+ * dynticks_progress_counter to let the RCU handling know that the
+ * CPU is active.
+ */
+void rcu_irq_enter(void)
+{
+	int cpu = smp_processor_id();
+
+	if (per_cpu(rcu_update_flag, cpu))
+		per_cpu(rcu_update_flag, cpu)++;
+
+	/*
+	 * Only update if we are coming from a stopped ticks mode
+	 * (dynticks_progress_counter is even).
+	 */
+	if (!in_interrupt() &&
+	    (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) {
+		/*
+		 * The following might seem like we could have a race
+		 * with NMI/SMIs. But this really isn't a problem.
+		 * Here we do a read/modify/write, and the race happens
+		 * when an NMI/SMI comes in after the read and before
+		 * the write. But NMI/SMIs will increment this counter
+		 * twice before returning, so the zero bit will not
+		 * be corrupted by the NMI/SMI which is the most important
+		 * part.
+		 *
+		 * The only thing is that we would bring back the counter
+		 * to a postion that it was in during the NMI/SMI.
+		 * But the zero bit would be set, so the rest of the
+		 * counter would again be ignored.
+		 *
+		 * On return from the IRQ, the counter may have the zero
+		 * bit be 0 and the counter the same as the return from
+		 * the NMI/SMI. If the state machine was so unlucky to
+		 * see that, it still doesn't matter, since all
+		 * RCU read-side critical sections on this CPU would
+		 * have already completed.
+		 */
+		per_cpu(dynticks_progress_counter, cpu)++;
+		/*
+		 * The following memory barrier ensures that any
+		 * rcu_read_lock() primitives in the irq handler
+		 * are seen by other CPUs to follow the above
+		 * increment to dynticks_progress_counter. This is
+		 * required in order for other CPUs to correctly
+		 * determine when it is safe to advance the RCU
+		 * grace-period state machine.
+		 */
+		smp_mb(); /* see above block comment. */
+		/*
+		 * Since we can't determine the dynamic tick mode from
+		 * the dynticks_progress_counter after this routine,
+		 * we use a second flag to acknowledge that we came
+		 * from an idle state with ticks stopped.
+		 */
+		per_cpu(rcu_update_flag, cpu)++;
+		/*
+		 * If we take an NMI/SMI now, they will also increment
+		 * the rcu_update_flag, and will not update the
+		 * dynticks_progress_counter on exit. That is for
+		 * this IRQ to do.
+		 */
+	}
+}
+
+/**
+ * rcu_irq_exit - Called from exiting Hard irq context.
+ *
+ * If the CPU was idle with dynamic ticks active, update the
+ * dynticks_progress_counter to put let the RCU handling be
+ * aware that the CPU is going back to idle with no ticks.
+ */
+void rcu_irq_exit(void)
+{
+	int cpu = smp_processor_id();
+
+	/*
+	 * rcu_update_flag is set if we interrupted the CPU
+	 * when it was idle with ticks stopped.
+	 * Once this occurs, we keep track of interrupt nesting
+	 * because a NMI/SMI could also come in, and we still
+	 * only want the IRQ that started the increment of the
+	 * dynticks_progress_counter to be the one that modifies
+	 * it on exit.
+	 */
+	if (per_cpu(rcu_update_flag, cpu)) {
+		if (--per_cpu(rcu_update_flag, cpu))
+			return;
+
+		/* This must match the interrupt nesting */
+		WARN_ON(in_interrupt());
+
+		/*
+		 * If an NMI/SMI happens now we are still
+		 * protected by the dynticks_progress_counter being odd.
+		 */
+
+		/*
+		 * The following memory barrier ensures that any
+		 * rcu_read_unlock() primitives in the irq handler
+		 * are seen by other CPUs to preceed the following
+		 * increment to dynticks_progress_counter. This
+		 * is required in order for other CPUs to determine
+		 * when it is safe to advance the RCU grace-period
+		 * state machine.
+		 */
+		smp_mb(); /* see above block comment. */
+		per_cpu(dynticks_progress_counter, cpu)++;
+		WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1);
+	}
+}
+
+static void dyntick_save_progress_counter(int cpu)
+{
+	per_cpu(rcu_dyntick_snapshot, cpu) =
+		per_cpu(dynticks_progress_counter, cpu);
+}
+
+static inline int
+rcu_try_flip_waitack_needed(int cpu)
+{
+	long curr;
+	long snap;
+
+	curr = per_cpu(dynticks_progress_counter, cpu);
+	snap = per_cpu(rcu_dyntick_snapshot, cpu);
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU remained in dynticks mode for the entire time
+	 * and didn't take any interrupts, NMIs, SMIs, or whatever,
+	 * then it cannot be in the middle of an rcu_read_lock(), so
+	 * the next rcu_read_lock() it executes must use the new value
+	 * of the counter.  So we can safely pretend that this CPU
+	 * already acknowledged the counter.
+	 */
+
+	if ((curr == snap) && ((curr & 0x1) == 0))
+		return 0;
+
+	/*
+	 * If the CPU passed through or entered a dynticks idle phase with
+	 * no active irq handlers, then, as above, we can safely pretend
+	 * that this CPU already acknowledged the counter.
+	 */
+
+	if ((curr - snap) > 2 || (snap & 0x1) == 0)
+		return 0;
+
+	/* We need this CPU to explicitly acknowledge the counter flip. */
+
+	return 1;
+}
+
+static inline int
+rcu_try_flip_waitmb_needed(int cpu)
+{
+	long curr;
+	long snap;
+
+	curr = per_cpu(dynticks_progress_counter, cpu);
+	snap = per_cpu(rcu_dyntick_snapshot, cpu);
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU remained in dynticks mode for the entire time
+	 * and didn't take any interrupts, NMIs, SMIs, or whatever,
+	 * then it cannot have executed an RCU read-side critical section
+	 * during that time, so there is no need for it to execute a
+	 * memory barrier.
+	 */
+
+	if ((curr == snap) && ((curr & 0x1) == 0))
+		return 0;
+
+	/*
+	 * If the CPU either entered or exited an outermost interrupt,
+	 * SMI, NMI, or whatever handler, then we know that it executed
+	 * a memory barrier when doing so.  So we don't need another one.
+	 */
+	if (curr != snap)
+		return 0;
+
+	/* We need the CPU to execute a memory barrier. */
+
+	return 1;
+}
+
+#else /* !CONFIG_NO_HZ */
+
+# define dyntick_save_progress_counter(cpu)	do { } while (0)
+# define rcu_try_flip_waitack_needed(cpu)	(1)
+# define rcu_try_flip_waitmb_needed(cpu)	(1)
+
+#endif /* CONFIG_NO_HZ */
+
 /*
  * Get here when RCU is idle.  Decide whether we need to
  * move out of idle state, and return non-zero if so.
@@ -447,8 +657,10 @@ rcu_try_flip_idle(void)
 
 	/* Now ask each CPU for acknowledgement of the flip. */
 
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask(cpu, rcu_cpu_online_map) {
 		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
+		dyntick_save_progress_counter(cpu);
+	}
 
 	return 1;
 }
@@ -464,7 +676,8 @@ rcu_try_flip_waitack(void)
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
 	for_each_cpu_mask(cpu, rcu_cpu_online_map)
-		if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
+		if (rcu_try_flip_waitack_needed(cpu) &&
+		    per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
 			return 0;
 		}
@@ -509,8 +722,10 @@ rcu_try_flip_waitzero(void)
 	smp_mb();  /*  ^^^^^^^^^^^^ */
 
 	/* Call for a memory barrier from each CPU. */
-	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+	for_each_cpu_mask(cpu, rcu_cpu_online_map) {
 		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
+		dyntick_save_progress_counter(cpu);
+	}
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
 	return 1;
@@ -528,7 +743,8 @@ rcu_try_flip_waitmb(void)
 
 	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
 	for_each_cpu_mask(cpu, rcu_cpu_online_map)
-		if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
+		if (rcu_try_flip_waitmb_needed(cpu) &&
+		    per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
 			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
 			return 0;
 		}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5b3aea5f471e..31e9f2a47928 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -313,6 +313,7 @@ void irq_exit(void)
 	/* Make sure that timer wheel updates are propagated */
 	if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
 		tick_nohz_stop_sched_tick();
+	rcu_irq_exit();
 #endif
 	preempt_enable_no_resched();
 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index fa9bb73dbdb4..2968298f8f36 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -282,6 +282,7 @@ void tick_nohz_stop_sched_tick(void)
 			ts->idle_tick = ts->sched_timer.expires;
 			ts->tick_stopped = 1;
 			ts->idle_jiffies = last_jiffies;
+			rcu_enter_nohz();
 		}
 
 		/*
@@ -375,6 +376,8 @@ void tick_nohz_restart_sched_tick(void)
 		return;
 	}
 
+	rcu_exit_nohz();
+
 	/* Update jiffies first */
 	select_nohz_load_balancer(0);
 	now = ktime_get();
-- 
cgit v1.2.3


From 7be2a03e3174cee3a3cdcdf17db357470f51caff Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Fri, 8 Feb 2008 15:41:13 +0100
Subject: softlockup: fix task state setting

kthread_stop() can be called when a 'watchdog' thread is executing after
kthread_should_stop() but before set_task_state(TASK_INTERRUPTIBLE).

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softlockup.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 7c2da88db4ed..01b6522fd92b 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -216,26 +216,27 @@ static int watchdog(void *__bind_cpu)
 	/* initialize timestamp */
 	touch_softlockup_watchdog();
 
+	set_current_state(TASK_INTERRUPTIBLE);
 	/*
 	 * Run briefly once per second to reset the softlockup timestamp.
 	 * If this gets delayed for more than 60 seconds then the
 	 * debug-printout triggers in softlockup_tick().
 	 */
 	while (!kthread_should_stop()) {
-		set_current_state(TASK_INTERRUPTIBLE);
 		touch_softlockup_watchdog();
 		schedule();
 
 		if (kthread_should_stop())
 			break;
 
-		if (this_cpu != check_cpu)
-			continue;
-
-		if (sysctl_hung_task_timeout_secs)
-			check_hung_uninterruptible_tasks(this_cpu);
+		if (this_cpu == check_cpu) {
+			if (sysctl_hung_task_timeout_secs)
+				check_hung_uninterruptible_tasks(this_cpu);
+		}
 
+		set_current_state(TASK_INTERRUPTIBLE);
 	}
+	__set_current_state(TASK_RUNNING);
 
 	return 0;
 }
-- 
cgit v1.2.3


From ae778869ae4549628b9e83efe958c3aaa63ed1b9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 27 Feb 2008 16:21:10 -0800
Subject: rcupreempt: fix hibernate/resume in presence of PREEMPT_RCU and
 hotplug

This fixes a oops encountered when doing hibernate/resume in presence of
PREEMPT_RCU.

The problem was that the code failed to disable preemption when
accessing a per-CPU variable.  This is OK when called from code that
already has preemption disabled, but such is not the case from the
suspend/resume code path.

Reported-by: Dave Young <hidave.darkstar@gmail.com>
Tested-by: Dave Young <hidave.darkstar@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index c7c52096df48..845abcd472b0 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -918,8 +918,9 @@ void rcu_offline_cpu(int cpu)
 	 * fix.
 	 */
 
+	local_irq_save(flags);
 	rdp = RCU_DATA_ME();
-	spin_lock_irqsave(&rdp->lock, flags);
+	spin_lock(&rdp->lock);
 	*rdp->nexttail = list;
 	if (list)
 		rdp->nexttail = tail;
-- 
cgit v1.2.3


From c9e71002aacc9821e99531dcc130db88bbc8ad05 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 28 Feb 2008 11:51:07 -0800
Subject: rcupreempt: remove never-migrates assumption from
 rcu_process_callbacks()

This patch fixes a potentially invalid access to a per-CPU variable in
rcu_process_callbacks().

This per-CPU access needs to be done in such a way as to guarantee that
the code using it cannot move to some other CPU before all uses of the
value accessed have completed.  Even though this code is currently only
invoked from softirq context, which currrently cannot migrate to some
other CPU, life would be better if this code did not silently make such
an assumption.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 845abcd472b0..e9517014b57c 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -952,9 +952,11 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 {
 	unsigned long flags;
 	struct rcu_head *next, *list;
-	struct rcu_data *rdp = RCU_DATA_ME();
+	struct rcu_data *rdp;
 
-	spin_lock_irqsave(&rdp->lock, flags);
+	local_irq_save(flags);
+	rdp = RCU_DATA_ME();
+	spin_lock(&rdp->lock);
 	list = rdp->donelist;
 	if (list == NULL) {
 		spin_unlock_irqrestore(&rdp->lock, flags);
-- 
cgit v1.2.3


From 422b03cf75e11dfdfb29b0f19709bac585335f86 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Wed, 27 Feb 2008 10:39:22 -0500
Subject: [PATCH] Audit: Fix the format type for size_t variables

Fix the following compiler warning by using "%zu" as defined in C99.

  CC      kernel/auditsc.o
  kernel/auditsc.c: In function 'audit_log_single_execve_arg':
  kernel/auditsc.c:1074: warning: format '%ld' expects type 'long int', but
  argument 4 has type 'size_t'

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2087d6de67ea..782262e4107d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1070,7 +1070,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 		 * so we can be sure nothing was lost.
 		 */
 		if ((i == 0) && (too_long))
-			audit_log_format(*ab, "a%d_len=%ld ", arg_num,
+			audit_log_format(*ab, "a%d_len=%zu ", arg_num,
 					 has_cntl ? 2*len : len);
 
 		/*
-- 
cgit v1.2.3


From b29ee87e9b441e72454efd1be56aa1a05ffb2f58 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 Feb 2008 15:53:05 -0500
Subject: [RFC] AUDIT: do not panic when printk loses messages

On the latest kernels if one was to load about 15 rules, set the failure
state to panic, and then run service auditd stop the kernel will panic.
This is because auditd stops, then the script deletes all of the rules.
These deletions are sent as audit messages out of the printk kernel
interface which is already known to be lossy.  These will overun the
default kernel rate limiting (10 really fast messages) and will call
audit_panic().  The same effect can happen if a slew of avc's come
through while auditd is stopped.

This can be fixed a number of ways but this patch fixes the problem by
just not panicing if auditd is not running.  We know printk is lossy and
if the user chooses to set the failure mode to panic and tries to use
printk we can't make any promises no matter how hard we try, so why try?
At least in this way we continue to get lost message accounting and will
eventually know that things went bad.

The other change is to add a new call to audit_log_lost() if auditd
disappears.  We already pulled the skb off the queue and couldn't send
it so that message is lost.  At least this way we will account for the
last message and panic if the machine is configured to panic.  This code
path should only be run if auditd dies for unforeseen reasons.  If
auditd closes correctly audit_pid will get set to 0 and we won't walk
this code path.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 2eeea9a14240..6d7175c1e878 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -170,7 +170,9 @@ void audit_panic(const char *message)
 			printk(KERN_ERR "audit: %s\n", message);
 		break;
 	case AUDIT_FAIL_PANIC:
-		panic("audit: %s\n", message);
+		/* test audit_pid since printk is always losey, why bother? */
+		if (audit_pid)
+			panic("audit: %s\n", message);
 		break;
 	}
 }
@@ -352,6 +354,7 @@ static int kauditd_thread(void *dummy)
 				if (err < 0) {
 					BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
 					printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
+					audit_log_lost("auditd dissapeared\n");
 					audit_pid = 0;
 				}
 			} else {
-- 
cgit v1.2.3


From 8d07a67cface19ac07d7324f38bda7bbb06bbdb2 Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Thu, 21 Feb 2008 16:59:22 -0500
Subject: [PATCH] drop EOE records from printk

Hi,

While we are looking at the printk issue, I see that its printk'ing the EOE
(end of event) records which is really not something that we need in syslog.
Its really intended for the realtime audit event stream handled by the audit
daemon. So, lets avoid printk'ing that record type.

Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 6d7175c1e878..10c4930c2bbf 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1353,17 +1353,19 @@ void audit_log_end(struct audit_buffer *ab)
 	if (!audit_rate_check()) {
 		audit_log_lost("rate limit exceeded");
 	} else {
+		struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
 		if (audit_pid) {
-			struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
 			nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
 			skb_queue_tail(&audit_skb_queue, ab->skb);
 			ab->skb = NULL;
 			wake_up_interruptible(&kauditd_wait);
-		} else if (printk_ratelimit()) {
-			struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
-			printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, ab->skb->data + NLMSG_SPACE(0));
-		} else {
-			audit_log_lost("printk limit exceeded\n");
+		} else if (nlh->nlmsg_type != AUDIT_EOE) {
+			if (printk_ratelimit()) {
+				printk(KERN_NOTICE "type=%d %s\n",
+					nlh->nlmsg_type,
+					ab->skb->data + NLMSG_SPACE(0));
+			} else
+				audit_log_lost("printk limit exceeded\n");
 		}
 	}
 	audit_buffer_free(ab);
-- 
cgit v1.2.3


From f49ee505b1ecb5960984880740f09aba87f870dc Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 2 Mar 2008 21:44:40 +0300
Subject: introduce kill_orphaned_pgrp() helper

Factor out the common code in reparent_thread() and exit_notify().

No functional changes.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 74 ++++++++++++++++++++++++++++-------------------------------
 1 file changed, 35 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 506a957b665a..11fcce760151 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -255,6 +255,37 @@ static int has_stopped_jobs(struct pid *pgrp)
 	return retval;
 }
 
+/*
+ * Check to see if any process groups have become orphaned as
+ * a result of our exiting, and if they have any stopped jobs,
+ * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+ */
+static void
+kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
+{
+	struct pid *pgrp = task_pgrp(tsk);
+	struct task_struct *ignored_task = tsk;
+
+	if (!parent)
+		 /* exit: our father is in a different pgrp than
+		  * we are and we were the only connection outside.
+		  */
+		parent = tsk->real_parent;
+	else
+		/* reparent: our child is in a different pgrp than
+		 * we are, and it was the only connection outside.
+		 */
+		ignored_task = NULL;
+
+	if (task_pgrp(parent) != pgrp &&
+	    task_session(parent) == task_session(tsk) &&
+	    will_become_orphaned_pgrp(pgrp, ignored_task) &&
+	    has_stopped_jobs(pgrp)) {
+		__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
+		__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
+	}
+}
+
 /**
  * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
  *
@@ -635,22 +666,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 	    p->exit_signal != -1 && thread_group_empty(p))
 		do_notify_parent(p, p->exit_signal);
 
-	/*
-	 * process group orphan check
-	 * Case ii: Our child is in a different pgrp
-	 * than we are, and it was the only connection
-	 * outside, so the child pgrp is now orphaned.
-	 */
-	if ((task_pgrp(p) != task_pgrp(father)) &&
-	    (task_session(p) == task_session(father))) {
-		struct pid *pgrp = task_pgrp(p);
-
-		if (will_become_orphaned_pgrp(pgrp, NULL) &&
-		    has_stopped_jobs(pgrp)) {
-			__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
-			__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
-		}
-	}
+	kill_orphaned_pgrp(p, father);
 }
 
 /*
@@ -738,8 +754,6 @@ static void forget_original_parent(struct task_struct *father)
 static void exit_notify(struct task_struct *tsk)
 {
 	int state;
-	struct task_struct *t;
-	struct pid *pgrp;
 
 	/*
 	 * This does two things:
@@ -753,25 +767,7 @@ static void exit_notify(struct task_struct *tsk)
 	exit_task_namespaces(tsk);
 
 	write_lock_irq(&tasklist_lock);
-	/*
-	 * Check to see if any process groups have become orphaned
-	 * as a result of our exiting, and if they have any stopped
-	 * jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
-	 *
-	 * Case i: Our father is in a different pgrp than we are
-	 * and we were the only connection outside, so our pgrp
-	 * is about to become orphaned.
-	 */
-	t = tsk->real_parent;
-
-	pgrp = task_pgrp(tsk);
-	if ((task_pgrp(t) != pgrp) &&
-	    (task_session(t) == task_session(tsk)) &&
-	    will_become_orphaned_pgrp(pgrp, tsk) &&
-	    has_stopped_jobs(pgrp)) {
-		__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
-		__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
-	}
+	kill_orphaned_pgrp(tsk, NULL);
 
 	/* Let father know we died
 	 *
@@ -788,8 +784,8 @@ static void exit_notify(struct task_struct *tsk)
 	 * the same after a fork.
 	 */
 	if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 &&
-	    ( tsk->parent_exec_id != t->self_exec_id  ||
-	      tsk->self_exec_id != tsk->parent_exec_id)
+	    (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
+	     tsk->self_exec_id != tsk->parent_exec_id)
 	    && !capable(CAP_KILL))
 		tsk->exit_signal = SIGCHLD;
 
-- 
cgit v1.2.3


From 05e83df624fe682bb8571cdb2c6d5284a99c3066 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 2 Mar 2008 21:44:42 +0300
Subject: will_become_orphaned_pgrp: partially fix insufficient ->exit_state
 check

p->exit_state != 0 doesn't mean this process is dead, it may have
sub-threads.  Change the code to use "p->exit_state && thread_group_empty(p)"
instead.

Without this patch, ^Z doesn't deliver SIGTSTP to the foreground process
if the main thread has exited.

However, the new check is not perfect either.  There is a window when
exit_notify() drops tasklist and before release_task().  Suppose that
the last (non-leader) thread exits.  This means that entire group exits,
but thread_group_empty() is not true yet.

As Eric pointed out, is_global_init() is wrong as well, but I did not
dare to do other changes.

Just for the record, has_stopped_jobs() is absolutely wrong too.  But we
can't fix it now, we should first fix SIGNAL_STOP_STOPPED issues.

Even with this patch ^Z doesn't play well with the dead main thread.
The task is stopped correctly but do_wait(WSTOPPED) won't see it.  This
is another unrelated issue, will be (hopefully) fixed separately.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 11fcce760151..41c1edace97a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -214,20 +214,19 @@ struct pid *session_of_pgrp(struct pid *pgrp)
 static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
 {
 	struct task_struct *p;
-	int ret = 1;
 
 	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
-		if (p == ignored_task
-				|| p->exit_state
-				|| is_global_init(p->real_parent))
+		if ((p == ignored_task) ||
+		    (p->exit_state && thread_group_empty(p)) ||
+		    is_global_init(p->real_parent))
 			continue;
+
 		if (task_pgrp(p->real_parent) != pgrp &&
-		    task_session(p->real_parent) == task_session(p)) {
-			ret = 0;
-			break;
-		}
+		    task_session(p->real_parent) == task_session(p))
+			return 0;
 	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
-	return ret;	/* (sighing) "Often!" */
+
+	return 1;
 }
 
 int is_current_pgrp_orphaned(void)
-- 
cgit v1.2.3


From 821c7de7194e77afee1a69d50830a329a6d9af9f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 2 Mar 2008 21:44:44 +0300
Subject: exit_notify: fix kill_orphaned_pgrp() usage with mt exit

1. exit_notify() always calls kill_orphaned_pgrp(). This is wrong, we
   should do this only when the whole process exits.

2. exit_notify() uses "current" as "ignored_task", obviously wrong.
   Use ->group_leader instead.

Test case:

	void hup(int sig)
	{
		printf("HUP received\n");
	}

	void *tfunc(void *arg)
	{
		sleep(2);
		printf("sub-thread exited\n");
		return NULL;
	}

	int main(int argc, char *argv[])
	{
		if (!fork()) {
			signal(SIGHUP, hup);
			kill(getpid(), SIGSTOP);
			exit(0);
		}

		pthread_t thr;
		pthread_create(&thr, NULL, tfunc, NULL);

		sleep(1);
		printf("main thread exited\n");
		syscall(__NR_exit, 0);

		return 0;
	}

output:

	main thread exited
	HUP received
	Hangup

With this patch the output is:

	main thread exited
	sub-thread exited
	HUP received

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 41c1edace97a..cd20bf07e9e3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -750,7 +750,7 @@ static void forget_original_parent(struct task_struct *father)
  * Send signals to all our closest relatives so that they know
  * to properly mourn us..
  */
-static void exit_notify(struct task_struct *tsk)
+static void exit_notify(struct task_struct *tsk, int group_dead)
 {
 	int state;
 
@@ -766,7 +766,8 @@ static void exit_notify(struct task_struct *tsk)
 	exit_task_namespaces(tsk);
 
 	write_lock_irq(&tasklist_lock);
-	kill_orphaned_pgrp(tsk, NULL);
+	if (group_dead)
+		kill_orphaned_pgrp(tsk->group_leader, NULL);
 
 	/* Let father know we died
 	 *
@@ -981,7 +982,7 @@ NORET_TYPE void do_exit(long code)
 		module_put(tsk->binfmt->module);
 
 	proc_exit_connector(tsk);
-	exit_notify(tsk);
+	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
 	mpol_free(tsk->mempolicy);
 	tsk->mempolicy = NULL;
-- 
cgit v1.2.3


From 13b1c3d4b49bd83d861c775ca2db54e1692a1b07 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Mon, 3 Mar 2008 20:22:05 -0800
Subject: freezer vs stopped or traced

This changes the "freezer" code used by suspend/hibernate in its treatment
of tasks in TASK_STOPPED (job control stop) and TASK_TRACED (ptrace) states.

As I understand it, the intent of the "freezer" is to hold all tasks
from doing anything significant.  For this purpose, TASK_STOPPED and
TASK_TRACED are "frozen enough".  It's possible the tasks might resume
from ptrace calls (if the tracer were unfrozen) or from signals
(including ones that could come via timer interrupts, etc).  But this
doesn't matter as long as they quickly block again while "freezing" is
in effect.  Some minor adjustments to the signal.c code make sure that
try_to_freeze() very shortly follows all wakeups from both kinds of
stop.  This lets the freezer code safely leave stopped tasks unmolested.

Changing this fixes the longstanding bug of seeing after resuming from
suspend/hibernate your shell report "[1] Stopped" and the like for all
your jobs stopped by ^Z et al, as if you had freshly fg'd and ^Z'd them.
It also removes from the freezer the arcane special case treatment for
ptrace'd tasks, which relied on intimate knowledge of ptrace internals.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/process.c | 29 ++++++++++++-----------------
 kernel/signal.c        | 16 ++++++++++++++--
 2 files changed, 26 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7c2118f9597f..f1d0b345c9ba 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -75,22 +75,15 @@ void refrigerator(void)
 	__set_current_state(save);
 }
 
-static void fake_signal_wake_up(struct task_struct *p, int resume)
+static void fake_signal_wake_up(struct task_struct *p)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&p->sighand->siglock, flags);
-	signal_wake_up(p, resume);
+	signal_wake_up(p, 0);
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
 }
 
-static void send_fake_signal(struct task_struct *p)
-{
-	if (task_is_stopped(p))
-		force_sig_specific(SIGSTOP, p);
-	fake_signal_wake_up(p, task_is_stopped(p));
-}
-
 static int has_mm(struct task_struct *p)
 {
 	return (p->mm && !(p->flags & PF_BORROWED_MM));
@@ -121,7 +114,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only)
 	if (freezing(p)) {
 		if (has_mm(p)) {
 			if (!signal_pending(p))
-				fake_signal_wake_up(p, 0);
+				fake_signal_wake_up(p);
 		} else {
 			if (with_mm_only)
 				ret = 0;
@@ -135,7 +128,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only)
 		} else {
 			if (has_mm(p)) {
 				set_freeze_flag(p);
-				send_fake_signal(p);
+				fake_signal_wake_up(p);
 			} else {
 				if (with_mm_only) {
 					ret = 0;
@@ -182,15 +175,17 @@ static int try_to_freeze_tasks(int freeze_user_space)
 			if (frozen(p) || !freezeable(p))
 				continue;
 
-			if (task_is_traced(p) && frozen(p->parent)) {
-				cancel_freezing(p);
-				continue;
-			}
-
 			if (!freeze_task(p, freeze_user_space))
 				continue;
 
-			if (!freezer_should_skip(p))
+			/*
+			 * Now that we've done set_freeze_flag, don't
+			 * perturb a task in TASK_STOPPED or TASK_TRACED.
+			 * It is "frozen enough".  If the task does wake
+			 * up, it will immediately call try_to_freeze.
+			 */
+			if (!task_is_stopped_or_traced(p) &&
+			    !freezer_should_skip(p))
 				todo++;
 		} while_each_thread(g, p);
 		read_unlock(&tasklist_lock);
diff --git a/kernel/signal.c b/kernel/signal.c
index 84917fe507f7..6af1210092c3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1623,7 +1623,6 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
 	/* Let the debugger run.  */
 	__set_current_state(TASK_TRACED);
 	spin_unlock_irq(&current->sighand->siglock);
-	try_to_freeze();
 	read_lock(&tasklist_lock);
 	if (!unlikely(killed) && may_ptrace_stop()) {
 		do_notify_parent_cldstop(current, CLD_TRAPPED);
@@ -1640,6 +1639,13 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
 		read_unlock(&tasklist_lock);
 	}
 
+	/*
+	 * While in TASK_TRACED, we were considered "frozen enough".
+	 * Now that we woke up, it's crucial if we're supposed to be
+	 * frozen that we freeze now before running anything substantial.
+	 */
+	try_to_freeze();
+
 	/*
 	 * We are back.  Now reacquire the siglock before touching
 	 * last_siginfo, so that we are sure to have synchronized with
@@ -1757,9 +1763,15 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
 	sigset_t *mask = &current->blocked;
 	int signr = 0;
 
+relock:
+	/*
+	 * We'll jump back here after any time we were stopped in TASK_STOPPED.
+	 * While in TASK_STOPPED, we were considered "frozen enough".
+	 * Now that we woke up, it's crucial if we're supposed to be
+	 * frozen that we freeze now before running anything substantial.
+	 */
 	try_to_freeze();
 
-relock:
 	spin_lock_irq(&current->sighand->siglock);
 	for (;;) {
 		struct k_sigaction *ka;
-- 
cgit v1.2.3


From 62fb185130e4d420f71a30ff59d8b16b74ef5d2b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 Feb 2008 17:34:02 +0100
Subject: sched: revert load_balance_monitor() changes

The following commits cause a number of regressions:

  commit 58e2d4ca581167c2a079f4ee02be2f0bc52e8729
  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  Date:   Fri Jan 25 21:08:00 2008 +0100
  sched: group scheduling, change how cpu load is calculated

  commit 6b2d7700266b9402e12824e11e0099ae6a4a6a79
  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  Date:   Fri Jan 25 21:08:00 2008 +0100
  sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups

Namely:
 - very frequent wakeups on SMP, reported by PowerTop users.
 - cacheline trashing on (large) SMP
 - some latencies larger than 500ms

While there is a mergeable patch to fix the latter, the former issues
are not fixable in a manner suitable for .25 (we're at -rc3 now).

Hence we revert them and try again in v2.6.26.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Tested-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |   4 -
 kernel/sched.c        | 283 +++++++-------------------------------------------
 kernel/sched_fair.c   | 115 +++++++-------------
 kernel/sched_rt.c     |   4 -
 kernel/sysctl.c       |  18 ----
 5 files changed, 70 insertions(+), 354 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c9621f8bf87..9ae4030067a9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1542,10 +1542,6 @@ extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
-#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
-extern unsigned int sysctl_sched_min_bal_int_shares;
-extern unsigned int sysctl_sched_max_bal_int_shares;
-#endif
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length,
diff --git a/kernel/sched.c b/kernel/sched.c
index f06950c8a6ce..dcd553cc4ee8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -174,41 +174,6 @@ struct task_group {
 	struct sched_entity **se;
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
-
-	/*
-	 * shares assigned to a task group governs how much of cpu bandwidth
-	 * is allocated to the group. The more shares a group has, the more is
-	 * the cpu bandwidth allocated to it.
-	 *
-	 * For ex, lets say that there are three task groups, A, B and C which
-	 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
-	 * cpu bandwidth allocated by the scheduler to task groups A, B and C
-	 * should be:
-	 *
-	 *	Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
-	 *	Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
-	 *	Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
-	 *
-	 * The weight assigned to a task group's schedulable entities on every
-	 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
-	 * group's shares. For ex: lets say that task group A has been
-	 * assigned shares of 1000 and there are two CPUs in a system. Then,
-	 *
-	 *  tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
-	 *
-	 * Note: It's not necessary that each of a task's group schedulable
-	 *	 entity have the same weight on all CPUs. If the group
-	 *	 has 2 of its tasks on CPU0 and 1 task on CPU1, then a
-	 *	 better distribution of weight could be:
-	 *
-	 *	tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
-	 *	tg_A->se[1]->load.weight = 1/2 * 2000 =  667
-	 *
-	 * rebalance_shares() is responsible for distributing the shares of a
-	 * task groups like this among the group's schedulable entities across
-	 * cpus.
-	 *
-	 */
 	unsigned long shares;
 #endif
 
@@ -250,22 +215,12 @@ static DEFINE_SPINLOCK(task_group_lock);
 static DEFINE_MUTEX(doms_cur_mutex);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-/* kernel thread that runs rebalance_shares() periodically */
-static struct task_struct *lb_monitor_task;
-static int load_balance_monitor(void *unused);
-#endif
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares);
-
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
 #else
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
 #endif
 
-#define MIN_GROUP_SHARES	2
-
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 #endif
 
@@ -1245,16 +1200,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #endif
 
-static inline void inc_cpu_load(struct rq *rq, unsigned long load)
-{
-	update_load_add(&rq->load, load);
-}
-
-static inline void dec_cpu_load(struct rq *rq, unsigned long load)
-{
-	update_load_sub(&rq->load, load);
-}
-
 #ifdef CONFIG_SMP
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
@@ -1272,14 +1217,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #define sched_class_highest (&rt_sched_class)
 
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running++;
+	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running--;
+	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1371,7 +1328,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(rq);
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -1383,7 +1340,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(rq);
+	dec_nr_running(p, rq);
 }
 
 /**
@@ -2023,7 +1980,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(rq);
+		inc_nr_running(p, rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -4362,8 +4319,10 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_task(rq, p, 0);
+		dec_load(rq, p);
+	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4373,6 +4332,7 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
+		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -7087,21 +7047,6 @@ void __init sched_init_smp(void)
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
 	sched_init_granularity();
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	if (nr_cpu_ids == 1)
-		return;
-
-	lb_monitor_task = kthread_create(load_balance_monitor, NULL,
-					 "group_balance");
-	if (!IS_ERR(lb_monitor_task)) {
-		lb_monitor_task->flags |= PF_NOFREEZE;
-		wake_up_process(lb_monitor_task);
-	} else {
-		printk(KERN_ERR "Could not create load balance monitor thread"
-			"(error = %ld) \n", PTR_ERR(lb_monitor_task));
-	}
-#endif
 }
 #else
 void __init sched_init_smp(void)
@@ -7424,157 +7369,6 @@ void set_curr_task(int cpu, struct task_struct *p)
 
 #ifdef CONFIG_GROUP_SCHED
 
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-/*
- * distribute shares of all task groups among their schedulable entities,
- * to reflect load distribution across cpus.
- */
-static int rebalance_shares(struct sched_domain *sd, int this_cpu)
-{
-	struct cfs_rq *cfs_rq;
-	struct rq *rq = cpu_rq(this_cpu);
-	cpumask_t sdspan = sd->span;
-	int balanced = 1;
-
-	/* Walk thr' all the task groups that we have */
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		int i;
-		unsigned long total_load = 0, total_shares;
-		struct task_group *tg = cfs_rq->tg;
-
-		/* Gather total task load of this group across cpus */
-		for_each_cpu_mask(i, sdspan)
-			total_load += tg->cfs_rq[i]->load.weight;
-
-		/* Nothing to do if this group has no load */
-		if (!total_load)
-			continue;
-
-		/*
-		 * tg->shares represents the number of cpu shares the task group
-		 * is eligible to hold on a single cpu. On N cpus, it is
-		 * eligible to hold (N * tg->shares) number of cpu shares.
-		 */
-		total_shares = tg->shares * cpus_weight(sdspan);
-
-		/*
-		 * redistribute total_shares across cpus as per the task load
-		 * distribution.
-		 */
-		for_each_cpu_mask(i, sdspan) {
-			unsigned long local_load, local_shares;
-
-			local_load = tg->cfs_rq[i]->load.weight;
-			local_shares = (local_load * total_shares) / total_load;
-			if (!local_shares)
-				local_shares = MIN_GROUP_SHARES;
-			if (local_shares == tg->se[i]->load.weight)
-				continue;
-
-			spin_lock_irq(&cpu_rq(i)->lock);
-			set_se_shares(tg->se[i], local_shares);
-			spin_unlock_irq(&cpu_rq(i)->lock);
-			balanced = 0;
-		}
-	}
-
-	return balanced;
-}
-
-/*
- * How frequently should we rebalance_shares() across cpus?
- *
- * The more frequently we rebalance shares, the more accurate is the fairness
- * of cpu bandwidth distribution between task groups. However higher frequency
- * also implies increased scheduling overhead.
- *
- * sysctl_sched_min_bal_int_shares represents the minimum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * sysctl_sched_max_bal_int_shares represents the maximum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * These settings allows for the appropriate trade-off between accuracy of
- * fairness and the associated overhead.
- *
- */
-
-/* default: 8ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
-
-/* default: 128ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
-
-/* kernel thread that runs rebalance_shares() periodically */
-static int load_balance_monitor(void *unused)
-{
-	unsigned int timeout = sysctl_sched_min_bal_int_shares;
-	struct sched_param schedparm;
-	int ret;
-
-	/*
-	 * We don't want this thread's execution to be limited by the shares
-	 * assigned to default group (init_task_group). Hence make it run
-	 * as a SCHED_RR RT task at the lowest priority.
-	 */
-	schedparm.sched_priority = 1;
-	ret = sched_setscheduler(current, SCHED_RR, &schedparm);
-	if (ret)
-		printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
-				" monitor thread (error = %d) \n", ret);
-
-	while (!kthread_should_stop()) {
-		int i, cpu, balanced = 1;
-
-		/* Prevent cpus going down or coming up */
-		get_online_cpus();
-		/* lockout changes to doms_cur[] array */
-		lock_doms_cur();
-		/*
-		 * Enter a rcu read-side critical section to safely walk rq->sd
-		 * chain on various cpus and to walk task group list
-		 * (rq->leaf_cfs_rq_list) in rebalance_shares().
-		 */
-		rcu_read_lock();
-
-		for (i = 0; i < ndoms_cur; i++) {
-			cpumask_t cpumap = doms_cur[i];
-			struct sched_domain *sd = NULL, *sd_prev = NULL;
-
-			cpu = first_cpu(cpumap);
-
-			/* Find the highest domain at which to balance shares */
-			for_each_domain(cpu, sd) {
-				if (!(sd->flags & SD_LOAD_BALANCE))
-					continue;
-				sd_prev = sd;
-			}
-
-			sd = sd_prev;
-			/* sd == NULL? No load balance reqd in this domain */
-			if (!sd)
-				continue;
-
-			balanced &= rebalance_shares(sd, cpu);
-		}
-
-		rcu_read_unlock();
-
-		unlock_doms_cur();
-		put_online_cpus();
-
-		if (!balanced)
-			timeout = sysctl_sched_min_bal_int_shares;
-		else if (timeout < sysctl_sched_max_bal_int_shares)
-			timeout *= 2;
-
-		msleep_interruptible(timeout);
-	}
-
-	return 0;
-}
-#endif	/* CONFIG_SMP */
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void free_fair_sched_group(struct task_group *tg)
 {
@@ -7841,29 +7635,25 @@ void sched_move_task(struct task_struct *tsk)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/* rq->lock to be locked by caller */
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
 	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	if (!shares)
-		shares = MIN_GROUP_SHARES;
+	spin_lock_irq(&rq->lock);
 
 	on_rq = se->on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
-		dec_cpu_load(rq, se->load.weight);
-	}
 
 	se->load.weight = shares;
 	se->load.inv_weight = div64_64((1ULL<<32), shares);
 
-	if (on_rq) {
+	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
-		inc_cpu_load(rq, se->load.weight);
-	}
+
+	spin_unlock_irq(&rq->lock);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -7873,18 +7663,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	int i;
 	unsigned long flags;
 
+	/*
+	 * A weight of 0 or 1 can cause arithmetics problems.
+	 * (The default weight is 1024 - so there's no practical
+	 *  limitation from this.)
+	 */
+	if (shares < 2)
+		shares = 2;
+
 	mutex_lock(&shares_mutex);
 	if (tg->shares == shares)
 		goto done;
 
-	if (shares < MIN_GROUP_SHARES)
-		shares = MIN_GROUP_SHARES;
-
-	/*
-	 * Prevent any load balance activity (rebalance_shares,
-	 * load_balance_fair) from referring to this group first,
-	 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
-	 */
 	spin_lock_irqsave(&task_group_lock, flags);
 	for_each_possible_cpu(i)
 		unregister_fair_sched_group(tg, i);
@@ -7898,11 +7688,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i) {
-		spin_lock_irq(&cpu_rq(i)->lock);
+	for_each_possible_cpu(i)
 		set_se_shares(tg->se[i], shares);
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c8e6492c5925..3df4d46994ca 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -727,8 +727,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
-#define GROUP_IMBALANCE_PCT	20
-
 #else	/* CONFIG_FAIR_GROUP_SCHED */
 
 #define for_each_sched_entity(se) \
@@ -819,26 +817,15 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p)
 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 {
 	struct cfs_rq *cfs_rq;
-	struct sched_entity *se = &p->se,
-			    *topse = NULL;	/* Highest schedulable entity */
-	int incload = 1;
+	struct sched_entity *se = &p->se;
 
 	for_each_sched_entity(se) {
-		topse = se;
-		if (se->on_rq) {
-			incload = 0;
+		if (se->on_rq)
 			break;
-		}
 		cfs_rq = cfs_rq_of(se);
 		enqueue_entity(cfs_rq, se, wakeup);
 		wakeup = 1;
 	}
-	/* Increment cpu load if we just enqueued the first task of a group on
-	 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
-	 * at the highest grouping level.
-	 */
-	if (incload)
-		inc_cpu_load(rq, topse->load.weight);
 
 	hrtick_start_fair(rq, rq->curr);
 }
@@ -851,28 +838,16 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 {
 	struct cfs_rq *cfs_rq;
-	struct sched_entity *se = &p->se,
-			    *topse = NULL; 	/* Highest schedulable entity */
-	int decload = 1;
+	struct sched_entity *se = &p->se;
 
 	for_each_sched_entity(se) {
-		topse = se;
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, sleep);
 		/* Don't dequeue parent if it has other entities besides us */
-		if (cfs_rq->load.weight) {
-			if (parent_entity(se))
-				decload = 0;
+		if (cfs_rq->load.weight)
 			break;
-		}
 		sleep = 1;
 	}
-	/* Decrement cpu load if we just dequeued the last task of a group on
-	 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
-	 * at the highest grouping level.
-	 */
-	if (decload)
-		dec_cpu_load(rq, topse->load.weight);
 
 	hrtick_start_fair(rq, rq->curr);
 }
@@ -1186,6 +1161,25 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *curr;
+	struct task_struct *p;
+
+	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
+		return MAX_PRIO;
+
+	curr = cfs_rq->curr;
+	if (!curr)
+		curr = __pick_next_entity(cfs_rq);
+
+	p = task_of(curr);
+
+	return p->prio;
+}
+#endif
+
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
@@ -1195,45 +1189,28 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
 	struct rq_iterator cfs_rq_iterator;
-	unsigned long load_moved;
 
 	cfs_rq_iterator.start = load_balance_start_fair;
 	cfs_rq_iterator.next = load_balance_next_fair;
 
 	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
 #ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
-		unsigned long maxload, task_load, group_weight;
-		unsigned long thisload, per_task_load;
-		struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
-
-		task_load = busy_cfs_rq->load.weight;
-		group_weight = se->load.weight;
+		struct cfs_rq *this_cfs_rq;
+		long imbalance;
+		unsigned long maxload;
 
-		/*
-		 * 'group_weight' is contributed by tasks of total weight
-		 * 'task_load'. To move 'rem_load_move' worth of weight only,
-		 * we need to move a maximum task load of:
-		 *
-		 * 	maxload = (remload / group_weight) * task_load;
-		 */
-		maxload = (rem_load_move * task_load) / group_weight;
+		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
 
-		if (!maxload || !task_load)
+		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+		if (imbalance <= 0)
 			continue;
 
-		per_task_load = task_load / busy_cfs_rq->nr_running;
-		/*
-		 * balance_tasks will try to forcibly move atleast one task if
-		 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
-		 * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load.
-		 */
-		 if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
-			continue;
+		/* Don't pull more than imbalance/2 */
+		imbalance /= 2;
+		maxload = min(rem_load_move, imbalance);
 
-		/* Disable priority-based load balance */
-		*this_best_prio = 0;
-		thisload = this_cfs_rq->load.weight;
+		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
 #else
 # define maxload rem_load_move
 #endif
@@ -1242,33 +1219,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		 * load_balance_[start|next]_fair iterators
 		 */
 		cfs_rq_iterator.arg = busy_cfs_rq;
-		load_moved = balance_tasks(this_rq, this_cpu, busiest,
+		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
 					       maxload, sd, idle, all_pinned,
 					       this_best_prio,
 					       &cfs_rq_iterator);
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		/*
-		 * load_moved holds the task load that was moved. The
-		 * effective (group) weight moved would be:
-		 * 	load_moved_eff = load_moved/task_load * group_weight;
-		 */
-		load_moved = (group_weight * load_moved) / task_load;
-
-		/* Adjust shares on both cpus to reflect load_moved */
-		group_weight -= load_moved;
-		set_se_shares(se, group_weight);
-
-		se = busy_cfs_rq->tg->se[this_cpu];
-		if (!thisload)
-			group_weight = load_moved;
-		else
-			group_weight = se->load.weight + load_moved;
-		set_se_shares(se, group_weight);
-#endif
-
-		rem_load_move -= load_moved;
-
 		if (rem_load_move <= 0)
 			break;
 	}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f54792b175b2..76e828517541 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -393,8 +393,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 	 */
 	for_each_sched_rt_entity(rt_se)
 		enqueue_rt_entity(rt_se);
-
-	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -414,8 +412,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 		if (rt_rq && rt_rq->rt_nr_running)
 			enqueue_rt_entity(rt_se);
 	}
-
-	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8b7e95411795..b2a2d6889bab 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -311,24 +311,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
-	{
-		.ctl_name       = CTL_UNNUMBERED,
-		.procname       = "sched_min_bal_int_shares",
-		.data           = &sysctl_sched_min_bal_int_shares,
-		.maxlen         = sizeof(unsigned int),
-		.mode           = 0644,
-		.proc_handler   = &proc_dointvec,
-	},
-	{
-		.ctl_name       = CTL_UNNUMBERED,
-		.procname       = "sched_max_bal_int_shares",
-		.data           = &sysctl_sched_max_bal_int_shares,
-		.maxlen         = sizeof(unsigned int),
-		.mode           = 0644,
-		.proc_handler   = &proc_dointvec,
-	},
-#endif
 #endif
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-- 
cgit v1.2.3


From b6abdb0e6ca5c2c0a7caa4131da2af0750927e72 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 4 Mar 2008 14:28:19 -0800
Subject: cgroup: fix default notify_on_release setting

The documentation says the default value of notify_on_release of a child
cgroup is inherited from its parent, which is reasonable, but the
implementation just sets the flag disabled.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d8abe996e009..e9c2fb01e89b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2232,7 +2232,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 	mutex_lock(&cgroup_mutex);
 
-	cgrp->flags = 0;
 	INIT_LIST_HEAD(&cgrp->sibling);
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->css_sets);
@@ -2242,6 +2241,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	cgrp->root = parent->root;
 	cgrp->top_cgroup = parent->top_cgroup;
 
+	if (notify_on_release(parent))
+		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+
 	for_each_subsys(root, ss) {
 		struct cgroup_subsys_state *css = ss->create(ss, cgrp);
 		if (IS_ERR(css)) {
-- 
cgit v1.2.3


From fb78922ce9c71b24c4af1ffc9c3d60c57ac471fb Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Tue, 4 Mar 2008 14:28:24 -0800
Subject: Memory Resource Controller use strstrip while parsing arguments

The memory controller has a requirement that while writing values, we need
to use echo -n. This patch fixes the problem and makes the UI more consistent.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/controllers/memory.txt | 6 +++---
 kernel/res_counter.c                 | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt
index 6015347b41e2..fba6af45225c 100644
--- a/Documentation/controllers/memory.txt
+++ b/Documentation/controllers/memory.txt
@@ -164,7 +164,7 @@ c. Enable CONFIG_CGROUP_MEM_CONT
 
 Since now we're in the 0 cgroup,
 We can alter the memory limit:
-# echo -n 4M > /cgroups/0/memory.limit_in_bytes
+# echo 4M > /cgroups/0/memory.limit_in_bytes
 
 NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo,
 mega or gigabytes.
@@ -185,7 +185,7 @@ number of factors, such as rounding up to page boundaries or the total
 availability of memory on the system.  The user is required to re-read
 this file after a write to guarantee the value committed by the kernel.
 
-# echo -n 1 > memory.limit_in_bytes
+# echo 1 > memory.limit_in_bytes
 # cat memory.limit_in_bytes
 4096
 
@@ -197,7 +197,7 @@ caches, RSS and Active pages/Inactive pages are shown.
 
 The memory.force_empty gives an interface to drop *all* charges by force.
 
-# echo -n 1 > memory.force_empty
+# echo 1 > memory.force_empty
 
 will drop all charges in cgroup. Currently, this is maintained for test.
 
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 16cbec2d5d60..efbfc0fc232f 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -113,6 +113,7 @@ ssize_t res_counter_write(struct res_counter *counter, int member,
 
 	ret = -EINVAL;
 
+	strstrip(buf);
 	if (write_strategy) {
 		if (write_strategy(buf, &tmp)) {
 			goto out_free;
-- 
cgit v1.2.3


From 9edddaa200df18e08fe0cf21036e8ae467b1363c Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Tue, 4 Mar 2008 14:28:37 -0800
Subject: Kprobes: indicate kretprobe support in Kconfig

Add CONFIG_HAVE_KRETPROBES to the arch/<arch>/Kconfig file for relevant
architectures with kprobes support.  This facilitates easy handling of
in-kernel modules (like samples/kprobes/kretprobe_example.c) that depend on
kretprobes being present in the kernel.

Thanks to Sam Ravnborg for helping make the patch more lean.

Per Mathieu's suggestion, added CONFIG_KRETPROBES and fixed up dependencies.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig                  | 7 +++++++
 arch/arm/Kconfig              | 1 +
 arch/ia64/Kconfig             | 1 +
 arch/powerpc/Kconfig          | 1 +
 arch/s390/Kconfig             | 1 +
 arch/sparc64/Kconfig          | 1 +
 arch/x86/Kconfig              | 1 +
 include/asm-arm/kprobes.h     | 1 -
 include/asm-ia64/kprobes.h    | 1 -
 include/asm-powerpc/kprobes.h | 1 -
 include/asm-s390/kprobes.h    | 1 -
 include/asm-sparc64/kprobes.h | 2 --
 include/asm-x86/kprobes.h     | 1 -
 include/linux/kprobes.h       | 6 +++---
 kernel/kprobes.c              | 9 +++------
 15 files changed, 19 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index 3d72dc3fc8f5..694c9af520bb 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -27,5 +27,12 @@ config KPROBES
 	  for kernel debugging, non-intrusive instrumentation and testing.
 	  If in doubt, say "N".
 
+config KRETPROBES
+	def_bool y
+	depends on KPROBES && HAVE_KRETPROBES
+
 config HAVE_KPROBES
 	def_bool n
+
+config HAVE_KRETPROBES
+	def_bool n
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 16b82e1272b0..955fc53c1c01 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -12,6 +12,7 @@ config ARM
 	select SYS_SUPPORTS_APM_EMULATION
 	select HAVE_OPROFILE
 	select HAVE_KPROBES if (!XIP_KERNEL)
+	select HAVE_KRETPROBES if (HAVE_KPROBES)
 	help
 	  The ARM series is a line of low-power-consumption RISC chip designs
 	  licensed by ARM Ltd and targeted at embedded applications and
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index dff9edfc7465..56762d3c2a6a 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -18,6 +18,7 @@ config IA64
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
+	select HAVE_KRETPROBES
 	default y
 	help
 	  The Itanium Processor Family is Intel's 64-bit successor to
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5b8d8382b762..1189d8d6170d 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -90,6 +90,7 @@ config PPC
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
+	select HAVE_KRETPROBES
 
 config EARLY_PRINTK
 	bool
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b21444b681b6..9892827b6176 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -61,6 +61,7 @@ config S390
 	def_bool y
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
+	select HAVE_KRETPROBES
 
 source "init/Kconfig"
 
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index 3af378ddb6ae..463d1be32c98 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -10,6 +10,7 @@ config SPARC
 	default y
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
+	select HAVE_KRETPROBES
 
 config SPARC64
 	bool
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 53800b80a204..f41c9538ca30 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,6 +21,7 @@ config X86
 	select HAVE_IDE
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
+	select HAVE_KRETPROBES
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 
 
diff --git a/include/asm-arm/kprobes.h b/include/asm-arm/kprobes.h
index 4e7bd32288ae..c042194d3ab5 100644
--- a/include/asm-arm/kprobes.h
+++ b/include/asm-arm/kprobes.h
@@ -20,7 +20,6 @@
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
 
-#define ARCH_SUPPORTS_KRETPROBES
 #define __ARCH_WANT_KPROBES_INSN_SLOT
 #define MAX_INSN_SIZE			2
 #define MAX_STACK_SIZE			64	/* 32 would probably be OK */
diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
index a93ce9ef07ff..adbaba14eb0a 100644
--- a/include/asm-ia64/kprobes.h
+++ b/include/asm-ia64/kprobes.h
@@ -82,7 +82,6 @@ struct kprobe_ctlblk {
 	struct prev_kprobe prev_kprobe[ARCH_PREV_KPROBE_SZ];
 };
 
-#define ARCH_SUPPORTS_KRETPROBES
 #define kretprobe_blacklist_size 0
 
 #define SLOT0_OPCODE_SHIFT	(37)
diff --git a/include/asm-powerpc/kprobes.h b/include/asm-powerpc/kprobes.h
index afabad230dbb..d0e7701fa1f6 100644
--- a/include/asm-powerpc/kprobes.h
+++ b/include/asm-powerpc/kprobes.h
@@ -80,7 +80,6 @@ typedef unsigned int kprobe_opcode_t;
 #define is_trap(instr)	(IS_TW(instr) || IS_TWI(instr))
 #endif
 
-#define ARCH_SUPPORTS_KRETPROBES
 #define flush_insn_slot(p)	do { } while (0)
 #define kretprobe_blacklist_size 0
 
diff --git a/include/asm-s390/kprobes.h b/include/asm-s390/kprobes.h
index 948db3d0d05c..330f68caffe4 100644
--- a/include/asm-s390/kprobes.h
+++ b/include/asm-s390/kprobes.h
@@ -46,7 +46,6 @@ typedef u16 kprobe_opcode_t;
 	? (MAX_STACK_SIZE) \
 	: (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR)))
 
-#define ARCH_SUPPORTS_KRETPROBES
 #define kretprobe_blacklist_size 0
 
 #define KPROBE_SWAP_INST	0x10
diff --git a/include/asm-sparc64/kprobes.h b/include/asm-sparc64/kprobes.h
index 7237dd87663e..5879d71afdaa 100644
--- a/include/asm-sparc64/kprobes.h
+++ b/include/asm-sparc64/kprobes.h
@@ -14,8 +14,6 @@ typedef u32 kprobe_opcode_t;
 
 #define arch_remove_kprobe(p)	do {} while (0)
 
-#define ARCH_SUPPORTS_KRETPROBES
-
 #define flush_insn_slot(p)		\
 do { 	flushi(&(p)->ainsn.insn[0]);	\
 	flushi(&(p)->ainsn.insn[1]);	\
diff --git a/include/asm-x86/kprobes.h b/include/asm-x86/kprobes.h
index 143476a3cb52..61ad7b5d142e 100644
--- a/include/asm-x86/kprobes.h
+++ b/include/asm-x86/kprobes.h
@@ -42,7 +42,6 @@ typedef u8 kprobe_opcode_t;
 	: (((unsigned long)current_thread_info()) + THREAD_SIZE \
 	   - (unsigned long)(ADDR)))
 
-#define ARCH_SUPPORTS_KRETPROBES
 #define flush_insn_slot(p)	do { } while (0)
 
 extern const int kretprobe_blacklist_size;
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 4a6ce82ba039..0f28486f6360 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -125,11 +125,11 @@ struct jprobe {
 DECLARE_PER_CPU(struct kprobe *, current_kprobe);
 DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
-#ifdef ARCH_SUPPORTS_KRETPROBES
+#ifdef CONFIG_KRETPROBES
 extern void arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				   struct pt_regs *regs);
 extern int arch_trampoline_kprobe(struct kprobe *p);
-#else /* ARCH_SUPPORTS_KRETPROBES */
+#else /* CONFIG_KRETPROBES */
 static inline void arch_prepare_kretprobe(struct kretprobe *rp,
 					struct pt_regs *regs)
 {
@@ -138,7 +138,7 @@ static inline int arch_trampoline_kprobe(struct kprobe *p)
 {
 	return 0;
 }
-#endif /* ARCH_SUPPORTS_KRETPROBES */
+#endif /* CONFIG_KRETPROBES */
 /*
  * Function-return probe -
  * Note:
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7a86e6432338..e6a61dcbc578 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -678,8 +678,7 @@ void __kprobes unregister_jprobe(struct jprobe *jp)
 	unregister_kprobe(&jp->kp);
 }
 
-#ifdef ARCH_SUPPORTS_KRETPROBES
-
+#ifdef CONFIG_KRETPROBES
 /*
  * This kprobe pre_handler is registered with every kretprobe. When probe
  * hits it will set up the return probe.
@@ -769,8 +768,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
 	return ret;
 }
 
-#else /* ARCH_SUPPORTS_KRETPROBES */
-
+#else /* CONFIG_KRETPROBES */
 int __kprobes register_kretprobe(struct kretprobe *rp)
 {
 	return -ENOSYS;
@@ -781,8 +779,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 {
 	return 0;
 }
-
-#endif /* ARCH_SUPPORTS_KRETPROBES */
+#endif /* CONFIG_KRETPROBES */
 
 void __kprobes unregister_kretprobe(struct kretprobe *rp)
 {
-- 
cgit v1.2.3


From 544adb41077a10d299a1094f12ec55a5843a9bdb Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jesper.juhl@gmail.com>
Date: Tue, 4 Mar 2008 14:29:00 -0800
Subject: markers: don't risk NULL deref in marker

get_marker() may return NULL, so test for it.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/marker.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/marker.c b/kernel/marker.c
index 50effc01d9a2..48a4ea5afffd 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -698,14 +698,12 @@ int marker_probe_unregister(const char *name,
 {
 	struct marker_entry *entry;
 	struct marker_probe_closure *old;
-	int ret = 0;
+	int ret = -ENOENT;
 
 	mutex_lock(&markers_mutex);
 	entry = get_marker(name);
-	if (!entry) {
-		ret = -ENOENT;
+	if (!entry)
 		goto end;
-	}
 	if (entry->rcu_pending)
 		rcu_barrier();
 	old = marker_entry_remove_probe(entry, probe, probe_private);
@@ -713,12 +711,15 @@ int marker_probe_unregister(const char *name,
 	marker_update_probes();		/* may update entry */
 	mutex_lock(&markers_mutex);
 	entry = get_marker(name);
+	if (!entry)
+		goto end;
 	entry->oldptr = old;
 	entry->rcu_pending = 1;
 	/* write rcu_pending before calling the RCU callback */
 	smp_wmb();
 	call_rcu(&entry->rcu, free_old_closure);
 	remove_marker(name);	/* Ignore busy error message */
+	ret = 0;
 end:
 	mutex_unlock(&markers_mutex);
 	return ret;
-- 
cgit v1.2.3


From b2a5cd6938879b5bcfef0a73c28fea84c49519c2 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Tue, 4 Mar 2008 14:29:44 -0800
Subject: kprobes: fix a null pointer bug in register_kretprobe()

Fix a bug in regiseter_kretprobe() which does not check rp->kp.symbol_name ==
NULL before calling kprobe_lookup_name.

For maintainability, this introduces kprobe_addr helper function which
resolves addr field.  It is used by register_kprobe and register_kretprobe.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e6a61dcbc578..fcfb580c3afc 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -498,27 +498,36 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
 	return 0;
 }
 
+/*
+ * If we have a symbol_name argument, look it up and add the offset field
+ * to it. This way, we can specify a relative address to a symbol.
+ */
+static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
+{
+	kprobe_opcode_t *addr = p->addr;
+	if (p->symbol_name) {
+		if (addr)
+			return NULL;
+		kprobe_lookup_name(p->symbol_name, addr);
+	}
+
+	if (!addr)
+		return NULL;
+	return (kprobe_opcode_t *)(((char *)addr) + p->offset);
+}
+
 static int __kprobes __register_kprobe(struct kprobe *p,
 	unsigned long called_from)
 {
 	int ret = 0;
 	struct kprobe *old_p;
 	struct module *probed_mod;
+	kprobe_opcode_t *addr;
 
-	/*
-	 * If we have a symbol_name argument look it up,
-	 * and add it to the address.  That way the addr
-	 * field can either be global or relative to a symbol.
-	 */
-	if (p->symbol_name) {
-		if (p->addr)
-			return -EINVAL;
-		kprobe_lookup_name(p->symbol_name, p->addr);
-	}
-
-	if (!p->addr)
+	addr = kprobe_addr(p);
+	if (!addr)
 		return -EINVAL;
-	p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset);
+	p->addr = addr;
 
 	if (!kernel_text_address((unsigned long) p->addr) ||
 	    in_kprobes_functions((unsigned long) p->addr))
@@ -721,12 +730,12 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
 	int ret = 0;
 	struct kretprobe_instance *inst;
 	int i;
-	void *addr = rp->kp.addr;
+	void *addr;
 
 	if (kretprobe_blacklist_size) {
-		if (addr == NULL)
-			kprobe_lookup_name(rp->kp.symbol_name, addr);
-		addr += rp->kp.offset;
+		addr = kprobe_addr(&rp->kp);
+		if (!addr)
+			return -EINVAL;
 
 		for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
 			if (kretprobe_blacklist[i].addr == addr)
-- 
cgit v1.2.3


From 9b37ccfc637be27d9a652fcedc35e6e782c3aa78 Mon Sep 17 00:00:00 2001
From: Pavel Roskin <proski@gnu.org>
Date: Thu, 28 Feb 2008 17:11:02 -0500
Subject: module: allow ndiswrapper to use GPL-only symbols

A change after 2.6.24 broke ndiswrapper by accidentally removing its
access to GPL-only symbols.  Revert that change and add comments about
the reasons why ndiswrapper and driverloader are treated in a special
way.

Signed-off-by: Pavel Roskin <proski@gnu.org>
Acked-by: Greg KH <gregkh@suse.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jon Masters <jonathan@jonmasters.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 901cd6ac2f11..be4807fb90e4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1933,8 +1933,15 @@ static struct module *load_module(void __user *umod,
 	/* Set up license info based on the info section */
 	set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
 
+	/*
+	 * ndiswrapper is under GPL by itself, but loads proprietary modules.
+	 * Don't use add_taint_module(), as it would prevent ndiswrapper from
+	 * using GPL-only symbols it needs.
+	 */
 	if (strcmp(mod->name, "ndiswrapper") == 0)
-		add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
+		add_taint(TAINT_PROPRIETARY_MODULE);
+
+	/* driverloader was caught wrongly pretending to be under GPL */
 	if (strcmp(mod->name, "driverloader") == 0)
 		add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
 
-- 
cgit v1.2.3


From 41f7f60d31e5e1dfc9a92957b3e14e08a2f04964 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Tue, 4 Mar 2008 23:32:38 -0800
Subject: cpusets: fix obsolete comment

mm migration is no longer done in cpuset_update_task_memory_state() so it
can no longer take current->mm->mmap_sem, so fix the obsolete comment.

[ This changed in commit 04c19fa6f16047abff2288ddbc1f0798ede5a849
  ("cpuset: migrate all tasks in cpuset at once") when the mm migration
  was moved from cpuset_update_task_memory_state() to update_nodemask() ]

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e296ed81d4d..a1b61f414228 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -322,8 +322,8 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * Call without callback_mutex or task_lock() held.  May be
  * called with or without cgroup_mutex held.  Thanks in part to
  * 'the_top_cpuset_hack', the task's cpuset pointer will never
- * be NULL.  This routine also might acquire callback_mutex and
- * current->mm->mmap_sem during call.
+ * be NULL.  This routine also might acquire callback_mutex during
+ * call.
  *
  * Reading current->cpuset->mems_generation doesn't need task_lock
  * to guard the current->cpuset derefence, because it is guarded
-- 
cgit v1.2.3


From 810b38179e9e4d4f57b4b733767bb08f8291a965 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 29 Feb 2008 15:21:01 -0500
Subject: sched: retain vruntime

Kei Tokunaga reported an interactivity problem when moving tasks
between control groups.

Tasks would retain their old vruntime when moved between groups, this
can cause funny lags. Re-set the vruntime on group move to fit within
the new tree.

Reported-by: Kei Tokunaga <tokunaga.keiich@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  4 ++++
 kernel/sched.c        |  5 +++++
 kernel/sched_fair.c   | 14 ++++++++++++++
 3 files changed, 23 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9ae4030067a9..11d8e9a74eff 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -899,6 +899,10 @@ struct sched_class {
 			     int running);
 	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
 			     int oldprio, int running);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	void (*moved_group) (struct task_struct *p);
+#endif
 };
 
 struct load_weight {
diff --git a/kernel/sched.c b/kernel/sched.c
index dcd553cc4ee8..0b949c4e73ad 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7625,6 +7625,11 @@ void sched_move_task(struct task_struct *tsk)
 
 	set_task_rq(tsk, task_cpu(tsk));
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (tsk->sched_class->moved_group)
+		tsk->sched_class->moved_group(tsk);
+#endif
+
 	if (on_rq) {
 		if (unlikely(running))
 			tsk->sched_class->set_curr_task(rq);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3df4d46994ca..e2a530515619 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1353,6 +1353,16 @@ static void set_curr_task_fair(struct rq *rq)
 		set_next_entity(cfs_rq_of(se), se);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void moved_group_fair(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+	update_curr(cfs_rq);
+	place_entity(cfs_rq, &p->se, 1);
+}
+#endif
+
 /*
  * All the scheduling class methods:
  */
@@ -1381,6 +1391,10 @@ static const struct sched_class fair_sched_class = {
 
 	.prio_changed		= prio_changed_fair,
 	.switched_to		= switched_to_fair,
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	.moved_group		= moved_group_fair,
+#endif
 };
 
 #ifdef CONFIG_SCHED_DEBUG
-- 
cgit v1.2.3


From 6fa46fa526f2cab9ce21fa5e39501553a40d196d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 5 Mar 2008 10:00:12 -0500
Subject: sched: balance RT task resched only on runqueue

Sripathi Kodi reported a crash in the -rt kernel:

  https://bugzilla.redhat.com/show_bug.cgi?id=435674

this is due to a place that can reschedule a task without holding
the tasks runqueue lock.  This was caused by the RT balancing code
that pulls RT tasks to the current run queue and will reschedule the
current task.

There's a slight chance that the pulling of the RT tasks will release
the current runqueue's lock and retake it (in the double_lock_balance).
During this time that the runqueue is released, the current task can
migrate to another runqueue.

In the prio_changed_rt code, after the pull, if the current task is of
lesser priority than one of the RT tasks pulled, resched_task is called
on the current task. If the current task had migrated in that small
window, resched_task will be called without holding the runqueue lock
for the runqueue that the task is on.

This race condition also exists in the mainline kernel and this patch
adds a check to make sure the task hasn't migrated before calling
resched_task.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Tested-by: Sripathi Kodi <sripathik@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 76e828517541..0a6d2e516420 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1107,9 +1107,11 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
 			pull_rt_task(rq);
 		/*
 		 * If there's a higher priority task waiting to run
-		 * then reschedule.
+		 * then reschedule. Note, the above pull_rt_task
+		 * can release the rq lock and p could migrate.
+		 * Only reschedule if p is still on the same runqueue.
 		 */
-		if (p->prio > rq->rt.highest_prio)
+		if (p->prio > rq->rt.highest_prio && rq->curr == p)
 			resched_task(p);
 #else
 		/* For UP simply resched on drop of prio */
-- 
cgit v1.2.3


From 150d8bede7f85eb00d8f4d628e6b0bae68739e3b Mon Sep 17 00:00:00 2001
From: Pavel Roskin <proski@gnu.org>
Date: Wed, 5 Mar 2008 16:56:37 -0500
Subject: sched: export task_nice

The API is trivial, and so is the implementation.

Signed-off-by: Pavel Roskin <proski@gnu.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0b949c4e73ad..63a469f8853d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4422,7 +4422,7 @@ int task_nice(const struct task_struct *p)
 {
 	return TASK_NICE(p);
 }
-EXPORT_SYMBOL_GPL(task_nice);
+EXPORT_SYMBOL(task_nice);
 
 /**
  * idle_cpu - is a given cpu idle currently?
-- 
cgit v1.2.3


From 1868f958eb56fc41c5985c8732e564a400c5fdf5 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 7 Mar 2008 09:35:06 +0800
Subject: sched: fix the wrong time slice value for SCHED_FIFO tasks

Function sys_sched_rr_get_interval returns wrong time slice value for
SCHED_FIFO tasks. The time slice for SCHED_FIFO tasks should be 0.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 63a469f8853d..5b13e4b0e009 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5100,7 +5100,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
 	time_slice = 0;
 	if (p->policy == SCHED_RR) {
 		time_slice = DEF_TIMESLICE;
-	} else {
+	} else if (p->policy != SCHED_FIFO) {
 		struct sched_entity *se = &p->se;
 		unsigned long flags;
 		struct rq *rq;
-- 
cgit v1.2.3


From 2692a2406b9262bbb101708815be99ec2988e48b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 Feb 2008 12:00:46 +0100
Subject: sched: rt-group: fixup schedulability constraints calculation

it was only possible to configure the rt-group scheduling parameters
beyond the default value in a very small range.

that's because div64_64() has a different calling convention than
do_div() :/

fix a few untidies while we are here; sysctl_sched_rt_period may overflow
due to that multiplication, so cast to u64 first. Also that RUNTIME_INF
juggling makes little sense although its an effective NOP.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5b13e4b0e009..b8ee864c7481 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7726,9 +7726,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 	if (runtime == RUNTIME_INF)
 		return 1ULL << 16;
 
-	runtime *= (1ULL << 16);
-	div64_64(runtime, period);
-	return runtime;
+	return div64_64(runtime << 16, period);
 }
 
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
@@ -7757,18 +7755,16 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 	u64 rt_runtime, rt_period;
 	int err = 0;
 
-	rt_period = sysctl_sched_rt_period * NSEC_PER_USEC;
+	rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
 	rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
 	if (rt_runtime_us == -1)
-		rt_runtime = rt_period;
+		rt_runtime = RUNTIME_INF;
 
 	mutex_lock(&rt_constraints_mutex);
 	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
 		err = -EINVAL;
 		goto unlock;
 	}
-	if (rt_runtime_us == -1)
-		rt_runtime = RUNTIME_INF;
 	tg->rt_runtime = rt_runtime;
  unlock:
 	mutex_unlock(&rt_constraints_mutex);
-- 
cgit v1.2.3


From 521f1a2489c41f8b1181b0a8eb52e1c34284d50b Mon Sep 17 00:00:00 2001
From: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Date: Thu, 28 Feb 2008 15:21:56 +0530
Subject: sched: don't allow rt_runtime_us to be zero for groups having rt
 tasks

This patch checks if we can set the rt_runtime_us to 0. If there is a
realtime task in the group, we don't want to set the rt_runtime_us as 0
or bad things will happen. (that task wont get any CPU time despite
being TASK_RUNNNG)

Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b8ee864c7481..52b98675acb2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7750,6 +7750,17 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 	return total + to_ratio(period, runtime) < global_ratio;
 }
 
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
+{
+	struct task_struct *g, *p;
+	do_each_thread(g, p) {
+		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+			return 1;
+	} while_each_thread(g, p);
+	return 0;
+}
+
 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
 	u64 rt_runtime, rt_period;
@@ -7761,12 +7772,18 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 		rt_runtime = RUNTIME_INF;
 
 	mutex_lock(&rt_constraints_mutex);
+	read_lock(&tasklist_lock);
+	if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) {
+		err = -EBUSY;
+		goto unlock;
+	}
 	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
 		err = -EINVAL;
 		goto unlock;
 	}
 	tg->rt_runtime = rt_runtime;
  unlock:
+	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return err;
-- 
cgit v1.2.3