From 5c60169a01af712b0b1aa1f5db3fcb8776b22d9f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 23 Jun 2004 18:49:33 -0700 Subject: [PATCH] rcu lock update: Add per-cpu batch counter From: Manfred Spraul Below is the one of my patches from my rcu lock update. Jack Steiner tested the first one on a 512p and it resolved the rcu cache line trashing. All were tested on osdl with STP. Step one for reducing cacheline trashing within rcupdate.c: The current code uses the rcu_cpu_mask bitmap both for keeping track of the cpus that haven't gone through a quiescent state and for checking if a cpu should look for quiescent states. The bitmap is frequently changed and the check is done by polling - together this causes cache line trashing. If it's cheaper to access a (mostly) read-only cacheline than a cacheline that is frequently dirtied, then it's possible to reduce the trashing by splitting the rcu_cpu_mask bitmap into two cachelines: The patch adds a generation counter and moves it into a separate cacheline. This allows to removes all accesses to rcu_cpumask (in the read-write cacheline) from rcu_pending and at least 50% of the accesses from rcu_check_quiescent_state. rcu_pending and all but one call per cpu to rcu_check_quiescent_state access the read-only cacheline. Probably not enough for 512p, but it's a start, just for 128 byte more memory use, without slowing down rcu grace periods. Obviously the read-only cacheline is not really read-only: it's written once per grace period to indicate that a new grace period is running. Tests on an 8-way Pentium III with reaim showed some improvement: oprofile hits: Reference: http://khack.osdl.org/stp/293075/ Hits % 23741 0.0994 rcu_pending 19057 0.0798 rcu_check_quiescent_state 6530 0.0273 rcu_check_callbacks Patched: http://khack.osdl.org/stp/293076/ 8291 0.0579 rcu_pending 5475 0.0382 rcu_check_quiescent_state 3604 0.0252 rcu_check_callbacks The total runtime differs between both runs, thus the % number must be compared: Around 50% faster. I've uninlined rcu_pending for the test. Tested with reaim and kernbench. Description: - per-cpu quiescbatch and qs_pending fields introduced: quiescbatch contains the number of the last quiescent period that the cpu has seen and qs_pending is set if the cpu has not yet reported the quiescent state for the current period. With these two fields a cpu can test if it should report a quiescent state without having to look at the frequently written rcu_cpu_mask bitmap. - curbatch split into two fields: rcu_ctrlblk.batch.completed and rcu_ctrlblk.batch.cur. This makes it possible to figure out if a grace period is running (completed != cur) without accessing the rcu_cpu_mask bitmap. - rcu_ctrlblk.maxbatch removed and replaced with a true/false next_pending flag: next_pending=1 means that another grace period should be started immediately after the end of the current period. Previously, this was achieved by maxbatch: curbatch==maxbatch means don't start, curbatch!= maxbatch means start. A flag improves the readability: The only possible values for maxbatch were curbatch and curbatch+1. - rcu_ctrlblk split into two cachelines for better performance. - common code from rcu_offline_cpu and rcu_check_quiescent_state merged into cpu_quiet. - rcu_offline_cpu: replace spin_lock_irq with spin_lock_bh, there are no accesses from irq context (and there are accesses to the spinlock with enabled interrupts from tasklet context). 
- rcu_restart_cpu introduced, s390 should call it after changing nohz: Theoretically the global batch counter could wrap around and end up at RCU_quiescbatch(cpu). Then the cpu would not look for a quiescent state and rcu would lock up. Signed-off-by: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rcupdate.c | 142 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 88 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 13a1b5a5825f..d665d001e030 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -47,8 +47,8 @@ /* Definition for rcupdate control block. */ struct rcu_ctrlblk rcu_ctrlblk = - { .mutex = SPIN_LOCK_UNLOCKED, .curbatch = 1, - .maxbatch = 1, .rcu_cpu_mask = CPU_MASK_NONE }; + { .batch = { .cur = -300, .completed = -300 }, + .state = {.mutex = SPIN_LOCK_UNLOCKED, .rcu_cpu_mask = CPU_MASK_NONE } }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; /* Fake initialization required by compiler */ @@ -96,26 +96,60 @@ static void rcu_do_batch(struct list_head *list) } } +/* + * Grace period handling: + * The grace period handling consists out of two steps: + * - A new grace period is started. + * This is done by rcu_start_batch. The start is not broadcasted to + * all cpus, they must pick this up by comparing rcu_ctrlblk.batch.cur with + * RCU_quiescbatch(cpu). All cpus are recorded in the + * rcu_ctrlblk.state.rcu_cpu_mask bitmap. + * - All cpus must go through a quiescent state. + * Since the start of the grace period is not broadcasted, at least two + * calls to rcu_check_quiescent_state are required: + * The first call just notices that a new grace period is running. The + * following calls check if there was a quiescent state since the beginning + * of the grace period. If so, it updates rcu_ctrlblk.state.rcu_cpu_mask. If + * the bitmap is empty, then the grace period is completed. + * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace + * period (if necessary). + */ /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. - * Caller must hold the rcu_ctrlblk lock. + * Caller must hold the rcu_ctrlblk.state lock. */ -static void rcu_start_batch(long newbatch) +static void rcu_start_batch(int next_pending) { cpumask_t active; - if (rcu_batch_before(rcu_ctrlblk.maxbatch, newbatch)) { - rcu_ctrlblk.maxbatch = newbatch; + if (next_pending) + rcu_ctrlblk.state.next_pending = 1; + + if (rcu_ctrlblk.state.next_pending && + rcu_ctrlblk.batch.completed == rcu_ctrlblk.batch.cur) { + rcu_ctrlblk.state.next_pending = 0; + /* Can't change, since spin lock held. */ + active = nohz_cpu_mask; + cpus_complement(active); + cpus_and(rcu_ctrlblk.state.rcu_cpu_mask, cpu_online_map, active); + rcu_ctrlblk.batch.cur++; } - if (rcu_batch_before(rcu_ctrlblk.maxbatch, rcu_ctrlblk.curbatch) || - !cpus_empty(rcu_ctrlblk.rcu_cpu_mask)) { - return; +} + +/* + * cpu went through a quiescent state since the beginning of the grace period. + * Clear it from the cpu mask and complete the grace period if it was the last + * cpu. Start another grace period if someone has further entries pending + */ +static void cpu_quiet(int cpu) +{ + cpu_clear(cpu, rcu_ctrlblk.state.rcu_cpu_mask); + if (cpus_empty(rcu_ctrlblk.state.rcu_cpu_mask)) { + /* batch completed ! */ + rcu_ctrlblk.batch.completed = rcu_ctrlblk.batch.cur; + rcu_start_batch(0); } - /* Can't change, since spin lock held. 
*/ - active = nohz_cpu_mask; - cpus_complement(active); - cpus_and(rcu_ctrlblk.rcu_cpu_mask, cpu_online_map, active); } /* @@ -127,7 +161,19 @@ static void rcu_check_quiescent_state(void) { int cpu = smp_processor_id(); - if (!cpu_isset(cpu, rcu_ctrlblk.rcu_cpu_mask)) + if (RCU_quiescbatch(cpu) != rcu_ctrlblk.batch.cur) { + /* new grace period: record qsctr value. */ + RCU_qs_pending(cpu) = 1; + RCU_last_qsctr(cpu) = RCU_qsctr(cpu); + RCU_quiescbatch(cpu) = rcu_ctrlblk.batch.cur; + return; + } + + /* Grace period already completed for this cpu? + * qs_pending is checked instead of the actual bitmap to avoid + * cacheline trashing. + */ + if (!RCU_qs_pending(cpu)) return; /* @@ -135,27 +181,19 @@ static void rcu_check_quiescent_state(void) * we may miss one quiescent state of that CPU. That is * tolerable. So no need to disable interrupts. */ - if (RCU_last_qsctr(cpu) == RCU_QSCTR_INVALID) { - RCU_last_qsctr(cpu) = RCU_qsctr(cpu); - return; - } if (RCU_qsctr(cpu) == RCU_last_qsctr(cpu)) return; + RCU_qs_pending(cpu) = 0; - spin_lock(&rcu_ctrlblk.mutex); - if (!cpu_isset(cpu, rcu_ctrlblk.rcu_cpu_mask)) - goto out_unlock; - - cpu_clear(cpu, rcu_ctrlblk.rcu_cpu_mask); - RCU_last_qsctr(cpu) = RCU_QSCTR_INVALID; - if (!cpus_empty(rcu_ctrlblk.rcu_cpu_mask)) - goto out_unlock; - - rcu_ctrlblk.curbatch++; - rcu_start_batch(rcu_ctrlblk.maxbatch); + spin_lock(&rcu_ctrlblk.state.mutex); + /* + * RCU_quiescbatch/batch.cur and the cpu bitmap can come out of sync + * during cpu startup. Ignore the quiescent state. + */ + if (likely(RCU_quiescbatch(cpu) == rcu_ctrlblk.batch.cur)) + cpu_quiet(cpu); -out_unlock: - spin_unlock(&rcu_ctrlblk.mutex); + spin_unlock(&rcu_ctrlblk.state.mutex); } @@ -185,25 +223,11 @@ static void rcu_offline_cpu(int cpu) * we can block indefinitely waiting for it, so flush * it here */ - spin_lock_irq(&rcu_ctrlblk.mutex); - if (cpus_empty(rcu_ctrlblk.rcu_cpu_mask)) - goto unlock; - - cpu_clear(cpu, rcu_ctrlblk.rcu_cpu_mask); - if (cpus_empty(rcu_ctrlblk.rcu_cpu_mask)) { - rcu_ctrlblk.curbatch++; - /* We may avoid calling start batch if - * we are starting the batch only - * because of the DEAD CPU (the current - * CPU will start a new batch anyway for - * the callbacks we will move to current CPU). - * However, we will avoid this optimisation - * for now. - */ - rcu_start_batch(rcu_ctrlblk.maxbatch); - } + spin_lock_bh(&rcu_ctrlblk.state.mutex); + if (rcu_ctrlblk.batch.cur != rcu_ctrlblk.batch.completed) + cpu_quiet(cpu); unlock: - spin_unlock_irq(&rcu_ctrlblk.mutex); + spin_unlock_bh(&rcu_ctrlblk.state.mutex); rcu_move_batch(&RCU_curlist(cpu)); rcu_move_batch(&RCU_nxtlist(cpu)); @@ -213,6 +237,14 @@ unlock: #endif +void rcu_restart_cpu(int cpu) +{ + spin_lock_bh(&rcu_ctrlblk.state.mutex); + RCU_quiescbatch(cpu) = rcu_ctrlblk.batch.completed; + RCU_qs_pending(cpu) = 0; + spin_unlock_bh(&rcu_ctrlblk.state.mutex); +} + /* * This does the RCU processing work from tasklet context. 
*/ @@ -222,7 +254,7 @@ static void rcu_process_callbacks(unsigned long unused) LIST_HEAD(list); if (!list_empty(&RCU_curlist(cpu)) && - rcu_batch_after(rcu_ctrlblk.curbatch, RCU_batch(cpu))) { + !rcu_batch_before(rcu_ctrlblk.batch.completed,RCU_batch(cpu))) { __list_splice(&RCU_curlist(cpu), &list); INIT_LIST_HEAD(&RCU_curlist(cpu)); } @@ -236,10 +268,10 @@ static void rcu_process_callbacks(unsigned long unused) /* * start the next batch of callbacks */ - spin_lock(&rcu_ctrlblk.mutex); - RCU_batch(cpu) = rcu_ctrlblk.curbatch + 1; - rcu_start_batch(RCU_batch(cpu)); - spin_unlock(&rcu_ctrlblk.mutex); + spin_lock(&rcu_ctrlblk.state.mutex); + RCU_batch(cpu) = rcu_ctrlblk.batch.cur + 1; + rcu_start_batch(1); + spin_unlock(&rcu_ctrlblk.state.mutex); } else { local_irq_enable(); } @@ -263,6 +295,8 @@ static void __devinit rcu_online_cpu(int cpu) tasklet_init(&RCU_tasklet(cpu), rcu_process_callbacks, 0UL); INIT_LIST_HEAD(&RCU_nxtlist(cpu)); INIT_LIST_HEAD(&RCU_curlist(cpu)); + RCU_quiescbatch(cpu) = rcu_ctrlblk.batch.completed; + RCU_qs_pending(cpu) = 0; } static int __devinit rcu_cpu_notify(struct notifier_block *self, -- cgit v1.2.3 From 720e8a63908eb18aad1721c1429e89fbf7cf0ca6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 23 Jun 2004 18:49:44 -0700 Subject: [PATCH] rcu lock update: Use a sequence lock for starting batches From: Manfred Spraul Step two for reducing cacheline trashing within rcupdate.c: rcu_process_callbacks always acquires rcu_ctrlblk.state.mutex and calls rcu_start_batch, even if the batch is already running or already scheduled to run. This can be avoided with a sequence lock: A sequence lock allows to read the current batch number and next_pending atomically. If next_pending is already set, then there is no need to acquire the global mutex. This means that for each grace period, there will be - one write access to the rcu_ctrlblk.batch cacheline - lots of read accesses to rcu_ctrlblk.batch (3-10*cpus_online()). Behavior similar to the jiffies cacheline, shouldn't be a problem. - cpus_online()+1 write accesses to rcu_ctrlblk.state, all of them starting with spin_lock(&rcu_ctrlblk.state.mutex). For large enough cpus_online() this will be a problem, but all except two of the spin_lock calls only protect the rcu_cpu_mask bitmap, thus a hierarchical bitmap would allow to split the write accesses to multiple cachelines. Tested on an 8-way with reaim. Unfortunately it probably won't help with Jack Steiner's 'ls' test since in this test only one cpu generates rcu entries. Signed-off-by: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rcupdate.h | 6 +++++- kernel/rcupdate.c | 29 +++++++++++++++++++++-------- 2 files changed, 26 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f9981251d542..883fc03ed4ef 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -41,6 +41,7 @@ #include #include #include +#include /** * struct rcu_head - callback structure for use with RCU @@ -69,11 +70,14 @@ struct rcu_ctrlblk { struct { long cur; /* Current batch number. */ long completed; /* Number of the last completed batch */ + int next_pending; /* Is the next batch already waiting? */ + seqcount_t lock; /* for atomically reading cur and */ + /* next_pending. 
Spinlock not used, */ + /* protected by state.mutex */ } batch ____cacheline_maxaligned_in_smp; /* remaining members: bookkeeping of the progress of the grace period */ struct { spinlock_t mutex; /* Guard this struct */ - int next_pending; /* Is the next batch already waiting? */ cpumask_t rcu_cpu_mask; /* CPUs that need to switch */ /* in order for current batch to proceed. */ } state ____cacheline_maxaligned_in_smp; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index d665d001e030..dc1ac448d07c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -47,7 +47,7 @@ /* Definition for rcupdate control block. */ struct rcu_ctrlblk rcu_ctrlblk = - { .batch = { .cur = -300, .completed = -300 }, + { .batch = { .cur = -300, .completed = -300 , .lock = SEQCNT_ZERO }, .state = {.mutex = SPIN_LOCK_UNLOCKED, .rcu_cpu_mask = CPU_MASK_NONE } }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; @@ -124,16 +124,18 @@ static void rcu_start_batch(int next_pending) cpumask_t active; if (next_pending) - rcu_ctrlblk.state.next_pending = 1; + rcu_ctrlblk.batch.next_pending = 1; - if (rcu_ctrlblk.state.next_pending && + if (rcu_ctrlblk.batch.next_pending && rcu_ctrlblk.batch.completed == rcu_ctrlblk.batch.cur) { - rcu_ctrlblk.state.next_pending = 0; /* Can't change, since spin lock held. */ active = nohz_cpu_mask; cpus_complement(active); cpus_and(rcu_ctrlblk.state.rcu_cpu_mask, cpu_online_map, active); + write_seqcount_begin(&rcu_ctrlblk.batch.lock); + rcu_ctrlblk.batch.next_pending = 0; rcu_ctrlblk.batch.cur++; + write_seqcount_end(&rcu_ctrlblk.batch.lock); } } @@ -261,6 +263,8 @@ static void rcu_process_callbacks(unsigned long unused) local_irq_disable(); if (!list_empty(&RCU_nxtlist(cpu)) && list_empty(&RCU_curlist(cpu))) { + int next_pending, seq; + __list_splice(&RCU_nxtlist(cpu), &RCU_curlist(cpu)); INIT_LIST_HEAD(&RCU_nxtlist(cpu)); local_irq_enable(); @@ -268,10 +272,19 @@ static void rcu_process_callbacks(unsigned long unused) /* * start the next batch of callbacks */ - spin_lock(&rcu_ctrlblk.state.mutex); - RCU_batch(cpu) = rcu_ctrlblk.batch.cur + 1; - rcu_start_batch(1); - spin_unlock(&rcu_ctrlblk.state.mutex); + do { + seq = read_seqcount_begin(&rcu_ctrlblk.batch.lock); + /* determine batch number */ + RCU_batch(cpu) = rcu_ctrlblk.batch.cur + 1; + next_pending = rcu_ctrlblk.batch.next_pending; + } while (read_seqcount_retry(&rcu_ctrlblk.batch.lock, seq)); + + if (!next_pending) { + /* and start it/schedule start if it's a new batch */ + spin_lock(&rcu_ctrlblk.state.mutex); + rcu_start_batch(1); + spin_unlock(&rcu_ctrlblk.state.mutex); + } } else { local_irq_enable(); } -- cgit v1.2.3 From 72914d307d2ab4630212644a46cf75746d04440d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 23 Jun 2004 18:49:55 -0700 Subject: [PATCH] rcu lock update: Code move & cleanup From: Manfred Spraul Step three for reducing cacheline trashing within rcupdate.c: Cleanup and code move from to kernel/rcupdate.c: Remove internal details from the header file. Signed-off-by: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rcupdate.h | 27 +++++----------- kernel/rcupdate.c | 81 +++++++++++++++++++++++++++--------------------- 2 files changed, 53 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 883fc03ed4ef..967b2d47014b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -64,24 +64,13 @@ struct rcu_head { -/* Control variables for rcupdate callback mechanism. 
*/ +/* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { - /* "const" members: only changed when starting/ending a grace period */ - struct { - long cur; /* Current batch number. */ - long completed; /* Number of the last completed batch */ - int next_pending; /* Is the next batch already waiting? */ - seqcount_t lock; /* for atomically reading cur and */ - /* next_pending. Spinlock not used, */ - /* protected by state.mutex */ - } batch ____cacheline_maxaligned_in_smp; - /* remaining members: bookkeeping of the progress of the grace period */ - struct { - spinlock_t mutex; /* Guard this struct */ - cpumask_t rcu_cpu_mask; /* CPUs that need to switch */ - /* in order for current batch to proceed. */ - } state ____cacheline_maxaligned_in_smp; -}; + long cur; /* Current batch number. */ + long completed; /* Number of the last completed batch */ + int next_pending; /* Is the next batch already waiting? */ + seqcount_t lock; /* For atomic reads of cur and next_pending. */ +} ____cacheline_maxaligned_in_smp; /* Is batch a before batch b ? */ static inline int rcu_batch_before(long a, long b) @@ -131,14 +120,14 @@ static inline int rcu_pending(int cpu) * for them has completed. */ if (!list_empty(&RCU_curlist(cpu)) && - !rcu_batch_before(rcu_ctrlblk.batch.completed,RCU_batch(cpu))) + !rcu_batch_before(rcu_ctrlblk.completed,RCU_batch(cpu))) return 1; /* This cpu has no pending entries, but there are new entries */ if (list_empty(&RCU_curlist(cpu)) && !list_empty(&RCU_nxtlist(cpu))) return 1; /* The rcu core waits for a quiescent state from the cpu */ - if (RCU_quiescbatch(cpu) != rcu_ctrlblk.batch.cur || RCU_qs_pending(cpu)) + if (RCU_quiescbatch(cpu) != rcu_ctrlblk.cur || RCU_qs_pending(cpu)) return 1; /* nothing to do */ return 0; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index dc1ac448d07c..0e13bdbb9fd7 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -47,8 +47,17 @@ /* Definition for rcupdate control block. */ struct rcu_ctrlblk rcu_ctrlblk = - { .batch = { .cur = -300, .completed = -300 , .lock = SEQCNT_ZERO }, - .state = {.mutex = SPIN_LOCK_UNLOCKED, .rcu_cpu_mask = CPU_MASK_NONE } }; + { .cur = -300, .completed = -300 , .lock = SEQCNT_ZERO }; + +/* Bookkeeping of the progress of the grace period */ +struct { + spinlock_t mutex; /* Guard this struct and writes to rcu_ctrlblk */ + cpumask_t rcu_cpu_mask; /* CPUs that need to switch in order */ + /* for current batch to proceed. */ +} rcu_state ____cacheline_maxaligned_in_smp = + {.mutex = SPIN_LOCK_UNLOCKED, .rcu_cpu_mask = CPU_MASK_NONE }; + + DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; /* Fake initialization required by compiler */ @@ -101,15 +110,15 @@ static void rcu_do_batch(struct list_head *list) * The grace period handling consists out of two steps: * - A new grace period is started. * This is done by rcu_start_batch. The start is not broadcasted to - * all cpus, they must pick this up by comparing rcu_ctrlblk.batch.cur with + * all cpus, they must pick this up by comparing rcu_ctrlblk.cur with * RCU_quiescbatch(cpu). All cpus are recorded in the - * rcu_ctrlblk.state.rcu_cpu_mask bitmap. + * rcu_state.rcu_cpu_mask bitmap. * - All cpus must go through a quiescent state. * Since the start of the grace period is not broadcasted, at least two * calls to rcu_check_quiescent_state are required: * The first call just notices that a new grace period is running. The * following calls check if there was a quiescent state since the beginning - * of the grace period. 
If so, it updates rcu_ctrlblk.state.rcu_cpu_mask. If + * of the grace period. If so, it updates rcu_state.rcu_cpu_mask. If * the bitmap is empty, then the grace period is completed. * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace * period (if necessary). @@ -117,25 +126,25 @@ static void rcu_do_batch(struct list_head *list) /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. - * Caller must hold the rcu_ctrlblk.state lock. + * Caller must hold rcu_state.mutex. */ static void rcu_start_batch(int next_pending) { cpumask_t active; if (next_pending) - rcu_ctrlblk.batch.next_pending = 1; + rcu_ctrlblk.next_pending = 1; - if (rcu_ctrlblk.batch.next_pending && - rcu_ctrlblk.batch.completed == rcu_ctrlblk.batch.cur) { + if (rcu_ctrlblk.next_pending && + rcu_ctrlblk.completed == rcu_ctrlblk.cur) { /* Can't change, since spin lock held. */ active = nohz_cpu_mask; cpus_complement(active); - cpus_and(rcu_ctrlblk.state.rcu_cpu_mask, cpu_online_map, active); - write_seqcount_begin(&rcu_ctrlblk.batch.lock); - rcu_ctrlblk.batch.next_pending = 0; - rcu_ctrlblk.batch.cur++; - write_seqcount_end(&rcu_ctrlblk.batch.lock); + cpus_and(rcu_state.rcu_cpu_mask, cpu_online_map, active); + write_seqcount_begin(&rcu_ctrlblk.lock); + rcu_ctrlblk.next_pending = 0; + rcu_ctrlblk.cur++; + write_seqcount_end(&rcu_ctrlblk.lock); } } @@ -146,10 +155,10 @@ static void rcu_start_batch(int next_pending) */ static void cpu_quiet(int cpu) { - cpu_clear(cpu, rcu_ctrlblk.state.rcu_cpu_mask); - if (cpus_empty(rcu_ctrlblk.state.rcu_cpu_mask)) { + cpu_clear(cpu, rcu_state.rcu_cpu_mask); + if (cpus_empty(rcu_state.rcu_cpu_mask)) { /* batch completed ! */ - rcu_ctrlblk.batch.completed = rcu_ctrlblk.batch.cur; + rcu_ctrlblk.completed = rcu_ctrlblk.cur; rcu_start_batch(0); } } @@ -163,11 +172,11 @@ static void rcu_check_quiescent_state(void) { int cpu = smp_processor_id(); - if (RCU_quiescbatch(cpu) != rcu_ctrlblk.batch.cur) { + if (RCU_quiescbatch(cpu) != rcu_ctrlblk.cur) { /* new grace period: record qsctr value. */ RCU_qs_pending(cpu) = 1; RCU_last_qsctr(cpu) = RCU_qsctr(cpu); - RCU_quiescbatch(cpu) = rcu_ctrlblk.batch.cur; + RCU_quiescbatch(cpu) = rcu_ctrlblk.cur; return; } @@ -187,15 +196,15 @@ static void rcu_check_quiescent_state(void) return; RCU_qs_pending(cpu) = 0; - spin_lock(&rcu_ctrlblk.state.mutex); + spin_lock(&rcu_state.mutex); /* * RCU_quiescbatch/batch.cur and the cpu bitmap can come out of sync * during cpu startup. Ignore the quiescent state. 
*/ - if (likely(RCU_quiescbatch(cpu) == rcu_ctrlblk.batch.cur)) + if (likely(RCU_quiescbatch(cpu) == rcu_ctrlblk.cur)) cpu_quiet(cpu); - spin_unlock(&rcu_ctrlblk.state.mutex); + spin_unlock(&rcu_state.mutex); } @@ -225,11 +234,11 @@ static void rcu_offline_cpu(int cpu) * we can block indefinitely waiting for it, so flush * it here */ - spin_lock_bh(&rcu_ctrlblk.state.mutex); - if (rcu_ctrlblk.batch.cur != rcu_ctrlblk.batch.completed) + spin_lock_bh(&rcu_state.mutex); + if (rcu_ctrlblk.cur != rcu_ctrlblk.completed) cpu_quiet(cpu); unlock: - spin_unlock_bh(&rcu_ctrlblk.state.mutex); + spin_unlock_bh(&rcu_state.mutex); rcu_move_batch(&RCU_curlist(cpu)); rcu_move_batch(&RCU_nxtlist(cpu)); @@ -241,10 +250,10 @@ unlock: void rcu_restart_cpu(int cpu) { - spin_lock_bh(&rcu_ctrlblk.state.mutex); - RCU_quiescbatch(cpu) = rcu_ctrlblk.batch.completed; + spin_lock_bh(&rcu_state.mutex); + RCU_quiescbatch(cpu) = rcu_ctrlblk.completed; RCU_qs_pending(cpu) = 0; - spin_unlock_bh(&rcu_ctrlblk.state.mutex); + spin_unlock_bh(&rcu_state.mutex); } /* @@ -256,7 +265,7 @@ static void rcu_process_callbacks(unsigned long unused) LIST_HEAD(list); if (!list_empty(&RCU_curlist(cpu)) && - !rcu_batch_before(rcu_ctrlblk.batch.completed,RCU_batch(cpu))) { + !rcu_batch_before(rcu_ctrlblk.completed,RCU_batch(cpu))) { __list_splice(&RCU_curlist(cpu), &list); INIT_LIST_HEAD(&RCU_curlist(cpu)); } @@ -273,17 +282,17 @@ static void rcu_process_callbacks(unsigned long unused) * start the next batch of callbacks */ do { - seq = read_seqcount_begin(&rcu_ctrlblk.batch.lock); + seq = read_seqcount_begin(&rcu_ctrlblk.lock); /* determine batch number */ - RCU_batch(cpu) = rcu_ctrlblk.batch.cur + 1; - next_pending = rcu_ctrlblk.batch.next_pending; - } while (read_seqcount_retry(&rcu_ctrlblk.batch.lock, seq)); + RCU_batch(cpu) = rcu_ctrlblk.cur + 1; + next_pending = rcu_ctrlblk.next_pending; + } while (read_seqcount_retry(&rcu_ctrlblk.lock, seq)); if (!next_pending) { /* and start it/schedule start if it's a new batch */ - spin_lock(&rcu_ctrlblk.state.mutex); + spin_lock(&rcu_state.mutex); rcu_start_batch(1); - spin_unlock(&rcu_ctrlblk.state.mutex); + spin_unlock(&rcu_state.mutex); } } else { local_irq_enable(); @@ -308,7 +317,7 @@ static void __devinit rcu_online_cpu(int cpu) tasklet_init(&RCU_tasklet(cpu), rcu_process_callbacks, 0UL); INIT_LIST_HEAD(&RCU_nxtlist(cpu)); INIT_LIST_HEAD(&RCU_curlist(cpu)); - RCU_quiescbatch(cpu) = rcu_ctrlblk.batch.completed; + RCU_quiescbatch(cpu) = rcu_ctrlblk.completed; RCU_qs_pending(cpu) = 0; } -- cgit v1.2.3 From b659a6fbb927a79acd606c4466d03cb615879f9f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 23 Jun 2004 18:50:06 -0700 Subject: [PATCH] reduce rcu_head size - core From: Dipankar Sarma This reduces the RCU head size by using a singly linked to maintain them. The ordering of the callbacks is still maintained as before by using a tail pointer for the next list. 
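An illustration (not part of the patch, names are made up): keeping a head pointer plus a pointer to the last ->next slot gives O(1) FIFO append on a singly linked list, which is the same trick call_rcu() plays with RCU_nxttail(cpu) below. A minimal userspace sketch:

    #include <stdio.h>

    /* Illustrative stand-in for the singly linked rcu_head. */
    struct node {
            struct node *next;
            int id;
    };

    /* Head plus a pointer to the last ->next slot (initially &head). */
    struct queue {
            struct node *head;
            struct node **tail;
    };

    static void queue_init(struct queue *q)
    {
            q->head = NULL;
            q->tail = &q->head;
    }

    /* O(1) append in enqueue order, the same trick call_rcu() uses
     * with RCU_nxttail(cpu). */
    static void queue_append(struct queue *q, struct node *n)
    {
            n->next = NULL;
            *q->tail = n;
            q->tail = &n->next;
    }

    int main(void)
    {
            struct queue q;
            struct node a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
            struct node *p;

            queue_init(&q);
            queue_append(&q, &a);
            queue_append(&q, &b);
            queue_append(&q, &c);

            /* Walk in enqueue order, as rcu_do_batch() walks its list. */
            for (p = q.head; p; p = p->next)
                    printf("node %d\n", p->id);
            return 0;
    }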
Signed-Off-By : Dipankar Sarma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rcupdate.h | 24 +++++++++++++----------- kernel/rcupdate.c | 42 +++++++++++++++++++++--------------------- 2 files changed, 34 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 967b2d47014b..8fafbdf06866 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -36,7 +36,6 @@ #ifdef __KERNEL__ #include -#include #include #include #include @@ -45,21 +44,20 @@ /** * struct rcu_head - callback structure for use with RCU - * @list: list_head to queue the update requests + * @next: next update requests in a list * @func: actual update function to call after the grace period. * @arg: argument to be passed to the actual update function. */ struct rcu_head { - struct list_head list; + struct rcu_head *next; void (*func)(void *obj); void *arg; }; -#define RCU_HEAD_INIT(head) \ - { .list = LIST_HEAD_INIT(head.list), .func = NULL, .arg = NULL } +#define RCU_HEAD_INIT(head) { .next = NULL, .func = NULL, .arg = NULL } #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT(head) #define INIT_RCU_HEAD(ptr) do { \ - INIT_LIST_HEAD(&(ptr)->list); (ptr)->func = NULL; (ptr)->arg = NULL; \ + (ptr)->next = NULL; (ptr)->func = NULL; (ptr)->arg = NULL; \ } while (0) @@ -99,8 +97,9 @@ struct rcu_data { /* 2) batch handling */ long batch; /* Batch # for current RCU batch */ - struct list_head nxtlist; - struct list_head curlist; + struct rcu_head *nxtlist; + struct rcu_head **nxttail; + struct rcu_head *curlist; }; DECLARE_PER_CPU(struct rcu_data, rcu_data); @@ -113,22 +112,25 @@ extern struct rcu_ctrlblk rcu_ctrlblk; #define RCU_batch(cpu) (per_cpu(rcu_data, (cpu)).batch) #define RCU_nxtlist(cpu) (per_cpu(rcu_data, (cpu)).nxtlist) #define RCU_curlist(cpu) (per_cpu(rcu_data, (cpu)).curlist) +#define RCU_nxttail(cpu) (per_cpu(rcu_data, (cpu)).nxttail) static inline int rcu_pending(int cpu) { /* This cpu has pending rcu entries and the grace period * for them has completed. */ - if (!list_empty(&RCU_curlist(cpu)) && + if (RCU_curlist(cpu) && !rcu_batch_before(rcu_ctrlblk.completed,RCU_batch(cpu))) return 1; + /* This cpu has no pending entries, but there are new entries */ - if (list_empty(&RCU_curlist(cpu)) && - !list_empty(&RCU_nxtlist(cpu))) + if (!RCU_curlist(cpu) && RCU_nxtlist(cpu)) return 1; + /* The rcu core waits for a quiescent state from the cpu */ if (RCU_quiescbatch(cpu) != rcu_ctrlblk.cur || RCU_qs_pending(cpu)) return 1; + /* nothing to do */ return 0; } diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 0e13bdbb9fd7..cdb59f12a5fa 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -82,9 +82,11 @@ void fastcall call_rcu(struct rcu_head *head, void (*func)(void *arg), void *arg head->func = func; head->arg = arg; + head->next = NULL; local_irq_save(flags); cpu = smp_processor_id(); - list_add_tail(&head->list, &RCU_nxtlist(cpu)); + *RCU_nxttail(cpu) = head; + RCU_nxttail(cpu) = &head->next; local_irq_restore(flags); } @@ -92,16 +94,14 @@ void fastcall call_rcu(struct rcu_head *head, void (*func)(void *arg), void *arg * Invoke the completed RCU callbacks. They are expected to be in * a per-cpu list. 
*/ -static void rcu_do_batch(struct list_head *list) +static void rcu_do_batch(struct rcu_head *list) { - struct list_head *entry; - struct rcu_head *head; + struct rcu_head *next; - while (!list_empty(list)) { - entry = list->next; - list_del(entry); - head = list_entry(entry, struct rcu_head, list); - head->func(head->arg); + while (list) { + next = list->next; + list->func(list->arg); + list = next; } } @@ -262,20 +262,21 @@ void rcu_restart_cpu(int cpu) static void rcu_process_callbacks(unsigned long unused) { int cpu = smp_processor_id(); - LIST_HEAD(list); + struct rcu_head *rcu_list = NULL; - if (!list_empty(&RCU_curlist(cpu)) && - !rcu_batch_before(rcu_ctrlblk.completed,RCU_batch(cpu))) { - __list_splice(&RCU_curlist(cpu), &list); - INIT_LIST_HEAD(&RCU_curlist(cpu)); + if (RCU_curlist(cpu) && + !rcu_batch_before(rcu_ctrlblk.completed, RCU_batch(cpu))) { + rcu_list = RCU_curlist(cpu); + RCU_curlist(cpu) = NULL; } local_irq_disable(); - if (!list_empty(&RCU_nxtlist(cpu)) && list_empty(&RCU_curlist(cpu))) { + if (RCU_nxtlist(cpu) && !RCU_curlist(cpu)) { int next_pending, seq; - __list_splice(&RCU_nxtlist(cpu), &RCU_curlist(cpu)); - INIT_LIST_HEAD(&RCU_nxtlist(cpu)); + RCU_curlist(cpu) = RCU_nxtlist(cpu); + RCU_nxtlist(cpu) = NULL; + RCU_nxttail(cpu) = &RCU_nxtlist(cpu); local_irq_enable(); /* @@ -298,8 +299,8 @@ static void rcu_process_callbacks(unsigned long unused) local_irq_enable(); } rcu_check_quiescent_state(); - if (!list_empty(&list)) - rcu_do_batch(&list); + if (rcu_list) + rcu_do_batch(rcu_list); } void rcu_check_callbacks(int cpu, int user) @@ -315,8 +316,7 @@ static void __devinit rcu_online_cpu(int cpu) { memset(&per_cpu(rcu_data, cpu), 0, sizeof(struct rcu_data)); tasklet_init(&RCU_tasklet(cpu), rcu_process_callbacks, 0UL); - INIT_LIST_HEAD(&RCU_nxtlist(cpu)); - INIT_LIST_HEAD(&RCU_curlist(cpu)); + RCU_nxttail(cpu) = &RCU_nxtlist(cpu); RCU_quiescbatch(cpu) = rcu_ctrlblk.completed; RCU_qs_pending(cpu) = 0; } -- cgit v1.2.3 From 8c1ce9d6d628945ff23f844dbe9f21f5d5383b99 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 23 Jun 2004 18:50:18 -0700 Subject: [PATCH] rcu: avoid passing an argument to the callback function From: Dipankar Sarma This patch changes the call_rcu() API and avoids passing an argument to the callback function as suggested by Rusty. Instead, it is assumed that the user has embedded the rcu head into a structure that is useful in the callback and the rcu_head pointer is passed to the callback. The callback can use container_of() to get the pointer to its structure and work with it. Together with the rcu-singly-link patch, it reduces the rcu_head size by 50%. Considering that we use these in things like struct dentry and struct dst_entry, this is good savings in space. An example : struct my_struct { struct rcu_head rcu; int x; int y; }; void my_rcu_callback(struct rcu_head *head) { struct my_struct *p = container_of(head, struct my_struct, rcu); free(p); } void my_delete(struct my_struct *p) { ... call_rcu(&p->rcu, my_rcu_callback); ... 
} Signed-Off-By: Dipankar Sarma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/mm/tlb.c | 7 ++++--- fs/dcache.c | 6 +++--- include/linux/rcupdate.h | 10 ++++------ include/net/dst.h | 6 ++++++ ipc/util.c | 25 ++++++++++++++++++++----- kernel/auditsc.c | 7 ++++--- kernel/rcupdate.c | 26 ++++++++++++++++---------- net/bridge/br_if.c | 12 +++++++++--- net/decnet/dn_route.c | 4 ++-- net/ipv4/route.c | 4 ++-- net/sched/sch_generic.c | 6 +++--- security/selinux/netif.c | 6 +++--- 12 files changed, 76 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/arch/ppc64/mm/tlb.c b/arch/ppc64/mm/tlb.c index 980443bebed1..8825e14cb35a 100644 --- a/arch/ppc64/mm/tlb.c +++ b/arch/ppc64/mm/tlb.c @@ -158,9 +158,10 @@ void pte_free_now(struct page *ptepage) pte_free(ptepage); } -static void pte_free_rcu_callback(void *arg) +static void pte_free_rcu_callback(struct rcu_head *head) { - struct pte_freelist_batch *batch = arg; + struct pte_freelist_batch *batch = + container_of(head, struct pte_freelist_batch, rcu); unsigned int i; for (i = 0; i < batch->index; i++) @@ -171,7 +172,7 @@ static void pte_free_rcu_callback(void *arg) void pte_free_submit(struct pte_freelist_batch *batch) { INIT_RCU_HEAD(&batch->rcu); - call_rcu(&batch->rcu, pte_free_rcu_callback, batch); + call_rcu(&batch->rcu, pte_free_rcu_callback); } void pte_free_finish(void) diff --git a/fs/dcache.c b/fs/dcache.c index 4c632e1261dc..02df5d45b82e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -65,9 +65,9 @@ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; -static void d_callback(void *arg) +static void d_callback(struct rcu_head *head) { - struct dentry * dentry = (struct dentry *)arg; + struct dentry * dentry = container_of(head, struct dentry, d_rcu); if (dname_external(dentry)) kfree(dentry->d_name.name); @@ -82,7 +82,7 @@ static void d_free(struct dentry *dentry) { if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - call_rcu(&dentry->d_rcu, d_callback, dentry); + call_rcu(&dentry->d_rcu, d_callback); } /* diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 8fafbdf06866..10c4b8f24f08 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -46,18 +46,16 @@ * struct rcu_head - callback structure for use with RCU * @next: next update requests in a list * @func: actual update function to call after the grace period. - * @arg: argument to be passed to the actual update function. 
*/ struct rcu_head { struct rcu_head *next; - void (*func)(void *obj); - void *arg; + void (*func)(struct rcu_head *head); }; -#define RCU_HEAD_INIT(head) { .next = NULL, .func = NULL, .arg = NULL } +#define RCU_HEAD_INIT(head) { .next = NULL, .func = NULL } #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT(head) #define INIT_RCU_HEAD(ptr) do { \ - (ptr)->next = NULL; (ptr)->func = NULL; (ptr)->arg = NULL; \ + (ptr)->next = NULL; (ptr)->func = NULL; \ } while (0) @@ -144,7 +142,7 @@ extern void rcu_restart_cpu(int cpu); /* Exported interfaces */ extern void FASTCALL(call_rcu(struct rcu_head *head, - void (*func)(void *arg), void *arg)); + void (*func)(struct rcu_head *head))); extern void synchronize_kernel(void); #endif /* __KERNEL__ */ diff --git a/include/net/dst.h b/include/net/dst.h index ed2504c6b4e9..543ff945bb2f 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -183,6 +183,12 @@ static inline void dst_free(struct dst_entry * dst) __dst_free(dst); } +static inline void dst_rcu_free(struct rcu_head *head) +{ + struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); + dst_free(dst); +} + static inline void dst_confirm(struct dst_entry *dst) { if (dst) diff --git a/ipc/util.c b/ipc/util.c index 81a145eded63..727c33233f5f 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -333,25 +333,40 @@ void* ipc_rcu_alloc(int size) * Since RCU callback function is called in bh, * we need to defer the vfree to schedule_work */ -static void ipc_schedule_free(void* arg) +static void ipc_schedule_free(struct rcu_head *head) { - struct ipc_rcu_vmalloc *free = arg; + struct ipc_rcu_vmalloc *free = + container_of(head, struct ipc_rcu_vmalloc, rcu); INIT_WORK(&free->work, vfree, free); schedule_work(&free->work); } +/** + * ipc_immediate_free - free ipc + rcu space + * + * Free from the RCU callback context + * + */ +static void ipc_immediate_free(struct rcu_head *head) +{ + struct ipc_rcu_kmalloc *free = + container_of(head, struct ipc_rcu_kmalloc, rcu); + kfree(free); +} + + + void ipc_rcu_free(void* ptr, int size) { if (rcu_use_vmalloc(size)) { struct ipc_rcu_vmalloc *free; free = ptr - sizeof(*free); - call_rcu(&free->rcu, ipc_schedule_free, free); + call_rcu(&free->rcu, ipc_schedule_free); } else { struct ipc_rcu_kmalloc *free; free = ptr - sizeof(*free); - /* kfree takes a "const void *" so gcc warns. So we cast. */ - call_rcu(&free->rcu, (void (*)(void *))kfree, free); + call_rcu(&free->rcu, ipc_immediate_free); } } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 342b57141fd9..e688c73f6a9e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -177,9 +177,10 @@ static inline int audit_add_rule(struct audit_entry *entry, return 0; } -static void audit_free_rule(void *arg) +static void audit_free_rule(struct rcu_head *head) { - kfree(arg); + struct audit_entry *e = container_of(head, struct audit_entry, rcu); + kfree(e); } /* Note that audit_add_rule and audit_del_rule are called via @@ -195,7 +196,7 @@ static inline int audit_del_rule(struct audit_rule *rule, list_for_each_entry(e, list, list) { if (!audit_compare_rule(rule, &e->rule)) { list_del_rcu(&e->list); - call_rcu(&e->rcu, audit_free_rule, e); + call_rcu(&e->rcu, audit_free_rule); return 0; } } diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index cdb59f12a5fa..7282889a5fd9 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -68,20 +68,19 @@ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; * call_rcu - Queue an RCU update request. 
* @head: structure to be used for queueing the RCU updates. * @func: actual update function to be invoked after the grace period - * @arg: argument to be passed to the update function * * The update function will be invoked as soon as all CPUs have performed * a context switch or been seen in the idle loop or in a user process. * The read-side of critical section that use call_rcu() for updation must * be protected by rcu_read_lock()/rcu_read_unlock(). */ -void fastcall call_rcu(struct rcu_head *head, void (*func)(void *arg), void *arg) +void fastcall call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) { int cpu; unsigned long flags; head->func = func; - head->arg = arg; head->next = NULL; local_irq_save(flags); cpu = smp_processor_id(); @@ -100,7 +99,7 @@ static void rcu_do_batch(struct rcu_head *list) while (list) { next = list->next; - list->func(list->arg); + list->func(list); list = next; } } @@ -358,11 +357,18 @@ void __init rcu_init(void) register_cpu_notifier(&rcu_nb); } +struct rcu_synchronize { + struct rcu_head head; + struct completion completion; +}; /* Because of FASTCALL declaration of complete, we use this wrapper */ -static void wakeme_after_rcu(void *completion) +static void wakeme_after_rcu(struct rcu_head *head) { - complete(completion); + struct rcu_synchronize *rcu; + + rcu = container_of(head, struct rcu_synchronize, head); + complete(&rcu->completion); } /** @@ -371,14 +377,14 @@ static void wakeme_after_rcu(void *completion) */ void synchronize_kernel(void) { - struct rcu_head rcu; - DECLARE_COMPLETION(completion); + struct rcu_synchronize rcu; + init_completion(&rcu.completion); /* Will wake me after RCU finished */ - call_rcu(&rcu, wakeme_after_rcu, &completion); + call_rcu(&rcu.head, wakeme_after_rcu); /* Wait for it */ - wait_for_completion(&completion); + wait_for_completion(&rcu.completion); } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 20275604941a..6bb2ed24718f 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -75,9 +75,8 @@ static int br_initial_port_cost(struct net_device *dev) return 100; /* assume old 10Mbps */ } -static void destroy_nbp(void *arg) +static void destroy_nbp(struct net_bridge_port *p) { - struct net_bridge_port *p = arg; struct net_device *dev = p->dev; dev->br_port = NULL; @@ -88,6 +87,13 @@ static void destroy_nbp(void *arg) br_sysfs_freeif(p); } +static void destroy_nbp_rcu(struct rcu_head *head) +{ + struct net_bridge_port *p = + container_of(head, struct net_bridge_port, rcu); + destroy_nbp(p); +} + /* called with RTNL */ static void del_nbp(struct net_bridge_port *p) { @@ -108,7 +114,7 @@ static void del_nbp(struct net_bridge_port *p) del_timer_sync(&p->forward_delay_timer); del_timer_sync(&p->hold_timer); - call_rcu(&p->rcu, destroy_nbp, p); + call_rcu(&p->rcu, destroy_nbp_rcu); } /* called with RTNL */ diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 90d0583f56f4..c181467ddca3 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -146,14 +146,14 @@ static __inline__ unsigned dn_hash(unsigned short src, unsigned short dst) static inline void dnrt_free(struct dn_route *rt) { - call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst); + call_rcu(&rt->u.dst.rcu_head, dst_rcu_free); } static inline void dnrt_drop(struct dn_route *rt) { if (rt) dst_release(&rt->u.dst); - call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst); + call_rcu(&rt->u.dst.rcu_head, dst_rcu_free); } static void dn_dst_check_expire(unsigned long dummy) diff --git 
a/net/ipv4/route.c b/net/ipv4/route.c index a570e709f738..46aeef6fa4e1 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -439,13 +439,13 @@ static struct file_operations rt_cpu_seq_fops = { static __inline__ void rt_free(struct rtable *rt) { - call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst); + call_rcu(&rt->u.dst.rcu_head, dst_rcu_free); } static __inline__ void rt_drop(struct rtable *rt) { ip_rt_put(rt); - call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst); + call_rcu(&rt->u.dst.rcu_head, dst_rcu_free); } static __inline__ int rt_fast_clean(struct rtable *rth) diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 111dad476d2b..db8cab2cf1d5 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -411,9 +411,9 @@ void qdisc_reset(struct Qdisc *qdisc) /* this is the rcu callback function to clean up a qdisc when there * are no further references to it */ -static void __qdisc_destroy (void * arg) +static void __qdisc_destroy(struct rcu_head *head) { - struct Qdisc *qdisc = (struct Qdisc *) arg; + struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu); struct Qdisc_ops *ops = qdisc->ops; #ifdef CONFIG_NET_ESTIMATOR @@ -448,7 +448,7 @@ void qdisc_destroy(struct Qdisc *qdisc) } } - call_rcu(&qdisc->q_rcu, __qdisc_destroy, qdisc); + call_rcu(&qdisc->q_rcu, __qdisc_destroy); } diff --git a/security/selinux/netif.c b/security/selinux/netif.c index 4cbf8f259d93..d23bd7e6345e 100644 --- a/security/selinux/netif.c +++ b/security/selinux/netif.c @@ -134,9 +134,9 @@ out: return netif; } -static void sel_netif_free(void *p) +static void sel_netif_free(struct rcu_head *p) { - struct sel_netif *netif = p; + struct sel_netif *netif = container_of(p, struct sel_netif, rcu_head); DEBUGP("%s: %s\n", __FUNCTION__, netif->nsec.dev->name); kfree(netif); @@ -151,7 +151,7 @@ static void sel_netif_destroy(struct sel_netif *netif) sel_netif_total--; spin_unlock_bh(&sel_netif_lock); - call_rcu(&netif->rcu_head, sel_netif_free, netif); + call_rcu(&netif->rcu_head, sel_netif_free); } void sel_netif_put(struct sel_netif *netif) -- cgit v1.2.3 From d2cec97bc421d6f9c2ee0d9bd8e0ce47d0022cac Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 23 Jun 2004 18:50:29 -0700 Subject: [PATCH] cpumask: make cpu_present_map real even on non-smp From: Paul Jackson This patch makes cpu_present_map a real map for all configurations, instead of a constant for non-SMP. It also moves the definition of cpu_present_map out of kernel/cpu.c into kernel/sched.c, because cpu.c isn't compiled into non-SMP kernels. The pattern is that each of the possible, present and online cpu maps are actual kernel global cpumask_t variables, for all configurations. They are documented in include/linux/cpumask.h. Some of the UP (NR_CPUS=1) code cheats, and hardcodes the assumption that the single bit position of these maps is always set, as an optimization. 
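For illustration only (not part of the patch): once cpu_present_map is a real cpumask in every configuration, code using the present-map helpers compiles identically on SMP and UP. The helper below is hypothetical and only uses macros introduced or kept by this patch:

    #include <linux/cpumask.h>
    #include <linux/kernel.h>

    /* Hypothetical helper: report each populated CPU and whether the
     * scheduler can currently use it. */
    static void report_present_cpus(void)
    {
            int cpu;

            for_each_present_cpu(cpu)
                    printk(KERN_INFO "cpu%d: present, %s\n", cpu,
                           cpu_online(cpu) ? "online" : "offline");
    }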
Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 13 +++++-------- kernel/cpu.c | 8 -------- kernel/sched.c | 10 ++++++++++ 3 files changed, 15 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 4293a465d87b..dced3ec2b0c8 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -10,15 +10,12 @@ extern cpumask_t cpu_online_map; extern cpumask_t cpu_possible_map; -extern cpumask_t cpu_present_map; #define num_online_cpus() cpus_weight(cpu_online_map) #define num_possible_cpus() cpus_weight(cpu_possible_map) -#define num_present_cpus() cpus_weight(cpu_present_map) #define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) #define cpu_possible(cpu) cpu_isset(cpu, cpu_possible_map) -#define cpu_present(cpu) cpu_isset(cpu, cpu_present_map) #define for_each_cpu_mask(cpu, mask) \ for (cpu = first_cpu_const(mk_cpumask_const(mask)); \ @@ -27,26 +24,26 @@ extern cpumask_t cpu_present_map; #define for_each_cpu(cpu) for_each_cpu_mask(cpu, cpu_possible_map) #define for_each_online_cpu(cpu) for_each_cpu_mask(cpu, cpu_online_map) -#define for_each_present_cpu(cpu) for_each_cpu_mask(cpu, cpu_present_map) #else #define cpu_online_map cpumask_of_cpu(0) #define cpu_possible_map cpumask_of_cpu(0) -#define cpu_present_map cpumask_of_cpu(0) #define num_online_cpus() 1 #define num_possible_cpus() 1 -#define num_present_cpus() 1 #define cpu_online(cpu) ({ BUG_ON((cpu) != 0); 1; }) #define cpu_possible(cpu) ({ BUG_ON((cpu) != 0); 1; }) -#define cpu_present(cpu) ({ BUG_ON((cpu) != 0); 1; }) #define for_each_cpu_mask(cpu, mask) for (cpu = 0; cpu < 1; cpu++) #define for_each_cpu(cpu) for (cpu = 0; cpu < 1; cpu++) #define for_each_online_cpu(cpu) for (cpu = 0; cpu < 1; cpu++) -#define for_each_present_cpu(cpu) for (cpu = 0; cpu < 1; cpu++) #endif +extern cpumask_t cpu_present_map; +#define num_present_cpus() cpus_weight(cpu_present_map) +#define cpu_present(cpu) cpu_isset(cpu, cpu_present_map) +#define for_each_present_cpu(cpu) for_each_cpu_mask(cpu, cpu_present_map) + #define cpumask_scnprintf(buf, buflen, map) \ bitmap_scnprintf(buf, buflen, cpus_addr(map), NR_CPUS) diff --git a/kernel/cpu.c b/kernel/cpu.c index 72b984c67eb3..083521327e64 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -20,14 +20,6 @@ DECLARE_MUTEX(cpucontrol); static struct notifier_block *cpu_chain; -/* - * Represents all cpu's present in the system - * In systems capable of hotplug, this map could dynamically grow - * as new cpu's are detected in the system via any platform specific - * method, such as ACPI for e.g. - */ -cpumask_t cpu_present_map; -EXPORT_SYMBOL(cpu_present_map); /* Need to know about CPUs going up/down? */ int register_cpu_notifier(struct notifier_block *nb) diff --git a/kernel/sched.c b/kernel/sched.c index 8f49ba1202c3..a614d7ebf7c3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2941,6 +2941,16 @@ out_unlock: return retval; } +/* + * Represents all cpu's present in the system + * In systems capable of hotplug, this map could dynamically grow + * as new cpu's are detected in the system via any platform specific + * method, such as ACPI for e.g. 
+ */ + +cpumask_t cpu_present_map; +EXPORT_SYMBOL(cpu_present_map); + /** * sys_sched_getaffinity - get the cpu affinity of a process * @pid: pid of the process -- cgit v1.2.3 From f3344dc35c26c28647ae4a6b9cdd9df3fa65777e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 23 Jun 2004 18:51:00 -0700 Subject: [PATCH] cpumask: rewrite cpumask.h - single bitmap based implementation From: Paul Jackson Major rewrite of cpumask to use a single implementation, as a struct-wrapped bitmap. This patch leaves some 26 include/asm-*/cpumask*.h header files orphaned - to be removed next patch. Some nine cpumask macros for const variants and to coerce and promote between an unsigned long and a cpumask are obsolete. Simple emulation wrappers are provided in this patch for these obsolete macros, which can be removed once each of the 3 archs (i386, ppc64, x86_64) using them are recoded in follow-on patches to not need them. The CPU_MASK_ALL macro now avoids leaving possible garbage one bits in any unused portion of the high word. An inproved comment lists all available operators, for convenient browsing. From: Mikael Pettersson 2.6.7-rc3-mm1 changed CPU_MASK_NONE into something that isn't a valid rvalue (it only works inside struct initializers). This caused compile-time errors in perfctr in UP x86 builds. From: Arnd Bergmann cpumask-5-10-rewrite-cpumaskh-single-bitmap-based from 2.6.7-rc3-mm1 causes include2/asm/smp.h:54:1: warning: "cpu_online" redefined Signed-off-by: Paul Jackson Signed-off-by: Mikael Pettersson Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc64/kernel/irq.c | 6 +- include/asm-s390/smp.h | 6 - include/linux/cpumask.h | 388 ++++++++++++++++++++++++++++++++++++++++++---- kernel/rcupdate.c | 7 +- kernel/sched.c | 5 + 5 files changed, 368 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c index b7f6a1eb922f..35243558bfc0 100644 --- a/arch/sparc64/kernel/irq.c +++ b/arch/sparc64/kernel/irq.c @@ -691,9 +691,10 @@ static inline void redirect_intr(int cpu, struct ino_bucket *bp) * Just Do It. 
*/ struct irqaction *ap = bp->irq_info; - cpumask_t cpu_mask = get_smpaff_in_irqaction(ap); + cpumask_t cpu_mask; unsigned int buddy, ticks; + cpus_addr(cpu_mask)[0] = get_smpaff_in_irqaction(ap); cpus_and(cpu_mask, cpu_mask, cpu_online_map); if (cpus_empty(cpu_mask)) cpu_mask = cpu_online_map; @@ -1210,9 +1211,10 @@ static int irq_affinity_read_proc (char *page, char **start, off_t off, { struct ino_bucket *bp = ivector_table + (long)data; struct irqaction *ap = bp->irq_info; - cpumask_t mask = get_smpaff_in_irqaction(ap); + cpumask_t mask; int len; + cpus_addr(mask)[0] = get_smpaff_in_irqaction(ap); if (cpus_empty(mask)) mask = cpu_online_map; diff --git a/include/asm-s390/smp.h b/include/asm-s390/smp.h index 7e613240088e..70d06261bb9e 100644 --- a/include/asm-s390/smp.h +++ b/include/asm-s390/smp.h @@ -31,10 +31,6 @@ typedef struct extern int smp_call_function_on(void (*func) (void *info), void *info, int nonatomic, int wait, int cpu); - -extern cpumask_t cpu_online_map; -extern cpumask_t cpu_possible_map; - #define NO_PROC_ID 0xFF /* No processor magic marker */ /* @@ -51,8 +47,6 @@ extern cpumask_t cpu_possible_map; #define smp_processor_id() (S390_lowcore.cpu_data.cpu_nr) -#define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) - extern __inline__ __u16 hard_smp_processor_id(void) { __u16 cpu_address; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index dced3ec2b0c8..f05bf9402dfc 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1,53 +1,379 @@ #ifndef __LINUX_CPUMASK_H #define __LINUX_CPUMASK_H +/* + * Cpumasks provide a bitmap suitable for representing the + * set of CPU's in a system, one bit position per CPU number. + * + * See detailed comments in the file linux/bitmap.h describing the + * data type on which these cpumasks are based. + * + * For details of cpumask_scnprintf() and cpumask_parse(), + * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c. + * + * The available cpumask operations are: + * + * void cpu_set(cpu, mask) turn on bit 'cpu' in mask + * void cpu_clear(cpu, mask) turn off bit 'cpu' in mask + * void cpus_setall(mask) set all bits + * void cpus_clear(mask) clear all bits + * int cpu_isset(cpu, mask) true iff bit 'cpu' set in mask + * int cpu_test_and_set(cpu, mask) test and set bit 'cpu' in mask + * + * void cpus_and(dst, src1, src2) dst = src1 & src2 [intersection] + * void cpus_or(dst, src1, src2) dst = src1 | src2 [union] + * void cpus_xor(dst, src1, src2) dst = src1 ^ src2 + * void cpus_andnot(dst, src1, src2) dst = src1 & ~src2 + * void cpus_complement(dst, src) dst = ~src + * + * int cpus_equal(mask1, mask2) Does mask1 == mask2? + * int cpus_intersects(mask1, mask2) Do mask1 and mask2 intersect? + * int cpus_subset(mask1, mask2) Is mask1 a subset of mask2? + * int cpus_empty(mask) Is mask empty (no bits sets)? + * int cpus_full(mask) Is mask full (all bits sets)? 
+ * int cpus_weight(mask) Hamming weigh - number of set bits + * + * void cpus_shift_right(dst, src, n) Shift right + * void cpus_shift_left(dst, src, n) Shift left + * + * int first_cpu(mask) Number lowest set bit, or NR_CPUS + * int next_cpu(cpu, mask) Next cpu past 'cpu', or NR_CPUS + * + * cpumask_t cpumask_of_cpu(cpu) Return cpumask with bit 'cpu' set + * CPU_MASK_ALL Initializer - all bits set + * CPU_MASK_NONE Initializer - no bits set + * unsigned long *cpus_addr(mask) Array of unsigned long's in mask + * + * int cpumask_scnprintf(buf, len, mask) Format cpumask for printing + * int cpumask_parse(ubuf, ulen, mask) Parse ascii string as cpumask + * + * int num_online_cpus() Number of online CPUs + * int num_possible_cpus() Number of all possible CPUs + * int cpu_online(cpu) Is some cpu online? + * int cpu_possible(cpu) Is some cpu possible? + * void cpu_set_online(cpu) set cpu in cpu_online_map + * void cpu_set_offline(cpu) clear cpu in cpu_online_map + * int any_online_cpu(mask) First online cpu in mask + * + * for_each_cpu_mask(cpu, mask) for-loop cpu over mask + * for_each_cpu(cpu) for-loop cpu over cpu_possible_map + * for_each_online_cpu(cpu) for-loop cpu over cpu_online_map + * + * Subtlety: + * 1) The 'type-checked' form of cpu_isset() causes gcc (3.3.2, anyway) + * to generate slightly worse code. Note for example the additional + * 40 lines of assembly code compiling the "for each possible cpu" + * loops buried in the disk_stat_read() macros calls when compiling + * drivers/block/genhd.c (arch i386, CONFIG_SMP=y). So use a simple + * one-line #define for cpu_isset(), instead of wrapping an inline + * inside a macro, the way we do the other calls. + */ + #include #include -#include #include -#ifdef CONFIG_SMP +typedef struct { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; +extern cpumask_t _unused_cpumask_arg_; -extern cpumask_t cpu_online_map; -extern cpumask_t cpu_possible_map; +#define cpu_set(cpu, dst) __cpu_set((cpu), &(dst)) +static inline void __cpu_set(int cpu, volatile cpumask_t *dstp) +{ + set_bit(cpu, dstp->bits); +} -#define num_online_cpus() cpus_weight(cpu_online_map) -#define num_possible_cpus() cpus_weight(cpu_possible_map) +#define cpu_clear(cpu, dst) __cpu_clear((cpu), &(dst)) +static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp) +{ + clear_bit(cpu, dstp->bits); +} -#define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) -#define cpu_possible(cpu) cpu_isset(cpu, cpu_possible_map) +#define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS) +static inline void __cpus_setall(cpumask_t *dstp, int nbits) +{ + bitmap_fill(dstp->bits, nbits); +} -#define for_each_cpu_mask(cpu, mask) \ - for (cpu = first_cpu_const(mk_cpumask_const(mask)); \ - cpu < NR_CPUS; \ - cpu = next_cpu_const(cpu, mk_cpumask_const(mask))) +#define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS) +static inline void __cpus_clear(cpumask_t *dstp, int nbits) +{ + bitmap_zero(dstp->bits, nbits); +} -#define for_each_cpu(cpu) for_each_cpu_mask(cpu, cpu_possible_map) -#define for_each_online_cpu(cpu) for_each_cpu_mask(cpu, cpu_online_map) -#else -#define cpu_online_map cpumask_of_cpu(0) -#define cpu_possible_map cpumask_of_cpu(0) +/* No static inline type checking - see Subtlety (1) above. 
*/ +#define cpu_isset(cpu, cpumask) test_bit((cpu), (cpumask).bits) + +#define cpu_test_and_set(cpu, cpumask) __cpu_test_and_set((cpu), &(cpumask)) +static inline int __cpu_test_and_set(int cpu, cpumask_t *addr) +{ + return test_and_set_bit(cpu, addr->bits); +} + +#define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS) +static inline void __cpus_and(cpumask_t *dstp, cpumask_t *src1p, + cpumask_t *src2p, int nbits) +{ + bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS) +static inline void __cpus_or(cpumask_t *dstp, cpumask_t *src1p, + cpumask_t *src2p, int nbits) +{ + bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS) +static inline void __cpus_xor(cpumask_t *dstp, cpumask_t *src1p, + cpumask_t *src2p, int nbits) +{ + bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define cpus_andnot(dst, src1, src2) \ + __cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS) +static inline void __cpus_andnot(cpumask_t *dstp, cpumask_t *src1p, + cpumask_t *src2p, int nbits) +{ + bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define cpus_complement(dst, src) __cpus_complement(&(dst), &(src), NR_CPUS) +static inline void __cpus_complement(cpumask_t *dstp, + cpumask_t *srcp, int nbits) +{ + bitmap_complement(dstp->bits, srcp->bits, nbits); +} + +#define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS) +static inline int __cpus_equal(cpumask_t *src1p, + cpumask_t *src2p, int nbits) +{ + return bitmap_equal(src1p->bits, src2p->bits, nbits); +} + +#define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS) +static inline int __cpus_intersects(cpumask_t *src1p, + cpumask_t *src2p, int nbits) +{ + return bitmap_intersects(src1p->bits, src2p->bits, nbits); +} + +#define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS) +static inline int __cpus_subset(cpumask_t *src1p, + cpumask_t *src2p, int nbits) +{ + return bitmap_subset(src1p->bits, src2p->bits, nbits); +} + +#define cpus_empty(src) __cpus_empty(&(src), NR_CPUS) +static inline int __cpus_empty(cpumask_t *srcp, int nbits) +{ + return bitmap_empty(srcp->bits, nbits); +} + +#define cpus_full(cpumask) __cpus_full(&(cpumask), NR_CPUS) +static inline int __cpus_full(cpumask_t *srcp, int nbits) +{ + return bitmap_full(srcp->bits, nbits); +} + +#define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS) +static inline int __cpus_weight(cpumask_t *srcp, int nbits) +{ + return bitmap_weight(srcp->bits, nbits); +} + +#define cpus_shift_right(dst, src, n) \ + __cpus_shift_right(&(dst), &(src), (n), NR_CPUS) +static inline void __cpus_shift_right(cpumask_t *dstp, + cpumask_t *srcp, int n, int nbits) +{ + bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); +} + +#define cpus_shift_left(dst, src, n) \ + __cpus_shift_left(&(dst), &(src), (n), NR_CPUS) +static inline void __cpus_shift_left(cpumask_t *dstp, + cpumask_t *srcp, int n, int nbits) +{ + bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); +} + +#define first_cpu(src) __first_cpu(&(src), NR_CPUS) +static inline int __first_cpu(cpumask_t *srcp, int nbits) +{ + return find_first_bit(srcp->bits, nbits); +} -#define num_online_cpus() 1 -#define num_possible_cpus() 1 +#define next_cpu(n, src) __next_cpu((n), &(src), NR_CPUS) +static inline int __next_cpu(int n, cpumask_t *srcp, int nbits) +{ + return find_next_bit(srcp->bits, nbits, 
n+1); +} -#define cpu_online(cpu) ({ BUG_ON((cpu) != 0); 1; }) -#define cpu_possible(cpu) ({ BUG_ON((cpu) != 0); 1; }) +#define cpumask_of_cpu(cpu) \ +({ \ + typeof(_unused_cpumask_arg_) m; \ + if (sizeof(m) == sizeof(unsigned long)) { \ + m.bits[0] = 1UL<<(cpu); \ + } else { \ + cpus_clear(m); \ + cpu_set((cpu), m); \ + } \ + m; \ +}) + +#define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS) + +#if NR_CPUS <= BITS_PER_LONG + +#define CPU_MASK_ALL \ +((cpumask_t) { { \ + [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ +} }) + +#else + +#define CPU_MASK_ALL \ +((cpumask_t) { { \ + [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ + [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ +} }) -#define for_each_cpu_mask(cpu, mask) for (cpu = 0; cpu < 1; cpu++) -#define for_each_cpu(cpu) for (cpu = 0; cpu < 1; cpu++) -#define for_each_online_cpu(cpu) for (cpu = 0; cpu < 1; cpu++) #endif +#define CPU_MASK_NONE \ +((cpumask_t) { { \ + [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ +} }) + +#define cpus_addr(src) ((src).bits) + +#define cpumask_scnprintf(buf, len, src) \ + __cpumask_scnprintf((buf), (len), &(src), NR_CPUS) +static inline int __cpumask_scnprintf(char *buf, int len, + cpumask_t *srcp, int nbits) +{ + return bitmap_scnprintf(buf, len, srcp->bits, nbits); +} + +#define cpumask_parse(ubuf, ulen, src) \ + __cpumask_parse((ubuf), (ulen), &(src), NR_CPUS) +static inline int __cpumask_parse(const char __user *buf, int len, + cpumask_t *srcp, int nbits) +{ + return bitmap_parse(buf, len, srcp->bits, nbits); +} + +#if NR_CPUS > 1 +#define for_each_cpu_mask(cpu, mask) \ + for ((cpu) = first_cpu(mask); \ + (cpu) < NR_CPUS; \ + (cpu) = next_cpu((cpu), (mask))) +#else /* NR_CPUS == 1 */ +#define for_each_cpu_mask(cpu, mask) for ((cpu) = 0; (cpu) < 1; (cpu)++) +#endif /* NR_CPUS */ + +/* + * The following particular system cpumasks and operations manage + * possible, present and online cpus. Each of them is a fixed size + * bitmap of size NR_CPUS. + * + * #ifdef CONFIG_HOTPLUG_CPU + * cpu_possible_map - all NR_CPUS bits set + * cpu_present_map - has bit 'cpu' set iff cpu is populated + * cpu_online_map - has bit 'cpu' set iff cpu available to scheduler + * #else + * cpu_possible_map - has bit 'cpu' set iff cpu is populated + * cpu_present_map - copy of cpu_possible_map + * cpu_online_map - has bit 'cpu' set iff cpu available to scheduler + * #endif + * + * In either case, NR_CPUS is fixed at compile time, as the static + * size of these bitmaps. The cpu_possible_map is fixed at boot + * time, as the set of CPU id's that it is possible might ever + * be plugged in at anytime during the life of that system boot. + * The cpu_present_map is dynamic(*), representing which CPUs + * are currently plugged in. And cpu_online_map is the dynamic + * subset of cpu_present_map, indicating those CPUs available + * for scheduling. + * + * If HOTPLUG is enabled, then cpu_possible_map is forced to have + * all NR_CPUS bits set, otherwise it is just the set of CPUs that + * ACPI reports present at boot. + * + * If HOTPLUG is enabled, then cpu_present_map varies dynamically, + * depending on what ACPI reports as currently plugged in, otherwise + * cpu_present_map is just a copy of cpu_possible_map. + * + * (*) Well, cpu_present_map is dynamic in the hotplug case. If not + * hotplug, it's a copy of cpu_possible_map, hence fixed at boot. + * + * Subtleties: + * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode + * assumption that their single CPU is online. The UP + * cpu_{online,possible,present}_maps are placebos. 
Changing them + * will have no useful affect on the following num_*_cpus() + * and cpu_*() macros in the UP case. This ugliness is a UP + * optimization - don't waste any instructions or memory references + * asking if you're online or how many CPUs there are if there is + * only one CPU. + * 2) Most SMP arch's #define some of these maps to be some + * other map specific to that arch. Therefore, the following + * must be #define macros, not inlines. To see why, examine + * the assembly code produced by the following. Note that + * set1() writes phys_x_map, but set2() writes x_map: + * int x_map, phys_x_map; + * #define set1(a) x_map = a + * inline void set2(int a) { x_map = a; } + * #define x_map phys_x_map + * main(){ set1(3); set2(5); } + */ + +extern cpumask_t cpu_possible_map; +extern cpumask_t cpu_online_map; extern cpumask_t cpu_present_map; -#define num_present_cpus() cpus_weight(cpu_present_map) -#define cpu_present(cpu) cpu_isset(cpu, cpu_present_map) -#define for_each_present_cpu(cpu) for_each_cpu_mask(cpu, cpu_present_map) -#define cpumask_scnprintf(buf, buflen, map) \ - bitmap_scnprintf(buf, buflen, cpus_addr(map), NR_CPUS) +#if NR_CPUS > 1 +#define num_online_cpus() cpus_weight(cpu_online_map) +#define num_possible_cpus() cpus_weight(cpu_possible_map) +#define num_present_cpus() cpus_weight(cpu_present_map) +#define cpu_online(cpu) cpu_isset((cpu), cpu_online_map) +#define cpu_possible(cpu) cpu_isset((cpu), cpu_possible_map) +#define cpu_present(cpu) cpu_isset((cpu), cpu_present_map) +#else +#define num_online_cpus() 1 +#define num_possible_cpus() 1 +#define num_present_cpus() 1 +#define cpu_online(cpu) ((cpu) == 0) +#define cpu_possible(cpu) ((cpu) == 0) +#define cpu_present(cpu) ((cpu) == 0) +#endif + +#define any_online_cpu(mask) \ +({ \ + int cpu; \ + for_each_cpu_mask(cpu, (mask)) \ + if (cpu_online(cpu)) \ + break; \ + cpu; \ +}) + +#define for_each_cpu(cpu) for_each_cpu_mask((cpu), cpu_possible_map) +#define for_each_online_cpu(cpu) for_each_cpu_mask((cpu), cpu_online_map) +#define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map) -#define cpumask_parse(buf, buflen, map) \ - bitmap_parse(buf, buflen, cpus_addr(map), NR_CPUS) +/* Begin obsolete cpumask operator emulation */ +#define cpu_isset_const(a,b) cpu_isset(a,b) +#define cpumask_const_t cpumask_t +#define cpus_coerce(m) (cpus_addr(m)[0]) +#define cpus_coerce_const cpus_coerce +#define cpus_promote(x) ({ cpumask_t m; m.bits[0] = x; m; }) +#define cpus_weight_const cpus_weight +#define first_cpu_const first_cpu +#define mk_cpumask_const(x) x +#define next_cpu_const next_cpu +/* End of obsolete cpumask operator emulation */ #endif /* __LINUX_CPUMASK_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 7282889a5fd9..f35f944abe3d 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -129,17 +129,14 @@ static void rcu_do_batch(struct rcu_head *list) */ static void rcu_start_batch(int next_pending) { - cpumask_t active; - if (next_pending) rcu_ctrlblk.next_pending = 1; if (rcu_ctrlblk.next_pending && rcu_ctrlblk.completed == rcu_ctrlblk.cur) { /* Can't change, since spin lock held. 
*/ - active = nohz_cpu_mask; - cpus_complement(active); - cpus_and(rcu_state.rcu_cpu_mask, cpu_online_map, active); + cpus_andnot(rcu_state.rcu_cpu_mask, cpu_online_map, + nohz_cpu_mask); write_seqcount_begin(&rcu_ctrlblk.lock); rcu_ctrlblk.next_pending = 0; rcu_ctrlblk.cur++; diff --git a/kernel/sched.c b/kernel/sched.c index a614d7ebf7c3..f0103ee1d66a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2951,6 +2951,11 @@ out_unlock: cpumask_t cpu_present_map; EXPORT_SYMBOL(cpu_present_map); +#ifndef CONFIG_SMP +cpumask_t cpu_online_map = CPU_MASK_ALL; +cpumask_t cpu_possible_map = CPU_MASK_ALL; +#endif + /** * sys_sched_getaffinity - get the cpu affinity of a process * @pid: pid of the process -- cgit v1.2.3 From 4b81e400a9ea37a5888434234befaf389a36e72b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 23 Jun 2004 18:52:21 -0700 Subject: [PATCH] cpumask: optimize various uses of new cpumasks From: Paul Jackson Make use of for_each_cpu_mask() macro to simplify and optimize a couple of sparc64 per-CPU loops. Optimize a bit of cpumask code for asm-i386/mach-es7000 Convert physids_complement() to use both args in the files include/asm-i386/mpspec.h, include/asm-x86_64/mpspec.h. Remove cpumask hack from asm-x86_64/topology.h routine pcibus_to_cpumask(). Clarify and slightly optimize several cpumask manipulations in kernel/sched.c Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc64/kernel/smp.c | 66 ++++++++++++--------------------- include/asm-i386/mach-es7000/mach_ipi.h | 5 +-- include/asm-i386/mpspec.h | 2 +- include/asm-x86_64/mpspec.h | 2 +- include/asm-x86_64/topology.h | 6 ++- kernel/sched.c | 18 ++++----- 6 files changed, 39 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 64b873212243..58c3792b4af1 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -406,14 +406,8 @@ static __inline__ void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, c int i; __asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate)); - for (i = 0; i < NR_CPUS; i++) { - if (cpu_isset(i, mask)) { - spitfire_xcall_helper(data0, data1, data2, pstate, i); - cpu_clear(i, mask); - if (cpus_empty(mask)) - break; - } - } + for_each_cpu_mask(i, mask) + spitfire_xcall_helper(data0, data1, data2, pstate, i); } /* Cheetah now allows to send the whole 64-bytes of data in the interrupt @@ -456,25 +450,19 @@ retry: nack_busy_id = 0; { - cpumask_t work_mask = mask; int i; - for (i = 0; i < NR_CPUS; i++) { - if (cpu_isset(i, work_mask)) { - u64 target = (i << 14) | 0x70; - - if (!is_jalapeno) - target |= (nack_busy_id << 24); - __asm__ __volatile__( - "stxa %%g0, [%0] %1\n\t" - "membar #Sync\n\t" - : /* no outputs */ - : "r" (target), "i" (ASI_INTR_W)); - nack_busy_id++; - cpu_clear(i, work_mask); - if (cpus_empty(work_mask)) - break; - } + for_each_cpu_mask(i, mask) { + u64 target = (i << 14) | 0x70; + + if (!is_jalapeno) + target |= (nack_busy_id << 24); + __asm__ __volatile__( + "stxa %%g0, [%0] %1\n\t" + "membar #Sync\n\t" + : /* no outputs */ + : "r" (target), "i" (ASI_INTR_W)); + nack_busy_id++; } } @@ -507,7 +495,6 @@ retry: printk("CPU[%d]: mondo stuckage result[%016lx]\n", smp_processor_id(), dispatch_stat); } else { - cpumask_t work_mask = mask; int i, this_busy_nack = 0; /* Delay some random time with interrupts enabled @@ -518,22 +505,17 @@ retry: /* Clear out the mask bits for cpus which did not * NACK us. 
*/ - for (i = 0; i < NR_CPUS; i++) { - if (cpu_isset(i, work_mask)) { - u64 check_mask; - - if (is_jalapeno) - check_mask = (0x2UL << (2*i)); - else - check_mask = (0x2UL << - this_busy_nack); - if ((dispatch_stat & check_mask) == 0) - cpu_clear(i, mask); - this_busy_nack += 2; - cpu_clear(i, work_mask); - if (cpus_empty(work_mask)) - break; - } + for_each_cpu_mask(i, mask) { + u64 check_mask; + + if (is_jalapeno) + check_mask = (0x2UL << (2*i)); + else + check_mask = (0x2UL << + this_busy_nack); + if ((dispatch_stat & check_mask) == 0) + cpu_clear(i, mask); + this_busy_nack += 2; } goto retry; diff --git a/include/asm-i386/mach-es7000/mach_ipi.h b/include/asm-i386/mach-es7000/mach_ipi.h index 6b7a56c1ddd5..cb8a2fdb5c59 100644 --- a/include/asm-i386/mach-es7000/mach_ipi.h +++ b/include/asm-i386/mach-es7000/mach_ipi.h @@ -10,9 +10,8 @@ static inline void send_IPI_mask(cpumask_t mask, int vector) static inline void send_IPI_allbutself(int vector) { - cpumask_t mask = cpumask_of_cpu(smp_processor_id()); - cpus_complement(mask); - cpus_and(mask, mask, cpu_online_map); + cpumask_t mask = cpu_online_map; + cpu_clear(smp_processor_id(), mask); if (!cpus_empty(mask)) send_IPI_mask(mask, vector); } diff --git a/include/asm-i386/mpspec.h b/include/asm-i386/mpspec.h index 31a9717c1a8b..8170e019af8d 100644 --- a/include/asm-i386/mpspec.h +++ b/include/asm-i386/mpspec.h @@ -53,7 +53,7 @@ typedef struct physid_mask physid_mask_t; #define physids_and(dst, src1, src2) bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS) #define physids_or(dst, src1, src2) bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS) #define physids_clear(map) bitmap_zero((map).mask, MAX_APICS) -#define physids_complement(map) bitmap_complement((map).mask, (map).mask, MAX_APICS) +#define physids_complement(dst, src) bitmap_complement((dst).mask,(src).mask, MAX_APICS) #define physids_empty(map) bitmap_empty((map).mask, MAX_APICS) #define physids_equal(map1, map2) bitmap_equal((map1).mask, (map2).mask, MAX_APICS) #define physids_weight(map) bitmap_weight((map).mask, MAX_APICS) diff --git a/include/asm-x86_64/mpspec.h b/include/asm-x86_64/mpspec.h index a9a6f16672f2..219d40acd489 100644 --- a/include/asm-x86_64/mpspec.h +++ b/include/asm-x86_64/mpspec.h @@ -212,7 +212,7 @@ typedef struct physid_mask physid_mask_t; #define physids_and(dst, src1, src2) bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS) #define physids_or(dst, src1, src2) bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS) #define physids_clear(map) bitmap_zero((map).mask, MAX_APICS) -#define physids_complement(map) bitmap_complement((map).mask, (map).mask, MAX_APICS) +#define physids_complement(dst, src) bitmap_complement((dst).mask, (src).mask, MAX_APICS) #define physids_empty(map) bitmap_empty((map).mask, MAX_APICS) #define physids_equal(map1, map2) bitmap_equal((map1).mask, (map2).mask, MAX_APICS) #define physids_weight(map) bitmap_weight((map).mask, MAX_APICS) diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index bb9a99ba2065..9310f9a1c1c5 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -20,9 +20,11 @@ extern cpumask_t node_to_cpumask[]; #define node_to_first_cpu(node) (__ffs(node_to_cpumask[node])) #define node_to_cpumask(node) (node_to_cpumask[node]) -static inline unsigned long pcibus_to_cpumask(int bus) +static inline cpumask_t pcibus_to_cpumask(int bus) { - return mp_bus_to_cpumask[bus] & cpu_online_map; + cpumask_t tmp; + cpus_and(tmp, mp_bus_to_cpumask[bus], 
cpu_online_map); + return tmp; } #define NODE_BALANCE_RATE 30 /* CHECKME */ diff --git a/kernel/sched.c b/kernel/sched.c index f0103ee1d66a..017b59b8de5e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -696,10 +696,9 @@ static int wake_idle(int cpu, task_t *p) return cpu; cpus_and(tmp, sd->span, cpu_online_map); - for_each_cpu_mask(i, tmp) { - if (!cpu_isset(i, p->cpus_allowed)) - continue; + cpus_and(tmp, tmp, p->cpus_allowed); + for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) return i; } @@ -3335,7 +3334,7 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask) runqueue_t *rq; rq = task_rq_lock(p, &flags); - if (any_online_cpu(new_mask) == NR_CPUS) { + if (!cpus_intersects(new_mask, cpu_online_map)) { ret = -EINVAL; goto out; } @@ -3510,8 +3509,7 @@ static void migrate_all_tasks(int src_cpu) if (dest_cpu == NR_CPUS) dest_cpu = any_online_cpu(tsk->cpus_allowed); if (dest_cpu == NR_CPUS) { - cpus_clear(tsk->cpus_allowed); - cpus_complement(tsk->cpus_allowed); + cpus_setall(tsk->cpus_allowed); dest_cpu = any_online_cpu(tsk->cpus_allowed); /* Don't tell them about moving exiting tasks @@ -3827,7 +3825,7 @@ void sched_domain_debug(void) int j; char str[NR_CPUS]; struct sched_group *group = sd->groups; - cpumask_t groupmask, tmp; + cpumask_t groupmask; cpumask_scnprintf(str, NR_CPUS, sd->span); cpus_clear(groupmask); @@ -3857,8 +3855,7 @@ void sched_domain_debug(void) if (!cpus_weight(group->cpumask)) printk(" ERROR empty group:"); - cpus_and(tmp, groupmask, group->cpumask); - if (cpus_weight(tmp) > 0) + if (cpus_intersects(groupmask, group->cpumask)) printk(" ERROR repeated CPUs:"); cpus_or(groupmask, groupmask, group->cpumask); @@ -3877,8 +3874,7 @@ void sched_domain_debug(void) sd = sd->parent; if (sd) { - cpus_and(tmp, groupmask, sd->span); - if (!cpus_equal(tmp, groupmask)) + if (!cpus_subset(groupmask, sd->span)) printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); } -- cgit v1.2.3 From a3dcb7f41eced06d4e43365fefd98a3b9b48e340 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 23 Jun 2004 18:52:44 -0700 Subject: [PATCH] clean up cpumask_t temporaries From: Rusty Russell Paul Jackson's cpumask tour-de-force allows us to get rid of those stupid temporaries which we used to hold CPU_MASK_ALL to hand them to functions. This used to break NR_CPUS > BITS_PER_LONG. Signed-off-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/kernel/irq.c | 3 +-- arch/ppc64/kernel/rtasd.c | 3 +-- arch/ppc64/kernel/xics.c | 6 ++---- include/asm-i386/mach-numaq/mach_apic.h | 3 +-- include/asm-i386/mach-summit/mach_apic.h | 3 +-- kernel/kmod.c | 3 +-- kernel/kthread.c | 3 +-- kernel/sched.c | 5 ++--- 8 files changed, 10 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/arch/ppc64/kernel/irq.c b/arch/ppc64/kernel/irq.c index 7335442e4178..004c3c1a034e 100644 --- a/arch/ppc64/kernel/irq.c +++ b/arch/ppc64/kernel/irq.c @@ -738,7 +738,6 @@ static int irq_affinity_write_proc (struct file *file, const char __user *buffer irq_desc_t *desc = get_irq_desc(irq); int ret; cpumask_t new_value, tmp; - cpumask_t allcpus = CPU_MASK_ALL; if (!desc->handler->set_affinity) return -EIO; @@ -753,7 +752,7 @@ static int irq_affinity_write_proc (struct file *file, const char __user *buffer * NR_CPUS == 32 and cpumask is a long), so we mask it here to * be consistent. 
*/ - cpus_and(new_value, new_value, allcpus); + cpus_and(new_value, new_value, CPU_MASK_ALL); /* * Grab lock here so cpu_online_map can't change, and also diff --git a/arch/ppc64/kernel/rtasd.c b/arch/ppc64/kernel/rtasd.c index 243d2ee8de8a..aa649a24a947 100644 --- a/arch/ppc64/kernel/rtasd.c +++ b/arch/ppc64/kernel/rtasd.c @@ -364,7 +364,6 @@ static int rtasd(void *unused) unsigned int err_type; int cpu = 0; int event_scan = rtas_token("event-scan"); - cpumask_t all = CPU_MASK_ALL; int rc; daemonize("rtasd"); @@ -419,7 +418,7 @@ static int rtasd(void *unused) for (;;) { set_cpus_allowed(current, cpumask_of_cpu(cpu)); do_event_scan(event_scan); - set_cpus_allowed(current, all); + set_cpus_allowed(current, CPU_MASK_ALL); /* Drop hotplug lock, and sleep for a bit (at least * one second since some machines have problems if we diff --git a/arch/ppc64/kernel/xics.c b/arch/ppc64/kernel/xics.c index 1d9cf20a2900..32adc8c22953 100644 --- a/arch/ppc64/kernel/xics.c +++ b/arch/ppc64/kernel/xics.c @@ -240,14 +240,13 @@ static unsigned int real_irq_to_virt(unsigned int real_irq) static int get_irq_server(unsigned int irq) { cpumask_t cpumask = irq_affinity[irq]; - cpumask_t allcpus = CPU_MASK_ALL; cpumask_t tmp = CPU_MASK_NONE; unsigned int server; #ifdef CONFIG_IRQ_ALL_CPUS /* For the moment only implement delivery to all cpus or one cpu */ if (smp_threads_ready) { - if (cpus_equal(cpumask, allcpus)) { + if (cpus_equal(cpumask, CPU_MASK_ALL)) { server = default_distrib_server; } else { cpus_and(tmp, cpu_online_map, cpumask); @@ -616,7 +615,6 @@ static void xics_set_affinity(unsigned int virq, cpumask_t cpumask) long status; unsigned long xics_status[2]; unsigned long newmask; - cpumask_t allcpus = CPU_MASK_ALL; cpumask_t tmp = CPU_MASK_NONE; irq = virt_irq_to_real(irq_offset_down(virq)); @@ -632,7 +630,7 @@ static void xics_set_affinity(unsigned int virq, cpumask_t cpumask) } /* For the moment only implement delivery to all cpus or one cpu */ - if (cpus_equal(cpumask, allcpus)) { + if (cpus_equal(cpumask, CPU_MASK_ALL)) { newmask = default_distrib_server; } else { cpus_and(tmp, cpu_online_map, cpumask); diff --git a/include/asm-i386/mach-numaq/mach_apic.h b/include/asm-i386/mach-numaq/mach_apic.h index e40c308b15d3..b852593a1c7b 100644 --- a/include/asm-i386/mach-numaq/mach_apic.h +++ b/include/asm-i386/mach-numaq/mach_apic.h @@ -8,8 +8,7 @@ static inline cpumask_t target_cpus(void) { - cpumask_t tmp = CPU_MASK_ALL; - return tmp; + return CPU_MASK_ALL; } #define TARGET_CPUS (target_cpus()) diff --git a/include/asm-i386/mach-summit/mach_apic.h b/include/asm-i386/mach-summit/mach_apic.h index 4bf36ddba96d..214263a48f71 100644 --- a/include/asm-i386/mach-summit/mach_apic.h +++ b/include/asm-i386/mach-summit/mach_apic.h @@ -19,8 +19,7 @@ static inline cpumask_t target_cpus(void) { - cpumask_t tmp = CPU_MASK_ALL; - return tmp; + return CPU_MASK_ALL; } #define TARGET_CPUS (target_cpus()) diff --git a/kernel/kmod.c b/kernel/kmod.c index ea62192b7597..579269c38a3b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -154,7 +154,6 @@ static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; int retval; - cpumask_t mask = CPU_MASK_ALL; /* Unblock all signals. */ flush_signals(current); @@ -165,7 +164,7 @@ static int ____call_usermodehelper(void *data) spin_unlock_irq(¤t->sighand->siglock); /* We can run anywhere, unlike our parent keventd(). 
*/ - set_cpus_allowed(current, mask); + set_cpus_allowed(current, CPU_MASK_ALL); retval = -EPERM; if (current->fs->root) diff --git a/kernel/kthread.c b/kernel/kthread.c index da0ec5b25cdf..5689ebb1a250 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -65,7 +65,6 @@ static int kthread(void *_create) void *data; sigset_t blocked; int ret = -EINTR; - cpumask_t mask = CPU_MASK_ALL; kthread_exit_files(); @@ -79,7 +78,7 @@ static int kthread(void *_create) flush_signals(current); /* By default we can run anywhere, unlike keventd. */ - set_cpus_allowed(current, mask); + set_cpus_allowed(current, CPU_MASK_ALL); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_INTERRUPTIBLE); diff --git a/kernel/sched.c b/kernel/sched.c index 017b59b8de5e..95f18cf8a5b6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3913,16 +3913,15 @@ void __init sched_init(void) /* Set up an initial dummy domain for early boot */ static struct sched_domain sched_domain_init; static struct sched_group sched_group_init; - cpumask_t cpu_mask_all = CPU_MASK_ALL; memset(&sched_domain_init, 0, sizeof(struct sched_domain)); - sched_domain_init.span = cpu_mask_all; + sched_domain_init.span = CPU_MASK_ALL; sched_domain_init.groups = &sched_group_init; sched_domain_init.last_balance = jiffies; sched_domain_init.balance_interval = INT_MAX; /* Don't balance */ memset(&sched_group_init, 0, sizeof(struct sched_group)); - sched_group_init.cpumask = cpu_mask_all; + sched_group_init.cpumask = CPU_MASK_ALL; sched_group_init.next = &sched_group_init; sched_group_init.cpu_power = SCHED_LOAD_SCALE; #endif -- cgit v1.2.3 From a44115192f55f8b7a22d511cc194d7880a2ba553 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 23 Jun 2004 18:54:04 -0700 Subject: [PATCH] vm: vfs shrinkage tuning Some people want the dentry and inode caches shrink harder, others want them shrunk more reluctantly. The patch adds /proc/sys/vm/vfs_cache_pressure, which tunes the vfs cache versus pagecache scanning pressure. - at vfs_cache_pressure=0 we don't shrink dcache and icache at all. - at vfs_cache_pressure=100 there is no change in behaviour. - at vfs_cache_pressure > 100 we reclaim dentries and inodes harder. The number of megabytes of slab left after a slocate.cron on my 256MB test box: vfs_cache_pressure=100000 33480 vfs_cache_pressure=10000 61996 vfs_cache_pressure=1000 104056 vfs_cache_pressure=200 166340 vfs_cache_pressure=100 190200 vfs_cache_pressure=50 206168 Of course, this just left more directory and inode pagecache behind instead of vfs cache. Interestingly, on this machine the entire slocate run fits into pagecache, but not into VFS caches. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 12 ++++++++++++ Documentation/sysctl/vm.txt | 2 +- fs/dcache.c | 5 +++-- fs/inode.c | 2 +- include/linux/dcache.h | 2 ++ include/linux/sysctl.h | 1 + kernel/sysctl.c | 12 ++++++++++++ 7 files changed, 32 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 995da97a2633..b84b7a7cc723 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1115,6 +1115,18 @@ program to load modules on demand. The files in this directory can be used to tune the operation of the virtual memory (VM) subsystem of the Linux kernel. 
+vfs_cache_pressure +------------------ + +Controls the tendency of the kernel to reclaim the memory which is used for +caching of directory and inode objects. + +At the default value of vfs_cache_pressure=100 the kernel will attempt to +reclaim dentries and inodes at a "fair" rate with respect to pagecache and +swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer +to retain dentry and inode caches. Increasing vfs_cache_pressure beyond 100 +causes the kernel to prefer to reclaim dentries and inodes. + dirty_background_ratio ---------------------- diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index fc3e413c3721..c873ef92fbfa 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -28,7 +28,7 @@ Currently, these files are in /proc/sys/vm: ============================================================== dirty_ratio, dirty_background_ratio, dirty_expire_centisecs, -dirty_writeback_centisecs: +dirty_writeback_centisecs, vfs_cache_pressure: See Documentation/filesystems/proc.txt diff --git a/fs/dcache.c b/fs/dcache.c index 02df5d45b82e..ac36b5c5fdf2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,9 +32,10 @@ #include #include -#define DCACHE_PARANOIA 1 /* #define DCACHE_DEBUG 1 */ +int sysctl_vfs_cache_pressure = 100; + spinlock_t dcache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; seqlock_t rename_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; @@ -664,7 +665,7 @@ static int shrink_dcache_memory(int nr, unsigned int gfp_mask) return -1; prune_dcache(nr); } - return dentry_stat.nr_unused; + return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } /** diff --git a/fs/inode.c b/fs/inode.c index e802a3c35fd2..ad1dd009acbc 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -488,7 +488,7 @@ static int shrink_icache_memory(int nr, unsigned int gfp_mask) if (gfp_mask & __GFP_FS) prune_icache(nr); } - return inodes_stat.nr_unused; + return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; } static void __wait_on_freeing_inode(struct inode *inode); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 72f48658a7d7..66e27328434b 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -313,6 +313,8 @@ static inline int d_mountpoint(struct dentry *dentry) extern struct vfsmount *lookup_mnt(struct vfsmount *, struct dentry *); extern struct dentry *lookup_create(struct nameidata *nd, int is_dir); +extern int sysctl_vfs_cache_pressure; + #endif /* __KERNEL__ */ #endif /* __LINUX_DCACHE_H */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index f70f7fc14498..38acd5d4b691 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -164,6 +164,7 @@ enum VM_LAPTOP_MODE=23, /* vm laptop mode */ VM_BLOCK_DUMP=24, /* block dump mode */ VM_HUGETLB_GROUP=25, /* permitted hugetlb group */ + VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7dca63a88ea2..641727bab22f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -39,6 +39,8 @@ #include #include #include +#include + #include #ifdef CONFIG_ROOT_NFS @@ -777,6 +779,16 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = VM_VFS_CACHE_PRESSURE, + .procname = "vfs_cache_pressure", + .data = &sysctl_vfs_cache_pressure, + .maxlen = sizeof(sysctl_vfs_cache_pressure), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, { .ctl_name = 0 } }; -- cgit v1.2.3 From 
97b6dbb209b4bfb029823a4d7fb46eea3ec22574 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 23 Jun 2004 19:23:48 -0700 Subject: [PATCH] Prepare for SMP suspend From: Pavel Machek Its very bad idea to freeze migration threads, as it crashes machine upon next call to "schedule()". In refrigerator, I had one "wake_up_process()" too many. This fixes it. Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/process.c | 1 - kernel/sched.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 347435415eaf..0d1f63ec7287 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -109,7 +109,6 @@ void thaw_processes(void) wake_up_process(p); } else printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); - wake_up_process(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); diff --git a/kernel/sched.c b/kernel/sched.c index 95f18cf8a5b6..9e676db0267b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3571,6 +3571,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); if (IS_ERR(p)) return NOTIFY_BAD; + p->flags |= PF_NOFREEZE; kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. */ rq = task_rq_lock(p, &flags); -- cgit v1.2.3 From 11e4e1e82c3de5869bfe18a46e2718cbdf586929 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 23 Jun 2004 19:24:10 -0700 Subject: [PATCH] consolidate in-kernel configuration From: Andy Whitcroft Being able to recover the configuration from a kernel is very useful and it would be nice to default this option to Yes. Currently, to have the config available both from the image (using extract-ikconfig) and via /proc we keep two copies of the original .config in the kernel. One in plain text and one gzip compressed. This is not optimal. This patch removes the plain text version of the configuration and updates the extraction tools to locate and use the gzip'd version of the file. This has the added bonus of providing us with the exact same results in both cases, the original .config; including the comments. Signed-off-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 18 +--------- kernel/configs.c | 25 ++++++++++---- scripts/extract-ikconfig | 87 +++++++++++++++++++++++++++--------------------- 3 files changed, 69 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 2b7b352b1a38..47f98594e9e5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -33,23 +33,7 @@ ifneq ($(CONFIG_IA64),y) CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer endif -# configs.o uses generated files - dependecies must be listed explicitly -$(obj)/configs.o: $(obj)/ikconfig.h - -ifdef CONFIG_IKCONFIG_PROC $(obj)/configs.o: $(obj)/config_data.h -endif - -# ikconfig.h contains all the selected config entries - generated -# from top-level Makefile and .config. Info from ikconfig.h can -# be extracted from the kernel binary. - -quiet_cmd_ikconfig = IKCFG $@ - cmd_ikconfig = $(CONFIG_SHELL) $< .config $(srctree)/Makefile > $@ - -targets += ikconfig.h -$(obj)/ikconfig.h: scripts/mkconfigs .config $(srctree)/Makefile FORCE - $(call if_changed,ikconfig) # config_data.h contains the same information as ikconfig.h but gzipped. 
# Info from config_data can be extracted from /proc/config* @@ -58,7 +42,7 @@ $(obj)/config_data.gz: .config FORCE $(call if_changed,gzip) quiet_cmd_ikconfiggz = IKCFG $@ - cmd_ikconfiggz = cat $< | scripts/bin2c kernel_config_data > $@ + cmd_ikconfiggz = (echo "const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call if_changed,ikconfiggz) diff --git a/kernel/configs.c b/kernel/configs.c index 326ab7b214f6..d18a944ad249 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -34,13 +34,26 @@ /**************************************************/ /* the actual current config file */ -/* This one is for extraction from the kernel binary file image. */ -#include "ikconfig.h" +/* + * Define kernel_config_data and kernel_config_data_size, which contains the + * wrapped and compressed configuration file. The file is first compressed + * with gzip and then bounded by two eight byte magic numbers to allow + * extraction from a binary kernel image: + * + * IKCFG_ST + * + * IKCFG_ED + */ +#define MAGIC_START "IKCFG_ST" +#define MAGIC_END "IKCFG_ED" +#include "config_data.h" -#ifdef CONFIG_IKCONFIG_PROC -/* This is the data that can be read from /proc/config.gz. */ -#include "config_data.h" +#define MAGIC_SIZE (sizeof(MAGIC_START) - 1) +#define kernel_config_data_size \ + (sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2) + +#ifdef CONFIG_IKCONFIG_PROC /**************************************************/ /* globals and useful constants */ @@ -58,7 +71,7 @@ ikconfig_read_current(struct file *file, char __user *buf, return 0; count = min(len, (size_t)(kernel_config_data_size - pos)); - if(copy_to_user(buf, kernel_config_data + pos, count)) + if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count)) return -EFAULT; *offset += count; diff --git a/scripts/extract-ikconfig b/scripts/extract-ikconfig index 8e526d370966..d9f9f34b22ab 100755 --- a/scripts/extract-ikconfig +++ b/scripts/extract-ikconfig @@ -1,9 +1,31 @@ -#! /bin/bash +#!/bin/sh # extracts .config info from a [b]zImage file # uses: binoffset (new), dd, zcat, strings, grep # $arg1 is [b]zImage filename -TMPFILE="" +binoffset="./scripts/binoffset" + +IKCFG_ST="0x49 0x4b 0x43 0x46 0x47 0x5f 0x53 0x54" +IKCFG_ED="0x49 0x4b 0x43 0x46 0x47 0x5f 0x45 0x44" +function dump_config { + typeset file="$1" + + start=`$binoffset $file $IKCFG_ST 2>/dev/null` + [ "$?" != "0" ] && start="-1" + if [ "$start" -eq "-1" ]; then + return + fi + end=`$binoffset $file $IKCFG_ED 2>/dev/null` + + let start="$start + 8" + let size="$end - $start" + + head --bytes="$end" "$file" | tail --bytes="$size" | zcat + + clean_up + exit 0 +} + usage() { @@ -12,8 +34,7 @@ usage() clean_up() { - if [ -z $ISCOMP ] - then + if [ "$TMPFILE" != "" ]; then rm -f $TMPFILE fi } @@ -21,46 +42,36 @@ clean_up() if [ $# -lt 1 ] then usage - exit + exit 1 fi -image=$1 +TMPFILE="/tmp/ikconfig-$$" +image="$1" -# There are two gzip headers, as well as arches which don't compress their -# kernel. 
-GZHDR="0x1f 0x8b 0x08 0x00" -if [ `binoffset $image $GZHDR >/dev/null 2>&1 ; echo $?` -ne 0 ] -then - GZHDR="0x1f 0x8b 0x08 0x08" - if [ `binoffset $image $GZHDR >/dev/null 2>&1 ; echo $?` -ne 0 ] - then - ISCOMP=0 - fi -fi +# vmlinux: Attempt to dump the configuration from the file directly +dump_config "$image" -PID=$$ +GZHDR1="0x1f 0x8b 0x08 0x00" +GZHDR2="0x1f 0x8b 0x08 0x08" -# Extract and uncompress the kernel image if necessary -if [ -z $ISCOMP ] -then - TMPFILE="/tmp/`basename $image`.vmlin.$PID" - dd if=$image bs=1 skip=`binoffset $image $GZHDR` 2> /dev/null | zcat > $TMPFILE -else - TMPFILE=$image +# vmlinux.gz: Check for a compressed images +off=`$binoffset "$image" $GZHDR1 2>/dev/null` +[ "$?" != "0" ] && off="-1" +if [ "$off" -eq "-1" ]; then + off=`$binoffset "$image" $GZHDR2 2>/dev/null` + [ "$?" != "0" ] && off="-1" fi - -# Look for strings. -strings $TMPFILE | grep "CONFIG_BEGIN=n" > /dev/null -if [ $? -eq 0 ] -then - strings $TMPFILE | awk "/CONFIG_BEGIN=n/,/CONFIG_END=n/" > $image.oldconfig.$PID -else - echo "ERROR: Unable to extract kernel configuration information." - echo " This kernel image may not have the config info." - clean_up - exit 1 +if [ "$off" -eq "0" ]; then + zcat <"$image" >"$TMPFILE" + dump_config "$TMPFILE" +elif [ "$off" -ne "-1" ]; then + (dd ibs="$off" skip=1 count=0 && dd bs=512k) <"$image" 2>/dev/null | \ + zcat >"$TMPFILE" + dump_config "$TMPFILE" fi -echo "Kernel configuration written to $image.oldconfig.$PID" +echo "ERROR: Unable to extract kernel configuration information." +echo " This kernel image may not have the config info." + clean_up -exit 0 +exit 1 -- cgit v1.2.3
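
As a brief illustration of the marker-based layout that the rewritten extract-ikconfig relies on (the gzip'd .config bounded by the eight-byte strings IKCFG_ST and IKCFG_ED described in kernel/configs.c above), the following is a minimal userspace sketch in C. It is not the kernel's scripts/binoffset tool; the file handling and helper names here are hypothetical and only show the marker-scan idea under that assumption.

/*
 * Illustrative only: scan a kernel image for the IKCFG_ST/IKCFG_ED
 * markers and report the offset and length of the embedded gzip'd
 * configuration, so it could be handed to dd and zcat.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static long find_marker(const unsigned char *buf, long len, const char *m)
{
        long mlen = strlen(m);
        long i;

        for (i = 0; i + mlen <= len; i++)
                if (memcmp(buf + i, m, mlen) == 0)
                        return i;
        return -1;
}

int main(int argc, char **argv)
{
        FILE *f;
        unsigned char *buf;
        long len, start, end;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <image>\n", argv[0]);
                return 1;
        }
        f = fopen(argv[1], "rb");
        if (!f) {
                perror("fopen");
                return 1;
        }
        fseek(f, 0, SEEK_END);
        len = ftell(f);
        rewind(f);
        buf = malloc(len);
        if (!buf || fread(buf, 1, len, f) != (size_t)len) {
                fprintf(stderr, "read failed\n");
                return 1;
        }
        fclose(f);

        start = find_marker(buf, len, "IKCFG_ST");
        end = find_marker(buf, len, "IKCFG_ED");
        if (start < 0 || end < 0 || end <= start) {
                fprintf(stderr, "no embedded config found\n");
                return 1;
        }
        /* the gzip'd .config begins right after the 8-byte start marker */
        printf("offset=%ld length=%ld\n", start + 8, end - (start + 8));
        free(buf);
        return 0;
}

The printed offset and length correspond to what the dump_config function in the patched script computes before piping the payload through zcat.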