author     Andrew Morton <akpm@osdl.org>            2003-08-18 06:43:55 -0700
committer  Linus Torvalds <torvalds@home.osdl.org>  2003-08-18 06:43:55 -0700
commit     bf8cb61f2839b8885077adae5020491ec3e7f453 (patch)
tree       2cb32a3b80f9e21e0ca613c4fa63962a57940935 /kernel
parent     068a96bff3db85e237643e0e719b0563c35af8ea (diff)
[PATCH] cpumask_t: allow more than BITS_PER_LONG CPUs
From: William Lee Irwin III <wli@holomorphy.com>

Contributions from:
	Jan Dittmer <jdittmer@sfhq.hn.org>
	Arnd Bergmann <arnd@arndb.de>
	"Bryan O'Sullivan" <bos@serpentine.com>
	"David S. Miller" <davem@redhat.com>
	Badari Pulavarty <pbadari@us.ibm.com>
	"Martin J. Bligh" <mbligh@aracnet.com>
	Zwane Mwaikambo <zwane@linuxpower.ca>

It has been tested on x86, sparc64, x86_64, ia64 (I think), ppc and ppc64.

cpumask_t enables systems with NR_CPUS > BITS_PER_LONG to utilize all their
cpus by creating an abstract data type dedicated to representing cpu
bitmasks, similar to fd sets from userspace, and sweeping the appropriate
code to update callers to the access API.  The fd set-like structure is
according to Linus' own suggestion; the macro calling convention to
ambiguate representations with minimal code impact is my own invention.

Specifically, a new set of inline functions for manipulating arbitrary-width
bitmaps is introduced with a relatively simple implementation, in tandem
with a new data type representing bitmaps of width NR_CPUS, cpumask_t, whose
accessor functions are defined in terms of the bitmap manipulation inlines.
This bitmap ADT found an additional use in i386 arch code handling sparse
physical APIC IDs, which was convenient to use in this case as the
accounting structure was required to be wider to accommodate the physids
consumed by larger numbers of cpus.

For the sake of simplicity and low code impact, these cpu bitmasks are
passed primarily by value; however, an additional set of accessors along
with an auxiliary data type with const call-by-reference semantics is
provided to address performance concerns raised in connection with very
large systems, such as SGI's larger models, where copying and call-by-value
overhead would be prohibitive.  Few (if any) users of the call-by-reference
API are immediately introduced.

Also, in order to avoid calling convention overhead on architectures where
structures are required to be passed by value, NR_CPUS <= BITS_PER_LONG is
special-cased so that cpumask_t falls back to an unsigned long and the
accessors perform the usual bit twiddling on unsigned longs as opposed to
arrays thereof.  Audits were done with the structure overhead in-place,
restoring this special-casing only afterward so as to ensure a more complete
API conversion while undergoing the majority of its end-user exposure in -mm.
More -mm's were shipped after its restoration to be sure that was tested, too.

The immediate users of this functionality are Sun sparc64 systems, SGI mips64
and ia64 systems, and IBM ia32, ppc64, and s390 systems.  Of these, only the
ppc64 machines needing the functionality have yet to be released; all others
have had systems requiring it for full functionality for at least 6 months,
and in some cases, since the initial Linux port to the affected architecture.
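At its core the type described above is a fixed-width bitmap wrapped in a
struct, with accessors that hide whether the representation is an array of
longs or a single word.  The userspace sketch below is not code from this
patch: the _demo names, NR_CPUS_DEMO, and the pointer-based helpers are
inventions for illustration only (the kernel accessors introduced here are
macros that mostly take the mask by value, as noted above).

	/*
	 * Minimal sketch of the cpumask idea: an array of unsigned longs wide
	 * enough to hold NR_CPUS_DEMO bits, plus fd_set-style accessors.
	 */
	#include <stdio.h>
	#include <string.h>
	#include <limits.h>

	#define NR_CPUS_DEMO 128	/* pretend NR_CPUS > BITS_PER_LONG */
	#define BITS_PER_LONG_DEMO (CHAR_BIT * sizeof(unsigned long))
	#define MASK_LONGS ((NR_CPUS_DEMO + BITS_PER_LONG_DEMO - 1) / BITS_PER_LONG_DEMO)

	typedef struct { unsigned long bits[MASK_LONGS]; } cpumask_demo_t;

	static void cpus_clear_demo(cpumask_demo_t *m)
	{
		memset(m->bits, 0, sizeof(m->bits));
	}

	static void cpu_set_demo(int cpu, cpumask_demo_t *m)
	{
		m->bits[cpu / BITS_PER_LONG_DEMO] |= 1UL << (cpu % BITS_PER_LONG_DEMO);
	}

	static void cpu_clear_demo(int cpu, cpumask_demo_t *m)
	{
		m->bits[cpu / BITS_PER_LONG_DEMO] &= ~(1UL << (cpu % BITS_PER_LONG_DEMO));
	}

	static int cpu_isset_demo(int cpu, const cpumask_demo_t *m)
	{
		return (m->bits[cpu / BITS_PER_LONG_DEMO] >> (cpu % BITS_PER_LONG_DEMO)) & 1;
	}

	static int cpus_empty_demo(const cpumask_demo_t *m)
	{
		unsigned long acc = 0;
		for (size_t i = 0; i < MASK_LONGS; i++)
			acc |= m->bits[i];
		return acc == 0;
	}

	/* Build a mask with exactly one bit set, returned by value. */
	static cpumask_demo_t cpumask_of_cpu_demo(int cpu)
	{
		cpumask_demo_t m;
		cpus_clear_demo(&m);
		cpu_set_demo(cpu, &m);
		return m;
	}

	int main(void)
	{
		/* cpu 70 lives above bit 63, i.e. beyond a single long */
		cpumask_demo_t allowed = cpumask_of_cpu_demo(70);

		printf("cpu 70 set: %d, empty: %d\n",
		       cpu_isset_demo(70, &allowed), cpus_empty_demo(&allowed));
		cpu_clear_demo(70, &allowed);
		printf("after clear, empty: %d\n", cpus_empty_demo(&allowed));
		return 0;
	}

If NR_CPUS_DEMO fit in one long, the struct could collapse to a plain
unsigned long and each helper to a single shift-and-mask, which is the
NR_CPUS <= BITS_PER_LONG special case the message describes for keeping
calling-convention overhead down.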
Diffstat (limited to 'kernel')
 kernel/fork.c      |  2
 kernel/module.c    |  6
 kernel/rcupdate.c  | 12
 kernel/sched.c     | 36
 kernel/softirq.c   |  5
 kernel/workqueue.c |  2
 6 files changed, 31 insertions(+), 32 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c
index 1164d51d897f..b65c19fe2dce 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -233,7 +233,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->map_count = 0;
mm->rss = 0;
- mm->cpu_vm_mask = 0;
+ cpus_clear(mm->cpu_vm_mask);
pprev = &mm->mmap;
/*
diff --git a/kernel/module.c b/kernel/module.c
index 58d73701bbde..8a67a83ecc1f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -482,7 +482,7 @@ static int stopref(void *cpu)
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
setscheduler(current->pid, SCHED_FIFO, &param);
#endif
- set_cpus_allowed(current, 1UL << (unsigned long)cpu);
+ set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu));
/* Ack: we are alive */
atomic_inc(&stopref_thread_ack);
@@ -535,7 +535,7 @@ static void stopref_set_state(enum stopref_state state, int sleep)
static int stop_refcounts(void)
{
unsigned int i, cpu;
- unsigned long old_allowed;
+ cpumask_t old_allowed;
int ret = 0;
/* One thread per cpu. We'll do our own. */
@@ -543,7 +543,7 @@ static int stop_refcounts(void)
/* FIXME: racy with set_cpus_allowed. */
old_allowed = current->cpus_allowed;
- set_cpus_allowed(current, 1UL << (unsigned long)cpu);
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
atomic_set(&stopref_thread_ack, 0);
stopref_num_threads = 0;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 49c3ab0a2a76..113bae2ae4e5 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -48,7 +48,7 @@
/* Definition for rcupdate control block. */
struct rcu_ctrlblk rcu_ctrlblk =
{ .mutex = SPIN_LOCK_UNLOCKED, .curbatch = 1,
- .maxbatch = 1, .rcu_cpu_mask = 0 };
+ .maxbatch = 1, .rcu_cpu_mask = CPU_MASK_NONE };
DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
/* Fake initialization required by compiler */
@@ -107,7 +107,7 @@ static void rcu_start_batch(long newbatch)
rcu_ctrlblk.maxbatch = newbatch;
}
if (rcu_batch_before(rcu_ctrlblk.maxbatch, rcu_ctrlblk.curbatch) ||
- (rcu_ctrlblk.rcu_cpu_mask != 0)) {
+ !cpus_empty(rcu_ctrlblk.rcu_cpu_mask)) {
return;
}
rcu_ctrlblk.rcu_cpu_mask = cpu_online_map;
@@ -122,7 +122,7 @@ static void rcu_check_quiescent_state(void)
{
int cpu = smp_processor_id();
- if (!test_bit(cpu, &rcu_ctrlblk.rcu_cpu_mask))
+ if (!cpu_isset(cpu, rcu_ctrlblk.rcu_cpu_mask))
return;
/*
@@ -138,12 +138,12 @@ static void rcu_check_quiescent_state(void)
return;
spin_lock(&rcu_ctrlblk.mutex);
- if (!test_bit(cpu, &rcu_ctrlblk.rcu_cpu_mask))
+ if (!cpu_isset(cpu, rcu_ctrlblk.rcu_cpu_mask))
goto out_unlock;
- clear_bit(cpu, &rcu_ctrlblk.rcu_cpu_mask);
+ cpu_clear(cpu, rcu_ctrlblk.rcu_cpu_mask);
RCU_last_qsctr(cpu) = RCU_QSCTR_INVALID;
- if (rcu_ctrlblk.rcu_cpu_mask != 0)
+ if (!cpus_empty(rcu_ctrlblk.rcu_cpu_mask))
goto out_unlock;
rcu_ctrlblk.curbatch++;
diff --git a/kernel/sched.c b/kernel/sched.c
index 422de31370b5..7a0dc4b1c434 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -489,7 +489,7 @@ repeat_lock_task:
*/
if (unlikely(sync && !task_running(rq, p) &&
(task_cpu(p) != smp_processor_id()) &&
- (p->cpus_allowed & (1UL << smp_processor_id())))) {
+ cpu_isset(smp_processor_id(), p->cpus_allowed))) {
set_task_cpu(p, smp_processor_id());
task_rq_unlock(rq, &flags);
@@ -775,13 +775,13 @@ static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
*/
static void sched_migrate_task(task_t *p, int dest_cpu)
{
- unsigned long old_mask;
+ cpumask_t old_mask;
old_mask = p->cpus_allowed;
- if (!(old_mask & (1UL << dest_cpu)))
+ if (!cpu_isset(dest_cpu, old_mask))
return;
/* force the process onto the specified CPU */
- set_cpus_allowed(p, 1UL << dest_cpu);
+ set_cpus_allowed(p, cpumask_of_cpu(dest_cpu));
/* restore the cpus allowed mask */
set_cpus_allowed(p, old_mask);
@@ -794,7 +794,7 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
static int sched_best_cpu(struct task_struct *p)
{
int i, minload, load, best_cpu, node = 0;
- unsigned long cpumask;
+ cpumask_t cpumask;
best_cpu = task_cpu(p);
if (cpu_rq(best_cpu)->nr_running <= 2)
@@ -818,7 +818,7 @@ static int sched_best_cpu(struct task_struct *p)
minload = 10000000;
cpumask = node_to_cpumask(node);
for (i = 0; i < NR_CPUS; ++i) {
- if (!(cpumask & (1UL << i)))
+ if (!cpu_isset(i, cpumask))
continue;
if (cpu_rq(i)->nr_running < minload) {
best_cpu = i;
@@ -905,7 +905,7 @@ static inline unsigned int double_lock_balance(runqueue_t *this_rq,
/*
* find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
*/
-static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance, unsigned long cpumask)
+static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance, cpumask_t cpumask)
{
int nr_running, load, max_load, i;
runqueue_t *busiest, *rq_src;
@@ -940,7 +940,7 @@ static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu,
busiest = NULL;
max_load = 1;
for (i = 0; i < NR_CPUS; i++) {
- if (!((1UL << i) & cpumask))
+ if (!cpu_isset(i, cpumask))
continue;
rq_src = cpu_rq(i);
@@ -1012,7 +1012,7 @@ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t
* We call this with the current runqueue locked,
* irqs disabled.
*/
-static void load_balance(runqueue_t *this_rq, int idle, unsigned long cpumask)
+static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask)
{
int imbalance, idx, this_cpu = smp_processor_id();
runqueue_t *busiest;
@@ -1066,7 +1066,7 @@ skip_queue:
#define CAN_MIGRATE_TASK(p,rq,this_cpu) \
((!idle || (jiffies - (p)->last_run > cache_decay_ticks)) && \
!task_running(rq, p) && \
- ((p)->cpus_allowed & (1UL << (this_cpu))))
+ cpu_isset(this_cpu, (p)->cpus_allowed))
curr = curr->prev;
@@ -1109,10 +1109,10 @@ out:
static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
{
int node = find_busiest_node(cpu_to_node(this_cpu));
- unsigned long cpumask, this_cpumask = 1UL << this_cpu;
if (node >= 0) {
- cpumask = node_to_cpumask(node) | this_cpumask;
+ cpumask_t cpumask = node_to_cpumask(node);
+ cpu_set(this_cpu, cpumask);
spin_lock(&this_rq->lock);
load_balance(this_rq, idle, cpumask);
spin_unlock(&this_rq->lock);
@@ -1923,7 +1923,7 @@ out_unlock:
asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
unsigned long __user *user_mask_ptr)
{
- unsigned long new_mask;
+ cpumask_t new_mask;
int retval;
task_t *p;
@@ -1971,7 +1971,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
unsigned long __user *user_mask_ptr)
{
unsigned int real_len;
- unsigned long mask;
+ cpumask_t mask;
int retval;
task_t *p;
@@ -1987,7 +1987,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
goto out_unlock;
retval = 0;
- mask = p->cpus_allowed & cpu_online_map;
+ cpus_and(mask, p->cpus_allowed, cpu_online_map);
out_unlock:
read_unlock(&tasklist_lock);
@@ -2317,7 +2317,7 @@ typedef struct {
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
-int set_cpus_allowed(task_t *p, unsigned long new_mask)
+int set_cpus_allowed(task_t *p, cpumask_t new_mask)
{
unsigned long flags;
migration_req_t req;
@@ -2332,7 +2332,7 @@ int set_cpus_allowed(task_t *p, unsigned long new_mask)
* Can the task run on the task's current CPU? If not then
* migrate the thread off to a proper CPU.
*/
- if (new_mask & (1UL << task_cpu(p))) {
+ if (cpu_isset(task_cpu(p), new_mask)) {
task_rq_unlock(rq, &flags);
return 0;
}
@@ -2402,7 +2402,7 @@ static int migration_thread(void * data)
* migration thread on this CPU, guaranteed (we're started
* serially).
*/
- set_cpus_allowed(current, 1UL << cpu);
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
ret = setscheduler(0, SCHED_FIFO, &param);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 96294a3d673f..da14854d652b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -322,9 +322,8 @@ static int ksoftirqd(void * __bind_cpu)
current->flags |= PF_IOTHREAD;
/* Migrate to the right CPU */
- set_cpus_allowed(current, 1UL << cpu);
- if (smp_processor_id() != cpu)
- BUG();
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ BUG_ON(smp_processor_id() != cpu);
__set_current_state(TASK_INTERRUPTIBLE);
mb();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index efeaaa4f0274..7b77fb75dc07 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -176,7 +176,7 @@ static int worker_thread(void *__startup)
cwq->thread = current;
set_user_nice(current, -10);
- set_cpus_allowed(current, 1UL << cpu);
+ set_cpus_allowed(current, cpumask_of_cpu(cpu));
complete(&startup->done);