Diffstat (limited to 'kernel')
-rw-r--r--  kernel/context.c  |  54
-rw-r--r--  kernel/ksyms.c    |   5
-rw-r--r--  kernel/sched.c    |  12
-rw-r--r--  kernel/softirq.c  |  98
-rw-r--r--  kernel/timer.c    | 445
5 files changed, 315 insertions, 299 deletions
diff --git a/kernel/context.c b/kernel/context.c index d5fbe3344f12..ae836aaf2ddf 100644 --- a/kernel/context.c +++ b/kernel/context.c @@ -28,6 +28,60 @@ static DECLARE_WAIT_QUEUE_HEAD(context_task_done); static int keventd_running; static struct task_struct *keventd_task; +static spinlock_t tqueue_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; + +typedef struct list_head task_queue; + +/* + * Queue a task on a tq. Return non-zero if it was successfully + * added. + */ +static inline int queue_task(struct tq_struct *tq, task_queue *list) +{ + int ret = 0; + unsigned long flags; + + if (!test_and_set_bit(0, &tq->sync)) { + spin_lock_irqsave(&tqueue_lock, flags); + list_add_tail(&tq->list, list); + spin_unlock_irqrestore(&tqueue_lock, flags); + ret = 1; + } + return ret; +} + +#define TQ_ACTIVE(q) (!list_empty(&q)) + +static inline void run_task_queue(task_queue *list) +{ + struct list_head head, *next; + unsigned long flags; + + if (!TQ_ACTIVE(*list)) + return; + + spin_lock_irqsave(&tqueue_lock, flags); + list_add(&head, list); + list_del_init(list); + spin_unlock_irqrestore(&tqueue_lock, flags); + + next = head.next; + while (next != &head) { + void (*f) (void *); + struct tq_struct *p; + void *data; + + p = list_entry(next, struct tq_struct, list); + next = next->next; + f = p->routine; + data = p->data; + wmb(); + p->sync = 0; + if (f) + f(data); + } +} + static int need_keventd(const char *who) { if (keventd_running == 0) diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 0409fc676f29..0b822501d9c8 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -420,12 +420,9 @@ EXPORT_SYMBOL(probe_irq_off); EXPORT_SYMBOL(del_timer_sync); #endif EXPORT_SYMBOL(mod_timer); -EXPORT_SYMBOL(tq_timer); -EXPORT_SYMBOL(tq_immediate); +EXPORT_SYMBOL(tvec_bases); #ifdef CONFIG_SMP -/* Various random spinlocks we want to export */ -EXPORT_SYMBOL(tqueue_lock); /* Big-Reader lock implementation */ EXPORT_SYMBOL(__brlock_array); diff --git a/kernel/sched.c b/kernel/sched.c index c4ba26fd875a..44d4929ca677 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -29,6 +29,7 @@ #include <linux/security.h> #include <linux/notifier.h> #include <linux/delay.h> +#include <linux/timer.h> /* * Convert user-nice values [ -20 ... 0 ... 19 ] @@ -860,6 +861,7 @@ void scheduler_tick(int user_ticks, int sys_ticks) runqueue_t *rq = this_rq(); task_t *p = current; + run_local_timers(); if (p == rq->idle) { /* note: this timer irq context must be accounted for as well */ if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET) @@ -2101,10 +2103,7 @@ __init int migration_init(void) spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; #endif -extern void init_timervecs(void); -extern void timer_bh(void); -extern void tqueue_bh(void); -extern void immediate_bh(void); +extern void init_timers(void); void __init sched_init(void) { @@ -2140,10 +2139,7 @@ void __init sched_init(void) set_task_cpu(current, smp_processor_id()); wake_up_process(current); - init_timervecs(); - init_bh(TIMER_BH, timer_bh); - init_bh(TQUEUE_BH, tqueue_bh); - init_bh(IMMEDIATE_BH, immediate_bh); + init_timers(); /* * The boot idle thread does lazy MMU switching as well: diff --git a/kernel/softirq.c b/kernel/softirq.c index 7b717b39c66e..8e1ea53cc032 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -3,21 +3,15 @@ * * Copyright (C) 1992 Linus Torvalds * - * Fixed a disable_bh()/enable_bh() race (was causing a console lockup) - * due bh_mask_count not atomic handling. Copyright (C) 1998 Andrea Arcangeli - * * Rewritten. 
Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) */ -#include <linux/config.h> -#include <linux/mm.h> #include <linux/kernel_stat.h> #include <linux/interrupt.h> -#include <linux/smp_lock.h> -#include <linux/init.h> -#include <linux/tqueue.h> -#include <linux/percpu.h> #include <linux/notifier.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> /* - No shared variables, all the data are CPU local. @@ -35,7 +29,6 @@ it is logically serialized per device, but this serialization is invisible to common code. - Tasklets: serialized wrt itself. - - Bottom halves: globally serialized, grr... */ irq_cpustat_t irq_stat[NR_CPUS]; @@ -115,10 +108,10 @@ inline void cpu_raise_softirq(unsigned int cpu, unsigned int nr) __cpu_raise_softirq(cpu, nr); /* - * If we're in an interrupt or bh, we're done - * (this also catches bh-disabled code). We will + * If we're in an interrupt or softirq, we're done + * (this also catches softirq-disabled code). We will * actually run the softirq once we return from - * the irq or bh. + * the irq or softirq. * * Otherwise we wake up ksoftirqd to make sure we * schedule the softirq soon. @@ -267,91 +260,12 @@ void tasklet_kill(struct tasklet_struct *t) clear_bit(TASKLET_STATE_SCHED, &t->state); } - - -/* Old style BHs */ - -static void (*bh_base[32])(void); -struct tasklet_struct bh_task_vec[32]; - -/* BHs are serialized by spinlock global_bh_lock. - - It is still possible to make synchronize_bh() as - spin_unlock_wait(&global_bh_lock). This operation is not used - by kernel now, so that this lock is not made private only - due to wait_on_irq(). - - It can be removed only after auditing all the BHs. - */ -spinlock_t global_bh_lock = SPIN_LOCK_UNLOCKED; - -static void bh_action(unsigned long nr) -{ - if (!spin_trylock(&global_bh_lock)) - goto resched; - - if (bh_base[nr]) - bh_base[nr](); - - hardirq_endlock(); - spin_unlock(&global_bh_lock); - return; - - spin_unlock(&global_bh_lock); -resched: - mark_bh(nr); -} - -void init_bh(int nr, void (*routine)(void)) -{ - bh_base[nr] = routine; - mb(); -} - -void remove_bh(int nr) -{ - tasklet_kill(bh_task_vec+nr); - bh_base[nr] = NULL; -} - void __init softirq_init() { - int i; - - for (i=0; i<32; i++) - tasklet_init(bh_task_vec+i, bh_action, i); - open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); } -void __run_task_queue(task_queue *list) -{ - struct list_head head, *next; - unsigned long flags; - - spin_lock_irqsave(&tqueue_lock, flags); - list_add(&head, list); - list_del_init(list); - spin_unlock_irqrestore(&tqueue_lock, flags); - - next = head.next; - while (next != &head) { - void (*f) (void *); - struct tq_struct *p; - void *data; - - p = list_entry(next, struct tq_struct, list); - next = next->next; - f = p->routine; - data = p->data; - wmb(); - p->sync = 0; - if (f) - f(data); - } -} - static int ksoftirqd(void * __bind_cpu) { int cpu = (int) (long) __bind_cpu; diff --git a/kernel/timer.c b/kernel/timer.c index 55c14c11c901..01ee9d3103b4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -14,74 +14,21 @@ * Copyright (C) 1998 Andrea Arcangeli * 1999-03-10 Improved NTP compatibility by Ulrich Windl * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love + * 2000-10-05 Implemented scalable SMP per-CPU timer handling. + * Copyright (C) 2000, 2001, 2002 Ingo Molnar + * Designed by David S. 
Miller, Alexey Kuznetsov and Ingo Molnar */ -#include <linux/config.h> -#include <linux/mm.h> -#include <linux/timex.h> -#include <linux/delay.h> -#include <linux/smp_lock.h> -#include <linux/interrupt.h> -#include <linux/tqueue.h> #include <linux/kernel_stat.h> +#include <linux/interrupt.h> +#include <linux/percpu.h> +#include <linux/init.h> +#include <linux/mm.h> #include <asm/uaccess.h> -struct kernel_stat kstat; - -/* - * Timekeeping variables - */ - -unsigned long tick_usec = TICK_USEC; /* ACTHZ period (usec) */ -unsigned long tick_nsec = TICK_NSEC(TICK_USEC); /* USER_HZ period (nsec) */ - -/* The current time */ -struct timespec xtime __attribute__ ((aligned (16))); - -/* Don't completely fail for HZ > 500. */ -int tickadj = 500/HZ ? : 1; /* microsecs */ - -DECLARE_TASK_QUEUE(tq_timer); -DECLARE_TASK_QUEUE(tq_immediate); - /* - * phase-lock loop variables - */ -/* TIME_ERROR prevents overwriting the CMOS clock */ -int time_state = TIME_OK; /* clock synchronization status */ -int time_status = STA_UNSYNC; /* clock status bits */ -long time_offset; /* time adjustment (us) */ -long time_constant = 2; /* pll time constant */ -long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ -long time_precision = 1; /* clock precision (us) */ -long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ -long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -long time_phase; /* phase offset (scaled us) */ -long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC; - /* frequency offset (scaled ppm)*/ -long time_adj; /* tick adjust (scaled 1 / HZ) */ -long time_reftime; /* time at last adjustment (s) */ - -long time_adjust; - -unsigned long event; - -extern int do_setitimer(int, struct itimerval *, struct itimerval *); - -/* - * The 64-bit jiffies value is not atomic - you MUST NOT read it - * without holding read_lock_irq(&xtime_lock). - * jiffies is defined in the linker script... 
- */ - - -unsigned int * prof_buffer; -unsigned long prof_len; -unsigned long prof_shift; - -/* - * Event timer code + * per-CPU timer vector definitions: */ #define TVN_BITS 6 #define TVR_BITS 8 @@ -90,115 +37,88 @@ unsigned long prof_shift; #define TVN_MASK (TVN_SIZE - 1) #define TVR_MASK (TVR_SIZE - 1) -struct timer_vec { +typedef struct tvec_s { int index; struct list_head vec[TVN_SIZE]; -}; +} tvec_t; -struct timer_vec_root { +typedef struct tvec_root_s { int index; struct list_head vec[TVR_SIZE]; -}; +} tvec_root_t; -static struct timer_vec tv5; -static struct timer_vec tv4; -static struct timer_vec tv3; -static struct timer_vec tv2; -static struct timer_vec_root tv1; +struct tvec_t_base_s { + spinlock_t lock; + unsigned long timer_jiffies; + volatile timer_t * volatile running_timer; + tvec_root_t tv1; + tvec_t tv2; + tvec_t tv3; + tvec_t tv4; + tvec_t tv5; +} ____cacheline_aligned_in_smp; -static struct timer_vec * const tvecs[] = { - (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5 -}; +typedef struct tvec_t_base_s tvec_base_t; -#define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0])) +static tvec_base_t tvec_bases[NR_CPUS] __cacheline_aligned; -void init_timervecs (void) -{ - int i; +/* Fake initialization needed to avoid compiler breakage */ +static DEFINE_PER_CPU(struct tasklet_struct, timer_tasklet) = { NULL }; - for (i = 0; i < TVN_SIZE; i++) { - INIT_LIST_HEAD(tv5.vec + i); - INIT_LIST_HEAD(tv4.vec + i); - INIT_LIST_HEAD(tv3.vec + i); - INIT_LIST_HEAD(tv2.vec + i); - } - for (i = 0; i < TVR_SIZE; i++) - INIT_LIST_HEAD(tv1.vec + i); -} - -static unsigned long timer_jiffies; - -static inline void internal_add_timer(struct timer_list *timer) +static inline void internal_add_timer(tvec_base_t *base, timer_t *timer) { - /* - * must be cli-ed when calling this - */ unsigned long expires = timer->expires; - unsigned long idx = expires - timer_jiffies; + unsigned long idx = expires - base->timer_jiffies; struct list_head * vec; if (idx < TVR_SIZE) { int i = expires & TVR_MASK; - vec = tv1.vec + i; + vec = base->tv1.vec + i; } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { int i = (expires >> TVR_BITS) & TVN_MASK; - vec = tv2.vec + i; + vec = base->tv2.vec + i; } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; - vec = tv3.vec + i; + vec = base->tv3.vec + i; } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; - vec = tv4.vec + i; + vec = base->tv4.vec + i; } else if ((signed long) idx < 0) { - /* can happen if you add a timer with expires == jiffies, + /* + * Can happen if you add a timer with expires == jiffies, * or you set a timer to go off in the past */ - vec = tv1.vec + tv1.index; + vec = base->tv1.vec + base->tv1.index; } else if (idx <= 0xffffffffUL) { int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; - vec = tv5.vec + i; + vec = base->tv5.vec + i; } else { /* Can only get here on architectures with 64-bit jiffies */ INIT_LIST_HEAD(&timer->list); return; } /* - * Timers are FIFO! 
+ * Timers are FIFO: */ - list_add(&timer->list, vec->prev); + list_add_tail(&timer->list, vec); } -/* Initialize both explicitly - let's try to have them in the same cache line */ -spinlock_t timerlist_lock ____cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; - -#ifdef CONFIG_SMP -volatile struct timer_list * volatile running_timer; -#define timer_enter(t) do { running_timer = t; mb(); } while (0) -#define timer_exit() do { running_timer = NULL; } while (0) -#define timer_is_running(t) (running_timer == t) -#define timer_synchronize(t) while (timer_is_running(t)) barrier() -#else -#define timer_enter(t) do { } while (0) -#define timer_exit() do { } while (0) -#endif - -void add_timer(struct timer_list *timer) +void add_timer(timer_t *timer) { - unsigned long flags; - - spin_lock_irqsave(&timerlist_lock, flags); - if (unlikely(timer_pending(timer))) - goto bug; - internal_add_timer(timer); - spin_unlock_irqrestore(&timerlist_lock, flags); - return; -bug: - spin_unlock_irqrestore(&timerlist_lock, flags); - printk(KERN_ERR "BUG: kernel timer added twice at %p.\n", - __builtin_return_address(0)); + int cpu = get_cpu(); + tvec_base_t *base = tvec_bases + cpu; + unsigned long flags; + + BUG_ON(timer_pending(timer)); + + spin_lock_irqsave(&base->lock, flags); + internal_add_timer(base, timer); + timer->base = base; + spin_unlock_irqrestore(&base->lock, flags); + put_cpu(); } -static inline int detach_timer (struct timer_list *timer) +static inline int detach_timer (timer_t *timer) { if (!timer_pending(timer)) return 0; @@ -206,28 +126,78 @@ static inline int detach_timer (struct timer_list *timer) return 1; } -int mod_timer(struct timer_list *timer, unsigned long expires) +/* + * mod_timer() has subtle locking semantics because parallel + * calls to it must happen serialized. + */ +int mod_timer(timer_t *timer, unsigned long expires) { - int ret; + tvec_base_t *old_base, *new_base; unsigned long flags; + int ret; + + if (timer_pending(timer) && timer->expires == expires) + return 1; + + local_irq_save(flags); + new_base = tvec_bases + smp_processor_id(); +repeat: + old_base = timer->base; + + /* + * Prevent deadlocks via ordering by old_base < new_base. + */ + if (old_base && (new_base != old_base)) { + if (old_base < new_base) { + spin_lock(&new_base->lock); + spin_lock(&old_base->lock); + } else { + spin_lock(&old_base->lock); + spin_lock(&new_base->lock); + } + /* + * Subtle, we rely on timer->base being always + * valid and being updated atomically. 
+ */ + if (timer->base != old_base) { + spin_unlock(&new_base->lock); + spin_unlock(&old_base->lock); + goto repeat; + } + } else + spin_lock(&new_base->lock); - spin_lock_irqsave(&timerlist_lock, flags); timer->expires = expires; ret = detach_timer(timer); - internal_add_timer(timer); - spin_unlock_irqrestore(&timerlist_lock, flags); + internal_add_timer(new_base, timer); + timer->base = new_base; + + if (old_base && (new_base != old_base)) + spin_unlock(&old_base->lock); + spin_unlock_irqrestore(&new_base->lock, flags); + return ret; } -int del_timer(struct timer_list * timer) +int del_timer(timer_t * timer) { - int ret; unsigned long flags; + tvec_base_t * base; + int ret; - spin_lock_irqsave(&timerlist_lock, flags); + if (!timer->base) + return 0; +repeat: + base = timer->base; + spin_lock_irqsave(&base->lock, flags); + if (base != timer->base) { + spin_unlock_irqrestore(&base->lock, flags); + goto repeat; + } ret = detach_timer(timer); timer->list.next = timer->list.prev = NULL; - spin_unlock_irqrestore(&timerlist_lock, flags); + spin_unlock_irqrestore(&base->lock, flags); + return ret; } @@ -240,24 +210,33 @@ int del_timer(struct timer_list * timer) * (for reference counting). */ -int del_timer_sync(struct timer_list * timer) +int del_timer_sync(timer_t * timer) { + tvec_base_t * base; int ret = 0; + if (!timer->base) + return 0; for (;;) { unsigned long flags; int running; - spin_lock_irqsave(&timerlist_lock, flags); +repeat: + base = timer->base; + spin_lock_irqsave(&base->lock, flags); + if (base != timer->base) { + spin_unlock_irqrestore(&base->lock, flags); + goto repeat; + } ret += detach_timer(timer); timer->list.next = timer->list.prev = 0; - running = timer_is_running(timer); - spin_unlock_irqrestore(&timerlist_lock, flags); + running = timer_is_running(base, timer); + spin_unlock_irqrestore(&base->lock, flags); if (!running) break; - timer_synchronize(timer); + timer_synchronize(base, timer); } return ret; @@ -265,7 +244,7 @@ int del_timer_sync(struct timer_list * timer) #endif -static inline void cascade_timers(struct timer_vec *tv) +static void cascade(tvec_base_t *base, tvec_t *tv) { /* cascade all the timers from tv up one level */ struct list_head *head, *curr, *next; @@ -277,67 +256,107 @@ static inline void cascade_timers(struct timer_vec *tv) * detach them individually, just clear the list afterwards. 
*/ while (curr != head) { - struct timer_list *tmp; + timer_t *tmp; - tmp = list_entry(curr, struct timer_list, list); + tmp = list_entry(curr, timer_t, list); + if (tmp->base != base) + BUG(); next = curr->next; list_del(curr); // not needed - internal_add_timer(tmp); + internal_add_timer(base, tmp); curr = next; } INIT_LIST_HEAD(head); tv->index = (tv->index + 1) & TVN_MASK; } -static inline void run_timer_list(void) +static void __run_timers(tvec_base_t *base) { - spin_lock_irq(&timerlist_lock); - while ((long)(jiffies - timer_jiffies) >= 0) { + unsigned long flags; + + spin_lock_irqsave(&base->lock, flags); + while ((long)(jiffies - base->timer_jiffies) >= 0) { struct list_head *head, *curr; - if (!tv1.index) { - int n = 1; - do { - cascade_timers(tvecs[n]); - } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS); + + /* + * Cascade timers: + */ + if (!base->tv1.index) { + cascade(base, &base->tv2); + if (base->tv2.index == 1) { + cascade(base, &base->tv3); + if (base->tv3.index == 1) { + cascade(base, &base->tv4); + if (base->tv4.index == 1) + cascade(base, &base->tv5); + } + } } repeat: - head = tv1.vec + tv1.index; + head = base->tv1.vec + base->tv1.index; curr = head->next; if (curr != head) { - struct timer_list *timer; void (*fn)(unsigned long); unsigned long data; + timer_t *timer; - timer = list_entry(curr, struct timer_list, list); + timer = list_entry(curr, timer_t, list); fn = timer->function; - data= timer->data; + data = timer->data; detach_timer(timer); timer->list.next = timer->list.prev = NULL; - timer_enter(timer); - spin_unlock_irq(&timerlist_lock); + timer_enter(base, timer); + spin_unlock_irq(&base->lock); fn(data); - spin_lock_irq(&timerlist_lock); - timer_exit(); + spin_lock_irq(&base->lock); + timer_exit(base); goto repeat; } - ++timer_jiffies; - tv1.index = (tv1.index + 1) & TVR_MASK; + ++base->timer_jiffies; + base->tv1.index = (base->tv1.index + 1) & TVR_MASK; } - spin_unlock_irq(&timerlist_lock); + spin_unlock_irqrestore(&base->lock, flags); } -spinlock_t tqueue_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +/******************************************************************/ -void tqueue_bh(void) -{ - run_task_queue(&tq_timer); -} +/* + * Timekeeping variables + */ +unsigned long tick_usec = TICK_USEC; /* ACTHZ period (usec) */ +unsigned long tick_nsec = TICK_NSEC(TICK_USEC); /* USER_HZ period (nsec) */ -void immediate_bh(void) -{ - run_task_queue(&tq_immediate); -} +/* The current time */ +struct timespec xtime __attribute__ ((aligned (16))); + +/* Don't completely fail for HZ > 500. */ +int tickadj = 500/HZ ? 
: 1; /* microsecs */ + +struct kernel_stat kstat; + +/* + * phase-lock loop variables + */ +/* TIME_ERROR prevents overwriting the CMOS clock */ +int time_state = TIME_OK; /* clock synchronization status */ +int time_status = STA_UNSYNC; /* clock status bits */ +long time_offset; /* time adjustment (us) */ +long time_constant = 2; /* pll time constant */ +long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ +long time_precision = 1; /* clock precision (us) */ +long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ +long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ +long time_phase; /* phase offset (scaled us) */ +long time_freq = ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC; + /* frequency offset (scaled ppm)*/ +long time_adj; /* tick adjust (scaled 1 / HZ) */ +long time_reftime; /* time at last adjustment (s) */ +long time_adjust; + +unsigned int * prof_buffer; +unsigned long prof_len; +unsigned long prof_shift; /* * this routine handles the overflow of the microsecond field @@ -638,17 +657,33 @@ unsigned long wall_jiffies; rwlock_t xtime_lock __cacheline_aligned_in_smp = RW_LOCK_UNLOCKED; unsigned long last_time_offset; +/* + * This function runs timers and the timer-tq in softirq context. + */ +static void run_timer_tasklet(unsigned long data) +{ + tvec_base_t *base = tvec_bases + smp_processor_id(); + + if ((long)(jiffies - base->timer_jiffies) >= 0) + __run_timers(base); +} + +/* + * Called by the local, per-CPU timer interrupt on SMP. + */ +void run_local_timers(void) +{ + tasklet_hi_schedule(&per_cpu(timer_tasklet, smp_processor_id())); +} + +/* + * Called by the timer interrupt. xtime_lock must already be taken + * by the timer IRQ! + */ static inline void update_times(void) { unsigned long ticks; - /* - * update_times() is run from the raw timer_bh handler so we - * just know that the irqs are locally enabled and so we don't - * need to save/restore the flags of the local CPU here. -arca - */ - write_lock_irq(&xtime_lock); - ticks = jiffies - wall_jiffies; if (ticks) { wall_jiffies += ticks; @@ -656,14 +691,13 @@ static inline void update_times(void) } last_time_offset = 0; calc_load(ticks); - write_unlock_irq(&xtime_lock); -} - -void timer_bh(void) -{ - update_times(); - run_timer_list(); } + +/* + * The 64-bit jiffies value is not atomic - you MUST NOT read it + * without holding read_lock_irq(&xtime_lock). + * jiffies is defined in the linker script... + */ void do_timer(struct pt_regs *regs) { @@ -673,13 +707,13 @@ void do_timer(struct pt_regs *regs) update_process_times(user_mode(regs)); #endif - mark_bh(TIMER_BH); - if (TQ_ACTIVE(tq_timer)) - mark_bh(TQUEUE_BH); + update_times(); } #if !defined(__alpha__) && !defined(__ia64__) +extern int do_setitimer(int, struct itimerval *, struct itimerval *); + /* * For backwards compatibility? This can be done in libc so Alpha * and all newer ports shouldn't need it. 
@@ -821,7 +855,7 @@ static void process_timeout(unsigned long __data) */ signed long schedule_timeout(signed long timeout) { - struct timer_list timer; + timer_t timer; unsigned long expire; switch (timeout) @@ -974,3 +1008,24 @@ out: return 0; } + +void __init init_timers(void) +{ + int i, j; + + for (i = 0; i < NR_CPUS; i++) { + tvec_base_t *base; + + base = tvec_bases + i; + spin_lock_init(&base->lock); + for (j = 0; j < TVN_SIZE; j++) { + INIT_LIST_HEAD(base->tv5.vec + j); + INIT_LIST_HEAD(base->tv4.vec + j); + INIT_LIST_HEAD(base->tv3.vec + j); + INIT_LIST_HEAD(base->tv2.vec + j); + } + for (j = 0; j < TVR_SIZE; j++) + INIT_LIST_HEAD(base->tv1.vec + j); + tasklet_init(&per_cpu(timer_tasklet, i), run_timer_tasklet, 0); + } +} |
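
The core of the timer.c rework above is the per-CPU timer wheel: a 256-slot root vector (tv1) for timers due within the next 256 ticks, plus four 64-slot upper vectors (tv2-tv5) whose slots are cascaded down as the wheel turns. Below is a minimal, self-contained userspace sketch of just the bucket-selection arithmetic that internal_add_timer() uses; pick_bucket(), the sample values and main() are invented for illustration and are not part of the patch.

#include <stdio.h>

/* Same wheel geometry as the patch: one 256-slot root level
 * plus four 64-slot upper levels. */
#define TVN_BITS 6
#define TVR_BITS 8
#define TVN_SIZE (1 << TVN_BITS)	/* 64  */
#define TVR_SIZE (1 << TVR_BITS)	/* 256 */
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)

/*
 * Report which level (tv1..tv5) and slot a timer lands in, given the
 * wheel's current time and the timer's expiry.  This mirrors the
 * if/else ladder of internal_add_timer(); the level/slot outputs are
 * illustrative only.
 */
static void pick_bucket(unsigned long timer_jiffies, unsigned long expires,
			int *level, int *slot)
{
	unsigned long idx = expires - timer_jiffies;

	if (idx < TVR_SIZE) {
		*level = 1;
		*slot = expires & TVR_MASK;
	} else if (idx < 1UL << (TVR_BITS + TVN_BITS)) {
		*level = 2;
		*slot = (expires >> TVR_BITS) & TVN_MASK;
	} else if (idx < 1UL << (TVR_BITS + 2 * TVN_BITS)) {
		*level = 3;
		*slot = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
	} else if (idx < 1UL << (TVR_BITS + 3 * TVN_BITS)) {
		*level = 4;
		*slot = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
	} else if ((signed long)idx < 0) {
		/* Already expired or expires == now: run on the next tick.
		 * (The kernel code uses base->tv1.index here, which tracks
		 * timer_jiffies & TVR_MASK.) */
		*level = 1;
		*slot = timer_jiffies & TVR_MASK;
	} else {
		/* (The kernel also special-cases idx > 0xffffffffUL on
		 * 64-bit jiffies; omitted in this sketch.) */
		*level = 5;
		*slot = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
	}
}

int main(void)
{
	unsigned long now = 1000;
	unsigned long samples[] = { 999, 1000, 1001, 1255, 1256, 20000, 5000000 };
	int i;

	for (i = 0; i < 7; i++) {
		int level, slot;

		pick_bucket(now, samples[i], &level, &slot);
		printf("expires=%lu -> tv%d slot %d\n", samples[i], level, slot);
	}
	return 0;
}

When tv1.index wraps back to 0, __run_timers() pulls one slot of tv2 down into tv1 via cascade() (and so on up the levels), so each timer is re-bucketed a bounded number of times rather than being kept in a sorted list that every tick must scan.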
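
mod_timer() above may have to hold two per-CPU base locks at once when a pending timer is moved to the current CPU's wheel; to keep two concurrent callers from deadlocking against each other it always takes the lower-addressed lock first. The sketch below shows that ordering rule in isolation, using pthread mutexes as stand-ins for the base spinlocks; struct base, lock_pair() and the worker loop are hypothetical and only illustrate the ordering, not the kernel API.

#include <pthread.h>
#include <stdio.h>

/* Stand-in for a per-CPU timer base: only the lock matters here. */
struct base {
	pthread_mutex_t lock;
	const char *name;
};

static struct base bases[2] = {
	{ PTHREAD_MUTEX_INITIALIZER, "cpu0" },
	{ PTHREAD_MUTEX_INITIALIZER, "cpu1" },
};

/*
 * Lock two bases in a fixed global order (by address), the same rule
 * mod_timer() applies to old_base/new_base.  Whichever thread calls
 * this, the lower-addressed lock is always taken first, so two
 * concurrent callers cannot deadlock against each other.
 */
static void lock_pair(struct base *a, struct base *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);
		return;
	}
	if (a < b) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void unlock_pair(struct base *a, struct base *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

/* Each thread repeatedly "migrates a timer" between the two bases. */
static void *worker(void *arg)
{
	struct base *from = arg;
	struct base *to = (from == &bases[0]) ? &bases[1] : &bases[0];
	int i;

	for (i = 0; i < 100000; i++) {
		lock_pair(from, to);
		/* ... move the timer from one wheel to the other ... */
		unlock_pair(from, to);
	}
	printf("done migrating %s -> %s\n", from->name, to->name);
	return NULL;
}

int main(void)
{
	pthread_t t0, t1;

	pthread_create(&t0, NULL, worker, &bases[0]);
	pthread_create(&t1, NULL, worker, &bases[1]);
	pthread_join(t0, NULL);
	pthread_join(t1, NULL);
	return 0;
}

The kernel version additionally re-reads timer->base after both locks are held and retries (the goto repeat in the patch) if the timer migrated in the meantime; the sketch omits that step since its two bases never change identity.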
