From 9b65d113fe1cd4961ce94127584413cefd49aaed Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 28 Mar 2005 03:19:14 -0800 Subject: [PATCH] mm counter operations through macros This patch extracts all the operations on counters protected by the page table lock (currently rss and anon_rss) into definitions in include/linux/sched.h. All rss operations are performed through the following macros: get_mm_counter(mm, member) -> Obtain the value of a counter set_mm_counter(mm, member, value) -> Set the value of a counter update_mm_counter(mm, member, value) -> Add to a counter inc_mm_counter(mm, member) -> Increment a counter dec_mm_counter(mm, member) -> Decrement a counter With this patch it becomes easier to add new counters, and it is possible to redefine the method of counter handling. The counters are a scalability concern since they are touched in frequently used code paths and may cause cache line bouncing. For example, one may not use counters at all and count the pages when needed, switch to atomic operations if the mm_struct locking changes, or split the rss into counters that can be locally incremented. The relevant fields of the mm_struct are renamed with a leading underscore to catch out people who are not using the accessor macros. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 2 +- kernel/fork.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 035669624b6c..4168f631868e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -542,7 +542,7 @@ void acct_update_integrals(struct task_struct *tsk) if (delta == 0) return; tsk->acct_stimexpd = tsk->stime; - tsk->acct_rss_mem1 += delta * tsk->mm->rss; + tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; } } diff --git a/kernel/fork.c b/kernel/fork.c index 5b67b3ebf3c0..f42a17f88699 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -195,8 +195,8 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) mm->mmap_cache = NULL; mm->free_area_cache = oldmm->mmap_base; mm->map_count = 0; - mm->rss = 0; - mm->anon_rss = 0; + set_mm_counter(mm, rss, 0); + set_mm_counter(mm, anon_rss, 0); cpus_clear(mm->cpu_vm_mask); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; @@ -492,7 +492,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) if (retval) goto free_pt; - mm->hiwater_rss = mm->rss; + mm->hiwater_rss = get_mm_counter(mm,rss); mm->hiwater_vm = mm->total_vm; good_mm: -- cgit v1.2.3 From a027980e3ee6e032e919bdfa670afff861dc358f Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 28 Mar 2005 03:25:45 -0800 Subject: [PATCH] ppc64: fix linkage error on G5 Move the ppc64-specific cond_syscall(ppc_rtas) into sys_ni.c so that it takes effect. With this fixed we can remove the #define hack. Signed-off-by: Anton Blanchard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/kernel/misc.S | 4 ---- arch/ppc64/kernel/syscalls.c | 3 --- kernel/sys_ni.c | 1 + 3 files changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/ppc64/kernel/misc.S b/arch/ppc64/kernel/misc.S index 306e1879ad78..90b41f48d21c 100644 --- a/arch/ppc64/kernel/misc.S +++ b/arch/ppc64/kernel/misc.S @@ -680,10 +680,6 @@ _GLOBAL(kernel_thread) ld r30,-16(r1) blr -#ifdef CONFIG_PPC_RTAS /* hack hack hack */ -#define ppc_rtas sys_ni_syscall -#endif - /* Why isn't this a) automatic, b) written in 'C'?
*/ .balign 8 _GLOBAL(sys_call_table32) diff --git a/arch/ppc64/kernel/syscalls.c b/arch/ppc64/kernel/syscalls.c index 365987190d71..f2865ff8d2f9 100644 --- a/arch/ppc64/kernel/syscalls.c +++ b/arch/ppc64/kernel/syscalls.c @@ -256,6 +256,3 @@ void do_show_syscall_exit(unsigned long r3) { printk(" -> %lx, current=%p cpu=%d\n", r3, current, smp_processor_id()); } - -/* Only exists on P-series. */ -cond_syscall(ppc_rtas); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 623eaf517534..1802a311dd3f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -83,3 +83,4 @@ cond_syscall(sys_pciconfig_write); cond_syscall(sys_pciconfig_iobase); cond_syscall(sys32_ipc); cond_syscall(sys32_sysctl); +cond_syscall(ppc_rtas); -- cgit v1.2.3 From 3dbbe74b828f525fba77aeb6deab49a5bd108b7b Mon Sep 17 00:00:00 2001 From: George Anzinger Date: Mon, 28 Mar 2005 03:28:42 -0800 Subject: [PATCH] x86: CMOS time update optimisation This patch changes the update of the CMOS clock to be timer driven rather than poll driven by the timer interrupt function. If the clock is not being synced to an outside source, the timer is removed and thus system overhead is nil in that case. The update frequency is still ~11 minutes, and missing the update window still causes a retry in 60 seconds. We want the calls to sync_cmos_clock() to be made in a consistent environment. This was not true when calling it directly from the NTP call code. The change means that sync_cmos_clock() is ALWAYS called from run_timers(), i.e. as a timer callback function. Also, call the timer code only through the timer interface (set a short timer to do it from the ntp call). Signed-off-by: George Anzinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/time.c | 75 ++++++++++++++++++++++++++++++++++--------------- kernel/time.c | 9 ++++++ 2 files changed, 62 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 6b5c98f3683a..9b55e30e4490 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -204,19 +204,19 @@ static int set_rtc_mmss(unsigned long nowtime) { int retval; + WARN_ON(irqs_disabled()); + /* gets recalled with irq locally disabled */ - spin_lock(&rtc_lock); + spin_lock_irq(&rtc_lock); if (efi_enabled) retval = efi_set_rtc_mmss(nowtime); else retval = mach_set_rtc_mmss(nowtime); - spin_unlock(&rtc_lock); + spin_unlock_irq(&rtc_lock); return retval; } -/* last time the cmos clock got updated */ -static long last_rtc_update; int timer_ack; @@ -268,24 +268,6 @@ static inline void do_timer_interrupt(int irq, void *dev_id, do_timer_interrupt_hook(regs); - /* - * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - */ - if ((time_status & STA_UNSYNC) == 0 && - xtime.tv_sec > last_rtc_update + 660 && - (xtime.tv_nsec / 1000) - >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && - (xtime.tv_nsec / 1000) - <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) { - last_rtc_update = xtime.tv_sec; - if (efi_enabled) { - if (efi_set_rtc_mmss(xtime.tv_sec)) - last_rtc_update -= 600; - } else if (set_rtc_mmss(xtime.tv_sec)) - last_rtc_update -= 600; - } if (MCA_bus) { /* The PS/2 uses level-triggered interrupts.
You can't @@ -342,6 +324,55 @@ unsigned long get_cmos_time(void) return retval; } +static void sync_cmos_clock(unsigned long dummy); + +static struct timer_list sync_cmos_timer = + TIMER_INITIALIZER(sync_cmos_clock, 0, 0); + +static void sync_cmos_clock(unsigned long dummy) +{ + struct timeval now, next; + int fail = 1; + + /* + * If we have an externally synchronized Linux clock, then update + * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be + * called as close as possible to 500 ms before the new second starts. + * This code is run on a timer. If the clock is set, that timer + * may not expire at the correct time. Thus, we adjust... + */ + if ((time_status & STA_UNSYNC) != 0) + /* + * Not synced, exit, do not restart a timer (if one is + * running, let it run out). + */ + return; + + do_gettimeofday(&now); + if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 && + now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) + fail = set_rtc_mmss(now.tv_sec); + + next.tv_usec = USEC_AFTER - now.tv_usec; + if (next.tv_usec <= 0) + next.tv_usec += USEC_PER_SEC; + + if (!fail) + next.tv_sec = 659; + else + next.tv_sec = 0; + + if (next.tv_usec >= USEC_PER_SEC) { + next.tv_sec++; + next.tv_usec -= USEC_PER_SEC; + } + mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next)); +} + +void notify_arch_cmos_timer(void) +{ + mod_timer(&sync_cmos_timer, jiffies + 1); +} static long clock_cmos_diff, sleep_start; diff --git a/kernel/time.c b/kernel/time.c index d5400f6af052..2c0b90d79d4c 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -215,6 +215,14 @@ long pps_stbcnt; /* stability limit exceeded */ /* hook for a loadable hardpps kernel module */ void (*hardpps_ptr)(struct timeval *); +/* we call this to notify the arch when the clock is being + * controlled. If no such arch routine, do nothing. + */ +void __attribute__ ((weak)) notify_arch_cmos_timer(void) +{ + return; +} + /* adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ @@ -398,6 +406,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 txc->stbcnt = pps_stbcnt; write_sequnlock_irq(&xtime_lock); do_gettimeofday(&txc->time); + notify_arch_cmos_timer(); return(result); } -- cgit v1.2.3 From ca0a9bc2cf3937968f1a4395b8e8711c150afe96 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 28 Mar 2005 03:42:22 -0800 Subject: [PATCH] swsusp: Add missing refrigerator calls This adds a few more places where it is possible to freeze kernel threads.
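The freezer idiom these hunks add is the same in every thread. As a rough sketch of a kernel-thread main loop using the 2.6.12-era freezer API (PF_FREEZE and refrigerator(); some call sites below use the try_to_freeze(PF_FREEZE) wrapper, which performs the same check) — the thread function and do_work() here are hypothetical placeholders, not code from this patch:

    static int my_kthread(void *unused)
    {
            while (!kthread_should_stop()) {
                    set_current_state(TASK_INTERRUPTIBLE);
                    schedule_timeout(HZ);

                    /*
                     * If a suspend is in progress, park in the
                     * refrigerator until thawed; this is the call the
                     * hunks below add to each daemon.
                     */
                    if (current->flags & PF_FREEZE)
                            refrigerator(PF_FREEZE);

                    if (signal_pending(current))
                            break;

                    do_work();      /* hypothetical work function */
            }
            return 0;
    }
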
From: Nigel Cunningham Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/media/video/msp3400.c | 8 ++++++-- drivers/media/video/tvaudio.c | 1 + drivers/pnp/pnpbios/core.c | 6 +++++- fs/afs/kafsasyncd.c | 2 ++ fs/afs/kafstimod.c | 2 ++ fs/lockd/clntproc.c | 1 + kernel/signal.c | 2 ++ net/rxrpc/krxiod.c | 2 ++ net/rxrpc/krxsecd.c | 2 ++ net/rxrpc/krxtimod.c | 2 ++ net/sunrpc/svcsock.c | 1 + 11 files changed, 26 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/drivers/media/video/msp3400.c b/drivers/media/video/msp3400.c index 040d9312e37d..d996ec99caff 100644 --- a/drivers/media/video/msp3400.c +++ b/drivers/media/video/msp3400.c @@ -734,6 +734,7 @@ static int msp34xx_sleep(struct msp3400c *msp, int timeout) { DECLARE_WAITQUEUE(wait, current); +again: add_wait_queue(&msp->wq, &wait); if (!kthread_should_stop()) { if (timeout < 0) { @@ -749,9 +750,12 @@ static int msp34xx_sleep(struct msp3400c *msp, int timeout) #endif } } - if (current->flags & PF_FREEZE) - refrigerator(PF_FREEZE); + remove_wait_queue(&msp->wq, &wait); + + if (try_to_freeze(PF_FREEZE)) + goto again; + return msp->restart; } diff --git a/drivers/media/video/tvaudio.c b/drivers/media/video/tvaudio.c index 540a6b1461f7..065eb4007b1d 100644 --- a/drivers/media/video/tvaudio.c +++ b/drivers/media/video/tvaudio.c @@ -286,6 +286,7 @@ static int chip_thread(void *data) schedule(); } remove_wait_queue(&chip->wq, &wait); + try_to_freeze(PF_FREEZE); if (chip->done || signal_pending(current)) break; dprintk("%s: thread wakeup\n", i2c_clientname(&chip->c)); diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index 355242f64c99..0f6330b3af12 100644 --- a/drivers/pnp/pnpbios/core.c +++ b/drivers/pnp/pnpbios/core.c @@ -180,8 +180,12 @@ static int pnp_dock_thread(void * unused) * Poll every 2 seconds */ msleep_interruptible(2000); - if(signal_pending(current)) + + if(signal_pending(current)) { + if (try_to_freeze(PF_FREEZE)) + continue; break; + } status = pnp_bios_dock_station_info(&now); diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c index e179e17acdc6..6fc88ae8ad94 100644 --- a/fs/afs/kafsasyncd.c +++ b/fs/afs/kafsasyncd.c @@ -116,6 +116,8 @@ static int kafsasyncd(void *arg) remove_wait_queue(&kafsasyncd_sleepq, &myself); set_current_state(TASK_RUNNING); + try_to_freeze(PF_FREEZE); + /* discard pending signals */ afs_discard_my_signals(); diff --git a/fs/afs/kafstimod.c b/fs/afs/kafstimod.c index e8bbffee5866..86e710dd057e 100644 --- a/fs/afs/kafstimod.c +++ b/fs/afs/kafstimod.c @@ -91,6 +91,8 @@ static int kafstimod(void *arg) complete_and_exit(&kafstimod_dead, 0); } + try_to_freeze(PF_FREEZE); + /* discard pending signals */ afs_discard_my_signals(); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 8494023e48eb..a4407619b1f1 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -312,6 +312,7 @@ static int nlm_wait_on_grace(wait_queue_head_t *queue) prepare_to_wait(queue, &wait, TASK_INTERRUPTIBLE); if (!signalled ()) { schedule_timeout(NLMCLNT_GRACE_WAIT); + try_to_freeze(PF_FREEZE); if (!signalled ()) status = 0; } diff --git a/kernel/signal.c b/kernel/signal.c index d60b47641b79..0dfc74ccf61a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2219,6 +2219,8 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, current->state = TASK_INTERRUPTIBLE; timeout = schedule_timeout(timeout); + if (current->flags & PF_FREEZE) + refrigerator(PF_FREEZE); spin_lock_irq(&current->sighand->siglock); sig = dequeue_signal(current, &these,
&info); current->blocked = current->real_blocked; diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c index c987395e5783..2b537f425a17 100644 --- a/net/rxrpc/krxiod.c +++ b/net/rxrpc/krxiod.c @@ -138,6 +138,8 @@ static int rxrpc_krxiod(void *arg) _debug("### End Work"); + try_to_freeze(PF_FREEZE); + /* discard pending signals */ rxrpc_discard_my_signals(); diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c index 117aa91537a1..6020c89d9228 100644 --- a/net/rxrpc/krxsecd.c +++ b/net/rxrpc/krxsecd.c @@ -107,6 +107,8 @@ static int rxrpc_krxsecd(void *arg) _debug("### End Inbound Calls"); + try_to_freeze(PF_FREEZE); + /* discard pending signals */ rxrpc_discard_my_signals(); diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c index 0d9e9d2f8eed..249c2b0290bb 100644 --- a/net/rxrpc/krxtimod.c +++ b/net/rxrpc/krxtimod.c @@ -90,6 +90,8 @@ static int krxtimod(void *arg) complete_and_exit(&krxtimod_dead, 0); } + try_to_freeze(PF_FREEZE); + /* discard pending signals */ rxrpc_discard_my_signals(); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 7794c16d84bb..05907035bc96 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1186,6 +1186,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) arg->len = (pages-1)*PAGE_SIZE; arg->tail[0].iov_len = 0; + try_to_freeze(PF_FREEZE); if (signalled()) return -EINTR; -- cgit v1.2.3 From c7b2e321ebc22a62871a7b7245db303591038b01 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 28 Mar 2005 03:43:12 -0800 Subject: [PATCH] swsusp: small updates This kills an unused macro and a write-only variable, and adds messages where something goes wrong with suspending devices. Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/suspend.h | 2 -- kernel/power/main.c | 8 ++++++-- kernel/power/swsusp.c | 7 +++---- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 33917ccee13b..2bf0d5fabcdb 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -34,8 +34,6 @@ typedef struct pbe { #define SWAP_FILENAME_MAXLENGTH 32 -#define SUSPEND_PD_PAGES(x) (((x)*sizeof(struct pbe))/PAGE_SIZE+1) - extern dev_t swsusp_resume_device; /* mm/vmscan.c */ diff --git a/kernel/power/main.c b/kernel/power/main.c index b0315cbad9b9..7960ddf04a57 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -65,8 +65,10 @@ static int suspend_prepare(suspend_state_t state) goto Thaw; } - if ((error = device_suspend(PMSG_SUSPEND))) + if ((error = device_suspend(PMSG_SUSPEND))) { + printk(KERN_ERR "Some devices failed to suspend\n"); goto Finish; + } return 0; Finish: if (pm_ops->finish) @@ -85,8 +87,10 @@ static int suspend_enter(suspend_state_t state) local_irq_save(flags); - if ((error = device_power_down(PMSG_SUSPEND))) + if ((error = device_power_down(PMSG_SUSPEND))) { + printk(KERN_ERR "Some devices failed to power down\n"); goto Done; + } error = pm_ops->enter(state); device_power_up(); Done: diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index cf555c10d3b5..328cbcb9867f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -98,7 +98,6 @@ unsigned int nr_copy_pages __nosavedata = 0; */ suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; static suspend_pagedir_t *pagedir_save; -static int pagedir_order __nosavedata = 0; #define SWSUSP_SIG "S1SUSPEND" @@ -920,7 +919,8 @@ int swsusp_resume(void) { int error; local_irq_disable(); - device_power_down(PMSG_FREEZE); + if
(device_power_down(PMSG_FREEZE)) + printk(KERN_ERR "Some devices failed to power down, very bad\n"); /* We'll ignore saved state, but this gets preempt count (etc) right */ save_processor_state(); error = swsusp_arch_resume(); @@ -1219,7 +1219,6 @@ static int check_header(void) return -EPERM; } nr_copy_pages = swsusp_info.image_pages; - pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages)); return error; } @@ -1238,7 +1237,7 @@ static int check_sig(void) */ error = bio_write_page(0, &swsusp_header); } else { - pr_debug(KERN_ERR "swsusp: Suspend partition has wrong signature?\n"); + printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n"); return -EINVAL; } if (!error) -- cgit v1.2.3 From 96f3e0cca37a60a33837ade37c2538e5ee706d19 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 28 Mar 2005 03:43:28 -0800 Subject: [PATCH] swsusp: kill swsusp_restore This kills swsusp_restore; it should be arch-neutral, but some i386 code sneaked in. And arch-specific code is better done in assembly anyway. Plus it fixes memory leaks in error paths. Signed-off-by: Rafael J. Wysocki Signed-off-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/power/swsusp.S | 13 ++++++++++++- arch/x86_64/kernel/suspend_asm.S | 15 +++++++++++++-- kernel/power/swsusp.c | 16 +++++----------- 3 files changed, 30 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/i386/power/swsusp.S b/arch/i386/power/swsusp.S index 171a6c85aac2..c4105286ff26 100644 --- a/arch/i386/power/swsusp.S +++ b/arch/i386/power/swsusp.S @@ -51,6 +51,15 @@ copy_loop: .p2align 4,,7 done: + /* Flush TLB, including "global" things (vmalloc) */ + movl mmu_cr4_features, %eax + movl %eax, %edx + andl $~(1<<7), %edx; # PGE + movl %edx, %cr4; # turn off PGE + movl %cr3, %ecx; # flush TLB + movl %ecx, %cr3 + movl %eax, %cr4; # turn PGE back on + movl saved_context_esp, %esp movl saved_context_ebp, %ebp movl saved_context_ebx, %ebx @@ -58,5 +67,7 @@ done: movl saved_context_edi, %edi pushl saved_context_eflags ; popfl - call swsusp_restore + + xorl %eax, %eax + ret diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S index 1246f85cd8c0..53f8e1659511 100644 --- a/arch/x86_64/kernel/suspend_asm.S +++ b/arch/x86_64/kernel/suspend_asm.S @@ -69,12 +69,21 @@ loop: movq pbe_next(%rdx), %rdx jmp loop done: + /* Flush TLB, including "global" things (vmalloc) */ + movq mmu_cr4_features(%rip), %rax + movq %rax, %rdx + andq $~(1<<7), %rdx; # PGE + movq %rdx, %cr4; # turn off PGE + movq %cr3, %rcx; # flush TLB + movq %rcx, %cr3 + movq %rax, %cr4; # turn PGE back on + movl $24, %eax movl %eax, %ds movq saved_context_esp(%rip), %rsp movq saved_context_ebp(%rip), %rbp - movq saved_context_eax(%rip), %rax + /* Don't restore %rax, it must be 0 anyway */ movq saved_context_ebx(%rip), %rbx movq saved_context_ecx(%rip), %rcx movq saved_context_edx(%rip), %rdx @@ -89,5 +98,7 @@ done: movq saved_context_r14(%rip), %r14 movq saved_context_r15(%rip), %r15 pushq saved_context_eflags(%rip) ; popfq - call swsusp_restore + + xorq %rax, %rax + ret diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 328cbcb9867f..ae5bebc3b18f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -892,29 +892,23 @@ int swsusp_suspend(void) * at resume time, and evil weirdness ensues.
*/ if ((error = device_power_down(PMSG_FREEZE))) { + printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); local_irq_enable(); + swsusp_free(); return error; } save_processor_state(); - error = swsusp_arch_suspend(); + if ((error = swsusp_arch_suspend())) + swsusp_free(); /* Restore control flow magically appears here */ restore_processor_state(); + BUG_ON (nr_copy_pages_check != nr_copy_pages); restore_highmem(); device_power_up(); local_irq_enable(); return error; } - -asmlinkage int swsusp_restore(void) -{ - BUG_ON (nr_copy_pages_check != nr_copy_pages); - - /* Even mappings of "global" things (vmalloc) need to be fixed */ - __flush_tlb_global(); - return 0; -} - int swsusp_resume(void) { int error; -- cgit v1.2.3 From 7f03bb0f68caef3a6b4f79e22c80c89b8fff6c41 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 28 Mar 2005 03:51:53 -0800 Subject: [PATCH] break_lock fix lock->break_lock is set when a lock is contended, but cleared only in cond_resched_lock. Users of need_lockbreak (journal_commit_transaction, copy_pte_range, unmap_vmas) don't necessarily use cond_resched_lock on it. So, if the lock has been contended at some time in the past, break_lock remains set thereafter, and the fastpath keeps dropping the lock unnecessarily. This hung the system when I made such a change, forever restarting a loop before making any progress. And even users of cond_resched_lock may well suffer an initial unnecessary lockbreak. There seems to be no point at which break_lock can be cleared when unlocking, any point being either too early or too late; but that's okay, it's only of interest while the lock is held. So clear it whenever the lock is acquired, and any waiting contenders will quickly set it again. Additional locking overhead? Well, this only applies when CONFIG_PREEMPT is on. Since cond_resched_lock's spin_lock clears break_lock, it need not clear it itself; and use need_lockbreak there too, preferring the optimizer to #ifdefs. Signed-off-by: Hugh Dickins Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 5 +---- kernel/spinlock.c | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c32f9389978f..dff94ba6df38 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3741,14 +3741,11 @@ EXPORT_SYMBOL(cond_resched); */ int cond_resched_lock(spinlock_t * lock) { -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) - if (lock->break_lock) { - lock->break_lock = 0; + if (need_lockbreak(lock)) { spin_unlock(lock); cpu_relax(); spin_lock(lock); } -#endif if (need_resched()) { _raw_spin_unlock(lock); preempt_enable_no_resched(); diff --git a/kernel/spinlock.c b/kernel/spinlock.c index b8e76ca8a001..e15ed17863f1 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -187,6 +187,7 @@ void __lockfunc _##op##_lock(locktype##_t *lock) \ cpu_relax(); \ preempt_disable(); \ } \ + (lock)->break_lock = 0; \ } \ \ EXPORT_SYMBOL(_##op##_lock); \ @@ -209,6 +210,7 @@ unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock) \ cpu_relax(); \ preempt_disable(); \ } \ + (lock)->break_lock = 0; \ return flags; \ } \ \ -- cgit v1.2.3 From 03dd1d1113eb992a44cc78e05a7b8f2fe0f63946 Mon Sep 17 00:00:00 2001 From: Ashok Raj Date: Mon, 28 Mar 2005 03:55:27 -0800 Subject: [PATCH] Fix irq_affinity write from /proc for ia64 This makes the GENERIC_HARDIRQ mechanism work for ia64 and CPU hotplug.
When a write to /proc/irq is handled, it is not appropriate to perform set_rte immediately, since there is a race where the interrupt is asserted while the re-program is happening. Hence such programming is only safe when we do the re-program at the time of servicing an interrupt. This got broken when GENERIC_HARDIRQ got introduced for ia64. - added CONFIG_PENDING_IRQ so the default /proc/irq write handler can do the right thing. TBD: We currently don't handle the redirectable hint either in the display or when we handle writes to /proc/irq/XX/smp_affinity. We need an arch-specific way to account for the presence of the "r" hint when we handle the proc write. Signed-off-by: Ashok Raj Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/irq.c | 12 ++++++++++-- kernel/irq/proc.c | 10 ++++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c index 4413e54b123b..28f2aadc38d0 100644 --- a/arch/ia64/kernel/irq.c +++ b/arch/ia64/kernel/irq.c @@ -94,12 +94,20 @@ skip: /* * This is updated when the user sets irq affinity via /proc */ -cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; +static cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; static unsigned long pending_irq_redir[BITS_TO_LONGS(NR_IRQS)]; -static cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 }; +/* + * Arch specific routine for deferred write to iosapic rte to reprogram + * intr destination. + */ +void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) +{ + pending_irq_cpumask[irq] = mask_val; +} + void set_irq_affinity_info (unsigned int irq, int hwid, int redir) { cpumask_t mask = CPU_MASK_NONE; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 33fe32e114cb..85d08daa6600 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,6 +19,13 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; */ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; +void __attribute__((weak)) +proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) +{ + irq_affinity[irq] = mask_val; + irq_desc[irq].handler->set_affinity(irq, mask_val); +} + static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -53,8 +60,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, if (cpus_empty(tmp)) return -EINVAL; - irq_affinity[irq] = new_value; - irq_desc[irq].handler->set_affinity(irq, new_value); + proc_set_irq_affinity(irq, new_value); return full_count; } -- cgit v1.2.3 From 9026dff22e10aeae3be3d682d49b956fef0f3bab Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 28 Mar 2005 03:56:15 -0800 Subject: [PATCH] Exports to enable clock driver modules The following exports are necessary to allow loadable modules to define new clocks.
Without these, the mmtimer driver cannot be built correctly as a module (there is another mmtimer-specific fix necessary to get it to build properly, but that will be a separate patch). Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 7 ++++++- kernel/time.c | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5f0fbcf511ba..4e2904586925 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -46,6 +46,7 @@ #include #include #include +#include <linux/module.h> #ifndef div_long_long_rem #include @@ -460,6 +461,7 @@ int posix_timer_event(struct k_itimer *timr,int si_private) timr->it_process); } } +EXPORT_SYMBOL_GPL(posix_timer_event); /* * This function gets called when a POSIX.1b interval timer expires. It @@ -555,6 +557,7 @@ void register_posix_clock(clockid_t clock_id, struct k_clock *new_clock) posix_clocks[clock_id] = *new_clock; } +EXPORT_SYMBOL_GPL(register_posix_clock); static struct k_itimer * alloc_posix_timer(void) { @@ -1246,16 +1249,17 @@ int do_posix_clock_monotonic_gettime(struct timespec *tp) return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp); } - int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp) { return -EINVAL; } +EXPORT_SYMBOL_GPL(do_posix_clock_nosettime); int do_posix_clock_notimer_create(struct k_itimer *timer) { return -EINVAL; } +EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create); int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t) { @@ -1265,6 +1269,7 @@ int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t) return -ENOTSUP; #endif } +EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep); asmlinkage long sys_clock_settime(clockid_t which_clock, const struct timespec __user *tp) diff --git a/kernel/time.c b/kernel/time.c index 2c0b90d79d4c..96fd0f499631 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -34,6 +34,7 @@ #include #include #include +#include <linux/module.h> #include #include @@ -503,6 +504,7 @@ void getnstimeofday (struct timespec *tv) tv->tv_sec = sec; tv->tv_nsec = nsec; } +EXPORT_SYMBOL_GPL(getnstimeofday); int do_settimeofday (struct timespec *tv) { -- cgit v1.2.3 From e758da38da22c0863cf96323077323080ffc9ad9 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 28 Mar 2005 03:57:38 -0800 Subject: [PATCH] New console flag: CON_BOOT CON_BOOT is like early printk in that it allows for output really early on. It's better than early printk because it unregisters automatically when a real console is initialised. So if you don't get consoles registering in console_init, there isn't a huge delay between the boot console unregistering and the real console starting. This is the case on PA-RISC where we have serial ports that aren't discovered until the PCI bus has been walked. I think all the current early printk users could be converted to this scheme with a minimal amount of effort. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/console.h | 8 ++++++-- kernel/printk.c | 5 +++++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/console.h b/include/linux/console.h index 99fd8e4be694..b9b183e986e5 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -77,13 +77,17 @@ void give_up_console(const struct consw *sw); #define CM_MOVE (3) /* - * The interface for a console, or any other device that - * wants to capture console messages (printer driver?)
+ * The interface for a console, or any other device that wants to capture + * console messages (printer driver?) + * + * If a console driver is marked CON_BOOT then it will be auto-unregistered + * when the first real console is registered. This is for early-printk drivers. */ #define CON_PRINTBUFFER (1) #define CON_CONSDEV (2) /* Last on the command line */ #define CON_ENABLED (4) +#define CON_BOOT (8) struct console { diff --git a/kernel/printk.c b/kernel/printk.c index e5a2222f477e..5d5754964bf4 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -861,6 +861,11 @@ void register_console(struct console * console) if (!(console->flags & CON_ENABLED)) return; + if (console_drivers && (console_drivers->flags & CON_BOOT)) { + unregister_console(console_drivers); + console->flags &= ~CON_PRINTBUFFER; + } + /* * Put this console in the list - keep the * preferred driver at the head of the list. -- cgit v1.2.3 From 068c82777b6b4f8827dc2820d41d312f2d58ae89 Mon Sep 17 00:00:00 2001 From: "Prasanna S. Panchamukhi" Date: Mon, 28 Mar 2005 03:58:10 -0800 Subject: [PATCH] kprobes: incorrect spin_unlock_irqrestore() call in register_kprobe() The register_kprobe() routine was calling spin_unlock_irqrestore() incorrectly. This patch removes the unwanted spin_unlock_irqrestore() call in the register_kprobe() routine. Signed-off-by: Prasanna S Panchamukhi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 4a331aed0866..1d5dd1337bd1 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -79,7 +79,7 @@ int register_kprobe(struct kprobe *p) unsigned long flags = 0; if ((ret = arch_prepare_kprobe(p)) != 0) { - goto out; + goto rm_kprobe; } spin_lock_irqsave(&kprobe_lock, flags); INIT_HLIST_NODE(&p->hlist); @@ -96,8 +96,9 @@ int register_kprobe(struct kprobe *p) *p->addr = BREAKPOINT_INSTRUCTION; flush_icache_range((unsigned long) p->addr, (unsigned long) p->addr + sizeof(kprobe_opcode_t)); - out: +out: spin_unlock_irqrestore(&kprobe_lock, flags); +rm_kprobe: if (ret == -EEXIST) arch_remove_kprobe(p); return ret; -- cgit v1.2.3 From 4158b4b8ca50e3d02bd82ec47284cbbde0fea434 Mon Sep 17 00:00:00 2001 From: Randolph Chung Date: Mon, 28 Mar 2005 03:58:26 -0800 Subject: [PATCH] Missing set_fs() calls around kernel syscall Found by sparse... since we are passing a kernel param to a syscall handler, we need to do the set_fs() wrappers. Signed-off-by: Randolph Chung Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e31b1cb8e503..c39ed70af174 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -6,6 +6,7 @@ #include #include #include +#include <asm/uaccess.h> /* Since we effect priority and affinity (both of which are visible * to, and settable by outside processes) we do indirection via a @@ -86,9 +87,13 @@ static int stop_machine(void) { int i, ret = 0; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + mm_segment_t old_fs = get_fs(); /* One high-prio thread per cpu. We'll do this one.
*/ - sys_sched_setscheduler(current->pid, SCHED_FIFO, &param); + set_fs(KERNEL_DS); + sys_sched_setscheduler(current->pid, SCHED_FIFO, + (struct sched_param __user *)&param); + set_fs(old_fs); atomic_set(&stopmachine_thread_ack, 0); stopmachine_num_threads = 0; -- cgit v1.2.3 From 47f65a4152108517d4c2fbc52ec1e007c2884433 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Mon, 28 Mar 2005 03:58:44 -0800 Subject: [PATCH] cpusets: mems generation deadlock fix The cpuset code to update mems_generation could (in theory) deadlock on cpuset_sem if it needed to allocate some memory while creating (mkdir) or removing (rmdir) a cpuset, while it already held cpuset_sem. Some other process would have to mess with this task's cpuset memory placement at the same time. We avoid this possible deadlock by always updating mems_generation after we grab cpuset_sem on such operations, before we risk any operations that might require memory allocation. Thanks to Jack Steiner for noticing this. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 44c03c666b01..cd942ce30b73 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -504,6 +504,35 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) BUG_ON(!nodes_intersects(*pmask, node_online_map)); } +/* + * Refresh current tasks mems_allowed and mems_generation from + * current tasks cpuset. Call with cpuset_sem held. + * + * Be sure to call refresh_mems() on any cpuset operation which + * (1) holds cpuset_sem, and (2) might possibly alloc memory. + * Call after obtaining cpuset_sem lock, before any possible + * allocation. Otherwise one risks trying to allocate memory + * while the task cpuset_mems_generation is not the same as + * the mems_generation in its cpuset, which would deadlock on + * cpuset_sem in cpuset_update_current_mems_allowed(). + * + * Since we hold cpuset_sem, once refresh_mems() is called, the + * test (current->cpuset_mems_generation != cs->mems_generation) + * in cpuset_update_current_mems_allowed() will remain false, + * until we drop cpuset_sem. Anyone else who would change our + * cpusets mems_generation needs to lock cpuset_sem first. + */ + +static void refresh_mems(void) +{ + struct cpuset *cs = current->cpuset; + + if (current->cpuset_mems_generation != cs->mems_generation) { + guarantee_online_mems(cs, &current->mems_allowed); + current->cpuset_mems_generation = cs->mems_generation; + } +} + /* * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
* @@ -1224,6 +1253,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) return -ENOMEM; down(&cpuset_sem); + refresh_mems(); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); @@ -1277,6 +1307,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_sem already */ down(&cpuset_sem); + refresh_mems(); if (atomic_read(&cs->count) > 0) { up(&cpuset_sem); return -EBUSY; } @@ -1433,8 +1464,7 @@ void cpuset_update_current_mems_allowed() return; /* task is exiting */ if (current->cpuset_mems_generation != cs->mems_generation) { down(&cpuset_sem); - guarantee_online_mems(cs, &current->mems_allowed); - current->cpuset_mems_generation = cs->mems_generation; + refresh_mems(); up(&cpuset_sem); } } -- cgit v1.2.3 From 4733474d20cbe84f8bd83fd38aef645b6eb11f57 Mon Sep 17 00:00:00 2001 From: George Anzinger Date: Mon, 28 Mar 2005 03:59:49 -0800 Subject: [PATCH] Fix POSIX timers expiring before their scheduled time This patch fixes the problem of POSIX timers returning too early due to not accounting for the time starting mid-jiffie. Signed-off-by: George Anzinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 4e2904586925..fd316c272260 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -938,6 +938,10 @@ static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, */ if (oc.tv_sec < 0) oc.tv_sec = oc.tv_nsec = 0; + + if (oc.tv_sec | oc.tv_nsec) + set_normalized_timespec(&oc, oc.tv_sec, + oc.tv_nsec + clock->res); tstojiffie(&oc, clock->res, exp); /* @@ -1507,7 +1511,6 @@ static int common_nsleep(clockid_t which_clock, if (abs || !rq_time) { adjust_abs_time(&posix_clocks[which_clock], &t, abs, &rq_time, &dum); - rq_time += (t.tv_sec || t.tv_nsec); } left = rq_time - get_jiffies_64(); -- cgit v1.2.3 From 7b8f061e04e3090a40f2828434f17c9003013cc7 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Mon, 28 Mar 2005 04:00:22 -0800 Subject: [PATCH] posix-cpu-timers and cputime_t divisions. The posix cpu timers introduced code that will not work with an arbitrary type for cputime_t. In particular, the division of two cputime_t values broke the s390 build because cputime_t is defined as an unsigned long long. The first problem is the division of a cputime_t value by a number of threads, that is, a cputime_t divided by an integer. The patch adds another macro cputime_div to the cputime macro regime which implements this type of division and replaces all occurrences of a cputime / nthread in the posix cpu timer code. The next problem is bump_cpu_timer. This function is severely broken: 1) In the body of the first if statement, timer->it.cpu.incr.sched is used as the second argument of do_div. do_div expects an unsigned long as "base" parameter but timer->it.cpu.incr.sched is an unsigned long long. If the timer increment ever happens to be >= 2^32 the result is wrong, and if the lower 32 bits are zero this even crashes with a fixed point divide exception. 2) The cputime_le(now.cpu, timer->it.cpu.expires.cpu) in the else if condition is wrong. The cputime_le() reads as "now.cpu <= timer->it.cpu.expires.cpu" and the subsequent cputime_ge() reads as "now.cpu >= timer.it.cpu.expires.cpu". That means that the two values need to be equal for the body of the second if to have any effect.
The first cputime_le should be a cputime_ge. 3) timer->it.cpu.expires.cpu and delta in the else part of the if are of type cputime_t. A division of two cputime_t values is undefined (think of cputime_t as e.g. a struct timespec; that just doesn't work). We could add a primitive for this type of division, but we'd end up with a 64 bit division or something even more complicated. The solution for bump_cpu_timer is to use the "slow" division algorithm that does shifts and subtracts. That adds yet another cputime macro, cputime_halve, to do the right shift of a cputime value. The next problem is in arm_timer. The UPDATE_CLOCK macro does the wrong thing for it_prof_expires and it_virt_expires. Expanded the macro and added the cputime magic to it_prof/it_virt. The remaining problems are rather simple: timespec_to_jiffies used instead of timespec_to_cputime, and several cases where cputime_eq with cputime_zero needs to be used instead of "== 0". What still worries me a bit is using "timer->it.cpu.incr.sched == 0" as a check whether the timer is armed at all. It should work, but it's not really clean. Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/cputime.h | 2 + include/asm-s390/cputime.h | 8 ++++ kernel/posix-cpu-timers.c | 106 ++++++++++++++++++++++++++---------------- 3 files changed, 77 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h index 7943c66945be..6f178563e336 100644 --- a/include/asm-generic/cputime.h +++ b/include/asm-generic/cputime.h @@ -10,6 +10,8 @@ typedef unsigned long cputime_t; #define cputime_max ((~0UL >> 1) - 1) #define cputime_add(__a, __b) ((__a) + (__b)) #define cputime_sub(__a, __b) ((__a) - (__b)) +#define cputime_div(__a, __n) ((__a) / (__n)) +#define cputime_halve(__a) ((__a) >> 1) #define cputime_eq(__a, __b) ((__a) == (__b)) #define cputime_gt(__a, __b) ((__a) > (__b)) #define cputime_ge(__a, __b) ((__a) >= (__b)) diff --git a/include/asm-s390/cputime.h b/include/asm-s390/cputime.h index 216d861337e6..4b3ef7cad115 100644 --- a/include/asm-s390/cputime.h +++ b/include/asm-s390/cputime.h @@ -9,6 +9,8 @@ #ifndef _S390_CPUTIME_H #define _S390_CPUTIME_H +#include <asm/div64.h> + /* We want to use micro-second resolution.
*/ typedef unsigned long long cputime_t; @@ -40,6 +42,12 @@ __div(unsigned long long n, unsigned int base) #define cputime_max ((~0UL >> 1) - 1) #define cputime_add(__a, __b) ((__a) + (__b)) #define cputime_sub(__a, __b) ((__a) - (__b)) +#define cputime_div(__a, __n) ({ \ + unsigned long long __div = (__a); \ + do_div(__div,__n); \ + __div; \ +}) +#define cputime_halve(__a) ((__a) >> 1) #define cputime_eq(__a, __b) ((__a) == (__b)) #define cputime_gt(__a, __b) ((__a) > (__b)) #define cputime_ge(__a, __b) ((__a) >= (__b)) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 5dfd280631ae..ad85d3f0dcc4 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -38,7 +38,7 @@ timespec_to_sample(clockid_t which_clock, const struct timespec *tp) if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; } else { - ret.cpu = timespec_to_jiffies(tp); + ret.cpu = timespec_to_cputime(tp); } return ret; } @@ -94,28 +94,46 @@ static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, static inline void bump_cpu_timer(struct k_itimer *timer, union cpu_time_count now) { + int i; + if (timer->it.cpu.incr.sched == 0) return; if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { - long long delta; - delta = now.sched - timer->it.cpu.expires.sched; - if (delta >= 0) { - do_div(delta, timer->it.cpu.incr.sched); - delta++; - timer->it.cpu.expires.sched += - delta * timer->it.cpu.incr.sched; - timer->it_overrun += (int) delta; + unsigned long long delta, incr; + + if (now.sched < timer->it.cpu.expires.sched) + return; + incr = timer->it.cpu.incr.sched; + delta = now.sched + incr - timer->it.cpu.expires.sched; + /* Don't use (incr*2 < delta), incr*2 might overflow. */ + for (i = 0; incr < delta - incr; i++) + incr = incr << 1; + for (; i >= 0; incr >>= 1, i--) { + if (delta <= incr) + continue; + timer->it.cpu.expires.sched += incr; + timer->it_overrun += 1 << i; + delta -= incr; } - } else if (cputime_le(now.cpu, timer->it.cpu.expires.cpu)) { - cputime_t delta = cputime_sub(now.cpu, - timer->it.cpu.expires.cpu); - if (cputime_ge(delta, cputime_zero)) { - long orun = 1 + (delta / timer->it.cpu.incr.cpu); + } else { + cputime_t delta, incr; + + if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) + return; + incr = timer->it.cpu.incr.cpu; + delta = cputime_sub(cputime_add(now.cpu, incr), + timer->it.cpu.expires.cpu); + /* Don't use (incr*2 < delta), incr*2 might overflow. 
*/ + for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) + incr = cputime_add(incr, incr); + for (; i >= 0; incr = cputime_halve(incr), i--) { + if (cputime_le(delta, incr)) + continue; timer->it.cpu.expires.cpu = - cputime_add(timer->it.cpu.expires.cpu, - orun * timer->it.cpu.incr.cpu); - timer->it_overrun += orun; + cputime_add(timer->it.cpu.expires.cpu, incr); + timer->it_overrun += 1 << i; + delta = cputime_sub(delta, incr); } } } @@ -479,8 +497,8 @@ static void process_timer_rebalance(struct task_struct *p, BUG(); break; case CPUCLOCK_PROF: - left = cputime_sub(expires.cpu, val.cpu) - / nthreads; + left = cputime_div(cputime_sub(expires.cpu, val.cpu), + nthreads); do { if (!unlikely(t->exit_state)) { ticks = cputime_add(prof_ticks(t), left); @@ -494,8 +512,8 @@ static void process_timer_rebalance(struct task_struct *p, } while (t != p); break; case CPUCLOCK_VIRT: - left = cputime_sub(expires.cpu, val.cpu) - / nthreads; + left = cputime_div(cputime_sub(expires.cpu, val.cpu), + nthreads); do { if (!unlikely(t->exit_state)) { ticks = cputime_add(virt_ticks(t), left); @@ -587,17 +605,25 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) switch (CPUCLOCK_WHICH(timer->it_clock)) { default: BUG(); -#define UPDATE_CLOCK(WHICH, c, n) \ - case CPUCLOCK_##WHICH: \ - if (p->it_##c##_expires == 0 || \ - p->it_##c##_expires > nt->expires.n) { \ - p->it_##c##_expires = nt->expires.n; \ - } \ - break - UPDATE_CLOCK(PROF, prof, cpu); - UPDATE_CLOCK(VIRT, virt, cpu); - UPDATE_CLOCK(SCHED, sched, sched); -#undef UPDATE_CLOCK + case CPUCLOCK_PROF: + if (cputime_eq(p->it_prof_expires, + cputime_zero) || + cputime_gt(p->it_prof_expires, + nt->expires.cpu)) + p->it_prof_expires = nt->expires.cpu; + break; + case CPUCLOCK_VIRT: + if (cputime_eq(p->it_virt_expires, + cputime_zero) || + cputime_gt(p->it_virt_expires, + nt->expires.cpu)) + p->it_virt_expires = nt->expires.cpu; + break; + case CPUCLOCK_SCHED: + if (p->it_sched_expires == 0 || + p->it_sched_expires > nt->expires.sched) + p->it_sched_expires = nt->expires.sched; + break; } } else { /* @@ -934,7 +960,7 @@ static void check_thread_timers(struct task_struct *tsk, { struct list_head *timers = tsk->cpu_timers; - tsk->it_prof_expires = 0; + tsk->it_prof_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, @@ -948,7 +974,7 @@ static void check_thread_timers(struct task_struct *tsk, } ++timers; - tsk->it_virt_expires = 0; + tsk->it_virt_expires = cputime_zero; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, @@ -1044,7 +1070,7 @@ static void check_process_timers(struct task_struct *tsk, } ++timers; - sched_expires = cputime_zero; + sched_expires = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_entry(timers->next, struct cpu_timer_list, @@ -1132,9 +1158,11 @@ static void check_process_timers(struct task_struct *tsk, unsigned long long sched_left, sched; const unsigned int nthreads = atomic_read(&sig->live); - prof_left = cputime_sub(prof_expires, - cputime_add(utime, stime)) / nthreads; - virt_left = cputime_sub(virt_expires, utime) / nthreads; + prof_left = cputime_sub(prof_expires, utime); + prof_left = cputime_sub(prof_left, stime); + prof_left = cputime_div(prof_left, nthreads); + virt_left = cputime_sub(virt_expires, utime); + virt_left = cputime_div(virt_left, nthreads); if (sched_expires) { sched_left = sched_expires - sched_time; do_div(sched_left, nthreads); @@ 
-1245,7 +1273,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) BUG_ON(!irqs_disabled()); #define UNEXPIRED(clock) \ - (tsk->it_##clock##_expires == 0 || \ + (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) if (UNEXPIRED(prof) && UNEXPIRED(virt) && -- cgit v1.2.3 From 58aceba09b4f67abd309d199de8f2da375f45e88 Mon Sep 17 00:00:00 2001 From: Jakub Jelínek Date: Mon, 28 Mar 2005 04:00:54 -0800 Subject: [PATCH] Futex: make futex_wait() atomic again Call get_futex_value_locked in futex_wait with the futex hash bucket locked, and only enqueue the futex if it has the expected value. Simplify futex_requeue. Signed-off-by: Jakub Jelinek Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 89 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 7f9f4a012190..7b54a672d0ad 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -97,7 +97,6 @@ struct futex_q { */ struct futex_hash_bucket { spinlock_t lock; - unsigned int nqueued; struct list_head chain; }; @@ -265,7 +264,6 @@ static inline int get_futex_value_locked(int *dest, int __user *from) inc_preempt_count(); ret = __copy_from_user_inatomic(dest, from, sizeof(int)); dec_preempt_count(); - preempt_check_resched(); return ret ? -EFAULT : 0; } @@ -339,7 +337,6 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, struct list_head *head1; struct futex_q *this, *next; int ret, drop_count = 0; - unsigned int nqueued; retry: down_read(&current->mm->mmap_sem); @@ -354,23 +351,22 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, bh1 = hash_futex(&key1); bh2 = hash_futex(&key2); - nqueued = bh1->nqueued; + if (bh1 < bh2) + spin_lock(&bh1->lock); + spin_lock(&bh2->lock); + if (bh1 > bh2) + spin_lock(&bh1->lock); + if (likely(valp != NULL)) { int curval; - /* In order to avoid doing get_user while - holding bh1->lock and bh2->lock, nqueued - (monotonically increasing field) must be first - read, then *uaddr1 fetched from userland and - after acquiring lock nqueued field compared with - the stored value. The smp_mb () below - makes sure that bh1->nqueued is read from memory - before *uaddr1. */ - smp_mb(); - ret = get_futex_value_locked(&curval, (int __user *)uaddr1); if (unlikely(ret)) { + spin_unlock(&bh1->lock); + if (bh1 != bh2) + spin_unlock(&bh2->lock); + /* If we would have faulted, release mmap_sem, fault * it in and start all over again. */ @@ -385,21 +381,10 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, } if (curval != *valp) { ret = -EAGAIN; - goto out; + goto out_unlock; } } - if (bh1 < bh2) - spin_lock(&bh1->lock); - spin_lock(&bh2->lock); - if (bh1 > bh2) - spin_lock(&bh1->lock); - - if (unlikely(nqueued != bh1->nqueued && valp != NULL)) { - ret = -EAGAIN; - goto out_unlock; - } - head1 = &bh1->chain; list_for_each_entry_safe(this, next, head1, list) { if (!match_futex (&this->key, &key1)) @@ -435,13 +420,9 @@ out: return ret; } -/* - * queue_me and unqueue_me must be called as a pair, each - * exactly once. They are called with the hashed spinlock held. - */ - /* The key must be already stored in q->key.
*/ -static void queue_me(struct futex_q *q, int fd, struct file *filp) +static inline struct futex_hash_bucket * +queue_lock(struct futex_q *q, int fd, struct file *filp) { struct futex_hash_bucket *bh; @@ -455,11 +436,35 @@ static void queue_me(struct futex_q *q, int fd, struct file *filp) q->lock_ptr = &bh->lock; spin_lock(&bh->lock); - bh->nqueued++; + return bh; +} + +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) +{ list_add_tail(&q->list, &bh->chain); spin_unlock(&bh->lock); } +static inline void +queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) +{ + spin_unlock(&bh->lock); + drop_key_refs(&q->key); +} + +/* + * queue_me and unqueue_me must be called as a pair, each + * exactly once. They are called with the hashed spinlock held. + */ + +/* The key must be already stored in q->key. */ +static void queue_me(struct futex_q *q, int fd, struct file *filp) +{ + struct futex_hash_bucket *bh; + bh = queue_lock(q, fd, filp); + __queue_me(q, bh); +} + /* Return 1 if we were still queued (ie. 0 means we were woken) */ static int unqueue_me(struct futex_q *q) { @@ -503,6 +508,7 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) DECLARE_WAITQUEUE(wait, current); int ret, curval; struct futex_q q; + struct futex_hash_bucket *bh; retry: down_read(&current->mm->mmap_sem); @@ -511,7 +517,7 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) if (unlikely(ret != 0)) goto out_release_sem; - queue_me(&q, -1, NULL); + bh = queue_lock(&q, -1, NULL); /* * Access the page AFTER the futex is queued. @@ -537,14 +543,13 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) ret = get_futex_value_locked(&curval, (int __user *)uaddr); if (unlikely(ret)) { + queue_unlock(&q, bh); + /* If we would have faulted, release mmap_sem, fault it in and * start all over again. */ up_read(&current->mm->mmap_sem); - if (!unqueue_me(&q)) /* There's a chance we got woken already */ - return 0; - ret = get_user(curval, (int __user *)uaddr); if (!ret) @@ -553,9 +558,13 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) } if (curval != val) { ret = -EWOULDBLOCK; - goto out_unqueue; + queue_unlock(&q, bh); + goto out_release_sem; } + /* Only actually queue if *uaddr contained val. */ + __queue_me(&q, bh); + /* * Now the futex is queued and we have checked the data, we * don't want to hold mmap_sem while we sleep. @@ -596,10 +605,6 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) * have handled it for us already. */ return -EINTR; - out_unqueue: /* If we were woken (and unqueued), we succeeded, whatever. */ - if (!unqueue_me(&q)) - ret = 0; out_release_sem: up_read(&current->mm->mmap_sem); return ret; -- cgit v1.2.3
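To make the ordering that this last patch enforces in futex_wait() easier to follow, here is a condensed sketch of the resulting control flow. It is illustrative only, not the kernel code verbatim; fault handling and the mmap_sem retry path are omitted.

    struct futex_hash_bucket *bh;
    int curval;

    /* Take the hash-bucket lock before touching the user value. */
    bh = queue_lock(&q, -1, NULL);

    /* Read *uaddr while holding the bucket lock. */
    if (get_futex_value_locked(&curval, (int __user *)uaddr)) {
            /* fault: unlock, fault the page in, and retry */
    }

    if (curval != val) {
            /* Value already changed: report it, never queue. */
            queue_unlock(&q, bh);
            return -EWOULDBLOCK;
    }

    /* Only now, with the value verified, become visible to wakers. */
    __queue_me(&q, bh);

Because the value check and the enqueue happen under the same bucket lock, a futex_wake() on another CPU can no longer slip in between them, which is the window that previously made futex_wait() non-atomic.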