From 706489d8a6f1187875ebdb86a751b6d77d1911e4 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 29 Oct 2002 23:23:34 -0800
Subject: [PATCH] slab: extended cpu notifiers

Patch from Dipankar Sarma

This is Manfred's patch which provides a CPU_UP_PREPARE cpu notifier to
allow initialization of per_cpu data just before the cpu becomes fully
functional.

It also provides a facility for the CPU_UP_PREPARE handler to return
NOTIFY_BAD to signify that the CPU is not permitted to come up. If that
happens, a CPU_UP_CANCELED message is passed to all the handlers.

The patch also fixes a bogus NOTIFY_BAD return from the softirq setup code.

Patch has been acked by Rusty.

We need this mechanism in slab for starting per-cpu timers and for
allocating the per-cpu slab head arrays *before* the CPU has come up and
started using slab.
---
 kernel/cpu.c     | 20 ++++++++++++++++----
 kernel/softirq.c |  3 +--
 2 files changed, 17 insertions(+), 6 deletions(-)
(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index a155998dbe3e..4c0ada2b99ae 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -29,26 +29,38 @@ void unregister_cpu_notifier(struct notifier_block *nb)
 int __devinit cpu_up(unsigned int cpu)
 {
 	int ret;
+	void *hcpu = (void *)(long)cpu;
 
-	if ((ret = down_interruptible(&cpucontrol)) != 0) 
+	if ((ret = down_interruptible(&cpucontrol)) != 0)
 		return ret;
 
 	if (cpu_online(cpu)) {
 		ret = -EINVAL;
 		goto out;
 	}
+	ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
+	if (ret == NOTIFY_BAD) {
+		printk("%s: attempt to bring up CPU %u failed\n",
+				__FUNCTION__, cpu);
+		ret = -EINVAL;
+		goto out_notify;
+	}
 
 	/* Arch-specific enabling code. */
 	ret = __cpu_up(cpu);
-	if (ret != 0) goto out;
+	if (ret != 0)
+		goto out_notify;
 	if (!cpu_online(cpu))
 		BUG();
 
 	/* Now call notifier in preparation. */
 	printk("CPU %u IS NOW UP!\n", cpu);
-	notifier_call_chain(&cpu_chain, CPU_ONLINE, (void *)(long)cpu);
+	notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu);
 
- out:
+out_notify:
+	if (ret != 0)
+		notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
+out:
 	up(&cpucontrol);
 	return ret;
 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 45e8712f9f70..1c0f1c4e39e0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -316,9 +316,8 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 		while (!ksoftirqd_task(hotcpu))
 			yield();
-		return NOTIFY_OK;
 	}
-	return NOTIFY_BAD;
+	return NOTIFY_OK;
 }
 
 static struct notifier_block cpu_nfb = { &cpu_callback, NULL, 0 };
-- 
cgit v1.2.3

From 22331dad4687ac6f91428884e2b9a02cb4d8a6df Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 29 Oct 2002 23:23:44 -0800
Subject: [PATCH] slab: add_timer_on: add a timer on a particular CPU

add_timer_on is like add_timer, except it takes a target CPU on which to
add the timer.

The slab code needs per-cpu timers for shrinking the per-cpu caches.
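For illustration only (not part of either patch), a minimal sketch of how a
subsystem such as slab could combine the two facilities: a CPU_UP_PREPARE
handler prepares per-cpu state and arms a per-cpu timer with add_timer_on()
before the new CPU starts running, and CPU_UP_CANCELED unwinds that work if
some handler vetoes the bring-up. The my_* names are hypothetical.

static void my_timer_fn(unsigned long cpu);	/* hypothetical per-cpu work */
static DEFINE_PER_CPU(struct timer_list, my_timer);

static int my_cpu_notify(struct notifier_block *self,
			 unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;
	struct timer_list *t = &per_cpu(my_timer, cpu);

	switch (action) {
	case CPU_UP_PREPARE:
		/* Runs before the CPU is online: safe to set up its state.
		 * A handler that cannot prepare (e.g. allocation failure)
		 * would return NOTIFY_BAD here to veto the bring-up. */
		init_timer(t);
		t->function = my_timer_fn;
		t->data = cpu;
		t->expires = jiffies + HZ;
		add_timer_on(t, cpu);		/* arm it on the target CPU */
		break;
	case CPU_UP_CANCELED:
		/* Another handler returned NOTIFY_BAD: undo our preparation. */
		del_timer(t);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_nb = { .notifier_call = my_cpu_notify };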
---
 include/linux/timer.h |  1 +
 kernel/timer.c        | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)
(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index cfedb5e8bb07..d8ed753c8caa 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -44,6 +44,7 @@ static inline int timer_pending(const struct timer_list * timer)
 }
 
 extern void add_timer(struct timer_list * timer);
+extern void add_timer_on(struct timer_list *timer, int cpu);
 extern int del_timer(struct timer_list * timer);
 extern int mod_timer(struct timer_list *timer, unsigned long expires);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index 2d30f7fd0ecb..58c80293060b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -134,6 +134,26 @@ void add_timer(timer_t *timer)
 	put_cpu();
 }
 
+/***
+ * add_timer_on - start a timer on a particular CPU
+ * @timer: the timer to be added
+ * @cpu: the CPU to start it on
+ *
+ * This is not very scalable on SMP.
+ */
+void add_timer_on(struct timer_list *timer, int cpu)
+{
+	tvec_base_t *base = tvec_bases+ cpu;
+	unsigned long flags;
+
+	BUG_ON(timer_pending(timer) || !timer->function);
+
+	spin_lock_irqsave(&base->lock, flags);
+	internal_add_timer(base, timer);
+	timer->base = base;
+	spin_unlock_irqrestore(&base->lock, flags);
+}
+
 /***
  * mod_timer - modify a timer's timeout
  * @timer: the timer to be modified
-- 
cgit v1.2.3

From c12e16e28b4cf576840cff509caf0c06ff4dc299 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 29 Oct 2002 23:31:27 -0800
Subject: [PATCH] percpu: convert RCU

Patch from Dipankar Sarma

This patch converts RCU per_cpu data to use per_cpu data area and makes it
safe for cpu_possible allocation by using CPU notifiers.
---
 include/linux/rcupdate.h | 15 ++++++++-------
 kernel/rcupdate.c        | 43 ++++++++++++++++++++++++++++++++++---------
 2 files changed, 42 insertions(+), 16 deletions(-)
(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index a5ffb7bb5743..e9e2287e1e1c 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -39,6 +39,7 @@
 #include
 #include
 #include
+#include
 
 /**
  * struct rcu_head - callback structure for use with RCU
@@ -94,16 +95,16 @@ struct rcu_data {
 	long batch;	/* Batch # for current RCU batch */
 	struct list_head nxtlist;
 	struct list_head curlist;
-} ____cacheline_aligned_in_smp;
+};
 
-extern struct rcu_data rcu_data[NR_CPUS];
+DECLARE_PER_CPU(struct rcu_data, rcu_data);
 extern struct rcu_ctrlblk rcu_ctrlblk;
 
-#define RCU_qsctr(cpu) (rcu_data[(cpu)].qsctr)
-#define RCU_last_qsctr(cpu) (rcu_data[(cpu)].last_qsctr)
-#define RCU_batch(cpu) (rcu_data[(cpu)].batch)
-#define RCU_nxtlist(cpu) (rcu_data[(cpu)].nxtlist)
-#define RCU_curlist(cpu) (rcu_data[(cpu)].curlist)
+#define RCU_qsctr(cpu) (per_cpu(rcu_data, (cpu)).qsctr)
+#define RCU_last_qsctr(cpu) (per_cpu(rcu_data, (cpu)).last_qsctr)
+#define RCU_batch(cpu) (per_cpu(rcu_data, (cpu)).batch)
+#define RCU_nxtlist(cpu) (per_cpu(rcu_data, (cpu)).nxtlist)
+#define RCU_curlist(cpu) (per_cpu(rcu_data, (cpu)).curlist)
 
 #define RCU_QSCTR_INVALID	0
 
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 1a149dff7832..91483119714c 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -41,13 +41,14 @@
 #include
 #include
 #include
+#include
 #include
 
 /* Definition for rcupdate control block.
*/ struct rcu_ctrlblk rcu_ctrlblk = { .mutex = SPIN_LOCK_UNLOCKED, .curbatch = 1, .maxbatch = 1, .rcu_cpu_mask = 0 }; -struct rcu_data rcu_data[NR_CPUS] __cacheline_aligned; +DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; /* Fake initialization required by compiler */ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; @@ -198,6 +199,33 @@ void rcu_check_callbacks(int cpu, int user) tasklet_schedule(&RCU_tasklet(cpu)); } +static void __devinit rcu_online_cpu(int cpu) +{ + memset(&per_cpu(rcu_data, cpu), 0, sizeof(struct rcu_data)); + tasklet_init(&RCU_tasklet(cpu), rcu_process_callbacks, 0UL); + INIT_LIST_HEAD(&RCU_nxtlist(cpu)); + INIT_LIST_HEAD(&RCU_curlist(cpu)); +} + +static int __devinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + rcu_online_cpu(cpu); + break; + /* Space reserved for CPU_OFFLINE :) */ + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata rcu_nb = { + .notifier_call = rcu_cpu_notify, +}; + /* * Initializes rcu mechanism. Assumed to be called early. * That is before local timer(SMP) or jiffie timer (uniproc) is setup. @@ -206,16 +234,13 @@ void rcu_check_callbacks(int cpu, int user) */ void __init rcu_init(void) { - int i; - - memset(&rcu_data[0], 0, sizeof(rcu_data)); - for (i = 0; i < NR_CPUS; i++) { - tasklet_init(&RCU_tasklet(i), rcu_process_callbacks, 0UL); - INIT_LIST_HEAD(&RCU_nxtlist(i)); - INIT_LIST_HEAD(&RCU_curlist(i)); - } + rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + /* Register notifier for non-boot CPUs */ + register_cpu_notifier(&rcu_nb); } + /* Because of FASTCALL declaration of complete, we use this wrapper */ static void wakeme_after_rcu(void *completion) { -- cgit v1.2.3 From cf228cdc908ed78c54475aa3f91b775d1f1f7da1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 Oct 2002 23:31:37 -0800 Subject: [PATCH] percpu: convert timers Patch from Dipankar Sarma This patch changes the per-CPU data in timer management (tvec_bases) to use per_cpu data area and makes it safe for cpu_possible allocation by using CPU notifiers. End result - saving space. Depends on cpu_possible patch. 
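Both this conversion and the RCU one above rely on the same initialization
idiom, worth spelling out: the boot CPU is already running when these __init
functions execute, so it can never receive a CPU_UP_PREPARE event. Each
subsystem therefore invokes its own notifier by hand for the current CPU and
only then registers it, so that secondary CPUs are handled through the
notifier chain. A generic sketch, with hypothetical my_* names:

void __init my_subsys_init(void)
{
	/* The boot CPU missed CPU_UP_PREPARE; deliver it by hand. */
	my_cpu_notify(&my_nb, CPU_UP_PREPARE,
		      (void *)(long)smp_processor_id());
	/* CPUs brought up later are initialized via the chain. */
	register_cpu_notifier(&my_nb);
}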
--- kernel/timer.c | 75 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 58c80293060b..ebb2b6c627cc 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -62,7 +63,8 @@ struct tvec_t_base_s { typedef struct tvec_t_base_s tvec_base_t; -static tvec_base_t tvec_bases[NR_CPUS] __cacheline_aligned; +/* Fake initialization */ +static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED }; /* Fake initialization needed to avoid compiler breakage */ static DEFINE_PER_CPU(struct tasklet_struct, timer_tasklet) = { NULL }; @@ -122,7 +124,7 @@ static inline void internal_add_timer(tvec_base_t *base, timer_t *timer) void add_timer(timer_t *timer) { int cpu = get_cpu(); - tvec_base_t *base = tvec_bases + cpu; + tvec_base_t *base = &per_cpu(tvec_bases, cpu); unsigned long flags; BUG_ON(timer_pending(timer) || !timer->function); @@ -143,7 +145,7 @@ void add_timer(timer_t *timer) */ void add_timer_on(struct timer_list *timer, int cpu) { - tvec_base_t *base = tvec_bases+ cpu; + tvec_base_t *base = &per_cpu(tvec_bases, cpu); unsigned long flags; BUG_ON(timer_pending(timer) || !timer->function); @@ -189,7 +191,7 @@ int mod_timer(timer_t *timer, unsigned long expires) return 1; local_irq_save(flags); - new_base = tvec_bases + smp_processor_id(); + new_base = &per_cpu(tvec_bases, smp_processor_id()); repeat: old_base = timer->base; @@ -285,15 +287,17 @@ repeat: */ int del_timer_sync(timer_t *timer) { - tvec_base_t *base = tvec_bases; + tvec_base_t *base; int i, ret = 0; del_again: ret += del_timer(timer); - for (i = 0; i < NR_CPUS; i++, base++) { + for (i = 0; i < NR_CPUS; i++) { if (!cpu_online(i)) continue; + + base = &per_cpu(tvec_bases, i); if (base->running_timer == timer) { while (base->running_timer == timer) { cpu_relax(); @@ -731,7 +735,7 @@ unsigned long last_time_offset; */ static void run_timer_tasklet(unsigned long data) { - tvec_base_t *base = tvec_bases + smp_processor_id(); + tvec_base_t *base = &per_cpu(tvec_bases, smp_processor_id()); if ((long)(jiffies - base->timer_jiffies) >= 0) __run_timers(base); @@ -1086,23 +1090,46 @@ out: return 0; } -void __init init_timers(void) +static void __devinit init_timers_cpu(int cpu) { - int i, j; - - for (i = 0; i < NR_CPUS; i++) { - tvec_base_t *base; - - base = tvec_bases + i; - spin_lock_init(&base->lock); - for (j = 0; j < TVN_SIZE; j++) { - INIT_LIST_HEAD(base->tv5.vec + j); - INIT_LIST_HEAD(base->tv4.vec + j); - INIT_LIST_HEAD(base->tv3.vec + j); - INIT_LIST_HEAD(base->tv2.vec + j); - } - for (j = 0; j < TVR_SIZE; j++) - INIT_LIST_HEAD(base->tv1.vec + j); - tasklet_init(&per_cpu(timer_tasklet, i), run_timer_tasklet, 0); + int j; + tvec_base_t *base; + + base = &per_cpu(tvec_bases, cpu); + spin_lock_init(&base->lock); + for (j = 0; j < TVN_SIZE; j++) { + INIT_LIST_HEAD(base->tv5.vec + j); + INIT_LIST_HEAD(base->tv4.vec + j); + INIT_LIST_HEAD(base->tv3.vec + j); + INIT_LIST_HEAD(base->tv2.vec + j); + } + for (j = 0; j < TVR_SIZE; j++) + INIT_LIST_HEAD(base->tv1.vec + j); + tasklet_init(&per_cpu(timer_tasklet, cpu), run_timer_tasklet, 0UL); +} + +static int __devinit timer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + switch(action) { + case CPU_UP_PREPARE: + init_timers_cpu(cpu); + break; + default: + break; } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata timers_nb = { + 
.notifier_call = timer_cpu_notify, +}; + + +void __init init_timers(void) +{ + timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&timers_nb); } -- cgit v1.2.3 From c1bf37e9fea967860f5a5dc58016e35723889419 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 Oct 2002 23:31:47 -0800 Subject: [PATCH] percpu: convert softirqs Patch from Dipankar Sarma This patch makes per_cpu tasklet vectors safe for cpu_possible allocation by using CPU notifiers. --- kernel/softirq.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 1c0f1c4e39e0..59fb7acb0ee1 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -12,6 +12,7 @@ #include #include #include +#include /* - No shared variables, all the data are CPU local. @@ -260,10 +261,39 @@ void tasklet_kill(struct tasklet_struct *t) clear_bit(TASKLET_STATE_SCHED, &t->state); } + +static void tasklet_init_cpu(int cpu) +{ + per_cpu(tasklet_vec, cpu).list = NULL; + per_cpu(tasklet_hi_vec, cpu).list = NULL; +} + +static int tasklet_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + switch(action) { + case CPU_UP_PREPARE: + tasklet_init_cpu(cpu); + break; + default: + break; + } + return 0; +} + +static struct notifier_block tasklet_nb = { + .notifier_call = tasklet_cpu_notify, + .next = NULL, +}; + void __init softirq_init() { open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); + tasklet_cpu_notify(&tasklet_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&tasklet_nb); } static int ksoftirqd(void * __bind_cpu) @@ -320,7 +350,9 @@ static int __devinit cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block cpu_nfb = { &cpu_callback, NULL, 0 }; +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; __init int spawn_ksoftirqd(void) { -- cgit v1.2.3 From 1d2652dd2c3e942e75dc3137b3cb1774b43ae377 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 Oct 2002 23:35:44 -0800 Subject: [PATCH] hot-n-cold pages: bulk page freeing Patch from Martin Bligh. Implements __free_pages_bulk(). Release multiple pages of a given order into the buddy all within a single acquisition of the zone lock. This also removes current->local_pages. The per-task list of pages which only ever contained one page. To prevent other tasks from stealing pages which this task has just freed up. Given that we're freeing into the per-cpu caches, and that those are multipage caches, and the cpu-stickiness of the scheduler, I think current->local_pages is no longer needed. 
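A sketch of the intended calling pattern (hypothetical caller, not part of
this patch): accumulate order-0 pages from one zone on a private list with no
locking, then let free_pages_bulk() release the whole batch under a single
zone->lock acquisition instead of one lock round trip per page.

/* Free 'nr' order-0 pages from 'zone' with one zone->lock acquisition.
 * All pages must belong to 'zone', as free_pages_bulk() requires. */
static void free_page_batch(struct zone *zone, struct page **pages, int nr)
{
	LIST_HEAD(batch);
	int i;

	for (i = 0; i < nr; i++)
		list_add(&pages[i]->list, &batch);	/* lock-free */
	free_pages_bulk(zone, nr, &batch, 0);	/* one lock, nr pages */
}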
--- include/linux/sched.h | 22 +++---- kernel/fork.c | 2 - mm/page_alloc.c | 156 ++++++++++++++++++++------------------------------ 3 files changed, 70 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index b5e63d8ade25..65f9799aa896 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -293,9 +293,6 @@ struct task_struct { struct list_head ptrace_list; struct mm_struct *mm, *active_mm; - struct list_head local_pages; - - unsigned int allocation_order, nr_local_pages; /* task state */ struct linux_binfmt *binfmt; @@ -411,16 +408,15 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) #define PF_SIGNALED 0x00000400 /* killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */ -#define PF_FREE_PAGES 0x00002000 /* per process page freeing */ -#define PF_FLUSHER 0x00004000 /* responsible for disk writeback */ -#define PF_NOWARN 0x00008000 /* debug: don't warn if alloc fails */ - -#define PF_FREEZE 0x00010000 /* this task should be frozen for suspend */ -#define PF_IOTHREAD 0x00020000 /* this thread is needed for doing I/O to swap */ -#define PF_FROZEN 0x00040000 /* frozen for system suspend */ -#define PF_SYNC 0x00080000 /* performing fsync(), etc */ -#define PF_FSTRANS 0x00100000 /* inside a filesystem transaction */ -#define PF_KSWAPD 0x00200000 /* I am kswapd */ +#define PF_FLUSHER 0x00002000 /* responsible for disk writeback */ +#define PF_NOWARN 0x00004000 /* debug: don't warn if alloc fails */ + +#define PF_FREEZE 0x00008000 /* this task should be frozen for suspend */ +#define PF_IOTHREAD 0x00010000 /* this thread is needed for doing I/O to swap */ +#define PF_FROZEN 0x00020000 /* frozen for system suspend */ +#define PF_SYNC 0x00040000 /* performing fsync(), etc */ +#define PF_FSTRANS 0x00080000 /* inside a filesystem transaction */ +#define PF_KSWAPD 0x00100000 /* I am kswapd */ /* * Ptrace flags diff --git a/kernel/fork.c b/kernel/fork.c index 2f5f00301182..4a33d682dfaa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -769,8 +769,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->start_time = jiffies; p->security = NULL; - INIT_LIST_HEAD(&p->local_pages); - retval = -ENOMEM; if (security_ops->task_alloc_security(p)) goto bad_fork_cleanup; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d4fca60114ea..dd35f4d7ac49 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -91,49 +91,17 @@ static void bad_page(const char *function, struct page *page) * -- wli */ -void __free_pages_ok (struct page *page, unsigned int order) +static inline void __free_pages_bulk (struct page *page, struct page *base, + struct zone *zone, struct free_area *area, unsigned long mask, + unsigned int order) { - unsigned long index, page_idx, mask, flags; - struct free_area *area; - struct page *base; - struct zone *zone; - - mod_page_state(pgfree, 1<mapping != NULL || - page_count(page) != 0 || - (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_writeback ))) - bad_page(__FUNCTION__, page); - - if (PageDirty(page)) - ClearPageDirty(page); + unsigned long page_idx, index; - if (unlikely(current->flags & PF_FREE_PAGES)) { - if (!current->nr_local_pages && !in_interrupt()) { - list_add(&page->list, ¤t->local_pages); - page->index = order; - current->nr_local_pages++; - goto out; - } - } - - zone = page_zone(page); - - mask = (~0UL) << order; - base = 
zone->zone_mem_map; page_idx = page - base; if (page_idx & ~mask) BUG(); index = page_idx >> (1 + order); - area = zone->free_area + order; - spin_lock_irqsave(&zone->lock, flags); zone->free_pages -= mask; while (mask + (1 << (MAX_ORDER-1))) { struct page *buddy1, *buddy2; @@ -160,9 +128,58 @@ void __free_pages_ok (struct page *page, unsigned int order) page_idx &= mask; } list_add(&(base + page_idx)->list, &area->free_list); +} + +static inline void free_pages_check(const char *function, struct page *page) +{ + if ( page_mapped(page) || + page->mapping != NULL || + page_count(page) != 0 || + (page->flags & ( + 1 << PG_lru | + 1 << PG_private | + 1 << PG_locked | + 1 << PG_active | + 1 << PG_writeback ))) + bad_page(function, page); + if (PageDirty(page)) + ClearPageDirty(page); +} + +/* + * Frees a list of pages. + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free, or 0 for all on the list. + */ +static void +free_pages_bulk(struct zone *zone, int count, + struct list_head *list, unsigned int order) +{ + unsigned long mask, flags; + struct free_area *area; + struct page *base, *page = NULL; + + mask = (~0UL) << order; + base = zone->zone_mem_map; + area = zone->free_area + order; + spin_lock_irqsave(&zone->lock, flags); + while (!list_empty(list) && count--) { + page = list_entry(list->prev, struct page, list); + /* have to delete it as __free_pages_bulk list manipulates */ + list_del(&page->list); + __free_pages_bulk(page, base, zone, area, mask, order); + mod_page_state(pgfree, count<lock, flags); -out: - return; +} + +void __free_pages_ok(struct page *page, unsigned int order) +{ + LIST_HEAD(list); + + free_pages_check(__FUNCTION__, page); + list_add(&page->list, &list); + free_pages_bulk(page_zone(page), 1, &list, order); } #define MARK_USED(index, order, area) \ @@ -323,59 +340,6 @@ int is_head_of_free_region(struct page *page) } #endif /* CONFIG_SOFTWARE_SUSPEND */ -static /* inline */ struct page * -balance_classzone(struct zone* classzone, unsigned int gfp_mask, - unsigned int order, int * freed) -{ - struct page * page = NULL; - int __freed = 0; - - BUG_ON(in_interrupt()); - - current->allocation_order = order; - current->flags |= PF_MEMALLOC | PF_FREE_PAGES; - - __freed = try_to_free_pages(classzone, gfp_mask, order); - - current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - - if (current->nr_local_pages) { - struct list_head * entry, * local_pages; - struct page * tmp; - int nr_pages; - - local_pages = ¤t->local_pages; - - if (likely(__freed)) { - /* pick from the last inserted so we're lifo */ - entry = local_pages->next; - do { - tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(page_zone(tmp), classzone)) { - list_del(entry); - page = tmp; - current->nr_local_pages--; - prep_new_page(page); - break; - } - } while ((entry = entry->next) != local_pages); - } - - nr_pages = current->nr_local_pages; - /* free in reverse order so that the global order will be lifo */ - while ((entry = local_pages->prev) != local_pages) { - list_del(entry); - tmp = list_entry(entry, struct page, list); - __free_pages_ok(tmp, tmp->index); - if (!nr_pages--) - BUG(); - } - current->nr_local_pages = 0; - } - *freed = __freed; - return page; -} - /* * This is the 'heart' of the zoned buddy allocator: */ @@ -386,7 +350,8 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, unsigned long min; struct zone **zones, *classzone; struct page * page; - int freed, i; + int cflags; + int i; if (gfp_mask & __GFP_WAIT) 
		might_sleep();
 
@@ -463,9 +428,10 @@ nopage:
 		goto nopage;
 
 	inc_page_state(allocstall);
-	page = balance_classzone(classzone, gfp_mask, order, &freed);
-	if (page)
-		return page;
+	cflags = current->flags;
+	current->flags |= PF_MEMALLOC;
+	try_to_free_pages(classzone, gfp_mask, order);
+	current->flags = cflags;
 
 	/* go through the zonelist yet one more time */
 	min = 1UL << order;
-- 
cgit v1.2.3

From a0e7d495df35797364092fedff52ec488ec702eb Mon Sep 17 00:00:00 2001
From: Neil Brown
Date: Wed, 30 Oct 2002 00:24:57 -0800
Subject: [PATCH] kNFSd: Convert nfsd to use a list of pages instead of one big buffer

This means:
 1/ We don't need an order-4 allocation for each nfsd that starts
 2/ We don't need an order-4 allocation in skb_linearize when we receive
    a 32K write request
 3/ It will be easier to incorporate the zero-copy read changes

The pages are handed around using an xdr_buf (instead of svc_buf) much like
the NFS client so future crypto code can use the same data structure for
both client and server.

The code assumes that most requests and replies fit in a single page. The
exceptions are assumed to have some largish 'data' bit, and the rest must
fit in a single page. The 'data' bits are file data, readdir data, and
symlinks. There must be only one 'data' bit per request. This is all fine
for nfs/nlm.

This isn't complete:
 1/ NFSv4 hasn't been converted yet (it won't compile)
 2/ NFSv3 allows symlinks up to 4096, but the code will only support up to
    about 3800 at the moment
 3/ readdir responses are limited to about 3800.
But I thought that patch was big enough, and the rest can come later.

This patch introduces vfs_readv and vfs_writev as parallels to vfs_read and
vfs_write. This means there is a fair bit of duplication in read_write.c
that should probably be tidied up...
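The resulting reply layout is easiest to see in miniature. The sketch below
mirrors what nfs3svc_encode_readres()/nfssvc_encode_readres() do in the diff:
the RPC header sits in head[0] inside the first page, the file data occupies
whole pages via the page list, and up to three bytes of XDR padding land in
tail[0]; 'p' and 'count' are as in those functions.

/* attach 'count' bytes of page data after the header in head[0] */
rqstp->rq_res.page_base = 0;
rqstp->rq_res.page_len = count;
if (count & 3) {
	/* pad the XDR stream out to a 4-byte boundary */
	rqstp->rq_res.tail[0].iov_base = p;
	*p = 0;
	rqstp->rq_res.tail[0].iov_len = 4 - (count & 3);
}
rqstp->rq_res.len = rqstp->rq_res.head[0].iov_len +
		    rqstp->rq_res.page_len +
		    rqstp->rq_res.tail[0].iov_len;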
--- fs/lockd/xdr.c | 19 ---- fs/lockd/xdr4.c | 20 ----- fs/nfsd/nfs3proc.c | 30 +++---- fs/nfsd/nfs3xdr.c | 86 ++++++++++++------- fs/nfsd/nfscache.c | 42 ++++----- fs/nfsd/nfsproc.c | 24 +++--- fs/nfsd/nfssvc.c | 8 +- fs/nfsd/nfsxdr.c | 86 ++++++++++++------- fs/nfsd/vfs.c | 9 +- fs/read_write.c | 94 ++++++++++++++++++++ include/linux/fs.h | 2 + include/linux/nfsd/cache.h | 4 +- include/linux/nfsd/nfsd.h | 4 +- include/linux/nfsd/xdr.h | 8 +- include/linux/nfsd/xdr3.h | 5 +- include/linux/sunrpc/svc.h | 110 +++++++++++++++++------- kernel/ksyms.c | 2 + net/sunrpc/svc.c | 136 +++++++++++++++++------------ net/sunrpc/svcauth.c | 3 +- net/sunrpc/svcauth_unix.c | 65 +++++++------- net/sunrpc/svcsock.c | 210 ++++++++++++++++++++++++++++----------------- net/sunrpc/xprt.c | 2 +- 22 files changed, 602 insertions(+), 367 deletions(-) (limited to 'kernel') diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 5e87dd2fa59f..3d604168ebf9 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -216,25 +216,6 @@ nlm_encode_testres(u32 *p, struct nlm_res *resp) return p; } -/* - * Check buffer bounds after decoding arguments - */ -static inline int -xdr_argsize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_argbuf; - - return p - buf->base <= buf->buflen; -} - -static inline int -xdr_ressize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_resbuf; - - buf->len = p - buf->base; - return (buf->len <= buf->buflen); -} /* * First, the server side XDR functions diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 479bcdf73c2d..1f11211cbeb2 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -222,26 +222,6 @@ nlm4_encode_testres(u32 *p, struct nlm_res *resp) } -/* - * Check buffer bounds after decoding arguments - */ -static int -xdr_argsize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_argbuf; - - return p - buf->base <= buf->buflen; -} - -static int -xdr_ressize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_resbuf; - - buf->len = p - buf->base; - return (buf->len <= buf->buflen); -} - /* * First, the server side XDR functions */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 61be6186bacf..ede78ddd05ae 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -43,11 +43,11 @@ static int nfs3_ftypes[] = { /* * Reserve room in the send buffer */ -static void -svcbuf_reserve(struct svc_buf *buf, u32 **ptr, int *len, int nr) +static inline void +svcbuf_reserve(struct xdr_buf *buf, u32 **ptr, int *len, int nr) { - *ptr = buf->buf + nr; - *len = buf->buflen - buf->len - nr; + *ptr = (u32*)(buf->head[0].iov_base+buf->head[0].iov_len) + nr; + *len = ((PAGE_SIZE-buf->head[0].iov_len)>>2) - nr; } /* @@ -150,7 +150,7 @@ nfsd3_proc_readlink(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh)); /* Reserve room for status, post_op_attr, and path length */ - svcbuf_reserve(&rqstp->rq_resbuf, &path, &dummy, + svcbuf_reserve(&rqstp->rq_res, &path, &dummy, 1 + NFS3_POST_OP_ATTR_WORDS + 1); /* Read the symlink. 
*/ @@ -167,8 +167,7 @@ static int nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, struct nfsd3_readres *resp) { - u32 * buffer; - int nfserr, avail; + int nfserr; dprintk("nfsd: READ(3) %s %lu bytes at %lu\n", SVCFH_fmt(&argp->fh), @@ -179,18 +178,17 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) * + 1 (xdr opaque byte count) = 26 */ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &avail, - 1 + NFS3_POST_OP_ATTR_WORDS + 3); + resp->count = argp->count; - if ((avail << 2) < resp->count) - resp->count = avail << 2; + if (NFSSVC_MAXBLKSIZE < resp->count) + resp->count = NFSSVC_MAXBLKSIZE; - svc_reserve(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + argp->count +4); + svc_reserve(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); fh_copy(&resp->fh, &argp->fh); nfserr = nfsd_read(rqstp, &resp->fh, argp->offset, - (char *) buffer, + argp->vec, argp->vlen, &resp->count); if (nfserr == 0) { struct inode *inode = resp->fh.fh_dentry->d_inode; @@ -220,7 +218,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, resp->committed = argp->stable; nfserr = nfsd_write(rqstp, &resp->fh, argp->offset, - argp->data, + argp->vec, argp->vlen, argp->len, &resp->committed); resp->count = argp->count; @@ -447,7 +445,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, argp->count, (u32) argp->cookie); /* Reserve buffer space for status, attributes and verifier */ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &count, + svcbuf_reserve(&rqstp->rq_res, &buffer, &count, 1 + NFS3_POST_OP_ATTR_WORDS + 2); /* Make sure we've room for the NULL ptr & eof flag, and shrink to @@ -490,7 +488,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, argp->count, (u32) argp->cookie); /* Reserve buffer space for status, attributes and verifier */ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &count, + svcbuf_reserve(&rqstp->rq_res, &buffer, &count, 1 + NFS3_POST_OP_ATTR_WORDS + 2); /* Make sure we've room for the NULL ptr & eof flag, and shrink to diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9eeba9f3291d..963bf3c7bf1d 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -269,27 +270,6 @@ encode_wcc_data(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) return encode_post_op_attr(rqstp, p, fhp); } -/* - * Check buffer bounds after decoding arguments - */ -static inline int -xdr_argsize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_argbuf; - - return p - buf->base <= buf->buflen; -} - -static inline int -xdr_ressize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_resbuf; - - buf->len = p - buf->base; - dprintk("nfsd: ressize_check p %p base %p len %d\n", - p, buf->base, buf->buflen); - return (buf->len <= buf->buflen); -} /* * XDR decode functions @@ -342,11 +322,29 @@ int nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p, struct nfsd3_readargs *args) { + int len; + int v,pn; + if (!(p = decode_fh(p, &args->fh)) || !(p = xdr_decode_hyper(p, &args->offset))) return 0; - args->count = ntohl(*p++); + len = args->count = ntohl(*p++); + + if (len > NFSSVC_MAXBLKSIZE) + len = NFSSVC_MAXBLKSIZE; + + /* set up the iovec */ + v=0; + while (len > 0) { + pn = rqstp->rq_resused; + take_page(rqstp); + args->vec[v].iov_base = page_address(rqstp->rq_respages[pn]); + args->vec[v].iov_len = len < PAGE_SIZE? 
len : PAGE_SIZE; + v++; + len -= PAGE_SIZE; + } + args->vlen = v; return xdr_argsize_check(rqstp, p); } @@ -354,17 +352,33 @@ int nfs3svc_decode_writeargs(struct svc_rqst *rqstp, u32 *p, struct nfsd3_writeargs *args) { + int len, v; + if (!(p = decode_fh(p, &args->fh)) || !(p = xdr_decode_hyper(p, &args->offset))) return 0; args->count = ntohl(*p++); args->stable = ntohl(*p++); - args->len = ntohl(*p++); - args->data = (char *) p; - p += XDR_QUADLEN(args->len); + len = args->len = ntohl(*p++); + + args->vec[0].iov_base = (void*)p; + args->vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - + (((void*)p) - rqstp->rq_arg.head[0].iov_base); + + if (len > NFSSVC_MAXBLKSIZE) + len = NFSSVC_MAXBLKSIZE; + v= 0; + while (len > args->vec[v].iov_len) { + len -= args->vec[v].iov_len; + v++; + args->vec[v].iov_base = page_address(rqstp->rq_argpages[v]); + args->vec[v].iov_len = PAGE_SIZE; + } + args->vec[v].iov_len = len; + args->vlen = v+1; - return xdr_argsize_check(rqstp, p); + return args->count == args->len && args->vec[0].iov_len > 0; } int @@ -584,9 +598,23 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p, *p++ = htonl(resp->count); *p++ = htonl(resp->eof); *p++ = htonl(resp->count); /* xdr opaque count */ - p += XDR_QUADLEN(resp->count); - } - return xdr_ressize_check(rqstp, p); + xdr_ressize_check(rqstp, p); + /* now update rqstp->rq_res to reflect data aswell */ + rqstp->rq_res.page_base = 0; + rqstp->rq_res.page_len = resp->count; + if (resp->count & 3) { + /* need to page with tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3); + } + rqstp->rq_res.len = + rqstp->rq_res.head[0].iov_len+ + rqstp->rq_res.page_len+ + rqstp->rq_res.tail[0].iov_len; + return 1; + } else + return xdr_ressize_check(rqstp, p); } /* WRITE */ diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index ab52b4b100f5..b1ae27ee05ba 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -41,7 +41,7 @@ static struct svc_cacherep * lru_tail; static struct svc_cacherep * nfscache; static int cache_disabled = 1; -static int nfsd_cache_append(struct svc_rqst *rqstp, struct svc_buf *data); +static int nfsd_cache_append(struct svc_rqst *rqstp, struct iovec *vec); /* * locking for the reply cache: @@ -107,7 +107,7 @@ nfsd_cache_shutdown(void) for (rp = lru_head; rp; rp = rp->c_lru_next) { if (rp->c_state == RC_DONE && rp->c_type == RC_REPLBUFF) - kfree(rp->c_replbuf.buf); + kfree(rp->c_replvec.iov_base); } cache_disabled = 1; @@ -242,8 +242,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type) /* release any buffer */ if (rp->c_type == RC_REPLBUFF) { - kfree(rp->c_replbuf.buf); - rp->c_replbuf.buf = NULL; + kfree(rp->c_replvec.iov_base); + rp->c_replvec.iov_base = NULL; } rp->c_type = RC_NOCACHE; out: @@ -272,11 +272,11 @@ found_entry: case RC_NOCACHE: break; case RC_REPLSTAT: - svc_putu32(&rqstp->rq_resbuf, rp->c_replstat); + svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat); rtn = RC_REPLY; break; case RC_REPLBUFF: - if (!nfsd_cache_append(rqstp, &rp->c_replbuf)) + if (!nfsd_cache_append(rqstp, &rp->c_replvec)) goto out; /* should not happen */ rtn = RC_REPLY; break; @@ -308,13 +308,14 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, u32 *statp) { struct svc_cacherep *rp; - struct svc_buf *resp = &rqstp->rq_resbuf, *cachp; + struct iovec *resv = &rqstp->rq_res.head[0], *cachv; int len; if (!(rp = rqstp->rq_cacherep) || cache_disabled) return; - len = resp->len - (statp - resp->base); + len = resv->iov_len - ((char*)statp - 
(char*)resv->iov_base); + len >>= 2; /* Don't cache excessive amounts of data and XDR failures */ if (!statp || len > (256 >> 2)) { @@ -329,16 +330,16 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, u32 *statp) rp->c_replstat = *statp; break; case RC_REPLBUFF: - cachp = &rp->c_replbuf; - cachp->buf = (u32 *) kmalloc(len << 2, GFP_KERNEL); - if (!cachp->buf) { + cachv = &rp->c_replvec; + cachv->iov_base = kmalloc(len << 2, GFP_KERNEL); + if (!cachv->iov_base) { spin_lock(&cache_lock); rp->c_state = RC_UNUSED; spin_unlock(&cache_lock); return; } - cachp->len = len; - memcpy(cachp->buf, statp, len << 2); + cachv->iov_len = len << 2; + memcpy(cachv->iov_base, statp, len << 2); break; } spin_lock(&cache_lock); @@ -353,19 +354,20 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, u32 *statp) /* * Copy cached reply to current reply buffer. Should always fit. + * FIXME as reply is in a page, we should just attach the page, and + * keep a refcount.... */ static int -nfsd_cache_append(struct svc_rqst *rqstp, struct svc_buf *data) +nfsd_cache_append(struct svc_rqst *rqstp, struct iovec *data) { - struct svc_buf *resp = &rqstp->rq_resbuf; + struct iovec *vec = &rqstp->rq_res.head[0]; - if (resp->len + data->len > resp->buflen) { + if (vec->iov_len + data->iov_len > PAGE_SIZE) { printk(KERN_WARNING "nfsd: cached reply too large (%d).\n", - data->len); + data->iov_len); return 0; } - memcpy(resp->buf, data->buf, data->len << 2); - resp->buf += data->len; - resp->len += data->len; + memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len); + vec->iov_len += data->iov_len; return 1; } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 06c4326e469b..997400e1105a 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -30,11 +30,11 @@ typedef struct svc_buf svc_buf; #define NFSDDBG_FACILITY NFSDDBG_PROC -static void -svcbuf_reserve(struct svc_buf *buf, u32 **ptr, int *len, int nr) +static inline void +svcbuf_reserve(struct xdr_buf *buf, u32 **ptr, int *len, int nr) { - *ptr = buf->buf + nr; - *len = buf->buflen - buf->len - nr; + *ptr = (u32*)(buf->head[0].iov_base+buf->head[0].iov_len) + nr; + *len = ((PAGE_SIZE-buf->head[0].iov_len)>>2) - nr; } static int @@ -109,7 +109,7 @@ nfsd_proc_readlink(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh)); /* Reserve room for status and path length */ - svcbuf_reserve(&rqstp->rq_resbuf, &path, &dummy, 2); + svcbuf_reserve(&rqstp->rq_res, &path, &dummy, 2); /* Read the symlink. */ resp->len = NFS_MAXPATHLEN; @@ -127,8 +127,7 @@ static int nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp, struct nfsd_readres *resp) { - u32 * buffer; - int nfserr, avail; + int nfserr; dprintk("nfsd: READ %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -137,22 +136,21 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp, /* Obtain buffer pointer for payload. 19 is 1 word for * status, 17 words for fattr, and 1 word for the byte count. 
*/ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &avail, 19); - if ((avail << 2) < argp->count) { + if (NFSSVC_MAXBLKSIZE < argp->count) { printk(KERN_NOTICE "oversized read request from %08x:%d (%d bytes)\n", ntohl(rqstp->rq_addr.sin_addr.s_addr), ntohs(rqstp->rq_addr.sin_port), argp->count); - argp->count = avail << 2; + argp->count = NFSSVC_MAXBLKSIZE; } svc_reserve(rqstp, (19<<2) + argp->count + 4); resp->count = argp->count; nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, - (char *) buffer, + argp->vec, argp->vlen, &resp->count); return nfserr; @@ -175,7 +173,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, - argp->data, + argp->vec, argp->vlen, argp->len, &stable); return nfserr; @@ -478,7 +476,7 @@ nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp, argp->count, argp->cookie); /* Reserve buffer space for status */ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &count, 1); + svcbuf_reserve(&rqstp->rq_res, &buffer, &count, 1); /* Shrink to the client read size */ if (count > (argp->count >> 2)) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a12f2c980865..da4271183ef7 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -277,7 +277,8 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp) /* Decode arguments */ xdr = proc->pc_decode; - if (xdr && !xdr(rqstp, rqstp->rq_argbuf.buf, rqstp->rq_argp)) { + if (xdr && !xdr(rqstp, (u32*)rqstp->rq_arg.head[0].iov_base, + rqstp->rq_argp)) { dprintk("nfsd: failed to decode arguments!\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); *statp = rpc_garbage_args; @@ -293,14 +294,15 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp) } if (rqstp->rq_proc != 0) - svc_putu32(&rqstp->rq_resbuf, nfserr); + svc_putu32(&rqstp->rq_res.head[0], nfserr); /* Encode result. * For NFSv2, additional info is never returned in case of an error. */ if (!(nfserr && rqstp->rq_vers == 2)) { xdr = proc->pc_encode; - if (xdr && !xdr(rqstp, rqstp->rq_resbuf.buf, rqstp->rq_resp)) { + if (xdr && !xdr(rqstp, (u32*)(rqstp->rq_res.head[0].iov_base+rqstp->rq_res.head[0].iov_len), + rqstp->rq_resp)) { /* Failed to encode result. 
Release cache entry */ dprintk("nfsd: failed to encode result!\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 15f1c7a16031..d0895793efb1 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -14,6 +14,7 @@ #include #include #include +#include #define NFSDDBG_FACILITY NFSDDBG_XDR @@ -176,27 +177,6 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) return p; } -/* - * Check buffer bounds after decoding arguments - */ -static inline int -xdr_argsize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_argbuf; - - return p - buf->base <= buf->buflen; -} - -static inline int -xdr_ressize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_resbuf; - - buf->len = p - buf->base; - dprintk("nfsd: ressize_check p %p base %p len %d\n", - p, buf->base, buf->buflen); - return (buf->len <= buf->buflen); -} /* * XDR decode functions @@ -241,13 +221,31 @@ int nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p, struct nfsd_readargs *args) { + int len; + int v,pn; if (!(p = decode_fh(p, &args->fh))) return 0; args->offset = ntohl(*p++); - args->count = ntohl(*p++); - args->totalsize = ntohl(*p++); + len = args->count = ntohl(*p++); + p++; /* totalcount - unused */ + + if (len > NFSSVC_MAXBLKSIZE) + len = NFSSVC_MAXBLKSIZE; + /* set up somewhere to store response. + * We take pages, put them on reslist and include in iovec + */ + v=0; + while (len > 0) { + pn=rqstp->rq_resused; + take_page(rqstp); + args->vec[v].iov_base = page_address(rqstp->rq_respages[pn]); + args->vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE; + v++; + len -= PAGE_SIZE; + } + args->vlen = v; return xdr_argsize_check(rqstp, p); } @@ -255,17 +253,30 @@ int nfssvc_decode_writeargs(struct svc_rqst *rqstp, u32 *p, struct nfsd_writeargs *args) { + int len; + int v; if (!(p = decode_fh(p, &args->fh))) return 0; p++; /* beginoffset */ args->offset = ntohl(*p++); /* offset */ p++; /* totalcount */ - args->len = ntohl(*p++); - args->data = (char *) p; - p += XDR_QUADLEN(args->len); - - return xdr_argsize_check(rqstp, p); + len = args->len = ntohl(*p++); + args->vec[0].iov_base = (void*)p; + args->vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - + (((void*)p) - rqstp->rq_arg.head[0].iov_base); + if (len > NFSSVC_MAXBLKSIZE) + len = NFSSVC_MAXBLKSIZE; + v = 0; + while (len > args->vec[v].iov_len) { + len -= args->vec[v].iov_len; + v++; + args->vec[v].iov_base = page_address(rqstp->rq_argpages[v]); + args->vec[v].iov_len = PAGE_SIZE; + } + args->vec[v].iov_len = len; + args->vlen = v+1; + return args->vec[0].iov_len > 0; } int @@ -371,9 +382,22 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p, { p = encode_fattr(rqstp, p, &resp->fh); *p++ = htonl(resp->count); - p += XDR_QUADLEN(resp->count); - - return xdr_ressize_check(rqstp, p); + xdr_ressize_check(rqstp, p); + + /* now update rqstp->rq_res to reflect data aswell */ + rqstp->rq_res.page_base = 0; + rqstp->rq_res.page_len = resp->count; + if (resp->count & 3) { + /* need to pad with tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3); + } + rqstp->rq_res.len = + rqstp->rq_res.head[0].iov_len+ + rqstp->rq_res.page_len+ + rqstp->rq_res.tail[0].iov_len; + return 1; } int diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index cb06e914d548..76ad1349e5b9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -577,7 +577,7 @@ found: */ int nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, - char *buf, 
unsigned long *count) + struct iovec *vec, int vlen, unsigned long *count) { struct raparms *ra; mm_segment_t oldfs; @@ -603,7 +603,7 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, oldfs = get_fs(); set_fs(KERNEL_DS); - err = vfs_read(&file, buf, *count, &offset); + err = vfs_readv(&file, vec, vlen, *count, &offset); set_fs(oldfs); /* Write back readahead params */ @@ -629,7 +629,8 @@ out: */ int nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, - char *buf, unsigned long cnt, int *stablep) + struct iovec *vec, int vlen, + unsigned long cnt, int *stablep) { struct svc_export *exp; struct file file; @@ -677,7 +678,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, /* Write the data. */ oldfs = get_fs(); set_fs(KERNEL_DS); - err = vfs_write(&file, buf, cnt, &offset); + err = vfs_writev(&file, vec, vlen, cnt, &offset); if (err >= 0) nfsdstats.io_write += cnt; set_fs(oldfs); diff --git a/fs/read_write.c b/fs/read_write.c index a8b23e6367ee..a773421cb6f7 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -207,6 +207,53 @@ ssize_t vfs_read(struct file *file, char *buf, size_t count, loff_t *pos) return ret; } +ssize_t vfs_readv(struct file *file, struct iovec *vec, int vlen, size_t count, loff_t *pos) +{ + struct inode *inode = file->f_dentry->d_inode; + ssize_t ret; + + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read)) + return -EINVAL; + + ret = locks_verify_area(FLOCK_VERIFY_READ, inode, file, *pos, count); + if (!ret) { + ret = security_ops->file_permission (file, MAY_READ); + if (!ret) { + if (file->f_op->readv) + ret = file->f_op->readv(file, vec, vlen, pos); + else { + /* do it by hand */ + struct iovec *vector = vec; + ret = 0; + while (vlen > 0) { + void * base = vector->iov_base; + size_t len = vector->iov_len; + ssize_t nr; + vector++; + vlen--; + if (file->f_op->read) + nr = file->f_op->read(file, base, len, pos); + else + nr = do_sync_read(file, base, len, pos); + if (nr < 0) { + if (!ret) ret = nr; + break; + } + ret += nr; + if (nr != len) + break; + } + } + if (ret > 0) + dnotify_parent(file->f_dentry, DN_ACCESS); + } + } + + return ret; +} + ssize_t do_sync_write(struct file *filp, const char *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; @@ -247,6 +294,53 @@ ssize_t vfs_write(struct file *file, const char *buf, size_t count, loff_t *pos) return ret; } +ssize_t vfs_writev(struct file *file, const struct iovec *vec, int vlen, size_t count, loff_t *pos) +{ + struct inode *inode = file->f_dentry->d_inode; + ssize_t ret; + + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) + return -EINVAL; + + ret = locks_verify_area(FLOCK_VERIFY_WRITE, inode, file, *pos, count); + if (!ret) { + ret = security_ops->file_permission (file, MAY_WRITE); + if (!ret) { + if (file->f_op->writev) + ret = file->f_op->writev(file, vec, vlen, pos); + else { + /* do it by hand */ + const struct iovec *vector = vec; + ret = 0; + while (vlen > 0) { + void * base = vector->iov_base; + size_t len = vector->iov_len; + ssize_t nr; + vector++; + vlen--; + if (file->f_op->write) + nr = file->f_op->write(file, base, len, pos); + else + nr = do_sync_write(file, base, len, pos); + if (nr < 0) { + if (!ret) ret = nr; + break; + } + ret += nr; + if (nr != len) + break; + } + } + if (ret > 0) + dnotify_parent(file->f_dentry, DN_MODIFY); + } + } + + return ret; +} + asmlinkage ssize_t sys_read(unsigned int fd, 
char * buf, size_t count) { struct file *file; diff --git a/include/linux/fs.h b/include/linux/fs.h index 0adb782c7ac7..9a3e78ba7592 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -795,6 +795,8 @@ struct seq_file; extern ssize_t vfs_read(struct file *, char *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char *, size_t, loff_t *); +extern ssize_t vfs_readv(struct file *, struct iovec *, int, size_t, loff_t *); +extern ssize_t vfs_writev(struct file *, const struct iovec *, int, size_t, loff_t *); /* * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called diff --git a/include/linux/nfsd/cache.h b/include/linux/nfsd/cache.h index ae2da13bed23..b780f9635930 100644 --- a/include/linux/nfsd/cache.h +++ b/include/linux/nfsd/cache.h @@ -32,12 +32,12 @@ struct svc_cacherep { u32 c_vers; unsigned long c_timestamp; union { - struct svc_buf u_buffer; + struct iovec u_vec; u32 u_status; } c_u; }; -#define c_replbuf c_u.u_buffer +#define c_replvec c_u.u_vec #define c_replstat c_u.u_status /* cache entry states */ diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 94fc6231004a..1b8b01067391 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -88,9 +88,9 @@ int nfsd_open(struct svc_rqst *, struct svc_fh *, int, int, struct file *); void nfsd_close(struct file *); int nfsd_read(struct svc_rqst *, struct svc_fh *, - loff_t, char *, unsigned long *); + loff_t, struct iovec *,int, unsigned long *); int nfsd_write(struct svc_rqst *, struct svc_fh *, - loff_t, char *, unsigned long, int *); + loff_t, struct iovec *,int, unsigned long, int *); int nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); int nfsd_symlink(struct svc_rqst *, struct svc_fh *, diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h index dc6f850f3622..97078834e430 100644 --- a/include/linux/nfsd/xdr.h +++ b/include/linux/nfsd/xdr.h @@ -29,16 +29,16 @@ struct nfsd_readargs { struct svc_fh fh; __u32 offset; __u32 count; - __u32 totalsize; + struct iovec vec[RPCSVC_MAXPAGES]; + int vlen; }; struct nfsd_writeargs { svc_fh fh; - __u32 beginoffset; __u32 offset; - __u32 totalcount; - __u8 * data; int len; + struct iovec vec[RPCSVC_MAXPAGES]; + int vlen; }; struct nfsd_createargs { diff --git a/include/linux/nfsd/xdr3.h b/include/linux/nfsd/xdr3.h index 83ec5bc2b542..1576a6db4a17 100644 --- a/include/linux/nfsd/xdr3.h +++ b/include/linux/nfsd/xdr3.h @@ -33,6 +33,8 @@ struct nfsd3_readargs { struct svc_fh fh; __u64 offset; __u32 count; + struct iovec vec[RPCSVC_MAXPAGES]; + int vlen; }; struct nfsd3_writeargs { @@ -40,8 +42,9 @@ struct nfsd3_writeargs { __u64 offset; __u32 count; int stable; - __u8 * data; int len; + struct iovec vec[RPCSVC_MAXPAGES]; + int vlen; }; struct nfsd3_createargs { diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 9ad879d9bea7..24464d66411a 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -48,43 +48,49 @@ struct svc_serv { * This is use to determine the max number of pages nfsd is * willing to return in a single READ operation. */ -#define RPCSVC_MAXPAYLOAD 16384u +#define RPCSVC_MAXPAYLOAD (64*1024u) /* - * Buffer to store RPC requests or replies in. - * Each server thread has one of these beasts. + * RPC Requsts and replies are stored in one or more pages. + * We maintain an array of pages for each server thread. + * Requests are copied into these pages as they arrive. Remaining + * pages are available to write the reply into. 
* - * Area points to the allocated memory chunk currently owned by the - * buffer. Base points to the buffer containing the request, which is - * different from area when directly reading from an sk_buff. buf is - * the current read/write position while processing an RPC request. + * Currently pages are all re-used by the same server. Later we + * will use ->sendpage to transmit pages with reduced copying. In + * that case we will need to give away the page and allocate new ones. + * In preparation for this, we explicitly move pages off the recv + * list onto the transmit list, and back. * - * The array of iovecs can hold additional data that the server process - * may not want to copy into the RPC reply buffer, but pass to the - * network sendmsg routines directly. The prime candidate for this - * will of course be NFS READ operations, but one might also want to - * do something about READLINK and READDIR. It might be worthwhile - * to implement some generic readdir cache in the VFS layer... + * We use xdr_buf for holding responses as it fits well with NFS + * read responses (that have a header, and some data pages, and possibly + * a tail) and means we can share some client side routines. * - * On the receiving end of the RPC server, the iovec may be used to hold - * the list of IP fragments once we get to process fragmented UDP - * datagrams directly. + * The xdr_buf.head iovec always points to the first page in the rq_*pages + * list. The xdr_buf.pages pointer points to the second page on that + * list. xdr_buf.tail points to the end of the first page. + * This assumes that the non-page part of an rpc reply will fit + * in a page - NFSd ensures this. lockd also has no trouble. */ -#define RPCSVC_MAXIOV ((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE + 1) -struct svc_buf { - u32 * area; /* allocated memory */ - u32 * base; /* base of RPC datagram */ - int buflen; /* total length of buffer */ - u32 * buf; /* read/write pointer */ - int len; /* current end of buffer */ - - /* iovec for zero-copy NFS READs */ - struct iovec iov[RPCSVC_MAXIOV]; - int nriov; -}; -#define svc_getu32(argp, val) { (val) = *(argp)->buf++; (argp)->len--; } -#define svc_putu32(resp, val) { *(resp)->buf++ = (val); (resp)->len++; } +#define RPCSVC_MAXPAGES ((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE + 1) + +static inline u32 svc_getu32(struct iovec *iov) +{ + u32 val, *vp; + vp = iov->iov_base; + val = *vp++; + iov->iov_base = (void*)vp; + iov->iov_len -= sizeof(u32); + return val; +} +static inline void svc_putu32(struct iovec *iov, u32 val) +{ + u32 *vp = iov->iov_base + iov->iov_len; + *vp = val; + iov->iov_len += sizeof(u32); +} + /* * The context of a single thread, including the request currently being * processed. 
@@ -102,9 +108,15 @@ struct svc_rqst { struct svc_cred rq_cred; /* auth info */ struct sk_buff * rq_skbuff; /* fast recv inet buffer */ struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ - struct svc_buf rq_defbuf; /* default buffer */ - struct svc_buf rq_argbuf; /* argument buffer */ - struct svc_buf rq_resbuf; /* result buffer */ + + struct xdr_buf rq_arg; + struct xdr_buf rq_res; + struct page * rq_argpages[RPCSVC_MAXPAGES]; + struct page * rq_respages[RPCSVC_MAXPAGES]; + short rq_argused; /* pages used for argument */ + short rq_arghi; /* pages available in argument page list */ + short rq_resused; /* pages used for result */ + u32 rq_xid; /* transmission id */ u32 rq_prog; /* program number */ u32 rq_vers; /* program version */ @@ -136,6 +148,38 @@ struct svc_rqst { wait_queue_head_t rq_wait; /* synchronization */ }; +/* + * Check buffer bounds after decoding arguments + */ +static inline int +xdr_argsize_check(struct svc_rqst *rqstp, u32 *p) +{ + char *cp = (char *)p; + struct iovec *vec = &rqstp->rq_arg.head[0]; + return cp - (char*)vec->iov_base <= vec->iov_len; +} + +static inline int +xdr_ressize_check(struct svc_rqst *rqstp, u32 *p) +{ + struct iovec *vec = &rqstp->rq_res.head[0]; + char *cp = (char*)p; + + vec->iov_len = cp - (char*)vec->iov_base; + rqstp->rq_res.len = vec->iov_len; + + return vec->iov_len <= PAGE_SIZE; +} + +static int inline take_page(struct svc_rqst *rqstp) +{ + if (rqstp->rq_arghi <= rqstp->rq_argused) + return -ENOMEM; + rqstp->rq_respages[rqstp->rq_resused++] = + rqstp->rq_argpages[--rqstp->rq_arghi]; + return 0; +} + struct svc_deferred_req { struct svc_serv *serv; u32 prot; /* protocol (UDP or TCP) */ diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 7ecffcd552d1..6628b24ba827 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -253,7 +253,9 @@ EXPORT_SYMBOL(find_inode_number); EXPORT_SYMBOL(is_subdir); EXPORT_SYMBOL(get_unused_fd); EXPORT_SYMBOL(vfs_read); +EXPORT_SYMBOL(vfs_readv); EXPORT_SYMBOL(vfs_write); +EXPORT_SYMBOL(vfs_writev); EXPORT_SYMBOL(vfs_create); EXPORT_SYMBOL(vfs_mkdir); EXPORT_SYMBOL(vfs_mknod); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 0296128c3cbc..60cdc3cdb300 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -35,7 +36,6 @@ svc_create(struct svc_program *prog, unsigned int bufsize) if (!(serv = (struct svc_serv *) kmalloc(sizeof(*serv), GFP_KERNEL))) return NULL; - memset(serv, 0, sizeof(*serv)); serv->sv_program = prog; serv->sv_nrthreads = 1; @@ -105,35 +105,42 @@ svc_destroy(struct svc_serv *serv) } /* - * Allocate an RPC server buffer - * Later versions may do nifty things by allocating multiple pages - * of memory directly and putting them into the bufp->iov. + * Allocate an RPC server's buffer space. + * We allocate pages and place them in rq_argpages. 
*/ -int -svc_init_buffer(struct svc_buf *bufp, unsigned int size) +static int +svc_init_buffer(struct svc_rqst *rqstp, unsigned int size) { - if (!(bufp->area = (u32 *) kmalloc(size, GFP_KERNEL))) - return 0; - bufp->base = bufp->area; - bufp->buf = bufp->area; - bufp->len = 0; - bufp->buflen = size >> 2; - - bufp->iov[0].iov_base = bufp->area; - bufp->iov[0].iov_len = size; - bufp->nriov = 1; - - return 1; + int pages = 2 + (size+ PAGE_SIZE -1) / PAGE_SIZE; + int arghi; + + rqstp->rq_argused = 0; + rqstp->rq_resused = 0; + arghi = 0; + if (pages > RPCSVC_MAXPAGES) + BUG(); + while (pages) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) + break; + rqstp->rq_argpages[arghi++] = p; + pages--; + } + rqstp->rq_arghi = arghi; + return ! pages; } /* * Release an RPC server buffer */ -void -svc_release_buffer(struct svc_buf *bufp) +static void +svc_release_buffer(struct svc_rqst *rqstp) { - kfree(bufp->area); - bufp->area = 0; + while (rqstp->rq_arghi) + put_page(rqstp->rq_argpages[--rqstp->rq_arghi]); + while (rqstp->rq_resused) + put_page(rqstp->rq_respages[--rqstp->rq_resused]); + rqstp->rq_argused = 0; } /* @@ -154,7 +161,7 @@ svc_create_thread(svc_thread_fn func, struct svc_serv *serv) if (!(rqstp->rq_argp = (u32 *) kmalloc(serv->sv_xdrsize, GFP_KERNEL)) || !(rqstp->rq_resp = (u32 *) kmalloc(serv->sv_xdrsize, GFP_KERNEL)) - || !svc_init_buffer(&rqstp->rq_defbuf, serv->sv_bufsz)) + || !svc_init_buffer(rqstp, serv->sv_bufsz)) goto out_thread; serv->sv_nrthreads++; @@ -180,7 +187,7 @@ svc_exit_thread(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; - svc_release_buffer(&rqstp->rq_defbuf); + svc_release_buffer(rqstp); if (rqstp->rq_resp) kfree(rqstp->rq_resp); if (rqstp->rq_argp) @@ -242,37 +249,51 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) struct svc_program *progp; struct svc_version *versp = NULL; /* compiler food */ struct svc_procedure *procp = NULL; - struct svc_buf * argp = &rqstp->rq_argbuf; - struct svc_buf * resp = &rqstp->rq_resbuf; + struct iovec * argv = &rqstp->rq_arg.head[0]; + struct iovec * resv = &rqstp->rq_res.head[0]; kxdrproc_t xdr; - u32 *bufp, *statp; + u32 *statp; u32 dir, prog, vers, proc, auth_stat, rpc_stat; rpc_stat = rpc_success; - bufp = argp->buf; - if (argp->len < 5) + if (argv->iov_len < 6*4) goto err_short_len; - dir = ntohl(*bufp++); - vers = ntohl(*bufp++); + /* setup response xdr_buf. + * Initially it has just one page + */ + take_page(rqstp); /* must succeed */ + resv->iov_base = page_address(rqstp->rq_respages[0]); + resv->iov_len = 0; + rqstp->rq_res.pages = rqstp->rq_respages+1; + rqstp->rq_res.len = 0; + rqstp->rq_res.page_base = 0; + rqstp->rq_res.page_len = 0; + /* tcp needs a space for the record length... 
*/ + if (rqstp->rq_prot == IPPROTO_TCP) + svc_putu32(resv, 0); + + rqstp->rq_xid = svc_getu32(argv); + svc_putu32(resv, rqstp->rq_xid); + + dir = ntohl(svc_getu32(argv)); + vers = ntohl(svc_getu32(argv)); /* First words of reply: */ - svc_putu32(resp, xdr_one); /* REPLY */ - svc_putu32(resp, xdr_zero); /* ACCEPT */ + svc_putu32(resv, xdr_one); /* REPLY */ if (dir != 0) /* direction != CALL */ goto err_bad_dir; if (vers != 2) /* RPC version number */ goto err_bad_rpc; - rqstp->rq_prog = prog = ntohl(*bufp++); /* program number */ - rqstp->rq_vers = vers = ntohl(*bufp++); /* version number */ - rqstp->rq_proc = proc = ntohl(*bufp++); /* procedure number */ + svc_putu32(resv, xdr_zero); /* ACCEPT */ - argp->buf += 5; - argp->len -= 5; + rqstp->rq_prog = prog = ntohl(svc_getu32(argv)); /* program number */ + rqstp->rq_vers = vers = ntohl(svc_getu32(argv)); /* version number */ + rqstp->rq_proc = proc = ntohl(svc_getu32(argv)); /* procedure number */ /* * Decode auth data, and add verifier to reply buffer. @@ -307,8 +328,8 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) serv->sv_stats->rpccnt++; /* Build the reply header. */ - statp = resp->buf; - svc_putu32(resp, rpc_success); /* RPC_SUCCESS */ + statp = resv->iov_base +resv->iov_len; + svc_putu32(resv, rpc_success); /* RPC_SUCCESS */ /* Bump per-procedure stats counter */ procp->pc_count++; @@ -327,14 +348,14 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) if (!versp->vs_dispatch) { /* Decode arguments */ xdr = procp->pc_decode; - if (xdr && !xdr(rqstp, rqstp->rq_argbuf.buf, rqstp->rq_argp)) + if (xdr && !xdr(rqstp, argv->iov_base, rqstp->rq_argp)) goto err_garbage; *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); /* Encode reply */ if (*statp == rpc_success && (xdr = procp->pc_encode) - && !xdr(rqstp, rqstp->rq_resbuf.buf, rqstp->rq_resp)) { + && !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { dprintk("svc: failed to encode reply\n"); /* serv->sv_stats->rpcsystemerr++; */ *statp = rpc_system_err; @@ -347,7 +368,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) /* Check RPC status result */ if (*statp != rpc_success) - resp->len = statp + 1 - resp->base; + resv->iov_len = ((void*)statp) - resv->iov_base + 4; /* Release reply info */ if (procp->pc_release) @@ -369,7 +390,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) err_short_len: #ifdef RPC_PARANOIA - printk("svc: short len %d, dropping request\n", argp->len); + printk("svc: short len %d, dropping request\n", argv->iov_len); #endif goto dropit; /* drop request */ @@ -382,18 +403,19 @@ err_bad_dir: err_bad_rpc: serv->sv_stats->rpcbadfmt++; - resp->buf[-1] = xdr_one; /* REJECT */ - svc_putu32(resp, xdr_zero); /* RPC_MISMATCH */ - svc_putu32(resp, xdr_two); /* Only RPCv2 supported */ - svc_putu32(resp, xdr_two); + svc_putu32(resv, xdr_one); /* REJECT */ + svc_putu32(resv, xdr_zero); /* RPC_MISMATCH */ + svc_putu32(resv, xdr_two); /* Only RPCv2 supported */ + svc_putu32(resv, xdr_two); goto sendit; err_bad_auth: dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat)); serv->sv_stats->rpcbadauth++; - resp->buf[-1] = xdr_one; /* REJECT */ - svc_putu32(resp, xdr_one); /* AUTH_ERROR */ - svc_putu32(resp, auth_stat); /* status */ + resv->iov_len -= 4; + svc_putu32(resv, xdr_one); /* REJECT */ + svc_putu32(resv, xdr_one); /* AUTH_ERROR */ + svc_putu32(resv, auth_stat); /* status */ goto sendit; err_bad_prog: @@ -403,7 +425,7 @@ err_bad_prog: /* else it is just a Solaris client seeing if ACLs are supported */ 
#endif serv->sv_stats->rpcbadfmt++; - svc_putu32(resp, rpc_prog_unavail); + svc_putu32(resv, rpc_prog_unavail); goto sendit; err_bad_vers: @@ -411,9 +433,9 @@ err_bad_vers: printk("svc: unknown version (%d)\n", vers); #endif serv->sv_stats->rpcbadfmt++; - svc_putu32(resp, rpc_prog_mismatch); - svc_putu32(resp, htonl(progp->pg_lovers)); - svc_putu32(resp, htonl(progp->pg_hivers)); + svc_putu32(resv, rpc_prog_mismatch); + svc_putu32(resv, htonl(progp->pg_lovers)); + svc_putu32(resv, htonl(progp->pg_hivers)); goto sendit; err_bad_proc: @@ -421,7 +443,7 @@ err_bad_proc: printk("svc: unknown procedure (%d)\n", proc); #endif serv->sv_stats->rpcbadfmt++; - svc_putu32(resp, rpc_proc_unavail); + svc_putu32(resv, rpc_proc_unavail); goto sendit; err_garbage: @@ -429,6 +451,6 @@ err_garbage: printk("svc: failed to decode args\n"); #endif serv->sv_stats->rpcbadfmt++; - svc_putu32(resp, rpc_garbage_args); + svc_putu32(resv, rpc_garbage_args); goto sendit; } diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 5e68c0c270ca..39a46f7a12f5 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -40,8 +40,7 @@ svc_authenticate(struct svc_rqst *rqstp, u32 *statp, u32 *authp, int proc) *statp = rpc_success; *authp = rpc_auth_ok; - svc_getu32(&rqstp->rq_argbuf, flavor); - flavor = ntohl(flavor); + flavor = ntohl(svc_getu32(&rqstp->rq_arg.head[0])); dprintk("svc: svc_authenticate (%d)\n", flavor); if (flavor >= RPC_AUTH_MAXFLAVOR || !(aops = authtab[flavor])) { diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 7fabce411088..37e74850f362 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -295,20 +295,20 @@ void svcauth_unix_purge(void) static int svcauth_null_accept(struct svc_rqst *rqstp, u32 *authp, int proc) { - struct svc_buf *argp = &rqstp->rq_argbuf; - struct svc_buf *resp = &rqstp->rq_resbuf; + struct iovec *argv = &rqstp->rq_arg.head[0]; + struct iovec *resv = &rqstp->rq_res.head[0]; int rv=0; struct ip_map key, *ipm; - if ((argp->len -= 3) < 0) { + if (argv->iov_len < 3*4) return SVC_GARBAGE; - } - if (*(argp->buf)++ != 0) { /* we already skipped the flavor */ + + if (svc_getu32(argv) != 0) { dprintk("svc: bad null cred\n"); *authp = rpc_autherr_badcred; return SVC_DENIED; } - if (*(argp->buf)++ != RPC_AUTH_NULL || *(argp->buf)++ != 0) { + if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) { dprintk("svc: bad null verf\n"); *authp = rpc_autherr_badverf; return SVC_DENIED; @@ -320,8 +320,8 @@ svcauth_null_accept(struct svc_rqst *rqstp, u32 *authp, int proc) rqstp->rq_cred.cr_groups[0] = NOGROUP; /* Put NULL verifier */ - svc_putu32(resp, RPC_AUTH_NULL); - svc_putu32(resp, 0); + svc_putu32(resv, RPC_AUTH_NULL); + svc_putu32(resv, 0); key.m_class = rqstp->rq_server->sv_program->pg_class; key.m_addr = rqstp->rq_addr.sin_addr; @@ -376,55 +376,54 @@ struct auth_ops svcauth_null = { int svcauth_unix_accept(struct svc_rqst *rqstp, u32 *authp, int proc) { - struct svc_buf *argp = &rqstp->rq_argbuf; - struct svc_buf *resp = &rqstp->rq_resbuf; + struct iovec *argv = &rqstp->rq_arg.head[0]; + struct iovec *resv = &rqstp->rq_res.head[0]; struct svc_cred *cred = &rqstp->rq_cred; - u32 *bufp = argp->buf, slen, i; - int len = argp->len; + u32 slen, i; + int len = argv->iov_len; int rv=0; struct ip_map key, *ipm; - if ((len -= 3) < 0) + if ((len -= 3*4) < 0) return SVC_GARBAGE; - bufp++; /* length */ - bufp++; /* time stamp */ - slen = XDR_QUADLEN(ntohl(*bufp++)); /* machname length */ - if (slen > 64 || (len -= slen + 3) < 0) + 
svc_getu32(argv); /* length */ + svc_getu32(argv); /* time stamp */ + slen = XDR_QUADLEN(ntohl(svc_getu32(argv))); /* machname length */ + if (slen > 64 || (len -= (slen + 3)*4) < 0) goto badcred; - bufp += slen; /* skip machname */ - - cred->cr_uid = ntohl(*bufp++); /* uid */ - cred->cr_gid = ntohl(*bufp++); /* gid */ + argv->iov_base = (void*)((u32*)argv->iov_base + slen); /* skip machname */ + argv->iov_len -= slen*4; - slen = ntohl(*bufp++); /* gids length */ - if (slen > 16 || (len -= slen + 2) < 0) + cred->cr_uid = ntohl(svc_getu32(argv)); /* uid */ + cred->cr_gid = ntohl(svc_getu32(argv)); /* gid */ + slen = ntohl(svc_getu32(argv)); /* gids length */ + if (slen > 16 || (len -= (slen + 2)*4) < 0) goto badcred; - for (i = 0; i < NGROUPS && i < slen; i++) - cred->cr_groups[i] = ntohl(*bufp++); + for (i = 0; i < slen; i++) + if (i < NGROUPS) + cred->cr_groups[i] = ntohl(svc_getu32(argv)); + else + svc_getu32(argv); if (i < NGROUPS) cred->cr_groups[i] = NOGROUP; - bufp += (slen - i); - if (*bufp++ != RPC_AUTH_NULL || *bufp++ != 0) { + if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) { *authp = rpc_autherr_badverf; return SVC_DENIED; } - argp->buf = bufp; - argp->len = len; - /* Put NULL verifier */ - svc_putu32(resp, RPC_AUTH_NULL); - svc_putu32(resp, 0); + svc_putu32(resv, RPC_AUTH_NULL); + svc_putu32(resv, 0); key.m_class = rqstp->rq_server->sv_program->pg_class; key.m_addr = rqstp->rq_addr.sin_addr; + ipm = ip_map_lookup(&key, 0); rqstp->rq_client = NULL; - if (ipm) switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { case -EAGAIN: diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 6c2b97c5d18d..4894ce957549 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -234,7 +234,7 @@ svc_sock_received(struct svc_sock *svsk) */ void svc_reserve(struct svc_rqst *rqstp, int space) { - space += rqstp->rq_resbuf.len<<2; + space += rqstp->rq_res.head[0].iov_len; if (space < rqstp->rq_reserved) { struct svc_sock *svsk = rqstp->rq_sock; @@ -278,13 +278,12 @@ svc_sock_release(struct svc_rqst *rqstp) * But first, check that enough space was reserved * for the reply, otherwise we have a bug! */ - if ((rqstp->rq_resbuf.len<<2) > rqstp->rq_reserved) + if ((rqstp->rq_res.len) > rqstp->rq_reserved) printk(KERN_ERR "RPC request reserved %d but used %d\n", rqstp->rq_reserved, - rqstp->rq_resbuf.len<<2); + rqstp->rq_res.len); - rqstp->rq_resbuf.buf = rqstp->rq_resbuf.base; - rqstp->rq_resbuf.len = 0; + rqstp->rq_res.head[0].iov_len = 0; svc_reserve(rqstp, 0); rqstp->rq_sock = NULL; @@ -348,8 +347,9 @@ svc_sendto(struct svc_rqst *rqstp, struct iovec *iov, int nr) len = sock_sendmsg(sock, &msg, buflen); set_fs(oldfs); - dprintk("svc: socket %p sendto([%p %Zu... ], %d, %d) = %d\n", - rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, nr, buflen, len); + dprintk("svc: socket %p sendto([%p %Zu... ], %d, %d) = %d (addr %x)\n", + rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, nr, buflen, len, + rqstp->rq_addr.sin_addr.s_addr); return len; } @@ -480,13 +480,15 @@ svc_write_space(struct sock *sk) /* * Receive a datagram from a UDP socket. 
*/ +extern int +csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb); + static int svc_udp_recvfrom(struct svc_rqst *rqstp) { struct svc_sock *svsk = rqstp->rq_sock; struct svc_serv *serv = svsk->sk_server; struct sk_buff *skb; - u32 *data; int err, len; if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) @@ -512,33 +514,19 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) } set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ - /* Sorry. */ - if (skb_is_nonlinear(skb)) { - if (skb_linearize(skb, GFP_KERNEL) != 0) { - kfree_skb(skb); - svc_sock_received(svsk); - return 0; - } - } + len = skb->len - sizeof(struct udphdr); - if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { - skb_free_datagram(svsk->sk_sk, skb); - svc_sock_received(svsk); - return 0; - } + if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { + /* checksum error */ + skb_free_datagram(svsk->sk_sk, skb); + svc_sock_received(svsk); + return 0; } - len = skb->len - sizeof(struct udphdr); - data = (u32 *) (skb->data + sizeof(struct udphdr)); - - rqstp->rq_skbuff = skb; - rqstp->rq_argbuf.base = data; - rqstp->rq_argbuf.buf = data; - rqstp->rq_argbuf.len = (len >> 2); - rqstp->rq_argbuf.buflen = (len >> 2); - /* rqstp->rq_resbuf = rqstp->rq_defbuf; */ + rqstp->rq_arg.len = len; + rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; + rqstp->rq_argused += (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE; rqstp->rq_prot = IPPROTO_UDP; /* Get sender address */ @@ -546,6 +534,8 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_addr.sin_port = skb->h.uh->source; rqstp->rq_addr.sin_addr.s_addr = skb->nh.iph->saddr; + skb_free_datagram(svsk->sk_sk, skb); + if (serv->sv_stats) serv->sv_stats->netudpcnt++; @@ -559,21 +549,36 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) static int svc_udp_sendto(struct svc_rqst *rqstp) { - struct svc_buf *bufp = &rqstp->rq_resbuf; int error; + struct iovec vec[RPCSVC_MAXPAGES]; + int v; + int base, len; /* Set up the first element of the reply iovec. * Any other iovecs that may be in use have been taken * care of by the server implementation itself. */ - /* bufp->base = bufp->area; */ - bufp->iov[0].iov_base = bufp->base; - bufp->iov[0].iov_len = bufp->len << 2; - - error = svc_sendto(rqstp, bufp->iov, bufp->nriov); + vec[0] = rqstp->rq_res.head[0]; + v=1; + base=rqstp->rq_res.page_base; + len = rqstp->rq_res.page_len; + while (len) { + vec[v].iov_base = page_address(rqstp->rq_res.pages[v-1]) + base; + vec[v].iov_len = PAGE_SIZE-base; + if (len <= vec[v].iov_len) + vec[v].iov_len = len; + len -= vec[v].iov_len; + base = 0; + v++; + } + if (rqstp->rq_res.tail[0].iov_len) { + vec[v] = rqstp->rq_res.tail[0]; + v++; + } + error = svc_sendto(rqstp, vec, v); if (error == -ECONNREFUSED) /* ICMP error on earlier request. 
*/ - error = svc_sendto(rqstp, bufp->iov, bufp->nriov); + error = svc_sendto(rqstp, vec, v); return error; } @@ -785,8 +790,9 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) { struct svc_sock *svsk = rqstp->rq_sock; struct svc_serv *serv = svsk->sk_server; - struct svc_buf *bufp = &rqstp->rq_argbuf; int len; + struct iovec vec[RPCSVC_MAXPAGES]; + int pnum, vlen; dprintk("svc: tcp_recv %p data %d conn %d close %d\n", svsk, test_bit(SK_DATA, &svsk->sk_flags), @@ -851,7 +857,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) } svsk->sk_reclen &= 0x7fffffff; dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen); - if (svsk->sk_reclen > (bufp->buflen<<2)) { + if (svsk->sk_reclen > serv->sv_bufsz) { printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (large)\n", (unsigned long) svsk->sk_reclen); goto err_delete; @@ -869,30 +875,35 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) svc_sock_received(svsk); return -EAGAIN; /* record not complete */ } + len = svsk->sk_reclen; set_bit(SK_DATA, &svsk->sk_flags); - /* Frob argbuf */ - bufp->iov[0].iov_base += 4; - bufp->iov[0].iov_len -= 4; + vec[0] = rqstp->rq_arg.head[0]; + vlen = PAGE_SIZE; + pnum = 1; + while (vlen < len) { + vec[pnum].iov_base = page_address(rqstp->rq_argpages[rqstp->rq_argused++]); + vec[pnum].iov_len = PAGE_SIZE; + pnum++; + vlen += PAGE_SIZE; + } /* Now receive data */ - len = svc_recvfrom(rqstp, bufp->iov, bufp->nriov, svsk->sk_reclen); + len = svc_recvfrom(rqstp, vec, pnum, len); if (len < 0) goto error; dprintk("svc: TCP complete record (%d bytes)\n", len); - - /* Position reply write pointer immediately after args, - * allowing for record length */ - rqstp->rq_resbuf.base = rqstp->rq_argbuf.base + 1 + (len>>2); - rqstp->rq_resbuf.buf = rqstp->rq_resbuf.base + 1; - rqstp->rq_resbuf.len = 1; - rqstp->rq_resbuf.buflen= rqstp->rq_argbuf.buflen - (len>>2) - 1; + rqstp->rq_arg.len = len; + rqstp->rq_arg.page_base = 0; + if (len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = len; + rqstp->rq_arg.page_len = 0; + } else { + rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; + } rqstp->rq_skbuff = 0; - rqstp->rq_argbuf.buf += 1; - rqstp->rq_argbuf.len = (len >> 2); - rqstp->rq_argbuf.buflen = (len >> 2) +1; rqstp->rq_prot = IPPROTO_TCP; /* Reset TCP read info */ @@ -928,23 +939,44 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) static int svc_tcp_sendto(struct svc_rqst *rqstp) { - struct svc_buf *bufp = &rqstp->rq_resbuf; + struct xdr_buf *xbufp = &rqstp->rq_res; + struct iovec vec[RPCSVC_MAXPAGES]; + int v; + int base, len; int sent; + u32 reclen; /* Set up the first element of the reply iovec. * Any other iovecs that may be in use have been taken * care of by the server implementation itself. 
*/ - bufp->iov[0].iov_base = bufp->base; - bufp->iov[0].iov_len = bufp->len << 2; - bufp->base[0] = htonl(0x80000000|((bufp->len << 2) - 4)); + reclen = htonl(0x80000000|((xbufp->len ) - 4)); + memcpy(xbufp->head[0].iov_base, &reclen, 4); + + vec[0] = rqstp->rq_res.head[0]; + v=1; + base= xbufp->page_base; + len = xbufp->page_len; + while (len) { + vec[v].iov_base = page_address(xbufp->pages[v-1]) + base; + vec[v].iov_len = PAGE_SIZE-base; + if (len <= vec[v].iov_len) + vec[v].iov_len = len; + len -= vec[v].iov_len; + base = 0; + v++; + } + if (xbufp->tail[0].iov_len) { + vec[v] = xbufp->tail[0]; + v++; + } - sent = svc_sendto(rqstp, bufp->iov, bufp->nriov); - if (sent != bufp->len<<2) { + sent = svc_sendto(rqstp, vec, v); + if (sent != xbufp->len) { printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", rqstp->rq_sock->sk_server->sv_name, (sent<0)?"got error":"sent only", - sent, bufp->len << 2); + sent, xbufp->len); svc_delete_socket(rqstp->rq_sock); sent = -EAGAIN; } @@ -1016,6 +1048,8 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) { struct svc_sock *svsk =NULL; int len; + int pages; + struct xdr_buf *arg; DECLARE_WAITQUEUE(wait, current); dprintk("svc: server %p waiting for data (to = %ld)\n", @@ -1031,9 +1065,35 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) rqstp); /* Initialize the buffers */ - rqstp->rq_argbuf = rqstp->rq_defbuf; - rqstp->rq_resbuf = rqstp->rq_defbuf; + /* first reclaim pages that were moved to response list */ + while (rqstp->rq_resused) + rqstp->rq_argpages[rqstp->rq_arghi++] = + rqstp->rq_respages[--rqstp->rq_resused]; + /* now allocate needed pages. If we get a failure, sleep briefly */ + pages = 2 + (serv->sv_bufsz + PAGE_SIZE -1) / PAGE_SIZE; + while (rqstp->rq_arghi < pages) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ/2); + current->state = TASK_RUNNING; + continue; + } + rqstp->rq_argpages[rqstp->rq_arghi++] = p; + } + /* Make arg->head point to first page and arg->pages point to rest */ + arg = &rqstp->rq_arg; + arg->head[0].iov_base = page_address(rqstp->rq_argpages[0]); + arg->head[0].iov_len = PAGE_SIZE; + rqstp->rq_argused = 1; + arg->pages = rqstp->rq_argpages + 1; + arg->page_base = 0; + /* save at least one page for response */ + arg->page_len = (pages-2)*PAGE_SIZE; + arg->len = (pages-1)*PAGE_SIZE; + arg->tail[0].iov_len = 0; + if (signalled()) return -EINTR; @@ -1109,12 +1169,6 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) rqstp->rq_userset = 0; rqstp->rq_chandle.defer = svc_defer; - svc_getu32(&rqstp->rq_argbuf, rqstp->rq_xid); - svc_putu32(&rqstp->rq_resbuf, rqstp->rq_xid); - - /* Assume that the reply consists of a single buffer. 
*/ - rqstp->rq_resbuf.nriov = 1; - if (serv->sv_stats) serv->sv_stats->netcnt++; return len; @@ -1354,23 +1408,25 @@ static struct cache_deferred_req * svc_defer(struct cache_req *req) { struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); - int size = sizeof(struct svc_deferred_req) + (rqstp->rq_argbuf.buflen << 2); + int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.head[0].iov_len); struct svc_deferred_req *dr; + if (rqstp->rq_arg.page_len) + return NULL; /* if more than a page, give up FIXME */ if (rqstp->rq_deferred) { dr = rqstp->rq_deferred; rqstp->rq_deferred = NULL; } else { /* FIXME maybe discard if size too large */ - dr = kmalloc(size<<2, GFP_KERNEL); + dr = kmalloc(size, GFP_KERNEL); if (dr == NULL) return NULL; dr->serv = rqstp->rq_server; dr->prot = rqstp->rq_prot; dr->addr = rqstp->rq_addr; - dr->argslen = rqstp->rq_argbuf.buflen; - memcpy(dr->args, rqstp->rq_argbuf.base, dr->argslen<<2); + dr->argslen = rqstp->rq_arg.head[0].iov_len >> 2; + memcpy(dr->args, rqstp->rq_arg.head[0].iov_base, dr->argslen<<2); } spin_lock(&rqstp->rq_server->sv_lock); rqstp->rq_sock->sk_inuse++; @@ -1388,10 +1444,10 @@ static int svc_deferred_recv(struct svc_rqst *rqstp) { struct svc_deferred_req *dr = rqstp->rq_deferred; - rqstp->rq_argbuf.base = dr->args; - rqstp->rq_argbuf.buf = dr->args; - rqstp->rq_argbuf.len = dr->argslen; - rqstp->rq_argbuf.buflen = dr->argslen; + rqstp->rq_arg.head[0].iov_base = dr->args; + rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; + rqstp->rq_arg.page_len = 0; + rqstp->rq_arg.len = dr->argslen<<2; rqstp->rq_prot = dr->prot; rqstp->rq_addr = dr->addr; return dr->argslen<<2; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 390d2b13543c..3fc0e22521ce 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -655,7 +655,7 @@ skb_read_and_csum_bits(skb_reader_t *desc, void *to, size_t len) * We have set things up such that we perform the checksum of the UDP * packet in parallel with the copies into the RPC client iovec. -DaveM */ -static int +int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) { skb_reader_t desc; -- cgit v1.2.3
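
The conversion above drops the flat struct svc_buf in favour of struct xdr_buf, whose definition lives outside this diff (include/linux/sunrpc/xdr.h). A minimal userspace sketch of the layout as the call sites use it -- the iovec stand-in, the field set, and the PAGE_SIZE value are reconstructions from usage in this patch, not copies of the kernel header:

#include <assert.h>
#include <stddef.h>

#define PAGE_SIZE 4096                  /* illustrative; the kernel's constant */

struct iov {                            /* stand-in for struct iovec */
	void	*iov_base;
	size_t	 iov_len;
};

struct xdr_buf {                        /* field names follow their use above */
	struct iov	head[1];        /* RPC header, kmalloc'd memory */
	struct iov	tail[1];        /* optional trailing bytes */
	void		**pages;        /* bulk data held in whole pages */
	unsigned int	page_base;      /* byte offset into pages[0] */
	unsigned int	page_len;       /* total bytes held in pages */
	unsigned int	len;            /* head + page_len + tail */
};

/* The invariant the send paths rely on. */
static void xdr_buf_check(const struct xdr_buf *buf)
{
	assert(buf->len ==
	       buf->head[0].iov_len + buf->page_len + buf->tail[0].iov_len);
}

svc_recv() establishes this shape when it builds rq_arg: head[0] covers the first page, pages points at the rest, page_len is (pages-2)*PAGE_SIZE with one page held back for the reply, and the tail starts empty.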
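
svc_getu32() and svc_putu32() replace the old bufp cursor arithmetic throughout svc_process() and the auth code, but their bodies are not part of this diff (they live in include/linux/sunrpc/svc.h). A userspace sketch of the semantics the call sites depend on -- get consumes a word from the front of an iovec, put appends one at its end -- with the bodies reconstructed from usage (note how err_bad_auth rewinds with resv->iov_len -= 4 before re-putting, and how statp is computed as iov_base + iov_len):

#include <stdint.h>
#include <stdio.h>
#include <sys/uio.h>

typedef uint32_t u32;

/* Consume the next 32-bit word from the front of the iovec. */
static u32 svc_getu32(struct iovec *iov)
{
	u32 *vp = iov->iov_base;
	u32 val = *vp++;

	iov->iov_base = vp;
	iov->iov_len -= sizeof(u32);
	return val;
}

/* Append a 32-bit word at the current end of the iovec. */
static void svc_putu32(struct iovec *iov, u32 val)
{
	u32 *vp = (u32 *)((char *)iov->iov_base + iov->iov_len);

	*vp = val;
	iov->iov_len += sizeof(u32);
}

int main(void)
{
	u32 words[4] = { 7, 8, 0, 0 };
	struct iovec arg = { words, 2 * sizeof(u32) };
	struct iovec res = { words + 2, 0 };

	/* Mirrors rq_xid handling in svc_process(): echo the XID. */
	svc_putu32(&res, svc_getu32(&arg));
	printf("xid=%u, reply len=%zu\n", words[2], res.iov_len);
	return 0;
}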
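
The per-request page pool works by moving pages between two lists rather than allocating per reply: take_page() flips a spare argument page onto the result list, and the loop at the top of svc_recv() flips result pages back before the next request. A sketch with plain pointers standing in for struct page *, and an illustrative RPCSVC_MAXPAGES (the real bound comes from the sunrpc headers):

#define RPCSVC_MAXPAGES 16              /* illustrative bound only */

struct pagelists {                      /* the rq_* fields added to svc_rqst */
	void	*argpages[RPCSVC_MAXPAGES];
	void	*respages[RPCSVC_MAXPAGES];
	short	 argused;               /* pages consumed for the argument */
	short	 arghi;                 /* pages still spare in argpages */
	short	 resused;               /* pages handed to the result */
};

/* Mirror of take_page(): claim one spare argument page for the reply. */
static int take_page(struct pagelists *rq)
{
	if (rq->arghi <= rq->argused)
		return -1;              /* -ENOMEM in the kernel */
	rq->respages[rq->resused++] = rq->argpages[--rq->arghi];
	return 0;
}

/* Mirror of the reclaim loop in svc_recv(): return reply pages to the
 * argument list so the pool stays at full size across requests. */
static void reclaim_pages(struct pagelists *rq)
{
	while (rq->resused)
		rq->argpages[rq->arghi++] = rq->respages[--rq->resused];
}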
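
Both send paths flatten the xdr_buf into an iovec array the same way: head first, then one iovec per data page (only the first may start mid-page, at page_base), then the tail if any. A self-contained transcription of that shared loop, with mapped addresses in place of the kernel's page_address(struct page *):

#include <stddef.h>
#include <sys/uio.h>

#define PAGE_SIZE 4096                  /* illustrative */

/* Build an iovec array for sendmsg(), following the loop shared by
 * svc_udp_sendto() and svc_tcp_sendto().  Returns the iovec count. */
static int xdr_to_iovec(struct iovec head, void **pages,
			size_t page_base, size_t page_len,
			struct iovec tail, struct iovec *vec)
{
	int v = 1;
	size_t base = page_base, len = page_len;

	vec[0] = head;
	while (len) {
		vec[v].iov_base = (char *)pages[v - 1] + base;
		vec[v].iov_len = PAGE_SIZE - base;
		if (len <= vec[v].iov_len)
			vec[v].iov_len = len;   /* last, possibly short, page */
		len -= vec[v].iov_len;
		base = 0;               /* only the first page starts mid-page */
		v++;
	}
	if (tail.iov_len)
		vec[v++] = tail;
	return v;
}

Duplicating this loop in both senders is the cost of keeping the reply in pages; a later consolidation into a helper would be the obvious follow-up.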
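
For TCP, svc_process() reserves four bytes at the head of the reply with svc_putu32(resv, 0), and svc_tcp_sendto() fills them in as the RPC record marker: high bit set for the final fragment, low 31 bits giving the record length excluding the marker itself -- hence the "- 4". The same computation, extracted:

#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

/* Stamp the RPC-over-TCP record marker into the reserved bytes at the
 * start of the reply head, exactly as svc_tcp_sendto() does above. */
static void svc_tcp_mark_record(void *head, uint32_t reply_len)
{
	uint32_t reclen = htonl(0x80000000u | (reply_len - 4));

	memcpy(head, &reclen, sizeof(reclen));
}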
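
svc_defer() now snapshots only the head iovec of the request and, as the FIXME notes, gives up on anything with page data. Lengths are stored in 32-bit quads (argslen = byte length >> 2), so every consumer converts back with argslen << 2. A trimmed sketch of that capture step, with a hypothetical defer() helper standing in for the svc_deferred_req plumbing:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef uint32_t u32;

struct deferred_req {                   /* trimmed svc_deferred_req sketch */
	size_t	argslen;                /* length in 32-bit quads, not bytes */
	u32	args[];                 /* copy of rq_arg.head[0] */
};

static struct deferred_req *defer(const void *head, size_t head_len,
				  size_t page_len)
{
	struct deferred_req *dr;

	if (page_len)
		return NULL;            /* "if more than a page, give up" */
	dr = malloc(sizeof(*dr) + head_len);
	if (!dr)
		return NULL;
	dr->argslen = head_len >> 2;    /* quad count, as in the patch */
	memcpy(dr->args, head, dr->argslen << 2);
	return dr;
}

svc_deferred_recv() is the inverse: it points rq_arg.head[0] at the saved quads and restores the byte length as argslen << 2.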