From 98eb235b7febbb2941e1b442b92fc5e23b0d7a83 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 Jul 2003 08:47:30 -0700 Subject: [PATCH] page unmapping debug From: Manfred Spraul Manfred's latest page unmapping debug patch. The patch adds support for a special debug mode to both the page and the slab allocator: unused pages are removed from the kernel linear mapping. This means that any access to freed memory now causes an immediate exception. Right now, read accesses remain totally unnoticed and write accesses may be caught by the slab poisoning, but usually far too late for a meaningful bug report. The implementation is based on a new arch-dependent function, kernel_map_pages(), that removes the pages from the linear mapping. It's right now only implemented for i386. Changelog: - Add kernel_map_pages() for i386, based on change_page_attr. If DEBUG_PAGEALLOC is not set, then the function is an empty stub. The stub is in <linux/mm.h>, i.e. it exists for all archs. - Make change_page_attr irq safe. Note that it's not fully irq safe due to the lack of the TLB flush IPI, but it's good enough for kernel_map_pages(). Another problem is that kernel_map_pages is not permitted to fail, thus PSE is disabled if DEBUG_PAGEALLOC is enabled. - Use kernel_map_pages for the page allocator. - Use kernel_map_pages for the slab allocator. I couldn't resist and added additional debugging support into mm/slab.c: * at kfree time, the complete backtrace of the kfree caller is stored in the freed object. * a ptrinfo() function that dumps all known data about a kernel virtual address: the pte value and, if it belongs to a slab cache, the cache name and additional info. * merging of common code: new helper functions obj_dbglen and obj_dbghdr for the conversion between the user-visible object pointers/lengths and the actual, internal addresses and length values. --- include/linux/mm.h | 8 ++++++++ include/linux/slab.h | 2 ++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 492bc8aeb053..4d183974fd36 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -609,5 +609,13 @@ extern struct page * follow_page(struct mm_struct *mm, unsigned long address, int write); extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); + +#ifndef CONFIG_DEBUG_PAGEALLOC +static inline void +kernel_map_pages(struct page *page, int numpages, int enable) +{ +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 843c8d638d29..9f8bccba4ad3 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -114,6 +114,8 @@ extern kmem_cache_t *signal_cachep; extern kmem_cache_t *sighand_cachep; extern kmem_cache_t *bio_cachep; +void ptrinfo(unsigned long addr); + #endif /* __KERNEL__ */ #endif /* _LINUX_SLAB_H */ -- cgit v1.2.3 From d4388840f41d71d1570326f77860431c7080f7ed Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 Jul 2003 08:47:43 -0700 Subject: [PATCH] NUMA memory reporting fix From: Dave Hansen The current NUMA meminfo code exports (via sysfs) pgdat->node_size as totalram. This variable is consistently used elsewhere to mean "the number of physical pages that this particular node spans". This is _not_ what we want to see from meminfo, which is: "how much actual memory does this node have?" The following patch removes pgdat->node_size, and replaces it with ->node_spanned_pages.
This is to avoid confusion with a new variable, node_present_pages, which is the _actual_ value that we want to export in meminfo. Most of the patch is a simple s/node_size/node_spanned_pages/. The node_size() macro is also removed, and replaced with new ones for node_{spanned,present}_pages() to avoid confusion. We were bitten by this problem in this bug: http://bugme.osdl.org/show_bug.cgi?id=818 Compiled and tested on NUMA-Q. --- arch/alpha/mm/numa.c | 4 ++-- arch/arm/mm/init.c | 4 ++-- arch/arm26/mm/init.c | 4 ++-- arch/i386/mm/pgtable.c | 2 +- arch/ia64/mm/init.c | 4 ++-- arch/ppc64/mm/init.c | 4 ++-- arch/ppc64/mm/numa.c | 16 ++++++++-------- arch/x86_64/mm/init.c | 2 +- arch/x86_64/mm/numa.c | 2 +- include/asm-alpha/mmzone.h | 3 +-- include/asm-i386/mmzone.h | 5 ++--- include/asm-mips64/mmzone.h | 2 +- include/asm-ppc64/mmzone.h | 1 - include/asm-x86_64/mmzone.h | 3 +-- include/linux/mmzone.h | 7 ++++++- mm/page_alloc.c | 7 ++++--- 16 files changed, 36 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index 5a7ad83d367c..3100bb87bd50 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -338,7 +338,7 @@ void __init mem_init(void) lmem_map = node_mem_map(nid); pfn = NODE_DATA(nid)->node_start_pfn; - for (i = 0; i < node_size(nid); i++, pfn++) + for (i = 0; i < node_spanned_pages(nid); i++, pfn++) if (page_is_ram(pfn) && PageReserved(lmem_map+i)) reservedpages++; } @@ -372,7 +372,7 @@ show_mem(void) printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for (nid = 0; nid < numnodes; nid++) { struct page * lmem_map = node_mem_map(nid); - i = node_size(nid); + i = node_spanned_pages(nid); while (i-- > 0) { total++; if (PageReserved(lmem_map+i)) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 2fbcb7f5766a..90dcf272009c 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -79,7 +79,7 @@ void show_mem(void) struct page *page, *end; page = NODE_MEM_MAP(node); - end = page + NODE_DATA(node)->node_size; + end = page + NODE_DATA(node)->node_spanned_pages; do { total++; @@ -576,7 +576,7 @@ void __init mem_init(void) for (node = 0; node < numnodes; node++) { pg_data_t *pgdat = NODE_DATA(node); - if (pgdat->node_size != 0) + if (pgdat->node_spanned_pages != 0) totalram_pages += free_all_bootmem_node(pgdat); } diff --git a/arch/arm26/mm/init.c b/arch/arm26/mm/init.c index c2105fb1a84c..01c772bef70e 100644 --- a/arch/arm26/mm/init.c +++ b/arch/arm26/mm/init.c @@ -68,7 +68,7 @@ void show_mem(void) page = NODE_MEM_MAP(0); - end = page + NODE_DATA(0)->node_size; + end = page + NODE_DATA(0)->node_spanned_pages; do { total++; @@ -353,7 +353,7 @@ void __init mem_init(void) max_mapnr = virt_to_page(high_memory) - mem_map; /* this will put all unused low memory onto the freelists */ - if (pgdat->node_size != 0) + if (pgdat->node_spanned_pages != 0) totalram_pages += free_all_bootmem_node(pgdat); printk(KERN_INFO "Memory:"); diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 7ab983c90c53..941c2aa5236c 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -34,7 +34,7 @@ void show_mem(void) show_free_areas(); printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - for (i = 0; i < pgdat->node_size; ++i) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat->node_mem_map + i; total++; if (PageHighMem(page)) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 8fc22262ce0f..4d37e437da3f 100644 --- a/arch/ia64/mm/init.c +++ 
b/arch/ia64/mm/init.c @@ -232,7 +232,7 @@ show_mem(void) printk("Free swap: %6dkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { printk("Node ID: %d\n", pgdat->node_id); - for(i = 0; i < pgdat->node_size; i++) { + for(i = 0; i < pgdat->node_spanned_pages; i++) { if (PageReserved(pgdat->node_mem_map+i)) reserved++; else if (PageSwapCache(pgdat->node_mem_map+i)) @@ -240,7 +240,7 @@ show_mem(void) else if (page_count(pgdat->node_mem_map + i)) shared += page_count(pgdat->node_mem_map + i) - 1; } - printk("\t%d pages of RAM\n", pgdat->node_size); + printk("\t%d pages of RAM\n", pgdat->node_spanned_pages); printk("\t%d reserved pages\n", reserved); printk("\t%d pages shared\n", shared); printk("\t%d pages swap cached\n", cached); diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index 79b716dbe6db..ca2472a9116a 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -109,7 +109,7 @@ void show_mem(void) show_free_areas(); printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - for (i = 0; i < pgdat->node_size; i++) { + for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat->node_mem_map + i; total++; if (PageReserved(page)) @@ -564,7 +564,7 @@ void __init mem_init(void) int nid; for (nid = 0; nid < numnodes; nid++) { - if (node_data[nid].node_size != 0) { + if (node_data[nid].node_spanned_pages != 0) { printk("freeing bootmem node %x\n", nid); totalram_pages += free_all_bootmem_node(NODE_DATA(nid)); diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c index fd86d7ec8267..19b4ee36ee8f 100644 --- a/arch/ppc64/mm/numa.c +++ b/arch/ppc64/mm/numa.c @@ -160,21 +160,21 @@ new_range: * this simple case and complain if there is a gap in * memory */ - if (node_data[numa_domain].node_size) { + if (node_data[numa_domain].node_spanned_pages) { unsigned long shouldstart = node_data[numa_domain].node_start_pfn + - node_data[numa_domain].node_size; + node_data[numa_domain].node_spanned_pages; if (shouldstart != (start / PAGE_SIZE)) { printk(KERN_ERR "Hole in node, disabling " "region start %lx length %lx\n", start, size); continue; } - node_data[numa_domain].node_size += size / PAGE_SIZE; + node_data[numa_domain].node_spanned_pages += size / PAGE_SIZE; } else { node_data[numa_domain].node_start_pfn = start / PAGE_SIZE; - node_data[numa_domain].node_size = size / PAGE_SIZE; + node_data[numa_domain].node_spanned_pages = size / PAGE_SIZE; } for (i = start ; i < (start+size); i += MEMORY_INCREMENT) @@ -202,7 +202,7 @@ void setup_nonnuma(void) map_cpu_to_node(i, 0); node_data[0].node_start_pfn = 0; - node_data[0].node_size = lmb_end_of_DRAM() / PAGE_SIZE; + node_data[0].node_spanned_pages = lmb_end_of_DRAM() / PAGE_SIZE; for (i = 0 ; i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT) numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0; @@ -224,12 +224,12 @@ void __init do_init_bootmem(void) unsigned long bootmem_paddr; unsigned long bootmap_pages; - if (node_data[nid].node_size == 0) + if (node_data[nid].node_spanned_pages == 0) continue; start_paddr = node_data[nid].node_start_pfn * PAGE_SIZE; end_paddr = start_paddr + - (node_data[nid].node_size * PAGE_SIZE); + (node_data[nid].node_spanned_pages * PAGE_SIZE); dbg("node %d\n", nid); dbg("start_paddr = %lx\n", start_paddr); @@ -311,7 +311,7 @@ void __init paging_init(void) unsigned long start_pfn; unsigned long end_pfn; - if (node_data[nid].node_size == 0) + if (node_data[nid].node_spanned_pages == 0) continue; start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT; diff --git 
a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 3be6a8e4b679..cafd352ba636 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -64,7 +64,7 @@ void show_mem(void) printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - for (i = 0; i < pgdat->node_size; ++i) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat->node_mem_map + i; total++; if (PageReserved(page)) diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 0150d11586a7..738ae097faeb 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -86,7 +86,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; NODE_DATA(nodeid)->node_start_pfn = start_pfn; - NODE_DATA(nodeid)->node_size = end_pfn - start_pfn; + NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; /* Find a place for the bootmem map */ bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h index 6edb9c64aa7b..36e3130c6696 100644 --- a/include/asm-alpha/mmzone.h +++ b/include/asm-alpha/mmzone.h @@ -31,7 +31,6 @@ extern pg_data_t node_data[]; #define pa_to_nid(pa) alpha_pa_to_nid(pa) #define NODE_DATA(nid) (&node_data[(nid)]) -#define node_size(nid) (NODE_DATA(nid)->node_size) #define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) @@ -124,7 +123,7 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n) #define pfn_to_nid(pfn) pa_to_nid(((u64)pfn << PAGE_SHIFT)) #define pfn_valid(pfn) \ (((pfn) - node_start_pfn(pfn_to_nid(pfn))) < \ - node_size(pfn_to_nid(pfn))) \ + node_spanned_pages(pfn_to_nid(pfn))) \ #define virt_addr_valid(kaddr) pfn_valid((__pa(kaddr) >> PAGE_SHIFT)) diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index b6138f07c309..d5da17912a8c 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h @@ -32,8 +32,7 @@ extern struct pglist_data *node_data[]; #define alloc_bootmem_low_pages_node(ignore, x) \ __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) -#define node_size(nid) (node_data[nid]->node_size) -#define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn) +#define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn) /* * Following are macros that each numa implmentation must define. 
@@ -54,7 +53,7 @@ extern struct pglist_data *node_data[]; #define node_end_pfn(nid) \ ({ \ pg_data_t *__pgdat = NODE_DATA(nid); \ - __pgdat->node_start_pfn + __pgdat->node_size; \ + __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ }) #define local_mapnr(kvaddr) \ diff --git a/include/asm-mips64/mmzone.h b/include/asm-mips64/mmzone.h index 25a42752b182..cba337b07be1 100644 --- a/include/asm-mips64/mmzone.h +++ b/include/asm-mips64/mmzone.h @@ -24,7 +24,7 @@ extern plat_pg_data_t *plat_node_data[]; #define PHYSADDR_TO_NID(pa) NASID_TO_COMPACT_NODEID(NASID_GET(pa)) #define PLAT_NODE_DATA(n) (plat_node_data[n]) -#define PLAT_NODE_DATA_SIZE(n) (PLAT_NODE_DATA(n)->gendata.node_size) +#define PLAT_NODE_DATA_SIZE(n) (PLAT_NODE_DATA(n)->gendata.node_spanned_pages) #define PLAT_NODE_DATA_LOCALNR(p, n) \ (((p) >> PAGE_SHIFT) - PLAT_NODE_DATA(n)->gendata.node_start_pfn) diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h index 8503e25b17b3..2e5136012845 100644 --- a/include/asm-ppc64/mmzone.h +++ b/include/asm-ppc64/mmzone.h @@ -54,7 +54,6 @@ static inline int pa_to_nid(unsigned long pa) */ #define NODE_DATA(nid) (&node_data[nid]) -#define node_size(nid) (NODE_DATA(nid)->node_size) #define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) /* diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h index c1a69000c8d7..398c530270c2 100644 --- a/include/asm-x86_64/mmzone.h +++ b/include/asm-x86_64/mmzone.h @@ -40,8 +40,7 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) #define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ - NODE_DATA(nid)->node_size) -#define node_size(nid) (NODE_DATA(nid)->node_size) + NODE_DATA(nid)->node_spanned_pages) #define local_mapnr(kvaddr) \ ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) ) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 21e95664fdf8..e768f7ab8963 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -184,12 +184,17 @@ typedef struct pglist_data { unsigned long *valid_addr_bitmap; struct bootmem_data *bdata; unsigned long node_start_pfn; - unsigned long node_size; + unsigned long node_present_pages; /* total number of physical pages */ + unsigned long node_spanned_pages; /* total size of physical page + range, including holes */ int node_id; struct pglist_data *pgdat_next; wait_queue_head_t kswapd_wait; } pg_data_t; +#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) +#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) + extern int numnodes; extern struct pglist_data *pgdat_list; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 741866b59d7d..8b2a02c0350a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -903,7 +903,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) { pg_data_t *pgdat = NODE_DATA(nid); - val->totalram = pgdat->node_size; + val->totalram = pgdat->node_present_pages; val->freeram = nr_free_pages_pgdat(pgdat); val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages; @@ -1138,12 +1138,13 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat, for (i = 0; i < MAX_NR_ZONES; i++) totalpages += zones_size[i]; - pgdat->node_size = totalpages; + pgdat->node_spanned_pages = totalpages; realtotalpages = totalpages; if (zholes_size) for (i = 0; i < MAX_NR_ZONES; i++) 
realtotalpages -= zholes_size[i]; + pgdat->node_present_pages = realtotalpages; printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); } @@ -1349,7 +1350,7 @@ void __init free_area_init_node(int nid, struct pglist_data *pgdat, pgdat->node_start_pfn = node_start_pfn; calculate_zone_totalpages(pgdat, zones_size, zholes_size); if (!node_mem_map) { - size = (pgdat->node_size + 1) * sizeof(struct page); + size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); node_mem_map = alloc_bootmem_node(pgdat, size); } pgdat->node_mem_map = node_mem_map; -- cgit v1.2.3 From cee396e281fc2c8a55261eea1a89a594e98f3e0f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 Jul 2003 08:49:26 -0700 Subject: [PATCH] cleanup and generalise lowmem_page_address From: William Lee Irwin III This patch allows architectures to micro-optimize lowmem_page_address() at their whims. Roman Zippel originally wrote and/or suggested this back when dependencies on page->virtual existing were being shaken out. That's long-settled, so it's fine to do this now. --- include/linux/mm.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4d183974fd36..d75f64725853 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -339,9 +339,14 @@ static inline void set_page_zone(struct page *page, unsigned long zone_num) page->flags |= zone_num << ZONE_SHIFT; } -static inline void * lowmem_page_address(struct page *page) +#ifndef CONFIG_DISCONTIGMEM +/* The array of struct pages - for discontigmem use pgdat->lmem_map */ +extern struct page *mem_map; +#endif + +static inline void *lowmem_page_address(struct page *page) { - return __va( ( (page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn) << PAGE_SHIFT); + return __va(page_to_pfn(page) << PAGE_SHIFT); } #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) @@ -395,11 +400,6 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_MINOR 1 #define VM_FAULT_MAJOR 2 -#ifndef CONFIG_DISCONTIGMEM -/* The array of struct pages - for discontigmem use pgdat->lmem_map */ -extern struct page *mem_map; -#endif - extern void show_free_areas(void); struct page *shmem_nopage(struct vm_area_struct * vma, -- cgit v1.2.3 From bc75ac4f1dcec256a65b531e2d5be84f5b0fe6bc Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 Jul 2003 08:49:35 -0700 Subject: [PATCH] Security hook for vm_enough_memory From: Stephen Smalley This patch against 2.5.73 replaces vm_enough_memory with a security hook per Alan Cox's suggestion so that security modules can completely replace the logic if desired. Note that the patch changes the interface to follow the convention of the other security hooks, i.e. return 0 if ok or -errno on failure (-ENOMEM in this case) rather than returning a boolean. It also exports various variables and functions required for the vm_enough_memory logic. 
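To make the inverted return convention concrete, here is a minimal caller-side sketch; charge_pages() is a hypothetical function invented for illustration, but the conversion pattern is exactly the one applied throughout the diff below:

	/*
	 * Hypothetical caller, showing the flipped polarity.
	 * Old interface: vm_enough_memory() returned 1 (enough) or 0 (not enough).
	 * New interface: security_vm_enough_memory() returns 0 on success or
	 * -ENOMEM on failure, like the other security hooks.
	 */
	static int charge_pages(unsigned long len)
	{
		long pages = len >> PAGE_SHIFT;

		/* was: if (!vm_enough_memory(pages)) return -ENOMEM; */
		if (security_vm_enough_memory(pages))
			return -ENOMEM;	/* charge was not taken */

		return 0;	/* pages are now accounted in vm_committed_space */
	}

One call site tests the opposite way: sys_swapoff() wants to know whether the swap pages can be re-charged before un-accounting them, so if (vm_enough_memory(p->pages)) becomes if (!security_vm_enough_memory(p->pages)) there.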
--- arch/ia64/ia32/binfmt_elf32.c | 3 +- arch/ia64/kernel/sys_ia64.c | 1 - arch/mips/kernel/sysirix.c | 5 ++- arch/s390/kernel/compat_exec.c | 3 +- arch/x86_64/ia32/ia32_binfmt.c | 4 ++- fs/exec.c | 2 +- include/linux/mman.h | 3 +- include/linux/security.h | 16 ++++++++++ include/linux/slab.h | 2 ++ kernel/fork.c | 2 +- mm/mmap.c | 71 +++++------------------------------------- mm/mprotect.c | 2 +- mm/mremap.c | 3 +- mm/page_alloc.c | 5 +++ mm/shmem.c | 9 +++--- mm/slab.c | 2 ++ mm/swap.c | 2 ++ mm/swapfile.c | 6 +++- security/capability.c | 65 ++++++++++++++++++++++++++++++++++++++ security/dummy.c | 52 +++++++++++++++++++++++++++++++ 20 files changed, 178 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/ia32/binfmt_elf32.c b/arch/ia64/ia32/binfmt_elf32.c index dcc4982c2c66..8b2a41592746 100644 --- a/arch/ia64/ia32/binfmt_elf32.c +++ b/arch/ia64/ia32/binfmt_elf32.c @@ -13,6 +13,7 @@ #include #include +#include #include #include @@ -177,7 +178,7 @@ ia32_setup_arg_pages (struct linux_binprm *bprm) if (!mpnt) return -ENOMEM; - if (!vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { + if (security_vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { kmem_cache_free(vm_area_cachep, mpnt); return -ENOMEM; } diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index 6de52294bd80..6d94e8b1a0ad 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -100,7 +100,6 @@ ia64_shmat (int shmid, void *shmaddr, int shmflg) asmlinkage unsigned long ia64_brk (unsigned long brk) { - extern int vm_enough_memory (long pages); unsigned long rlim, retval, newbrk, oldbrk; struct mm_struct *mm = current->mm; diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c index 5722c28c1e9d..fdcc9d5bd057 100644 --- a/arch/mips/kernel/sysirix.c +++ b/arch/mips/kernel/sysirix.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -527,8 +528,6 @@ asmlinkage int irix_gtime(struct pt_regs *regs) return get_seconds(); } -int vm_enough_memory(long pages); - /* * IRIX is completely broken... it returns 0 on success, otherwise * ENOMEM. @@ -585,7 +584,7 @@ asmlinkage int irix_brk(unsigned long brk) /* * Check if we have enough memory.. 
*/ - if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT)) { + if (security_vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT)) { ret = -ENOMEM; goto out; } diff --git a/arch/s390/kernel/compat_exec.c b/arch/s390/kernel/compat_exec.c index 74245a64e514..33832846833f 100644 --- a/arch/s390/kernel/compat_exec.c +++ b/arch/s390/kernel/compat_exec.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -55,7 +56,7 @@ int setup_arg_pages32(struct linux_binprm *bprm) if (!mpnt) return -ENOMEM; - if (!vm_enough_memory((STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { + if (security_vm_enough_memory((STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { kmem_cache_free(vm_area_cachep, mpnt); return -ENOMEM; } diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c index 496dfa9da747..e4b86e6cbf9b 100644 --- a/arch/x86_64/ia32/ia32_binfmt.c +++ b/arch/x86_64/ia32/ia32_binfmt.c @@ -14,6 +14,8 @@ #include #include #include +#include + #include #include #include @@ -339,7 +341,7 @@ int setup_arg_pages(struct linux_binprm *bprm) if (!mpnt) return -ENOMEM; - if (!vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { + if (security_vm_enough_memory((IA32_STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { kmem_cache_free(vm_area_cachep, mpnt); return -ENOMEM; } diff --git a/fs/exec.c b/fs/exec.c index 204d0a3a1565..f91b25952248 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -392,7 +392,7 @@ int setup_arg_pages(struct linux_binprm *bprm) if (!mpnt) return -ENOMEM; - if (!vm_enough_memory((STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { + if (security_vm_enough_memory((STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p))>>PAGE_SHIFT)) { kmem_cache_free(vm_area_cachep, mpnt); return -ENOMEM; } diff --git a/include/linux/mman.h b/include/linux/mman.h index 474d1c046436..a8956f6588ad 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -9,7 +9,8 @@ #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 -extern int vm_enough_memory(long pages); +extern int sysctl_overcommit_memory; +extern int sysctl_overcommit_ratio; extern atomic_t vm_committed_space; #ifdef CONFIG_SMP diff --git a/include/linux/security.h b/include/linux/security.h index 9589f99c3ef3..4d91dfc52c52 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -49,6 +49,7 @@ extern int cap_bprm_secureexec(struct linux_binprm *bprm); extern int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid, int flags); extern void cap_task_reparent_to_init (struct task_struct *p); extern int cap_syslog (int type); +extern int cap_vm_enough_memory (long pages); static inline int cap_netlink_send (struct sk_buff *skb) { @@ -958,6 +959,10 @@ struct swap_info_struct; * See the syslog(2) manual page for an explanation of the @type values. * @type contains the type of action. * Return 0 if permission is granted. + * @vm_enough_memory: + * Check permissions for allocating a new virtual mapping. + * @pages contains the number of pages. + * Return 0 if permission is granted. * * @register_security: * allow module stacking. 
@@ -989,6 +994,7 @@ struct security_operations { int (*quotactl) (int cmds, int type, int id, struct super_block * sb); int (*quota_on) (struct file * f); int (*syslog) (int type); + int (*vm_enough_memory) (long pages); int (*bprm_alloc_security) (struct linux_binprm * bprm); void (*bprm_free_security) (struct linux_binprm * bprm); @@ -1238,6 +1244,11 @@ static inline int security_syslog(int type) return security_ops->syslog(type); } +static inline int security_vm_enough_memory(long pages) +{ + return security_ops->vm_enough_memory(pages); +} + static inline int security_bprm_alloc (struct linux_binprm *bprm) { return security_ops->bprm_alloc_security (bprm); @@ -1898,6 +1909,11 @@ static inline int security_syslog(int type) return cap_syslog(type); } +static inline int security_vm_enough_memory(long pages) +{ + return cap_vm_enough_memory(pages); +} + static inline int security_bprm_alloc (struct linux_binprm *bprm) { return 0; diff --git a/include/linux/slab.h b/include/linux/slab.h index 9f8bccba4ad3..d797c981f37e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,6 +116,8 @@ extern kmem_cache_t *bio_cachep; void ptrinfo(unsigned long addr); +extern atomic_t slab_reclaim_pages; + #endif /* __KERNEL__ */ #endif /* _LINUX_SLAB_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 2abbc9c2da23..c17e05614c88 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -286,7 +286,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) continue; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; - if (!vm_enough_memory(len)) + if (security_vm_enough_memory(len)) goto fail_nomem; charge += len; } diff --git a/mm/mmap.c b/mm/mmap.c index c83cf2a8b126..1052f84a82a2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -53,65 +54,9 @@ int sysctl_overcommit_memory = 0; /* default is heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ atomic_t vm_committed_space = ATOMIC_INIT(0); -/* - * Check that a process has enough memory to allocate a new virtual - * mapping. 1 means there is enough memory for the allocation to - * succeed and 0 implies there is not. - * - * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-acounting - * - * Strict overcommit modes added 2002 Feb 26 by Alan Cox. - * Additional code 2002 Jul 20 by Robert Love. - */ -extern atomic_t slab_reclaim_pages; -int vm_enough_memory(long pages) -{ - unsigned long free, allowed; - - vm_acct_memory(pages); - - /* - * Sometimes we want to use more memory than we have - */ - if (sysctl_overcommit_memory == 1) - return 1; - - if (sysctl_overcommit_memory == 0) { - free = get_page_cache_size(); - free += nr_free_pages(); - free += nr_swap_pages; - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. 
The dentry - * cache and most inode caches should fall into this - */ - free += atomic_read(&slab_reclaim_pages); - - /* - * Leave the last 3% for root - */ - if (!capable(CAP_SYS_ADMIN)) - free -= free / 32; - - if (free > pages) - return 1; - vm_unacct_memory(pages); - return 0; - } - - allowed = totalram_pages * sysctl_overcommit_ratio / 100; - allowed += total_swap_pages; - - if (atomic_read(&vm_committed_space) < allowed) - return 1; - - vm_unacct_memory(pages); - - return 0; -} +EXPORT_SYMBOL(sysctl_overcommit_memory); +EXPORT_SYMBOL(sysctl_overcommit_ratio); +EXPORT_SYMBOL(vm_committed_space); /* * Requires inode->i_mapping->i_shared_sem @@ -646,7 +591,7 @@ munmap_back: * Private writable mapping: check memory availability */ charged = len >> PAGE_SHIFT; - if (!vm_enough_memory(charged)) + if (security_vm_enough_memory(charged)) return -ENOMEM; vm_flags |= VM_ACCOUNT; } @@ -950,7 +895,7 @@ int expand_stack(struct vm_area_struct * vma, unsigned long address) grow = (address - vma->vm_end) >> PAGE_SHIFT; /* Overcommit.. */ - if (!vm_enough_memory(grow)) { + if (security_vm_enough_memory(grow)) { spin_unlock(&vma->vm_mm->page_table_lock); return -ENOMEM; } @@ -1004,7 +949,7 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) grow = (vma->vm_start - address) >> PAGE_SHIFT; /* Overcommit.. */ - if (!vm_enough_memory(grow)) { + if (security_vm_enough_memory(grow)) { spin_unlock(&vma->vm_mm->page_table_lock); return -ENOMEM; } @@ -1376,7 +1321,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (mm->map_count > MAX_MAP_COUNT) return -ENOMEM; - if (!vm_enough_memory(len >> PAGE_SHIFT)) + if (security_vm_enough_memory(len >> PAGE_SHIFT)) return -ENOMEM; flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; diff --git a/mm/mprotect.c b/mm/mprotect.c index 978a9509c350..2c015794e3c1 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -175,7 +175,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, if (newflags & VM_WRITE) { if (!(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { charged = (end - start) >> PAGE_SHIFT; - if (!vm_enough_memory(charged)) + if (security_vm_enough_memory(charged)) return -ENOMEM; newflags |= VM_ACCOUNT; } diff --git a/mm/mremap.c b/mm/mremap.c index 3bab43a88125..088af945ac5e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -385,7 +386,7 @@ unsigned long do_mremap(unsigned long addr, if (vma->vm_flags & VM_ACCOUNT) { charged = (new_len - old_len) >> PAGE_SHIFT; - if (!vm_enough_memory(charged)) + if (security_vm_enough_memory(charged)) goto out_nc; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8b2a02c0350a..16077203e5a4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -43,6 +43,9 @@ int nr_swap_pages; int numnodes = 1; int sysctl_lower_zone_protection = 0; +EXPORT_SYMBOL(totalram_pages); +EXPORT_SYMBOL(nr_swap_pages); + /* * Used by page_zone() to look up the address of the struct zone whose * id is encoded in the upper bits of page->flags @@ -733,6 +736,7 @@ unsigned int nr_free_pages(void) return sum; } +EXPORT_SYMBOL(nr_free_pages); unsigned int nr_used_zone_pages(void) { @@ -825,6 +829,7 @@ DEFINE_PER_CPU(struct page_state, page_states) = {0}; EXPORT_PER_CPU_SYMBOL(page_states); atomic_t nr_pagecache = ATOMIC_INIT(0); +EXPORT_SYMBOL(nr_pagecache); #ifdef CONFIG_SMP DEFINE_PER_CPU(long, nr_pagecache_local) = 0; #endif diff --git a/mm/shmem.c b/mm/shmem.c index 73301cee3f41..1f4ed8fece45 100644 --- a/mm/shmem.c +++ 
b/mm/shmem.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -507,7 +508,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) */ change = VM_ACCT(attr->ia_size) - VM_ACCT(inode->i_size); if (change > 0) { - if (!vm_enough_memory(change)) + if (security_vm_enough_memory(change)) return -ENOMEM; } else if (attr->ia_size < inode->i_size) { vm_unacct_memory(-change); @@ -1139,7 +1140,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t maxpos = inode->i_size; if (maxpos < pos + count) { maxpos = pos + count; - if (!vm_enough_memory(VM_ACCT(maxpos) - VM_ACCT(inode->i_size))) { + if (security_vm_enough_memory(VM_ACCT(maxpos) - VM_ACCT(inode->i_size))) { err = -ENOMEM; goto out; } @@ -1493,7 +1494,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s memcpy(info, symname, len); inode->i_op = &shmem_symlink_inline_operations; } else { - if (!vm_enough_memory(VM_ACCT(1))) { + if (security_vm_enough_memory(VM_ACCT(1))) { iput(inode); return -ENOMEM; } @@ -1887,7 +1888,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) if (size > SHMEM_MAX_BYTES) return ERR_PTR(-EINVAL); - if ((flags & VM_ACCOUNT) && !vm_enough_memory(VM_ACCT(size))) + if ((flags & VM_ACCOUNT) && security_vm_enough_memory(VM_ACCT(size))) return ERR_PTR(-ENOMEM); error = -ENOMEM; diff --git a/mm/slab.c b/mm/slab.c index afb8d8415999..e05fcba90af2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -90,6 +90,7 @@ #include #include #include +#include #include #include @@ -462,6 +463,7 @@ struct list_head cache_chain; * SLAB_RECLAIM_ACCOUNT turns this on per-slab */ atomic_t slab_reclaim_pages; +EXPORT_SYMBOL(slab_reclaim_pages); /* * chicken and egg problem: delay the per-cpu array allocation diff --git a/mm/swap.c b/mm/swap.c index 5818b0a5a72d..37302961e371 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include /* for try_to_release_page() */ #include @@ -370,6 +371,7 @@ void vm_acct_memory(long pages) } preempt_enable(); } +EXPORT_SYMBOL(vm_acct_memory); #endif diff --git a/mm/swapfile.c b/mm/swapfile.c index bdfd09be8d4c..bc31505b689f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -20,7 +20,9 @@ #include #include #include +#include #include +#include #include #include @@ -30,6 +32,8 @@ unsigned int nr_swapfiles; int total_swap_pages; static int swap_overflow; +EXPORT_SYMBOL(total_swap_pages); + static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; @@ -1042,7 +1046,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) swap_list_unlock(); goto out_dput; } - if (vm_enough_memory(p->pages)) + if (!security_vm_enough_memory(p->pages)) vm_unacct_memory(p->pages); else { err = -ENOMEM; diff --git a/security/capability.c b/security/capability.c index e01bc5271c36..cff54dd440fc 100644 --- a/security/capability.c +++ b/security/capability.c @@ -15,6 +15,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -275,6 +278,65 @@ int cap_syslog (int type) return 0; } +/* + * Check that a process has enough memory to allocate a new virtual + * mapping. 0 means there is enough memory for the allocation to + * succeed and -ENOMEM implies there is not. + * + * We currently support three overcommit policies, which are set via the + * vm.overcommit_memory sysctl. 
See Documentation/vm/overcommit-acounting + * + * Strict overcommit modes added 2002 Feb 26 by Alan Cox. + * Additional code 2002 Jul 20 by Robert Love. + */ +int cap_vm_enough_memory(long pages) +{ + unsigned long free, allowed; + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == 1) + return 0; + + if (sysctl_overcommit_memory == 0) { + free = get_page_cache_size(); + free += nr_free_pages(); + free += nr_swap_pages; + + /* + * Any slabs which are created with the + * SLAB_RECLAIM_ACCOUNT flag claim to have contents + * which are reclaimable, under pressure. The dentry + * cache and most inode caches should fall into this + */ + free += atomic_read(&slab_reclaim_pages); + + /* + * Leave the last 3% for root + */ + if (!capable(CAP_SYS_ADMIN)) + free -= free / 32; + + if (free > pages) + return 0; + vm_unacct_memory(pages); + return -ENOMEM; + } + + allowed = totalram_pages * sysctl_overcommit_ratio / 100; + allowed += total_swap_pages; + + if (atomic_read(&vm_committed_space) < allowed) + return 0; + + vm_unacct_memory(pages); + + return -ENOMEM; +} + EXPORT_SYMBOL(cap_capable); EXPORT_SYMBOL(cap_ptrace); EXPORT_SYMBOL(cap_capget); @@ -286,6 +348,7 @@ EXPORT_SYMBOL(cap_bprm_secureexec); EXPORT_SYMBOL(cap_task_post_setuid); EXPORT_SYMBOL(cap_task_reparent_to_init); EXPORT_SYMBOL(cap_syslog); +EXPORT_SYMBOL(cap_vm_enough_memory); #ifdef CONFIG_SECURITY @@ -307,6 +370,8 @@ static struct security_operations capability_ops = { .task_reparent_to_init = cap_task_reparent_to_init, .syslog = cap_syslog, + + .vm_enough_memory = cap_vm_enough_memory, }; #if defined(CONFIG_SECURITY_CAPABILITIES_MODULE) diff --git a/security/dummy.c b/security/dummy.c index a4307e78a168..76c6560a76c2 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -17,6 +17,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -97,6 +100,54 @@ static int dummy_syslog (int type) return 0; } +static int dummy_vm_enough_memory(long pages) +{ + unsigned long free, allowed; + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == 1) + return 0; + + if (sysctl_overcommit_memory == 0) { + free = get_page_cache_size(); + free += nr_free_pages(); + free += nr_swap_pages; + + /* + * Any slabs which are created with the + * SLAB_RECLAIM_ACCOUNT flag claim to have contents + * which are reclaimable, under pressure. 
The dentry + * cache and most inode caches should fall into this + */ + free += atomic_read(&slab_reclaim_pages); + + /* + * Leave the last 3% for root + */ + if (current->euid) + free -= free / 32; + + if (free > pages) + return 0; + vm_unacct_memory(pages); + return -ENOMEM; + } + + allowed = totalram_pages * sysctl_overcommit_ratio / 100; + allowed += total_swap_pages; + + if (atomic_read(&vm_committed_space) < allowed) + return 0; + + vm_unacct_memory(pages); + + return -ENOMEM; +} + static int dummy_bprm_alloc_security (struct linux_binprm *bprm) { return 0; @@ -793,6 +844,7 @@ void security_fixup_ops (struct security_operations *ops) set_to_dummy_if_null(ops, quota_on); set_to_dummy_if_null(ops, sysctl); set_to_dummy_if_null(ops, syslog); + set_to_dummy_if_null(ops, vm_enough_memory); set_to_dummy_if_null(ops, bprm_alloc_security); set_to_dummy_if_null(ops, bprm_free_security); set_to_dummy_if_null(ops, bprm_compute_creds); -- cgit v1.2.3 From 26e48e571aba7b6cba0cebb41d832949137b5fd5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 Jul 2003 10:32:49 -0700 Subject: [PATCH] Remove cpu arg from cpu_raise_irq The function cpu_raise_softirq() takes a softirq number, and a cpu number, but cannot be used with cpu != smp_processor_id(), because there's no locking around the pending softirq lists. Since noone does this, remove that arg. As per Linus' suggestion, names changed: raise_softirq(int nr) cpu_raise_softirq(int cpu, int nr) -> raise_softirq_irqoff(int nr) __cpu_raise_softirq(int cpu, int nr) -> __raise_softirq_irqoff(int nr) --- drivers/scsi/scsi.c | 2 +- include/linux/interrupt.h | 4 ++-- include/linux/netdevice.h | 8 ++++---- kernel/ksyms.c | 2 +- kernel/softirq.c | 14 +++++++------- net/core/dev.c | 2 +- 6 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index af35f91fe3b2..633c9a028e29 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -582,7 +582,7 @@ void scsi_done(struct scsi_cmnd *cmd) local_irq_save(flags); cpu = smp_processor_id(); list_add_tail(&cmd->eh_entry, &done_q[cpu]); - cpu_raise_softirq(cpu, SCSI_SOFTIRQ); + raise_softirq_irqoff(SCSI_SOFTIRQ); local_irq_restore(flags); } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index da2eaeb18118..21e48723b386 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -94,8 +94,8 @@ struct softirq_action asmlinkage void do_softirq(void); extern void open_softirq(int nr, void (*action)(struct softirq_action*), void *data); extern void softirq_init(void); -#define __cpu_raise_softirq(cpu, nr) do { softirq_pending(cpu) |= 1UL << (nr); } while (0) -extern void FASTCALL(cpu_raise_softirq(unsigned int cpu, unsigned int nr)); +#define __raise_softirq_irqoff(nr) do { local_softirq_pending() |= 1UL << (nr); } while (0) +extern void FASTCALL(raise_softirq_irqoff(unsigned int nr)); extern void FASTCALL(raise_softirq(unsigned int nr)); #ifndef invoke_softirq diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d79375c33273..3aef822b4493 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -561,7 +561,7 @@ static inline void __netif_schedule(struct net_device *dev) cpu = smp_processor_id(); dev->next_sched = softnet_data[cpu].output_queue; softnet_data[cpu].output_queue = dev; - cpu_raise_softirq(cpu, NET_TX_SOFTIRQ); + raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } } @@ -612,7 +612,7 @@ static inline void dev_kfree_skb_irq(struct sk_buff 
*skb) cpu = smp_processor_id(); skb->next = softnet_data[cpu].completion_queue; softnet_data[cpu].completion_queue = skb; - cpu_raise_softirq(cpu, NET_TX_SOFTIRQ); + raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } } @@ -779,7 +779,7 @@ static inline void __netif_rx_schedule(struct net_device *dev) dev->quota += dev->weight; else dev->quota = dev->weight; - __cpu_raise_softirq(cpu, NET_RX_SOFTIRQ); + __raise_softirq_irqoff(NET_RX_SOFTIRQ); local_irq_restore(flags); } @@ -805,7 +805,7 @@ static inline int netif_rx_reschedule(struct net_device *dev, int undo) local_irq_save(flags); cpu = smp_processor_id(); list_add_tail(&dev->poll_list, &softnet_data[cpu].poll_list); - __cpu_raise_softirq(cpu, NET_RX_SOFTIRQ); + __raise_softirq_irqoff(NET_RX_SOFTIRQ); local_irq_restore(flags); return 1; } diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 816627adc50a..66ea4b6b4d84 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -587,7 +587,7 @@ EXPORT_SYMBOL(tasklet_kill); EXPORT_SYMBOL(do_softirq); EXPORT_SYMBOL(raise_softirq); EXPORT_SYMBOL(open_softirq); -EXPORT_SYMBOL(cpu_raise_softirq); +EXPORT_SYMBOL(raise_softirq_irqoff); EXPORT_SYMBOL(__tasklet_schedule); EXPORT_SYMBOL(__tasklet_hi_schedule); diff --git a/kernel/softirq.c b/kernel/softirq.c index 20bf233a14c3..e581740a6e26 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -117,9 +117,9 @@ EXPORT_SYMBOL(local_bh_enable); /* * This function must run with irqs disabled! */ -inline void cpu_raise_softirq(unsigned int cpu, unsigned int nr) +inline void raise_softirq_irqoff(unsigned int nr) { - __cpu_raise_softirq(cpu, nr); + __raise_softirq_irqoff(nr); /* * If we're in an interrupt or softirq, we're done @@ -139,7 +139,7 @@ void raise_softirq(unsigned int nr) unsigned long flags; local_irq_save(flags); - cpu_raise_softirq(smp_processor_id(), nr); + raise_softirq_irqoff(nr); local_irq_restore(flags); } @@ -168,7 +168,7 @@ void __tasklet_schedule(struct tasklet_struct *t) local_irq_save(flags); t->next = __get_cpu_var(tasklet_vec).list; __get_cpu_var(tasklet_vec).list = t; - cpu_raise_softirq(smp_processor_id(), TASKLET_SOFTIRQ); + raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_restore(flags); } @@ -179,7 +179,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) local_irq_save(flags); t->next = __get_cpu_var(tasklet_hi_vec).list; __get_cpu_var(tasklet_hi_vec).list = t; - cpu_raise_softirq(smp_processor_id(), HI_SOFTIRQ); + raise_softirq_irqoff(HI_SOFTIRQ); local_irq_restore(flags); } @@ -211,7 +211,7 @@ static void tasklet_action(struct softirq_action *a) local_irq_disable(); t->next = __get_cpu_var(tasklet_vec).list; __get_cpu_var(tasklet_vec).list = t; - __cpu_raise_softirq(smp_processor_id(), TASKLET_SOFTIRQ); + __raise_softirq_irqoff(TASKLET_SOFTIRQ); local_irq_enable(); } } @@ -244,7 +244,7 @@ static void tasklet_hi_action(struct softirq_action *a) local_irq_disable(); t->next = __get_cpu_var(tasklet_hi_vec).list; __get_cpu_var(tasklet_hi_vec).list = t; - __cpu_raise_softirq(smp_processor_id(), HI_SOFTIRQ); + __raise_softirq_irqoff(HI_SOFTIRQ); local_irq_enable(); } } diff --git a/net/core/dev.c b/net/core/dev.c index 5102b235b57c..0605391589ad 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1712,7 +1712,7 @@ out: softnet_break: netdev_rx_stat[this_cpu].time_squeeze++; - __cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ); + __raise_softirq_irqoff(NET_RX_SOFTIRQ); goto out; } -- cgit v1.2.3 From 7b957b78f962cf3e844b7ddf8d740cb21dd276b9 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 Jul 2003 10:38:21 -0700 
Subject: [PATCH] Remove unused __syscall_count No one seems to use __syscall_count. Remove the field from the i386 irq_cpustat_t struct, and the generic accessor macros. Because some archs have hardcoded asm references to offsets in this structure, I haven't touched non-x86, but doing so is usually trivial. --- include/asm-i386/hardirq.h | 1 - include/linux/irq_cpustat.h | 2 -- 2 files changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/asm-i386/hardirq.h b/include/asm-i386/hardirq.h index e8b9149f0b29..a711a1890d97 100644 --- a/include/asm-i386/hardirq.h +++ b/include/asm-i386/hardirq.h @@ -7,7 +7,6 @@ typedef struct { unsigned int __softirq_pending; - unsigned int __syscall_count; struct task_struct * __ksoftirqd_task; /* waitqueue is too large */ unsigned long idle_timestamp; unsigned int __nmi_count; /* arch dependent */ diff --git a/include/linux/irq_cpustat.h b/include/linux/irq_cpustat.h index 3f49c2ba63ed..641e7964a0d7 100644 --- a/include/linux/irq_cpustat.h +++ b/include/linux/irq_cpustat.h @@ -29,8 +29,6 @@ extern irq_cpustat_t irq_stat[]; /* defined in asm/hardirq.h */ /* arch independent irq_stat fields */ #define softirq_pending(cpu) __IRQ_STAT((cpu), __softirq_pending) #define local_softirq_pending() softirq_pending(smp_processor_id()) -#define syscall_count(cpu) __IRQ_STAT((cpu), __syscall_count) -#define local_syscall_count() syscall_count(smp_processor_id()) #define ksoftirqd_task(cpu) __IRQ_STAT((cpu), __ksoftirqd_task) #define local_ksoftirqd_task() ksoftirqd_task(smp_processor_id()) -- cgit v1.2.3 From 3ac57d3424bca0406b5349f5187f5e3d84f64013 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 2 Jul 2003 10:38:29 -0700 Subject: [PATCH] Make ksoftirqd a normal per-cpu variable. This moves the ksoftirqd pointers out of the irq_stat struct, and uses a normal per-cpu variable. It's not that time-critical, nor referenced in assembler. This moves us closer to making irq_stat a per-cpu variable. Because some archs have hardcoded asm references to offsets in this structure, I haven't touched non-x86. The __ksoftirqd_task field is unused in other archs, too.
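The conversion idiom, restated in one place from the kernel/softirq.c hunks below (the code is lifted from the diff; only the framing comments are editorial):

	#include <linux/percpu.h>
	#include <linux/sched.h>

	/* One task pointer per CPU, replacing irq_stat[cpu].__ksoftirqd_task
	 * and the ksoftirqd_task()/local_ksoftirqd_task() accessor macros. */
	static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);

	/* Local-CPU access: callers run with interrupts disabled, so there
	 * is no preemption to stop before using __get_cpu_var(). */
	static inline void wakeup_softirqd(void)
	{
		struct task_struct *tsk = __get_cpu_var(ksoftirqd);

		if (tsk && tsk->state != TASK_RUNNING)
			wake_up_process(tsk);
	}

Cross-CPU access, e.g. the hotplug callback waiting for a freshly started thread to register itself, names the CPU explicitly instead: while (!per_cpu(ksoftirqd, hotcpu)) yield();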
--- include/asm-i386/hardirq.h | 1 - include/linux/irq_cpustat.h | 2 -- kernel/softirq.c | 16 ++++++++++------ 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/asm-i386/hardirq.h b/include/asm-i386/hardirq.h index a711a1890d97..5a14545af179 100644 --- a/include/asm-i386/hardirq.h +++ b/include/asm-i386/hardirq.h @@ -7,7 +7,6 @@ typedef struct { unsigned int __softirq_pending; - struct task_struct * __ksoftirqd_task; /* waitqueue is too large */ unsigned long idle_timestamp; unsigned int __nmi_count; /* arch dependent */ unsigned int apic_timer_irqs; /* arch dependent */ diff --git a/include/linux/irq_cpustat.h b/include/linux/irq_cpustat.h index 641e7964a0d7..03b3e17de805 100644 --- a/include/linux/irq_cpustat.h +++ b/include/linux/irq_cpustat.h @@ -29,8 +29,6 @@ extern irq_cpustat_t irq_stat[]; /* defined in asm/hardirq.h */ /* arch independent irq_stat fields */ #define softirq_pending(cpu) __IRQ_STAT((cpu), __softirq_pending) #define local_softirq_pending() softirq_pending(smp_processor_id()) -#define ksoftirqd_task(cpu) __IRQ_STAT((cpu), __ksoftirqd_task) -#define local_ksoftirqd_task() ksoftirqd_task(smp_processor_id()) /* arch dependent irq_stat fields */ #define nmi_count(cpu) __IRQ_STAT((cpu), __nmi_count) /* i386 */ diff --git a/kernel/softirq.c b/kernel/softirq.c index e581740a6e26..96294a3d673f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -14,6 +14,7 @@ #include #include #include +#include #include /* @@ -41,15 +42,18 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; +static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); + /* * we cannot loop indefinitely here to avoid userspace starvation, * but we also don't want to introduce a worst case 1/HZ latency * to the pending events, so lets the scheduler to balance * the softirq load for us. */ -static inline void wakeup_softirqd(unsigned cpu) +static inline void wakeup_softirqd(void) { - struct task_struct * tsk = ksoftirqd_task(cpu); + /* Interrupts are disabled: no need to stop preemption */ + struct task_struct *tsk = __get_cpu_var(ksoftirqd); if (tsk && tsk->state != TASK_RUNNING) wake_up_process(tsk); @@ -96,7 +100,7 @@ restart: goto restart; } if (pending) - wakeup_softirqd(smp_processor_id()); + wakeup_softirqd(); __local_bh_enable(); } @@ -131,7 +135,7 @@ inline void raise_softirq_irqoff(unsigned int nr) * schedule the softirq soon. 
*/ if (!in_interrupt()) - wakeup_softirqd(cpu); + wakeup_softirqd(); } void raise_softirq(unsigned int nr) @@ -325,7 +329,7 @@ static int ksoftirqd(void * __bind_cpu) __set_current_state(TASK_INTERRUPTIBLE); mb(); - local_ksoftirqd_task() = current; + __get_cpu_var(ksoftirqd) = current; for (;;) { if (!local_softirq_pending()) @@ -354,7 +358,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, return NOTIFY_BAD; } - while (!ksoftirqd_task(hotcpu)) + while (!per_cpu(ksoftirqd, hotcpu)) yield(); } return NOTIFY_OK; -- cgit v1.2.3 From f9cc1da5f29f35a0ebb69124092df437b4ab41fe Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 3 Jul 2003 01:51:15 -0700 Subject: [PATCH] PCI: pci_find_bus needs a domain Give pci_find_bus a domain argument and move its declaration to <linux/pci.h> --- drivers/pci/hotplug/acpiphp_glue.c | 2 +- drivers/pci/hotplug/cpci_hotplug_pci.c | 2 +- drivers/pci/hotplug/ibmphp_core.c | 6 +++--- drivers/pci/pci.h | 1 - drivers/pci/search.c | 18 ++++++++++-------- include/linux/pci.h | 1 + 6 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index 3af6ad4adbe7..4e8ddf184341 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -385,7 +385,7 @@ static void add_host_bridge (acpi_handle *handle, int seg, int bus) bridge->seg = seg; bridge->bus = bus; - bridge->pci_bus = pci_find_bus(bus); + bridge->pci_bus = pci_find_bus(seg, bus); bridge->res_lock = SPIN_LOCK_UNLOCKED; diff --git a/drivers/pci/hotplug/cpci_hotplug_pci.c b/drivers/pci/hotplug/cpci_hotplug_pci.c index 8ffe245a1ca2..88bc69c50539 100644 --- a/drivers/pci/hotplug/cpci_hotplug_pci.c +++ b/drivers/pci/hotplug/cpci_hotplug_pci.c @@ -395,7 +395,7 @@ static int cpci_configure_bridge(struct pci_bus* bus, struct pci_dev* dev) /* Scan behind bridge */ n = pci_scan_bridge(bus, dev, max, 2); - child = pci_find_bus(max + 1); + child = pci_find_bus(0, max + 1); if (!child) return -ENODEV; pci_proc_attach_bus(child); diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c index 1e44444e6287..1f0fa666cf8b 100644 --- a/drivers/pci/hotplug/ibmphp_core.c +++ b/drivers/pci/hotplug/ibmphp_core.c @@ -774,7 +774,7 @@ static u8 bus_structure_fixup (u8 busno) struct pci_dev *dev; u16 l; - if (pci_find_bus(busno) || !(ibmphp_find_same_bus_num (busno))) + if (pci_find_bus(0, busno) || !(ibmphp_find_same_bus_num (busno))) return 1; bus = kmalloc (sizeof (*bus), GFP_KERNEL); @@ -819,7 +819,7 @@ static int ibm_configure_device (struct pci_func *func) func->dev = pci_find_slot (func->busno, PCI_DEVFN(func->device, func->function)); if (func->dev == NULL) { - struct pci_bus *bus = pci_find_bus(func->busno); + struct pci_bus *bus = pci_find_bus(0, func->busno); if (!bus) return 0; @@ -1335,7 +1335,7 @@ static int __init ibmphp_init (void) goto exit; } - bus = pci_find_bus(0); + bus = pci_find_bus(0, 0); if (!bus) { err ("Can't find the root pci bus, can not continue\n"); rc = -ENODEV; diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 3288e401d914..2ad19d3f928e 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -29,7 +29,6 @@ extern int pci_remove_device_safe(struct pci_dev *dev); extern unsigned char pci_max_busnr(void); extern unsigned char pci_bus_max_busnr(struct pci_bus *bus); extern int pci_bus_find_capability (struct pci_bus *bus, unsigned int devfn, int cap); -extern struct pci_bus *pci_find_bus(unsigned char busnr); struct pci_dev_wrapped { struct
pci_dev *dev; diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 4793caaa4989..15c687f74343 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -31,22 +31,24 @@ pci_do_find_bus(struct pci_bus* bus, unsigned char busnr) } /** - * pci_find_bus - locate PCI bus from a given bus number + * pci_find_bus - locate PCI bus from a given domain and bus number + * @domain: number of PCI domain to search * @busnr: number of desired PCI bus * - * Given a PCI bus number, the desired PCI bus is located in system - * global list of PCI buses. If the bus is found, a pointer to its + * Given a PCI bus number and domain number, the desired PCI bus is located + * in the global list of PCI buses. If the bus is found, a pointer to its * data structure is returned. If no bus is found, %NULL is returned. */ -struct pci_bus * -pci_find_bus(unsigned char busnr) +struct pci_bus * pci_find_bus(int domain, int busnr) { - struct pci_bus* bus = NULL; - struct pci_bus* tmp_bus; + struct pci_bus *bus = NULL; + struct pci_bus *tmp_bus; while ((bus = pci_find_next_bus(bus)) != NULL) { + if (pci_domain_nr(bus) != domain) + continue; tmp_bus = pci_do_find_bus(bus, busnr); - if(tmp_bus) + if (tmp_bus) return tmp_bus; } return NULL; diff --git a/include/linux/pci.h b/include/linux/pci.h index 3ceb5d7da821..72f08971a232 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -543,6 +543,7 @@ void pcibios_update_irq(struct pci_dev *, int irq); /* Generic PCI functions used internally */ +extern struct pci_bus *pci_find_bus(int domain, int busnr); int pci_bus_exists(const struct list_head *list, int nr); struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus, struct pci_ops *ops, void *sysdata); static inline struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops, void *sysdata) -- cgit v1.2.3 From fed2058e09d7ddf242079e0dd409fc25e5f428c0 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 3 Jul 2003 01:51:30 -0700 Subject: [PATCH] PCI: Remove pci_bus_exists Convert all callers of pci_bus_exists() to call pci_find_bus() instead. Since all callers of pci_find_bus() are __init or __devinit, mark it as __devinit too. 
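The call-site conversion is mechanical; here is a minimal sketch, modeled on the arch/i386/pci/legacy.c hunk below (scan_peer_buses() itself is a hypothetical wrapper, not code from the patch):

	#include <linux/pci.h>

	/* pci_bus_exists(&pci_root_buses, n) is gone; asking pci_find_bus()
	 * for (domain 0, bus n) and testing the result for NULL answers the
	 * same question for the legacy single-domain callers. */
	static void __devinit scan_peer_buses(int last_bus)
	{
		int n;

		for (n = 0; n <= last_bus; n++) {
			if (pci_find_bus(0, n))	/* bus already known */
				continue;
			/* ... probe bus n behind a peer bridge ... */
		}
	}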
--- arch/i386/pci/legacy.c | 2 +- arch/sh/kernel/cpu/sh4/pci-sh7751.c | 2 +- drivers/pci/probe.c | 13 +------------ drivers/pci/search.c | 5 +++-- include/linux/pci.h | 1 - 5 files changed, 6 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/pci/legacy.c b/arch/i386/pci/legacy.c index 29fea7d6ad6c..71ca3133cdeb 100644 --- a/arch/i386/pci/legacy.c +++ b/arch/i386/pci/legacy.c @@ -28,7 +28,7 @@ static void __devinit pcibios_fixup_peer_bridges(void) } for (n=0; n <= pcibios_last_bus; n++) { - if (pci_bus_exists(&pci_root_buses, n)) + if (pci_find_bus(0, n)) continue; bus->number = n; bus->ops = &pci_root_ops; diff --git a/arch/sh/kernel/cpu/sh4/pci-sh7751.c b/arch/sh/kernel/cpu/sh4/pci-sh7751.c index 365c71a4fbe0..0831b1c646ac 100644 --- a/arch/sh/kernel/cpu/sh4/pci-sh7751.c +++ b/arch/sh/kernel/cpu/sh4/pci-sh7751.c @@ -200,7 +200,7 @@ static void __init pcibios_fixup_peer_bridges(void) return; PCIDBG(2,"PCI: Peer bridge fixup\n"); for (n=0; n <= pcibios_last_bus; n++) { - if (pci_bus_exists(&pci_root_buses, n)) + if (pci_find_bus(0, n)) continue; bus.number = n; bus.ops = pci_root_ops; diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 0894f4aed331..af83b3936f6f 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -633,22 +633,11 @@ unsigned int __devinit pci_do_scan_bus(struct pci_bus *bus) return max; } -int __devinit pci_bus_exists(const struct list_head *list, int nr) -{ - const struct pci_bus *b; - - list_for_each_entry(b, list, node) { - if (b->number == nr || pci_bus_exists(&b->children, nr)) - return 1; - } - return 0; -} - struct pci_bus * __devinit pci_scan_bus_parented(struct device *parent, int bus, struct pci_ops *ops, void *sysdata) { struct pci_bus *b; - if (pci_bus_exists(&pci_root_buses, bus)) { + if (pci_find_bus(0, bus)) { /* If we already got to this bus through a different bridge, ignore it */ DBG("PCI: Bus %02x already known\n", bus); return NULL; diff --git a/drivers/pci/search.c b/drivers/pci/search.c index 15c687f74343..366f1f16fb2f 100644 --- a/drivers/pci/search.c +++ b/drivers/pci/search.c @@ -7,13 +7,14 @@ * Copyright 2003 -- Greg Kroah-Hartman */ +#include #include #include #include spinlock_t pci_bus_lock = SPIN_LOCK_UNLOCKED; -static struct pci_bus * +static struct pci_bus * __devinit pci_do_find_bus(struct pci_bus* bus, unsigned char busnr) { struct pci_bus* child; @@ -39,7 +40,7 @@ pci_do_find_bus(struct pci_bus* bus, unsigned char busnr) * in the global list of PCI buses. If the bus is found, a pointer to its * data structure is returned. If no bus is found, %NULL is returned. 
*/ -struct pci_bus * pci_find_bus(int domain, int busnr) +struct pci_bus * __devinit pci_find_bus(int domain, int busnr) { struct pci_bus *bus = NULL; struct pci_bus *tmp_bus; diff --git a/include/linux/pci.h b/include/linux/pci.h index 72f08971a232..a219c58ad88e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -544,7 +544,6 @@ void pcibios_update_irq(struct pci_dev *, int irq); /* Generic PCI functions used internally */ extern struct pci_bus *pci_find_bus(int domain, int busnr); -int pci_bus_exists(const struct list_head *list, int nr); struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus, struct pci_ops *ops, void *sysdata); static inline struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops, void *sysdata) { -- cgit v1.2.3 From 98823466c86c19e7c5e7d8ebe75527acf7f47f6a Mon Sep 17 00:00:00 2001 From: Adam Belay Date: Thu, 3 Jul 2003 15:39:09 +0000 Subject: [PNP] Handle Disabled Resources Properly Some devices will allow for individual resources to be disabled, even when the device as a whole is active. The current PnP resource manager is not handling this situation properly. This patch corrects the issue by detecting disabled resources and then flagging them. The pnp layer will now skip over any disabled resources. Interface updates have also been included so that we can properly display resource tables when a resource is disabled. Also note that a new flag "IORESOURCE_DISABLED" has been added to linux/ioports.h. --- drivers/pnp/interface.c | 22 ++++++++++++++++++---- drivers/pnp/manager.c | 12 ++++++++++++ drivers/pnp/resource.c | 8 ++++++++ drivers/pnp/support.c | 24 ++++++++++++++++++++---- include/linux/ioport.h | 1 + 5 files changed, 59 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/pnp/interface.c b/drivers/pnp/interface.c index eb2094bf72d0..e2b7388f7ec2 100644 --- a/drivers/pnp/interface.c +++ b/drivers/pnp/interface.c @@ -259,7 +259,10 @@ static ssize_t pnp_show_current_resources(struct device *dmdev, char *buf) for (i = 0; i < PNP_MAX_PORT; i++) { if (pnp_port_valid(dev, i)) { pnp_printf(buffer,"io"); - pnp_printf(buffer," 0x%lx-0x%lx \n", + if (pnp_port_flags(dev, i) & IORESOURCE_DISABLED) + pnp_printf(buffer," disabled\n"); + else + pnp_printf(buffer," 0x%lx-0x%lx\n", pnp_port_start(dev, i), pnp_port_end(dev, i)); } @@ -267,7 +270,10 @@ static ssize_t pnp_show_current_resources(struct device *dmdev, char *buf) for (i = 0; i < PNP_MAX_MEM; i++) { if (pnp_mem_valid(dev, i)) { pnp_printf(buffer,"mem"); - pnp_printf(buffer," 0x%lx-0x%lx \n", + if (pnp_mem_flags(dev, i) & IORESOURCE_DISABLED) + pnp_printf(buffer," disabled\n"); + else + pnp_printf(buffer," 0x%lx-0x%lx\n", pnp_mem_start(dev, i), pnp_mem_end(dev, i)); } @@ -275,13 +281,21 @@ static ssize_t pnp_show_current_resources(struct device *dmdev, char *buf) for (i = 0; i < PNP_MAX_IRQ; i++) { if (pnp_irq_valid(dev, i)) { pnp_printf(buffer,"irq"); - pnp_printf(buffer," %ld \n", pnp_irq(dev, i)); + if (pnp_irq_flags(dev, i) & IORESOURCE_DISABLED) + pnp_printf(buffer," disabled\n"); + else + pnp_printf(buffer," %ld\n", + pnp_irq(dev, i)); } } for (i = 0; i < PNP_MAX_DMA; i++) { if (pnp_dma_valid(dev, i)) { pnp_printf(buffer,"dma"); - pnp_printf(buffer," %ld \n", pnp_dma(dev, i)); + if (pnp_dma_flags(dev, i) & IORESOURCE_DISABLED) + pnp_printf(buffer," disabled\n"); + else + pnp_printf(buffer," %ld\n", + pnp_dma(dev, i)); } } ret = (buffer->curr - buf); diff --git a/drivers/pnp/manager.c b/drivers/pnp/manager.c index a56dfac58b36..cc2bd90ae990 100644 --- 
a/drivers/pnp/manager.c +++ b/drivers/pnp/manager.c @@ -40,6 +40,9 @@ static int pnp_assign_port(struct pnp_dev *dev, struct pnp_port *rule, int idx) if (!(dev->res.port_resource[idx].flags & IORESOURCE_AUTO)) return 1; + if (!rule->size) + return 1; /* skip disabled resource requests */ + start = &dev->res.port_resource[idx].start; end = &dev->res.port_resource[idx].end; flags = &dev->res.port_resource[idx].flags; @@ -76,6 +79,9 @@ static int pnp_assign_mem(struct pnp_dev *dev, struct pnp_mem *rule, int idx) if (!(dev->res.mem_resource[idx].flags & IORESOURCE_AUTO)) return 1; + if (!rule->size) + return 1; /* skip disabled resource requests */ + start = &dev->res.mem_resource[idx].start; end = &dev->res.mem_resource[idx].end; flags = &dev->res.mem_resource[idx].flags; @@ -128,6 +134,9 @@ static int pnp_assign_irq(struct pnp_dev * dev, struct pnp_irq *rule, int idx) if (!(dev->res.irq_resource[idx].flags & IORESOURCE_AUTO)) return 1; + if (!rule->map) + return 1; /* skip disabled resource requests */ + start = &dev->res.irq_resource[idx].start; end = &dev->res.irq_resource[idx].end; flags = &dev->res.irq_resource[idx].flags; @@ -168,6 +177,9 @@ static int pnp_assign_dma(struct pnp_dev *dev, struct pnp_dma *rule, int idx) if (!(dev->res.dma_resource[idx].flags & IORESOURCE_AUTO)) return 1; + if (!rule->map) + return 1; /* skip disabled resource requests */ + start = &dev->res.dma_resource[idx].start; end = &dev->res.dma_resource[idx].end; flags = &dev->res.dma_resource[idx].flags; diff --git a/drivers/pnp/resource.c b/drivers/pnp/resource.c index 978decf7504a..c7c664a3035f 100644 --- a/drivers/pnp/resource.c +++ b/drivers/pnp/resource.c @@ -286,6 +286,8 @@ int pnp_check_port(struct pnp_dev * dev, int idx) continue; for (tmp = 0; tmp < PNP_MAX_PORT; tmp++) { if (tdev->res.port_resource[tmp].flags & IORESOURCE_IO) { + if (pnp_port_flags(dev, tmp) & IORESOURCE_DISABLED) + continue; tport = &tdev->res.port_resource[tmp].start; tend = &tdev->res.port_resource[tmp].end; if (ranged_conflict(port,end,tport,tend)) @@ -340,6 +342,8 @@ int pnp_check_mem(struct pnp_dev * dev, int idx) continue; for (tmp = 0; tmp < PNP_MAX_MEM; tmp++) { if (tdev->res.mem_resource[tmp].flags & IORESOURCE_MEM) { + if (pnp_mem_flags(dev, tmp) & IORESOURCE_DISABLED) + continue; taddr = &tdev->res.mem_resource[tmp].start; tend = &tdev->res.mem_resource[tmp].end; if (ranged_conflict(addr,end,taddr,tend)) @@ -409,6 +413,8 @@ int pnp_check_irq(struct pnp_dev * dev, int idx) continue; for (tmp = 0; tmp < PNP_MAX_IRQ; tmp++) { if (tdev->res.irq_resource[tmp].flags & IORESOURCE_IRQ) { + if (pnp_irq_flags(dev, tmp) & IORESOURCE_DISABLED) + continue; if ((tdev->res.irq_resource[tmp].start == *irq)) return 0; } @@ -462,6 +468,8 @@ int pnp_check_dma(struct pnp_dev * dev, int idx) continue; for (tmp = 0; tmp < PNP_MAX_DMA; tmp++) { if (tdev->res.dma_resource[tmp].flags & IORESOURCE_DMA) { + if (pnp_dma_flags(dev, tmp) & IORESOURCE_DISABLED) + continue; if ((tdev->res.dma_resource[tmp].start == *dma)) return 0; } diff --git a/drivers/pnp/support.c b/drivers/pnp/support.c index af359e092ed0..375aa2172239 100644 --- a/drivers/pnp/support.c +++ b/drivers/pnp/support.c @@ -68,9 +68,13 @@ static void current_irqresource(struct pnp_resource_table * res, int irq) int i = 0; while ((res->irq_resource[i].flags & IORESOURCE_IRQ) && i < PNP_MAX_IRQ) i++; if (i < PNP_MAX_IRQ) { + res->irq_resource[i].flags = IORESOURCE_IRQ; // Also clears _UNSET flag + if (irq == -1) { + res->irq_resource[i].flags |= IORESOURCE_DISABLED; + return; + } 
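+		/*
+		 * (Editorial note, not in the original patch: the callers
+		 *  report a disabled IRQ as -1, so it is flagged with
+		 *  IORESOURCE_DISABLED above instead of being recorded as
+		 *  a real interrupt line.)
+		 */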
res->irq_resource[i].start = res->irq_resource[i].end = (unsigned long) irq; - res->irq_resource[i].flags = IORESOURCE_IRQ; // Also clears _UNSET flag } } @@ -79,9 +83,13 @@ static void current_dmaresource(struct pnp_resource_table * res, int dma) int i = 0; while ((res->dma_resource[i].flags & IORESOURCE_DMA) && i < PNP_MAX_DMA) i++; if (i < PNP_MAX_DMA) { + res->dma_resource[i].flags = IORESOURCE_DMA; // Also clears _UNSET flag + if (dma == -1) { + res->dma_resource[i].flags |= IORESOURCE_DISABLED; + return; + } res->dma_resource[i].start = res->dma_resource[i].end = (unsigned long) dma; - res->dma_resource[i].flags = IORESOURCE_DMA; // Also clears _UNSET flag } } @@ -90,9 +98,13 @@ static void current_ioresource(struct pnp_resource_table * res, int io, int len) int i = 0; while ((res->port_resource[i].flags & IORESOURCE_IO) && i < PNP_MAX_PORT) i++; if (i < PNP_MAX_PORT) { + res->port_resource[i].flags = IORESOURCE_IO; // Also clears _UNSET flag + if (len <= 0 || (io + len -1) >= 0x10003) { + res->port_resource[i].flags |= IORESOURCE_DISABLED; + return; + } res->port_resource[i].start = (unsigned long) io; res->port_resource[i].end = (unsigned long)(io + len - 1); - res->port_resource[i].flags = IORESOURCE_IO; // Also clears _UNSET flag } } @@ -101,9 +113,13 @@ static void current_memresource(struct pnp_resource_table * res, int mem, int le int i = 0; while ((res->mem_resource[i].flags & IORESOURCE_MEM) && i < PNP_MAX_MEM) i++; if (i < PNP_MAX_MEM) { + res->mem_resource[i].flags = IORESOURCE_MEM; // Also clears _UNSET flag + if (len <= 0) { + res->mem_resource[i].flags |= IORESOURCE_DISABLED; + return; + } res->mem_resource[i].start = (unsigned long) mem; res->mem_resource[i].end = (unsigned long)(mem + len - 1); - res->mem_resource[i].flags = IORESOURCE_MEM; // Also clears _UNSET flag } } diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 9193a8df0122..26d6293ed4c9 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -43,6 +43,7 @@ struct resource_list { #define IORESOURCE_SHADOWABLE 0x00010000 #define IORESOURCE_BUS_HAS_VGA 0x00080000 +#define IORESOURCE_DISABLED 0x10000000 #define IORESOURCE_UNSET 0x20000000 #define IORESOURCE_AUTO 0x40000000 #define IORESOURCE_BUSY 0x80000000 /* Driver has marked this resource busy */ -- cgit v1.2.3 From 1cf6d20f607854e784041115edc5709b5c847937 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 3 Jul 2003 02:28:49 -0700 Subject: [PATCH] SYSFS: add module referencing to sysfs attribute files. --- fs/sysfs/file.c | 9 +++++++++ include/linux/device.h | 11 ++++++----- include/linux/sysfs.h | 2 ++ 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1d25a84702b2..2cedefe8c4a0 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -247,6 +247,12 @@ static int check_perm(struct inode * inode, struct file * file) if (!kobj || !attr) goto Einval; + /* Grab the module reference for this attribute if we have one */ + if (!try_module_get(attr->owner)) { + error = -ENODEV; + goto Done; + } + /* if the kobject has no ktype, then we assume that it is a subsystem * itself, and use ops for it. 
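 * (Editorial note: the try_module_get() added above pins the attribute's
 * owning module for as long as the file stays open; sysfs_release() and
 * the error paths drop the reference again.)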
*/ @@ -300,6 +306,7 @@ static int check_perm(struct inode * inode, struct file * file) goto Done; Eaccess: error = -EACCES; + module_put(attr->owner); Done: if (error && kobj) kobject_put(kobj); @@ -314,10 +321,12 @@ static int sysfs_open_file(struct inode * inode, struct file * filp) static int sysfs_release(struct inode * inode, struct file * filp) { struct kobject * kobj = filp->f_dentry->d_parent->d_fsdata; + struct attribute * attr = filp->f_dentry->d_fsdata; struct sysfs_buffer * buffer = filp->private_data; if (kobj) kobject_put(kobj); + module_put(attr->owner); if (buffer) { if (buffer->page) diff --git a/include/linux/device.h b/include/linux/device.h index 1bd92551c077..edf43ff2ffb2 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -95,7 +96,7 @@ struct bus_attribute { #define BUS_ATTR(_name,_mode,_show,_store) \ struct bus_attribute bus_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr = {.name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE }, \ .show = _show, \ .store = _store, \ }; @@ -136,7 +137,7 @@ struct driver_attribute { #define DRIVER_ATTR(_name,_mode,_show,_store) \ struct driver_attribute driver_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr = {.name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE }, \ .show = _show, \ .store = _store, \ }; @@ -176,7 +177,7 @@ struct class_attribute { #define CLASS_ATTR(_name,_mode,_show,_store) \ struct class_attribute class_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr = {.name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE }, \ .show = _show, \ .store = _store, \ }; @@ -226,7 +227,7 @@ struct class_device_attribute { #define CLASS_DEVICE_ATTR(_name,_mode,_show,_store) \ struct class_device_attribute class_device_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr = {.name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE }, \ .show = _show, \ .store = _store, \ }; @@ -324,7 +325,7 @@ struct device_attribute { #define DEVICE_ATTR(_name,_mode,_show,_store) \ struct device_attribute dev_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ + .attr = {.name = __stringify(_name), .mode = _mode, .owner = THIS_MODULE }, \ .show = _show, \ .store = _store, \ }; diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index f054416c8145..6d8af386ab1d 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -10,9 +10,11 @@ #define _SYSFS_H_ struct kobject; +struct module; struct attribute { char * name; + struct module * owner; mode_t mode; }; -- cgit v1.2.3 From f91c01ac74c4970d1d31324d6e80d78aaceae2b8 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 3 Jul 2003 03:43:18 -0700 Subject: [PATCH] sysfs: add sysfs_rename_dir() Based on a patch written by Dan Aloni --- fs/sysfs/dir.c | 22 ++++++++++++++++++++++ include/linux/sysfs.h | 3 +++ 2 files changed, 25 insertions(+) (limited to 'include/linux') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 1ca3a06db1df..0b1588ab9259 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -121,7 +121,29 @@ void sysfs_remove_dir(struct kobject * kobj) dput(parent); } +void sysfs_rename_dir(struct kobject * kobj, char *new_name) +{ + struct dentry * new_dentry, * parent; + + if (!strcmp(kobj->name, new_name)) + return; + + if (!kobj->parent) + return; + + parent = kobj->parent->dentry; + + 
down(&parent->d_inode->i_sem); + + new_dentry = sysfs_get_dentry(parent, new_name); + d_move(kobj->dentry, new_dentry); + + strlcpy(kobj->name, new_name, KOBJ_NAME_LEN); + + up(&parent->d_inode->i_sem); +} EXPORT_SYMBOL(sysfs_create_dir); EXPORT_SYMBOL(sysfs_remove_dir); +EXPORT_SYMBOL(sysfs_rename_dir); diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 6d8af386ab1d..441c0d91f583 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -39,6 +39,9 @@ sysfs_create_dir(struct kobject *); extern void sysfs_remove_dir(struct kobject *); +extern void +sysfs_rename_dir(struct kobject *, char *new_name); + extern int sysfs_create_file(struct kobject *, struct attribute *); -- cgit v1.2.3 From e956d3ab2a9fd0387d41f5035e0902e06bcbc219 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 3 Jul 2003 03:43:34 -0700 Subject: [PATCH] kobject: add kobject_rename() Based on a patch written by Dan Aloni --- include/linux/kobject.h | 2 ++ lib/kobject.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 5d42248dd95f..e744a55d07d5 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -39,6 +39,8 @@ extern void kobject_cleanup(struct kobject *); extern int kobject_add(struct kobject *); extern void kobject_del(struct kobject *); +extern void kobject_rename(struct kobject *, char *new_name); + extern int kobject_register(struct kobject *); extern void kobject_unregister(struct kobject *); diff --git a/lib/kobject.c b/lib/kobject.c index fb49131f5ff9..15fa0ba4dd88 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -313,6 +313,21 @@ int kobject_register(struct kobject * kobj) return error; } +/** + * kobject_rename - change the name of an object + * @kobj: object in question. + * @new_name: object's new name + */ + +void kobject_rename(struct kobject * kobj, char *new_name) +{ + kobj = kobject_get(kobj); + if (!kobj) + return; + sysfs_rename_dir(kobj, new_name); + kobject_put(kobj); +} + /** * kobject_del - unlink kobject from hierarchy. * @kobj: object. 
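[Editor's note: a minimal usage sketch, not part of the patch. Both new calls
return void, and sysfs_rename_dir() quietly does nothing if the name is
unchanged or the kobject has no parent, so there is no error to check here.]

	/* kobj must already be registered (and have a parent) */
	kobject_rename(kobj, "eth1");
	/* the sysfs directory has been d_move()d to the new name and
	 * kobj->name updated via strlcpy(), per sysfs_rename_dir() */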
-- cgit v1.2.3 From 59c6630a851e15bc6bcecac9656e916574203b95 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 3 Jul 2003 03:43:49 -0700 Subject: [PATCH] driver core: added class_device_rename() Based on a patch written by Dan Aloni --- drivers/base/class.c | 18 ++++++++++++++++++ include/linux/device.h | 2 ++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/class.c b/drivers/base/class.c index ea551b8dc28b..d9eff17c0bb0 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -339,6 +339,24 @@ void class_device_unregister(struct class_device *class_dev) class_device_put(class_dev); } +int class_device_rename(struct class_device *class_dev, char *new_name) +{ + class_dev = class_device_get(class_dev); + if (!class_dev) + return -EINVAL; + + pr_debug("CLASS: renaming '%s' to '%s'\n", class_dev->class_id, + new_name); + + strlcpy(class_dev->class_id, new_name, KOBJ_NAME_LEN); + + kobject_rename(&class_dev->kobj, new_name); + + class_device_put(class_dev); + + return 0; +} + struct class_device * class_device_get(struct class_device *class_dev) { if (class_dev) diff --git a/include/linux/device.h b/include/linux/device.h index edf43ff2ffb2..2795b85ac6f1 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -216,6 +216,8 @@ extern void class_device_initialize(struct class_device *); extern int class_device_add(struct class_device *); extern void class_device_del(struct class_device *); +extern int class_device_rename(struct class_device *, char *); + extern struct class_device * class_device_get(struct class_device *); extern void class_device_put(struct class_device *); -- cgit v1.2.3 From 4b22645477b933f5cf2a972beebef367b628cdc2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 3 Jul 2003 04:54:42 -0700 Subject: Add an asynchronous buffer read-ahead facility. Nobody uses it for now, but I needed it for some tuning tests, and it is potentially useful for others. --- fs/buffer.c | 22 ++++++++++++++++++++++ include/linux/buffer_head.h | 7 +++++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index 56c9f4e03bdd..f063200c5b66 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1447,6 +1447,28 @@ __getblk(struct block_device *bdev, sector_t block, int size) } EXPORT_SYMBOL(__getblk); +/* + * Do async read-ahead on a buffer.. + */ +void +__breadahead(struct block_device *bdev, sector_t block, int size) +{ + struct buffer_head *bh = __getblk(bdev, block, size); + if (!test_set_buffer_locked(bh)) { + if (!buffer_uptodate(bh)) { + /* + * This eats the bh count from __getblk() and + * unlocks when the read is done. 
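+			 * (Editorial aside: if the buffer is already locked by
+			 *  another reader, or already uptodate, control falls
+			 *  through to brelse() below and the read-ahead is a
+			 *  cheap no-op.)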
+ */ + bh->b_end_io = end_buffer_io_sync; + submit_bh(READ, bh); + return; + } + unlock_buffer(bh); + } + brelse(bh); +} + /** * __bread() - reads a specified block and returns the bh * @block: number of block diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 22d3ac8efc6b..1f468b0491ed 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -167,6 +167,7 @@ struct buffer_head *__find_get_block(struct block_device *, sector_t, int); struct buffer_head * __getblk(struct block_device *, sector_t, int); void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); +void __breadahead(struct block_device *, sector_t block, int size); struct buffer_head *__bread(struct block_device *, sector_t block, int size); struct buffer_head *alloc_buffer_head(int gfp_flags); void free_buffer_head(struct buffer_head * bh); @@ -241,6 +242,12 @@ sb_bread(struct super_block *sb, sector_t block) return __bread(sb->s_bdev, block, sb->s_blocksize); } +static inline void +sb_breadahead(struct super_block *sb, sector_t block) +{ + __breadahead(sb->s_bdev, block, sb->s_blocksize); +} + static inline struct buffer_head * sb_getblk(struct super_block *sb, sector_t block) { -- cgit v1.2.3 From 9c67eccb82d6ce0fb44a812ef5f76be970eedd1b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 3 Jul 2003 05:20:52 -0700 Subject: Re-organize "ext3_get_inode_loc()" and make it easier to follow by splitting it into two functions: one that calculates the position, and the other that actually reads the inode block off the disk. --- fs/ext3/inode.c | 101 +++++++++++++++++++++++++----------------------- include/linux/ext3_fs.h | 6 ++- 2 files changed, 57 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index c2b0304b1855..aa632b07899a 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2290,68 +2290,72 @@ out_stop: ext3_journal_stop(handle); } -/* - * ext3_get_inode_loc returns with an extra refcount against the - * inode's underlying buffer_head on success. 
- */ - -int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc) +static unsigned long ext3_get_inode_block(struct super_block *sb, + unsigned long ino, struct ext3_iloc *iloc) { - struct buffer_head *bh = 0; - unsigned long block; - unsigned long block_group; - unsigned long group_desc; - unsigned long desc; - unsigned long offset; + unsigned long desc, group_desc, block_group; + unsigned long offset, block; + struct buffer_head *bh; struct ext3_group_desc * gdp; - if ((inode->i_ino != EXT3_ROOT_INO && - inode->i_ino != EXT3_JOURNAL_INO && - inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || - inode->i_ino > le32_to_cpu( - EXT3_SB(inode->i_sb)->s_es->s_inodes_count)) { - ext3_error (inode->i_sb, "ext3_get_inode_loc", - "bad inode number: %lu", inode->i_ino); - goto bad_inode; + if ((ino != EXT3_ROOT_INO && + ino != EXT3_JOURNAL_INO && + ino < EXT3_FIRST_INO(sb)) || + ino > le32_to_cpu( + EXT3_SB(sb)->s_es->s_inodes_count)) { + ext3_error (sb, "ext3_get_inode_block", + "bad inode number: %lu", ino); + return 0; } - block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb); - if (block_group >= EXT3_SB(inode->i_sb)->s_groups_count) { - ext3_error (inode->i_sb, "ext3_get_inode_loc", + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + if (block_group >= EXT3_SB(sb)->s_groups_count) { + ext3_error (sb, "ext3_get_inode_block", "group >= groups count"); - goto bad_inode; + return 0; } - group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb); - desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1); - bh = EXT3_SB(inode->i_sb)->s_group_desc[group_desc]; + group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb); + desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1); + bh = EXT3_SB(sb)->s_group_desc[group_desc]; if (!bh) { - ext3_error (inode->i_sb, "ext3_get_inode_loc", + ext3_error (sb, "ext3_get_inode_block", "Descriptor not loaded"); - goto bad_inode; + return 0; } gdp = (struct ext3_group_desc *) bh->b_data; /* * Figure out the offset within the block group inode table */ - offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) * - EXT3_INODE_SIZE(inode->i_sb); + offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * + EXT3_INODE_SIZE(sb); block = le32_to_cpu(gdp[desc].bg_inode_table) + - (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb)); - if (!(bh = sb_bread(inode->i_sb, block))) { - ext3_error (inode->i_sb, "ext3_get_inode_loc", - "unable to read inode block - " - "inode=%lu, block=%lu", inode->i_ino, block); - goto bad_inode; - } - offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1); + (offset >> EXT3_BLOCK_SIZE_BITS(sb)); - iloc->bh = bh; - iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset); iloc->block_group = block_group; + iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1); + return block; +} - return 0; +/* + * ext3_get_inode_loc returns with an extra refcount against the + * inode's underlying buffer_head on success. 
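+ * (Editorial note: the caller owns that reference and must drop it with
+ * brelse() when finished -- ext3_read_inode() below now does so only
+ * after its last use of the raw inode.)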
+ */ + +int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc) +{ + unsigned long block; - bad_inode: + block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); + if (block) { + struct buffer_head *bh = sb_bread(inode->i_sb, block); + if (bh) { + iloc->bh = bh; + return 0; + } + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "unable to read inode block - " + "inode=%lu, block=%lu", inode->i_ino, block); + } return -EIO; } @@ -2388,7 +2392,7 @@ void ext3_read_inode(struct inode * inode) if (ext3_get_inode_loc(inode, &iloc)) goto bad_inode; bh = iloc.bh; - raw_inode = iloc.raw_inode; + raw_inode = ext3_raw_inode(&iloc); inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); @@ -2454,11 +2458,9 @@ void ext3_read_inode(struct inode * inode) * even on big-endian machines: we do NOT byteswap the block numbers! */ for (block = 0; block < EXT3_N_BLOCKS; block++) - ei->i_data[block] = iloc.raw_inode->i_block[block]; + ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); - brelse (iloc.bh); - if (S_ISREG(inode->i_mode)) { inode->i_op = &ext3_file_inode_operations; inode->i_fop = &ext3_file_operations; @@ -2476,8 +2478,9 @@ void ext3_read_inode(struct inode * inode) } else { inode->i_op = &ext3_special_inode_operations; init_special_inode(inode, inode->i_mode, - le32_to_cpu(iloc.raw_inode->i_block[0])); + le32_to_cpu(raw_inode->i_block[0])); } + brelse (iloc.bh); ext3_set_inode_flags(inode); return; @@ -2497,7 +2500,7 @@ static int ext3_do_update_inode(handle_t *handle, struct inode *inode, struct ext3_iloc *iloc) { - struct ext3_inode *raw_inode = iloc->raw_inode; + struct ext3_inode *raw_inode = ext3_raw_inode(iloc); struct ext3_inode_info *ei = EXT3_I(inode); struct buffer_head *bh = iloc->bh; int err = 0, rc, block; diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index f077563bcfc3..c360f84fed3d 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -636,10 +636,14 @@ struct dx_hash_info struct ext3_iloc { struct buffer_head *bh; - struct ext3_inode *raw_inode; + unsigned long offset; unsigned long block_group; }; +static inline struct ext3_inode *ext3_raw_inode(struct ext3_iloc *iloc) +{ + return (struct ext3_inode *) (iloc->bh->b_data + iloc->offset); +} /* * This structure is stuffed into the struct file's private_data field -- cgit v1.2.3 From fc8b427ef827733152f3e9d9e8b61ac7d69e06a5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 3 Jul 2003 22:06:06 -0700 Subject: [PATCH] Add open intent information to the 'struct nameidata' - Add open intent information to the 'struct nameidata'. - Pass the struct nameidata as an optional parameter to the lookup() inode operation. - Pass the struct nameidata as an optional parameter to the d_revalidate() dentry operation. - Make link_path_walk() set the LOOKUP_CONTINUE flag in nd->flags instead of passing it as an extra parameter to d_revalidate(). - Make open_namei(), and sys_uselib() set the open()/create() intent data. 
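[Editor's note: a hedged sketch, not from the patch, of what a filesystem's
lookup() can now see through the extra argument. The function is hypothetical;
the flag names, the intent fields, and the fact that nd may be NULL (the
lookup_hash() wrapper passes NULL) all come from the hunks below.]

	static struct dentry *example_lookup(struct inode *dir,
					     struct dentry *dentry,
					     struct nameidata *nd)
	{
		/* nd is NULL for internal lookups via lookup_hash() */
		if (nd && (nd->flags & LOOKUP_OPEN)) {
			/* open intent: nd->intent.open.flags holds the O_*
			 * flags and nd->intent.open.create_mode the mode
			 * from open(2) -- a network filesystem could combine
			 * the lookup and the open into a single request */
		}
		if (nd && (nd->flags & LOOKUP_CONTINUE)) {
			/* not the final path component */
		}
		d_add(dentry, NULL);	/* default: negative dentry */
		return NULL;
	}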
--- drivers/net/wan/comx.c | 4 ++-- fs/adfs/adfs.h | 2 +- fs/adfs/dir.c | 2 +- fs/affs/namei.c | 2 +- fs/afs/dir.c | 10 ++++----- fs/afs/mntpt.c | 4 ++-- fs/autofs/root.c | 8 ++++---- fs/autofs4/root.c | 14 ++++++------- fs/befs/linuxvfs.c | 4 ++-- fs/bfs/dir.c | 2 +- fs/cifs/cifsfs.h | 2 +- fs/cifs/dir.c | 4 ++-- fs/coda/dir.c | 8 ++++---- fs/cramfs/inode.c | 2 +- fs/devfs/base.c | 6 +++--- fs/efs/namei.c | 2 +- fs/exec.c | 3 ++- fs/ext2/namei.c | 2 +- fs/ext3/namei.c | 2 +- fs/freevxfs/vxfs_lookup.c | 5 +++-- fs/hfs/dir_cap.c | 4 ++-- fs/hfs/dir_dbl.c | 4 ++-- fs/hfs/dir_nat.c | 4 ++-- fs/hfs/sysdep.c | 4 ++-- fs/hpfs/dir.c | 2 +- fs/hpfs/hpfs_fn.h | 2 +- fs/intermezzo/dcache.c | 2 +- fs/intermezzo/dir.c | 4 ++-- fs/intermezzo/intermezzo_fs.h | 2 +- fs/isofs/namei.c | 2 +- fs/jffs/inode-v23.c | 2 +- fs/jffs2/dir.c | 4 ++-- fs/jfs/namei.c | 2 +- fs/libfs.c | 2 +- fs/minix/namei.c | 2 +- fs/msdos/namei.c | 2 +- fs/namei.c | 48 ++++++++++++++++++++++++++++--------------- fs/ncpfs/dir.c | 12 +++++------ fs/nfs/dir.c | 6 +++--- fs/ntfs/namei.c | 3 ++- fs/openpromfs/inode.c | 4 ++-- fs/proc/base.c | 10 ++++----- fs/proc/generic.c | 2 +- fs/proc/root.c | 6 +++--- fs/qnx4/namei.c | 2 +- fs/reiserfs/namei.c | 2 +- fs/romfs/inode.c | 2 +- fs/smbfs/dir.c | 8 ++++---- fs/sysv/namei.c | 2 +- fs/udf/namei.c | 3 ++- fs/ufs/namei.c | 2 +- fs/umsdos/dir.c | 4 ++-- fs/umsdos/rdir.c | 4 ++-- fs/vfat/namei.c | 6 +++--- fs/xfs/linux/xfs_iops.c | 3 ++- include/linux/affs_fs.h | 2 +- include/linux/dcache.h | 3 ++- include/linux/efs_fs.h | 2 +- include/linux/fs.h | 4 ++-- include/linux/iso_fs.h | 2 +- include/linux/msdos_fs.h | 4 ++-- include/linux/namei.h | 16 ++++++++++++++- include/linux/proc_fs.h | 4 ++-- include/linux/qnx4_fs.h | 2 +- include/linux/umsdos_fs.p | 4 ++-- 65 files changed, 169 insertions(+), 135 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wan/comx.c b/drivers/net/wan/comx.c index 1039bf85ea0a..8cd73bc9a3d5 100644 --- a/drivers/net/wan/comx.c +++ b/drivers/net/wan/comx.c @@ -86,7 +86,7 @@ static struct comx_protocol *comx_lines = NULL; static int comx_mkdir(struct inode *, struct dentry *, int); static int comx_rmdir(struct inode *, struct dentry *); -static struct dentry *comx_lookup(struct inode *, struct dentry *); +static struct dentry *comx_lookup(struct inode *, struct dentry *, struct nameidata *); static struct inode_operations comx_root_inode_ops = { .lookup = comx_lookup, @@ -922,7 +922,7 @@ static int comx_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static struct dentry *comx_lookup(struct inode *dir, struct dentry *dentry) +static struct dentry *comx_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct proc_dir_entry *de; struct inode *inode = NULL; diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index f4fde1c2310a..6e4a1b3a4e15 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -88,7 +88,7 @@ void __adfs_error(struct super_block *sb, const char *function, #define adfs_error(sb, fmt...) 
__adfs_error(sb, __FUNCTION__, fmt) /* namei.c */ -extern struct dentry *adfs_lookup(struct inode *dir, struct dentry *dentry); +extern struct dentry *adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *); /* super.c */ diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index df29ce99c6ed..aae5b4e066db 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -269,7 +269,7 @@ struct dentry_operations adfs_dentry_operations = { .d_compare = adfs_compare, }; -struct dentry *adfs_lookup(struct inode *dir, struct dentry *dentry) +struct dentry *adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct inode *inode = NULL; struct object_info obj; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 01defe3d0ff8..55beff12444f 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -210,7 +210,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry) } struct dentry * -affs_lookup(struct inode *dir, struct dentry *dentry) +affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct super_block *sb = dir->i_sb; struct buffer_head *bh; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 12effcc6f896..a63e3e9679fe 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -23,10 +23,10 @@ #include "super.h" #include "internal.h" -static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry); +static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *); static int afs_dir_open(struct inode *inode, struct file *file); static int afs_dir_readdir(struct file *file, void *dirent, filldir_t filldir); -static int afs_d_revalidate(struct dentry *dentry, int flags); +static int afs_d_revalidate(struct dentry *dentry, struct nameidata *); static int afs_d_delete(struct dentry *dentry); static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen, loff_t fpos, ino_t ino, unsigned dtype); @@ -414,7 +414,7 @@ static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen, lof /* * look up an entry in a directory */ -static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry) +static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct afs_dir_lookup_cookie cookie; struct afs_super_info *as; @@ -487,7 +487,7 @@ static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry) * - NOTE! 
the hit can be a negative hit too, so we can't assume we have an inode * (derived from nfs_lookup_revalidate) */ -static int afs_d_revalidate(struct dentry *dentry, int flags) +static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd) { struct afs_dir_lookup_cookie cookie; struct dentry *parent; @@ -495,7 +495,7 @@ static int afs_d_revalidate(struct dentry *dentry, int flags) unsigned fpos; int ret; - _enter("%s,%x",dentry->d_name.name,flags); + _enter("%s,%p",dentry->d_name.name,nd); parent = dget_parent(dentry); dir = parent->d_inode; diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 0279fcbf8329..d22887d47f38 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -21,7 +21,7 @@ #include "internal.h" -static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry); +static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *); static int afs_mntpt_open(struct inode *inode, struct file *file); struct file_operations afs_mntpt_file_operations = { @@ -93,7 +93,7 @@ int afs_mntpt_check_symlink(afs_vnode_t *vnode) /* * no valid lookup procedure on this sort of dir */ -static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry) +static struct dentry *afs_mntpt_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { return ERR_PTR(-EREMOTE); } /* end afs_mntpt_lookup() */ diff --git a/fs/autofs/root.c b/fs/autofs/root.c index e6e3b0c468d7..546ac2f9af87 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -18,7 +18,7 @@ #include "autofs_i.h" static int autofs_root_readdir(struct file *,void *,filldir_t); -static struct dentry *autofs_root_lookup(struct inode *,struct dentry *); +static struct dentry *autofs_root_lookup(struct inode *,struct dentry *, struct nameidata *); static int autofs_root_symlink(struct inode *,struct dentry *,const char *); static int autofs_root_unlink(struct inode *,struct dentry *); static int autofs_root_rmdir(struct inode *,struct dentry *); @@ -144,7 +144,7 @@ static int try_to_fill_dentry(struct dentry *dentry, struct super_block *sb, str * yet completely filled in, and revalidate has to delay such * lookups.. 
*/ -static int autofs_revalidate(struct dentry * dentry, int flags) +static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd) { struct inode * dir; struct autofs_sb_info *sbi; @@ -195,7 +195,7 @@ static struct dentry_operations autofs_dentry_operations = { .d_revalidate = autofs_revalidate, }; -static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentry) +static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct autofs_sb_info *sbi; int oz_mode; @@ -230,7 +230,7 @@ static struct dentry *autofs_root_lookup(struct inode *dir, struct dentry *dentr d_add(dentry, NULL); up(&dir->i_sem); - autofs_revalidate(dentry, 0); + autofs_revalidate(dentry, nd); down(&dir->i_sem); /* diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index de61c25779c1..49f9f4d3b406 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -18,13 +18,13 @@ #include #include "autofs_i.h" -static struct dentry *autofs4_dir_lookup(struct inode *,struct dentry *); +static struct dentry *autofs4_dir_lookup(struct inode *,struct dentry *, struct nameidata *); static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); static int autofs4_dir_unlink(struct inode *,struct dentry *); static int autofs4_dir_rmdir(struct inode *,struct dentry *); static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); -static struct dentry *autofs4_root_lookup(struct inode *,struct dentry *); +static struct dentry *autofs4_root_lookup(struct inode *,struct dentry *, struct nameidata *); struct file_operations autofs4_root_operations = { .open = dcache_dir_open, @@ -143,7 +143,7 @@ static int try_to_fill_dentry(struct dentry *dentry, * yet completely filled in, and revalidate has to delay such * lookups.. 
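 * (Editorial note: the nameidata is threaded through to this revalidate
 *  so the deferred lookup keeps the original caller's intent.)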
*/ -static int autofs4_root_revalidate(struct dentry * dentry, int flags) +static int autofs4_root_revalidate(struct dentry * dentry, struct nameidata *nd) { struct inode * dir = dentry->d_parent->d_inode; struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); @@ -183,7 +183,7 @@ static int autofs4_root_revalidate(struct dentry * dentry, int flags) return 1; } -static int autofs4_revalidate(struct dentry *dentry, int flags) +static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); @@ -225,7 +225,7 @@ static struct dentry_operations autofs4_dentry_operations = { /* Lookups in non-root dirs never find anything - if it's there, it's already in the dcache */ /* SMP-safe */ -static struct dentry *autofs4_dir_lookup(struct inode *dir, struct dentry *dentry) +static struct dentry *autofs4_dir_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { #if 0 DPRINTK(("autofs_dir_lookup: ignoring lookup of %.*s/%.*s\n", @@ -239,7 +239,7 @@ static struct dentry *autofs4_dir_lookup(struct inode *dir, struct dentry *dentr } /* Lookups in the root directory */ -static struct dentry *autofs4_root_lookup(struct inode *dir, struct dentry *dentry) +static struct dentry *autofs4_root_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct autofs_sb_info *sbi; int oz_mode; @@ -276,7 +276,7 @@ static struct dentry *autofs4_root_lookup(struct inode *dir, struct dentry *dent if (dentry->d_op && dentry->d_op->d_revalidate) { up(&dir->i_sem); - (dentry->d_op->d_revalidate)(dentry, 0); + (dentry->d_op->d_revalidate)(dentry, nd); down(&dir->i_sem); } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 4fb5a163e50d..d7846d65b361 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -33,7 +33,7 @@ static int befs_readdir(struct file *, void *, filldir_t); static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int); static int befs_readpage(struct file *file, struct page *page); static sector_t befs_bmap(struct address_space *mapping, sector_t block); -static struct dentry *befs_lookup(struct inode *, struct dentry *); +static struct dentry *befs_lookup(struct inode *, struct dentry *, struct nameidata *); static void befs_read_inode(struct inode *ino); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); @@ -163,7 +163,7 @@ befs_get_block(struct inode *inode, sector_t block, } static struct dentry * -befs_lookup(struct inode *dir, struct dentry *dentry) +befs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct inode *inode = NULL; struct super_block *sb = dir->i_sb; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 386f5fff4a77..d1f665826065 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -127,7 +127,7 @@ static int bfs_create(struct inode * dir, struct dentry * dentry, int mode) return 0; } -static struct dentry * bfs_lookup(struct inode * dir, struct dentry * dentry) +static struct dentry * bfs_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { struct inode * inode = NULL; struct buffer_head * bh; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 2776f7c0b7c2..92aef944dcab 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -47,7 +47,7 @@ extern void cifs_delete_inode(struct inode *); /* Functions related to inodes */ extern struct inode_operations cifs_dir_inode_ops; extern int cifs_create(struct inode *, struct dentry *, int); -extern struct dentry 
*cifs_lookup(struct inode *, struct dentry *); +extern struct dentry *cifs_lookup(struct inode *, struct dentry *, struct nameidata *); extern int cifs_unlink(struct inode *, struct dentry *); extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); extern int cifs_mkdir(struct inode *, struct dentry *, int); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 123639718e99..b8b546eb8489 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -178,7 +178,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode) } struct dentry * -cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry) +cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct nameidata *nd) { int rc, xid; struct cifs_sb_info *cifs_sb; @@ -262,7 +262,7 @@ cifs_dir_open(struct inode *inode, struct file *file) } static int -cifs_d_revalidate(struct dentry *direntry, int flags) +cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd) { int isValid = 1; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index a7952879bd8f..030977f42952 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -30,7 +30,7 @@ /* dir inode-ops */ static int coda_create(struct inode *dir, struct dentry *new, int mode); static int coda_mknod(struct inode *dir, struct dentry *new, int mode, dev_t rdev); -static struct dentry *coda_lookup(struct inode *dir, struct dentry *target); +static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, struct nameidata *nd); static int coda_link(struct dentry *old_dentry, struct inode *dir_inode, struct dentry *entry); static int coda_unlink(struct inode *dir_inode, struct dentry *entry); @@ -45,7 +45,7 @@ static int coda_rename(struct inode *old_inode, struct dentry *old_dentry, static int coda_readdir(struct file *file, void *dirent, filldir_t filldir); /* dentry ops */ -static int coda_dentry_revalidate(struct dentry *de, int); +static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd); static int coda_dentry_delete(struct dentry *); /* support routines */ @@ -90,7 +90,7 @@ struct file_operations coda_dir_operations = { /* inode operations for directories */ /* access routines: lookup, readlink, permission */ -static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry) +static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd) { struct inode *res_inode = NULL; struct ViceFid resfid = {0,0,0}; @@ -627,7 +627,7 @@ static int coda_venus_readdir(struct file *filp, filldir_t filldir, } /* called when a cache lookup succeeds */ -static int coda_dentry_revalidate(struct dentry *de, int flags) +static int coda_dentry_revalidate(struct dentry *de, struct nameidata *nd) { struct inode *inode = de->d_inode; struct coda_inode_info *cii; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index c6d6844796bb..b6a83ad7b325 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -342,7 +342,7 @@ static int cramfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* * Lookup and fill in the inode data.. 
*/ -static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry) +static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { unsigned int offset = 0; int sorted; diff --git a/fs/devfs/base.c b/fs/devfs/base.c index c632affe5dae..5c787aaa4901 100644 --- a/fs/devfs/base.c +++ b/fs/devfs/base.c @@ -2175,7 +2175,7 @@ static struct dentry_operations devfs_dops = .d_iput = devfs_d_iput, }; -static int devfs_d_revalidate_wait (struct dentry *dentry, int flags); +static int devfs_d_revalidate_wait (struct dentry *dentry, struct nameidata *); static struct dentry_operations devfs_wait_dops = { @@ -2212,7 +2212,7 @@ struct devfs_lookup_struct /* XXX: this doesn't handle the case where we got a negative dentry but a devfs entry has been registered in the meanwhile */ -static int devfs_d_revalidate_wait (struct dentry *dentry, int flags) +static int devfs_d_revalidate_wait (struct dentry *dentry, struct nameidata *nd) { struct inode *dir = dentry->d_parent->d_inode; struct fs_info *fs_info = dir->i_sb->s_fs_info; @@ -2265,7 +2265,7 @@ static int devfs_d_revalidate_wait (struct dentry *dentry, int flags) /* Inode operations for device entries follow */ -static struct dentry *devfs_lookup (struct inode *dir, struct dentry *dentry) +static struct dentry *devfs_lookup (struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct devfs_entry tmp; /* Must stay in scope until devfsd idle again */ struct devfs_lookup_struct lookup_info; diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 086630cc435a..e6c7210f0a68 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -57,7 +57,7 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) return(0); } -struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry) { +struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { efs_ino_t inodenum; struct inode * inode = NULL; diff --git a/fs/exec.c b/fs/exec.c index ef73cbeff536..68a64ee4b234 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -117,7 +117,8 @@ asmlinkage long sys_uselib(const char __user * library) struct nameidata nd; int error; - error = user_path_walk(library, &nd); + nd.intent.open.flags = O_RDONLY; + error = __user_walk(library, LOOKUP_FOLLOW|LOOKUP_OPEN, &nd); if (error) goto out; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 04489df5a2e5..9b9b713c8472 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -66,7 +66,7 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode) * Methods themselves. 
*/ -static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry) +static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { struct inode * inode; ino_t ino; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index ae995cad505a..cf521814314a 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -970,7 +970,7 @@ errout: } #endif -static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) +static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { struct inode * inode; struct ext3_dir_entry_2 * de; diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 1f2c91676ee7..9c7f99f7bd01 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -51,7 +51,7 @@ #define VXFS_BLOCK_PER_PAGE(sbp) ((PAGE_CACHE_SIZE / (sbp)->s_blocksize)) -static struct dentry * vxfs_lookup(struct inode *, struct dentry *); +static struct dentry * vxfs_lookup(struct inode *, struct dentry *, struct nameidata *); static int vxfs_readdir(struct file *, void *, filldir_t); struct inode_operations vxfs_dir_inode_ops = { @@ -193,6 +193,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp) * vxfs_lookup - lookup pathname component * @dip: dir in which we lookup * @dp: dentry we lookup + * @nd: lookup nameidata * * Description: * vxfs_lookup tries to lookup the pathname component described @@ -203,7 +204,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp) * in the return pointer. */ static struct dentry * -vxfs_lookup(struct inode *dip, struct dentry *dp) +vxfs_lookup(struct inode *dip, struct dentry *dp, struct nameidata *nd) { struct inode *ip = NULL; ino_t ino; diff --git a/fs/hfs/dir_cap.c b/fs/hfs/dir_cap.c index 78da551630a4..62bbda0a6311 100644 --- a/fs/hfs/dir_cap.c +++ b/fs/hfs/dir_cap.c @@ -28,7 +28,7 @@ /*================ Forward declarations ================*/ -static struct dentry *cap_lookup(struct inode *, struct dentry *); +static struct dentry *cap_lookup(struct inode *, struct dentry *, struct nameidata *); static int cap_readdir(struct file *, void *, filldir_t); /*================ Global variables ================*/ @@ -95,7 +95,7 @@ struct inode_operations hfs_cap_rdir_inode_operations = { * inode corresponding to an entry in a directory, given the inode for * the directory and the name (and its length) of the entry. */ -static struct dentry *cap_lookup(struct inode * dir, struct dentry *dentry) +static struct dentry *cap_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { ino_t dtype; struct hfs_name cname; diff --git a/fs/hfs/dir_dbl.c b/fs/hfs/dir_dbl.c index 36b7abd1eb6b..9ccdc5afa5d5 100644 --- a/fs/hfs/dir_dbl.c +++ b/fs/hfs/dir_dbl.c @@ -24,7 +24,7 @@ /*================ Forward declarations ================*/ -static struct dentry *dbl_lookup(struct inode *, struct dentry *); +static struct dentry *dbl_lookup(struct inode *, struct dentry *, struct nameidata *); static int dbl_readdir(struct file *, void *, filldir_t); static int dbl_create(struct inode *, struct dentry *, int); static int dbl_mkdir(struct inode *, struct dentry *, int); @@ -108,7 +108,7 @@ static int is_hdr(struct inode *dir, const char *name, int len) * the inode for the directory and the name (and its length) of the * entry. 
*/ -static struct dentry *dbl_lookup(struct inode * dir, struct dentry *dentry) +static struct dentry *dbl_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { struct hfs_name cname; struct hfs_cat_entry *entry; diff --git a/fs/hfs/dir_nat.c b/fs/hfs/dir_nat.c index 5dda709ebdf5..9688bcf7c145 100644 --- a/fs/hfs/dir_nat.c +++ b/fs/hfs/dir_nat.c @@ -30,7 +30,7 @@ /*================ Forward declarations ================*/ -static struct dentry *nat_lookup(struct inode *, struct dentry *); +static struct dentry *nat_lookup(struct inode *, struct dentry *, struct nameidata *); static int nat_readdir(struct file *, void *, filldir_t); static int nat_rmdir(struct inode *, struct dentry *); static int nat_hdr_unlink(struct inode *, struct dentry *); @@ -97,7 +97,7 @@ struct inode_operations hfs_nat_hdir_inode_operations = { * the inode corresponding to an entry in a directory, given the inode * for the directory and the name (and its length) of the entry. */ -static struct dentry *nat_lookup(struct inode * dir, struct dentry *dentry) +static struct dentry *nat_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { ino_t dtype; struct hfs_name cname; diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index f0a08037ab1e..1b083b8b9a2f 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -19,7 +19,7 @@ #include #include -static int hfs_revalidate_dentry(struct dentry *, int); +static int hfs_revalidate_dentry(struct dentry *, struct nameidata *); static int hfs_hash_dentry(struct dentry *, struct qstr *); static int hfs_compare_dentry(struct dentry *, struct qstr *, struct qstr *); static void hfs_dentry_iput(struct dentry *, struct inode *); @@ -90,7 +90,7 @@ static void hfs_dentry_iput(struct dentry *dentry, struct inode *inode) iput(inode); } -static int hfs_revalidate_dentry(struct dentry *dentry, int flags) +static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; int diff; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 1c1e10c72822..9f0a0d3b2382 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -198,7 +198,7 @@ out: * to tell read_inode to read fnode or not. 
*/ -struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry) +struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { const char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 91f880e88362..2c2565358d49 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -216,7 +216,7 @@ void hpfs_set_dentry_operations(struct dentry *); int hpfs_dir_release(struct inode *, struct file *); loff_t hpfs_dir_lseek(struct file *, loff_t, int); int hpfs_readdir(struct file *, void *, filldir_t); -struct dentry *hpfs_lookup(struct inode *, struct dentry *); +struct dentry *hpfs_lookup(struct inode *, struct dentry *, struct nameidata *); /* dnode.c */ diff --git a/fs/intermezzo/dcache.c b/fs/intermezzo/dcache.c index 2d3ebd2a7611..91cd4d94b5e6 100644 --- a/fs/intermezzo/dcache.c +++ b/fs/intermezzo/dcache.c @@ -50,7 +50,7 @@ kmem_cache_t * presto_dentry_slab; /* called when a cache lookup succeeds */ -static int presto_d_revalidate(struct dentry *de, int flag) +static int presto_d_revalidate(struct dentry *de, struct nameidata *nd) { struct inode *inode = de->d_inode; struct presto_file_set * root_fset; diff --git a/fs/intermezzo/dir.c b/fs/intermezzo/dir.c index 0446fb4dc174..e7b22dd30a16 100644 --- a/fs/intermezzo/dir.c +++ b/fs/intermezzo/dir.c @@ -239,7 +239,7 @@ struct dentry *presto_add_ilookup_dentry(struct dentry *parent, return de; } -struct dentry *presto_lookup(struct inode * dir, struct dentry *dentry) +struct dentry *presto_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { int rc = 0; struct dentry *de; @@ -286,7 +286,7 @@ struct dentry *presto_lookup(struct inode * dir, struct dentry *dentry) (dir, dentry, ino, generation); is_ilookup = 1; } else - de = iops->lookup(dir, dentry); + de = iops->lookup(dir, dentry, nd); #if 0 } #endif diff --git a/fs/intermezzo/intermezzo_fs.h b/fs/intermezzo/intermezzo_fs.h index 8d2d33fcee0e..3a7c60be8f26 100644 --- a/fs/intermezzo/intermezzo_fs.h +++ b/fs/intermezzo/intermezzo_fs.h @@ -370,7 +370,7 @@ extern int presto_ilookup_uid; # define PRESTO_ILOOKUP_MAGIC "...ino:" # define PRESTO_ILOOKUP_SEP ':' int izo_dentry_is_ilookup(struct dentry *, ino_t *id, unsigned int *generation); -struct dentry *presto_lookup(struct inode * dir, struct dentry *dentry); +struct dentry *presto_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd); struct presto_dentry_data { int dd_count; /* how mnay dentries are using this dentry */ diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 840cb90d4897..8d525f6bf606 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -158,7 +158,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, return 0; } -struct dentry *isofs_lookup(struct inode * dir, struct dentry * dentry) +struct dentry *isofs_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { unsigned long ino; struct inode *inode; diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index e7e6d5442774..141fadbf8438 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -642,7 +642,7 @@ jffs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* Find a file in a directory. If the file exists, return its corresponding dentry. 
*/ static struct dentry * -jffs_lookup(struct inode *dir, struct dentry *dentry) +jffs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct jffs_file *d; struct jffs_file *f; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 61d1b71bc20c..65dd67235f61 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -33,7 +33,7 @@ typedef dev_t mknod_arg_t; static int jffs2_readdir (struct file *, void *, filldir_t); static int jffs2_create (struct inode *,struct dentry *,int); -static struct dentry *jffs2_lookup (struct inode *,struct dentry *); +static struct dentry *jffs2_lookup (struct inode *,struct dentry *, struct nameidata *); static int jffs2_link (struct dentry *,struct inode *,struct dentry *); static int jffs2_unlink (struct inode *,struct dentry *); static int jffs2_symlink (struct inode *,struct dentry *,const char *); @@ -73,7 +73,7 @@ struct inode_operations jffs2_dir_inode_operations = and we use the same hash function as the dentries. Makes this nice and simple */ -static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target) +static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, struct nameidata *nd) { struct jffs2_inode_info *dir_f; struct jffs2_sb_info *c; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index fd1467521794..b4aa9941a51d 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1373,7 +1373,7 @@ int jfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev) return -rc; } -static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry) +static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd) { struct btstack btstack; ino_t inum; diff --git a/fs/libfs.c b/fs/libfs.c index 62fb3c0fbc24..884da83cf77a 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -29,7 +29,7 @@ int simple_statfs(struct super_block *sb, struct kstatfs *buf) * exist, we know it is negative. */ -struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry) +struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { d_add(dentry, NULL); return NULL; diff --git a/fs/minix/namei.c b/fs/minix/namei.c index d2b9ae264ce1..007fb7786236 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -54,7 +54,7 @@ struct dentry_operations minix_dentry_operations = { .d_hash = minix_hash, }; -static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry) +static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { struct inode * inode = NULL; ino_t ino; diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 31eb0d076c1f..f0651cd1b996 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -193,7 +193,7 @@ static struct dentry_operations msdos_dentry_operations = { */ /***** Get inode using directory and name */ -struct dentry *msdos_lookup(struct inode *dir,struct dentry *dentry) +struct dentry *msdos_lookup(struct inode *dir,struct dentry *dentry, struct nameidata *nd) { struct super_block *sb = dir->i_sb; struct inode *inode = NULL; diff --git a/fs/namei.c b/fs/namei.c index 8c847a1963f8..a04cf1aaceb2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -273,7 +273,7 @@ void path_release(struct nameidata *nd) * Internal lookup() using the new generic dcache. 
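 * (Editorial note: cached_lookup() and real_lookup() now take the
 *  nameidata itself rather than an int flags word, so d_revalidate()
 *  sees the caller's intent along with the lookup flags.)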
* SMP-safe */ -static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) +static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd) { struct dentry * dentry = __d_lookup(parent, name); @@ -284,7 +284,7 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, dentry = d_lookup(parent, name); if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { - if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { + if (!dentry->d_op->d_revalidate(dentry, nd) && !d_invalidate(dentry)) { dput(dentry); dentry = NULL; } @@ -336,7 +336,7 @@ ok: * make sure that nobody added the entry to the dcache in the meantime.. * SMP-safe */ -static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) +static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd) { struct dentry * result; struct inode *dir = parent->d_inode; @@ -361,7 +361,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, i struct dentry * dentry = d_alloc(parent, name); result = ERR_PTR(-ENOMEM); if (dentry) { - result = dir->i_op->lookup(dir, dentry); + result = dir->i_op->lookup(dir, dentry, nd); if (result) dput(dentry); else @@ -377,7 +377,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, i */ up(&dir->i_sem); if (result->d_op && result->d_op->d_revalidate) { - if (!result->d_op->d_revalidate(result, flags) && !d_invalidate(result)) { + if (!result->d_op->d_revalidate(result, nd) && !d_invalidate(result)) { dput(result); result = ERR_PTR(-ENOENT); } @@ -524,7 +524,7 @@ struct path { * It _is_ time-critical. */ static int do_lookup(struct nameidata *nd, struct qstr *name, - struct path *path, int flags) + struct path *path) { struct vfsmount *mnt = nd->mnt; struct dentry *dentry = __d_lookup(nd->dentry, name); @@ -539,13 +539,13 @@ done: return 0; need_lookup: - dentry = real_lookup(nd->dentry, name, LOOKUP_CONTINUE); + dentry = real_lookup(nd->dentry, name, nd); if (IS_ERR(dentry)) goto fail; goto done; need_revalidate: - if (dentry->d_op->d_revalidate(dentry, flags)) + if (dentry->d_op->d_revalidate(dentry, nd)) goto done; if (d_invalidate(dentry)) goto done; @@ -638,8 +638,9 @@ int link_path_walk(const char * name, struct nameidata *nd) if (err < 0) break; } + nd->flags |= LOOKUP_CONTINUE; /* This does the actual lookups.. */ - err = do_lookup(nd, &this, &next, LOOKUP_CONTINUE); + err = do_lookup(nd, &this, &next); if (err) break; /* Check mountpoints.. */ @@ -681,6 +682,7 @@ int link_path_walk(const char * name, struct nameidata *nd) last_with_slashes: lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; last_component: + nd->flags &= ~LOOKUP_CONTINUE; if (lookup_flags & LOOKUP_PARENT) goto lookup_parent; if (this.name[0] == '.') switch (this.len) { @@ -700,7 +702,7 @@ last_component: if (err < 0) break; } - err = do_lookup(nd, &this, &next, 0); + err = do_lookup(nd, &this, &next); if (err) break; follow_mount(&next.mnt, &next.dentry); @@ -769,6 +771,7 @@ static int __emul_lookup_dentry(const char *name, struct nameidata *nd) */ nd_root.last_type = LAST_ROOT; nd_root.flags = nd->flags; + memcpy(&nd_root.intent, &nd->intent, sizeof(nd_root.intent)); read_lock(¤t->fs->lock); nd_root.mnt = mntget(current->fs->rootmnt); nd_root.dentry = dget(current->fs->root); @@ -866,7 +869,7 @@ int path_lookup(const char *name, unsigned int flags, struct nameidata *nd) * needs parent already locked. 
Doesn't follow mounts. * SMP-safe. */ -struct dentry * lookup_hash(struct qstr *name, struct dentry * base) +static struct dentry * __lookup_hash(struct qstr *name, struct dentry * base, struct nameidata *nd) { struct dentry * dentry; struct inode *inode; @@ -889,13 +892,13 @@ struct dentry * lookup_hash(struct qstr *name, struct dentry * base) goto out; } - dentry = cached_lookup(base, name, 0); + dentry = cached_lookup(base, name, nd); if (!dentry) { struct dentry *new = d_alloc(base, name); dentry = ERR_PTR(-ENOMEM); if (!new) goto out; - dentry = inode->i_op->lookup(inode, new); + dentry = inode->i_op->lookup(inode, new, nd); if (!dentry) dentry = new; else @@ -905,6 +908,11 @@ out: return dentry; } +struct dentry * lookup_hash(struct qstr *name, struct dentry * base) +{ + return __lookup_hash(name, base, NULL); +} + /* SMP-safe */ struct dentry * lookup_one_len(const char * name, struct dentry * base, int len) { @@ -1222,11 +1230,15 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) if (flag & O_APPEND) acc_mode |= MAY_APPEND; + /* Fill in the open() intent data */ + nd->intent.open.flags = flag; + nd->intent.open.create_mode = mode; + /* * The simplest case - just a plain lookup. */ if (!(flag & O_CREAT)) { - error = path_lookup(pathname, lookup_flags(flag), nd); + error = path_lookup(pathname, lookup_flags(flag)|LOOKUP_OPEN, nd); if (error) return error; dentry = nd->dentry; @@ -1236,7 +1248,7 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) /* * Create - we need to know the parent. */ - error = path_lookup(pathname, LOOKUP_PARENT, nd); + error = path_lookup(pathname, LOOKUP_PARENT|LOOKUP_OPEN|LOOKUP_CREATE, nd); if (error) return error; @@ -1250,8 +1262,9 @@ int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) goto exit; dir = nd->dentry; + nd->flags &= ~LOOKUP_PARENT; down(&dir->d_inode->i_sem); - dentry = lookup_hash(&nd->last, nd->dentry); + dentry = __lookup_hash(&nd->last, nd->dentry, nd); do_last: error = PTR_ERR(dentry); @@ -1354,7 +1367,7 @@ do_link: } dir = nd->dentry; down(&dir->d_inode->i_sem); - dentry = lookup_hash(&nd->last, nd->dentry); + dentry = __lookup_hash(&nd->last, nd->dentry, nd); putname(nd->last.name); goto do_last; } @@ -1368,6 +1381,7 @@ static struct dentry *lookup_create(struct nameidata *nd, int is_dir) dentry = ERR_PTR(-EEXIST); if (nd->last_type != LAST_NORM) goto fail; + nd->flags &= ~LOOKUP_PARENT; dentry = lookup_hash(&nd->last, nd->dentry); if (IS_ERR(dentry)) goto fail; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index d4577dc7a551..d695f6db5baa 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -35,7 +35,7 @@ static void ncp_do_readdir(struct file *, void *, filldir_t, static int ncp_readdir(struct file *, void *, filldir_t); static int ncp_create(struct inode *, struct dentry *, int); -static struct dentry *ncp_lookup(struct inode *, struct dentry *); +static struct dentry *ncp_lookup(struct inode *, struct dentry *, struct nameidata *); static int ncp_unlink(struct inode *, struct dentry *); static int ncp_mkdir(struct inode *, struct dentry *, int); static int ncp_rmdir(struct inode *, struct dentry *); @@ -72,7 +72,7 @@ struct inode_operations ncp_dir_inode_operations = /* * Dentry operations routines */ -static int ncp_lookup_validate(struct dentry *, int); +static int ncp_lookup_validate(struct dentry *, struct nameidata *); static int ncp_hash_dentry(struct dentry *, struct qstr *); static int ncp_compare_dentry (struct dentry *, struct qstr *, 
struct qstr *); static int ncp_delete_dentry(struct dentry *); @@ -264,7 +264,7 @@ leave_me:; static int -__ncp_lookup_validate(struct dentry * dentry, int flags) +__ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd) { struct ncp_server *server; struct dentry *parent; @@ -333,11 +333,11 @@ finished: } static int -ncp_lookup_validate(struct dentry * dentry, int flags) +ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd) { int res; lock_kernel(); - res = __ncp_lookup_validate(dentry, flags); + res = __ncp_lookup_validate(dentry, nd); unlock_kernel(); return res; } @@ -797,7 +797,7 @@ out: return result; } -static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry) +static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct ncp_server *server = NCP_SERVER(dir); struct inode *inode = NULL; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d5499baadd1c..abf189a02e50 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -37,7 +37,7 @@ static int nfs_opendir(struct inode *, struct file *); static int nfs_readdir(struct file *, void *, filldir_t); -static struct dentry *nfs_lookup(struct inode *, struct dentry *); +static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); static int nfs_cached_lookup(struct inode *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); static int nfs_create(struct inode *, struct dentry *, int); @@ -515,7 +515,7 @@ static inline int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry) * If the parent directory is seen to have changed, we throw out the * cached dentry and do a new lookup. */ -static int nfs_lookup_revalidate(struct dentry * dentry, int flags) +static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd) { struct inode *dir; struct inode *inode; @@ -630,7 +630,7 @@ struct dentry_operations nfs_dentry_operations = { .d_iput = nfs_dentry_iput, }; -static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry) +static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { struct inode *inode = NULL; int error; diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 84d43247ba6e..a8c6e8a4e3b8 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -29,6 +29,7 @@ * ntfs_lookup - find the inode represented by a dentry in a directory inode * @dir_ino: directory inode in which to look for the inode * @dent: dentry representing the inode to look for + * @nd: lookup nameidata * * In short, ntfs_lookup() looks for the inode represented by the dentry @dent * in the directory inode @dir_ino and if found attaches the inode to the @@ -87,7 +88,7 @@ * name. We then convert the name to the current NLS code page, and proceed * searching for a dentry with this name, etc, as in case 2), above. 
*/ -static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent) +static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, struct nameidata *nd) { ntfs_volume *vol = NTFS_SB(dir_ino->i_sb); struct inode *dent_inode; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 5a1fb89449be..3c11c87e2f22 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -61,7 +61,7 @@ static char *alias_names [ALIASES_NNODES]; static int openpromfs_create (struct inode *, struct dentry *, int); static int openpromfs_readdir(struct file *, void *, filldir_t); -static struct dentry *openpromfs_lookup(struct inode *, struct dentry *dentry); +static struct dentry *openpromfs_lookup(struct inode *, struct dentry *dentry, struct nameidata *nd); static int openpromfs_unlink (struct inode *, struct dentry *dentry); static ssize_t nodenum_read(struct file *file, char *buf, @@ -639,7 +639,7 @@ static int lookup_children(u16 n, const char * name, int len) return 0; } -static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentry) +static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { int ino = 0; #define OPFSL_DIR 0 diff --git a/fs/proc/base.c b/fs/proc/base.c index e843c6584cc9..3d05ee25f8e9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -864,7 +864,7 @@ out_unlock: * directory. In this case, however, we can do it - no aliasing problems * due to the way we treat inodes. */ -static int pid_revalidate(struct dentry * dentry, int flags) +static int pid_revalidate(struct dentry * dentry, struct nameidata *nd) { if (pid_alive(proc_task(dentry->d_inode))) return 1; @@ -872,7 +872,7 @@ static int pid_revalidate(struct dentry * dentry, int flags) return 0; } -static int pid_fd_revalidate(struct dentry * dentry, int flags) +static int pid_fd_revalidate(struct dentry * dentry, struct nameidata *nd) { struct task_struct *task = proc_task(dentry->d_inode); int fd = proc_type(dentry->d_inode) - PROC_PID_FD_DIR; @@ -961,7 +961,7 @@ out: } /* SMP-safe */ -static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry) +static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { struct task_struct *task = proc_task(dir); unsigned fd = name_to_int(dentry); @@ -1219,7 +1219,7 @@ out: return ERR_PTR(error); } -static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry){ +static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ return proc_pident_lookup(dir, dentry, base_stuff); } @@ -1326,7 +1326,7 @@ void proc_pid_flush(struct dentry *proc_dentry) } /* SMP-safe */ -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry) +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { struct task_struct *task; struct inode *inode; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 6f658ceafc3a..979237c72966 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -336,7 +336,7 @@ static struct dentry_operations proc_dentry_operations = * Don't create negative dentries here, return -ENOENT by hand * instead. 
*/ -struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry) +struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { struct inode *inode = NULL; struct proc_dir_entry * de; diff --git a/fs/proc/root.c b/fs/proc/root.c index fb40f8c53cb4..936962d01c28 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -79,7 +79,7 @@ void __init proc_root_init(void) proc_bus = proc_mkdir("bus", 0); } -static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry) +static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { /* * nr_threads is actually protected by the tasklist_lock; @@ -89,11 +89,11 @@ static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentr if (dir->i_ino == PROC_ROOT_INO) /* check for safety... */ dir->i_nlink = proc_root.nlink + nr_threads; - if (!proc_lookup(dir, dentry)) { + if (!proc_lookup(dir, dentry, nd)) { return NULL; } - return proc_pid_lookup(dir, dentry); + return proc_pid_lookup(dir, dentry, nd); } static int proc_root_readdir(struct file * filp, diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index 0a7592c5b958..12e423ae6de0 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -107,7 +107,7 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir, return NULL; } -struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry) +struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { int ino; struct qnx4_inode_entry *de; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 46bc6549577d..18a3353274c4 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -316,7 +316,7 @@ static int reiserfs_find_entry (struct inode * dir, const char * name, int namel } -static struct dentry * reiserfs_lookup (struct inode * dir, struct dentry * dentry) +static struct dentry * reiserfs_lookup (struct inode * dir, struct dentry * dentry, struct nameidata *nd) { int retval; struct inode * inode = NULL; diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index fb60389d42fc..24cd428a521e 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -329,7 +329,7 @@ out: } static struct dentry * -romfs_lookup(struct inode *dir, struct dentry *dentry) +romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { unsigned long offset, maxoff; int fslen, res; diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index af4b42beab8f..081402f119dc 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -24,7 +24,7 @@ static int smb_readdir(struct file *, void *, filldir_t); static int smb_dir_open(struct inode *, struct file *); -static struct dentry *smb_lookup(struct inode *, struct dentry *); +static struct dentry *smb_lookup(struct inode *, struct dentry *, struct nameidata *); static int smb_create(struct inode *, struct dentry *, int); static int smb_mkdir(struct inode *, struct dentry *, int); static int smb_rmdir(struct inode *, struct dentry *); @@ -268,7 +268,7 @@ smb_dir_open(struct inode *dir, struct file *file) /* * Dentry operations routines */ -static int smb_lookup_validate(struct dentry *, int); +static int smb_lookup_validate(struct dentry *, struct nameidata *); static int smb_hash_dentry(struct dentry *, struct qstr *); static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *); static int smb_delete_dentry(struct dentry *); @@ -292,7 +292,7 @@ static struct dentry_operations smbfs_dentry_operations_case = * This is the callback when the dcache has a lookup 
hit. */ static int -smb_lookup_validate(struct dentry * dentry, int flags) +smb_lookup_validate(struct dentry * dentry, struct nameidata *nd) { struct smb_sb_info *server = server_from_dentry(dentry); struct inode * inode = dentry->d_inode; @@ -420,7 +420,7 @@ smb_renew_times(struct dentry * dentry) } static struct dentry * -smb_lookup(struct inode *dir, struct dentry *dentry) +smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct smb_fattr finfo; struct inode *inode; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index f2988f107696..4c0eb5730065 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -64,7 +64,7 @@ struct dentry_operations sysv_dentry_operations = { .d_hash = sysv_hash, }; -static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry) +static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { struct inode * inode = NULL; ino_t ino; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index b5be4880deac..7881ffbbd82e 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -289,6 +289,7 @@ udf_find_entry(struct inode *dir, struct dentry *dentry, * PRE-CONDITIONS * dir Pointer to inode of parent directory. * dentry Pointer to dentry to complete. + * nd Pointer to lookup nameidata * * POST-CONDITIONS * Zero on success. @@ -299,7 +300,7 @@ udf_find_entry(struct inode *dir, struct dentry *dentry, */ static struct dentry * -udf_lookup(struct inode *dir, struct dentry *dentry) +udf_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct inode *inode = NULL; struct fileIdentDesc cfi, *fi; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 24c6c5d2938d..55496ec96e56 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -62,7 +62,7 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode) return err; } -static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry) +static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) { struct inode * inode = NULL; ino_t ino; diff --git a/fs/umsdos/dir.c b/fs/umsdos/dir.c index befb7545f53c..775f02021128 100644 --- a/fs/umsdos/dir.c +++ b/fs/umsdos/dir.c @@ -30,7 +30,7 @@ extern struct inode *pseudo_root; */ /* nothing for now ... */ -static int umsdos_dentry_validate(struct dentry *dentry, int flags) +static int umsdos_dentry_validate(struct dentry *dentry, struct nameidata *nd) { return 1; } @@ -564,7 +564,7 @@ out_remove: * Called by VFS; should fill dentry->d_inode via d_add. 
*/ -struct dentry *UMSDOS_lookup (struct inode *dir, struct dentry *dentry) +struct dentry *UMSDOS_lookup (struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct dentry *ret; diff --git a/fs/umsdos/rdir.c b/fs/umsdos/rdir.c index d4ac89d1e668..2f32539b1a37 100644 --- a/fs/umsdos/rdir.c +++ b/fs/umsdos/rdir.c @@ -101,7 +101,7 @@ struct dentry *umsdos_rlookup_x ( struct inode *dir, struct dentry *dentry, int goto out; } - ret = msdos_lookup (dir, dentry); + ret = msdos_lookup (dir, dentry, NULL); if (ret) { printk(KERN_WARNING "umsdos_rlookup_x: %s/%s failed, ret=%ld\n", @@ -129,7 +129,7 @@ out: } -struct dentry *UMSDOS_rlookup ( struct inode *dir, struct dentry *dentry) +struct dentry *UMSDOS_rlookup ( struct inode *dir, struct dentry *dentry, struct nameidata *nd) { return umsdos_rlookup_x (dir, dentry, 0); } diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index 1f83a9d77e8a..04f6754fe235 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c @@ -45,7 +45,7 @@ static int vfat_hashi(struct dentry *parent, struct qstr *qstr); static int vfat_hash(struct dentry *parent, struct qstr *qstr); static int vfat_cmpi(struct dentry *dentry, struct qstr *a, struct qstr *b); static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b); -static int vfat_revalidate(struct dentry *dentry, int); +static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd); static struct dentry_operations vfat_dentry_ops[4] = { { @@ -68,7 +68,7 @@ static struct dentry_operations vfat_dentry_ops[4] = { } }; -static int vfat_revalidate(struct dentry *dentry, int flags) +static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) { PRINTK1(("vfat_revalidate: %s\n", dentry->d_name.name)); spin_lock(&dcache_lock); @@ -860,7 +860,7 @@ static int vfat_find(struct inode *dir,struct qstr* qname, return res ? 
res : -ENOENT; } -struct dentry *vfat_lookup(struct inode *dir,struct dentry *dentry) +struct dentry *vfat_lookup(struct inode *dir,struct dentry *dentry, struct nameidata *nd) { int res; struct vfat_slot_info sinfo; diff --git a/fs/xfs/linux/xfs_iops.c b/fs/xfs/linux/xfs_iops.c index 14d393eb6d2e..c4ff85065f41 100644 --- a/fs/xfs/linux/xfs_iops.c +++ b/fs/xfs/linux/xfs_iops.c @@ -192,7 +192,8 @@ linvfs_mkdir( STATIC struct dentry * linvfs_lookup( struct inode *dir, - struct dentry *dentry) + struct dentry *dentry, + struct nameidata *nd) { struct inode *ip = NULL; vnode_t *vp, *cvp = NULL; diff --git a/include/linux/affs_fs.h b/include/linux/affs_fs.h index 47ed05c8b744..837fe37aba93 100644 --- a/include/linux/affs_fs.h +++ b/include/linux/affs_fs.h @@ -41,7 +41,7 @@ extern int affs_init_bitmap(struct super_block *sb); /* namei.c */ extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); -extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry); +extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *); extern int affs_unlink(struct inode *dir, struct dentry *dentry); extern int affs_create(struct inode *dir, struct dentry *dentry, int mode); extern int affs_mkdir(struct inode *dir, struct dentry *dentry, int mode); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 1442779bd893..a25d9f0443a4 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -10,6 +10,7 @@ #include #include +struct nameidata; struct vfsmount; /* @@ -106,7 +107,7 @@ struct dentry { #define DNAME_INLINE_LEN (sizeof(struct dentry)-offsetof(struct dentry,d_iname)) struct dentry_operations { - int (*d_revalidate)(struct dentry *, int); + int (*d_revalidate)(struct dentry *, struct nameidata *); int (*d_hash) (struct dentry *, struct qstr *); int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); int (*d_delete)(struct dentry *); diff --git a/include/linux/efs_fs.h b/include/linux/efs_fs.h index c78e9c2a7b3a..1640eb875d4e 100644 --- a/include/linux/efs_fs.h +++ b/include/linux/efs_fs.h @@ -46,7 +46,7 @@ extern int efs_statfs(struct super_block *, struct kstatfs *); extern void efs_read_inode(struct inode *); extern efs_block_t efs_map_block(struct inode *, efs_block_t); -extern struct dentry *efs_lookup(struct inode *, struct dentry *); +extern struct dentry *efs_lookup(struct inode *, struct dentry *, struct nameidata *); extern int efs_bmap(struct inode *, int); #endif /* __EFS_FS_H__ */ diff --git a/include/linux/fs.h b/include/linux/fs.h index c3bda88631bc..3ddf4c4edfb7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -731,7 +731,7 @@ struct file_operations { struct inode_operations { int (*create) (struct inode *,struct dentry *,int); - struct dentry * (*lookup) (struct inode *,struct dentry *); + struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct inode *,struct dentry *,const char *); @@ -1291,7 +1291,7 @@ extern int simple_prepare_write(struct file *file, struct page *page, extern int simple_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to); -extern struct dentry *simple_lookup(struct inode *, struct dentry *); +extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); extern 
struct file_operations simple_dir_operations; extern struct inode_operations simple_dir_inode_operations; diff --git a/include/linux/iso_fs.h b/include/linux/iso_fs.h index 4763b595287a..223f161da018 100644 --- a/include/linux/iso_fs.h +++ b/include/linux/iso_fs.h @@ -227,7 +227,7 @@ extern int isofs_name_translate(struct iso_directory_record *, char *, struct in int get_joliet_filename(struct iso_directory_record *, unsigned char *, struct inode *); int get_acorn_filename(struct iso_directory_record *, char *, struct inode *); -extern struct dentry *isofs_lookup(struct inode *, struct dentry *); +extern struct dentry *isofs_lookup(struct inode *, struct dentry *, struct nameidata *); extern struct buffer_head *isofs_bread(struct inode *, sector_t); extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long); diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h index 4268ed112436..d1b00ab916ce 100644 --- a/include/linux/msdos_fs.h +++ b/include/linux/msdos_fs.h @@ -307,7 +307,7 @@ extern int fat_scan(struct inode *dir, const char *name, struct msdos_dir_entry **res_de, loff_t *i_pos); /* msdos/namei.c - these are for Umsdos */ -extern struct dentry *msdos_lookup(struct inode *dir, struct dentry *); +extern struct dentry *msdos_lookup(struct inode *dir, struct dentry *, struct nameidata *); extern int msdos_create(struct inode *dir, struct dentry *dentry, int mode); extern int msdos_rmdir(struct inode *dir, struct dentry *dentry); extern int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode); @@ -317,7 +317,7 @@ extern int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, extern int msdos_fill_super(struct super_block *sb, void *data, int silent); /* vfat/namei.c - these are for dmsdos */ -extern struct dentry *vfat_lookup(struct inode *dir, struct dentry *); +extern struct dentry *vfat_lookup(struct inode *dir, struct dentry *, struct nameidata *); extern int vfat_create(struct inode *dir, struct dentry *dentry, int mode); extern int vfat_rmdir(struct inode *dir, struct dentry *dentry); extern int vfat_unlink(struct inode *dir, struct dentry *dentry); diff --git a/include/linux/namei.h b/include/linux/namei.h index 16baf5cdb9c7..256ceac1fc69 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -5,12 +5,22 @@ struct vfsmount; +struct open_intent { + int flags; + int create_mode; +}; + struct nameidata { struct dentry *dentry; struct vfsmount *mnt; struct qstr last; unsigned int flags; int last_type; + + /* Intent data */ + union { + struct open_intent open; + } intent; }; /* @@ -31,7 +41,11 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_CONTINUE 4 #define LOOKUP_PARENT 16 #define LOOKUP_NOALT 32 - +/* + * Intent data + */ +#define LOOKUP_OPEN (0x0100) +#define LOOKUP_CREATE (0x0200) extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); #define user_path_walk(name,nd) \ diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index c0144a1ba4cb..e2e54ee6186d 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -92,7 +92,7 @@ extern struct proc_dir_entry *proc_root_kcore; extern void proc_root_init(void); extern void proc_misc_init(void); -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry); +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *); struct dentry *proc_pid_unhash(struct task_struct *p); void proc_pid_flush(struct dentry *proc_dentry); int 
proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
@@ -115,7 +115,7 @@ extern int proc_match(int, const char *,struct proc_dir_entry *);
  * of the /proc/ subdirectories.
  */
 extern int proc_readdir(struct file *, void *, filldir_t);
-extern struct dentry *proc_lookup(struct inode *, struct dentry *);
+extern struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
 extern struct file_operations proc_kcore_operations;
 extern struct file_operations proc_kmsg_operations;
diff --git a/include/linux/qnx4_fs.h b/include/linux/qnx4_fs.h
index 5710620989d5..2aa7a7ca3d54 100644
--- a/include/linux/qnx4_fs.h
+++ b/include/linux/qnx4_fs.h
@@ -110,7 +110,7 @@ struct qnx4_inode_info {
 	struct inode vfs_inode;
 };
-extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry);
+extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd);
 extern unsigned long qnx4_count_free_blocks(struct super_block *sb);
 extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
diff --git a/include/linux/umsdos_fs.p b/include/linux/umsdos_fs.p
index 7034b7eb6b16..1c284c5a7aec 100644
--- a/include/linux/umsdos_fs.p
+++ b/include/linux/umsdos_fs.p
@@ -10,7 +10,7 @@ char * umsdos_d_path(struct dentry *, char *, int);
 void umsdos_lookup_patch_new(struct dentry *, struct umsdos_info *);
 int umsdos_is_pseudodos (struct inode *dir, struct dentry *dentry);
 struct dentry *umsdos_lookup_x ( struct inode *dir, struct dentry *dentry, int nopseudo);
-struct dentry *UMSDOS_lookup(struct inode *, struct dentry *);
+struct dentry *UMSDOS_lookup(struct inode *, struct dentry *, struct nameidata *);
 struct dentry *umsdos_lookup_dentry(struct dentry *, char *, int, int);
 struct dentry *umsdos_covered(struct dentry *, char *, int);
@@ -92,7 +92,7 @@ int UMSDOS_rename (struct inode *old_dir,
 /* rdir.c 22/03/95 03.31.42 */
 struct dentry *umsdos_rlookup_x (struct inode *dir, struct dentry *dentry, int nopseudo);
-struct dentry *UMSDOS_rlookup (struct inode *dir, struct dentry *dentry);
+struct dentry *UMSDOS_rlookup (struct inode *dir, struct dentry *dentry, struct nameidata *nd);
 static inline struct umsdos_inode_info *UMSDOS_I(struct inode *inode)
 {
-- cgit v1.2.3

From 675b5da0145fb4b54c4d9f6ea0106df844e4f75d Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Thu, 3 Jul 2003 22:06:21 -0700
Subject: [PATCH] Pass 'nameidata' to ->create()

- Make the VFS pass the struct nameidata as an optional argument to the
  create inode operation.
- Patch vfs_create() to take a struct nameidata as an optional argument.
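A note on usage (illustrative only, not part of the patch): with the struct
nameidata now passed down to ->lookup() and ->create(), and the LOOKUP_OPEN
intent plus the nd->intent.open fields added to include/linux/namei.h, a
filesystem can tell why it is being called. The sketch below shows the idea;
the "examplefs" names are hypothetical and do not exist in the tree.

#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/namei.h>

static int examplefs_create(struct inode *dir, struct dentry *dentry,
			    int mode, struct nameidata *nd)
{
	int open_flags = 0;

	/*
	 * nd is optional: in-kernel callers such as nfsd do
	 * vfs_create(dir, dentry, mode, NULL), so check it first.
	 */
	if (nd != NULL && (nd->flags & LOOKUP_OPEN))
		open_flags = nd->intent.open.flags;

	/*
	 * A network filesystem could use open_flags (e.g. O_EXCL)
	 * to select an exclusive-create operation on the wire.
	 * Allocate the inode here and d_instantiate() it on success.
	 */
	return 0;
}

static struct dentry *examplefs_lookup(struct inode *dir,
				       struct dentry *dentry,
				       struct nameidata *nd)
{
	/*
	 * When nd carries a LOOKUP_OPEN intent, the lookup and the
	 * open could be combined into a single server round trip.
	 * Here we just make a negative dentry, as simple_lookup()
	 * does.
	 */
	d_add(dentry, NULL);
	return NULL;
}

Because the intent rides on the nameidata rather than in extra method
arguments, filesystems that do not care need only the mechanical signature
change, and callers without a nameidata simply pass NULL.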
--- fs/affs/namei.c | 2 +- fs/bfs/dir.c | 3 ++- fs/cifs/cifsfs.h | 2 +- fs/cifs/dir.c | 3 ++- fs/coda/dir.c | 4 ++-- fs/ext2/namei.c | 2 +- fs/ext3/namei.c | 3 ++- fs/hfs/dir.c | 2 +- fs/hfs/dir_dbl.c | 6 +++--- fs/hpfs/hpfs_fn.h | 2 +- fs/hpfs/namei.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/intermezzo/dir.c | 3 ++- fs/intermezzo/vfs.c | 2 +- fs/jffs/inode-v23.c | 3 ++- fs/jffs2/dir.c | 5 +++-- fs/jfs/namei.c | 4 +++- fs/minix/namei.c | 3 ++- fs/msdos/namei.c | 3 ++- fs/namei.c | 9 +++++---- fs/ncpfs/dir.c | 5 +++-- fs/nfs/dir.c | 5 +++-- fs/nfsd/vfs.c | 4 ++-- fs/openpromfs/inode.c | 5 +++-- fs/qnx4/namei.c | 3 ++- fs/ramfs/inode.c | 2 +- fs/reiserfs/namei.c | 3 ++- fs/smbfs/dir.c | 5 +++-- fs/sysv/namei.c | 2 +- fs/udf/namei.c | 2 +- fs/ufs/namei.c | 3 ++- fs/umsdos/emd.c | 2 +- fs/umsdos/namei.c | 4 ++-- fs/vfat/namei.c | 3 ++- fs/xfs/linux/xfs_iops.c | 3 ++- include/linux/affs_fs.h | 2 +- include/linux/fs.h | 4 ++-- include/linux/hfs_fs.h | 2 +- include/linux/msdos_fs.h | 4 ++-- include/linux/qnx4_fs.h | 3 +-- mm/shmem.c | 3 ++- 41 files changed, 77 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 55beff12444f..f2cbba3b7578 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -256,7 +256,7 @@ affs_unlink(struct inode *dir, struct dentry *dentry) } int -affs_create(struct inode *dir, struct dentry *dentry, int mode) +affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { struct super_block *sb = dir->i_sb; struct inode *inode; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index d1f665826065..7e5b4781eb25 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -78,7 +78,8 @@ struct file_operations bfs_dir_operations = { extern void dump_imap(const char *, struct super_block *); -static int bfs_create(struct inode * dir, struct dentry * dentry, int mode) +static int bfs_create(struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) { int err; struct inode * inode; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 92aef944dcab..9c493d50c3fe 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -46,7 +46,7 @@ extern void cifs_delete_inode(struct inode *); /* Functions related to inodes */ extern struct inode_operations cifs_dir_inode_ops; -extern int cifs_create(struct inode *, struct dentry *, int); +extern int cifs_create(struct inode *, struct dentry *, int, struct nameidata *); extern struct dentry *cifs_lookup(struct inode *, struct dentry *, struct nameidata *); extern int cifs_unlink(struct inode *, struct dentry *); extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index b8b546eb8489..69c4b70e6b46 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -119,7 +119,8 @@ build_wildcard_path_from_dentry(struct dentry *direntry) /* Inode operations in similar order to how they appear in the Linux file fs.h */ int -cifs_create(struct inode *inode, struct dentry *direntry, int mode) +cifs_create(struct inode *inode, struct dentry *direntry, int mode, + struct nameidata *nd) { int rc = -ENOENT; int xid; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 030977f42952..8b3627e0d0e2 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -28,7 +28,7 @@ #include /* dir inode-ops */ -static int coda_create(struct inode *dir, struct dentry *new, int mode); +static int coda_create(struct inode *dir, struct dentry *new, int mode, struct nameidata *nd); static int coda_mknod(struct inode *dir, struct dentry *new, int mode, dev_t 
rdev); static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, struct nameidata *nd); static int coda_link(struct dentry *old_dentry, struct inode *dir_inode, @@ -190,7 +190,7 @@ static inline void coda_dir_changed(struct inode *dir, int link) } /* creation routines: create, mknod, mkdir, link, symlink */ -static int coda_create(struct inode *dir, struct dentry *de, int mode) +static int coda_create(struct inode *dir, struct dentry *de, int mode, struct nameidata *nd) { int error=0; const char *name=de->d_name.name; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 9b9b713c8472..52fb0eb666bf 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -120,7 +120,7 @@ struct dentry *ext2_get_parent(struct dentry *child) * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext2_create (struct inode * dir, struct dentry * dentry, int mode) +static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) { struct inode * inode = ext2_new_inode (dir, mode); int err = PTR_ERR(inode); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index cf521814314a..74e53bcc480e 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1623,7 +1623,8 @@ static int ext3_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext3_create (struct inode * dir, struct dentry * dentry, int mode) +static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) { handle_t *handle; struct inode * inode; diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index fe696c097d27..40df8a2b116c 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -163,7 +163,7 @@ static inline void mark_inodes_deleted(struct hfs_cat_entry *entry, * a directory and return a corresponding inode, given the inode for * the directory and the name (and its length) of the new file. */ -int hfs_create(struct inode * dir, struct dentry *dentry, int mode) +int hfs_create(struct inode * dir, struct dentry *dentry, int mode, struct nameidata *nd) { struct hfs_cat_entry *entry = HFS_I(dir)->entry; struct hfs_cat_entry *new; diff --git a/fs/hfs/dir_dbl.c b/fs/hfs/dir_dbl.c index 9ccdc5afa5d5..ee2ccef70fe7 100644 --- a/fs/hfs/dir_dbl.c +++ b/fs/hfs/dir_dbl.c @@ -26,7 +26,7 @@ static struct dentry *dbl_lookup(struct inode *, struct dentry *, struct nameidata *); static int dbl_readdir(struct file *, void *, filldir_t); -static int dbl_create(struct inode *, struct dentry *, int); +static int dbl_create(struct inode *, struct dentry *, int, struct nameidata *); static int dbl_mkdir(struct inode *, struct dentry *, int); static int dbl_unlink(struct inode *, struct dentry *); static int dbl_rmdir(struct inode *, struct dentry *); @@ -272,7 +272,7 @@ out: * the directory and the name (and its length) of the new file. 
*/ static int dbl_create(struct inode * dir, struct dentry *dentry, - int mode) + int mode, struct nameidata *nd) { int error; @@ -280,7 +280,7 @@ static int dbl_create(struct inode * dir, struct dentry *dentry, if (is_hdr(dir, dentry->d_name.name, dentry->d_name.len)) { error = -EEXIST; } else { - error = hfs_create(dir, dentry, mode); + error = hfs_create(dir, dentry, mode, nd); } unlock_kernel(); return error; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 2c2565358d49..a4dc5bab6efd 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -285,7 +285,7 @@ void hpfs_decide_conv(struct inode *, unsigned char *, unsigned); /* namei.c */ int hpfs_mkdir(struct inode *, struct dentry *, int); -int hpfs_create(struct inode *, struct dentry *, int); +int hpfs_create(struct inode *, struct dentry *, int, struct nameidata *); int hpfs_mknod(struct inode *, struct dentry *, int, dev_t); int hpfs_symlink(struct inode *, struct dentry *, const char *); int hpfs_unlink(struct inode *, struct dentry *); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 8540f23659a0..128647db8ffc 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -106,7 +106,7 @@ bail: return -ENOSPC; } -int hpfs_create(struct inode *dir, struct dentry *dentry, int mode) +int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { const char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f0d2a2c65170..5888e05f81bf 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -462,7 +462,7 @@ static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return retval; } -static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode) +static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); } diff --git a/fs/intermezzo/dir.c b/fs/intermezzo/dir.c index e7b22dd30a16..61cd7b4d54ea 100644 --- a/fs/intermezzo/dir.c +++ b/fs/intermezzo/dir.c @@ -412,7 +412,8 @@ int presto_prep(struct dentry *dentry, struct presto_cache **cache, return 0; } -static int presto_create(struct inode * dir, struct dentry * dentry, int mode) +static int presto_create(struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) { int error; struct presto_cache *cache; diff --git a/fs/intermezzo/vfs.c b/fs/intermezzo/vfs.c index 5dd78cfed581..c3e124c6777f 100644 --- a/fs/intermezzo/vfs.c +++ b/fs/intermezzo/vfs.c @@ -598,7 +598,7 @@ int presto_do_create(struct presto_file_set *fset, struct dentry *dir, } DQUOT_INIT(dir->d_inode); lock_kernel(); - error = iops->create(dir->d_inode, dentry, mode); + error = iops->create(dir->d_inode, dentry, mode, NULL); if (error) { EXIT; goto exit_lock; diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index 141fadbf8438..94d3560caeae 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -1273,7 +1273,8 @@ jffs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) * with d_instantiate(). 
*/ static int -jffs_create(struct inode *dir, struct dentry *dentry, int mode) +jffs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) { struct jffs_raw_inode raw_inode; struct jffs_control *c; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 65dd67235f61..9a2df58cb486 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -32,7 +32,7 @@ typedef dev_t mknod_arg_t; static int jffs2_readdir (struct file *, void *, filldir_t); -static int jffs2_create (struct inode *,struct dentry *,int); +static int jffs2_create (struct inode *,struct dentry *,int, struct nameidata *); static struct dentry *jffs2_lookup (struct inode *,struct dentry *, struct nameidata *); static int jffs2_link (struct dentry *,struct inode *,struct dentry *); static int jffs2_unlink (struct inode *,struct dentry *); @@ -175,7 +175,8 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) /***********************************************************************/ -static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode) +static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, + struct nameidata *nd) { struct jffs2_raw_inode *ri; struct jffs2_inode_info *f, *dir_f; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index b4aa9941a51d..3bf710dd0901 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -54,11 +54,13 @@ s64 commitZeroLink(tid_t, struct inode *); * PARAMETER: dip - parent directory vnode * dentry - dentry of new file * mode - create mode (rwxrwxrwx). + * nd- nd struct * * RETURN: Errors from subroutines * */ -int jfs_create(struct inode *dip, struct dentry *dentry, int mode) +int jfs_create(struct inode *dip, struct dentry *dentry, int mode, + struct nameidata *nd) { int rc = 0; tid_t tid; /* transaction id */ diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 007fb7786236..2b9e6c64d25a 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -89,7 +89,8 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_ return error; } -static int minix_create(struct inode * dir, struct dentry *dentry, int mode) +static int minix_create(struct inode * dir, struct dentry *dentry, int mode, + struct nameidata *nd) { return minix_mknod(dir, dentry, mode, 0); } diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index f0651cd1b996..19c047776ecd 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -261,7 +261,8 @@ static int msdos_add_entry(struct inode *dir, const char *name, */ /***** Create a file */ -int msdos_create(struct inode *dir,struct dentry *dentry,int mode) +int msdos_create(struct inode *dir,struct dentry *dentry,int mode, + struct nameidata *nd) { struct super_block *sb = dir->i_sb; struct buffer_head *bh; diff --git a/fs/namei.c b/fs/namei.c index a04cf1aaceb2..ae67748c2fc8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1105,7 +1105,8 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) } } -int vfs_create(struct inode *dir, struct dentry *dentry, int mode) +int vfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) { int error = may_create(dir, dentry); @@ -1120,7 +1121,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode) if (error) return error; DQUOT_INIT(dir); - error = dir->i_op->create(dir, dentry, mode); + error = dir->i_op->create(dir, dentry, mode, nd); if (!error) { inode_dir_notify(dir, DN_CREATE); security_inode_post_create(dir, dentry, mode); @@ -1277,7 +1278,7 @@ do_last: if (!dentry->d_inode) { if (!IS_POSIXACL(dir->d_inode)) 
mode &= ~current->fs->umask; - error = vfs_create(dir->d_inode, dentry, mode); + error = vfs_create(dir->d_inode, dentry, mode, nd); up(&dir->d_inode->i_sem); dput(nd->dentry); nd->dentry = dentry; @@ -1445,7 +1446,7 @@ asmlinkage long sys_mknod(const char __user * filename, int mode, dev_t dev) if (!IS_ERR(dentry)) { switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(nd.dentry->d_inode,dentry,mode); + error = vfs_create(nd.dentry->d_inode,dentry,mode,&nd); break; case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: error = vfs_mknod(nd.dentry->d_inode,dentry,mode,dev); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index d695f6db5baa..f10460e559a5 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -34,7 +34,7 @@ static void ncp_do_readdir(struct file *, void *, filldir_t, static int ncp_readdir(struct file *, void *, filldir_t); -static int ncp_create(struct inode *, struct dentry *, int); +static int ncp_create(struct inode *, struct dentry *, int, struct nameidata *); static struct dentry *ncp_lookup(struct inode *, struct dentry *, struct nameidata *); static int ncp_unlink(struct inode *, struct dentry *); static int ncp_mkdir(struct inode *, struct dentry *, int); @@ -942,7 +942,8 @@ out: return error; } -static int ncp_create(struct inode *dir, struct dentry *dentry, int mode) +static int ncp_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) { return ncp_create_new(dir, dentry, mode, 0, 0); } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index abf189a02e50..c1bd1794de60 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -40,7 +40,7 @@ static int nfs_readdir(struct file *, void *, filldir_t); static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); static int nfs_cached_lookup(struct inode *, struct dentry *, struct nfs_fh *, struct nfs_fattr *); -static int nfs_create(struct inode *, struct dentry *, int); +static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); static int nfs_mkdir(struct inode *, struct dentry *, int); static int nfs_rmdir(struct inode *, struct dentry *); static int nfs_unlink(struct inode *, struct dentry *); @@ -787,7 +787,8 @@ out_err: * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. 
*/ -static int nfs_create(struct inode *dir, struct dentry *dentry, int mode) +static int nfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) { struct iattr attr; struct nfs_fattr fattr; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8759cb1076ad..29114b798e56 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -924,7 +924,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserr_perm; switch (type) { case S_IFREG: - err = vfs_create(dirp, dchild, iap->ia_mode); + err = vfs_create(dirp, dchild, iap->ia_mode, NULL); break; case S_IFDIR: err = vfs_mkdir(dirp, dchild, iap->ia_mode); @@ -1067,7 +1067,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; } - err = vfs_create(dirp, dchild, iap->ia_mode); + err = vfs_create(dirp, dchild, iap->ia_mode, NULL); if (err < 0) goto out_nfserr; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 3c11c87e2f22..c0df469c9dc6 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -59,7 +59,7 @@ static char *alias_names [ALIASES_NNODES]; #define NODE2INO(node) (node + OPENPROM_FIRST_INO) #define NODEP2INO(no) (no + OPENPROM_FIRST_INO + last_node) -static int openpromfs_create (struct inode *, struct dentry *, int); +static int openpromfs_create (struct inode *, struct dentry *, int, struct nameidata *); static int openpromfs_readdir(struct file *, void *, filldir_t); static struct dentry *openpromfs_lookup(struct inode *, struct dentry *dentry, struct nameidata *nd); static int openpromfs_unlink (struct inode *, struct dentry *dentry); @@ -854,7 +854,8 @@ out: return 0; } -static int openpromfs_create (struct inode *dir, struct dentry *dentry, int mode) +static int openpromfs_create (struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) { char *p; struct inode *inode; diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c index 12e423ae6de0..36e903d89777 100644 --- a/fs/qnx4/namei.c +++ b/fs/qnx4/namei.c @@ -142,7 +142,8 @@ out: } #ifdef CONFIG_QNX4FS_RW -int qnx4_create(struct inode *dir, struct dentry *dentry, int mode) +int qnx4_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) { QNX4DEBUG(("qnx4: qnx4_create\n")); if (dir == NULL) { diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index bd0a6765ec6d..362ee3135e69 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -111,7 +111,7 @@ static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, int mode) return retval; } -static int ramfs_create(struct inode *dir, struct dentry *dentry, int mode) +static int ramfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { return ramfs_mknod(dir, dentry, mode | S_IFREG, 0); } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 18a3353274c4..93151fb285c0 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -558,7 +558,8 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode) { return 0 ; } -static int reiserfs_create (struct inode * dir, struct dentry *dentry, int mode) +static int reiserfs_create (struct inode * dir, struct dentry *dentry, int mode, + struct nameidata *nd) { int retval; struct inode * inode; diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index 081402f119dc..f0b62740ae89 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -25,7 +25,7 @@ static int smb_readdir(struct file *, void *, filldir_t); static int smb_dir_open(struct inode *, struct file *); static struct dentry *smb_lookup(struct inode *, struct dentry *, struct nameidata 
*); -static int smb_create(struct inode *, struct dentry *, int); +static int smb_create(struct inode *, struct dentry *, int, struct nameidata *); static int smb_mkdir(struct inode *, struct dentry *, int); static int smb_rmdir(struct inode *, struct dentry *); static int smb_unlink(struct inode *, struct dentry *); @@ -510,7 +510,8 @@ out_close: /* N.B. How should the mode argument be used? */ static int -smb_create(struct inode *dir, struct dentry *dentry, int mode) +smb_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) { struct smb_sb_info *server = server_from_dentry(dentry); __u16 fileid; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 4c0eb5730065..cbf08f04d07c 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -96,7 +96,7 @@ static int sysv_mknod(struct inode * dir, struct dentry * dentry, int mode, dev_ return err; } -static int sysv_create(struct inode * dir, struct dentry * dentry, int mode) +static int sysv_create(struct inode * dir, struct dentry * dentry, int mode, struct nameidata *nd) { return sysv_mknod(dir, dentry, mode, 0); } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 7881ffbbd82e..d2ac88dae447 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -621,7 +621,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); } -static int udf_create(struct inode *dir, struct dentry *dentry, int mode) +static int udf_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { struct udf_fileident_bh fibh; struct inode *inode; diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 55496ec96e56..82f391298c48 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -92,7 +92,8 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ufs_create (struct inode * dir, struct dentry * dentry, int mode) +static int ufs_create (struct inode * dir, struct dentry * dentry, int mode, + struct nameidata *nd) { struct inode * inode = ufs_new_inode(dir, mode); int err = PTR_ERR(inode); diff --git a/fs/umsdos/emd.c b/fs/umsdos/emd.c index 06190391d47e..7fadb55b7e44 100644 --- a/fs/umsdos/emd.c +++ b/fs/umsdos/emd.c @@ -105,7 +105,7 @@ int umsdos_make_emd(struct dentry *parent) Printk(("umsdos_make_emd: creating EMD %s/%s\n", parent->d_name.name, demd->d_name.name)); - err = msdos_create(parent->d_inode, demd, S_IFREG | 0777); + err = msdos_create(parent->d_inode, demd, S_IFREG | 0777, NULL); if (err) { printk (KERN_WARNING "umsdos_make_emd: create %s/%s failed, err=%d\n", diff --git a/fs/umsdos/namei.c b/fs/umsdos/namei.c index 3d89ba970a06..2d8a64af1aed 100644 --- a/fs/umsdos/namei.c +++ b/fs/umsdos/namei.c @@ -274,7 +274,7 @@ static int umsdos_create_any (struct inode *dir, struct dentry *dentry, if (fake->d_inode) goto out_remove_dput; - ret = msdos_create (dir, fake, S_IFREG | 0777); + ret = msdos_create (dir, fake, S_IFREG | 0777, NULL); if (ret) goto out_remove_dput; @@ -311,7 +311,7 @@ out_remove: * * Return the status of the operation. 0 mean success. 
*/ -int UMSDOS_create (struct inode *dir, struct dentry *dentry, int mode) +int UMSDOS_create (struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { return umsdos_create_any (dir, dentry, mode, 0, 0); } diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index 04f6754fe235..12c067c8355d 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c @@ -912,7 +912,8 @@ error: return dentry; } -int vfat_create(struct inode *dir,struct dentry* dentry,int mode) +int vfat_create(struct inode *dir,struct dentry* dentry,int mode, + struct nameidata *nd) { struct super_block *sb = dir->i_sb; struct inode *inode = NULL; diff --git a/fs/xfs/linux/xfs_iops.c b/fs/xfs/linux/xfs_iops.c index c4ff85065f41..e94d003ee0a4 100644 --- a/fs/xfs/linux/xfs_iops.c +++ b/fs/xfs/linux/xfs_iops.c @@ -175,7 +175,8 @@ STATIC int linvfs_create( struct inode *dir, struct dentry *dentry, - int mode) + int mode, + struct nameidata *nd) { return linvfs_mknod(dir, dentry, mode, 0); } diff --git a/include/linux/affs_fs.h b/include/linux/affs_fs.h index 837fe37aba93..c849309b1131 100644 --- a/include/linux/affs_fs.h +++ b/include/linux/affs_fs.h @@ -43,7 +43,7 @@ extern int affs_init_bitmap(struct super_block *sb); extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *); extern int affs_unlink(struct inode *dir, struct dentry *dentry); -extern int affs_create(struct inode *dir, struct dentry *dentry, int mode); +extern int affs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *); extern int affs_mkdir(struct inode *dir, struct dentry *dentry, int mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, diff --git a/include/linux/fs.h b/include/linux/fs.h index 3ddf4c4edfb7..66cf193c6e64 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -639,7 +639,7 @@ static inline void unlock_super(struct super_block * sb) /* * VFS helper functions.. 
*/ -extern int vfs_create(struct inode *, struct dentry *, int); +extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); extern int vfs_mkdir(struct inode *, struct dentry *, int); extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); extern int vfs_symlink(struct inode *, struct dentry *, const char *); @@ -730,7 +730,7 @@ struct file_operations { }; struct inode_operations { - int (*create) (struct inode *,struct dentry *,int); + int (*create) (struct inode *,struct dentry *,int, struct nameidata *); struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); diff --git a/include/linux/hfs_fs.h b/include/linux/hfs_fs.h index 40971822e3e4..7bebd13150dd 100644 --- a/include/linux/hfs_fs.h +++ b/include/linux/hfs_fs.h @@ -234,7 +234,7 @@ extern struct hfs_cat_entry *hfs_cat_get(struct hfs_mdb *, const struct hfs_cat_key *); /* dir.c */ -extern int hfs_create(struct inode *, struct dentry *, int); +extern int hfs_create(struct inode *, struct dentry *, int, struct nameidata *); extern int hfs_mkdir(struct inode *, struct dentry *, int); extern int hfs_unlink(struct inode *, struct dentry *); extern int hfs_rmdir(struct inode *, struct dentry *); diff --git a/include/linux/msdos_fs.h b/include/linux/msdos_fs.h index d1b00ab916ce..1ce9ba2f57b0 100644 --- a/include/linux/msdos_fs.h +++ b/include/linux/msdos_fs.h @@ -308,7 +308,7 @@ extern int fat_scan(struct inode *dir, const char *name, /* msdos/namei.c - these are for Umsdos */ extern struct dentry *msdos_lookup(struct inode *dir, struct dentry *, struct nameidata *); -extern int msdos_create(struct inode *dir, struct dentry *dentry, int mode); +extern int msdos_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *); extern int msdos_rmdir(struct inode *dir, struct dentry *dentry); extern int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode); extern int msdos_unlink(struct inode *dir, struct dentry *dentry); @@ -318,7 +318,7 @@ extern int msdos_fill_super(struct super_block *sb, void *data, int silent); /* vfat/namei.c - these are for dmsdos */ extern struct dentry *vfat_lookup(struct inode *dir, struct dentry *, struct nameidata *); -extern int vfat_create(struct inode *dir, struct dentry *dentry, int mode); +extern int vfat_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *); extern int vfat_rmdir(struct inode *dir, struct dentry *dentry); extern int vfat_unlink(struct inode *dir, struct dentry *dentry); extern int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode); diff --git a/include/linux/qnx4_fs.h b/include/linux/qnx4_fs.h index 2aa7a7ca3d54..53233c8fb3ef 100644 --- a/include/linux/qnx4_fs.h +++ b/include/linux/qnx4_fs.h @@ -117,14 +117,13 @@ extern unsigned long qnx4_block_map(struct inode *inode, long iblock); extern struct buffer_head *qnx4_getblk(struct inode *, int, int); extern struct buffer_head *qnx4_bread(struct inode *, int, int); -extern int qnx4_create(struct inode *dir, struct dentry *dentry, int mode); extern struct inode_operations qnx4_file_inode_operations; extern struct inode_operations qnx4_dir_inode_operations; extern struct file_operations qnx4_file_operations; extern struct file_operations qnx4_dir_operations; extern int qnx4_is_free(struct super_block *sb, long block); extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy); -extern int qnx4_create(struct 
inode *inode, struct dentry *dentry, int mode); +extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd); extern void qnx4_truncate(struct inode *inode); extern void qnx4_free_inode(struct inode *inode); extern int qnx4_unlink(struct inode *dir, struct dentry *dentry); diff --git a/mm/shmem.c b/mm/shmem.c index 1f4ed8fece45..e9d5042bc13b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1398,7 +1398,8 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode) return 0; } -static int shmem_create(struct inode *dir, struct dentry *dentry, int mode) +static int shmem_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) { return shmem_mknod(dir, dentry, mode | S_IFREG, 0); } -- cgit v1.2.3 From a574f324dab607946682f60b9efdc1b3d810cf03 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 3 Jul 2003 22:06:43 -0700 Subject: [PATCH] Pass 'nameidata' to ->permission() - Make the VFS pass the struct nameidata as an optional parameter to the permission() inode operation. - Patch may_create()/may_open() so it passes the struct nameidata from vfs_create()/open_namei() as an argument to permission(). - Add an intent flag for the sys_access() function. --- drivers/block/floppy.c | 2 +- fs/cifs/cifsfs.c | 2 +- fs/coda/dir.c | 2 +- fs/coda/pioctl.c | 6 ++++-- fs/exec.c | 6 +++--- fs/ext2/acl.c | 2 +- fs/ext2/acl.h | 2 +- fs/ext2/xattr_user.c | 4 ++-- fs/ext3/acl.c | 2 +- fs/ext3/acl.h | 2 +- fs/ext3/xattr_user.c | 4 ++-- fs/hpfs/namei.c | 2 +- fs/intermezzo/dir.c | 10 +++++----- fs/intermezzo/file.c | 2 +- fs/intermezzo/vfs.c | 6 +++--- fs/jfs/acl.c | 2 +- fs/jfs/jfs_acl.h | 2 +- fs/jfs/xattr.c | 4 ++-- fs/namei.c | 34 ++++++++++++++++++---------------- fs/namespace.c | 2 +- fs/ncpfs/ioctl.c | 22 +++++++++++----------- fs/nfs/dir.c | 2 +- fs/nfsd/nfsfh.c | 2 +- fs/nfsd/vfs.c | 4 ++-- fs/open.c | 16 ++++++++-------- fs/proc/base.c | 2 +- fs/smbfs/file.c | 2 +- fs/udf/file.c | 2 +- fs/xfs/linux/xfs_iops.c | 3 ++- include/linux/coda_linux.h | 2 +- include/linux/fs.h | 4 ++-- include/linux/namei.h | 1 + include/linux/nfs_fs.h | 2 +- kernel/sysctl.c | 4 ++-- net/unix/af_unix.c | 2 +- 35 files changed, 87 insertions(+), 81 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index ca2332522f9a..40ff4c76558b 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3767,7 +3767,7 @@ static int floppy_open(struct inode * inode, struct file * filp) * Needed so that programs such as fdrawcmd still can work on write * protected disks */ if ((filp->f_mode & 2) || - (inode->i_sb && (permission(inode,2) == 0))) + (inode->i_sb && (permission(inode,2, NULL) == 0))) filp->private_data = (void*) 8; if (UFDCS->rawcmd == 1) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 1b3c43949f33..2201681095ca 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -178,7 +178,7 @@ cifs_statfs(struct super_block *sb, struct kstatfs *buf) return 0; /* always return success? what if volume is no longer available? 
*/ } -static int cifs_permission(struct inode * inode, int mask) +static int cifs_permission(struct inode * inode, int mask, struct nameidata *nd) { /* the server does permission checks, we do not need to do it here */ return 0; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 8b3627e0d0e2..2917ab9f4976 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -147,7 +147,7 @@ exit: } -int coda_permission(struct inode *inode, int mask) +int coda_permission(struct inode *inode, int mask, struct nameidata *nd) { int error = 0; diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 67228f3c2122..e10ac76438c0 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -24,7 +24,8 @@ #include /* pioctl ops */ -static int coda_ioctl_permission(struct inode *inode, int mask); +static int coda_ioctl_permission(struct inode *inode, int mask, + struct nameidata *nd); static int coda_pioctl(struct inode * inode, struct file * filp, unsigned int cmd, unsigned long user_data); @@ -41,7 +42,8 @@ struct file_operations coda_ioctl_operations = { }; /* the coda pioctl inode ops */ -static int coda_ioctl_permission(struct inode *inode, int mask) +static int coda_ioctl_permission(struct inode *inode, int mask, + struct nameidata *nd) { return 0; } diff --git a/fs/exec.c b/fs/exec.c index 68a64ee4b234..4f37deb79e00 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -126,7 +126,7 @@ asmlinkage long sys_uselib(const char __user * library) if (!S_ISREG(nd.dentry->d_inode->i_mode)) goto exit; - error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC); + error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC, &nd); if (error) goto exit; @@ -462,7 +462,7 @@ struct file *open_exec(const char *name) file = ERR_PTR(-EACCES); if (!(nd.mnt->mnt_flags & MNT_NOEXEC) && S_ISREG(inode->i_mode)) { - int err = permission(inode, MAY_EXEC); + int err = permission(inode, MAY_EXEC, &nd); if (!err && !(inode->i_mode & 0111)) err = -EACCES; file = ERR_PTR(err); @@ -794,7 +794,7 @@ int flush_old_exec(struct linux_binprm * bprm) flush_thread(); if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || - permission(bprm->file->f_dentry->d_inode,MAY_READ)) + permission(bprm->file->f_dentry->d_inode,MAY_READ, NULL)) current->mm->dumpable = 0; /* An exec changes our domain. 
We are no longer part of the thread diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 9367f43f4acb..4db56bdd8fe5 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -309,7 +309,7 @@ check_capabilities: * BKL held [before 2.5.x] */ int -ext2_permission(struct inode *inode, int mask) +ext2_permission(struct inode *inode, int mask, struct nameidata *nd) { return __ext2_permission(inode, mask, 1); } diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 0cfbf4d1029b..2e0560130b63 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -59,7 +59,7 @@ static inline int ext2_acl_count(size_t size) #define EXT2_ACL_NOT_CACHED ((void *)-1) /* acl.c */ -extern int ext2_permission (struct inode *, int); +extern int ext2_permission (struct inode *, int, struct nameidata *); extern int ext2_permission_locked (struct inode *, int); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 027beb89c7e0..fc0ec86f4928 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -47,7 +47,7 @@ ext2_xattr_user_get(struct inode *inode, const char *name, #ifdef CONFIG_EXT2_FS_POSIX_ACL error = ext2_permission_locked(inode, MAY_READ); #else - error = permission(inode, MAY_READ); + error = permission(inode, MAY_READ, NULL); #endif if (error) return error; @@ -71,7 +71,7 @@ ext2_xattr_user_set(struct inode *inode, const char *name, #ifdef CONFIG_EXT2_FS_POSIX_ACL error = ext2_permission_locked(inode, MAY_WRITE); #else - error = permission(inode, MAY_WRITE); + error = permission(inode, MAY_WRITE, NULL); #endif if (error) return error; diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 9313430093c5..d29f14efb253 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -312,7 +312,7 @@ check_capabilities: * inode->i_sem: up */ int -ext3_permission(struct inode *inode, int mask) +ext3_permission(struct inode *inode, int mask, struct nameidata *nd) { return __ext3_permission(inode, mask, 1); } diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index e0962a6c24b2..6aaef97a5fc3 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -59,7 +59,7 @@ static inline int ext3_acl_count(size_t size) #define EXT3_ACL_NOT_CACHED ((void *)-1) /* acl.c */ -extern int ext3_permission (struct inode *, int); +extern int ext3_permission (struct inode *, int, struct nameidata *); extern int ext3_permission_locked (struct inode *, int); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index b93a74ded763..b8c789e60fa0 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -49,7 +49,7 @@ ext3_xattr_user_get(struct inode *inode, const char *name, #ifdef CONFIG_EXT3_FS_POSIX_ACL error = ext3_permission_locked(inode, MAY_READ); #else - error = permission(inode, MAY_READ); + error = permission(inode, MAY_READ, NULL); #endif if (error) return error; @@ -73,7 +73,7 @@ ext3_xattr_user_set(struct inode *inode, const char *name, #ifdef CONFIG_EXT3_FS_POSIX_ACL error = ext3_permission_locked(inode, MAY_WRITE); #else - error = permission(inode, MAY_WRITE); + error = permission(inode, MAY_WRITE, NULL); #endif if (error) return error; diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 128647db8ffc..866976557245 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -374,7 +374,7 @@ again: d_drop(dentry); spin_lock(&dentry->d_lock); if (atomic_read(&dentry->d_count) > 1 || - permission(inode, MAY_WRITE) || + permission(inode, MAY_WRITE, NULL) || 
get_write_access(inode)) { spin_unlock(&dentry->d_lock); d_rehash(dentry); diff --git a/fs/intermezzo/dir.c b/fs/intermezzo/dir.c index 61cd7b4d54ea..7e646f9c2211 100644 --- a/fs/intermezzo/dir.c +++ b/fs/intermezzo/dir.c @@ -81,7 +81,7 @@ static inline void presto_unlock(struct inode *dir) /* * these are initialized in super.c */ -extern int presto_permission(struct inode *inode, int mask); +extern int presto_permission(struct inode *inode, int mask, struct nameidata *nd); static int izo_authorized_uid = 0; int izo_dentry_is_ilookup(struct dentry *dentry, ino_t *id, @@ -830,7 +830,7 @@ int presto_rename(struct inode *old_dir, struct dentry *old_dentry, * appropriate permission function. Thus we do not worry here about ACLs * or EAs. -SHP */ -int presto_permission(struct inode *inode, int mask) +int presto_permission(struct inode *inode, int mask, struct nameidata *nd) { unsigned short mode = inode->i_mode; struct presto_cache *cache; @@ -852,11 +852,11 @@ int presto_permission(struct inode *inode, int mask) if ( S_ISREG(mode) && fiops && fiops->permission ) { EXIT; - return fiops->permission(inode, mask); + return fiops->permission(inode, mask, nd); } if ( S_ISDIR(mode) && diops && diops->permission ) { EXIT; - return diops->permission(inode, mask); + return diops->permission(inode, mask, nd); } } @@ -867,7 +867,7 @@ int presto_permission(struct inode *inode, int mask) * the VFS permission function. */ inode->i_op->permission = NULL; - rc = permission(inode, mask); + rc = permission(inode, mask, nd); inode->i_op->permission = &presto_permission; EXIT; diff --git a/fs/intermezzo/file.c b/fs/intermezzo/file.c index 9f0b10422c4e..a1efcbfaa2c9 100644 --- a/fs/intermezzo/file.c +++ b/fs/intermezzo/file.c @@ -53,7 +53,7 @@ /* * these are initialized in super.c */ -extern int presto_permission(struct inode *inode, int mask); +extern int presto_permission(struct inode *inode, int mask, struct nameidata *nd); static int presto_open_upcall(int minor, struct dentry *de) diff --git a/fs/intermezzo/vfs.c b/fs/intermezzo/vfs.c index c3e124c6777f..1cfa4c9a4b60 100644 --- a/fs/intermezzo/vfs.c +++ b/fs/intermezzo/vfs.c @@ -134,7 +134,7 @@ static inline int may_delete(struct inode *dir,struct dentry *victim, int isdir) int error; if (!victim->d_inode || victim->d_parent->d_inode != dir) return -ENOENT; - error = permission(dir,MAY_WRITE | MAY_EXEC); + error = permission(dir,MAY_WRITE | MAY_EXEC, NULL); if (error) return error; if (IS_APPEND(dir)) @@ -158,7 +158,7 @@ static inline int may_create(struct inode *dir, struct dentry *child) { return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return permission(dir,MAY_WRITE | MAY_EXEC); + return permission(dir,MAY_WRITE | MAY_EXEC, NULL); } #ifdef PRESTO_DEBUG @@ -1840,7 +1840,7 @@ int presto_rename_dir(struct presto_file_set *fset, struct dentry *old_parent, * we'll need to flip '..'. 
*/ if (new_dir != old_dir) { - error = permission(old_dentry->d_inode, MAY_WRITE); + error = permission(old_dentry->d_inode, MAY_WRITE, NULL); } if (error) return error; diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 758d370e6419..a83ab660a0b7 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -208,7 +208,7 @@ check_capabilities: return -EACCES; } -int jfs_permission(struct inode * inode, int mask) +int jfs_permission(struct inode * inode, int mask, struct nameidata *nd) { return __jfs_permission(inode, mask, 0); } diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 179a3893a945..cfb445231972 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -25,7 +25,7 @@ struct posix_acl *jfs_get_acl(struct inode *, int); int jfs_set_acl(struct inode *, int, struct posix_acl *); int jfs_permission_have_sem(struct inode *, int); -int jfs_permission(struct inode *, int); +int jfs_permission(struct inode *, int, struct nameidata *); int jfs_init_acl(struct inode *, struct inode *); int jfs_setattr(struct dentry *, struct iattr *); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index a9c455de618b..4ae1b0ffaf05 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -731,7 +731,7 @@ static int can_set_xattr(struct inode *inode, const char *name, #ifdef CONFIG_JFS_POSIX_ACL return jfs_permission_have_sem(inode, MAY_WRITE); #else - return permission(inode, MAY_WRITE); + return permission(inode, MAY_WRITE, NULL); #endif } @@ -893,7 +893,7 @@ static int can_get_xattr(struct inode *inode, const char *name) else return jfs_permission_have_sem(inode, MAY_READ); #else - return permission(inode, MAY_READ); + return permission(inode, MAY_READ, NULL); #endif } diff --git a/fs/namei.c b/fs/namei.c index ae67748c2fc8..2fc6f11fe795 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -203,7 +203,7 @@ int vfs_permission(struct inode * inode, int mask) return -EACCES; } -int permission(struct inode * inode,int mask) +int permission(struct inode * inode,int mask, struct nameidata *nd) { int retval; int submask; @@ -212,7 +212,7 @@ int permission(struct inode * inode,int mask) submask = mask & ~MAY_APPEND; if (inode->i_op && inode->i_op->permission) - retval = inode->i_op->permission(inode, submask); + retval = inode->i_op->permission(inode, submask, nd); else retval = vfs_permission(inode, submask); if (retval) @@ -588,7 +588,7 @@ int link_path_walk(const char * name, struct nameidata *nd) err = exec_permission_lite(inode); if (err == -EAGAIN) { - err = permission(inode, MAY_EXEC); + err = permission(inode, MAY_EXEC, nd); } if (err) break; @@ -876,7 +876,7 @@ static struct dentry * __lookup_hash(struct qstr *name, struct dentry * base, st int err; inode = base->d_inode; - err = permission(inode, MAY_EXEC); + err = permission(inode, MAY_EXEC, nd); dentry = ERR_PTR(err); if (err) goto out; @@ -996,12 +996,12 @@ static inline int check_sticky(struct inode *dir, struct inode *inode) * 10. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ -static inline int may_delete(struct inode *dir,struct dentry *victim, int isdir) +static inline int may_delete(struct inode *dir,struct dentry *victim,int isdir) { int error; if (!victim->d_inode || victim->d_parent->d_inode != dir) return -ENOENT; - error = permission(dir,MAY_WRITE | MAY_EXEC); + error = permission(dir,MAY_WRITE | MAY_EXEC, NULL); if (error) return error; if (IS_APPEND(dir)) @@ -1031,12 +1031,14 @@ static inline int may_delete(struct inode *dir,struct dentry *victim, int isdir) * 3. We should have write and exec permissions on dir * 4. 
We can't do it if dir is immutable (done in permission()) */ -static inline int may_create(struct inode *dir, struct dentry *child) { +static inline int may_create(struct inode *dir, struct dentry *child, + struct nameidata *nd) +{ if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return permission(dir,MAY_WRITE | MAY_EXEC); + return permission(dir,MAY_WRITE | MAY_EXEC, nd); } /* @@ -1108,7 +1110,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) int vfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { - int error = may_create(dir, dentry); + int error = may_create(dir, dentry, nd); if (error) return error; @@ -1144,7 +1146,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE)) return -EISDIR; - error = permission(inode, acc_mode); + error = permission(inode, acc_mode, nd); if (error) return error; @@ -1398,7 +1400,7 @@ fail: int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - int error = may_create(dir, dentry); + int error = may_create(dir, dentry, NULL); if (error) return error; @@ -1469,7 +1471,7 @@ out: int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - int error = may_create(dir, dentry); + int error = may_create(dir, dentry, NULL); if (error) return error; @@ -1715,7 +1717,7 @@ slashes: int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) { - int error = may_create(dir, dentry); + int error = may_create(dir, dentry, NULL); if (error) return error; @@ -1777,7 +1779,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (!inode) return -ENOENT; - error = may_create(dir, new_dentry); + error = may_create(dir, new_dentry, NULL); if (error) return error; @@ -1898,7 +1900,7 @@ int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, * we'll need to flip '..'. 
*/ if (new_dir != old_dir) { - error = permission(old_dentry->d_inode, MAY_WRITE); + error = permission(old_dentry->d_inode, MAY_WRITE, NULL); if (error) return error; } @@ -1976,7 +1978,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, return error; if (!new_dentry->d_inode) - error = may_create(new_dir, new_dentry); + error = may_create(new_dir, new_dentry, NULL); else error = may_delete(new_dir, new_dentry, is_dir); if (error) diff --git a/fs/namespace.c b/fs/namespace.c index 61e5ec891363..a31cd95801cb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -403,7 +403,7 @@ static int mount_is_safe(struct nameidata *nd) if (current->uid != nd->dentry->d_inode->i_uid) return -EPERM; } - if (permission(nd->dentry->d_inode, MAY_WRITE)) + if (permission(nd->dentry->d_inode, MAY_WRITE, nd)) return -EPERM; return 0; #endif diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 3497f67be924..fb3e550a9abc 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -40,7 +40,7 @@ int ncp_ioctl(struct inode *inode, struct file *filp, switch (cmd) { case NCP_IOC_NCPREQUEST: - if ((permission(inode, MAY_WRITE) != 0) + if ((permission(inode, MAY_WRITE, NULL) != 0) && (current->uid != server->m.mounted_uid)) { return -EACCES; } @@ -99,7 +99,7 @@ int ncp_ioctl(struct inode *inode, struct file *filp, { struct ncp_fs_info info; - if ((permission(inode, MAY_WRITE) != 0) + if ((permission(inode, MAY_WRITE, NULL) != 0) && (current->uid != server->m.mounted_uid)) { return -EACCES; } @@ -127,7 +127,7 @@ int ncp_ioctl(struct inode *inode, struct file *filp, { struct ncp_fs_info_v2 info2; - if ((permission(inode, MAY_WRITE) != 0) + if ((permission(inode, MAY_WRITE, NULL) != 0) && (current->uid != server->m.mounted_uid)) { return -EACCES; } @@ -155,7 +155,7 @@ int ncp_ioctl(struct inode *inode, struct file *filp, { unsigned long tmp = server->m.mounted_uid; - if ( (permission(inode, MAY_READ) != 0) + if ( (permission(inode, MAY_READ, NULL) != 0) && (current->uid != server->m.mounted_uid)) { return -EACCES; @@ -169,7 +169,7 @@ int ncp_ioctl(struct inode *inode, struct file *filp, { struct ncp_setroot_ioctl sr; - if ( (permission(inode, MAY_READ) != 0) + if ( (permission(inode, MAY_READ, NULL) != 0) && (current->uid != server->m.mounted_uid)) { return -EACCES; @@ -249,7 +249,7 @@ int ncp_ioctl(struct inode *inode, struct file *filp, #ifdef CONFIG_NCPFS_PACKET_SIGNING case NCP_IOC_SIGN_INIT: - if ((permission(inode, MAY_WRITE) != 0) + if ((permission(inode, MAY_WRITE, NULL) != 0) && (current->uid != server->m.mounted_uid)) { return -EACCES; @@ -272,7 +272,7 @@ int ncp_ioctl(struct inode *inode, struct file *filp, return 0; case NCP_IOC_SIGN_WANTED: - if ( (permission(inode, MAY_READ) != 0) + if ( (permission(inode, MAY_READ, NULL) != 0) && (current->uid != server->m.mounted_uid)) { return -EACCES; @@ -285,7 +285,7 @@ int ncp_ioctl(struct inode *inode, struct file *filp, { int newstate; - if ( (permission(inode, MAY_WRITE) != 0) + if ( (permission(inode, MAY_WRITE, NULL) != 0) && (current->uid != server->m.mounted_uid)) { return -EACCES; @@ -306,7 +306,7 @@ int ncp_ioctl(struct inode *inode, struct file *filp, #ifdef CONFIG_NCPFS_IOCTL_LOCKING case NCP_IOC_LOCKUNLOCK: - if ( (permission(inode, MAY_WRITE) != 0) + if ( (permission(inode, MAY_WRITE, NULL) != 0) && (current->uid != server->m.mounted_uid)) { return -EACCES; @@ -608,7 +608,7 @@ outrel: } #endif /* CONFIG_NCPFS_NLS */ case NCP_IOC_SETDENTRYTTL: - if ((permission(inode, MAY_WRITE) != 0) && + if ((permission(inode, MAY_WRITE, NULL) != 0) && 
(current->uid != server->m.mounted_uid)) return -EACCES; { @@ -637,7 +637,7 @@ outrel: /* NCP_IOC_GETMOUNTUID may be same as NCP_IOC_GETMOUNTUID2, so we have this out of switch */ if (cmd == NCP_IOC_GETMOUNTUID) { - if ((permission(inode, MAY_READ) != 0) + if ((permission(inode, MAY_READ, NULL) != 0) && (current->uid != server->m.mounted_uid)) { return -EACCES; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c1bd1794de60..93585f0099fb 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1240,7 +1240,7 @@ out: } int -nfs_permission(struct inode *inode, int mask) +nfs_permission(struct inode *inode, int mask, struct nameidata *nd) { struct nfs_access_cache *cache = &NFS_I(inode)->cache_access; struct rpc_cred *cred; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 6d023b27ff6b..32a50f1bed11 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -56,7 +56,7 @@ int nfsd_acceptable(void *expv, struct dentry *dentry) /* make sure parents give x permission to user */ int err; parent = dget_parent(tdentry); - err = permission(parent->d_inode, S_IXOTH); + err = permission(parent->d_inode, S_IXOTH, NULL); if (err < 0) { dput(parent); break; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 29114b798e56..663f4839cc33 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1584,12 +1584,12 @@ nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc) inode->i_uid == current->fsuid) return 0; - err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC)); + err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL); /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && acc == (MAY_READ | MAY_OWNER_OVERRIDE)) - err = permission(inode, MAY_EXEC); + err = permission(inode, MAY_EXEC, NULL); return err? 
nfserrno(err) : 0; } diff --git a/fs/open.c b/fs/open.c index 98ce4f7374ef..8a4197969f27 100644 --- a/fs/open.c +++ b/fs/open.c @@ -219,7 +219,7 @@ static inline long do_sys_truncate(const char __user * path, loff_t length) if (!S_ISREG(inode->i_mode)) goto dput_and_out; - error = permission(inode,MAY_WRITE); + error = permission(inode,MAY_WRITE,&nd); if (error) goto dput_and_out; @@ -365,7 +365,7 @@ asmlinkage long sys_utime(char __user * filename, struct utimbuf __user * times) newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; } else { if (current->fsuid != inode->i_uid && - (error = permission(inode,MAY_WRITE)) != 0) + (error = permission(inode,MAY_WRITE,&nd)) != 0) goto dput_and_out; } down(&inode->i_sem); @@ -410,7 +410,7 @@ long do_utimes(char __user * filename, struct timeval * times) newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; } else { if (current->fsuid != inode->i_uid && - (error = permission(inode,MAY_WRITE)) != 0) + (error = permission(inode,MAY_WRITE,&nd)) != 0) goto dput_and_out; } down(&inode->i_sem); @@ -467,9 +467,9 @@ asmlinkage long sys_access(const char __user * filename, int mode) else current->cap_effective = current->cap_permitted; - res = user_path_walk(filename, &nd); + res = __user_walk(filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd); if (!res) { - res = permission(nd.dentry->d_inode, mode); + res = permission(nd.dentry->d_inode, mode, &nd); /* SuS v2 requires we report a read only fs too */ if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) && !special_file(nd.dentry->d_inode->i_mode)) @@ -493,7 +493,7 @@ asmlinkage long sys_chdir(const char __user * filename) if (error) goto out; - error = permission(nd.dentry->d_inode,MAY_EXEC); + error = permission(nd.dentry->d_inode,MAY_EXEC,&nd); if (error) goto dput_and_out; @@ -526,7 +526,7 @@ asmlinkage long sys_fchdir(unsigned int fd) if (!S_ISDIR(inode->i_mode)) goto out_putf; - error = permission(inode, MAY_EXEC); + error = permission(inode, MAY_EXEC, NULL); if (!error) set_fs_pwd(current->fs, mnt, dentry); out_putf: @@ -544,7 +544,7 @@ asmlinkage long sys_chroot(const char __user * filename) if (error) goto out; - error = permission(nd.dentry->d_inode,MAY_EXEC); + error = permission(nd.dentry->d_inode,MAY_EXEC,&nd); if (error) goto dput_and_out; diff --git a/fs/proc/base.c b/fs/proc/base.c index 3d05ee25f8e9..2c8d50e98d48 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -334,7 +334,7 @@ out: goto exit; } -static int proc_permission(struct inode *inode, int mask) +static int proc_permission(struct inode *inode, int mask, struct nameidata *nd) { if (vfs_permission(inode, mask) != 0) return -EACCES; diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index a174775b2d13..6b25d7c89177 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -367,7 +367,7 @@ smb_file_release(struct inode *inode, struct file * file) * privileges, so we need our own check for this. 
*/ static int -smb_file_permission(struct inode *inode, int mask) +smb_file_permission(struct inode *inode, int mask, struct nameidata *nd) { int mode = inode->i_mode; int error = 0; diff --git a/fs/udf/file.c b/fs/udf/file.c index 9fd46aff63ae..b1cf9999e902 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -188,7 +188,7 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, { int result = -EINVAL; - if ( permission(inode, MAY_READ) != 0 ) + if ( permission(inode, MAY_READ, NULL) != 0 ) { udf_debug("no permission to access inode %lu\n", inode->i_ino); diff --git a/fs/xfs/linux/xfs_iops.c b/fs/xfs/linux/xfs_iops.c index e94d003ee0a4..8a90e5495a3a 100644 --- a/fs/xfs/linux/xfs_iops.c +++ b/fs/xfs/linux/xfs_iops.c @@ -431,7 +431,8 @@ linvfs_follow_link( STATIC int linvfs_permission( struct inode *inode, - int mode) + int mode, + struct nameidata *nd) { vnode_t *vp = LINVFS_GET_VP(inode); int error; diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h index b22d34fcfb6d..650a6f997f8a 100644 --- a/include/linux/coda_linux.h +++ b/include/linux/coda_linux.h @@ -38,7 +38,7 @@ extern struct file_operations coda_ioctl_operations; int coda_open(struct inode *i, struct file *f); int coda_flush(struct file *f); int coda_release(struct inode *i, struct file *f); -int coda_permission(struct inode *inode, int mask); +int coda_permission(struct inode *inode, int mask, struct nameidata *nd); int coda_revalidate_inode(struct dentry *); int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); int coda_setattr(struct dentry *, struct iattr *); diff --git a/include/linux/fs.h b/include/linux/fs.h index 66cf193c6e64..7a5f305101c5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -743,7 +743,7 @@ struct inode_operations { int (*readlink) (struct dentry *, char __user *,int); int (*follow_link) (struct dentry *, struct nameidata *); void (*truncate) (struct inode *); - int (*permission) (struct inode *, int); + int (*permission) (struct inode *, int, struct nameidata *); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); @@ -1121,7 +1121,7 @@ extern int do_remount_sb(struct super_block *sb, int flags, extern sector_t bmap(struct inode *, sector_t); extern int setattr_mask(unsigned int); extern int notify_change(struct dentry *, struct iattr *); -extern int permission(struct inode *, int); +extern int permission(struct inode *, int, struct nameidata *); extern int vfs_permission(struct inode *, int); extern int get_write_access(struct inode *); extern int deny_write_access(struct file *); diff --git a/include/linux/namei.h b/include/linux/namei.h index 256ceac1fc69..4117cd90a345 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -46,6 +46,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; */ #define LOOKUP_OPEN (0x0100) #define LOOKUP_CREATE (0x0200) +#define LOOKUP_ACCESS (0x0400) extern int FASTCALL(__user_walk(const char __user *, unsigned, struct nameidata *)); #define user_path_walk(name,nd) \ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 3d7525998534..a6d594bb252c 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -240,7 +240,7 @@ extern struct inode *nfs_fhget(struct dentry *, struct nfs_fh *, struct nfs_fattr *); extern int __nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_getattr(struct vfsmount *, struct dentry *, 
struct kstat *); -extern int nfs_permission(struct inode *, int); +extern int nfs_permission(struct inode *, int, struct nameidata *); extern int nfs_open(struct inode *, struct file *); extern int nfs_release(struct inode *, struct file *); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7f0cc00cfa4d..edebad7ddec4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -130,7 +130,7 @@ extern ctl_table random_table[]; static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); -static int proc_sys_permission(struct inode *, int); +static int proc_sys_permission(struct inode *, int, struct nameidata *); struct file_operations proc_sys_file_operations = { .read = proc_readsys, .write = proc_writesys, @@ -1177,7 +1177,7 @@ static ssize_t proc_writesys(struct file * file, const char __user * buf, return do_rw_proc(1, file, (char __user *) buf, count, ppos); } -static int proc_sys_permission(struct inode *inode, int op) +static int proc_sys_permission(struct inode *inode, int op, struct nameidata *nd) { return test_perm(inode->i_mode, op); } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 64b97aec0312..f249d4388e36 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -594,7 +594,7 @@ static struct sock *unix_find_other(struct sockaddr_un *sunname, int len, err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd); if (err) goto fail; - err = permission(nd.dentry->d_inode,MAY_WRITE); + err = permission(nd.dentry->d_inode,MAY_WRITE, &nd); if (err) goto put_fail; -- cgit v1.2.3 From ddb6ee510226e05aaea1b90a3e4d672a2a0be857 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 4 Jul 2003 03:00:12 -0700 Subject: [PATCH] EISA: core changes - Now reserves I/O ranges according to EISA specs (four 256-byte regions instead of a single 4KB region). - By default, do not try to probe the bus if the mainboard does not seem to support EISA (allow this behaviour to be changed through a command-line option). - Use parent bridge device dma_mask as default for each discovered device. - Allow devices to be enabled or disabled from the kernel command line (useful for non-x86 platforms where the firmware simply disables devices it doesn't know about...). --- drivers/eisa/eisa-bus.c | 234 +++++++++++++++++++++++++++++++++++++++--------- include/linux/eisa.h | 21 ++++- 2 files changed, 209 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/drivers/eisa/eisa-bus.c b/drivers/eisa/eisa-bus.c index a5241c65d63b..116298c5f7b8 100644 --- a/drivers/eisa/eisa-bus.c +++ b/drivers/eisa/eisa-bus.c @@ -1,7 +1,7 @@ /* * EISA bus support functions for sysfs. * - * (C) 2002 Marc Zyngier + * (C) 2002, 2003 Marc Zyngier * * This code is released under the GPL version 2.
*/ @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +25,7 @@ struct eisa_device_info { char name[DEVICE_NAME_SIZE]; }; -struct eisa_device_info __initdata eisa_table[] = { +static struct eisa_device_info __initdata eisa_table[] = { #ifdef CONFIG_EISA_NAMES #include "devlist.h" #endif @@ -32,6 +33,30 @@ struct eisa_device_info __initdata eisa_table[] = { #define EISA_INFOS (sizeof (eisa_table) / (sizeof (struct eisa_device_info))) +#define EISA_MAX_FORCED_DEV 16 +#define EISA_FORCED_OFFSET 2 + +static int enable_dev[EISA_MAX_FORCED_DEV + EISA_FORCED_OFFSET] = { 1, EISA_MAX_FORCED_DEV, }; +static int disable_dev[EISA_MAX_FORCED_DEV + EISA_FORCED_OFFSET] = { 1, EISA_MAX_FORCED_DEV, }; + +static int is_forced_dev (int *forced_tab, + struct eisa_root_device *root, + struct eisa_device *edev) +{ + int i, x; + + for (i = 0; i < EISA_MAX_FORCED_DEV; i++) { + if (!forced_tab[EISA_FORCED_OFFSET + i]) + return 0; + + x = (root->bus_nr << 8) | edev->slot; + if (forced_tab[EISA_FORCED_OFFSET + i] == x) + return 1; + } + + return 0; +} + static void __init eisa_name_device (struct eisa_device *edev) { int i; @@ -92,7 +117,8 @@ static int eisa_bus_match (struct device *dev, struct device_driver *drv) return 0; while (strlen (eids->sig)) { - if (!strcmp (eids->sig, edev->id.sig)) { + if (!strcmp (eids->sig, edev->id.sig) && + edev->state & EISA_CONFIG_ENABLED) { edev->id.driver_data = eids->driver_data; return 1; } @@ -132,41 +158,160 @@ static ssize_t eisa_show_sig (struct device *dev, char *buf) static DEVICE_ATTR(signature, S_IRUGO, eisa_show_sig, NULL); -static int __init eisa_register_device (struct eisa_root_device *root, - struct eisa_device *edev, - char *sig, int slot) +static ssize_t eisa_show_state (struct device *dev, char *buf) +{ + struct eisa_device *edev = to_eisa_device (dev); + return sprintf (buf,"%d\n", edev->state & EISA_CONFIG_ENABLED); +} + +static DEVICE_ATTR(enabled, S_IRUGO, eisa_show_state, NULL); + +static int __init eisa_init_device (struct eisa_root_device *root, + struct eisa_device *edev, + int slot) { + char *sig; + unsigned long sig_addr; + int i; + + sig_addr = SLOT_ADDRESS (root, slot) + EISA_VENDOR_ID_OFFSET; + + if (!(sig = decode_eisa_sig (sig_addr))) + return -1; /* No EISA device here */ + memcpy (edev->id.sig, sig, EISA_SIG_LEN); edev->slot = slot; + edev->state = inb (SLOT_ADDRESS (root, slot) + EISA_CONFIG_OFFSET) & EISA_CONFIG_ENABLED; edev->base_addr = SLOT_ADDRESS (root, slot); - edev->dma_mask = 0xffffffff; /* Default DMA mask */ + edev->dma_mask = root->dma_mask; /* Default DMA mask */ eisa_name_device (edev); edev->dev.parent = root->dev; edev->dev.bus = &eisa_bus_type; edev->dev.dma_mask = &edev->dma_mask; sprintf (edev->dev.bus_id, "%02X:%02X", root->bus_nr, slot); - edev->res.name = edev->dev.name; + for (i = 0; i < EISA_MAX_RESOURCES; i++) + edev->res[i].name = edev->dev.name; + + if (is_forced_dev (enable_dev, root, edev)) + edev->state = EISA_CONFIG_ENABLED | EISA_CONFIG_FORCED; + + if (is_forced_dev (disable_dev, root, edev)) + edev->state = EISA_CONFIG_FORCED; + + return 0; +} +static int __init eisa_register_device (struct eisa_device *edev) +{ if (device_register (&edev->dev)) return -1; device_create_file (&edev->dev, &dev_attr_signature); + device_create_file (&edev->dev, &dev_attr_enabled); + + return 0; +} + +static int __init eisa_request_resources (struct eisa_root_device *root, + struct eisa_device *edev, + int slot) +{ + int i; + + for (i = 0; i < EISA_MAX_RESOURCES; i++) { + /* Don't register 
resource for slot 0, since this is + * very likely to fail... :-( Instead, grab the EISA + * id, now we can display something in /proc/ioports. + */ + + /* Only one region for mainboard */ + if (!slot && i > 0) { + edev->res[i].start = edev->res[i].end = 0; + continue; + } + + if (slot) { + edev->res[i].name = NULL; + edev->res[i].start = SLOT_ADDRESS (root, slot) + (i * 0x400); + edev->res[i].end = edev->res[i].start + 0xff; + edev->res[i].flags = IORESOURCE_IO; + } else { + edev->res[i].name = NULL; + edev->res[i].start = SLOT_ADDRESS (root, slot) + EISA_VENDOR_ID_OFFSET; + edev->res[i].end = edev->res[i].start + 3; + edev->res[i].flags = IORESOURCE_BUSY; + } + + if (request_resource (root->res, &edev->res[i])) + goto failed; + } return 0; + + failed: + while (--i >= 0) + release_resource (&edev->res[i]); + + return -1; +} + +static void __init eisa_release_resources (struct eisa_device *edev) +{ + int i; + + for (i = 0; i < EISA_MAX_RESOURCES; i++) + if (edev->res[i].start || edev->res[i].end) + release_resource (&edev->res[i]); } static int __init eisa_probe (struct eisa_root_device *root) { int i, c; - char *str; - unsigned long sig_addr; struct eisa_device *edev; printk (KERN_INFO "EISA: Probing bus %d at %s\n", root->bus_nr, root->dev->name); + + /* First try to get hold of slot 0. If there is no device + * here, simply fail, unless root->force_probe is set. */ - for (c = 0, i = 0; i <= root->slots; i++) { + if (!(edev = kmalloc (sizeof (*edev), GFP_KERNEL))) { + printk (KERN_ERR "EISA: Couldn't allocate mainboard slot\n"); + return -ENOMEM; + } + + memset (edev, 0, sizeof (*edev)); + + if (eisa_request_resources (root, edev, 0)) { + printk (KERN_WARNING \ + "EISA: Cannot allocate resource for mainboard\n"); + kfree (edev); + if (!root->force_probe) + return -EBUSY; + goto force_probe; + } + + if (eisa_init_device (root, edev, 0)) { + eisa_release_resources (edev); + kfree (edev); + if (!root->force_probe) + return -ENODEV; + goto force_probe; + } + + printk (KERN_INFO "EISA: Mainboard %s detected.\n", edev->id.sig); + + if (eisa_register_device (edev)) { + printk (KERN_ERR "EISA: Failed to register %s\n", + edev->id.sig); + eisa_release_resources (edev); + kfree (edev); + } + + force_probe: + + for (c = 0, i = 1; i <= root->slots; i++) { if (!(edev = kmalloc (sizeof (*edev), GFP_KERNEL))) { printk (KERN_ERR "EISA: Out of memory for slot %d\n", i); @@ -175,24 +320,7 @@ static int __init eisa_probe (struct eisa_root_device *root) memset (edev, 0, sizeof (*edev)); - /* Don't register resource for slot 0, since this is - * very likely to fail... :-( Instead, grab the EISA - * id, now we can display something in /proc/ioports. 
- */ - - if (i) { - edev->res.name = NULL; - edev->res.start = SLOT_ADDRESS (root, i); - edev->res.end = edev->res.start + 0xfff; - edev->res.flags = IORESOURCE_IO; - } else { - edev->res.name = NULL; - edev->res.start = SLOT_ADDRESS (root, i) + EISA_VENDOR_ID_OFFSET; - edev->res.end = edev->res.start + 3; - edev->res.flags = IORESOURCE_BUSY; - } - - if (request_resource (root->res, &edev->res)) { + if (eisa_request_resources (root, edev, i)) { printk (KERN_WARNING \ "Cannot allocate resource for EISA slot %d\n", i); @@ -200,30 +328,41 @@ static int __init eisa_probe (struct eisa_root_device *root) continue; } - sig_addr = SLOT_ADDRESS (root, i) + EISA_VENDOR_ID_OFFSET; - - if (!(str = decode_eisa_sig (sig_addr))) { - release_resource (&edev->res); + if (eisa_init_device (root, edev, i)) { + eisa_release_resources (edev); kfree (edev); continue; } - if (!i) - printk (KERN_INFO "EISA: Motherboard %s detected\n", - str); - else { - printk (KERN_INFO "EISA: slot %d : %s detected.\n", - i, str); - - c++; + printk (KERN_INFO "EISA: slot %d : %s detected", + i, edev->id.sig); + + switch (edev->state) { + case EISA_CONFIG_ENABLED | EISA_CONFIG_FORCED: + printk (" (forced enabled)"); + break; + + case EISA_CONFIG_FORCED: + printk (" (forced disabled)"); + break; + + case 0: + printk (" (disabled)"); + break; } + + printk (".\n"); - if (eisa_register_device (root, edev, str, i)) { - printk (KERN_ERR "EISA: Failed to register %s\n", str); - release_resource (&edev->res); + c++; + + if (eisa_register_device (edev)) { + printk (KERN_ERR "EISA: Failed to register %s\n", + edev->id.sig); + eisa_release_resources (edev); kfree (edev); } } + printk (KERN_INFO "EISA: Detected %d card%s.\n", c, c == 1 ? "" : "s"); return 0; @@ -274,6 +413,13 @@ static int __init eisa_init (void) return 0; } +/* Couldn't use intarray with checking on... :-( */ +#undef param_check_intarray +#define param_check_intarray(name, p) + +module_param(enable_dev, intarray, 0444); +module_param(disable_dev, intarray, 0444); + postcore_initcall (eisa_init); EXPORT_SYMBOL (eisa_bus_type); diff --git a/include/linux/eisa.h b/include/linux/eisa.h index d77fdf0f9f01..93e4c5503d46 100644 --- a/include/linux/eisa.h +++ b/include/linux/eisa.h @@ -4,6 +4,8 @@ #define EISA_SIG_LEN 8 #define EISA_MAX_SLOTS 8 +#define EISA_MAX_RESOURCES 4 + /* A few EISA constants/offsets... */ #define EISA_DMA1_STATUS 8 @@ -17,6 +19,10 @@ #define EISA_INT1_EDGE_LEVEL 0x4D0 #define EISA_INT2_EDGE_LEVEL 0x4D1 #define EISA_VENDOR_ID_OFFSET 0xC80 +#define EISA_CONFIG_OFFSET 0xC84 + +#define EISA_CONFIG_ENABLED 1 +#define EISA_CONFIG_FORCED 2 /* The EISA signature, in ASCII form, null terminated */ struct eisa_device_id { @@ -26,19 +32,28 @@ struct eisa_device_id { /* There is not much we can say about an EISA device, apart from * signature, slot number, and base address. 
dma_mask is set by - * default to 32 bits.*/ + * default to parent device mask..*/ struct eisa_device { struct eisa_device_id id; int slot; + int state; unsigned long base_addr; - struct resource res; + struct resource res[EISA_MAX_RESOURCES]; u64 dma_mask; struct device dev; /* generic device */ }; #define to_eisa_device(n) container_of(n, struct eisa_device, dev) +static inline int eisa_get_region_index (void *addr) +{ + unsigned long x = (unsigned long) addr; + + x &= 0xc00; + return (x >> 12); +} + struct eisa_driver { const struct eisa_device_id *id_table; struct device_driver driver; @@ -69,6 +84,8 @@ struct eisa_root_device { struct resource *res; unsigned long bus_base_addr; int slots; /* Max slot number */ + int force_probe; /* Probe even when no slot 0 */ + u64 dma_mask; /* from bridge device */ int bus_nr; /* Set by eisa_root_register */ struct resource eisa_root_res; /* ditto */ }; -- cgit v1.2.3 From 3faa61fe2ece423aeda58d42f2b8c998cfb9fa3a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:35:55 -0700 Subject: [PATCH] ipc semaphore optimization From: "Chen, Kenneth W" This patch proposes a performance fix for the current IPC semaphore implementation. There are two shortcomings in the current implementation: try_atomic_semop() is called twice to wake up a blocked process, once from update_queue() (executed by the process doing the waking) and once in the retry part of the blocked process (executed by the blocked process after it is woken up). A second issue is that when several sleeping processes are eligible for wake-up, they are woken in a daisy-chain formation, each one in turn waking up the next process in line. However, every time a process wakes up, it starts scanning the wait queue from the beginning, not from where the previous scan left off. This causes a large amount of unnecessary rescanning when the wait queue is deep: blocked processes come and go, but chances are there are still quite a few blocked processes sitting at the beginning of that queue. What we are proposing here is to merge the portion of the code in the bottom part of sys_semtimedop() (the code that is executed when a sleeping process gets woken up) into the update_queue() function. The benefit is twofold: (1) it reduces redundant calls to try_atomic_semop() and (2) it makes finding eligible processes to wake up more efficient, allowing higher concurrency for multiple wake-ups. We have measured that this patch improves throughput significantly for a large application on an industry-standard benchmark. This patch is relative to 2.5.72. Any feedback is very much appreciated.
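For concreteness, here is a minimal userspace workload of the kind this path serves (illustrative only, not from the patch; run many such processes contending on one semaphore and you get exactly the deep wait queue described above):

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/sem.h>

int main(void)
{
        int semid = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
        struct sembuf up   = { .sem_num = 0, .sem_op =  1, .sem_flg = 0 };
        struct sembuf down = { .sem_num = 0, .sem_op = -1, .sem_flg = 0 };

        if (semid < 0) {
                perror("semget");
                return 1;
        }
        /* post, then take: a taker that blocks here ends up on the
         * sem_pending list and is woken via the update_queue() path
         * discussed above */
        if (semop(semid, &up, 1) < 0 || semop(semid, &down, 1) < 0)
                perror("semop");
        semctl(semid, 0, IPC_RMID);
        return 0;
}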
Some kernel profile data attached: Kernel profile before optimization: ----------------------------------------------- 0.05 0.14 40805/529060 sys_semop [133] 0.55 1.73 488255/529060 ia64_ret_from_syscall [2] [52] 2.5 0.59 1.88 529060 sys_semtimedop [52] 0.05 0.83 477766/817966 schedule_timeout [62] 0.34 0.46 529064/989340 update_queue [61] 0.14 0.00 1006740/6473086 try_atomic_semop [75] 0.06 0.00 529060/989336 ipcperms [149] ----------------------------------------------- 0.30 0.40 460276/989340 semctl_main [68] 0.34 0.46 529064/989340 sys_semtimedop [52] [61] 1.5 0.64 0.87 989340 update_queue [61] 0.75 0.00 5466346/6473086 try_atomic_semop [75] 0.01 0.11 477676/576698 wake_up_process [146] ----------------------------------------------- 0.14 0.00 1006740/6473086 sys_semtimedop [52] 0.75 0.00 5466346/6473086 update_queue [61] [75] 0.9 0.89 0.00 6473086 try_atomic_semop [75] ----------------------------------------------- Kernel profile with optimization: ----------------------------------------------- 0.03 0.05 26139/503178 sys_semop [155] 0.46 0.92 477039/503178 ia64_ret_from_syscall [2] [61] 1.2 0.48 0.97 503178 sys_semtimedop [61] 0.04 0.79 470724/784394 schedule_timeout [62] 0.05 0.00 503178/3301773 try_atomic_semop [109] 0.05 0.00 503178/930934 ipcperms [149] 0.00 0.03 32454/460210 update_queue [99] ----------------------------------------------- 0.00 0.03 32454/460210 sys_semtimedop [61] 0.06 0.36 427756/460210 semctl_main [75] [99] 0.4 0.06 0.39 460210 update_queue [99] 0.30 0.00 2798595/3301773 try_atomic_semop [109] 0.00 0.09 470630/614097 wake_up_process [146] ----------------------------------------------- 0.05 0.00 503178/3301773 sys_semtimedop [61] 0.30 0.00 2798595/3301773 update_queue [99] [109] 0.3 0.35 0.00 3301773 try_atomic_semop [109] -----------------------------------------------=20 Both number of function calls to try_atomic_semop() and update_queue() are reduced by 50% as a result of the merge. Execution time of sys_semtimedop is reduced because of the reduction in the low level functions. --- include/linux/sem.h | 1 - ipc/sem.c | 95 +++++++++++++++++++++-------------------------------- 2 files changed, 38 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sem.h b/include/linux/sem.h index 2821bc07f647..6e13e5efc163 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -109,7 +109,6 @@ struct sem_queue { int id; /* internal sem id */ struct sembuf * sops; /* array of pending operations */ int nsops; /* number of operations */ - int alter; /* operation will alter semaphore */ }; /* Each task has a list of undo requests. They are executed automatically diff --git a/ipc/sem.c b/ipc/sem.c index 07d9a2e054b7..d1a54864f753 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -49,6 +49,10 @@ * increase. If there are decrement operations in the operations * array we do the same as before. * + * With the incarnation of O(1) scheduler, it becomes unnecessary to perform + * check/retry algorithm for waking up blocked processes as the new scheduler + * is better at handling thread switch than the old one. 
+ * * /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie * * SMP-threaded, sysctl's added @@ -258,8 +262,7 @@ static inline void remove_from_queue (struct sem_array * sma, */ static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops, - int nsops, struct sem_undo *un, int pid, - int do_undo) + int nsops, struct sem_undo *un, int pid) { int result, sem_op; struct sembuf *sop; @@ -289,10 +292,6 @@ static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops, curr->semval = result; } - if (do_undo) { - result = 0; - goto undo; - } sop--; while (sop >= sops) { sma->sem_base[sop->sem_num].sempid = pid; @@ -334,23 +333,14 @@ static void update_queue (struct sem_array * sma) for (q = sma->sem_pending; q; q = q->next) { - if (q->status == 1) - continue; /* this one was woken up before */ - error = try_atomic_semop(sma, q->sops, q->nsops, - q->undo, q->pid, q->alter); + q->undo, q->pid); /* Does q->sleeper still need to sleep? */ if (error <= 0) { - /* Found one, wake it up */ - wake_up_process(q->sleeper); - if (error == 0 && q->alter) { - /* if q-> alter let it self try */ - q->status = 1; - return; - } q->status = error; remove_from_queue(sma,q); + wake_up_process(q->sleeper); } } } @@ -1062,7 +1052,7 @@ retry_undos: if (error) goto out_unlock_free; - error = try_atomic_semop (sma, sops, nsops, un, current->pid, 0); + error = try_atomic_semop (sma, sops, nsops, un, current->pid); if (error <= 0) goto update; @@ -1075,55 +1065,46 @@ retry_undos: queue.nsops = nsops; queue.undo = un; queue.pid = current->pid; - queue.alter = decrease; queue.id = semid; if (alter) append_to_queue(sma ,&queue); else prepend_to_queue(sma ,&queue); - for (;;) { - queue.status = -EINTR; - queue.sleeper = current; - current->state = TASK_INTERRUPTIBLE; - sem_unlock(sma); + queue.status = -EINTR; + queue.sleeper = current; + current->state = TASK_INTERRUPTIBLE; + sem_unlock(sma); - if (timeout) - jiffies_left = schedule_timeout(jiffies_left); - else - schedule(); + if (timeout) + jiffies_left = schedule_timeout(jiffies_left); + else + schedule(); - sma = sem_lock(semid); - if(sma==NULL) { - if(queue.prev != NULL) - BUG(); - error = -EIDRM; - goto out_free; - } - /* - * If queue.status == 1 we where woken up and - * have to retry else we simply return. - * If an interrupt occurred we have to clean up the - * queue - * - */ - if (queue.status == 1) - { - error = try_atomic_semop (sma, sops, nsops, un, - current->pid,0); - if (error <= 0) - break; - } else { - error = queue.status; - if (error == -EINTR && timeout && jiffies_left == 0) - error = -EAGAIN; - if (queue.prev) /* got Interrupt */ - break; - /* Everything done by update_queue */ - goto out_unlock_free; - } + sma = sem_lock(semid); + if(sma==NULL) { + if(queue.prev != NULL) + BUG(); + error = -EIDRM; + goto out_free; + } + + /* + * If queue.status != -EINTR we are woken up by another process + */ + error = queue.status; + if (queue.status != -EINTR) { + goto out_unlock_free; } + + /* + * If an interrupt occurred we have to clean up the queue + */ + if (timeout && jiffies_left == 0) + error = -EAGAIN; remove_from_queue(sma,&queue); + goto out_unlock_free; + update: if (alter) update_queue (sma); -- cgit v1.2.3 From 3abbd8ff39f3da75117a35ac50020818ff3ef7a6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:36:03 -0700 Subject: [PATCH] bring back the batch_requests function From: Nick Piggin The batch_requests function got lost during the merge of the dynamic request allocation patch. 
We need it for the anticipatory scheduler - when the number of threads exceeds the number of requests, the anticipated-upon task will undesirably sleep in get_request_wait(). And apparently some block devices which use small requests need it so they string a decent number together. Jens has acked this patch. --- drivers/block/ll_rw_blk.c | 34 ++++++++++++++++++++++++++++++---- include/linux/blkdev.h | 1 + 2 files changed, 31 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 0f11567e5277..dfd489ea0234 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -51,6 +51,11 @@ static int queue_nr_requests; unsigned long blk_max_low_pfn, blk_max_pfn; static wait_queue_head_t congestion_wqh[2]; +static inline int batch_requests(void) +{ + return min(BLKDEV_MAX_RQ / 8, 8); +} + /* * Return the threshold (number of free requests) at which the queue is * considered to be congested. It include a little hysteresis to keep the @@ -1180,6 +1185,8 @@ static int blk_init_free_list(request_queue_t *q) struct request_list *rl = &q->rq; rl->count[READ] = rl->count[WRITE] = 0; + init_waitqueue_head(&rl->wait[READ]); + init_waitqueue_head(&rl->wait[WRITE]); rl->rq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, request_cachep); @@ -1325,18 +1332,33 @@ out: } /* - * No available requests for this queue, unplug the device. + * No available requests for this queue, unplug the device and wait for some + * requests to become available. */ static struct request *get_request_wait(request_queue_t *q, int rw) { + DEFINE_WAIT(wait); struct request *rq; generic_unplug_device(q); do { rq = get_request(q, rw, GFP_NOIO); - if (!rq) - blk_congestion_wait(rw, HZ / 50); + if (!rq) { + struct request_list *rl = &q->rq; + + prepare_to_wait_exclusive(&rl->wait[rw], &wait, + TASK_UNINTERRUPTIBLE); + /* + * If _all_ the requests were suddenly returned then + * no wakeup will be delivered. So now we're on the + * waitqueue, go check for that. + */ + rq = get_request(q, rw, GFP_ATOMIC & ~__GFP_HIGH); + if (!rq) + io_schedule(); + finish_wait(&rl->wait[rw], &wait); + } } while (!rq); return rq; @@ -1498,8 +1520,12 @@ void __blk_put_request(request_queue_t *q, struct request *req) blk_free_request(q, req); rl->count[rw]--; - if ((BLKDEV_MAX_RQ - rl->count[rw]) >= queue_congestion_off_threshold()) + if ((BLKDEV_MAX_RQ - rl->count[rw]) >= + queue_congestion_off_threshold()) clear_queue_congested(q, rw); + if ((BLKDEV_MAX_RQ - rl->count[rw]) >= batch_requests() && + waitqueue_active(&rl->wait[rw])) + wake_up(&rl->wait[rw]); } } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 786ea3563752..621a5b042a9c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,6 +27,7 @@ struct request_pm_state; struct request_list { int count[2]; mempool_t *rq_pool; + wait_queue_head_t wait[2]; }; /* -- cgit v1.2.3 From 33c664854c9c467f4c30fe038c2afa12cc126311 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:36:09 -0700 Subject: [PATCH] Create `kblockd' workqueue keventd is inappropriate for running block request queues because keventd itself can get blocked on disk I/O. Via call_usermodehelper()'s vfork and, presumably, GFP_KERNEL allocations. So create a new gang of kernel threads whose mandate is running low-level disk operations. It must never block on disk IO, so any memory allocations should be GFP_NOIO.
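For readers unfamiliar with workqueues, a minimal sketch of the pattern follows (hypothetical names throughout; the calls are the era's workqueue API, but this block is illustrative and not part of the patch):

#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;

static void my_diskwork(void *data)
{
        /* runs in a dedicated kernel thread, in process context;
         * allocations here should be GFP_NOIO for the reason above */
}

static DECLARE_WORK(my_work, my_diskwork, NULL);

static int __init my_init(void)
{
        my_wq = create_workqueue("mywq");       /* one thread per CPU */
        if (!my_wq)
                return -ENOMEM;
        queue_work(my_wq, &my_work);            /* cf. kblockd_schedule_work() */
        flush_workqueue(my_wq);                 /* cf. kblockd_flush() */
        return 0;
}

module_init(my_init);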
We mainly use it for running unplug operations from interrupt context. --- drivers/block/Makefile | 5 +++++ drivers/block/ll_rw_blk.c | 24 ++++++++++++++++++++++-- include/linux/blkdev.h | 4 ++++ 3 files changed, 31 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/Makefile b/drivers/block/Makefile index c723e8ecc584..67c567bc9308 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -8,6 +8,11 @@ # In the future, some of these should be built conditionally. # +# +# NOTE that ll_rw_blk.c must come early in linkage order - it starts the +# kblockd threads +# + obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o deadline-iosched.o obj-$(CONFIG_MAC_FLOPPY) += swim3.o diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index dfd489ea0234..3e68ceb9578c 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -48,9 +48,15 @@ static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; */ static int queue_nr_requests; -unsigned long blk_max_low_pfn, blk_max_pfn; static wait_queue_head_t congestion_wqh[2]; +/* + * Controlling structure to kblockd + */ +static struct workqueue_struct *kblockd_workqueue; + +unsigned long blk_max_low_pfn, blk_max_pfn; + static inline int batch_requests(void) { return min(BLKDEV_MAX_RQ / 8, 8); @@ -2308,10 +2314,24 @@ void blk_rq_prep_restart(struct request *rq) rq->current_nr_sectors = rq->hard_cur_sectors; } +int kblockd_schedule_work(struct work_struct *work) +{ + return queue_work(kblockd_workqueue, work); +} + +void kblockd_flush(void) +{ + flush_workqueue(kblockd_workqueue); +} + int __init blk_dev_init(void) { int i; + kblockd_workqueue = create_workqueue("kblockd"); + if (!kblockd_workqueue) + panic("Failed to create kblockd\n"); + request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), 0, 0, NULL, NULL); if (!request_cachep) @@ -2331,7 +2351,7 @@ int __init blk_dev_init(void) for (i = 0; i < ARRAY_SIZE(congestion_wqh); i++) init_waitqueue_head(&congestion_wqh[i]); return 0; -}; +} EXPORT_SYMBOL(process_that_request_first); EXPORT_SYMBOL(end_that_request_first); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 621a5b042a9c..e97790517973 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -561,6 +561,10 @@ static inline void put_dev_sector(Sector p) page_cache_release(p.v); } +struct work_struct; +int kblockd_schedule_work(struct work_struct *work); +void kblockd_flush(void); + #ifdef CONFIG_LBD # include # define sector_div(a, b) do_div(a, b) -- cgit v1.2.3 From 7d2483a936d9012f0a8ed9b61e191d6b827a66d2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:36:16 -0700 Subject: [PATCH] elv_may_queue() API function Introduces the elv_may_queue() predicate with which the IO scheduler may tell the generic request layer that we may add another request to this queue. It is used by the CFQ elevator. 
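A rough sketch of how a scheduler might use the hook (a hypothetical example scheduler with hypothetical bookkeeping, not CFQ's actual logic):

#include <linux/blkdev.h>
#include <asm/atomic.h>

/* Returning 0 makes get_request() fail exactly as though the request
 * list were exhausted; nr_pending is hypothetical per-direction
 * accounting maintained elsewhere in the scheduler. */
static atomic_t nr_pending[2];

static int example_may_queue(request_queue_t *q, int rw)
{
        return atomic_read(&nr_pending[rw]) < 64;
}

static elevator_t iosched_example = {
        .elevator_may_queue_fn  = example_may_queue,
        /* a real scheduler also fills in the merge/dispatch hooks */
};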
--- drivers/block/elevator.c | 10 ++++++++++ drivers/block/ll_rw_blk.c | 2 +- include/linux/elevator.h | 5 +++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 00b56d022b0e..bf40a06781d5 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -361,6 +361,16 @@ void elv_put_request(request_queue_t *q, struct request *rq) e->elevator_put_req_fn(q, rq); } +int elv_may_queue(request_queue_t *q, int rw) +{ + elevator_t *e = &q->elevator; + + if (e->elevator_may_queue_fn) + return e->elevator_may_queue_fn(q, rw); + + return 1; +} + int elv_register_queue(struct gendisk *disk) { request_queue_t *q = disk->queue; diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 3e68ceb9578c..1debfebc2f57 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1294,7 +1294,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) struct request_list *rl = &q->rq; spin_lock_irq(q->queue_lock); - if (rl->count[rw] == BLKDEV_MAX_RQ) { + if (rl->count[rw] == BLKDEV_MAX_RQ || !elv_may_queue(q, rw)) { spin_unlock_irq(q->queue_lock); goto out; } diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 522e51609ef3..66bedb242218 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -15,6 +15,8 @@ typedef int (elevator_queue_empty_fn) (request_queue_t *); typedef void (elevator_remove_req_fn) (request_queue_t *, struct request *); typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *); typedef struct list_head *(elevator_get_sort_head_fn) (request_queue_t *, struct request *); +typedef int (elevator_may_queue_fn) (request_queue_t *, int); + typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int); typedef void (elevator_put_req_fn) (request_queue_t *, struct request *); @@ -39,6 +41,8 @@ struct elevator_s elevator_set_req_fn *elevator_set_req_fn; elevator_put_req_fn *elevator_put_req_fn; + elevator_may_queue_fn *elevator_may_queue_fn; + elevator_init_fn *elevator_init_fn; elevator_exit_fn *elevator_exit_fn; @@ -64,6 +68,7 @@ extern struct request *elv_former_request(request_queue_t *, struct request *); extern struct request *elv_latter_request(request_queue_t *, struct request *); extern int elv_register_queue(struct gendisk *); extern void elv_unregister_queue(struct gendisk *); +extern int elv_may_queue(request_queue_t *, int); extern int elv_set_request(request_queue_t *, struct request *, int); extern void elv_put_request(request_queue_t *, struct request *); -- cgit v1.2.3 From 104e6fdc6f35ea08e1c6ed03158b336b2e9983ed Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:36:23 -0700 Subject: [PATCH] elevator completion API From: Nick Piggin Introduces an elevator_completed_req() callback with which the generic queueing layer may tell an IO scheduler that a particular request has finished.
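Continuing the hypothetical sketch from the previous patch, the completion callback is where such a scheduler would drop its per-request accounting (the matching increment would live in its set_req hook, and the function would be wired into iosched_example via .elevator_completed_req_fn):

static void example_completed_request(request_queue_t *q, struct request *rq)
{
        /* rq_data_dir() yields the direction (READ or WRITE) */
        atomic_dec(&nr_pending[rq_data_dir(rq)]);
}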
---
 drivers/block/elevator.c  | 9 +++++++++
 drivers/block/ll_rw_blk.c | 2 ++
 include/linux/elevator.h  | 3 +++
 3 files changed, 14 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c
index bf40a06781d5..406755724e03 100644
--- a/drivers/block/elevator.c
+++ b/drivers/block/elevator.c
@@ -371,6 +371,14 @@ int elv_may_queue(request_queue_t *q, int rw)
 	return 1;
 }
 
+void elv_completed_request(request_queue_t *q, struct request *rq)
+{
+	elevator_t *e = &q->elevator;
+
+	if (e->elevator_completed_req_fn)
+		e->elevator_completed_req_fn(q, rq);
+}
+
 int elv_register_queue(struct gendisk *disk)
 {
 	request_queue_t *q = disk->queue;
@@ -418,5 +426,6 @@ EXPORT_SYMBOL(__elv_add_request);
 EXPORT_SYMBOL(elv_next_request);
 EXPORT_SYMBOL(elv_remove_request);
 EXPORT_SYMBOL(elv_queue_empty);
+EXPORT_SYMBOL(elv_completed_request);
 EXPORT_SYMBOL(elevator_exit);
 EXPORT_SYMBOL(elevator_init);
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 1debfebc2f57..71750da0db6d 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -1510,6 +1510,8 @@ void __blk_put_request(request_queue_t *q, struct request *req)
 	if (unlikely(--req->ref_count))
 		return;
 
+	elv_completed_request(req->q, req);
+
 	req->rq_status = RQ_INACTIVE;
 	req->q = NULL;
 	req->rl = NULL;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 66bedb242218..07de69c1ef8a 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -15,6 +15,7 @@ typedef int (elevator_queue_empty_fn) (request_queue_t *);
 typedef void (elevator_remove_req_fn) (request_queue_t *, struct request *);
 typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *);
 typedef struct list_head *(elevator_get_sort_head_fn) (request_queue_t *, struct request *);
+typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *);
 typedef int (elevator_may_queue_fn) (request_queue_t *, int);
 
 typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int);
 typedef void (elevator_put_req_fn) (request_queue_t *, struct request *);
@@ -34,6 +35,7 @@ struct elevator_s
 	elevator_remove_req_fn *elevator_remove_req_fn;
 	elevator_queue_empty_fn *elevator_queue_empty_fn;
+	elevator_completed_req_fn *elevator_completed_req_fn;
 	elevator_request_list_fn *elevator_former_req_fn;
 	elevator_request_list_fn *elevator_latter_req_fn;
@@ -69,6 +71,7 @@ extern struct request *elv_latter_request(request_queue_t *, struct request *);
 extern int elv_register_queue(struct gendisk *);
 extern void elv_unregister_queue(struct gendisk *);
 extern int elv_may_queue(request_queue_t *, int);
+extern void elv_completed_request(request_queue_t *, struct request *);
 extern int elv_set_request(request_queue_t *, struct request *, int);
 extern void elv_put_request(request_queue_t *, struct request *);
--
cgit v1.2.3


From 97ff29c22ec3df25621561194692e7e945fcf489 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Fri, 4 Jul 2003 19:36:30 -0700
Subject: [PATCH] anticipatory I/O scheduler

From: Nick Piggin

This is the core anticipatory IO scheduler.  There are nearly 100
changesets in this and five months' work.  I really cannot describe it
fully here.

Major points:

- It works by recognising that reads are dependent: we don't know where
  the next read will occur, but it's probably close by the previous one.
  So once a read has completed we leave the disk idle, anticipating that
  a request for a nearby read will come in.

- There is read batching and write batching logic.
  - when we're servicing a batch of writes we will refuse to seek away
    for a read for some tens of milliseconds.  Then the write stream is
    preempted.

  - when we're servicing a batch of reads (via anticipation) we'll do
    that for some tens of milliseconds, then preempt.

- There are request deadlines, for latency and fairness.  The oldest
  outstanding request is examined at regular intervals.  If this request
  is older than a specific deadline, it will be the next one dispatched.
  This gives a good fairness heuristic while being simple, because
  processes tend to have localised IO.

Just about all of the rest of the complexity involves an array of fixups
which prevent most of the obvious failure modes with anticipation: trying
not to leave the disk head pointlessly idle.  Some of these algorithms
are:

- Process tracking.  If the process whose read we are anticipating
  submits a write, abandon anticipation.

- Process exit tracking.  If the process whose read we are anticipating
  exits, abandon anticipation.

- Process IO history.  We accumulate statistical info on the process's
  recent IO patterns to aid in making decisions about how long to
  anticipate new reads.

  Currently thinktime and seek distance are tracked.  Thinktime is the
  time between when a process's last request has completed and when it
  submits another one.  Seek distance is simply the number of sectors
  between each read request.  If either statistic becomes too high, then
  it isn't anticipated that the process will submit another read.

The above all means that we need a per-process "io context".  This is a
fully refcounted structure.  In this patch it is AS-only.  Later we
generalise it a little so other IO schedulers could use the same
framework.

- Requests are grouped as synchronous and asynchronous, whereas the
  deadline scheduler groups requests as reads and writes.  This can
  provide better sync write performance, and may give better
  responsiveness with journalling filesystems (although we haven't done
  that yet).

  We currently detect synchronous writes by nastily setting PF_SYNCWRITE
  in current->flags.  The plan is to remove this later, and to propagate
  the sync hint from writeback_control.sync_mode into bio->bi_flags,
  thence into request->flags.  Once that is done, direct-io needs to set
  the BIO sync hint as well.

- Quite a bit of complexity has also gone into bashing TCQ into
  submission.  Timing for a read batch is not started until the first
  read request actually completes.  A read batch also does not start
  until all outstanding writes have completed.

AS is the default IO scheduler.  deadline may be chosen by booting with
"elevator=deadline".

There are a few reasons for retaining deadline:

- AS is often slower than deadline in random IO loads with large TCQ
  windows.  The usual real world task here is OLTP database loads.

- deadline is presumably more stable.

- deadline is much simpler.

The tunable per-queue entries under /sys/block/*/iosched/ are all in
milliseconds:

* read_expire

  Controls how long until a request becomes "expired".  It also controls
  the interval between which expired requests are served, so if set to 50,
  a request might take anywhere under 100ms to be serviced _if_ it is the
  next on the expired list.  Obviously it can't make the disk go faster.
  The result is basically the timeslice a reader gets in the presence of
  other IO.  100*((seek time / read_expire) + 1) is very roughly the %
  streaming read efficiency your disk should get in the presence of
  multiple readers.
* read_batch_expire Controls how much time a batch of reads is given before pending writes are served. Higher value is more efficient. Shouldn't really be below read_expire. * write_ versions of the above * antic_expire Controls the maximum amount of time we can anticipate a good read before giving up. Many other factors may cause anticipation to be stopped early, or some processes will not be "anticipated" at all. Should be a bit higher for big seek time devices though not a linear correspondance - most processes have only a few ms thinktime. --- drivers/block/Makefile | 3 +- drivers/block/as-iosched.c | 1888 ++++++++++++++++++++++++++++++++++++++++++++ drivers/block/ll_rw_blk.c | 25 +- fs/buffer.c | 4 + fs/fs-writeback.c | 2 + include/linux/elevator.h | 5 + include/linux/sched.h | 5 + kernel/exit.c | 2 + kernel/fork.c | 1 + 9 files changed, 1932 insertions(+), 3 deletions(-) create mode 100644 drivers/block/as-iosched.c (limited to 'include/linux') diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 67c567bc9308..4733ec79a6d0 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -13,7 +13,8 @@ # kblockd threads # -obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o deadline-iosched.o +obj-y := elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o \ + deadline-iosched.o as-iosched.o obj-$(CONFIG_MAC_FLOPPY) += swim3.o obj-$(CONFIG_BLK_DEV_FD) += floppy.o diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c new file mode 100644 index 000000000000..e6af1f822630 --- /dev/null +++ b/drivers/block/as-iosched.c @@ -0,0 +1,1888 @@ +/* + * linux/drivers/block/as-iosched.c + * + * Anticipatory & deadline i/o scheduler. + * + * Copyright (C) 2002 Jens Axboe + * Nick Piggin + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define REQ_SYNC 1 +#define REQ_ASYNC 0 + +/* + * See Documentation/as-iosched.txt + */ + +/* + * max time before a read is submitted. + */ +#define default_read_expire (HZ / 20) + +/* + * ditto for writes, these limits are not hard, even + * if the disk is capable of satisfying them. + */ +#define default_write_expire (HZ / 5) + +/* + * read_batch_expire describes how long we will allow a stream of reads to + * persist before looking to see whether it is time to switch over to writes. + */ +#define default_read_batch_expire (HZ / 5) + +/* + * write_batch_expire describes how long we want a stream of writes to run for. + * This is not a hard limit, but a target we set for the auto-tuning thingy. + * See, the problem is: we can send a lot of writes to disk cache / TCQ in + * a short amount of time... + */ +#define default_write_batch_expire (HZ / 20) + +/* + * max time we may wait to anticipate a read (default around 6ms) + */ +#define default_antic_expire ((HZ / 150) ? HZ / 150 : 1) + +/* + * This is the per-process anticipatory I/O scheduler state. It is refcounted + * and kmalloc'ed. + * + * There is no locking protecting the contents of this structure! Pointers + * to a single as_io_context may appear in multiple queues at once. + */ + +/* + * Keep track of up to 20ms thinktimes. We can go as big as we like here, + * however huge values tend to interfere and not decay fast enough. A program + * might be in a non-io phase of operation. Waiting on user input for example, + * or doing a lengthy computation. A small penalty can be justified there, and + * will still catch out those processes that constantly have large thinktimes. 
+ */ +#define MAX_THINKTIME (HZ/50UL) + +/* Bits in as_io_context.state */ +enum as_io_states { + AS_TASK_RUNNING=0, /* Process has not exitted */ + AS_TASK_IORUNNING, /* Process has completed some IO */ +}; + +struct as_io_context { + atomic_t refcount; + pid_t pid; + unsigned long state; + atomic_t nr_queued; /* queued reads & sync writes */ + atomic_t nr_dispatched; /* number of requests gone to the drivers */ + + spinlock_t lock; + + /* IO History tracking */ + /* Thinktime */ + unsigned long last_end_request; + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; + /* Layout pattern */ + long seek_samples; + sector_t last_request_pos; + sector_t seek_total; + sector_t seek_mean; +}; + +enum anticipation_status { + ANTIC_OFF=0, /* Not anticipating (normal operation) */ + ANTIC_WAIT_REQ, /* The last read has not yet completed */ + ANTIC_WAIT_NEXT, /* Currently anticipating a request vs + last read (which has completed) */ + ANTIC_FINISHED, /* Anticipating but have found a candidate + * or timed out */ +}; + +struct as_data { + /* + * run time data + */ + + struct request_queue *q; /* the "owner" queue */ + + /* + * requests (as_rq s) are present on both sort_list and fifo_list + */ + struct rb_root sort_list[2]; + struct list_head fifo_list[2]; + + struct as_rq *next_arq[2]; /* next in sort order */ + sector_t last_sector[2]; /* last REQ_SYNC & REQ_ASYNC sectors */ + struct list_head *dispatch; /* driver dispatch queue */ + struct list_head *hash; /* request hash */ + unsigned long hash_valid_count; /* barrier hash count */ + unsigned long current_batch_expires; + unsigned long last_check_fifo[2]; + int changed_batch; + int batch_data_dir; /* current batch REQ_SYNC / REQ_ASYNC */ + int write_batch_count; /* max # of reqs in a write batch */ + int current_write_count; /* how many requests left this batch */ + int write_batch_idled; /* has the write batch gone idle? */ + mempool_t *arq_pool; + + enum anticipation_status antic_status; + unsigned long antic_start; /* jiffies: when it started */ + struct timer_list antic_timer; /* anticipatory scheduling timer */ + struct work_struct antic_work; /* Deferred unplugging */ + struct as_io_context *as_io_context;/* Identify the expected process */ + int aic_finished; /* IO associated with as_io_context finished */ + int nr_dispatched; + + /* + * settings that change how the i/o scheduler behaves + */ + unsigned long fifo_expire[2]; + unsigned long batch_expire[2]; + unsigned long antic_expire; +}; + +#define list_entry_fifo(ptr) list_entry((ptr), struct as_rq, fifo) + +/* + * per-request data. + */ +enum arq_state { + AS_RQ_NEW=0, /* New - not referenced and not on any lists */ + AS_RQ_QUEUED, /* In the request queue. It belongs to the + scheduler */ + AS_RQ_DISPATCHED, /* On the dispatch list. 
It belongs to the + driver now */ +}; + +struct as_rq { + /* + * rbtree index, key is the starting offset + */ + struct rb_node rb_node; + sector_t rb_key; + + struct request *request; + + struct as_io_context *as_io_context; /* The submitting task */ + + /* + * request hash, key is the ending offset (for back merge lookup) + */ + struct list_head hash; + unsigned long hash_valid_count; + + /* + * expire fifo + */ + struct list_head fifo; + unsigned long expires; + + int is_sync; + enum arq_state state; /* debug only */ +}; + +#define RQ_DATA(rq) ((struct as_rq *) (rq)->elevator_private) + +static kmem_cache_t *arq_pool; + +/* + * IO Context helper functions + */ +/* Debug */ +static atomic_t nr_as_io_requests = ATOMIC_INIT(0); + +static void put_as_io_context(struct as_io_context **paic) +{ + struct as_io_context *aic = *paic; + + if (aic == NULL) + return; + + BUG_ON(atomic_read(&aic->refcount) == 0); + + if (atomic_dec_and_test(&aic->refcount)) { + WARN_ON(atomic_read(&nr_as_io_requests) == 0); + atomic_dec(&nr_as_io_requests); + kfree(aic); + } +} + +/* Called by the exitting task */ +void exit_as_io_context(void) +{ + unsigned long flags; + struct as_io_context *aic; + + local_irq_save(flags); + aic = current->as_io_context; + if (aic) { + clear_bit(AS_TASK_RUNNING, &aic->state); + put_as_io_context(&aic); + current->as_io_context = NULL; + } + local_irq_restore(flags); +} + +/* + * If the current task has no IO context then create one and initialise it. + * If it does have a context, take a ref on it. + * + * This is always called in the context of the task which submitted the I/O. + * But weird things happen, so we disable local interrupts to ensure exclusive + * access to *current. + */ +static struct as_io_context *get_as_io_context(void) +{ + struct task_struct *tsk = current; + unsigned long flags; + struct as_io_context *ret; + + local_irq_save(flags); + ret = tsk->as_io_context; + if (ret == NULL) { + ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + if (ret) { + atomic_inc(&nr_as_io_requests); + atomic_set(&ret->refcount, 1); + ret->pid = tsk->pid; + ret->state = 1 << AS_TASK_RUNNING; + atomic_set(&ret->nr_queued, 0); + atomic_set(&ret->nr_dispatched, 0); + spin_lock_init(&ret->lock); + ret->ttime_total = 0; + ret->ttime_samples = 0; + ret->ttime_mean = 0; + ret->seek_total = 0; + ret->seek_samples = 0; + ret->seek_mean = 0; + tsk->as_io_context = ret; + } + } + local_irq_restore(flags); + atomic_inc(&ret->refcount); + return ret; +} + +static void +copy_as_io_context(struct as_io_context **pdst, struct as_io_context **psrc) +{ + struct as_io_context *src = *psrc; + + if (src) { + BUG_ON(atomic_read(&src->refcount) == 0); + atomic_inc(&src->refcount); + put_as_io_context(pdst); + *pdst = src; + } +} + +static void +swap_as_io_context(struct as_io_context **aic1, struct as_io_context **aic2) +{ + struct as_io_context *temp; + temp = *aic1; + *aic1 = *aic2; + *aic2 = temp; +} + +/* + * the back merge hash support functions + */ +static const int as_hash_shift = 6; +#define AS_HASH_BLOCK(sec) ((sec) >> 3) +#define AS_HASH_FN(sec) (hash_long(AS_HASH_BLOCK((sec)), as_hash_shift)) +#define AS_HASH_ENTRIES (1 << as_hash_shift) +#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) +#define list_entry_hash(ptr) list_entry((ptr), struct as_rq, hash) +#define ON_HASH(arq) (arq)->hash_valid_count + +#define AS_INVALIDATE_HASH(ad) \ + do { \ + if (!++(ad)->hash_valid_count) \ + (ad)->hash_valid_count = 1; \ + } while (0) + +static inline void __as_del_arq_hash(struct as_rq *arq) +{ + 
arq->hash_valid_count = 0; + list_del_init(&arq->hash); +} + +static inline void as_del_arq_hash(struct as_rq *arq) +{ + if (ON_HASH(arq)) + __as_del_arq_hash(arq); +} + +static void as_remove_merge_hints(request_queue_t *q, struct as_rq *arq) +{ + as_del_arq_hash(arq); + + if (q->last_merge == &arq->request->queuelist) + q->last_merge = NULL; +} + +static void as_add_arq_hash(struct as_data *ad, struct as_rq *arq) +{ + struct request *rq = arq->request; + + BUG_ON(ON_HASH(arq)); + + arq->hash_valid_count = ad->hash_valid_count; + list_add(&arq->hash, &ad->hash[AS_HASH_FN(rq_hash_key(rq))]); +} + +/* + * move hot entry to front of chain + */ +static inline void as_hot_arq_hash(struct as_data *ad, struct as_rq *arq) +{ + struct request *rq = arq->request; + struct list_head *head = &ad->hash[AS_HASH_FN(rq_hash_key(rq))]; + + if (!ON_HASH(arq)) { + WARN_ON(1); + return; + } + + if (arq->hash.prev != head) { + list_del(&arq->hash); + list_add(&arq->hash, head); + } +} + +static struct request *as_find_arq_hash(struct as_data *ad, sector_t offset) +{ + struct list_head *hash_list = &ad->hash[AS_HASH_FN(offset)]; + struct list_head *entry, *next = hash_list->next; + + while ((entry = next) != hash_list) { + struct as_rq *arq = list_entry_hash(entry); + struct request *__rq = arq->request; + + next = entry->next; + + BUG_ON(!ON_HASH(arq)); + + if (!rq_mergeable(__rq) + || arq->hash_valid_count != ad->hash_valid_count) { + __as_del_arq_hash(arq); + continue; + } + + if (rq_hash_key(__rq) == offset) + return __rq; + } + + return NULL; +} + +/* + * rb tree support functions + */ +#define RB_NONE (2) +#define RB_EMPTY(root) ((root)->rb_node == NULL) +#define ON_RB(node) ((node)->rb_color != RB_NONE) +#define RB_CLEAR(node) ((node)->rb_color = RB_NONE) +#define rb_entry_arq(node) rb_entry((node), struct as_rq, rb_node) +#define ARQ_RB_ROOT(ad, arq) (&(ad)->sort_list[(arq)->is_sync]) +#define rq_rb_key(rq) (rq)->sector + +/* + * as_find_first_arq finds the first (lowest sector numbered) request + * for the specified data_dir. Used to sweep back to the start of the disk + * (1-way elevator) after we process the last (highest sector) request. + */ +static struct as_rq *as_find_first_arq(struct as_data *ad, int data_dir) +{ + struct rb_node *n = ad->sort_list[data_dir].rb_node; + + if (n == NULL) + return NULL; + + for (;;) { + if (n->rb_left == NULL) + return rb_entry_arq(n); + + n = n->rb_left; + } +} + +static struct as_rq *__as_add_arq_rb(struct as_data *ad, struct as_rq *arq) +{ + struct rb_node **p = &ARQ_RB_ROOT(ad, arq)->rb_node; + struct rb_node *parent = NULL; + struct as_rq *__arq; + + while (*p) { + parent = *p; + __arq = rb_entry_arq(parent); + + if (arq->rb_key < __arq->rb_key) + p = &(*p)->rb_left; + else if (arq->rb_key > __arq->rb_key) + p = &(*p)->rb_right; + else + return __arq; + } + + rb_link_node(&arq->rb_node, parent, p); + return 0; +} + +static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq); +/* + * Add the request to the rb tree if it is unique. If there is an alias (an + * existing request against the same sector), which can happen when using + * direct IO, then move the alias to the dispatch list and then add the + * request. 
+ */ +static void as_add_arq_rb(struct as_data *ad, struct as_rq *arq) +{ + struct as_rq *alias; + struct request *rq = arq->request; + + arq->rb_key = rq_rb_key(rq); + + /* This can be caused by direct IO */ + while ((alias = __as_add_arq_rb(ad, arq))) + as_move_to_dispatch(ad, alias); + + rb_insert_color(&arq->rb_node, ARQ_RB_ROOT(ad, arq)); +} + +static inline void as_del_arq_rb(struct as_data *ad, struct as_rq *arq) +{ + if (!ON_RB(&arq->rb_node)) { + WARN_ON(1); + return; + } + + rb_erase(&arq->rb_node, ARQ_RB_ROOT(ad, arq)); + RB_CLEAR(&arq->rb_node); +} + +static struct request * +as_find_arq_rb(struct as_data *ad, sector_t sector, int data_dir) +{ + struct rb_node *n = ad->sort_list[data_dir].rb_node; + struct as_rq *arq; + + while (n) { + arq = rb_entry_arq(n); + + if (sector < arq->rb_key) + n = n->rb_left; + else if (sector > arq->rb_key) + n = n->rb_right; + else + return arq->request; + } + + return NULL; +} + +/* + * IO Scheduler proper + */ + +#define MAXBACK (1024 * 1024) /* + * Maximum distance the disk will go backward + * for a request. + */ + +/* + * as_choose_req selects the preferred one of two requests of the same data_dir + * ignoring time - eg. timeouts, which is the job of as_dispatch_request + */ +static struct as_rq * +as_choose_req(struct as_data *ad, struct as_rq *arq1, struct as_rq *arq2) +{ + int data_dir; + sector_t last, s1, s2, d1, d2; + int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */ + const sector_t maxback = MAXBACK; + + if (arq1 == NULL || arq1 == arq2) + return arq2; + if (arq2 == NULL) + return arq1; + + data_dir = arq1->is_sync; + + last = ad->last_sector[data_dir]; + s1 = arq1->request->sector; + s2 = arq2->request->sector; + + BUG_ON(data_dir != arq2->is_sync); + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. + */ + if (s1 >= last) + d1 = s1 - last; + else if (s1+maxback >= last) + d1 = (last - s1)*2; + else { + r1_wrap = 1; + d1 = 0; /* shut up, gcc */ + } + + if (s2 >= last) + d2 = s2 - last; + else if (s2+maxback >= last) + d2 = (last - s2)*2; + else { + r2_wrap = 1; + d2 = 0; + } + + /* Found required data */ + if (!r1_wrap && r2_wrap) + return arq1; + else if (!r2_wrap && r1_wrap) + return arq2; + else if (r1_wrap && r2_wrap) { + /* both behind the head */ + if (s1 <= s2) + return arq1; + else + return arq2; + } + + /* Both requests in front of the head */ + if (d1 < d2) + return arq1; + else if (d2 < d1) + return arq2; + else { + if (s1 >= s2) + return arq1; + else + return arq2; + } +} + +/* + * as_find_next_arq finds the next request after @prev in elevator order. + * this with as_choose_req form the basis for how the scheduler chooses + * what request to process next. Anticipation works on top of this. 
+ */ +static struct as_rq *as_find_next_arq(struct as_data *ad, struct as_rq *last) +{ + const int data_dir = last->is_sync; + struct as_rq *ret; + struct rb_node *rbnext = rb_next(&last->rb_node); + struct rb_node *rbprev = rb_prev(&last->rb_node); + struct as_rq *arq_next, *arq_prev; + + BUG_ON(!ON_RB(&last->rb_node)); + + if (rbprev) + arq_prev = rb_entry_arq(rbprev); + else + arq_prev = NULL; + + if (rbnext) + arq_next = rb_entry_arq(rbnext); + else { + arq_next = as_find_first_arq(ad, data_dir); + if (arq_next == last) + arq_next = NULL; + } + + ret = as_choose_req(ad, arq_next, arq_prev); + + return ret; +} + +/* + * anticipatory scheduling functions follow + */ + +/* + * as_antic_expired tells us when we have anticipated too long. + * The funny "absolute difference" math on the elapsed time is to handle + * jiffy wraps, and disks which have been idle for 0x80000000 jiffies. + */ +static int as_antic_expired(struct as_data *ad) +{ + long delta_jif; + + delta_jif = jiffies - ad->antic_start; + if (unlikely(delta_jif < 0)) + delta_jif = -delta_jif; + if (delta_jif < ad->antic_expire) + return 0; + + return 1; +} + +/* + * as_antic_waitnext starts anticipating that a nice request will soon be + * submitted. See also as_antic_waitreq + */ +static void as_antic_waitnext(struct as_data *ad) +{ + unsigned long timeout; + + BUG_ON(ad->antic_status != ANTIC_OFF + && ad->antic_status != ANTIC_WAIT_REQ); + + timeout = ad->antic_start + ad->antic_expire; + + mod_timer(&ad->antic_timer, timeout); + + ad->antic_status = ANTIC_WAIT_NEXT; +} + +/* + * as_antic_waitreq starts anticipating. We don't start timing the anticipation + * until the request that we're anticipating on has finished. This means we + * are timing from when the candidate process wakes up hopefully. + */ +static void as_antic_waitreq(struct as_data *ad) +{ + BUG_ON(ad->antic_status == ANTIC_FINISHED); + if (ad->antic_status == ANTIC_OFF) { + if (!ad->as_io_context || ad->aic_finished) + as_antic_waitnext(ad); + else + ad->antic_status = ANTIC_WAIT_REQ; + } +} + +/* + * This is called directly by the functions in this file to stop anticipation. + * We kill the timer and schedule a call to the request_fn asap. + */ +static void as_antic_stop(struct as_data *ad) +{ + int status = ad->antic_status; + + if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) { + if (status == ANTIC_WAIT_NEXT) + del_timer(&ad->antic_timer); + ad->antic_status = ANTIC_FINISHED; + /* see as_work_handler */ + kblockd_schedule_work(&ad->antic_work); + } +} + +/* + * as_antic_timeout is the timer function set by as_antic_waitnext. + */ +static void as_antic_timeout(unsigned long data) +{ + struct request_queue *q = (struct request_queue *)data; + struct as_data *ad = q->elevator.elevator_data; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + if (ad->antic_status == ANTIC_WAIT_REQ + || ad->antic_status == ANTIC_WAIT_NEXT) { + ad->antic_status = ANTIC_FINISHED; + kblockd_schedule_work(&ad->antic_work); + } + spin_unlock_irqrestore(q->queue_lock, flags); +} + +/* + * as_close_req decides if one request is considered "close" to the + * previous one issued. 
+ */ +static int as_close_req(struct as_data *ad, struct as_rq *arq) +{ + unsigned long delay; /* milliseconds */ + sector_t last = ad->last_sector[ad->batch_data_dir]; + sector_t next = arq->request->sector; + sector_t delta; /* acceptable close offset (in sectors) */ + + if (ad->antic_status == ANTIC_OFF || !ad->aic_finished) + delay = 0; + else + delay = ((jiffies - ad->antic_start) * 1000) / HZ; + + if (delay <= 1) + delta = 64; + else if (delay <= 20 && delay <= ad->antic_expire) + delta = 64 << (delay-1); + else + return 1; + + return (last - (delta>>1) <= next) && (next <= last + delta); +} + +/* + * as_can_break_anticipation returns true if we have been anticipating this + * request. + * + * It also returns true if the process against which we are anticipating + * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to + * dispatch it ASAP, because we know that application will not be submitting + * any new reads. + * + * If the task which has submitted the request has exitted, break anticipation. + * + * If this task has queued some other IO, do not enter enticipation. + */ +static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) +{ + struct as_io_context *aic; + + if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, arq)) { + /* close request */ + return 1; + } + + if (ad->aic_finished && as_antic_expired(ad)) { + /* + * In this situation status should really be FINISHED, + * however the timer hasn't had the chance to run yet. + */ + return 1; + } + + aic = ad->as_io_context; + BUG_ON(!aic); + + if (arq && aic == arq->as_io_context) { + /* request from same process */ + return 1; + } + + if (!test_bit(AS_TASK_RUNNING, &aic->state)) { + /* process anticipated on has exitted */ + return 1; + } + + if (atomic_read(&aic->nr_queued) > 0) { + /* process has more requests queued */ + return 1; + } + + if (atomic_read(&aic->nr_dispatched) > 0) { + /* process has more requests dispatched */ + return 1; + } + + if (aic->ttime_mean > ad->antic_expire) { + /* the process thinks too much between requests */ + return 1; + } + + if (arq && aic->seek_samples) { + sector_t s; + if (ad->last_sector[REQ_SYNC] < arq->request->sector) + s = arq->request->sector - ad->last_sector[REQ_SYNC]; + else + s = ad->last_sector[REQ_SYNC] - arq->request->sector; + + if (aic->seek_mean > (s>>1)) { + /* this request is better than what we're expecting */ + return 1; + } + } + + return 0; +} + +/* + * as_can_anticipate indicates weather we should either run arq + * or keep anticipating a better request. + */ +static int as_can_anticipate(struct as_data *ad, struct as_rq *arq) +{ + if (!ad->as_io_context) + /* + * Last request submitted was a write + */ + return 0; + + if (ad->antic_status == ANTIC_FINISHED) + /* + * Don't restart if we have just finished. Run the next request + */ + return 0; + + if (as_can_break_anticipation(ad, arq)) + /* + * This request is a good candidate. Don't keep anticipating, + * run it. + */ + return 0; + + /* + * OK from here, we haven't finished, and don't have a decent request! + * Status is either ANTIC_OFF so start waiting, + * ANTIC_WAIT_REQ so continue waiting for request to finish + * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request. + * + */ + + return 1; +} + +/* + * as_update_iohist keeps a decaying histogram of IO thinktimes, and + * updates @aic->ttime_mean based on that. It is called when a new + * request is queued. 
+ */ +static void as_update_iohist(struct as_io_context *aic, struct request *rq) +{ + struct as_rq *arq = RQ_DATA(rq); + int data_dir = arq->is_sync; + unsigned long thinktime; + sector_t seek_dist; + + if (aic == NULL) + return; + + if (data_dir == REQ_SYNC) { + spin_lock(&aic->lock); + + if (test_bit(AS_TASK_IORUNNING, &aic->state) + && !atomic_read(&aic->nr_queued) + && !atomic_read(&aic->nr_dispatched)) { + /* Calculate read -> read thinktime */ + thinktime = jiffies - aic->last_end_request; + thinktime = min(thinktime, MAX_THINKTIME-1); + /* fixed point: 1.0 == 1<<8 */ + aic->ttime_samples += 256; + aic->ttime_total += 256*thinktime; + if (aic->ttime_samples) + /* fixed point factor is cancelled here */ + aic->ttime_mean = (aic->ttime_total + 128) + / aic->ttime_samples; + aic->ttime_samples = (aic->ttime_samples>>1) + + (aic->ttime_samples>>2); + aic->ttime_total = (aic->ttime_total>>1) + + (aic->ttime_total>>2); + } + + /* Calculate read -> read seek distance */ + if (!aic->seek_samples) + seek_dist = 0; + else if (aic->last_request_pos < rq->sector) + seek_dist = rq->sector - aic->last_request_pos; + else + seek_dist = aic->last_request_pos - rq->sector; + + aic->last_request_pos = rq->sector + rq->nr_sectors; + + /* + * Don't allow the seek distance to get too large from the + * odd fragment, pagein, etc + */ + if (aic->seek_samples < 400) /* second&third seek */ + seek_dist = min(seek_dist, (aic->seek_mean * 4) + + 2*1024*1024); + else + seek_dist = min(seek_dist, (aic->seek_mean * 4) + + 2*1024*64); + + aic->seek_samples += 256; + aic->seek_total += 256*seek_dist; + if (aic->seek_samples) { + aic->seek_mean = aic->seek_total + 128; + do_div(aic->seek_mean, aic->seek_samples); + } + aic->seek_samples = (aic->seek_samples>>1) + + (aic->seek_samples>>2); + aic->seek_total = (aic->seek_total>>1) + + (aic->seek_total>>2); + + spin_unlock(&aic->lock); + } +} + +/* + * as_update_arq must be called whenever a request (arq) is added to + * the sort_list. This function keeps caches up to date, and checks if the + * request might be one we are "anticipating" + */ +static void as_update_arq(struct as_data *ad, struct as_rq *arq) +{ + const int data_dir = arq->is_sync; + + /* keep the next_arq cache up to date */ + ad->next_arq[data_dir] = as_choose_req(ad, arq, ad->next_arq[data_dir]); + + /* + * have we been anticipating this request? + * or does it come from the same process as the one we are anticipating + * for? + */ + if (ad->antic_status == ANTIC_WAIT_REQ + || ad->antic_status == ANTIC_WAIT_NEXT) { + if (as_can_break_anticipation(ad, arq)) + as_antic_stop(ad); + } +} + +/* + * Gathers timings and resizes the write batch automatically + */ +void update_write_batch(struct as_data *ad) +{ + unsigned long batch = ad->batch_expire[REQ_ASYNC]; + long write_time; + + write_time = (jiffies - ad->current_batch_expires) + batch; + if (write_time < 0) + write_time = 0; + + if (write_time > batch && !ad->write_batch_idled) { + if (write_time > batch * 3) + ad->write_batch_count /= 2; + else + ad->write_batch_count--; + } else if (write_time < batch && ad->current_write_count == 0) { + if (batch > write_time * 3) + ad->write_batch_count *= 2; + else + ad->write_batch_count++; + } + + if (ad->write_batch_count < 1) + ad->write_batch_count = 1; +} + +/* + * as_completed_request is to be called when a request has completed and + * returned something to the requesting process, be it an error or data. 
+ */ +static void as_completed_request(request_queue_t *q, struct request *rq) +{ + struct as_data *ad = q->elevator.elevator_data; + struct as_rq *arq = RQ_DATA(rq); + struct as_io_context *aic = arq->as_io_context; + + if (unlikely(!blk_fs_request(rq))) { + WARN_ON(aic); + return; + } + + WARN_ON(blk_fs_request(rq) && arq->state == AS_RQ_NEW); + + if (arq->state != AS_RQ_DISPATCHED) + return; + + if (ad->changed_batch && ad->nr_dispatched == 1) { + kblockd_schedule_work(&ad->antic_work); + ad->changed_batch = 2; + } + ad->nr_dispatched--; + + /* + * Start counting the batch from when a request of that direction is + * actually serviced. This should help devices with big TCQ windows + * and writeback caches + */ + if (ad->batch_data_dir == REQ_SYNC && ad->changed_batch + && ad->batch_data_dir == arq->is_sync) { + update_write_batch(ad); + ad->current_batch_expires = jiffies + + ad->batch_expire[REQ_SYNC]; + ad->changed_batch = 0; + } + + if (!aic) + return; + + spin_lock(&aic->lock); + if (arq->is_sync == REQ_SYNC) { + set_bit(AS_TASK_IORUNNING, &aic->state); + aic->last_end_request = jiffies; + } + + if (ad->as_io_context == aic) { + ad->antic_start = jiffies; + ad->aic_finished = 1; + if (ad->antic_status == ANTIC_WAIT_REQ) { + /* + * We were waiting on this request, now anticipate + * the next one + */ + as_antic_waitnext(ad); + } + } + spin_unlock(&aic->lock); + + put_as_io_context(&arq->as_io_context); +} + +/* + * as_remove_queued_request removes a request from the pre dispatch queue + * without updating refcounts. It is expected the caller will drop the + * reference unless it replaces the request at somepart of the elevator + * (ie. the dispatch queue) + */ +static void as_remove_queued_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *arq = RQ_DATA(rq); + + if (!arq) + BUG(); + else { + const int data_dir = arq->is_sync; + struct as_data *ad = q->elevator.elevator_data; + + WARN_ON(arq->state != AS_RQ_QUEUED); + + if (arq->as_io_context) { + BUG_ON(!atomic_read(&arq->as_io_context->nr_queued)); + atomic_dec(&arq->as_io_context->nr_queued); + } + + /* + * Update the "next_arq" cache if we are about to remove its + * entry + */ + if (ad->next_arq[data_dir] == arq) + ad->next_arq[data_dir] = as_find_next_arq(ad, arq); + + list_del_init(&arq->fifo); + as_remove_merge_hints(q, arq); + as_del_arq_rb(ad, arq); + } + +} + +/* + * as_remove_dispatched_request is called to remove a request which has gone + * to the dispatch list. + */ +static void as_remove_dispatched_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *arq = RQ_DATA(rq); + struct as_io_context *aic; + + if (!arq) { + WARN_ON(1); + return; + } + + WARN_ON(arq->state != AS_RQ_DISPATCHED); + WARN_ON(ON_RB(&arq->rb_node)); + aic = arq->as_io_context; + if (aic) { + WARN_ON(!atomic_read(&aic->nr_dispatched)); + atomic_dec(&aic->nr_dispatched); + } +} +/* + * as_remove_request is called when a driver has finished with a request. + * This should be only called for dispatched requests, but for some reason + * a POWER4 box running hwscan it does not. + */ +static void as_remove_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *arq = RQ_DATA(rq); + + if (unlikely(!blk_fs_request(rq))) + return; + + if (!arq) { + WARN_ON(1); + return; + } + + if (ON_RB(&arq->rb_node)) + as_remove_queued_request(q, rq); + else + as_remove_dispatched_request(q, rq); +} + +/* + * as_fifo_expired returns 0 if there are no expired reads on the fifo, + * 1 otherwise. 
It is ratelimited so that we only perform the check once per + * `fifo_expire' interval. Otherwise a large number of expired requests + * would create a hopeless seekstorm. + * + * See as_antic_expired comment. + */ +static int as_fifo_expired(struct as_data *ad, int adir) +{ + struct as_rq *arq; + long delta_jif; + + delta_jif = jiffies - ad->last_check_fifo[adir]; + if (unlikely(delta_jif < 0)) + delta_jif = -delta_jif; + if (delta_jif < ad->fifo_expire[adir]) + return 0; + + ad->last_check_fifo[adir] = jiffies; + + if (list_empty(&ad->fifo_list[adir])) + return 0; + + arq = list_entry_fifo(ad->fifo_list[adir].next); + + return time_after(jiffies, arq->expires); +} + +/* + * as_batch_expired returns true if the current batch has expired. A batch + * is a set of reads or a set of writes. + */ +static inline int as_batch_expired(struct as_data *ad) +{ + if (ad->changed_batch) + return 0; + + if (ad->batch_data_dir == REQ_SYNC) + /* TODO! add a check so a complete fifo gets written? */ + return time_after(jiffies, ad->current_batch_expires); + + return time_after(jiffies, ad->current_batch_expires) + || ad->current_write_count == 0; +} + +/* + * move an entry to dispatch queue + */ +static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq) +{ + const int data_dir = arq->is_sync; + + BUG_ON(!ON_RB(&arq->rb_node)); + + as_antic_stop(ad); + ad->antic_status = ANTIC_OFF; + + /* + * This has to be set in order to be correctly updated by + * as_find_next_arq + */ + ad->last_sector[data_dir] = arq->request->sector + + arq->request->nr_sectors; + + ad->nr_dispatched++; + + if (data_dir == REQ_SYNC) { + /* In case we have to anticipate after this */ + copy_as_io_context(&ad->as_io_context, &arq->as_io_context); + } else { + if (ad->as_io_context) { + put_as_io_context(&ad->as_io_context); + ad->as_io_context = NULL; + } + + if (ad->current_write_count != 0) + ad->current_write_count--; + } + ad->aic_finished = 0; + + ad->next_arq[data_dir] = as_find_next_arq(ad, arq); + + /* + * take it off the sort and fifo list, add to dispatch queue + */ + as_remove_queued_request(ad->q, arq->request); + list_add_tail(&arq->request->queuelist, ad->dispatch); + if (arq->as_io_context) + atomic_inc(&arq->as_io_context->nr_dispatched); + + WARN_ON(arq->state != AS_RQ_QUEUED); + arq->state = AS_RQ_DISPATCHED; +} + +/* + * as_dispatch_request selects the best request according to + * read/write expire, batch expire, etc, and moves it to the dispatch + * queue. Returns 1 if a request was found, 0 otherwise. 
+ */ +static int as_dispatch_request(struct as_data *ad) +{ + struct as_rq *arq; + const int reads = !list_empty(&ad->fifo_list[REQ_SYNC]); + const int writes = !list_empty(&ad->fifo_list[REQ_ASYNC]); + + /* Signal that the write batch was uncontended, so we can't time it */ + if (ad->batch_data_dir == REQ_ASYNC && !reads) { + if (ad->current_write_count == 0 || !writes) + ad->write_batch_idled = 1; + } + + if (!(reads || writes) + || ad->antic_status == ANTIC_WAIT_REQ + || ad->antic_status == ANTIC_WAIT_NEXT + || ad->changed_batch == 1) + return 0; + + if (!(reads && writes && as_batch_expired(ad)) ) { + /* + * batch is still running or no reads or no writes + */ + arq = ad->next_arq[ad->batch_data_dir]; + + if (ad->batch_data_dir == REQ_SYNC && ad->antic_expire) { + if (as_fifo_expired(ad, REQ_SYNC)) + goto fifo_expired; + + if (as_can_anticipate(ad, arq)) { + as_antic_waitreq(ad); + return 0; + } + } + + if (arq) { + /* we have a "next request" */ + if (reads && !writes) + ad->current_batch_expires = + jiffies + ad->batch_expire[REQ_SYNC]; + goto dispatch_request; + } + } + + /* + * at this point we are not running a batch. select the appropriate + * data direction (read / write) + */ + + if (reads) { + BUG_ON(RB_EMPTY(&ad->sort_list[REQ_SYNC])); + + if (writes && ad->batch_data_dir == REQ_SYNC) + /* + * Last batch was a read, switch to writes + */ + goto dispatch_writes; + + if (ad->batch_data_dir == REQ_ASYNC) + ad->changed_batch = 1; + ad->batch_data_dir = REQ_SYNC; + arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next); + ad->last_check_fifo[ad->batch_data_dir] = jiffies; + goto dispatch_request; + } + + /* + * the last batch was a read + */ + + if (writes) { +dispatch_writes: + BUG_ON(RB_EMPTY(&ad->sort_list[REQ_ASYNC])); + + if (ad->batch_data_dir == REQ_SYNC) + ad->changed_batch = 1; + ad->batch_data_dir = REQ_ASYNC; + ad->current_write_count = ad->write_batch_count; + ad->write_batch_idled = 0; + arq = ad->next_arq[ad->batch_data_dir]; + goto dispatch_request; + } + + BUG(); + return 0; + +dispatch_request: + /* + * If a request has expired, service it. + */ + + if (as_fifo_expired(ad, ad->batch_data_dir)) { +fifo_expired: + arq = list_entry_fifo(ad->fifo_list[ad->batch_data_dir].next); + BUG_ON(arq == NULL); + } + + if (ad->changed_batch) { + if (ad->changed_batch == 1 && ad->nr_dispatched) + return 0; + if (ad->batch_data_dir == REQ_ASYNC) { + ad->current_batch_expires = jiffies + + ad->batch_expire[REQ_ASYNC]; + ad->changed_batch = 0; + } else + ad->changed_batch = 2; + arq->request->flags |= REQ_HARDBARRIER; + } + + /* + * arq is the selected appropriate request. 
+ */ + as_move_to_dispatch(ad, arq); + + return 1; +} + +static struct request *as_next_request(request_queue_t *q) +{ + struct as_data *ad = q->elevator.elevator_data; + struct request *rq = NULL; + + /* + * if there are still requests on the dispatch queue, grab the first + */ + if (!list_empty(ad->dispatch) || as_dispatch_request(ad)) + rq = list_entry_rq(ad->dispatch->next); + + return rq; +} + +/* + * add arq to rbtree and fifo + */ +static void as_add_request(struct as_data *ad, struct as_rq *arq) +{ + int data_dir; + + if (rq_data_dir(arq->request) == READ + || current->flags&PF_SYNCWRITE) + arq->is_sync = 1; + else + arq->is_sync = 0; + data_dir = arq->is_sync; + + arq->as_io_context = get_as_io_context(); + + if (arq->as_io_context) { + atomic_inc(&arq->as_io_context->nr_queued); + as_update_iohist(arq->as_io_context, arq->request); + } + + as_add_arq_rb(ad, arq); + + /* + * set expire time (only used for reads) and add to fifo list + */ + arq->expires = jiffies + ad->fifo_expire[data_dir]; + list_add_tail(&arq->fifo, &ad->fifo_list[data_dir]); + arq->state = AS_RQ_QUEUED; + as_update_arq(ad, arq); /* keep state machine up to date */ +} + +static void +as_insert_request(request_queue_t *q, struct request *rq, + struct list_head *insert_here) +{ + struct as_data *ad = q->elevator.elevator_data; + struct as_rq *arq = RQ_DATA(rq); + + if (unlikely(rq->flags & REQ_HARDBARRIER)) { + AS_INVALIDATE_HASH(ad); + q->last_merge = NULL; + + while (ad->next_arq[REQ_SYNC]) + as_move_to_dispatch(ad, ad->next_arq[REQ_SYNC]); + + while (ad->next_arq[REQ_ASYNC]) + as_move_to_dispatch(ad, ad->next_arq[REQ_ASYNC]); + } + + if (unlikely(!blk_fs_request(rq))) { + if (!insert_here) + insert_here = ad->dispatch->prev; + + list_add(&rq->queuelist, insert_here); + + /* Stop anticipating - let this request get through */ + if (!list_empty(ad->dispatch) + && (ad->antic_status == ANTIC_WAIT_REQ + || ad->antic_status == ANTIC_WAIT_NEXT)) + as_antic_stop(ad); + + return; + } + + if (rq_mergeable(rq)) { + as_add_arq_hash(ad, arq); + + if (!q->last_merge) + q->last_merge = &rq->queuelist; + } + + as_add_request(ad, arq); +} + +/* + * as_queue_empty tells us if there are requests left in the device. 
It may + * not be the case that a driver can get the next request even if the queue + * is not empty - it is used in the block layer to check for plugging and + * merging opportunities + */ +static int as_queue_empty(request_queue_t *q) +{ + struct as_data *ad = q->elevator.elevator_data; + + if (!list_empty(&ad->fifo_list[REQ_ASYNC]) + || !list_empty(&ad->fifo_list[REQ_SYNC]) + || !list_empty(ad->dispatch)) + return 0; + + return 1; +} + +static struct request * +as_former_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *arq = RQ_DATA(rq); + struct rb_node *rbprev = rb_prev(&arq->rb_node); + struct request *ret = NULL; + + if (rbprev) + ret = rb_entry_arq(rbprev)->request; + + return ret; +} + +static struct request * +as_latter_request(request_queue_t *q, struct request *rq) +{ + struct as_rq *arq = RQ_DATA(rq); + struct rb_node *rbnext = rb_next(&arq->rb_node); + struct request *ret = NULL; + + if (rbnext) + ret = rb_entry_arq(rbnext)->request; + + return ret; +} + +static int +as_merge(request_queue_t *q, struct list_head **insert, struct bio *bio) +{ + struct as_data *ad = q->elevator.elevator_data; + sector_t rb_key = bio->bi_sector + bio_sectors(bio); + struct request *__rq; + int ret; + + /* + * try last_merge to avoid going to hash + */ + ret = elv_try_last_merge(q, bio); + if (ret != ELEVATOR_NO_MERGE) { + __rq = list_entry_rq(q->last_merge); + goto out_insert; + } + + /* + * see if the merge hash can satisfy a back merge + */ + __rq = as_find_arq_hash(ad, bio->bi_sector); + if (__rq) { + BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector); + + if (elv_rq_merge_ok(__rq, bio)) { + ret = ELEVATOR_BACK_MERGE; + goto out; + } + } + + /* + * check for front merge + */ + __rq = as_find_arq_rb(ad, rb_key, bio_data_dir(bio)); + if (__rq) { + BUG_ON(rb_key != rq_rb_key(__rq)); + + if (elv_rq_merge_ok(__rq, bio)) { + ret = ELEVATOR_FRONT_MERGE; + goto out; + } + } + + return ELEVATOR_NO_MERGE; +out: + q->last_merge = &__rq->queuelist; +out_insert: + if (ret) + as_hot_arq_hash(ad, RQ_DATA(__rq)); + *insert = &__rq->queuelist; + return ret; +} + +static void as_merged_request(request_queue_t *q, struct request *req) +{ + struct as_data *ad = q->elevator.elevator_data; + struct as_rq *arq = RQ_DATA(req); + + /* + * hash always needs to be repositioned, key is end sector + */ + as_del_arq_hash(arq); + as_add_arq_hash(ad, arq); + + /* + * if the merge was a front merge, we need to reposition request + */ + if (rq_rb_key(req) != arq->rb_key) { + as_del_arq_rb(ad, arq); + as_add_arq_rb(ad, arq); + /* + * Note! At this stage of this and the next function, our next + * request may not be optimal - eg the request may have "grown" + * behind the disk head. We currently don't bother adjusting. 
+ */ + } + + q->last_merge = &req->queuelist; +} + +static void +as_merged_requests(request_queue_t *q, struct request *req, + struct request *next) +{ + struct as_data *ad = q->elevator.elevator_data; + struct as_rq *arq = RQ_DATA(req); + struct as_rq *anext = RQ_DATA(next); + + BUG_ON(!arq); + BUG_ON(!anext); + + /* + * reposition arq (this is the merged request) in hash, and in rbtree + * in case of a front merge + */ + as_del_arq_hash(arq); + as_add_arq_hash(ad, arq); + + if (rq_rb_key(req) != arq->rb_key) { + as_del_arq_rb(ad, arq); + as_add_arq_rb(ad, arq); + } + + /* + * if anext expires before arq, assign its expire time to arq + * and move into anext position (anext will be deleted) in fifo + */ + if (!list_empty(&arq->fifo) && !list_empty(&anext->fifo)) { + if (time_before(anext->expires, arq->expires)) { + list_move(&arq->fifo, &anext->fifo); + arq->expires = anext->expires; + /* + * Don't copy here but swap, because when anext is + * removed below, it must contain the unused context + */ + swap_as_io_context(&arq->as_io_context, + &anext->as_io_context); + } + } + + /* + * kill knowledge of next, this one is a goner + */ + as_remove_queued_request(q, next); + put_as_io_context(&anext->as_io_context); +} + +/* + * This is executed in a "deferred" process context, by kblockd. It calls the + * driver's request_fn so the driver can submit that request. + * + * IMPORTANT! This guy will reenter the elevator, so set up all queue global + * state before calling, and don't rely on any state over calls. + * + * FIXME! dispatch queue is not a queue at all! + */ +static void as_work_handler(void *data) +{ + struct request_queue *q = data; + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + if (as_next_request(q)) + q->request_fn(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void as_put_request(request_queue_t *q, struct request *rq) +{ + struct as_data *ad = q->elevator.elevator_data; + struct as_rq *arq = RQ_DATA(rq); + + if (!arq) { + WARN_ON(1); + return; + } + + mempool_free(arq, ad->arq_pool); + rq->elevator_private = NULL; +} + +static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) +{ + struct as_data *ad = q->elevator.elevator_data; + struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask); + + if (arq) { + RB_CLEAR(&arq->rb_node); + arq->request = rq; + arq->state = AS_RQ_NEW; + arq->as_io_context = NULL; + INIT_LIST_HEAD(&arq->hash); + arq->hash_valid_count = 0; + INIT_LIST_HEAD(&arq->fifo); + rq->elevator_private = arq; + return 0; + } + + return 1; +} + +static void as_exit(request_queue_t *q, elevator_t *e) +{ + struct as_data *ad = e->elevator_data; + + del_timer_sync(&ad->antic_timer); + kblockd_flush(); + + BUG_ON(!list_empty(&ad->fifo_list[REQ_SYNC])); + BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC])); + + mempool_destroy(ad->arq_pool); + put_as_io_context(&ad->as_io_context); + kfree(ad->hash); + kfree(ad); +} + +/* + * initialize elevator private data (as_data), and alloc a arq for + * each request on the free lists + */ +static int as_init(request_queue_t *q, elevator_t *e) +{ + struct as_data *ad; + int i; + + if (!arq_pool) + return -ENOMEM; + + ad = kmalloc(sizeof(*ad), GFP_KERNEL); + if (!ad) + return -ENOMEM; + memset(ad, 0, sizeof(*ad)); + + ad->q = q; /* Identify what queue the data belongs to */ + + ad->hash = kmalloc(sizeof(struct list_head)*AS_HASH_ENTRIES,GFP_KERNEL); + if (!ad->hash) { + kfree(ad); + return -ENOMEM; + } + + ad->arq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, 
mempool_free_slab, arq_pool); + if (!ad->arq_pool) { + kfree(ad->hash); + kfree(ad); + return -ENOMEM; + } + + /* anticipatory scheduling helpers */ + ad->antic_timer.function = as_antic_timeout; + ad->antic_timer.data = (unsigned long)q; + init_timer(&ad->antic_timer); + INIT_WORK(&ad->antic_work, as_work_handler, q); + + for (i = 0; i < AS_HASH_ENTRIES; i++) + INIT_LIST_HEAD(&ad->hash[i]); + + INIT_LIST_HEAD(&ad->fifo_list[REQ_SYNC]); + INIT_LIST_HEAD(&ad->fifo_list[REQ_ASYNC]); + ad->sort_list[REQ_SYNC] = RB_ROOT; + ad->sort_list[REQ_ASYNC] = RB_ROOT; + ad->dispatch = &q->queue_head; + ad->fifo_expire[REQ_SYNC] = default_read_expire; + ad->fifo_expire[REQ_ASYNC] = default_write_expire; + ad->hash_valid_count = 1; + ad->antic_expire = default_antic_expire; + ad->batch_expire[REQ_SYNC] = default_read_batch_expire; + ad->batch_expire[REQ_ASYNC] = default_write_batch_expire; + e->elevator_data = ad; + + ad->current_batch_expires = jiffies + ad->batch_expire[REQ_SYNC]; + ad->write_batch_count = ad->batch_expire[REQ_ASYNC] / 10; + if (ad->write_batch_count < 2) + ad->write_batch_count = 2; + return 0; +} + +/* + * sysfs parts below + */ +struct as_fs_entry { + struct attribute attr; + ssize_t (*show)(struct as_data *, char *); + ssize_t (*store)(struct as_data *, const char *, size_t); +}; + +static ssize_t +as_var_show(unsigned int var, char *page) +{ + var = (var * 1000) / HZ; + return sprintf(page, "%d\n", var); +} + +static ssize_t +as_var_store(unsigned long *var, const char *page, size_t count) +{ + unsigned long tmp; + char *p = (char *) page; + + tmp = simple_strtoul(p, &p, 10); + if (tmp != 0) { + tmp = (tmp * HZ) / 1000; + if (tmp == 0) + tmp = 1; + } + *var = tmp; + return count; +} + +#define SHOW_FUNCTION(__FUNC, __VAR) \ +static ssize_t __FUNC(struct as_data *ad, char *page) \ +{ \ + return as_var_show(__VAR, (page)); \ +} +SHOW_FUNCTION(as_readexpire_show, ad->fifo_expire[REQ_SYNC]); +SHOW_FUNCTION(as_writeexpire_show, ad->fifo_expire[REQ_ASYNC]); +SHOW_FUNCTION(as_anticexpire_show, ad->antic_expire); +SHOW_FUNCTION(as_read_batchexpire_show, ad->batch_expire[REQ_SYNC]); +SHOW_FUNCTION(as_write_batchexpire_show, ad->batch_expire[REQ_ASYNC]); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ +static ssize_t __FUNC(struct as_data *ad, const char *page, size_t count) \ +{ \ + int ret = as_var_store(__PTR, (page), count); \ + if (*(__PTR) < (MIN)) \ + *(__PTR) = (MIN); \ + else if (*(__PTR) > (MAX)) \ + *(__PTR) = (MAX); \ + return ret; \ +} +STORE_FUNCTION(as_readexpire_store, &ad->fifo_expire[REQ_SYNC], 0, INT_MAX); +STORE_FUNCTION(as_writeexpire_store, &ad->fifo_expire[REQ_ASYNC], 0, INT_MAX); +STORE_FUNCTION(as_anticexpire_store, &ad->antic_expire, 0, INT_MAX); +STORE_FUNCTION(as_read_batchexpire_store, + &ad->batch_expire[REQ_SYNC], 0, INT_MAX); +STORE_FUNCTION(as_write_batchexpire_store, + &ad->batch_expire[REQ_ASYNC], 0, INT_MAX); +#undef STORE_FUNCTION + +static struct as_fs_entry as_readexpire_entry = { + .attr = {.name = "read_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_readexpire_show, + .store = as_readexpire_store, +}; +static struct as_fs_entry as_writeexpire_entry = { + .attr = {.name = "write_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_writeexpire_show, + .store = as_writeexpire_store, +}; +static struct as_fs_entry as_anticexpire_entry = { + .attr = {.name = "antic_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_anticexpire_show, + .store = as_anticexpire_store, +}; +static struct as_fs_entry as_read_batchexpire_entry = { + 
.attr = {.name = "read_batch_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_read_batchexpire_show, + .store = as_read_batchexpire_store, +}; +static struct as_fs_entry as_write_batchexpire_entry = { + .attr = {.name = "write_batch_expire", .mode = S_IRUGO | S_IWUSR }, + .show = as_write_batchexpire_show, + .store = as_write_batchexpire_store, +}; + +static struct attribute *default_attrs[] = { + &as_readexpire_entry.attr, + &as_writeexpire_entry.attr, + &as_anticexpire_entry.attr, + &as_read_batchexpire_entry.attr, + &as_write_batchexpire_entry.attr, + NULL, +}; + +#define to_as(atr) container_of((atr), struct as_fs_entry, attr) + +static ssize_t +as_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + elevator_t *e = container_of(kobj, elevator_t, kobj); + struct as_fs_entry *entry = to_as(attr); + + if (!entry->show) + return 0; + + return entry->show(e->elevator_data, page); +} + +static ssize_t +as_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + elevator_t *e = container_of(kobj, elevator_t, kobj); + struct as_fs_entry *entry = to_as(attr); + + if (!entry->store) + return -EINVAL; + + return entry->store(e->elevator_data, page, length); +} + +static struct sysfs_ops as_sysfs_ops = { + .show = as_attr_show, + .store = as_attr_store, +}; + +struct kobj_type as_ktype = { + .sysfs_ops = &as_sysfs_ops, + .default_attrs = default_attrs, +}; + +static int __init as_slab_setup(void) +{ + arq_pool = kmem_cache_create("as_arq", sizeof(struct as_rq), + 0, 0, NULL, NULL); + + if (!arq_pool) + panic("as: can't init slab pool\n"); + + return 0; +} + +subsys_initcall(as_slab_setup); + +elevator_t iosched_as = { + .elevator_merge_fn = as_merge, + .elevator_merged_fn = as_merged_request, + .elevator_merge_req_fn = as_merged_requests, + .elevator_next_req_fn = as_next_request, + .elevator_add_req_fn = as_insert_request, + .elevator_remove_req_fn = as_remove_request, + .elevator_queue_empty_fn = as_queue_empty, + .elevator_completed_req_fn = as_completed_request, + .elevator_former_req_fn = as_former_request, + .elevator_latter_req_fn = as_latter_request, + .elevator_set_req_fn = as_set_request, + .elevator_put_req_fn = as_put_request, + .elevator_init_fn = as_init, + .elevator_exit_fn = as_exit, + + .elevator_ktype = &as_ktype, +}; + +EXPORT_SYMBOL(iosched_as); diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 71750da0db6d..be19601847b5 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1033,7 +1033,7 @@ static inline void __generic_unplug_device(request_queue_t *q) /* * was plugged, fire request_fn if queue has stuff to do */ - if (!elv_queue_empty(q)) + if (elv_next_request(q)) q->request_fn(q); } @@ -1204,6 +1204,18 @@ static int blk_init_free_list(request_queue_t *q) static int __make_request(request_queue_t *, struct bio *); +static elevator_t *chosen_elevator = &iosched_as; + +static int __init elevator_setup(char *str) +{ + if (!strcmp(str, "deadline")) + chosen_elevator = &iosched_deadline; + if (!strcmp(str, "as")) + chosen_elevator = &iosched_as; + return 1; +} +__setup("elevator=", elevator_setup); + /** * blk_init_queue - prepare a request queue for use with a block device * @q: The &request_queue_t to be initialised @@ -1235,11 +1247,20 @@ static int __make_request(request_queue_t *, struct bio *); int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock) { int ret; + static int printed; if (blk_init_free_list(q)) return -ENOMEM; - if ((ret = 
elevator_init(q, &iosched_deadline))) { + if (!printed) { + printed = 1; + if (chosen_elevator == &iosched_deadline) + printk("deadline elevator\n"); + else if (chosen_elevator == &iosched_as) + printk("anticipatory scheduling elevator\n"); + } + + if ((ret = elevator_init(q, chosen_elevator))) { blk_cleanup_queue(q); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index f063200c5b66..994bfbc41e73 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -319,6 +319,7 @@ asmlinkage long sys_fsync(unsigned int fd) /* We need to protect against concurrent writers.. */ down(&inode->i_sem); + current->flags |= PF_SYNCWRITE; ret = filemap_fdatawrite(inode->i_mapping); err = file->f_op->fsync(file, dentry, 0); if (!ret) @@ -326,6 +327,7 @@ asmlinkage long sys_fsync(unsigned int fd) err = filemap_fdatawait(inode->i_mapping); if (!ret) ret = err; + current->flags &= ~PF_SYNCWRITE; up(&inode->i_sem); out_putf: @@ -354,6 +356,7 @@ asmlinkage long sys_fdatasync(unsigned int fd) goto out_putf; down(&inode->i_sem); + current->flags |= PF_SYNCWRITE; ret = filemap_fdatawrite(inode->i_mapping); err = file->f_op->fsync(file, dentry, 1); if (!ret) @@ -361,6 +364,7 @@ asmlinkage long sys_fdatasync(unsigned int fd) err = filemap_fdatawait(inode->i_mapping); if (!ret) ret = err; + current->flags &= ~PF_SYNCWRITE; up(&inode->i_sem); out_putf: diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 92682b02ff12..8732f30faa2b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -516,6 +516,7 @@ int generic_osync_inode(struct inode *inode, int what) int need_write_inode_now = 0; int err2; + current->flags |= PF_SYNCWRITE; if (what & OSYNC_DATA) err = filemap_fdatawrite(inode->i_mapping); if (what & (OSYNC_METADATA|OSYNC_DATA)) { @@ -528,6 +529,7 @@ int generic_osync_inode(struct inode *inode, int what) if (!err) err = err2; } + current->flags &= ~PF_SYNCWRITE; spin_lock(&inode_lock); if ((inode->i_state & I_DIRTY) && diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 07de69c1ef8a..d793bb97dd54 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -89,6 +89,11 @@ extern elevator_t elevator_noop; */ extern elevator_t iosched_deadline; +/* + * anticipatory I/O scheduler + */ +extern elevator_t iosched_as; + extern int elevator_init(request_queue_t *, elevator_t *); extern void elevator_exit(request_queue_t *); extern inline int elv_rq_merge_ok(struct request *, struct bio *); diff --git a/include/linux/sched.h b/include/linux/sched.h index ca97376901b0..e29f9606c2aa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -321,6 +321,8 @@ struct k_itimer { }; +struct as_io_context; /* Anticipatory scheduler */ +void exit_as_io_context(void); struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ @@ -450,6 +452,8 @@ struct task_struct { struct dentry *proc_dentry; struct backing_dev_info *backing_dev_info; + struct as_io_context *as_io_context; + unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. 
*/ }; @@ -481,6 +485,7 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) #define PF_KSWAPD 0x00040000 /* I am kswapd */ #define PF_SWAPOFF 0x00080000 /* I am in swapoff */ #define PF_LESS_THROTTLE 0x01000000 /* Throttle me less: I clena memory */ +#define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #ifdef CONFIG_SMP extern int set_cpus_allowed(task_t *p, unsigned long new_mask); diff --git a/kernel/exit.c b/kernel/exit.c index 2f090213e481..8471381546af 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -682,6 +682,8 @@ NORET_TYPE void do_exit(long code) panic("Attempted to kill the idle task!"); if (unlikely(tsk->pid == 1)) panic("Attempted to kill init!"); + if (tsk->as_io_context) + exit_as_io_context(); tsk->flags |= PF_EXITING; del_timer_sync(&tsk->real_timer); diff --git a/kernel/fork.c b/kernel/fork.c index c17e05614c88..fcdc884cd894 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -864,6 +864,7 @@ struct task_struct *copy_process(unsigned long clone_flags, p->lock_depth = -1; /* -1 = no lock */ p->start_time = get_jiffies_64(); p->security = NULL; + p->as_io_context = NULL; retval = -ENOMEM; if ((retval = security_task_alloc(p))) -- cgit v1.2.3 From ee66147bf85b90df796737381e057155b4bc4fe9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:36:44 -0700 Subject: [PATCH] per queue nr_requests From: Nick Piggin This gets rid of the global queue_nr_requests and usage of BLKDEV_MAX_RQ (the latter is now only used to set the queues' defaults). The queue depth becomes per-queue, controlled by a sysfs entry. --- drivers/block/elevator.c | 14 +--- drivers/block/genhd.c | 4 +- drivers/block/ll_rw_blk.c | 205 ++++++++++++++++++++++++++++++++++++++-------- include/linux/blkdev.h | 11 ++- include/linux/elevator.h | 4 +- 5 files changed, 188 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 406755724e03..89af76783943 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -379,17 +379,13 @@ void elv_completed_request(request_queue_t *q, struct request *rq) e->elevator_completed_req_fn(q, rq); } -int elv_register_queue(struct gendisk *disk) +int elv_register_queue(struct request_queue *q) { - request_queue_t *q = disk->queue; elevator_t *e; - if (!q) - return -ENXIO; - e = &q->elevator; - e->kobj.parent = kobject_get(&disk->kobj); + e->kobj.parent = kobject_get(&q->kobj); if (!e->kobj.parent) return -EBUSY; @@ -399,14 +395,12 @@ int elv_register_queue(struct gendisk *disk) return kobject_register(&e->kobj); } -void elv_unregister_queue(struct gendisk *disk) +void elv_unregister_queue(struct request_queue *q) { - request_queue_t *q = disk->queue; - if (q) { elevator_t * e = &q->elevator; kobject_unregister(&e->kobj); - kobject_put(&disk->kobj); + kobject_put(&q->kobj); } } diff --git a/drivers/block/genhd.c b/drivers/block/genhd.c index 889b8753c29f..361aee8ab255 100644 --- a/drivers/block/genhd.c +++ b/drivers/block/genhd.c @@ -191,7 +191,7 @@ void add_disk(struct gendisk *disk) blk_register_region(MKDEV(disk->major, disk->first_minor), disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); - elv_register_queue(disk); + blk_register_queue(disk); } EXPORT_SYMBOL(add_disk); @@ -199,7 +199,7 @@ EXPORT_SYMBOL(del_gendisk); /* in partitions/check.c */ void unlink_gendisk(struct gendisk *disk) { - elv_unregister_queue(disk); + blk_unregister_queue(disk); blk_unregister_region(MKDEV(disk->major, disk->first_minor), disk->minors); 
} diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index cdfe7d3697bc..b1248e542e5e 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -42,12 +42,6 @@ static kmem_cache_t *request_cachep; static LIST_HEAD(blk_plug_list); static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; -/* - * Number of requests per queue. This many for reads and for writes (twice - * this number, total). - */ -static int queue_nr_requests; - static wait_queue_head_t congestion_wqh[2]; /* @@ -57,9 +51,9 @@ static struct workqueue_struct *kblockd_workqueue; unsigned long blk_max_low_pfn, blk_max_pfn; -static inline int batch_requests(void) +static inline int batch_requests(struct request_queue *q) { - return min(BLKDEV_MAX_RQ / 8, 8); + return min(q->nr_requests / 8, 8UL); } /* @@ -67,11 +61,11 @@ static inline int batch_requests(void) * considered to be congested. It include a little hysteresis to keep the * context switch rate down. */ -static inline int queue_congestion_on_threshold(void) +static inline int queue_congestion_on_threshold(struct request_queue *q) { int ret; - ret = queue_nr_requests / 8 - 1; + ret = q->nr_requests / 8 - 1; if (ret < 0) ret = 1; return ret; @@ -80,13 +74,13 @@ static inline int queue_congestion_on_threshold(void) /* * The threshold at which a queue is considered to be uncongested */ -static inline int queue_congestion_off_threshold(void) +static inline int queue_congestion_off_threshold(struct request_queue *q) { int ret; - ret = queue_nr_requests / 8 + 1; - if (ret > queue_nr_requests) - ret = queue_nr_requests; + ret = q->nr_requests / 8 + 1; + if (ret > q->nr_requests) + ret = q->nr_requests; return ret; } @@ -199,6 +193,7 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn) /* * set defaults */ + q->nr_requests = BLKDEV_MAX_RQ; q->max_phys_segments = MAX_PHYS_SEGMENTS; q->max_hw_segments = MAX_HW_SEGMENTS; q->make_request_fn = mfn; @@ -452,13 +447,15 @@ void blk_queue_free_tags(request_queue_t *q) q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); } -static int init_tag_map(struct blk_queue_tag *tags, int depth) +static int +init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth) { int bits, i; - if (depth > (queue_nr_requests*2)) { - depth = (queue_nr_requests*2); - printk(KERN_ERR "%s: adjusted depth to %d\n", __FUNCTION__, depth); + if (depth > q->nr_requests * 2) { + depth = q->nr_requests * 2; + printk(KERN_ERR "%s: adjusted depth to %d\n", + __FUNCTION__, depth); } tags->tag_index = kmalloc(depth * sizeof(struct request *), GFP_ATOMIC); @@ -487,7 +484,6 @@ fail: return -ENOMEM; } - /** * blk_queue_init_tags - initialize the queue tag info * @q: the request queue for the device @@ -501,7 +497,7 @@ int blk_queue_init_tags(request_queue_t *q, int depth) if (!tags) goto fail; - if (init_tag_map(tags, depth)) + if (init_tag_map(q, tags, depth)) goto fail; INIT_LIST_HEAD(&tags->busy_list); @@ -551,7 +547,7 @@ int blk_queue_resize_tags(request_queue_t *q, int new_depth) tag_map = bqt->tag_map; max_depth = bqt->real_max_depth; - if (init_tag_map(bqt, new_depth)) + if (init_tag_map(q, bqt, new_depth)) return -ENOMEM; memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); @@ -1315,12 +1311,12 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) struct request_list *rl = &q->rq; spin_lock_irq(q->queue_lock); - if (rl->count[rw] == BLKDEV_MAX_RQ || !elv_may_queue(q, rw)) { + if (rl->count[rw] >= q->nr_requests || !elv_may_queue(q, rw)) { 
spin_unlock_irq(q->queue_lock); goto out; } rl->count[rw]++; - if ((BLKDEV_MAX_RQ - rl->count[rw]) < queue_congestion_on_threshold() + if ((q->nr_requests - rl->count[rw]) < queue_congestion_on_threshold(q)) set_queue_congested(q, rw); spin_unlock_irq(q->queue_lock); @@ -1328,7 +1324,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) if (!rq) { spin_lock_irq(q->queue_lock); rl->count[rw]--; - if ((BLKDEV_MAX_RQ - rl->count[rw]) >= queue_congestion_off_threshold() + if ((q->nr_requests - rl->count[rw]) >= queue_congestion_off_threshold(q)) clear_queue_congested(q, rw); spin_unlock_irq(q->queue_lock); goto out; @@ -1549,10 +1545,10 @@ void __blk_put_request(request_queue_t *q, struct request *req) blk_free_request(q, req); rl->count[rw]--; - if ((BLKDEV_MAX_RQ - rl->count[rw]) >= - queue_congestion_off_threshold() + if ((q->nr_requests - rl->count[rw]) >= + queue_congestion_off_threshold(q)) clear_queue_congested(q, rw); - if ((BLKDEV_MAX_RQ - rl->count[rw]) >= batch_requests() && + if ((q->nr_requests - rl->count[rw]) >= batch_requests(q) && waitqueue_active(&rl->wait[rw])) wake_up(&rl->wait[rw]); } @@ -2360,14 +2356,6 @@ int __init blk_dev_init(void) if (!request_cachep) panic("Can't create request pool slab cache\n"); - queue_nr_requests = BLKDEV_MAX_RQ; - - printk("block request queues:\n"); - printk(" %d/%d requests per read queue\n", BLKDEV_MIN_RQ, queue_nr_requests); - printk(" %d/%d requests per write queue\n", BLKDEV_MIN_RQ, queue_nr_requests); - printk(" enter congestion at %d\n", queue_congestion_on_threshold()); - printk(" exit congestion at %d\n", queue_congestion_off_threshold()); - blk_max_low_pfn = max_low_pfn; blk_max_pfn = max_pfn; @@ -2376,6 +2364,153 @@ int __init blk_dev_init(void) return 0; } +/* + * sysfs parts below + */ +struct queue_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct request_queue *, char *); + ssize_t (*store)(struct request_queue *, const char *, size_t); +}; + +static ssize_t +queue_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%d\n", var); +} + +static ssize_t +queue_var_store(unsigned long *var, const char *page, size_t count) +{ + char *p = (char *) page; + + *var = simple_strtoul(p, &p, 10); + return count; +} + +static ssize_t queue_requests_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->nr_requests, (page)); } + +static ssize_t +queue_requests_store(struct request_queue *q, const char *page, size_t count) +{ + struct request_list *rl = &q->rq; + + int ret = queue_var_store(&q->nr_requests, page, count); + if (q->nr_requests < BLKDEV_MIN_RQ) + q->nr_requests = BLKDEV_MIN_RQ; + + if ((q->nr_requests - rl->count[READ]) < + queue_congestion_on_threshold(q)) + set_queue_congested(q, READ); + else if ((q->nr_requests - rl->count[READ]) >= + queue_congestion_off_threshold(q)) + clear_queue_congested(q, READ); + + if ((q->nr_requests - rl->count[WRITE]) < + queue_congestion_on_threshold(q)) + set_queue_congested(q, WRITE); + else if ((q->nr_requests - rl->count[WRITE]) >= + queue_congestion_off_threshold(q)) + clear_queue_congested(q, WRITE); + + return ret; +} + +static struct queue_sysfs_entry queue_requests_entry = { + .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, + .show = queue_requests_show, + .store = queue_requests_store, +}; + +static struct attribute *default_attrs[] = { + &queue_requests_entry.attr, + NULL, +}; + +#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) + +static ssize_t +queue_attr_show(struct kobject
*kobj, struct attribute *attr, char *page) +{ + struct queue_sysfs_entry *entry = to_queue(attr); + struct request_queue *q; + + q = container_of(kobj, struct request_queue, kobj); + if (!entry->show) + return 0; + + return entry->show(q, page); +} + +static ssize_t +queue_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct queue_sysfs_entry *entry = to_queue(attr); + struct request_queue *q; + + q = container_of(kobj, struct request_queue, kobj); + if (!entry->store) + return -EINVAL; + + return entry->store(q, page, length); +} + +static struct sysfs_ops queue_sysfs_ops = { + .show = queue_attr_show, + .store = queue_attr_store, +}; + +struct kobj_type queue_ktype = { + .sysfs_ops = &queue_sysfs_ops, + .default_attrs = default_attrs, +}; + +int blk_register_queue(struct gendisk *disk) +{ + int ret; + + request_queue_t *q = disk->queue; + + if (!q) + return -ENXIO; + + q->kobj.parent = kobject_get(&disk->kobj); + if (!q->kobj.parent) + return -EBUSY; + + snprintf(q->kobj.name, KOBJ_NAME_LEN, "%s", "queue"); + q->kobj.ktype = &queue_ktype; + + ret = kobject_register(&q->kobj); + if (ret < 0) + return ret; + + ret = elv_register_queue(q); + if (ret) { + kobject_unregister(&q->kobj); + return ret; + } + + return 0; +} + +void blk_unregister_queue(struct gendisk *disk) +{ + request_queue_t *q = disk->queue; + + if (q) { + elv_unregister_queue(q); + + kobject_unregister(&q->kobj); + kobject_put(&disk->kobj); + } +} + + EXPORT_SYMBOL(process_that_request_first); EXPORT_SYMBOL(end_that_request_first); EXPORT_SYMBOL(end_that_request_chunk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e97790517973..4295d60bf661 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -22,7 +22,7 @@ typedef struct elevator_s elevator_t; struct request_pm_state; #define BLKDEV_MIN_RQ 4 -#define BLKDEV_MAX_RQ 128 +#define BLKDEV_MAX_RQ 128 /* Default maximum */ struct request_list { int count[2]; @@ -268,9 +268,16 @@ struct request_queue */ spinlock_t *queue_lock; + /* + * queue kobject + */ + struct kobject kobj; + /* * queue settings */ + unsigned long nr_requests; /* Max # of requests */ + unsigned short max_sectors; unsigned short max_phys_segments; unsigned short max_hw_segments; @@ -398,6 +405,8 @@ struct sec_size { unsigned block_size_bits; }; +extern int blk_register_queue(struct gendisk *disk); +extern void blk_unregister_queue(struct gendisk *disk); extern void register_disk(struct gendisk *dev); extern void generic_make_request(struct bio *bio); extern void blk_put_request(struct request *); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index d793bb97dd54..b0e70562be94 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -68,8 +68,8 @@ extern int elv_queue_empty(request_queue_t *); extern struct request *elv_next_request(struct request_queue *q); extern struct request *elv_former_request(request_queue_t *, struct request *); extern struct request *elv_latter_request(request_queue_t *, struct request *); -extern int elv_register_queue(struct gendisk *); -extern void elv_unregister_queue(struct gendisk *); +extern int elv_register_queue(request_queue_t *q); +extern void elv_unregister_queue(request_queue_t *q); extern int elv_may_queue(request_queue_t *, int); extern void elv_completed_request(request_queue_t *, struct request *); extern int elv_set_request(request_queue_t *, struct request *, int); -- cgit v1.2.3 From 80af89ca709d4dfe41178abe29217a0fefa1af12 Mon Sep 17 00:00:00 2001 
From: Andrew Morton Date: Fri, 4 Jul 2003 19:37:12 -0700 Subject: [PATCH] block batching fairness From: Nick Piggin This patch fixes the request batching fairness/starvation issue. It's not clear what is going on with 2.4, but it seems that it's a problem around this area. Anyway, previously: * request queue fills up * process 1 calls get_request, sleeps * a couple of requests are freed * process 2 calls get_request, proceeds * a couple of requests are freed * process 2 calls get_request... Now as unlikely as it seems, it could be a problem. It's a fairness problem that process 2 can skip ahead of process 1 anyway. With the patch: * request queue fills up * any process calling get_request will sleep * once the queue gets below the batch watermark, processes start being woken, and may allocate. This patch includes Chris Mason's fix to only clear queue_full when all tasks have been woken. Previously I think starvation and unfairness could still occur. With this change to the blk-fair-batches patch, Chris is showing some much improved numbers for 2.4 - 170 ms max wait vs 2700ms without blk-fair-batches for a dbench 90 run. He didn't indicate how much difference his patch alone made, but it is an important fix I think. --- drivers/block/ll_rw_blk.c | 75 +++++++++++++++++++++++++++++++---------------- include/linux/blkdev.h | 26 ++++++++++++++++ 2 files changed, 75 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 57daaf4aea9d..f7981c1fa3e6 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -53,7 +53,7 @@ unsigned long blk_max_low_pfn, blk_max_pfn; static inline int batch_requests(struct request_queue *q) { - return q->nr_requests - min(q->nr_requests / 8, 8UL); + return q->nr_requests - min(q->nr_requests / 8, 8UL) - 1; } /* @@ -1309,13 +1309,16 @@ static inline struct request *blk_alloc_request(request_queue_t *q,int gfp_mask) /* * Get a free request, queue_lock must not be held */ -static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) +static struct request * +get_request(request_queue_t *q, int rw, int gfp_mask, int force) { struct request *rq = NULL; struct request_list *rl = &q->rq; spin_lock_irq(q->queue_lock); - if (rl->count[rw] >= q->nr_requests && !elv_may_queue(q, rw)) { + if (rl->count[rw] == q->nr_requests) + blk_set_queue_full(q, rw); + if (blk_queue_full(q, rw) && !force && !elv_may_queue(q, rw)) { spin_unlock_irq(q->queue_lock); goto out; } @@ -1330,6 +1333,14 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) rl->count[rw]--; if (rl->count[rw] < queue_congestion_off_threshold(q)) clear_queue_congested(q, rw); + + if (rl->count[rw] <= batch_requests(q)) { + if (waitqueue_active(&rl->wait[rw])) + wake_up(&rl->wait[rw]); + else + blk_clear_queue_full(q, rw); + } + spin_unlock_irq(q->queue_lock); goto out; } @@ -1366,26 +1377,22 @@ static struct request *get_request_wait(request_queue_t *q, int rw) { DEFINE_WAIT(wait); struct request *rq; + int waited = 0; generic_unplug_device(q); do { - rq = get_request(q, rw, GFP_NOIO); + struct request_list *rl = &q->rq; - if (!rq) { - struct request_list *rl = &q->rq; + prepare_to_wait_exclusive(&rl->wait[rw], &wait, + TASK_UNINTERRUPTIBLE); - prepare_to_wait_exclusive(&rl->wait[rw], &wait, - TASK_UNINTERRUPTIBLE); - /* - * If _all_ the requests were suddenly returned then - * no wakeup will be delivered. So now we're on the - * waitqueue, go check for that.
- */ - rq = get_request(q, rw, GFP_NOIO); - if (!rq) - io_schedule(); - finish_wait(&rl->wait[rw], &wait); + rq = get_request(q, rw, GFP_NOIO, waited); + + if (!rq) { + io_schedule(); + waited = 1; } + finish_wait(&rl->wait[rw], &wait); } while (!rq); return rq; @@ -1397,10 +1404,10 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask) BUG_ON(rw != READ && rw != WRITE); - rq = get_request(q, rw, gfp_mask); - - if (!rq && (gfp_mask & __GFP_WAIT)) + if (gfp_mask & __GFP_WAIT) rq = get_request_wait(q, rw); + else + rq = get_request(q, rw, gfp_mask, 0); return rq; } @@ -1551,9 +1558,13 @@ void __blk_put_request(request_queue_t *q, struct request *req) rl->count[rw]--; if (rl->count[rw] < queue_congestion_off_threshold(q)) clear_queue_congested(q, rw); - if (rl->count[rw] < batch_requests(q) && - waitqueue_active(&rl->wait[rw])) - wake_up(&rl->wait[rw]); + + if (rl->count[rw] <= batch_requests(q)) { + if (waitqueue_active(&rl->wait[rw])) + wake_up(&rl->wait[rw]); + else + blk_clear_queue_full(q, rw); + } } } @@ -1796,7 +1807,7 @@ get_rq: freereq = NULL; } else { spin_unlock_irq(q->queue_lock); - if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) { + if ((freereq = get_request(q, rw, GFP_ATOMIC, 0)) == NULL) { /* * READA bit set */ @@ -1904,8 +1915,7 @@ static inline void blk_partition_remap(struct bio *bio) * bio happens to be merged with someone else, and may change bi_dev and * bi_sector for remaps as it sees fit. So the values of these fields * should NOT be depended on after the call to generic_make_request. - * - * */ + */ void generic_make_request(struct bio *bio) { request_queue_t *q; @@ -2415,6 +2425,19 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) clear_queue_congested(q, WRITE); + if (rl->count[READ] >= q->nr_requests) { + blk_set_queue_full(q, READ); + } else if (rl->count[READ] <= batch_requests(q)) { + blk_clear_queue_full(q, READ); + wake_up_all(&rl->wait[READ]); + } + + if (rl->count[WRITE] >= q->nr_requests) { + blk_set_queue_full(q, WRITE); + } else if (rl->count[WRITE] <= batch_requests(q)) { + blk_clear_queue_full(q, WRITE); + wake_up_all(&rl->wait[WRITE]); + } return ret; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4295d60bf661..d3a8f6ecd806 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -307,6 +307,8 @@ struct request_queue #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ +#define QUEUE_FLAG_READFULL 3 /* write queue has been filled */ +#define QUEUE_FLAG_WRITEFULL 4 /* read queue has been filled */ #define blk_queue_plugged(q) !list_empty(&(q)->plug_list) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) @@ -322,6 +324,30 @@ struct request_queue #define rq_data_dir(rq) ((rq)->flags & 1) +static inline int blk_queue_full(struct request_queue *q, int rw) +{ + if (rw == READ) + return test_bit(QUEUE_FLAG_READFULL, &q->queue_flags); + return test_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); +} + +static inline void blk_set_queue_full(struct request_queue *q, int rw) +{ + if (rw == READ) + set_bit(QUEUE_FLAG_READFULL, &q->queue_flags); + else + set_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); +} + +static inline void blk_clear_queue_full(struct request_queue *q, int rw) +{ + if (rw == READ) + clear_bit(QUEUE_FLAG_READFULL, &q->queue_flags); + else + 
clear_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); +} + + /* * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may * it already be started by driver. -- cgit v1.2.3 From 16f88dbdbffa3dc52b959706e6a311a932b51ed6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:37:19 -0700 Subject: [PATCH] generic io contexts From: Nick Piggin Generalise the AS-specific per-process IO context so that other IO schedulers could use it. --- drivers/block/as-iosched.c | 254 +++++++++++++++++---------------------------- drivers/block/ll_rw_blk.c | 88 ++++++++++++++++ include/linux/blkdev.h | 44 ++++++++ include/linux/sched.h | 6 +- kernel/exit.c | 4 +- kernel/fork.c | 2 +- 6 files changed, 233 insertions(+), 165 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c index 2e5e64fb3b39..d63c92dfcf96 100644 --- a/drivers/block/as-iosched.c +++ b/drivers/block/as-iosched.c @@ -59,14 +59,6 @@ */ #define default_antic_expire ((HZ / 150) ? HZ / 150 : 1) -/* - * This is the per-process anticipatory I/O scheduler state. It is refcounted - * and kmalloc'ed. - * - * There is no locking protecting the contents of this structure! Pointers - * to a single as_io_context may appear in multiple queues at once. - */ - /* * Keep track of up to 20ms thinktimes. We can go as big as we like here, * however huge values tend to interfere and not decay fast enough. A program @@ -82,28 +74,6 @@ enum as_io_states { AS_TASK_IORUNNING, /* Process has completed some IO */ }; -struct as_io_context { - atomic_t refcount; - pid_t pid; - unsigned long state; - atomic_t nr_queued; /* queued reads & sync writes */ - atomic_t nr_dispatched; /* number of requests gone to the drivers */ - - spinlock_t lock; - - /* IO History tracking */ - /* Thinktime */ - unsigned long last_end_request; - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; - /* Layout pattern */ - long seek_samples; - sector_t last_request_pos; - sector_t seek_total; - sector_t seek_mean; -}; - enum anticipation_status { ANTIC_OFF=0, /* Not anticipating (normal operation) */ ANTIC_WAIT_REQ, /* The last read has not yet completed */ @@ -144,8 +114,8 @@ struct as_data { unsigned long antic_start; /* jiffies: when it started */ struct timer_list antic_timer; /* anticipatory scheduling timer */ struct work_struct antic_work; /* Deferred unplugging */ - struct as_io_context *as_io_context;/* Identify the expected process */ - int aic_finished; /* IO associated with as_io_context finished */ + struct io_context *io_context; /* Identify the expected process */ + int ioc_finished; /* IO associated with io_context is finished */ int nr_dispatched; /* @@ -178,7 +148,7 @@ struct as_rq { struct request *request; - struct as_io_context *as_io_context; /* The submitting task */ + struct io_context *io_context; /* The submitting task */ /* * request hash, key is the ending offset (for back merge lookup) @@ -206,99 +176,55 @@ static kmem_cache_t *arq_pool; /* Debug */ static atomic_t nr_as_io_requests = ATOMIC_INIT(0); -static void put_as_io_context(struct as_io_context **paic) +/* Called to deallocate the as_io_context */ +static void free_as_io_context(struct as_io_context *aic) { - struct as_io_context *aic = *paic; - - if (aic == NULL) - return; - - BUG_ON(atomic_read(&aic->refcount) == 0); - - if (atomic_dec_and_test(&aic->refcount)) { - WARN_ON(atomic_read(&nr_as_io_requests) == 0); - atomic_dec(&nr_as_io_requests); - kfree(aic); - } + atomic_dec(&nr_as_io_requests); 
+ kfree(aic); } -/* Called by the exitting task */ -void exit_as_io_context(void) +/* Called when the task exits */ +static void exit_as_io_context(struct as_io_context *aic) { - unsigned long flags; - struct as_io_context *aic; - - local_irq_save(flags); - aic = current->as_io_context; - if (aic) { - clear_bit(AS_TASK_RUNNING, &aic->state); - put_as_io_context(&aic); - current->as_io_context = NULL; - } - local_irq_restore(flags); + clear_bit(AS_TASK_RUNNING, &aic->state); } -/* - * If the current task has no IO context then create one and initialise it. - * If it does have a context, take a ref on it. - * - * This is always called in the context of the task which submitted the I/O. - * But weird things happen, so we disable local interrupts to ensure exclusive - * access to *current. - */ -static struct as_io_context *get_as_io_context(void) +static struct as_io_context *alloc_as_io_context(void) { - struct task_struct *tsk = current; - unsigned long flags; struct as_io_context *ret; - local_irq_save(flags); - ret = tsk->as_io_context; - if (ret == NULL) { - ret = kmalloc(sizeof(*ret), GFP_ATOMIC); - if (ret) { - atomic_inc(&nr_as_io_requests); - atomic_set(&ret->refcount, 1); - ret->pid = tsk->pid; - ret->state = 1 << AS_TASK_RUNNING; - atomic_set(&ret->nr_queued, 0); - atomic_set(&ret->nr_dispatched, 0); - spin_lock_init(&ret->lock); - ret->ttime_total = 0; - ret->ttime_samples = 0; - ret->ttime_mean = 0; - ret->seek_total = 0; - ret->seek_samples = 0; - ret->seek_mean = 0; - tsk->as_io_context = ret; - } + ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + if (ret) { + atomic_inc(&nr_as_io_requests); + ret->dtor = free_as_io_context; + ret->exit = exit_as_io_context; + ret->state = 1 << AS_TASK_RUNNING; + atomic_set(&ret->nr_queued, 0); + atomic_set(&ret->nr_dispatched, 0); + spin_lock_init(&ret->lock); + ret->ttime_total = 0; + ret->ttime_samples = 0; + ret->ttime_mean = 0; + ret->seek_total = 0; + ret->seek_samples = 0; + ret->seek_mean = 0; } - local_irq_restore(flags); - atomic_inc(&ret->refcount); + return ret; } -static void -copy_as_io_context(struct as_io_context **pdst, struct as_io_context **psrc) +/* + * If the current task has no AS IO context then create one and initialise it. + * Then take a ref on the task's io context and return it. 
+ */ +static struct io_context *as_get_io_context(void) { - struct as_io_context *src = *psrc; - - if (src) { - BUG_ON(atomic_read(&src->refcount) == 0); - atomic_inc(&src->refcount); - put_as_io_context(pdst); - *pdst = src; - } + struct io_context *ioc = get_io_context(); + if (ioc && !ioc->aic) + ioc->aic = alloc_as_io_context(); + return ioc; } -static void -swap_as_io_context(struct as_io_context **aic1, struct as_io_context **aic2) -{ - struct as_io_context *temp; - temp = *aic1; - *aic1 = *aic2; - *aic2 = temp; -} /* * the back merge hash support functions @@ -662,7 +588,7 @@ static void as_antic_waitreq(struct as_data *ad) { BUG_ON(ad->antic_status == ANTIC_FINISHED); if (ad->antic_status == ANTIC_OFF) { - if (!ad->as_io_context || ad->aic_finished) + if (!ad->io_context || ad->ioc_finished) as_antic_waitnext(ad); else ad->antic_status = ANTIC_WAIT_REQ; @@ -715,7 +641,7 @@ static int as_close_req(struct as_data *ad, struct as_rq *arq) sector_t next = arq->request->sector; sector_t delta; /* acceptable close offset (in sectors) */ - if (ad->antic_status == ANTIC_OFF || !ad->aic_finished) + if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished) delay = 0; else delay = ((jiffies - ad->antic_start) * 1000) / HZ; @@ -745,6 +671,7 @@ static int as_close_req(struct as_data *ad, struct as_rq *arq) */ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) { + struct io_context *ioc; struct as_io_context *aic; if (arq && arq->is_sync == REQ_SYNC && as_close_req(ad, arq)) { @@ -752,7 +679,7 @@ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) return 1; } - if (ad->aic_finished && as_antic_expired(ad)) { + if (ad->ioc_finished && as_antic_expired(ad)) { /* * In this situation status should really be FINISHED, * however the timer hasn't had the chance to run yet. 
@@ -760,14 +687,18 @@ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) return 1; } - aic = ad->as_io_context; - BUG_ON(!aic); + ioc = ad->io_context; + BUG_ON(!ioc); - if (arq && aic == arq->as_io_context) { + if (arq && ioc == arq->io_context) { /* request from same process */ return 1; } + aic = ioc->aic; + if (!aic) + return 0; + if (!test_bit(AS_TASK_RUNNING, &aic->state)) { /* process anticipated on has exitted */ return 1; @@ -810,7 +741,7 @@ static int as_can_break_anticipation(struct as_data *ad, struct as_rq *arq) */ static int as_can_anticipate(struct as_data *ad, struct as_rq *arq) { - if (!ad->as_io_context) + if (!ad->io_context) /* * Last request submitted was a write */ @@ -973,12 +904,10 @@ static void as_completed_request(request_queue_t *q, struct request *rq) { struct as_data *ad = q->elevator.elevator_data; struct as_rq *arq = RQ_DATA(rq); - struct as_io_context *aic = arq->as_io_context; + struct as_io_context *aic; - if (unlikely(!blk_fs_request(rq))) { - WARN_ON(aic); + if (unlikely(!blk_fs_request(rq))) return; - } WARN_ON(blk_fs_request(rq) && arq->state == AS_RQ_NEW); @@ -1004,18 +933,12 @@ static void as_completed_request(request_queue_t *q, struct request *rq) ad->changed_batch = 0; } - if (!aic) + if (!arq->io_context) return; - spin_lock(&aic->lock); - if (arq->is_sync == REQ_SYNC) { - set_bit(AS_TASK_IORUNNING, &aic->state); - aic->last_end_request = jiffies; - } - - if (ad->as_io_context == aic) { + if (ad->io_context == arq->io_context) { ad->antic_start = jiffies; - ad->aic_finished = 1; + ad->ioc_finished = 1; if (ad->antic_status == ANTIC_WAIT_REQ) { /* * We were waiting on this request, now anticipate @@ -1024,9 +947,19 @@ static void as_completed_request(request_queue_t *q, struct request *rq) as_antic_waitnext(ad); } } + + aic = arq->io_context->aic; + if (!aic) + return; + + spin_lock(&aic->lock); + if (arq->is_sync == REQ_SYNC) { + set_bit(AS_TASK_IORUNNING, &aic->state); + aic->last_end_request = jiffies; + } spin_unlock(&aic->lock); - put_as_io_context(&arq->as_io_context); + put_io_context(arq->io_context); } /* @@ -1047,9 +980,9 @@ static void as_remove_queued_request(request_queue_t *q, struct request *rq) WARN_ON(arq->state != AS_RQ_QUEUED); - if (arq->as_io_context) { - BUG_ON(!atomic_read(&arq->as_io_context->nr_queued)); - atomic_dec(&arq->as_io_context->nr_queued); + if (arq->io_context && arq->io_context->aic) { + BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued)); + atomic_dec(&arq->io_context->aic->nr_queued); } /* @@ -1082,10 +1015,12 @@ static void as_remove_dispatched_request(request_queue_t *q, struct request *rq) WARN_ON(arq->state != AS_RQ_DISPATCHED); WARN_ON(ON_RB(&arq->rb_node)); - aic = arq->as_io_context; - if (aic) { - WARN_ON(!atomic_read(&aic->nr_dispatched)); - atomic_dec(&aic->nr_dispatched); + if (arq->io_context && arq->io_context->aic) { + aic = arq->io_context->aic; + if (aic) { + WARN_ON(!atomic_read(&aic->nr_dispatched)); + atomic_dec(&aic->nr_dispatched); + } } } /* @@ -1180,17 +1115,17 @@ static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq) if (data_dir == REQ_SYNC) { /* In case we have to anticipate after this */ - copy_as_io_context(&ad->as_io_context, &arq->as_io_context); + copy_io_context(&ad->io_context, &arq->io_context); } else { - if (ad->as_io_context) { - put_as_io_context(&ad->as_io_context); - ad->as_io_context = NULL; + if (ad->io_context) { + put_io_context(ad->io_context); + ad->io_context = NULL; } if (ad->current_write_count != 0) 
ad->current_write_count--; } - ad->aic_finished = 0; + ad->ioc_finished = 0; ad->next_arq[data_dir] = as_find_next_arq(ad, arq); @@ -1199,8 +1134,8 @@ static void as_move_to_dispatch(struct as_data *ad, struct as_rq *arq) */ as_remove_queued_request(ad->q, arq->request); list_add_tail(&arq->request->queuelist, ad->dispatch); - if (arq->as_io_context) - atomic_inc(&arq->as_io_context->nr_dispatched); + if (arq->io_context && arq->io_context->aic) + atomic_inc(&arq->io_context->aic->nr_dispatched); WARN_ON(arq->state != AS_RQ_QUEUED); arq->state = AS_RQ_DISPATCHED; @@ -1355,11 +1290,11 @@ static void as_add_request(struct as_data *ad, struct as_rq *arq) arq->is_sync = 0; data_dir = arq->is_sync; - arq->as_io_context = get_as_io_context(); + arq->io_context = as_get_io_context(); - if (arq->as_io_context) { - atomic_inc(&arq->as_io_context->nr_queued); - as_update_iohist(arq->as_io_context, arq->request); + if (arq->io_context && arq->io_context->aic) { + atomic_inc(&arq->io_context->aic->nr_queued); + as_update_iohist(arq->io_context->aic, arq->request); } as_add_arq_rb(ad, arq); @@ -1575,8 +1510,7 @@ as_merged_requests(request_queue_t *q, struct request *req, * Don't copy here but swap, because when anext is * removed below, it must contain the unused context */ - swap_as_io_context(&arq->as_io_context, - &anext->as_io_context); + swap_io_context(&arq->io_context, &anext->io_context); } } @@ -1584,7 +1518,7 @@ as_merged_requests(request_queue_t *q, struct request *req, * kill knowledge of next, this one is a goner */ as_remove_queued_request(q, next); - put_as_io_context(&anext->as_io_context); + put_io_context(anext->io_context); } /* @@ -1630,7 +1564,7 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) RB_CLEAR(&arq->rb_node); arq->request = rq; arq->state = AS_RQ_NEW; - arq->as_io_context = NULL; + arq->io_context = NULL; INIT_LIST_HEAD(&arq->hash); arq->hash_valid_count = 0; INIT_LIST_HEAD(&arq->fifo); @@ -1643,16 +1577,18 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask) static int as_may_queue(request_queue_t *q, int rw) { + int ret = 0; struct as_data *ad = q->elevator.elevator_data; - struct as_io_context *aic; + struct io_context *ioc; if (ad->antic_status == ANTIC_WAIT_REQ || ad->antic_status == ANTIC_WAIT_NEXT) { - aic = get_as_io_context(); - if (ad->as_io_context == aic) - return 1; + ioc = as_get_io_context(); + if (ad->io_context == ioc) + ret = 1; + put_io_context(ioc); } - return 0; + return ret; } static void as_exit(request_queue_t *q, elevator_t *e) @@ -1666,7 +1602,7 @@ static void as_exit(request_queue_t *q, elevator_t *e) BUG_ON(!list_empty(&ad->fifo_list[REQ_ASYNC])); mempool_destroy(ad->arq_pool); - put_as_io_context(&ad->as_io_context); + put_io_context(ad->io_context); kfree(ad->hash); kfree(ad); } diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index f7981c1fa3e6..8f44b5690d9a 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1318,6 +1318,7 @@ get_request(request_queue_t *q, int rw, int gfp_mask, int force) spin_lock_irq(q->queue_lock); if (rl->count[rw] == q->nr_requests) blk_set_queue_full(q, rw); + if (blk_queue_full(q, rw) && !force && !elv_may_queue(q, rw)) { spin_unlock_irq(q->queue_lock); goto out; @@ -2377,6 +2378,93 @@ int __init blk_dev_init(void) return 0; } + +/* + * IO Context helper functions + */ +void put_io_context(struct io_context *ioc) +{ + if (ioc == NULL) + return; + + BUG_ON(atomic_read(&ioc->refcount) == 0); + + if 
(atomic_dec_and_test(&ioc->refcount)) { + if (ioc->aic && ioc->aic->dtor) + ioc->aic->dtor(ioc->aic); + kfree(ioc); + } +} + +/* Called by the exitting task */ +void exit_io_context(void) +{ + unsigned long flags; + struct io_context *ioc; + + local_irq_save(flags); + ioc = current->io_context; + if (ioc) { + if (ioc->aic && ioc->aic->exit) + ioc->aic->exit(ioc->aic); + put_io_context(ioc); + current->io_context = NULL; + } + local_irq_restore(flags); +} + +/* + * If the current task has no IO context then create one and initialise it. + * If it does have a context, take a ref on it. + * + * This is always called in the context of the task which submitted the I/O. + * But weird things happen, so we disable local interrupts to ensure exclusive + * access to *current. + */ +struct io_context *get_io_context(void) +{ + struct task_struct *tsk = current; + unsigned long flags; + struct io_context *ret; + + local_irq_save(flags); + ret = tsk->io_context; + if (ret == NULL) { + ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + if (ret) { + atomic_set(&ret->refcount, 1); + ret->pid = tsk->pid; + ret->aic = NULL; + tsk->io_context = ret; + } + } + local_irq_restore(flags); + atomic_inc(&ret->refcount); + return ret; +} + +void copy_io_context(struct io_context **pdst, struct io_context **psrc) +{ + struct io_context *src = *psrc; + struct io_context *dst = *pdst; + + if (src) { + BUG_ON(atomic_read(&src->refcount) == 0); + atomic_inc(&src->refcount); + put_io_context(dst); + *pdst = src; + } +} + +void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) +{ + struct io_context *temp; + temp = *ioc1; + *ioc1 = *ioc2; + *ioc2 = temp; +} + + /* * sysfs parts below */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d3a8f6ecd806..13116a7a7969 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -24,6 +24,50 @@ struct request_pm_state; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ +/* + * This is the per-process anticipatory I/O scheduler state. + */ +struct as_io_context { + spinlock_t lock; + + void (*dtor)(struct as_io_context *aic); /* destructor */ + void (*exit)(struct as_io_context *aic); /* called on task exit */ + + unsigned long state; + atomic_t nr_queued; /* queued reads & sync writes */ + atomic_t nr_dispatched; /* number of requests gone to the drivers */ + + /* IO History tracking */ + /* Thinktime */ + unsigned long last_end_request; + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; + /* Layout pattern */ + long seek_samples; + sector_t last_request_pos; + sector_t seek_total; + sector_t seek_mean; +}; + +/* + * This is the per-process I/O subsystem state. It is refcounted and + * kmalloc'ed. Currently all fields are modified in process io context + * (apart from the atomic refcount), so require no locking. 
+ */ +struct io_context { + atomic_t refcount; + pid_t pid; + + struct as_io_context *aic; +}; + +void put_io_context(struct io_context *ioc); +void exit_io_context(void); +struct io_context *get_io_context(void); +void copy_io_context(struct io_context **pdst, struct io_context **psrc); +void swap_io_context(struct io_context **ioc1, struct io_context **ioc2); + struct request_list { int count[2]; mempool_t *rq_pool; diff --git a/include/linux/sched.h b/include/linux/sched.h index e29f9606c2aa..750f2a12cada 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -321,8 +321,8 @@ struct k_itimer { }; -struct as_io_context; /* Anticipatory scheduler */ -void exit_as_io_context(void); +struct io_context; /* See blkdev.h */ +void exit_io_context(void); struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ @@ -452,7 +452,7 @@ struct task_struct { struct dentry *proc_dentry; struct backing_dev_info *backing_dev_info; - struct as_io_context *as_io_context; + struct io_context *io_context; unsigned long ptrace_message; siginfo_t *last_siginfo; /* For ptrace use. */ diff --git a/kernel/exit.c b/kernel/exit.c index 8471381546af..ebc839b645a7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -682,8 +682,8 @@ NORET_TYPE void do_exit(long code) panic("Attempted to kill the idle task!"); if (unlikely(tsk->pid == 1)) panic("Attempted to kill init!"); - if (tsk->as_io_context) - exit_as_io_context(); + if (tsk->io_context) + exit_io_context(); tsk->flags |= PF_EXITING; del_timer_sync(&tsk->real_timer); diff --git a/kernel/fork.c b/kernel/fork.c index fcdc884cd894..96ce3385cc75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -864,7 +864,7 @@ struct task_struct *copy_process(unsigned long clone_flags, p->lock_depth = -1; /* -1 = no lock */ p->start_time = get_jiffies_64(); p->security = NULL; - p->as_io_context = NULL; + p->io_context = NULL; retval = -ENOMEM; if ((retval = security_task_alloc(p))) -- cgit v1.2.3 From 930805a244eaadb5aefbc08b558db72136128388 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:37:26 -0700 Subject: [PATCH] block request batching From: Nick Piggin The following patch gets batching working how it should be. After a process is woken up, it is allowed to allocate up to 32 requests for 20ms. It does not stop other processes submitting requests if it isn't submitting though. This should allow fewer context switches, and allow batches of requests from each process to be sent to the io scheduler instead of 1 request from each process. tiobench sequential writes are more than tripled, random writes are nearly doubled over mm1. In earlier tests I generally saw better CPU efficiency but it doesn't show here. There is still debug to be taken out. It's also only on UP.
                             Avg       Maximum    Lat%    Lat%   CPU
Identifier           Rate   (CPU%)    Latency    Latency  >2s    >10s  Eff

Sequential Reads
------------------- ------ --------- ---------- ------- ------ ----
-2.5.71-mm1          11.13  3.783%       46.10  24668.01  0.84   0.02  294
+2.5.71-mm1          13.21  4.489%       37.37   5691.66  0.76   0.00  294

Random Reads
------------------- ------ --------- ---------- ------- ------ ----
-2.5.71-mm1           0.97  0.582%      519.86   6444.66 11.93   0.00  167
+2.5.71-mm1           1.01  0.604%      484.59   6604.93 10.73   0.00  167

Sequential Writes
------------------- ------ --------- ---------- ------- ------ ----
-2.5.71-mm1           4.85  4.456%       77.80  99359.39  0.18   0.13  109
+2.5.71-mm1          14.11  14.19%       10.07  22805.47  0.09   0.04   99

Random Writes
------------------- ------ --------- ---------- ------- ------ ----
-2.5.71-mm1           0.46  0.371%       14.48   6173.90  0.23   0.00  125
+2.5.71-mm1           0.86  0.744%       24.08   8753.66  0.31   0.00  115

It decreases context switch rate on IBM's 8-way on ext2 tiobench 64 threads from ~2500/s to ~140/s on their regression tests. --- drivers/block/ll_rw_blk.c | 132 ++++++++++++++++++++++++++++++---------------- include/linux/blkdev.h | 6 +++ 2 files changed, 94 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 8f44b5690d9a..633266ee8c87 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -51,10 +51,11 @@ static struct workqueue_struct *kblockd_workqueue; unsigned long blk_max_low_pfn, blk_max_pfn; -static inline int batch_requests(struct request_queue *q) -{ - return q->nr_requests - min(q->nr_requests / 8, 8UL) - 1; -} +/* Amount of time in which a process may batch requests */ +#define BLK_BATCH_TIME (HZ/50UL) + +/* Number of requests a "batching" process may submit */ +#define BLK_BATCH_REQ 32 /* * Return the threshold (number of used requests) at which the queue is
+ */ +static void freed_request(request_queue_t *q, int rw) +{ + struct request_list *rl = &q->rq; + + rl->count[rw]--; + if (rl->count[rw] < queue_congestion_off_threshold(q)) + clear_queue_congested(q, rw); + if (rl->count[rw]+1 <= q->nr_requests) { + smp_mb(); + if (waitqueue_active(&rl->wait[rw])) + wake_up(&rl->wait[rw]); + if (!waitqueue_active(&rl->wait[rw])) + blk_clear_queue_full(q, rw); + } +} + #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) /* * Get a free request, queue_lock must not be held */ -static struct request * -get_request(request_queue_t *q, int rw, int gfp_mask, int force) +static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) { struct request *rq = NULL; struct request_list *rl = &q->rq; + struct io_context *ioc = get_io_context(); spin_lock_irq(q->queue_lock); - if (rl->count[rw] == q->nr_requests) - blk_set_queue_full(q, rw); + if (rl->count[rw]+1 >= q->nr_requests) { + if (!blk_queue_full(q, rw)) { + ioc_set_batching(ioc); + blk_set_queue_full(q, rw); + } + } - if (blk_queue_full(q, rw) && !force && !elv_may_queue(q, rw)) { + if (blk_queue_full(q, rw) + && !ioc_batching(ioc) && !elv_may_queue(q, rw)) { spin_unlock_irq(q->queue_lock); goto out; } + rl->count[rw]++; if (rl->count[rw] >= queue_congestion_on_threshold(q)) set_queue_congested(q, rw); @@ -1331,20 +1384,13 @@ get_request(request_queue_t *q, int rw, int gfp_mask, int force) rq = blk_alloc_request(q, gfp_mask); if (!rq) { spin_lock_irq(q->queue_lock); - rl->count[rw]--; - if (rl->count[rw] < queue_congestion_off_threshold(q)) - clear_queue_congested(q, rw); - - if (rl->count[rw] <= batch_requests(q)) { - if (waitqueue_active(&rl->wait[rw])) - wake_up(&rl->wait[rw]); - else - blk_clear_queue_full(q, rw); - } - + freed_request(q, rw); spin_unlock_irq(q->queue_lock); goto out; } + + if (ioc_batching(ioc)) + ioc->nr_batch_requests--; INIT_LIST_HEAD(&rq->queuelist); @@ -1367,6 +1413,7 @@ get_request(request_queue_t *q, int rw, int gfp_mask, int force) rq->sense = NULL; out: + put_io_context(ioc); return rq; } @@ -1378,7 +1425,6 @@ static struct request *get_request_wait(request_queue_t *q, int rw) { DEFINE_WAIT(wait); struct request *rq; - int waited = 0; generic_unplug_device(q); do { @@ -1387,11 +1433,15 @@ static struct request *get_request_wait(request_queue_t *q, int rw) prepare_to_wait_exclusive(&rl->wait[rw], &wait, TASK_UNINTERRUPTIBLE); - rq = get_request(q, rw, GFP_NOIO, waited); + rq = get_request(q, rw, GFP_NOIO); if (!rq) { + struct io_context *ioc; + io_schedule(); - waited = 1; + ioc = get_io_context(); + ioc_set_batching(ioc); + put_io_context(ioc); } finish_wait(&rl->wait[rw], &wait); } while (!rq); @@ -1408,7 +1458,7 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask) if (gfp_mask & __GFP_WAIT) rq = get_request_wait(q, rw); else - rq = get_request(q, rw, gfp_mask, 0); + rq = get_request(q, rw, gfp_mask); return rq; } @@ -1555,17 +1605,7 @@ void __blk_put_request(request_queue_t *q, struct request *req) BUG_ON(!list_empty(&req->queuelist)); blk_free_request(q, req); - - rl->count[rw]--; - if (rl->count[rw] < queue_congestion_off_threshold(q)) - clear_queue_congested(q, rw); - - if (rl->count[rw] <= batch_requests(q)) { - if (waitqueue_active(&rl->wait[rw])) - wake_up(&rl->wait[rw]); - else - blk_clear_queue_full(q, rw); - } + freed_request(q, rw); } } @@ -1808,7 +1848,7 @@ get_rq: freereq = NULL; } else { spin_unlock_irq(q->queue_lock); - if ((freereq = get_request(q, rw, GFP_ATOMIC, 0)) == NULL) { + if ((freereq 
= get_request(q, rw, GFP_ATOMIC)) == NULL) { /* * READA bit set */ @@ -1852,13 +1892,12 @@ out: __blk_put_request(q, freereq); if (blk_queue_plugged(q)) { - int nr_queued = q->rq.count[0] + q->rq.count[1]; + int nr_queued = q->rq.count[READ] + q->rq.count[WRITE]; if (nr_queued == q->unplug_thresh) __generic_unplug_device(q); } spin_unlock_irq(q->queue_lock); - return 0; end_io: @@ -1866,7 +1905,6 @@ end_io: return 0; } - /* * If bio->bi_dev is a partition, remap the location */ @@ -2378,6 +2416,7 @@ int __init blk_dev_init(void) return 0; } +static atomic_t nr_io_contexts = ATOMIC_INIT(0); /* * IO Context helper functions @@ -2393,6 +2432,7 @@ void put_io_context(struct io_context *ioc) if (ioc->aic && ioc->aic->dtor) ioc->aic->dtor(ioc->aic); kfree(ioc); + atomic_dec(&nr_io_contexts); } } @@ -2409,7 +2449,8 @@ void exit_io_context(void) ioc->aic->exit(ioc->aic); put_io_context(ioc); current->io_context = NULL; - } + } else + WARN_ON(1); local_irq_restore(flags); } @@ -2432,8 +2473,11 @@ struct io_context *get_io_context(void) if (ret == NULL) { ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (ret) { + atomic_inc(&nr_io_contexts); atomic_set(&ret->refcount, 1); ret->pid = tsk->pid; + ret->last_waited = jiffies; /* doesn't matter... */ + ret->nr_batch_requests = 0; /* because this is 0 */ ret->aic = NULL; tsk->io_context = ret; } @@ -2515,16 +2559,16 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (rl->count[READ] >= q->nr_requests) { blk_set_queue_full(q, READ); - } else if (rl->count[READ] <= batch_requests(q)) { + } else if (rl->count[READ]+1 <= q->nr_requests) { blk_clear_queue_full(q, READ); - wake_up_all(&rl->wait[READ]); + wake_up(&rl->wait[READ]); } if (rl->count[WRITE] >= q->nr_requests) { blk_set_queue_full(q, WRITE); - } else if (rl->count[WRITE] <= batch_requests(q)) { + } else if (rl->count[WRITE]+1 <= q->nr_requests) { blk_clear_queue_full(q, WRITE); - wake_up_all(&rl->wait[WRITE]); + wake_up(&rl->wait[WRITE]); } return ret; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 13116a7a7969..69178ca80d7d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -59,6 +59,12 @@ struct io_context { atomic_t refcount; pid_t pid; + /* + * For request batching + */ + unsigned long last_waited; /* Time last woken after wait for request */ + int nr_batch_requests; /* Number of requests left in the batch */ + struct as_io_context *aic; }; -- cgit v1.2.3 From 07581dd2bdd67146d13a61ca6506c6c8b694666a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:37:34 -0700 Subject: [PATCH] get_io_context fixes - pass gfp_flags to get_io_context(): not all callers are forced to use GFP_ATOMIC(). - fix locking in get_io_context(): bump the refcount while in the exclusive region. - don't go oops in get_io_context() if the kmalloc failed. - in as_get_io_context(): fail the whole thing if we were unable to allocate the AS-specific part.
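The ordering that the two middle fixes establish is subtle enough to be worth spelling out. Below is a minimal userspace C sketch of the pattern, not the kernel code: the names are invented, a pthread mutex stands in for the local_irq_save()-style exclusion on *current, and malloc()/free() stand in for kmalloc()/kfree(). It shows allocation failure propagating to the caller as NULL instead of being dereferenced, and the caller's reference being taken while still inside the exclusive region, so the pointer cannot go away between being published and being pinned.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for struct io_context. */
struct ctx {
	atomic_int refcount;
};

static pthread_mutex_t ctx_lock = PTHREAD_MUTEX_INITIALIZER;
static struct ctx *task_ctx;		/* stand-in for current->io_context */

/* Lazily allocate the context; take the caller's ref while exclusive. */
static struct ctx *get_ctx(void)
{
	struct ctx *ret;

	pthread_mutex_lock(&ctx_lock);		/* kernel: local_irq_save() */
	ret = task_ctx;
	if (ret == NULL) {
		ret = malloc(sizeof(*ret));
		if (ret) {
			atomic_init(&ret->refcount, 1);	/* the task's own ref */
			task_ctx = ret;
		}
	}
	if (ret)	/* don't crash on a failed allocation */
		atomic_fetch_add(&ret->refcount, 1);	/* the caller's ref */
	pthread_mutex_unlock(&ctx_lock);
	return ret;
}

/* Drop a reference; free on the last one. */
static void put_ctx(struct ctx *c)
{
	if (c && atomic_fetch_sub(&c->refcount, 1) == 1)
		free(c);
}

int main(void)
{
	struct ctx *c = get_ctx();

	if (c) {
		printf("context acquired, refcount is now 2\n");
		put_ctx(c);	/* drops the caller's ref; the task's remains */
	}
	return 0;
}

Taking the reference after releasing exclusion, roughly what the pre-fix code did, leaves a window in which the last reference could be dropped before the increment lands; the "while in the exclusive region" item closes exactly that window.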
- as_remove_queued_request() cleanup --- drivers/block/as-iosched.c | 50 ++++++++++++++++++++++------------------------ drivers/block/ll_rw_blk.c | 9 +++++---- include/linux/blkdev.h | 2 +- 3 files changed, 30 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c index d63c92dfcf96..b19289348fb0 100644 --- a/drivers/block/as-iosched.c +++ b/drivers/block/as-iosched.c @@ -219,13 +219,17 @@ static struct as_io_context *alloc_as_io_context(void) */ static struct io_context *as_get_io_context(void) { - struct io_context *ioc = get_io_context(); - if (ioc && !ioc->aic) + struct io_context *ioc = get_io_context(GFP_ATOMIC); + if (ioc && !ioc->aic) { ioc->aic = alloc_as_io_context(); + if (!ioc->aic) { + put_io_context(ioc); + ioc = NULL; + } + } return ioc; } - /* * the back merge hash support functions */ @@ -971,32 +975,26 @@ static void as_completed_request(request_queue_t *q, struct request *rq) static void as_remove_queued_request(request_queue_t *q, struct request *rq) { struct as_rq *arq = RQ_DATA(rq); + const int data_dir = arq->is_sync; + struct as_data *ad = q->elevator.elevator_data; - if (!arq) - BUG(); - else { - const int data_dir = arq->is_sync; - struct as_data *ad = q->elevator.elevator_data; - - WARN_ON(arq->state != AS_RQ_QUEUED); - - if (arq->io_context && arq->io_context->aic) { - BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued)); - atomic_dec(&arq->io_context->aic->nr_queued); - } - - /* - * Update the "next_arq" cache if we are about to remove its - * entry - */ - if (ad->next_arq[data_dir] == arq) - ad->next_arq[data_dir] = as_find_next_arq(ad, arq); + WARN_ON(arq->state != AS_RQ_QUEUED); - list_del_init(&arq->fifo); - as_remove_merge_hints(q, arq); - as_del_arq_rb(ad, arq); + if (arq->io_context && arq->io_context->aic) { + BUG_ON(!atomic_read(&arq->io_context->aic->nr_queued)); + atomic_dec(&arq->io_context->aic->nr_queued); } + /* + * Update the "next_arq" cache if we are about to remove its + * entry + */ + if (ad->next_arq[data_dir] == arq) + ad->next_arq[data_dir] = as_find_next_arq(ad, arq); + + list_del_init(&arq->fifo); + as_remove_merge_hints(q, arq); + as_del_arq_rb(ad, arq); } /* @@ -1292,7 +1290,7 @@ static void as_add_request(struct as_data *ad, struct as_rq *arq) arq->io_context = as_get_io_context(); - if (arq->io_context && arq->io_context->aic) { + if (arq->io_context) { atomic_inc(&arq->io_context->aic->nr_queued); as_update_iohist(arq->io_context->aic, arq->request); } diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 633266ee8c87..13cc6073bb47 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1360,7 +1360,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask) { struct request *rq = NULL; struct request_list *rl = &q->rq; - struct io_context *ioc = get_io_context(); + struct io_context *ioc = get_io_context(gfp_mask); spin_lock_irq(q->queue_lock); if (rl->count[rw]+1 >= q->nr_requests) { @@ -1439,7 +1439,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw) struct io_context *ioc; io_schedule(); - ioc = get_io_context(); + ioc = get_io_context(GFP_NOIO); ioc_set_batching(ioc); put_io_context(ioc); } @@ -2462,7 +2462,7 @@ void exit_io_context(void) * But weird things happen, so we disable local interrupts to ensure exclusive * access to *current. 
*/ -struct io_context *get_io_context(void) +struct io_context *get_io_context(int gfp_flags) { struct task_struct *tsk = current; unsigned long flags; @@ -2482,8 +2482,9 @@ struct io_context *get_io_context(void) tsk->io_context = ret; } } + if (ret) + atomic_inc(&ret->refcount); local_irq_restore(flags); - atomic_inc(&ret->refcount); return ret; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 69178ca80d7d..2e7f92aa1dc2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -70,7 +70,7 @@ struct io_context { void put_io_context(struct io_context *ioc); void exit_io_context(void); -struct io_context *get_io_context(void); +struct io_context *get_io_context(int gfp_flags); void copy_io_context(struct io_context **pdst, struct io_context **psrc); void swap_io_context(struct io_context **ioc1, struct io_context **ioc2); -- cgit v1.2.3 From 0d98604b2849f0449b15bf0cb90654e949db4cb8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Jul 2003 19:38:00 -0700 Subject: [PATCH] epoll: microoptimisations From: Davide Libenzi - Inline eventpoll_release() so that __fput() does not need to call into epoll code if the file itself is not registered inside an epoll fd - Add inclusion due to __u32 and __u64 usage - Fix debug printf that would otherwise panic if enabled with the new epoll code --- fs/eventpoll.c | 30 +++++++++--------------------- include/linux/eventpoll.h | 35 +++++++++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index b17cdae0109c..d6fef13ef5e0 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -443,28 +443,16 @@ void eventpoll_init_file(struct file *file) /* - * This is called from inside fs/file_table.c:__fput() to unlink files - * from the eventpoll interface. We need to have this facility to cleanup - * correctly files that are closed without being removed from the eventpoll - * interface. + * This is called from eventpoll_release() to unlink files from the eventpoll + * interface. We need to have this facility to cleanup correctly files that are + * closed without being removed from the eventpoll interface. */ -void eventpoll_release(struct file *file) +void eventpoll_release_file(struct file *file) { struct list_head *lsthead = &file->f_ep_links; struct eventpoll *ep; struct epitem *epi; - /* - * Fast check to avoid the get/release of the semaphore. Since - * we're doing this outside the semaphore lock, it might return - * false negatives, but we don't care. It'll help in 99.99% of cases - * to avoid the semaphore lock. False positives simply cannot happen - * because the file in on the way to be removed and nobody ( but - * eventpoll ) has still a reference to this file. - */ - if (list_empty(lsthead)) - return; - /* * We don't want to get "file->f_ep_lock" because it is not * necessary. It is not necessary because we're in the "struct file" @@ -541,7 +529,7 @@ eexit_1: /* * The following function implement the controller interface for the eventpoll * file that enable the insertion/removal/change of file descriptors inside - * the interest set. It rapresents the kernel part of the user spcae epoll_ctl(2). + * the interest set. It rapresents the kernel part of the user space epoll_ctl(2).
*/ asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event *event) { @@ -551,8 +539,8 @@ asmlinkage long sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event *even struct epitem *epi; struct epoll_event epds; - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %u)\n", - current, epfd, op, fd, event->events)); + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n", + current, epfd, op, fd, event)); error = -EFAULT; if (copy_from_user(&epds, event, sizeof(struct epoll_event))) @@ -633,8 +621,8 @@ eexit_3: eexit_2: fput(file); eexit_1: - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %u) = %d\n", - current, epfd, op, fd, event->events, error)); + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n", + current, epfd, op, fd, event, error)); return error; } diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index 60f8cadb1f50..f89acbe8183a 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h @@ -14,6 +14,8 @@ #ifndef _LINUX_EVENTPOLL_H #define _LINUX_EVENTPOLL_H +#include + /* Valid opcodes to issue to sys_epoll_ctl() */ #define EPOLL_CTL_ADD 1 @@ -55,8 +57,37 @@ asmlinkage long sys_epoll_wait(int epfd, struct epoll_event *events, int maxeven /* Used to initialize the epoll bits inside the "struct file" */ void eventpoll_init_file(struct file *file); -/* Used in fs/file_table.c:__fput() to unlink files from the eventpoll interface */ -void eventpoll_release(struct file *file); +/* Used to release the epoll bits inside the "struct file" */ +void eventpoll_release_file(struct file *file); + +/* + * This is called from inside fs/file_table.c:__fput() to unlink files + * from the eventpoll interface. We need to have this facility to cleanup + * correctly files that are closed without being removed from the eventpoll + * interface. + */ +static inline void eventpoll_release(struct file *file) +{ + + /* + * Fast check to avoid the get/release of the semaphore. Since + * we're doing this outside the semaphore lock, it might return + * false negatives, but we don't care. It'll help in 99.99% of cases + * to avoid the semaphore lock. False positives simply cannot happen + * because the file in on the way to be removed and nobody ( but + * eventpoll ) has still a reference to this file. + */ + if (likely(list_empty(&file->f_ep_links))) + return; + + /* + * The file is being closed while it is still linked to an epoll + * descriptor. We need to handle this by correctly unlinking it + * from its containers. + */ + eventpoll_release_file(file); +} + #else -- cgit v1.2.3 From e4c67754e590e4e14f8167065b205708963e81cb Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sat, 5 Jul 2003 20:39:12 -0700 Subject: [PATCH] enable device mapper in compat layer The compat ioctls for device mapper were not being enabled due to an incorrect config option. 
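The failure mode behind this one-liner is worth spelling out: a table entry guarded by a Kconfig symbol that nothing ever defines is silently discarded by the preprocessor, so the kernel still builds and the compat ioctls are simply absent at runtime. A minimal illustration (the COMPATIBLE_IOCTL entries are taken from the hunk below; the comments are editorial):

#ifdef CONFIG_DM			/* no Kconfig file defines this symbol */
COMPATIBLE_IOCTL(DM_VERSION)		/* silently compiled out */
#endif

#ifdef CONFIG_BLK_DEV_DM		/* the symbol the device-mapper Kconfig really sets */
COMPATIBLE_IOCTL(DM_VERSION)		/* present whenever DM is built */
#endif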
--- include/linux/compat_ioctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h index 076b1b268c7d..3fb888cc0c34 100644 --- a/include/linux/compat_ioctl.h +++ b/include/linux/compat_ioctl.h @@ -117,7 +117,7 @@ COMPATIBLE_IOCTL(START_ARRAY) COMPATIBLE_IOCTL(STOP_ARRAY) COMPATIBLE_IOCTL(STOP_ARRAY_RO) COMPATIBLE_IOCTL(RESTART_ARRAY_RW) -#ifdef CONFIG_DM +#ifdef CONFIG_BLK_DEV_DM /* DM */ COMPATIBLE_IOCTL(DM_VERSION) COMPATIBLE_IOCTL(DM_REMOVE_ALL) -- cgit v1.2.3 From 82a333fa1948869322f32a67223ea8d0ae9ad8ba Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 5 Jul 2003 23:23:55 -0700 Subject: Simplify and speed up mmap read-around handling This improves cold-cache program startup noticeably for me, and simplifies the read-ahead logic at the same time. The rules for read-ahead are: - if the vma is marked random, we just do the regular one-page case. Obvious. - if the vma is marked "linear access", we use the regular readahead code. No change in behaviour there (well, we also only consider it a _miss_ if it was marked linear access - the "readahead" and "readaround" things are now totally independent of each other) - otherwise, we look at how many hits/misses we've had for this particular file open for mmap, and if we've had noticeably more misses than hits, we don't bother with read-around. In particular, this means that the "real" read-ahead logic literally only needs to worry about finding sequential accesses, and does not have to worry about the common executable mmap access patterns that have very different behaviour. Some constant tweaking may be a good idea. --- include/linux/fs.h | 2 ++ include/linux/mm.h | 4 ---- mm/filemap.c | 62 +++++++++++++++++++++++++++++++++--------------------- mm/readahead.c | 31 --------------------------- 4 files changed, 40 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7a5f305101c5..77dd4b13dc43 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -420,6 +420,8 @@ struct file_ra_state { unsigned long ahead_start; /* Ahead window */ unsigned long ahead_size; unsigned long ra_pages; /* Maximum readahead window */ + unsigned long mmap_hit; /* Cache hit stat for mmap accesses */ + unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ }; struct file { diff --git a/include/linux/mm.h b/include/linux/mm.h index d75f64725853..858914b2dbd3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -571,10 +571,6 @@ void page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, unsigned long offset); -void page_cache_readaround(struct address_space *mapping, - struct file_ra_state *ra, - struct file *filp, - unsigned long offset); void handle_ra_miss(struct address_space *mapping, struct file_ra_state *ra, pgoff_t offset); unsigned long max_sane_readahead(unsigned long nr); diff --git a/mm/filemap.c b/mm/filemap.c index 1352d59d2ee4..f9623a9fecc6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -925,6 +925,9 @@ static int page_cache_read(struct file * file, unsigned long offset) return error == -EEXIST ? 0 : error; } +#define MMAP_READAROUND (16UL) +#define MMAP_LOTSAMISS (100) + /* * filemap_nopage() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault.
@@ -942,19 +945,19 @@ struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address struct inode *inode = mapping->host; struct page *page; unsigned long size, pgoff, endoff; - int did_readahead; + int did_readaround = 0; pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; retry_all: - /* - * An external ptracer can access pages that normally aren't - * accessible.. - */ size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if ((pgoff >= size) && (area->vm_mm == current->mm)) - return NULL; + if (pgoff >= size) + goto outside_data_content; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(area)) + goto no_cached_page; /* * The "size" of the file, as far as mmap is concerned, isn't bigger @@ -963,25 +966,14 @@ retry_all: if (size > endoff) size = endoff; - did_readahead = 0; - /* * The readahead code wants to be told about each and every page * so it can build and shrink its windows appropriately + * + * For sequential accesses, we use the generic readahead logic. */ - if (VM_SequentialReadHint(area)) { - did_readahead = 1; + if (VM_SequentialReadHint(area)) page_cache_readahead(mapping, ra, file, pgoff); - } - - /* - * If the offset is outside the mapping size we're off the end - * of a privately mapped file, so we need to map a zero page. - */ - if ((pgoff < size) && !VM_RandomReadHint(area)) { - did_readahead = 1; - page_cache_readaround(mapping, ra, file, pgoff); - } /* * Do we have something in the page cache already? @@ -989,13 +981,27 @@ retry_all: retry_find: page = find_get_page(mapping, pgoff); if (!page) { - if (did_readahead) { + if (VM_SequentialReadHint(area)) { handle_ra_miss(mapping, ra, pgoff); - did_readahead = 0; + goto no_cached_page; } - goto no_cached_page; + ra->mmap_miss++; + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. + */ + if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) + goto no_cached_page; + + did_readaround = 1; + do_page_cache_readahead(mapping, file, pgoff & ~(MMAP_READAROUND-1), MMAP_READAROUND); + goto retry_find; } + if (!did_readaround) + ra->mmap_hit++; + /* * Ok, found a page in the page cache, now we need to check * that it's up-to-date. @@ -1010,6 +1016,14 @@ success: mark_page_accessed(page); return page; +outside_data_content: + /* + * An external ptracer can access pages that normally aren't + * accessible.. + */ + if (area->vm_mm == current->mm) + return NULL; + /* Fall through to the non-read-ahead case */ no_cached_page: /* * We're only likely to ever get here if MADV_RANDOM is in diff --git a/mm/readahead.c b/mm/readahead.c index ed9ca357a9a5..179ba48d5e5c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -437,37 +437,6 @@ out: return; } -/* - * For mmap reads (typically executables) the access pattern is fairly random, - * but somewhat ascending. So readaround favours pages beyond the target one. - * We also boost the window size, as it can easily shrink due to misses. - */ -void -page_cache_readaround(struct address_space *mapping, struct file_ra_state *ra, - struct file *filp, unsigned long offset) -{ - if (ra->next_size != -1UL) { - const unsigned long min = get_min_readahead(ra) * 4; - unsigned long target; - unsigned long backward; - - /* - * If next_size is zero then leave it alone, because that's a - * readahead startup state. 
- */ - if (ra->next_size && ra->next_size < min) - ra->next_size = min; - - target = offset; - backward = ra->next_size / 4; - - if (backward > target) - target = 0; - else - target -= backward; - page_cache_readahead(mapping, ra, filp, target); - } -} /* * handle_ra_miss() is called when it is known that a page which should have -- cgit v1.2.3 From 6abc05cce8b06b9c986fb2bbd83e6fa9888ab1be Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 6 Jul 2003 05:41:12 -0700 Subject: [PATCH] xattr: fine-grained locking From: Andreas Gruenbacher This patch removes the dependency on i_sem in the getxattr and listxattr iops of ext2 and ext3. In addition, the global ext[23]_xattr semaphores go away. Instead of i_sem and the global semaphore, mutual exclusion is now ensured by per-inode xattr semaphores, and by locking the buffers before modifying them. The detailed locking strategy is described in comments in fs/ext[23]/xattr.c. Due to this change it is no longer necessary to take i_sem in ext[23]_permission() for retrieving acls, so the ext[23]_permission_locked() functions go away. Additionally, the patch fixes a race condition in ext[23]_permission: Accessing inode->i_acl was protected by the BKL in 2.4; in 2.5 there is no longer such protection. Instead, inode->i_acl (and inode->i_default_acl) are now accessed under inode->i_lock. (This could be replaced by RCU in the future.) In the ext3 extended attribute code, a new ugliness results from locking at the buffer head level: The buffer lock must be held between testing if an xattr block can be modified and the actual modification to prevent races from happening. Before a block can be modified, ext3_journal_get_write_access() must be called. But this requires an unlocked buffer, so I call ext3_journal_get_write_access() before locking the buffer. If it turns out that the buffer cannot be modified, journal_release_buffer() is called. Calling ext3_journal_get_write_access after the test but while the buffer is still locked would be much better.
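The ordering constraint described in that last paragraph is easier to see in code. A condensed sketch of the resulting pattern (not the literal patch: the function name and the -EAGAIN convention are illustrative, while ext3_journal_get_write_access_credits(), journal_release_buffer() and HDR() are the calls the hunks below actually use):

static int ext3_xattr_try_modify_inplace(handle_t *handle,
					 struct buffer_head *bh)
{
	int credits = 0;
	int error;

	/* The journal layer wants an unlocked buffer, so the write-access
	 * reservation has to happen before the modifiability test. */
	error = ext3_journal_get_write_access_credits(handle, bh, &credits);
	if (error)
		return error;

	lock_buffer(bh);
	if (HDR(bh)->h_refcount != cpu_to_le32(1)) {
		/* Shared block: hand the reservation back and let the
		 * caller clone the block instead. */
		unlock_buffer(bh);
		journal_release_buffer(handle, bh, credits);
		return -EAGAIN;		/* illustrative convention */
	}
	/* Exclusive owner: safe to modify under the buffer lock. */
	/* ... update the xattr entries here ... */
	unlock_buffer(bh);
	return 0;
}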
--- fs/ext2/acl.c | 104 +++++++++++------------- fs/ext2/acl.h | 1 - fs/ext2/ext2.h | 10 +++ fs/ext2/super.c | 3 + fs/ext2/xattr.c | 155 ++++++++++++++++++----------------- fs/ext2/xattr_user.c | 12 --- fs/ext3/acl.c | 99 +++++++++++------------ fs/ext3/acl.h | 1 - fs/ext3/super.c | 3 + fs/ext3/xattr.c | 201 +++++++++++++++++++++++++--------------------- fs/ext3/xattr_user.c | 12 --- include/linux/ext3_fs_i.h | 10 +++ 12 files changed, 311 insertions(+), 300 deletions(-) (limited to 'include/linux') diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index bc95b66ff023..0df165f8ee01 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -124,14 +124,38 @@ fail: return ERR_PTR(-EINVAL); } +static inline struct posix_acl * +ext2_iget_acl(struct inode *inode, struct posix_acl **i_acl) +{ + struct posix_acl *acl = EXT2_ACL_NOT_CACHED; + + spin_lock(&inode->i_lock); + if (*i_acl != EXT2_ACL_NOT_CACHED) + acl = posix_acl_dup(*i_acl); + spin_unlock(&inode->i_lock); + + return acl; +} + +static inline void +ext2_iset_acl(struct inode *inode, struct posix_acl **i_acl, + struct posix_acl *acl) +{ + spin_lock(&inode->i_lock); + if (*i_acl != EXT2_ACL_NOT_CACHED) + posix_acl_release(*i_acl); + *i_acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); +} + /* - * inode->i_sem: down + * inode->i_sem: don't care */ static struct posix_acl * ext2_get_acl(struct inode *inode, int type) { const size_t max_size = ext2_acl_size(EXT2_ACL_MAX_ENTRIES); - struct ext2_inode_inode *ei = EXT2_I(inode); + struct ext2_inode_info *ei = EXT2_I(inode); int name_index; char *value; struct posix_acl *acl; @@ -142,14 +166,16 @@ ext2_get_acl(struct inode *inode, int type) switch(type) { case ACL_TYPE_ACCESS: - if (ei->i_acl != EXT2_ACL_NOT_CACHED) - return posix_acl_dup(ei->i_acl); + acl = ext2_iget_acl(inode, &ei->i_acl); + if (acl != EXT2_ACL_NOT_CACHED) + return acl; name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - if (ei->i_default_acl != EXT2_ACL_NOT_CACHED) - return posix_acl_dup(ei->i_default_acl); + acl = ext2_iget_acl(inode, &ei->i_default_acl); + if (acl != EXT2_ACL_NOT_CACHED) + return acl; name_index = EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT; break; @@ -171,11 +197,11 @@ ext2_get_acl(struct inode *inode, int type) if (!IS_ERR(acl)) { switch(type) { case ACL_TYPE_ACCESS: - ei->i_acl = posix_acl_dup(acl); + ext2_iset_acl(inode, &ei->i_acl, acl); break; case ACL_TYPE_DEFAULT: - ei->i_default_acl = posix_acl_dup(acl); + ext2_iset_acl(inode, &ei->i_default_acl, acl); break; } } @@ -240,23 +266,24 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) if (!error) { switch(type) { case ACL_TYPE_ACCESS: - if (ei->i_acl != EXT2_ACL_NOT_CACHED) - posix_acl_release(ei->i_acl); - ei->i_acl = posix_acl_dup(acl); + ext2_iset_acl(inode, &ei->i_acl, acl); break; case ACL_TYPE_DEFAULT: - if (ei->i_default_acl != EXT2_ACL_NOT_CACHED) - posix_acl_release(ei->i_default_acl); - ei->i_default_acl = posix_acl_dup(acl); + ext2_iset_acl(inode, &ei->i_default_acl, acl); break; } } return error; } -static int -__ext2_permission(struct inode *inode, int mask, int lock) +/* + * Inode operation permission(). 
+ * + * inode->i_sem: don't care + */ +int +ext2_permission(struct inode *inode, int mask, struct nameidata *nd) { int mode = inode->i_mode; @@ -270,30 +297,16 @@ __ext2_permission(struct inode *inode, int mask, int lock) if (current->fsuid == inode->i_uid) { mode >>= 6; } else if (test_opt(inode->i_sb, POSIX_ACL)) { - struct ext2_inode_info *ei = EXT2_I(inode); + struct posix_acl *acl; /* The access ACL cannot grant access if the group class permission bits don't contain all requested permissions. */ if (((mode >> 3) & mask & S_IRWXO) != mask) goto check_groups; - if (ei->i_acl == EXT2_ACL_NOT_CACHED) { - struct posix_acl *acl; - - if (lock) { - down(&inode->i_sem); - acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); - up(&inode->i_sem); - } else - acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); - - if (IS_ERR(acl)) - return PTR_ERR(acl); + acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); - if (ei->i_acl == EXT2_ACL_NOT_CACHED) - return -EIO; - } - if (ei->i_acl) { - int error = posix_acl_permission(inode, ei->i_acl,mask); if (error == -EACCES) goto check_capabilities; return error; @@ -319,33 +332,11 @@ check_capabilities: return -EACCES; } -/* - * Inode operation permission(). - * - * inode->i_sem: up - * BKL held [before 2.5.x] - */ -int -ext2_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - return __ext2_permission(inode, mask, 1); -} - -/* - * Used internally if i_sem is already down. - */ -int -ext2_permission_locked(struct inode *inode, int mask) -{ - return __ext2_permission(inode, mask, 0); -} - /* * Initialize the ACLs of a new inode. Called from ext2_new_inode. * * dir->i_sem: down * inode->i_sem: up (access to inode is still exclusive) - * BKL held [before 2.5.x] */ int ext2_init_acl(struct inode *inode, struct inode *dir) @@ -405,7 +396,6 @@ cleanup: * file mode. * * inode->i_sem: down - * BKL held [before 2.5.x] */ int ext2_acl_chmod(struct inode *inode) diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 2e0560130b63..01937daf1168 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -60,7 +60,6 @@ static inline int ext2_acl_count(size_t size) /* acl.c */ extern int ext2_permission (struct inode *, int, struct nameidata *); -extern int ext2_permission_locked (struct inode *, int); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 610695289845..67f704ab1258 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -41,6 +41,16 @@ struct ext2_inode_info { __u32 i_prealloc_block; __u32 i_prealloc_count; __u32 i_dir_start_lookup; +#ifdef CONFIG_EXT2_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_sem even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. 
+ */ + struct rw_semaphore xattr_sem; +#endif #ifdef CONFIG_EXT2_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index c4604187f186..14b8cca47277 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -177,6 +177,9 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) { rwlock_init(&ei->i_meta_lock); +#ifdef CONFIG_EXT2_FS_XATTR + init_rwsem(&ei->xattr_sem); +#endif inode_init_once(&ei->vfs_inode); } } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index aa29871da68e..f1334adc62ed 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -42,13 +42,12 @@ * * Locking strategy * ---------------- - * The VFS already holds the BKL and the inode->i_sem semaphore when any of - * the xattr inode operations are called, so we are guaranteed that only one - * processes accesses extended attributes of an inode at any time. - * - * For writing we also grab the ext2_xattr_sem semaphore. This ensures that - * only a single process is modifying an extended attribute block, even - * if the block is shared among inodes. + * EXT2_I(inode)->i_file_acl is protected by EXT2_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count will change. Multiple writers to an EA block are synchronized + * by the bh lock. No more than a single bh lock is held at any time + * to avoid deadlocks. */ #include @@ -57,7 +56,7 @@ #include #include #include -#include +#include #include "ext2.h" #include "xattr.h" #include "acl.h" @@ -105,15 +104,6 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *, struct ext2_xattr_entry *); static struct mb_cache *ext2_xattr_cache; - -/* - * If a file system does not share extended attributes among inodes, - * we should not need the ext2_xattr_sem semaphore. However, the - * filesystem may still contain shared blocks, so we always take - * the lock. 
- */ - -static DECLARE_MUTEX(ext2_xattr_sem); static struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX]; static rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED; @@ -196,7 +186,7 @@ ext2_xattr_handler(int name_index) /* * Inode operation getxattr() * - * dentry->d_inode->i_sem down + * dentry->d_inode->i_sem: don't care */ ssize_t ext2_getxattr(struct dentry *dentry, const char *name, @@ -204,39 +194,28 @@ ext2_getxattr(struct dentry *dentry, const char *name, { struct ext2_xattr_handler *handler; struct inode *inode = dentry->d_inode; - ssize_t error; handler = ext2_xattr_resolve_name(&name); if (!handler) return -EOPNOTSUPP; - down(&inode->i_sem); - error = handler->get(inode, name, buffer, size); - up(&inode->i_sem); - - return error; + return handler->get(inode, name, buffer, size); } /* * Inode operation listxattr() * - * dentry->d_inode->i_sem down + * dentry->d_inode->i_sem: don't care */ ssize_t ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) { - ssize_t error; - - down(&dentry->d_inode->i_sem); - error = ext2_xattr_list(dentry->d_inode, buffer, size); - up(&dentry->d_inode->i_sem); - - return error; + return ext2_xattr_list(dentry->d_inode, buffer, size); } /* * Inode operation setxattr() * - * dentry->d_inode->i_sem down + * dentry->d_inode->i_sem: down */ int ext2_setxattr(struct dentry *dentry, const char *name, @@ -256,7 +235,7 @@ ext2_setxattr(struct dentry *dentry, const char *name, /* * Inode operation removexattr() * - * dentry->d_inode->i_sem down + * dentry->d_inode->i_sem: down */ int ext2_removexattr(struct dentry *dentry, const char *name) @@ -295,12 +274,15 @@ ext2_xattr_get(struct inode *inode, int name_index, const char *name, if (name == NULL) return -EINVAL; + down_read(&EXT2_I(inode)->xattr_sem); + error = -ENODATA; if (!EXT2_I(inode)->i_file_acl) - return -ENODATA; + goto cleanup; ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); + error = -EIO; if (!bh) - return -EIO; + goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); end = bh->b_data + bh->b_size; @@ -365,6 +347,7 @@ found: cleanup: brelse(bh); + up_read(&EXT2_I(inode)->xattr_sem); return error; } @@ -391,12 +374,15 @@ ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); + down_read(&EXT2_I(inode)->xattr_sem); + error = 0; if (!EXT2_I(inode)->i_file_acl) - return 0; + goto cleanup; ea_idebug(inode, "reading block %d", EXT2_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); + error = -EIO; if (!bh) - return -EIO; + goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); end = bh->b_data + bh->b_size; @@ -449,6 +435,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", cleanup: brelse(bh); + up_read(&EXT2_I(inode)->xattr_sem); return error; } @@ -520,8 +507,7 @@ ext2_xattr_set(struct inode *inode, int name_index, const char *name, name_len = strlen(name); if (name_len > 255 || value_len > sb->s_blocksize) return -ERANGE; - down(&ext2_xattr_sem); - + down_write(&EXT2_I(inode)->xattr_sem); if (EXT2_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ bh = sb_bread(sb, EXT2_I(inode)->i_file_acl); @@ -614,12 +600,16 @@ bad_block: ext2_error(sb, "ext2_xattr_set", /* Here we know that we can set the new attribute. 
*/ if (header) { + /* assert(header == HDR(bh)); */ + lock_buffer(bh); if (header->h_refcount == cpu_to_le32(1)) { ea_bdebug(bh, "modifying in-place"); ext2_xattr_cache_remove(bh); + /* keep the buffer locked while modifying it. */ } else { int offset; + unlock_buffer(bh); ea_bdebug(bh, "cloning"); header = kmalloc(bh->b_size, GFP_KERNEL); error = -ENOMEM; @@ -644,6 +634,8 @@ bad_block: ext2_error(sb, "ext2_xattr_set", last = here = ENTRY(header+1); } + /* Iff we are modifying the block in-place, bh is locked here. */ + if (not_found) { /* Insert the new name. */ size_t size = EXT2_XATTR_LEN(name_len); @@ -714,9 +706,13 @@ bad_block: ext2_error(sb, "ext2_xattr_set", skip_replace: if (IS_LAST_ENTRY(ENTRY(header+1))) { /* This block is now empty. */ + if (bh && header == HDR(bh)) + unlock_buffer(bh); /* we were modifying in-place. */ error = ext2_xattr_set2(inode, bh, NULL); } else { ext2_xattr_rehash(header, here); + if (bh && header == HDR(bh)) + unlock_buffer(bh); /* we were modifying in-place. */ error = ext2_xattr_set2(inode, bh, header); } @@ -724,7 +720,7 @@ cleanup: brelse(bh); if (!(bh && header == HDR(bh))) kfree(header); - up(&ext2_xattr_sem); + up_write(&EXT2_I(inode)->xattr_sem); return error; } @@ -744,24 +740,28 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, new_bh = ext2_xattr_cache_find(inode, header); if (new_bh) { /* - * We found an identical block in the cache. - * The old block will be released after updating - * the inode. + * We found an identical block in the cache. The + * block returned is locked. The old block will + * be released after updating the inode. */ ea_bdebug(new_bh, "%s block %lu", (old_bh == new_bh) ? "keeping" : "reusing", (unsigned long) new_bh->b_blocknr); error = -EDQUOT; - if (DQUOT_ALLOC_BLOCK(inode, 1)) + if (DQUOT_ALLOC_BLOCK(inode, 1)) { + unlock_buffer(new_bh); goto cleanup; + } HDR(new_bh)->h_refcount = cpu_to_le32( le32_to_cpu(HDR(new_bh)->h_refcount) + 1); ea_bdebug(new_bh, "refcount now=%d", le32_to_cpu(HDR(new_bh)->h_refcount)); + unlock_buffer(new_bh); } else if (old_bh && header == HDR(old_bh)) { - /* Keep this block. */ + /* Keep this block. No need to lock the block as we + don't need to change the reference count. */ new_bh = old_bh; get_bh(new_bh); ext2_xattr_cache_insert(new_bh); @@ -812,12 +812,11 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, error = 0; if (old_bh && old_bh != new_bh) { /* - * If there was an old block, and we are not still using it, - * we now release the old block. - */ - unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); - - if (refcount == 1) { + * If there was an old block and we are no longer using it, + * release the old block. + */ + lock_buffer(old_bh); + if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { /* Free the old block. */ ea_bdebug(old_bh, "freeing"); ext2_free_blocks(inode, old_bh->b_blocknr, 1); @@ -827,12 +826,14 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, bforget(old_bh); } else { /* Decrement the refcount only. 
*/ - refcount--; - HDR(old_bh)->h_refcount = cpu_to_le32(refcount); + HDR(old_bh)->h_refcount = cpu_to_le32( + le32_to_cpu(HDR(old_bh)->h_refcount) - 1); DQUOT_FREE_BLOCK(inode, 1); mark_buffer_dirty(old_bh); - ea_bdebug(old_bh, "refcount now=%d", refcount); + ea_bdebug(old_bh, "refcount now=%d", + le32_to_cpu(HDR(old_bh)->h_refcount)); } + unlock_buffer(old_bh); } cleanup: @@ -850,12 +851,11 @@ cleanup: void ext2_xattr_delete_inode(struct inode *inode) { - struct buffer_head *bh; + struct buffer_head *bh = NULL; + down_write(&EXT2_I(inode)->xattr_sem); if (!EXT2_I(inode)->i_file_acl) - return; - down(&ext2_xattr_sem); - + goto cleanup; bh = sb_bread(inode->i_sb, EXT2_I(inode)->i_file_acl); if (!bh) { ext2_error(inode->i_sb, "ext2_xattr_delete_inode", @@ -871,7 +871,7 @@ ext2_xattr_delete_inode(struct inode *inode) EXT2_I(inode)->i_file_acl); goto cleanup; } - ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); + lock_buffer(bh); if (HDR(bh)->h_refcount == cpu_to_le32(1)) { ext2_xattr_cache_remove(bh); ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); @@ -885,11 +885,13 @@ ext2_xattr_delete_inode(struct inode *inode) sync_dirty_buffer(bh); DQUOT_FREE_BLOCK(inode, 1); } + ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); + unlock_buffer(bh); EXT2_I(inode)->i_file_acl = 0; cleanup: brelse(bh); - up(&ext2_xattr_sem); + up_write(&EXT2_I(inode)->xattr_sem); } /* @@ -982,8 +984,8 @@ ext2_xattr_cmp(struct ext2_xattr_header *header1, * * Find an identical extended attribute block. * - * Returns a pointer to the block found, or NULL if such a block was - * not found or an error occurred. + * Returns a locked buffer head to the block found, or NULL if such + * a block was not found or an error occurred. */ static struct buffer_head * ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) @@ -1003,18 +1005,23 @@ ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header) ext2_error(inode->i_sb, "ext2_xattr_cache_find", "inode %ld: block %ld read error", inode->i_ino, (unsigned long) ce->e_block); - } else if (le32_to_cpu(HDR(bh)->h_refcount) > - EXT2_XATTR_REFCOUNT_MAX) { - ea_idebug(inode, "block %ld refcount %d>%d", - (unsigned long) ce->e_block, - le32_to_cpu(HDR(bh)->h_refcount), - EXT2_XATTR_REFCOUNT_MAX); - } else if (!ext2_xattr_cmp(header, HDR(bh))) { - ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count))); - mb_cache_entry_release(ce); - return bh; + } else { + lock_buffer(bh); + if (le32_to_cpu(HDR(bh)->h_refcount) > + EXT2_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %ld refcount %d>%d", + (unsigned long) ce->e_block, + le32_to_cpu(HDR(bh)->h_refcount), + EXT2_XATTR_REFCOUNT_MAX); + } else if (!ext2_xattr_cmp(header, HDR(bh))) { + ea_bdebug(bh, "b_count=%d", + atomic_read(&(bh->b_count))); + mb_cache_entry_release(ce); + return bh; + } + unlock_buffer(bh); + brelse(bh); } - brelse(bh); ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); } return NULL; diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index fc0ec86f4928..be1558761064 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -11,10 +11,6 @@ #include "ext2.h" #include "xattr.h" -#ifdef CONFIG_EXT2_FS_POSIX_ACL -# include "acl.h" -#endif - #define XATTR_USER_PREFIX "user." 
static size_t @@ -44,11 +40,7 @@ ext2_xattr_user_get(struct inode *inode, const char *name, return -EINVAL; if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; -#ifdef CONFIG_EXT2_FS_POSIX_ACL - error = ext2_permission_locked(inode, MAY_READ); -#else error = permission(inode, MAY_READ, NULL); -#endif if (error) return error; @@ -68,11 +60,7 @@ ext2_xattr_user_set(struct inode *inode, const char *name, if ( !S_ISREG(inode->i_mode) && (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) return -EPERM; -#ifdef CONFIG_EXT2_FS_POSIX_ACL - error = ext2_permission_locked(inode, MAY_WRITE); -#else error = permission(inode, MAY_WRITE, NULL); -#endif if (error) return error; diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 94b6f477f1d8..2416e214280f 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -125,10 +125,34 @@ fail: return ERR_PTR(-EINVAL); } +static inline struct posix_acl * +ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl) +{ + struct posix_acl *acl = EXT3_ACL_NOT_CACHED; + + spin_lock(&inode->i_lock); + if (*i_acl != EXT3_ACL_NOT_CACHED) + acl = posix_acl_dup(*i_acl); + spin_unlock(&inode->i_lock); + + return acl; +} + +static inline void +ext3_iset_acl(struct inode *inode, struct posix_acl **i_acl, + struct posix_acl *acl) +{ + spin_lock(&inode->i_lock); + if (*i_acl != EXT3_ACL_NOT_CACHED) + posix_acl_release(*i_acl); + *i_acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); +} + /* * Inode operation get_posix_acl(). * - * inode->i_sem: down + * inode->i_sem: don't care */ static struct posix_acl * ext3_get_acl(struct inode *inode, int type) @@ -145,14 +169,16 @@ ext3_get_acl(struct inode *inode, int type) switch(type) { case ACL_TYPE_ACCESS: - if (ei->i_acl != EXT3_ACL_NOT_CACHED) - return posix_acl_dup(ei->i_acl); + acl = ext3_iget_acl(inode, &ei->i_acl); + if (acl != EXT3_ACL_NOT_CACHED) + return acl; name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - if (ei->i_default_acl != EXT3_ACL_NOT_CACHED) - return posix_acl_dup(ei->i_default_acl); + acl = ext3_iget_acl(inode, &ei->i_default_acl); + if (acl != EXT3_ACL_NOT_CACHED) + return acl; name_index = EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT; break; @@ -174,11 +200,11 @@ ext3_get_acl(struct inode *inode, int type) if (!IS_ERR(acl)) { switch(type) { case ACL_TYPE_ACCESS: - ei->i_acl = posix_acl_dup(acl); + ext3_iset_acl(inode, &ei->i_acl, acl); break; case ACL_TYPE_DEFAULT: - ei->i_default_acl = posix_acl_dup(acl); + ext3_iset_acl(inode, &ei->i_default_acl, acl); break; } } @@ -245,23 +271,24 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, if (!error) { switch(type) { case ACL_TYPE_ACCESS: - if (ei->i_acl != EXT3_ACL_NOT_CACHED) - posix_acl_release(ei->i_acl); - ei->i_acl = posix_acl_dup(acl); + ext3_iset_acl(inode, &ei->i_acl, acl); break; case ACL_TYPE_DEFAULT: - if (ei->i_default_acl != EXT3_ACL_NOT_CACHED) - posix_acl_release(ei->i_default_acl); - ei->i_default_acl = posix_acl_dup(acl); + ext3_iset_acl(inode, &ei->i_default_acl, acl); break; } } return error; } -static int -__ext3_permission(struct inode *inode, int mask, int lock) +/* + * Inode operation permission(). 
+ * + * inode->i_sem: don't care + */ +int +ext3_permission(struct inode *inode, int mask, struct nameidata *nd) { int mode = inode->i_mode; @@ -275,30 +302,16 @@ __ext3_permission(struct inode *inode, int mask, int lock) if (current->fsuid == inode->i_uid) { mode >>= 6; } else if (test_opt(inode->i_sb, POSIX_ACL)) { - struct ext3_inode_info *ei = EXT3_I(inode); + struct posix_acl *acl; /* The access ACL cannot grant access if the group class permission bits don't contain all requested permissions. */ if (((mode >> 3) & mask & S_IRWXO) != mask) goto check_groups; - if (ei->i_acl == EXT3_ACL_NOT_CACHED) { - struct posix_acl *acl; - - if (lock) { - down(&inode->i_sem); - acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); - up(&inode->i_sem); - } else - acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); - - if (IS_ERR(acl)) - return PTR_ERR(acl); + acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); + if (acl) { + int error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); - if (ei->i_acl == EXT3_ACL_NOT_CACHED) - return -EIO; - } - if (ei->i_acl) { - int error = posix_acl_permission(inode, ei->i_acl,mask); if (error == -EACCES) goto check_capabilities; return error; @@ -324,26 +337,6 @@ check_capabilities: return -EACCES; } -/* - * Inode operation permission(). - * - * inode->i_sem: up - */ -int -ext3_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - return __ext3_permission(inode, mask, 1); -} - -/* - * Used internally if i_sem is already down. - */ -int -ext3_permission_locked(struct inode *inode, int mask) -{ - return __ext3_permission(inode, mask, 0); -} - /* * Initialize the ACLs of a new inode. Called from ext3_new_inode. * diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 6aaef97a5fc3..9d9d9d2e4a0c 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -60,7 +60,6 @@ static inline int ext3_acl_count(size_t size) /* acl.c */ extern int ext3_permission (struct inode *, int, struct nameidata *); -extern int ext3_permission_locked (struct inode *, int); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index d84dddf2959b..04f3c6d04b7f 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -519,6 +519,9 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR) { INIT_LIST_HEAD(&ei->i_orphan); +#ifdef CONFIG_EXT3_FS_XATTR + init_rwsem(&ei->xattr_sem); +#endif init_rwsem(&ei->truncate_sem); inode_init_once(&ei->vfs_inode); } diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index b89f8be46f0f..6fbda077bdbe 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -43,13 +43,12 @@ * * Locking strategy * ---------------- - * The VFS holdsinode->i_sem semaphore when any of the xattr inode - * operations are called, so we are guaranteed that only one - * processes accesses extended attributes of an inode at any time. - * - * For writing we also grab the ext3_xattr_sem semaphore. This ensures that - * only a single process is modifying an extended attribute block, even - * if the block is shared among inodes. + * EXT3_I(inode)->i_file_acl is protected by EXT3_I(inode)->xattr_sem. + * EA blocks are only changed if they are exclusive to an inode, so + * holding xattr_sem also means that nothing but the EA block's reference + * count will change. Multiple writers to an EA block are synchronized + * by the bh lock. No more than a single bh lock is held at any time + * to avoid deadlocks. 
*/ #include @@ -59,7 +58,7 @@ #include #include #include -#include +#include #include "xattr.h" #include "acl.h" @@ -93,22 +92,14 @@ static int ext3_xattr_set_handle2(handle_t *, struct inode *, struct ext3_xattr_header *); static int ext3_xattr_cache_insert(struct buffer_head *); -static struct buffer_head *ext3_xattr_cache_find(struct inode *, - struct ext3_xattr_header *); +static struct buffer_head *ext3_xattr_cache_find(handle_t *, struct inode *, + struct ext3_xattr_header *, + int *); static void ext3_xattr_cache_remove(struct buffer_head *); static void ext3_xattr_rehash(struct ext3_xattr_header *, struct ext3_xattr_entry *); static struct mb_cache *ext3_xattr_cache; - -/* - * If a file system does not share extended attributes among inodes, - * we should not need the ext3_xattr_sem semaphore. However, the - * filesystem may still contain shared blocks, so we always take - * the lock. - */ - -static DECLARE_MUTEX(ext3_xattr_sem); static struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX]; static rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED; @@ -191,7 +182,7 @@ ext3_xattr_handler(int name_index) /* * Inode operation getxattr() * - * dentry->d_inode->i_sem down + * dentry->d_inode->i_sem: don't care */ ssize_t ext3_getxattr(struct dentry *dentry, const char *name, @@ -199,39 +190,28 @@ ext3_getxattr(struct dentry *dentry, const char *name, { struct ext3_xattr_handler *handler; struct inode *inode = dentry->d_inode; - ssize_t error; handler = ext3_xattr_resolve_name(&name); if (!handler) return -EOPNOTSUPP; - down(&inode->i_sem); - error = handler->get(inode, name, buffer, size); - up(&inode->i_sem); - - return error; + return handler->get(inode, name, buffer, size); } /* * Inode operation listxattr() * - * dentry->d_inode->i_sem down + * dentry->d_inode->i_sem: don't care */ ssize_t ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) { - ssize_t error; - - down(&dentry->d_inode->i_sem); - error = ext3_xattr_list(dentry->d_inode, buffer, size); - up(&dentry->d_inode->i_sem); - - return error; + return ext3_xattr_list(dentry->d_inode, buffer, size); } /* * Inode operation setxattr() * - * dentry->d_inode->i_sem down + * dentry->d_inode->i_sem: down */ int ext3_setxattr(struct dentry *dentry, const char *name, @@ -251,7 +231,7 @@ ext3_setxattr(struct dentry *dentry, const char *name, /* * Inode operation removexattr() * - * dentry->d_inode->i_sem down + * dentry->d_inode->i_sem: down */ int ext3_removexattr(struct dentry *dentry, const char *name) @@ -290,12 +270,15 @@ ext3_xattr_get(struct inode *inode, int name_index, const char *name, if (name == NULL) return -EINVAL; + down_read(&EXT3_I(inode)->xattr_sem); + error = -ENODATA; if (!EXT3_I(inode)->i_file_acl) - return -ENODATA; + goto cleanup; ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); + error = -EIO; if (!bh) - return -EIO; + goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); end = bh->b_data + bh->b_size; @@ -360,6 +343,7 @@ found: cleanup: brelse(bh); + up_read(&EXT3_I(inode)->xattr_sem); return error; } @@ -386,12 +370,15 @@ ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); + down_read(&EXT3_I(inode)->xattr_sem); + error = 0; if (!EXT3_I(inode)->i_file_acl) - return 0; + goto cleanup; ea_idebug(inode, "reading block %d", EXT3_I(inode)->i_file_acl); bh = 
sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); + error = -EIO; if (!bh) - return -EIO; + goto cleanup; ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount)); end = bh->b_data + bh->b_size; @@ -444,6 +431,7 @@ bad_block: ext3_error(inode->i_sb, "ext3_xattr_list", cleanup: brelse(bh); + up_read(&EXT3_I(inode)->xattr_sem); return error; } @@ -459,11 +447,12 @@ static void ext3_xattr_update_super_block(handle_t *handle, return; lock_super(sb); - ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh); - EXT3_SB(sb)->s_es->s_feature_compat |= - cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR); - sb->s_dirt = 1; - ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) { + EXT3_SB(sb)->s_es->s_feature_compat |= + cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR); + sb->s_dirt = 1; + ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + } unlock_super(sb); } @@ -518,8 +507,7 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, name_len = strlen(name); if (name_len > 255 || value_len > sb->s_blocksize) return -ERANGE; - down(&ext3_xattr_sem); - + down_write(&EXT3_I(inode)->xattr_sem); if (EXT3_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); @@ -612,15 +600,28 @@ bad_block: ext3_error(sb, "ext3_xattr_set", /* Here we know that we can set the new attribute. */ if (header) { + int credits = 0; + + /* assert(header == HDR(bh)); */ + if (header->h_refcount != cpu_to_le32(1)) + goto skip_get_write_access; + /* ext3_journal_get_write_access() requires an unlocked bh, + which complicates things here. */ + error = ext3_journal_get_write_access_credits(handle, bh, + &credits); + if (error) + goto cleanup; + lock_buffer(bh); if (header->h_refcount == cpu_to_le32(1)) { ea_bdebug(bh, "modifying in-place"); ext3_xattr_cache_remove(bh); - error = ext3_journal_get_write_access(handle, bh); - if (error) - goto cleanup; + /* keep the buffer locked while modifying it. */ } else { int offset; + unlock_buffer(bh); + journal_release_buffer(handle, bh, credits); + skip_get_write_access: ea_bdebug(bh, "cloning"); header = kmalloc(bh->b_size, GFP_KERNEL); error = -ENOMEM; @@ -645,6 +646,8 @@ bad_block: ext3_error(sb, "ext3_xattr_set", last = here = ENTRY(header+1); } + /* Iff we are modifying the block in-place, bh is locked here. */ + if (not_found) { /* Insert the new name. */ size_t size = EXT3_XATTR_LEN(name_len); @@ -715,9 +718,13 @@ bad_block: ext3_error(sb, "ext3_xattr_set", skip_replace: if (IS_LAST_ENTRY(ENTRY(header+1))) { /* This block is now empty. */ + if (bh && header == HDR(bh)) + unlock_buffer(bh); /* we were modifying in-place. */ error = ext3_xattr_set_handle2(handle, inode, bh, NULL); } else { ext3_xattr_rehash(header, here); + if (bh && header == HDR(bh)) + unlock_buffer(bh); /* we were modifying in-place. 
*/ error = ext3_xattr_set_handle2(handle, inode, bh, header); } @@ -725,7 +732,7 @@ cleanup: brelse(bh); if (!(bh && header == HDR(bh))) kfree(header); - up(&ext3_xattr_sem); + up_write(&EXT3_I(inode)->xattr_sem); return error; } @@ -740,33 +747,34 @@ ext3_xattr_set_handle2(handle_t *handle, struct inode *inode, { struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; - int error; + int credits = 0, error; if (header) { - new_bh = ext3_xattr_cache_find(inode, header); + new_bh = ext3_xattr_cache_find(handle, inode, header, &credits); if (new_bh) { /* - * We found an identical block in the cache. - * The old block will be released after updating - * the inode. + * We found an identical block in the cache. The + * block returned is locked. The old block will + * be released after updating the inode. */ ea_bdebug(new_bh, "%s block %lu", (old_bh == new_bh) ? "keeping" : "reusing", (unsigned long) new_bh->b_blocknr); error = -EDQUOT; - if (DQUOT_ALLOC_BLOCK(inode, 1)) - goto cleanup; - - error = ext3_journal_get_write_access(handle, new_bh); - if (error) + if (DQUOT_ALLOC_BLOCK(inode, 1)) { + unlock_buffer(new_bh); + journal_release_buffer(handle, new_bh, credits); goto cleanup; + } HDR(new_bh)->h_refcount = cpu_to_le32( le32_to_cpu(HDR(new_bh)->h_refcount) + 1); ea_bdebug(new_bh, "refcount now=%d", le32_to_cpu(HDR(new_bh)->h_refcount)); + unlock_buffer(new_bh); } else if (old_bh && header == HDR(old_bh)) { - /* Keep this block. */ + /* Keep this block. No need to lock the block as we + * don't need to change the reference count. */ new_bh = old_bh; get_bh(new_bh); ext3_xattr_cache_insert(new_bh); @@ -817,15 +825,14 @@ getblk_failed: error = 0; if (old_bh && old_bh != new_bh) { /* - * If there was an old block, and we are not still using it, - * we now release the old block. + * If there was an old block, and we are no longer using it, + * release the old block. */ - unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount); - error = ext3_journal_get_write_access(handle, old_bh); if (error) goto cleanup; - if (refcount == 1) { + lock_buffer(old_bh); + if (HDR(old_bh)->h_refcount == cpu_to_le32(1)) { /* Free the old block. */ ea_bdebug(old_bh, "freeing"); ext3_free_blocks(handle, inode, old_bh->b_blocknr, 1); @@ -837,12 +844,14 @@ getblk_failed: ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr); } else { /* Decrement the refcount only. 
*/ - refcount--; - HDR(old_bh)->h_refcount = cpu_to_le32(refcount); + HDR(old_bh)->h_refcount = cpu_to_le32( + le32_to_cpu(HDR(old_bh)->h_refcount) - 1); DQUOT_FREE_BLOCK(inode, 1); ext3_journal_dirty_metadata(handle, old_bh); - ea_bdebug(old_bh, "refcount now=%d", refcount); + ea_bdebug(old_bh, "refcount now=%d", + le32_to_cpu(HDR(old_bh)->h_refcount)); } + unlock_buffer(old_bh); } cleanup: @@ -886,12 +895,11 @@ ext3_xattr_set(struct inode *inode, int name_index, const char *name, void ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) { - struct buffer_head *bh; + struct buffer_head *bh = NULL; + down_write(&EXT3_I(inode)->xattr_sem); if (!EXT3_I(inode)->i_file_acl) - return; - down(&ext3_xattr_sem); - + goto cleanup; bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); if (!bh) { ext3_error(inode->i_sb, "ext3_xattr_delete_inode", @@ -899,7 +907,6 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) EXT3_I(inode)->i_file_acl); goto cleanup; } - ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count))); if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || HDR(bh)->h_blocks != cpu_to_le32(1)) { ext3_error(inode->i_sb, "ext3_xattr_delete_inode", @@ -907,8 +914,9 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) EXT3_I(inode)->i_file_acl); goto cleanup; } - ext3_journal_get_write_access(handle, bh); - ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); + if (ext3_journal_get_write_access(handle, bh) != 0) + goto cleanup; + lock_buffer(bh); if (HDR(bh)->h_refcount == cpu_to_le32(1)) { ext3_xattr_cache_remove(bh); ext3_free_blocks(handle, inode, EXT3_I(inode)->i_file_acl, 1); @@ -922,11 +930,13 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) handle->h_sync = 1; DQUOT_FREE_BLOCK(inode, 1); } + ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); + unlock_buffer(bh); EXT3_I(inode)->i_file_acl = 0; cleanup: brelse(bh); - up(&ext3_xattr_sem); + up_write(&EXT3_I(inode)->xattr_sem); } /* @@ -1022,7 +1032,8 @@ ext3_xattr_cmp(struct ext3_xattr_header *header1, * not found or an error occurred. */ static struct buffer_head * -ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header) +ext3_xattr_cache_find(handle_t *handle, struct inode *inode, + struct ext3_xattr_header *header, int *credits) { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; @@ -1039,18 +1050,28 @@ ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header) ext3_error(inode->i_sb, "ext3_xattr_cache_find", "inode %ld: block %ld read error", inode->i_ino, (unsigned long) ce->e_block); - } else if (le32_to_cpu(HDR(bh)->h_refcount) > - EXT3_XATTR_REFCOUNT_MAX) { - ea_idebug(inode, "block %ld refcount %d>%d", - (unsigned long) ce->e_block, - le32_to_cpu(HDR(bh)->h_refcount), - EXT3_XATTR_REFCOUNT_MAX); - } else if (!ext3_xattr_cmp(header, HDR(bh))) { - ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count))); - mb_cache_entry_release(ce); - return bh; + } else { + /* ext3_journal_get_write_access() requires an unlocked + * bh, which complicates things here. 
*/ + if (ext3_journal_get_write_access_credits(handle, bh, + credits) != 0) + return NULL; + lock_buffer(bh); + if (le32_to_cpu(HDR(bh)->h_refcount) > + EXT3_XATTR_REFCOUNT_MAX) { + ea_idebug(inode, "block %ld refcount %d>%d", + (unsigned long) ce->e_block, + le32_to_cpu(HDR(bh)->h_refcount), + EXT3_XATTR_REFCOUNT_MAX); + } else if (!ext3_xattr_cmp(header, HDR(bh))) { + mb_cache_entry_release(ce); + /* buffer will be unlocked by caller */ + return bh; + } + unlock_buffer(bh); + journal_release_buffer(handle, bh, *credits); + brelse(bh); } - brelse(bh); ce = mb_cache_entry_find_next(ce, 0, inode->i_sb->s_bdev, hash); } return NULL; diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index b8c789e60fa0..84877afff67a 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -13,10 +13,6 @@ #include #include "xattr.h" -#ifdef CONFIG_EXT3_FS_POSIX_ACL -# include "acl.h" -#endif - #define XATTR_USER_PREFIX "user." static size_t @@ -46,11 +42,7 @@ ext3_xattr_user_get(struct inode *inode, const char *name, return -EINVAL; if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; -#ifdef CONFIG_EXT3_FS_POSIX_ACL - error = ext3_permission_locked(inode, MAY_READ); -#else error = permission(inode, MAY_READ, NULL); -#endif if (error) return error; @@ -70,11 +62,7 @@ ext3_xattr_user_set(struct inode *inode, const char *name, if ( !S_ISREG(inode->i_mode) && (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX)) return -EPERM; -#ifdef CONFIG_EXT3_FS_POSIX_ACL - error = ext3_permission_locked(inode, MAY_WRITE); -#else error = permission(inode, MAY_WRITE, NULL); -#endif if (error) return error; diff --git a/include/linux/ext3_fs_i.h b/include/linux/ext3_fs_i.h index 1a6a6c5922f7..1181cfae7142 100644 --- a/include/linux/ext3_fs_i.h +++ b/include/linux/ext3_fs_i.h @@ -62,6 +62,16 @@ struct ext3_inode_info { __u32 i_prealloc_count; #endif __u32 i_dir_start_lookup; +#ifdef CONFIG_EXT3_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_sem even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif #ifdef CONFIG_EXT3_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; -- cgit v1.2.3 From c132f1432948dc91fd7d03845dfcd918d8b732c4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 6 Jul 2003 05:41:19 -0700 Subject: [PATCH] Module autoloading for quota From: Jan Kara This implements autoloading of quota modules. 
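The fallback itself is simple; here is a sketch with the dq_list_lock handling elided for readability (scan_formats() is a hypothetical stand-in for the registered-format list walk; the id-to-module-name table and the request_module() retry mirror the hunk below):

static struct quota_format_type *find_quota_format_sketch(int id)
{
	struct quota_format_type *actqf = scan_formats(id);

	if (!actqf || !try_module_get(actqf->qf_owner)) {
		int qm;

		/* Map the format id to a module name... */
		for (qm = 0; module_names[qm].qm_fmt_id &&
			     module_names[qm].qm_fmt_id != id; qm++)
			;
		if (!module_names[qm].qm_fmt_id)
			return NULL;	/* no module known for this format */
		/* ...ask modprobe to load it... */
		if (request_module(module_names[qm].qm_mod_name))
			return NULL;	/* modprobe failed */
		/* ...and rescan now that the module had a chance to
		 * register itself. */
		actqf = scan_formats(id);
		if (actqf && !try_module_get(actqf->qf_owner))
			actqf = NULL;
	}
	return actqf;
}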
--- fs/dquot.c | 17 +++++++++++++++-- include/linux/quota.h | 10 ++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/dquot.c b/fs/dquot.c index 34f9417ed71a..9b3a6aae6c5f 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -74,6 +74,7 @@ #include #include #include +#include #include @@ -96,6 +97,7 @@ spinlock_t dq_data_lock = SPIN_LOCK_UNLOCKED; static char *quotatypes[] = INITQFNAMES; static struct quota_format_type *quota_formats; /* List of registered formats */ +static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; int register_quota_format(struct quota_format_type *fmt) { @@ -123,8 +125,19 @@ static struct quota_format_type *find_quota_format(int id) spin_lock(&dq_list_lock); for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next); - if (actqf && !try_module_get(actqf->qf_owner)) - actqf = NULL; + if (!actqf || !try_module_get(actqf->qf_owner)) { + int qm; + + for (qm = 0; module_names[qm].qm_fmt_id && module_names[qm].qm_fmt_id != id; qm++); + if (!module_names[qm].qm_fmt_id || request_module(module_names[qm].qm_mod_name)) { + actqf = NULL; + goto out; + } + for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next); + if (actqf && !try_module_get(actqf->qf_owner)) + actqf = NULL; + } +out: spin_unlock(&dq_list_lock); return actqf; } diff --git a/include/linux/quota.h b/include/linux/quota.h index 77d017472dc7..fbf2d2b2a5be 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -306,6 +306,16 @@ int register_quota_format(struct quota_format_type *fmt); void unregister_quota_format(struct quota_format_type *fmt); void init_dquot_operations(struct dquot_operations *fsdqops); +struct quota_module_name { + int qm_fmt_id; + char *qm_mod_name; +}; + +#define INIT_QUOTA_MODULE_NAMES {\ {QFMT_VFS_OLD, "quota_v1"},\ {QFMT_VFS_V0, "quota_v2"},\ {0, NULL}} + #else # /* nodep */ include -- cgit v1.2.3 From d24d1d3abedcd64a9fd90e602bbd45d73b9c0de0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 6 Jul 2003 05:41:34 -0700 Subject: [PATCH] BSD accounting speedup From: Ingo Molnar Most distributions turn on process accounting - but even the common 'accounting is off' case is horrible SMP-scalability-wise: it accesses a global spinlock during every sys_exit() call, which bounces like mad on SMP (and NUMA) systems. (I also got rid of the unused return code.)
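The shape of the fix is the classic unlocked fast-path check: peek at the global pointer without the lock to short-circuit the common accounting-off case, then re-read it under the lock because it may have changed in between. As a sketch (acct_globals and do_acct_process() are the names from the patch below; the function name is illustrative):

void acct_process_sketch(long exitcode)
{
	struct file *file;

	/* Unlocked peek: racy, but a stale answer only costs us a trip
	 * into the slow path; correctness comes from the re-check under
	 * the lock below. */
	if (!acct_globals.file)
		return;

	spin_lock(&acct_globals.lock);
	file = acct_globals.file;
	if (!file) {
		spin_unlock(&acct_globals.lock);
		return;
	}
	get_file(file);			/* pin the file before dropping the lock */
	spin_unlock(&acct_globals.lock);
	do_acct_process(exitcode, file);
	fput(file);
}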
---
 include/linux/acct.h |  2 +-
 kernel/acct.c        | 29 +++++++++++++++++++----------
 2 files changed, 20 insertions(+), 11 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/acct.h b/include/linux/acct.h
index 0b4ae8298a0e..69389c4af8e4 100644
--- a/include/linux/acct.h
+++ b/include/linux/acct.h
@@ -78,7 +78,7 @@ struct acct
 #ifdef CONFIG_BSD_PROCESS_ACCT
 struct super_block;
 extern void acct_auto_close(struct super_block *sb);
-extern int acct_process(long exitcode);
+extern void acct_process(long exitcode);
 #else
 #define acct_auto_close(x)	do { } while (0)
 #define acct_process(x)		do { } while (0)
diff --git a/kernel/acct.c b/kernel/acct.c
index e63095525ac2..028e310bd15f 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -394,17 +394,26 @@ static void do_acct_process(long exitcode, struct file *file)
 /*
  * acct_process - now just a wrapper around do_acct_process
  */
-int acct_process(long exitcode)
+void acct_process(long exitcode)
 {
 	struct file *file = NULL;
+
+	/*
+	 * accelerate the common fastpath:
+	 */
+	if (!acct_globals.file)
+		return;
+
 	spin_lock(&acct_globals.lock);
-	if (acct_globals.file) {
-		file = acct_globals.file;
-		get_file(file);
-		spin_unlock(&acct_globals.lock);
-		do_acct_process(exitcode, file);
-		fput(file);
-	} else
-		spin_unlock(&acct_globals.lock);
-	return 0;
+	file = acct_globals.file;
+	if (!file)
+		goto out_unlock;
+
+	get_file(file);
+	spin_unlock(&acct_globals.lock);
+	do_acct_process(exitcode, file);
+	fput(file);
+	return;
+
+out_unlock:
+	spin_unlock(&acct_globals.lock);
 }
-- 
cgit v1.2.3


From a7eec8d9269e8c879e216bfcc11a5b5fa333c250 Mon Sep 17 00:00:00 2001
From: Greg Ungerer
Date: Sun, 6 Jul 2003 23:00:12 -0700
Subject: [PATCH] define shared lib limits for flat loader

This patch includes the last pieces of the flat loader shared library
support.  It defines the shared library limit and implements a flag for
kernel-level tracing.

---
 include/linux/flat.h | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/flat.h b/include/linux/flat.h
index 7643a3f9d3f5..ec56852e2659 100644
--- a/include/linux/flat.h
+++ b/include/linux/flat.h
@@ -1,7 +1,7 @@
-
-/* Copyright (C) 1998  Kenneth Albanowski
- *		The Silver Hammer Group, Ltd.
- * Copyright (C) 2002  David McCullough
+/*
+ * Copyright (C) 2002-2003  David McCullough
+ * Copyright (C) 1998  Kenneth Albanowski
+ *		The Silver Hammer Group, Ltd.
  *
  * This file provides the definitions and structures needed to
  * support uClinux flat-format executables.
@@ -10,10 +10,18 @@
 #ifndef _LINUX_FLAT_H
 #define _LINUX_FLAT_H
 
+#ifdef __KERNEL__
 #include
+#endif
 
 #define	FLAT_VERSION			0x00000004L
 
+#ifdef CONFIG_BINFMT_SHARED_FLAT
+#define	MAX_SHARED_LIBS			(4)
+#else
+#define	MAX_SHARED_LIBS			(1)
+#endif
+
 /*
  * To make everything easier to port and manage cross platform
  * development, all fields are in network byte order.
  */
@@ -46,8 +54,10 @@ struct flat_hdr {
 #define FLAT_FLAG_GOTPIC 0x0002 /* program is PIC with GOT */
 #define FLAT_FLAG_GZIP   0x0004 /* all but the header is compressed */
 #define FLAT_FLAG_GZDATA 0x0008 /* only data/relocs are compressed (for XIP) */
+#define FLAT_FLAG_KTRACE 0x0010 /* output useful kernel trace for debugging */
 
+#ifdef __KERNEL__ /* so systems without linux headers can compile the apps */
 /*
  * While it would be nice to keep this header clean,  users of older
  * tools still need this support in the kernel.  So this section is
@@ -85,4 +95,6 @@ typedef union {
 	} reloc;
 } flat_v2_reloc_t;
 
+#endif /* __KERNEL__ */
+
 #endif /* _LINUX_FLAT_H */
-- 
cgit v1.2.3


From b993be7e4517f328fd6bd8bcea2f038c894a292e Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Sun, 6 Jul 2003 23:01:42 -0700
Subject: [PATCH] Make kstat_this_cpu in terms of __get_cpu_var and use it

kstat_this_cpu is defined in terms of per_cpu instead of __get_cpu_var.
This patch changes that and uses it everywhere appropriate.  The sched.c
change caches the per-cpu pointer in a local variable, which helps gcc
generate better code.

---
 arch/i386/kernel/irq.c      |  3 +--
 include/linux/kernel_stat.h |  3 ++-
 kernel/sched.c              | 13 +++++++------
 3 files changed, 10 insertions(+), 9 deletions(-)
(limited to 'include/linux')

diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 6e0ff0a46b5c..314d47e8ec17 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -416,7 +416,6 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
 	 * handled by some other CPU. (or is disabled)
 	 */
 	int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code  */
-	int cpu = smp_processor_id();
 	irq_desc_t *desc = irq_desc + irq;
 	struct irqaction * action;
 	unsigned int status;
@@ -437,7 +436,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
 		}
 	}
 #endif
-	kstat_cpu(cpu).irqs[irq]++;
+	kstat_this_cpu.irqs[irq]++;
 	spin_lock(&desc->lock);
 	desc->handler->ack(irq);
 	/*
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 88425e94cdc5..9971827a3c4b 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -31,7 +31,8 @@ struct kernel_stat {
 DECLARE_PER_CPU(struct kernel_stat, kstat);
 
 #define kstat_cpu(cpu) per_cpu(kstat, cpu)
-#define kstat_this_cpu kstat_cpu(smp_processor_id())
+/* Must have preemption disabled for this to be meaningful. */
+#define kstat_this_cpu __get_cpu_var(kstat)
 
 extern unsigned long nr_context_switches(void);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 4f3d1fa42669..a4305315afda 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1175,6 +1175,7 @@ DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
 void scheduler_tick(int user_ticks, int sys_ticks)
 {
 	int cpu = smp_processor_id();
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 	runqueue_t *rq = this_rq();
 	task_t *p = current;
 
@@ -1184,19 +1185,19 @@ void scheduler_tick(int user_ticks, int sys_ticks)
 	if (p == rq->idle) {
 		/* note: this timer irq context must be accounted for as well */
 		if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET)
-			kstat_cpu(cpu).cpustat.system += sys_ticks;
+			cpustat->system += sys_ticks;
 		else if (atomic_read(&rq->nr_iowait) > 0)
-			kstat_cpu(cpu).cpustat.iowait += sys_ticks;
+			cpustat->iowait += sys_ticks;
 		else
-			kstat_cpu(cpu).cpustat.idle += sys_ticks;
+			cpustat->idle += sys_ticks;
 		rebalance_tick(rq, 1);
 		return;
 	}
 	if (TASK_NICE(p) > 0)
-		kstat_cpu(cpu).cpustat.nice += user_ticks;
+		cpustat->nice += user_ticks;
 	else
-		kstat_cpu(cpu).cpustat.user += user_ticks;
-	kstat_cpu(cpu).cpustat.system += sys_ticks;
+		cpustat->user += user_ticks;
+	cpustat->system += sys_ticks;
 
 	/* Task might have expired already, but not scheduled off yet */
 	if (p->array != rq->active) {
-- 
cgit v1.2.3
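
For reference, the difference the last patch exploits: per_cpu(var, cpu)
indexes the per-cpu area of an arbitrary CPU, so the old
kstat_cpu(smp_processor_id()) form recomputed the CPU number on every use,
while __get_cpu_var(var) lets an architecture resolve the current CPU's
copy directly (some keep the per-cpu area at a fixed address) and is only
meaningful with preemption disabled.  A minimal usage sketch
(account_tick() is a made-up name; the pattern mirrors the
scheduler_tick() hunk above):

    /* Caller must have preemption disabled, e.g. timer/irq context. */
    static void account_tick(int user_ticks, int sys_ticks)
    {
        /* Cache the per-cpu pointer once; gcc can then address the
         * fields relative to a single base. */
        struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

        cpustat->user += user_ticks;
        cpustat->system += sys_ticks;
    }

From preemptible code the same access needs get_cpu()/put_cpu() around it,
so the task cannot migrate between reading the pointer and updating the
fields.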