-rw-r--r--  Documentation/filesystems/proc.txt  7
-rw-r--r--  Documentation/sysctl/vm.txt  3
-rw-r--r--  arch/alpha/mm/numa.c  6
-rw-r--r--  arch/i386/config.in  2
-rw-r--r--  arch/i386/kernel/cpu/amd.c  2
-rw-r--r--  arch/i386/kernel/cpu/mtrr/mtrr.h  3
-rw-r--r--  arch/i386/kernel/i386_ksyms.c  6
-rw-r--r--  arch/i386/kernel/numaq.c  20
-rw-r--r--  arch/i386/mm/discontig.c  15
-rw-r--r--  arch/i386/mm/hugetlbpage.c  4
-rw-r--r--  arch/i386/mm/init.c  6
-rw-r--r--  arch/i386/mm/pgtable.c  29
-rw-r--r--  arch/mips64/sgi-ip27/ip27-memory.c  12
-rw-r--r--  arch/sparc64/mm/init.c  2
-rw-r--r--  drivers/block/ll_rw_blk.c  28
-rw-r--r--  drivers/char/raw.c  2
-rw-r--r--  drivers/ide/legacy/hd.c  2
-rw-r--r--  drivers/pcmcia/sa1100.h  2
-rw-r--r--  drivers/pcmcia/sa1100_generic.c  4
-rw-r--r--  drivers/scsi/scsi.c  4
-rw-r--r--  fs/bio.c  3
-rw-r--r--  fs/buffer.c  4
-rw-r--r--  fs/ext2/inode.c  4
-rw-r--r--  fs/ext3/inode.c  4
-rw-r--r--  fs/fs-writeback.c  113
-rw-r--r--  fs/jfs/inode.c  5
-rw-r--r--  fs/mpage.c  6
-rw-r--r--  fs/proc/array.c  143
-rw-r--r--  include/asm-alpha/mmzone.h  8
-rw-r--r--  include/asm-alpha/numnodes.h  12
-rw-r--r--  include/asm-i386/mmzone.h  16
-rw-r--r--  include/asm-i386/numaq.h  9
-rw-r--r--  include/asm-i386/numnodes.h (renamed from include/asm-i386/max_numnodes.h)  0
-rw-r--r--  include/asm-i386/page.h  2
-rw-r--r--  include/asm-mips64/mmzone.h  1
-rw-r--r--  include/asm-mips64/pgtable.h  8
-rw-r--r--  include/linux/fs.h  8
-rw-r--r--  include/linux/gfp.h  19
-rw-r--r--  include/linux/mm.h  7
-rw-r--r--  include/linux/mmzone.h  23
-rw-r--r--  include/linux/mpage.h  8
-rw-r--r--  include/linux/sysctl.h  13
-rw-r--r--  include/linux/uio.h  6
-rw-r--r--  include/linux/writeback.h  30
-rw-r--r--  init/main.c  1
-rw-r--r--  kernel/ksyms.c  6
-rw-r--r--  kernel/printk.c  6
-rw-r--r--  kernel/suspend.c  17
-rw-r--r--  kernel/sysctl.c  15
-rw-r--r--  mm/filemap.c  140
-rw-r--r--  mm/memory.c  8
-rw-r--r--  mm/mempool.c  3
-rw-r--r--  mm/mmap.c  8
-rw-r--r--  mm/mremap.c  1
-rw-r--r--  mm/numa.c  80
-rw-r--r--  mm/page-writeback.c  69
-rw-r--r--  mm/page_alloc.c  159
-rw-r--r--  mm/page_io.c  4
-rw-r--r--  mm/shmem.c  1
-rw-r--r--  mm/swap.c  10
-rw-r--r--  mm/swap_state.c  11
-rw-r--r--  mm/swapfile.c  1
-rw-r--r--  mm/vmscan.c  13
63 files changed, 562 insertions, 602 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 57597335536d..81c04ff87b4a 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -963,13 +963,6 @@ Contains, as a percentage of total system memory, the number of pages at which
a process which is generating disk writes will itself start writing out dirty
data.
-dirty_sync_ratio
-----------------
-
-Contains, as a percentage of total system memory, the number of pages at which
-a process which is generating disk writes will itself start writing out dirty
-data and waiting upon completion of that writeout.
-
dirty_writeback_centisecs
-------------------------
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 6ff0af89ae77..ed6ccff766f4 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -21,13 +21,12 @@ Currently, these files are in /proc/sys/vm:
- dirty_async_ratio
- dirty_background_ratio
- dirty_expire_centisecs
-- dirty_sync_ratio
- dirty_writeback_centisecs
==============================================================
dirty_async_ratio, dirty_background_ratio, dirty_expire_centisecs,
-dirty_sync_ratio dirty_writeback_centisecs:
+dirty_writeback_centisecs:
See Documentation/filesystems/proc.txt
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index 2458576ec8ae..5071c14da059 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -286,7 +286,6 @@ void __init paging_init(void)
for (nid = 0; nid < numnodes; nid++) {
unsigned long start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT;
unsigned long end_pfn = plat_node_bdata[nid].node_low_pfn;
- unsigned long lmax_mapnr;
if (dma_local_pfn >= end_pfn - start_pfn)
zones_size[ZONE_DMA] = end_pfn - start_pfn;
@@ -295,11 +294,6 @@ void __init paging_init(void)
zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn;
}
free_area_init_node(nid, NODE_DATA(nid), NULL, zones_size, start_pfn, NULL);
- lmax_mapnr = PLAT_NODE_DATA_STARTNR(nid) + PLAT_NODE_DATA_SIZE(nid);
- if (lmax_mapnr > max_mapnr) {
- max_mapnr = lmax_mapnr;
- DBGDCONT("Grow max_mapnr to %ld\n", max_mapnr);
- }
}
/* Initialize the kernel's ZERO_PGE. */
diff --git a/arch/i386/config.in b/arch/i386/config.in
index 702ab169752e..861b2ad69154 100644
--- a/arch/i386/config.in
+++ b/arch/i386/config.in
@@ -154,7 +154,7 @@ if [ "$CONFIG_MWINCHIP3D" = "y" ]; then
define_bool CONFIG_X86_OOSTORE y
fi
-bool 'IA-32 Huge TLB Page Support (if available on processor)' CONFIG_HUGETLB_PAGE
+bool 'Huge TLB Page Support' CONFIG_HUGETLB_PAGE
bool 'Symmetric multi-processing support' CONFIG_SMP
bool 'Preemptible Kernel' CONFIG_PREEMPT
diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c
index 00b09cc403a2..991024118fe6 100644
--- a/arch/i386/kernel/cpu/amd.c
+++ b/arch/i386/kernel/cpu/amd.c
@@ -25,7 +25,7 @@ __asm__(".align 4\nvide: ret");
static void __init init_amd(struct cpuinfo_x86 *c)
{
u32 l, h;
- int mbytes = max_mapnr >> (20-PAGE_SHIFT);
+ int mbytes = num_physpages >> (20-PAGE_SHIFT);
int r;
/*
diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
index 363a5b14acdc..c59ccb1059f7 100644
--- a/arch/i386/kernel/cpu/mtrr/mtrr.h
+++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
@@ -96,4 +96,7 @@ extern struct mtrr_ops * mtrr_if;
extern unsigned int num_var_ranges;
+void finalize_mtrr_state(void);
+void mtrr_state_warn(void);
+
extern char * mtrr_if_name[];
diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
index 0be81aed6747..8b447ebcb591 100644
--- a/arch/i386/kernel/i386_ksyms.c
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -58,7 +58,11 @@ EXPORT_SYMBOL(boot_cpu_data);
EXPORT_SYMBOL(EISA_bus);
#endif
EXPORT_SYMBOL(MCA_bus);
-#ifdef CONFIG_MULTIQUAD
+#ifdef CONFIG_DISCONTIGMEM
+EXPORT_SYMBOL(node_data);
+EXPORT_SYMBOL(pfn_to_nid);
+#endif
+#ifdef CONFIG_X86_NUMAQ
EXPORT_SYMBOL(xquad_portio);
#endif
EXPORT_SYMBOL(__verify_write);
diff --git a/arch/i386/kernel/numaq.c b/arch/i386/kernel/numaq.c
index ffd27c7d2d81..07cf91d92dd9 100644
--- a/arch/i386/kernel/numaq.c
+++ b/arch/i386/kernel/numaq.c
@@ -82,27 +82,19 @@ static void __init smp_dump_qct(void)
*/
int physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1};
-#define MB_TO_ELEMENT(x) (x >> ELEMENT_REPRESENTS)
-#define PA_TO_MB(pa) (pa >> 20) /* assumption: a physical address is in bytes */
+#define PFN_TO_ELEMENT(pfn) (pfn / PAGES_PER_ELEMENT)
+#define PA_TO_ELEMENT(pa) (PFN_TO_ELEMENT(pa >> PAGE_SHIFT))
-int pa_to_nid(u64 pa)
+int pfn_to_nid(unsigned long pfn)
{
- int nid;
-
- nid = physnode_map[MB_TO_ELEMENT(PA_TO_MB(pa))];
+ int nid = physnode_map[PFN_TO_ELEMENT(pfn)];
- /* the physical address passed in is not in the map for the system */
if (nid == -1)
- BUG();
+ BUG(); /* address is not present */
return nid;
}
-int pfn_to_nid(unsigned long pfn)
-{
- return pa_to_nid(((u64)pfn) << PAGE_SHIFT);
-}
-
/*
* for each node mark the regions
* TOPOFMEM = hi_shrd_mem_start + hi_shrd_mem_size
@@ -132,7 +124,7 @@ static void __init initialize_physnode_map(void)
topofmem = eq->hi_shrd_mem_start + eq->hi_shrd_mem_size;
while (cur < topofmem) {
physnode_map[cur >> 8] = nid;
- cur += (ELEMENT_REPRESENTS - 1);
+ cur ++;
}
}
}
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index de811d22ac09..eab75722398a 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -275,20 +275,9 @@ void __init set_highmem_pages_init(int bad_ppro)
void __init set_max_mapnr_init(void)
{
#ifdef CONFIG_HIGHMEM
- unsigned long lmax_mapnr;
- int nid;
-
- highmem_start_page = mem_map + NODE_DATA(0)->node_zones[ZONE_HIGHMEM].zone_start_mapnr;
+ highmem_start_page = NODE_DATA(0)->node_zones[ZONE_HIGHMEM].zone_mem_map;
num_physpages = highend_pfn;
-
- for (nid = 0; nid < numnodes; nid++) {
- lmax_mapnr = node_startnr(nid) + node_size(nid);
- if (lmax_mapnr > max_mapnr) {
- max_mapnr = lmax_mapnr;
- }
- }
-
#else
- max_mapnr = num_physpages = max_low_pfn;
+ num_physpages = max_low_pfn;
#endif
}
diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index c50cec1dbafb..928622ee5b22 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -319,7 +319,7 @@ set_new_inode(unsigned long len, int prot, int flag, int key)
}
if (i == MAX_ID)
return NULL;
- inode = kmalloc(sizeof (struct inode), GFP_KERNEL);
+ inode = kmalloc(sizeof (struct inode), GFP_ATOMIC);
if (inode == NULL)
return NULL;
@@ -502,7 +502,7 @@ set_hugetlb_mem_size(int count)
if (lcount > 0) { /* Increase the mem size. */
while (lcount--) {
- page = alloc_pages(GFP_ATOMIC, HUGETLB_PAGE_ORDER);
+ page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
if (page == NULL)
break;
map = page;
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 5d73c07fd726..c672b966bcca 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -440,8 +440,10 @@ void __init mem_init(void)
int tmp;
int bad_ppro;
+#ifndef CONFIG_DISCONTIGMEM
if (!mem_map)
BUG();
+#endif
bad_ppro = ppro_with_ram_bug();
@@ -471,7 +473,7 @@ void __init mem_init(void)
printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n",
(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
- max_mapnr << (PAGE_SHIFT-10),
+ num_physpages << (PAGE_SHIFT-10),
codesize >> 10,
reservedpages << (PAGE_SHIFT-10),
datasize >> 10,
@@ -504,7 +506,7 @@ void __init mem_init(void)
/*Will make this kernel command line. */
INIT_LIST_HEAD(&htlbpage_freelist);
for (i=0; i<htlbzone_pages; i++) {
- page = alloc_pages(GFP_ATOMIC, HUGETLB_PAGE_ORDER);
+ page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
if (page == NULL)
break;
map = page;
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index 18a7664b115b..1f59100e77bf 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -22,26 +22,29 @@
void show_mem(void)
{
- int pfn, total = 0, reserved = 0;
+ int total = 0, reserved = 0;
int shared = 0, cached = 0;
int highmem = 0;
struct page *page;
+ pg_data_t *pgdat;
+ unsigned long i;
printk("Mem-info:\n");
show_free_areas();
printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
- pfn = max_mapnr;
- while (pfn-- > 0) {
- page = pfn_to_page(pfn);
- total++;
- if (PageHighMem(page))
- highmem++;
- if (PageReserved(page))
- reserved++;
- else if (PageSwapCache(page))
- cached++;
- else if (page_count(page))
- shared += page_count(page) - 1;
+ for_each_pgdat(pgdat) {
+ for (i = 0; i < pgdat->node_size; ++i) {
+ page = pgdat->node_mem_map + i;
+ total++;
+ if (PageHighMem(page))
+ highmem++;
+ if (PageReserved(page))
+ reserved++;
+ else if (PageSwapCache(page))
+ cached++;
+ else if (page_count(page))
+ shared += page_count(page) - 1;
+ }
}
printk("%d pages of RAM\n", total);
printk("%d pages of HIGHMEM\n",highmem);
diff --git a/arch/mips64/sgi-ip27/ip27-memory.c b/arch/mips64/sgi-ip27/ip27-memory.c
index e5f79e031816..f46fa89f145f 100644
--- a/arch/mips64/sgi-ip27/ip27-memory.c
+++ b/arch/mips64/sgi-ip27/ip27-memory.c
@@ -254,10 +254,6 @@ void __init paging_init(void)
zones_size[ZONE_DMA] = end_pfn + 1 - start_pfn;
free_area_init_node(node, NODE_DATA(node), 0, zones_size,
start_pfn, 0);
- if ((PLAT_NODE_DATA_STARTNR(node) +
- PLAT_NODE_DATA_SIZE(node)) > pagenr)
- pagenr = PLAT_NODE_DATA_STARTNR(node) +
- PLAT_NODE_DATA_SIZE(node);
}
}
@@ -271,7 +267,6 @@ void __init mem_init(void)
unsigned long codesize, datasize, initsize;
int slot, numslots;
struct page *pg, *pslot;
- pfn_t pgnr;
num_physpages = numpages; /* memory already sized by szmem */
max_mapnr = pagenr; /* already found during paging_init */
@@ -293,7 +288,6 @@ void __init mem_init(void)
* We need to manually do the other slots.
*/
pg = NODE_DATA(nid)->node_mem_map + slot_getsize(nid, 0);
- pgnr = PLAT_NODE_DATA_STARTNR(nid) + slot_getsize(nid, 0);
numslots = node_getlastslot(nid);
for (slot = 1; slot <= numslots; slot++) {
pslot = NODE_DATA(nid)->node_mem_map +
@@ -304,7 +298,7 @@ void __init mem_init(void)
* free up the pages that hold the memmap entries.
*/
while (pg < pslot) {
- pg++; pgnr++;
+ pg++;
}
/*
@@ -312,8 +306,8 @@ void __init mem_init(void)
*/
pslot += slot_getsize(nid, slot);
while (pg < pslot) {
- if (!page_is_ram(pgnr))
- continue;
+ /* if (!page_is_ram(pgnr)) continue; */
+ /* commented out until page_is_ram works */
ClearPageReserved(pg);
atomic_set(&pg->count, 1);
__free_page(pg);
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 6410e974796a..d577bad64d68 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -1733,7 +1733,7 @@ void __init mem_init(void)
* Set up the zero page, mark it reserved, so that page count
* is not manipulated when freeing the page from user ptes.
*/
- mem_map_zero = _alloc_pages(GFP_KERNEL, 0);
+ mem_map_zero = alloc_pages(GFP_KERNEL, 0);
if (mem_map_zero == NULL) {
prom_printf("paging_init: Cannot alloc zero page.\n");
prom_halt();
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 776233f78ac3..8b5ae9a64e03 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -36,7 +36,7 @@ static kmem_cache_t *request_cachep;
/*
* plug management
*/
-static struct list_head blk_plug_list;
+static LIST_HEAD(blk_plug_list);
static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
/* blk_dev_struct is:
@@ -1875,27 +1875,16 @@ void end_that_request_last(struct request *req)
blk_put_request(req);
}
-#define MB(kb) ((kb) << 10)
-
int __init blk_dev_init(void)
{
- struct blk_dev_struct *dev;
- int total_ram;
+ int total_ram = nr_free_pages() << (PAGE_SHIFT - 10);
request_cachep = kmem_cache_create("blkdev_requests",
- sizeof(struct request),
- 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
-
+ sizeof(struct request), 0,
+ SLAB_HWCACHE_ALIGN, NULL, NULL);
if (!request_cachep)
panic("Can't create request pool slab cache\n");
- for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;)
- dev->queue = NULL;
-
- memset(ro_bits,0,sizeof(ro_bits));
-
- total_ram = nr_free_pages() << (PAGE_SHIFT - 10);
-
/*
* Free request slots per queue.
* (Half for reads, half for writes)
@@ -1911,17 +1900,12 @@ int __init blk_dev_init(void)
*/
if ((batch_requests = queue_nr_requests / 4) > 32)
batch_requests = 32;
- printk("block: %d slots per queue, batch=%d\n", queue_nr_requests, batch_requests);
+ printk("block: %d slots per queue, batch=%d\n",
+ queue_nr_requests, batch_requests);
blk_max_low_pfn = max_low_pfn;
blk_max_pfn = max_pfn;
- INIT_LIST_HEAD(&blk_plug_list);
-
-#if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_HD)
- hd_init();
-#endif
-
return 0;
};
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index a2f05f72791d..2b08e77a18bb 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -241,7 +241,7 @@ raw_read(struct file *filp, char *buf, size_t size, loff_t *offp)
static ssize_t
raw_write(struct file *filp, const char *buf, size_t size, loff_t *offp)
{
- struct iovec local_iov = { .iov_base = buf, .iov_len = size};
+ struct iovec local_iov = { .iov_base = (char *)buf, .iov_len = size};
return rw_raw_dev(WRITE, filp, &local_iov, 1, offp);
}
diff --git a/drivers/ide/legacy/hd.c b/drivers/ide/legacy/hd.c
index c0c042c1ddf1..24c598563bd4 100644
--- a/drivers/ide/legacy/hd.c
+++ b/drivers/ide/legacy/hd.c
@@ -846,7 +846,7 @@ static void __init hd_geninit(void)
}
}
-int __init hd_init(void)
+static int __init hd_init(void)
{
if (register_blkdev(MAJOR_NR,"hd",&hd_fops)) {
printk("hd: unable to get major %d for hard disk\n",MAJOR_NR);
diff --git a/drivers/pcmcia/sa1100.h b/drivers/pcmcia/sa1100.h
index 713f5b49cf34..53716e9dcf63 100644
--- a/drivers/pcmcia/sa1100.h
+++ b/drivers/pcmcia/sa1100.h
@@ -160,7 +160,7 @@ struct sa1100_pcmcia_socket {
*/
socket_state_t cs_state;
pccard_io_map io_map[MAX_IO_WIN];
- pccard_mem_map mem_map[MAX_WIN];
+ pccard_mem_map pc_mem_map[MAX_WIN];
void (*handler)(void *, unsigned int);
void *handler_info;
diff --git a/drivers/pcmcia/sa1100_generic.c b/drivers/pcmcia/sa1100_generic.c
index ef238c0f90b7..12dc9270e402 100644
--- a/drivers/pcmcia/sa1100_generic.c
+++ b/drivers/pcmcia/sa1100_generic.c
@@ -686,7 +686,7 @@ sa1100_pcmcia_get_mem_map(unsigned int sock, struct pccard_mem_map *map)
DEBUG(2, "%s() for sock %u\n", __FUNCTION__, sock);
if (map->map < MAX_WIN) {
- *map = skt->mem_map[map->map];
+ *map = skt->pc_mem_map[map->map];
ret = 0;
}
@@ -754,7 +754,7 @@ sa1100_pcmcia_set_mem_map(unsigned int sock, struct pccard_mem_map *map)
map->sys_stop += start;
map->sys_start = start;
- skt->mem_map[map->map] = *map;
+ skt->pc_mem_map[map->map] = *map;
return 0;
} /* sa1100_pcmcia_set_mem_map() */
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 101a2b10fe2e..448bb2f167b1 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -2537,6 +2537,7 @@ struct scatterlist *scsi_alloc_sgtable(Scsi_Cmnd *SCpnt, int gfp_mask)
{
struct scsi_host_sg_pool *sgp;
struct scatterlist *sgl;
+ int pf_flags;
BUG_ON(!SCpnt->use_sg);
@@ -2551,9 +2552,10 @@ struct scatterlist *scsi_alloc_sgtable(Scsi_Cmnd *SCpnt, int gfp_mask)
sgp = scsi_sg_pools + SCpnt->sglist_len;
+ pf_flags = current->flags;
current->flags |= PF_NOWARN;
sgl = mempool_alloc(sgp->pool, gfp_mask);
- current->flags &= ~PF_NOWARN;
+ current->flags = pf_flags;
if (sgl) {
memset(sgl, 0, sgp->size);
return sgl;
diff --git a/fs/bio.c b/fs/bio.c
index 95f402e38670..d2fa052eacc2 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -135,6 +135,7 @@ struct bio *bio_alloc(int gfp_mask, int nr_iovecs)
{
struct bio *bio;
struct bio_vec *bvl = NULL;
+ int pf_flags = current->flags;
current->flags |= PF_NOWARN;
bio = mempool_alloc(bio_pool, gfp_mask);
@@ -151,7 +152,7 @@ struct bio *bio_alloc(int gfp_mask, int nr_iovecs)
mempool_free(bio, bio_pool);
bio = NULL;
out:
- current->flags &= ~PF_NOWARN;
+ current->flags = pf_flags;
return bio;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 4fe7c935e4d6..cb06b5454e36 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -937,9 +937,11 @@ try_again:
head = NULL;
offset = PAGE_SIZE;
while ((offset -= size) >= 0) {
+ int pf_flags = current->flags;
+
current->flags |= PF_NOWARN;
bh = alloc_buffer_head();
- current->flags &= ~PF_NOWARN;
+ current->flags = pf_flags;
if (!bh)
goto no_grow;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 78a1b6ace494..99627183120e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -627,13 +627,13 @@ ext2_direct_IO(int rw, struct inode *inode, const struct iovec *iov,
}
static int
-ext2_writepages(struct address_space *mapping, int *nr_to_write)
+ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
int err;
ret = write_mapping_buffers(mapping);
- err = mpage_writepages(mapping, nr_to_write, ext2_get_block);
+ err = mpage_writepages(mapping, wbc, ext2_get_block);
if (!ret)
ret = err;
return ret;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 99157f9cb6e7..2b672bb2aed4 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1475,13 +1475,13 @@ struct address_space_operations ext3_aops = {
/* For writeback mode, we can use mpage_writepages() */
static int
-ext3_writepages(struct address_space *mapping, int *nr_to_write)
+ext3_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
int ret;
int err;
ret = write_mapping_buffers(mapping);
- err = mpage_writepages(mapping, nr_to_write, ext3_get_block);
+ err = mpage_writepages(mapping, wbc, ext3_get_block);
if (!ret)
ret = err;
return ret;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 25e0dfad847c..e306a31f46b5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -111,8 +111,7 @@ static void write_inode(struct inode *inode, int sync)
/*
* Write a single inode's dirty pages and inode data out to disk.
* If `sync' is set, wait on the writeout.
- * If `nr_to_write' is not NULL, subtract the number of written pages
- * from *nr_to_write.
+ * Subtract the number of written pages from nr_to_write.
*
* Normally it is not legal for a single process to lock more than one
* page at a time, due to ab/ba deadlock problems. But writepages()
@@ -127,7 +126,9 @@ static void write_inode(struct inode *inode, int sync)
*
* Called under inode_lock.
*/
-static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
+static void
+__sync_single_inode(struct inode *inode, int wait,
+ struct writeback_control *wbc)
{
unsigned dirty;
unsigned long orig_dirtied_when;
@@ -144,7 +145,7 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
mapping->dirtied_when = 0; /* assume it's whole-file writeback */
spin_unlock(&inode_lock);
- do_writepages(mapping, nr_to_write);
+ do_writepages(mapping, wbc);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
@@ -181,7 +182,8 @@ static void __sync_single_inode(struct inode *inode, int wait, int *nr_to_write)
* Write out an inode's dirty pages. Called under inode_lock.
*/
static void
-__writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
+__writeback_single_inode(struct inode *inode, int sync,
+ struct writeback_control *wbc)
{
if (current_is_pdflush() && (inode->i_state & I_LOCK))
return;
@@ -193,7 +195,7 @@ __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
iput(inode);
spin_lock(&inode_lock);
}
- __sync_single_inode(inode, sync, nr_to_write);
+ __sync_single_inode(inode, sync, wbc);
}
/*
@@ -226,8 +228,7 @@ __writeback_single_inode(struct inode *inode, int sync, int *nr_to_write)
* throttled threads: we don't want them all piling up on __wait_on_inode.
*/
static void
-sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
- int sync_mode, int *nr_to_write, unsigned long *older_than_this)
+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
{
struct list_head *tmp;
struct list_head *head;
@@ -241,7 +242,7 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
struct backing_dev_info *bdi;
int really_sync;
- if (single_bdi && mapping->backing_dev_info != single_bdi) {
+ if (wbc->bdi && mapping->backing_dev_info != wbc->bdi) {
if (sb != blockdev_superblock)
break; /* inappropriate superblock */
list_move(&inode->i_list, &sb->s_dirty);
@@ -252,23 +253,20 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
if (time_after(mapping->dirtied_when, start))
break;
- if (older_than_this &&
- time_after(mapping->dirtied_when, *older_than_this))
+ if (wbc->older_than_this && time_after(mapping->dirtied_when,
+ *wbc->older_than_this))
goto out;
bdi = mapping->backing_dev_info;
if (current_is_pdflush() && !writeback_acquire(bdi))
break;
- really_sync = (sync_mode == WB_SYNC_ALL);
- if ((sync_mode == WB_SYNC_LAST) && (head->prev == head))
- really_sync = 1;
-
+ really_sync = (wbc->sync_mode == WB_SYNC_ALL);
BUG_ON(inode->i_state & I_FREEING);
__iget(inode);
list_move(&inode->i_list, &sb->s_dirty);
- __writeback_single_inode(inode, really_sync, nr_to_write);
- if (sync_mode == WB_SYNC_HOLD) {
+ __writeback_single_inode(inode, really_sync, wbc);
+ if (wbc->sync_mode == WB_SYNC_HOLD) {
mapping->dirtied_when = jiffies;
list_move(&inode->i_list, &sb->s_dirty);
}
@@ -277,7 +275,7 @@ sync_sb_inodes(struct backing_dev_info *single_bdi, struct super_block *sb,
spin_unlock(&inode_lock);
iput(inode);
spin_lock(&inode_lock);
- if (nr_to_write && *nr_to_write <= 0)
+ if (wbc->nr_to_write <= 0)
break;
}
out:
@@ -288,16 +286,26 @@ out:
}
/*
+ * Start writeback of dirty pagecache data against all unlocked inodes.
+ *
+ * Note:
+ * We don't need to grab a reference to superblock here. If it has non-empty
+ * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
+ * past sync_inodes_sb() until both the ->s_dirty and ->s_io lists are
+ * empty. Since __sync_single_inode() regains inode_lock before it finally moves
+ * inode from superblock lists we are OK.
+ *
+ * If `older_than_this' is non-zero then only flush inodes which have a
+ * flushtime older than *older_than_this.
+ *
* If `bdi' is non-zero then we will scan the first inode against each
* superblock until we find the matching ones. One group will be the dirty
* inodes against a filesystem. Then when we hit the dummy blockdev superblock,
* sync_sb_inodes will seek out the blockdev which matches `bdi'. Maybe not
* super-efficient but we're about to do a ton of I/O...
*/
-static void
-__writeback_unlocked_inodes(struct backing_dev_info *bdi, int *nr_to_write,
- enum writeback_sync_modes sync_mode,
- unsigned long *older_than_this)
+void
+writeback_inodes(struct writeback_control *wbc)
{
struct super_block *sb;
@@ -307,11 +315,10 @@ __writeback_unlocked_inodes(struct backing_dev_info *bdi, int *nr_to_write,
for (; sb != sb_entry(&super_blocks); sb = sb_entry(sb->s_list.prev)) {
if (!list_empty(&sb->s_dirty) || !list_empty(&sb->s_io)) {
spin_unlock(&sb_lock);
- sync_sb_inodes(bdi, sb, sync_mode, nr_to_write,
- older_than_this);
+ sync_sb_inodes(sb, wbc);
spin_lock(&sb_lock);
}
- if (nr_to_write && *nr_to_write <= 0)
+ if (wbc->nr_to_write <= 0)
break;
}
spin_unlock(&sb_lock);
@@ -319,43 +326,6 @@ __writeback_unlocked_inodes(struct backing_dev_info *bdi, int *nr_to_write,
}
/*
- * Start writeback of dirty pagecache data against all unlocked inodes.
- *
- * Note:
- * We don't need to grab a reference to superblock here. If it has non-empty
- * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
- * past sync_inodes_sb() until both the ->s_dirty and ->s_io lists are
- * empty. Since __sync_single_inode() regains inode_lock before it finally moves
- * inode from superblock lists we are OK.
- *
- * If `older_than_this' is non-zero then only flush inodes which have a
- * flushtime older than *older_than_this.
- *
- * This is a "memory cleansing" operation, not a "data integrity" operation.
- */
-void writeback_unlocked_inodes(int *nr_to_write,
- enum writeback_sync_modes sync_mode,
- unsigned long *older_than_this)
-{
- __writeback_unlocked_inodes(NULL, nr_to_write,
- sync_mode, older_than_this);
-}
-/*
- * Perform writeback of dirty data against a particular queue.
- *
- * This is for writer throttling. We don't want processes to write back
- * other process's data, espsecially when the other data belongs to a
- * different spindle.
- */
-void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write,
- enum writeback_sync_modes sync_mode,
- unsigned long *older_than_this)
-{
- __writeback_unlocked_inodes(bdi, nr_to_write,
- sync_mode, older_than_this);
-}
-
-/*
* writeback and wait upon the filesystem's dirty inodes. The caller will
* do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is
* used to park the written inodes on sb->s_dirty for the wait pass.
@@ -366,14 +336,17 @@ void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write,
void sync_inodes_sb(struct super_block *sb, int wait)
{
struct page_state ps;
- int nr_to_write;
+ struct writeback_control wbc = {
+ .bdi = NULL,
+ .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+ .older_than_this = NULL,
+ .nr_to_write = 0,
+ };
get_page_state(&ps);
- nr_to_write = ps.nr_dirty + ps.nr_dirty / 4;
-
+ wbc.nr_to_write = ps.nr_dirty + ps.nr_dirty / 4;
spin_lock(&inode_lock);
- sync_sb_inodes(NULL, sb, wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
- &nr_to_write, NULL);
+ sync_sb_inodes(sb, &wbc);
spin_unlock(&inode_lock);
}
@@ -466,8 +439,12 @@ void sync_inodes(int wait)
void write_inode_now(struct inode *inode, int sync)
{
+ struct writeback_control wbc = {
+ .nr_to_write = LONG_MAX,
+ };
+
spin_lock(&inode_lock);
- __writeback_single_inode(inode, sync, NULL);
+ __writeback_single_inode(inode, sync, &wbc);
spin_unlock(&inode_lock);
if (sync)
wait_on_inode(inode);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 65d1dff1f80d..91ab1b3f723f 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -282,9 +282,10 @@ static int jfs_writepage(struct page *page)
return block_write_full_page(page, jfs_get_block);
}
-static int jfs_writepages(struct address_space *mapping, int *nr_to_write)
+static int jfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return mpage_writepages(mapping, nr_to_write, jfs_get_block);
+ return mpage_writepages(mapping, wbc, jfs_get_block);
}
static int jfs_readpage(struct file *file, struct page *page)
diff --git a/fs/mpage.c b/fs/mpage.c
index 363085535ddf..a200d8f68fb8 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -484,7 +484,7 @@ out:
* address space and writepage() all of them.
*
* @mapping: address space structure to write
- * @nr_to_write: subtract the number of written pages from *@nr_to_write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
* @get_block: the filesystem's block mapper function.
* If this is NULL then use a_ops->writepage. Otherwise, go
* direct-to-BIO.
@@ -520,7 +520,7 @@ out:
*/
int
mpage_writepages(struct address_space *mapping,
- int *nr_to_write, get_block_t get_block)
+ struct writeback_control *wbc, get_block_t get_block)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
@@ -583,7 +583,7 @@ mpage_writepages(struct address_space *mapping,
__set_page_dirty_nobuffers(page);
ret = 0;
}
- if (ret || (nr_to_write && --(*nr_to_write) <= 0))
+ if (ret || (--(wbc->nr_to_write) <= 0))
done = 1;
} else {
unlock_page(page);
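[Editorial sketch: the ext2, ext3 and jfs hunks in this patch all take the same shape; a converted ->writepages method under the new interface looks like the fragment below. example_writepages and example_get_block are hypothetical names, not from this patch.]

    static int example_writepages(struct address_space *mapping,
                                  struct writeback_control *wbc)
    {
            /* mpage_writepages() decrements wbc->nr_to_write itself */
            return mpage_writepages(mapping, wbc, example_get_block);
    }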
diff --git a/fs/proc/array.c b/fs/proc/array.c
index feb2cbab4699..c1587b0cc89b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -394,131 +394,40 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
return res;
}
-static inline void statm_pte_range(pmd_t * pmd, unsigned long address, unsigned long size, int * pages, int * shared, int * dirty, int * total)
+int proc_pid_statm(task_t *task, char *buffer)
{
- unsigned long end, pmd_end;
- pte_t *pte;
-
- if (pmd_none(*pmd))
- return;
- if (pmd_bad(*pmd)) {
- pmd_ERROR(*pmd);
- pmd_clear(pmd);
- return;
- }
- preempt_disable();
- pte = pte_offset_map(pmd, address);
- end = address + size;
- pmd_end = (address + PMD_SIZE) & PMD_MASK;
- if (end > pmd_end)
- end = pmd_end;
- do {
- pte_t page = *pte;
- struct page *ptpage;
- unsigned long pfn;
+ int size, resident, shared, text, lib, data, dirty;
+ struct mm_struct *mm = get_task_mm(task);
+ struct vm_area_struct * vma;
- address += PAGE_SIZE;
- pte++;
- if (pte_none(page))
- continue;
- ++*total;
- if (!pte_present(page))
- continue;
- pfn = pte_pfn(page);
- if (!pfn_valid(pfn))
- continue;
- ptpage = pfn_to_page(pfn);
- if (PageReserved(ptpage))
- continue;
- ++*pages;
- if (pte_dirty(page))
- ++*dirty;
- if (page_count(pte_page(page)) > 1)
- ++*shared;
- } while (address < end);
- pte_unmap(pte - 1);
- preempt_enable();
-}
+ size = resident = shared = text = lib = data = dirty = 0;
-static inline void statm_pmd_range(pgd_t * pgd, unsigned long address, unsigned long size,
- int * pages, int * shared, int * dirty, int * total)
-{
- pmd_t * pmd;
- unsigned long end;
-
- if (pgd_none(*pgd))
- return;
- if (pgd_bad(*pgd)) {
- pgd_ERROR(*pgd);
- pgd_clear(pgd);
- return;
- }
- pmd = pmd_offset(pgd, address);
- address &= ~PGDIR_MASK;
- end = address + size;
- if (end > PGDIR_SIZE)
- end = PGDIR_SIZE;
- do {
- statm_pte_range(pmd, address, end - address, pages, shared, dirty, total);
- address = (address + PMD_SIZE) & PMD_MASK;
- pmd++;
- } while (address < end);
-}
-
-static void statm_pgd_range(pgd_t * pgd, unsigned long address, unsigned long end,
- int * pages, int * shared, int * dirty, int * total)
-{
- while (address < end) {
- statm_pmd_range(pgd, address, end - address, pages, shared, dirty, total);
- address = (address + PGDIR_SIZE) & PGDIR_MASK;
- pgd++;
- }
-}
+ if (!mm)
+ goto out;
-int proc_pid_statm(struct task_struct *task, char * buffer)
-{
- int size=0, resident=0, share=0, trs=0, lrs=0, drs=0, dt=0;
- struct mm_struct *mm = get_task_mm(task);
+ down_read(&mm->mmap_sem);
+ resident = mm->rss;
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ int pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
- if (mm) {
- struct vm_area_struct * vma;
- down_read(&mm->mmap_sem);
- vma = mm->mmap;
- while (vma) {
- pgd_t *pgd = pgd_offset(mm, vma->vm_start);
- int pages = 0, shared = 0, dirty = 0, total = 0;
- if (is_vm_hugetlb_page(vma)) {
- int num_pages = ((vma->vm_end - vma->vm_start)/PAGE_SIZE);
-
- resident += num_pages;
- if (!(vma->vm_flags & VM_DONTCOPY))
- share += num_pages;
- if (vma->vm_flags & VM_WRITE)
- dt += num_pages;
- drs += num_pages;
- vma = vma->vm_next;
- continue;
- }
- statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total);
- resident += pages;
- share += shared;
- dt += dirty;
- size += total;
- if (vma->vm_flags & VM_EXECUTABLE)
- trs += pages; /* text */
- else if (vma->vm_flags & VM_GROWSDOWN)
- drs += pages; /* stack */
- else if (vma->vm_end > 0x60000000)
- lrs += pages; /* library */
- else
- drs += pages;
- vma = vma->vm_next;
+ size += pages;
+ if (is_vm_hugetlb_page(vma)) {
+ if (!(vma->vm_flags & VM_DONTCOPY))
+ shared += pages;
+ continue;
}
- up_read(&mm->mmap_sem);
- mmput(mm);
+ if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared))
+ shared += pages;
+ if (vma->vm_flags & VM_EXECUTABLE)
+ text += pages;
+ else
+ data += pages;
}
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+out:
return sprintf(buffer,"%d %d %d %d %d %d %d\n",
- size, resident, share, trs, lrs, drs, dt);
+ size, resident, shared, text, lib, data, dirty);
}
/*
diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h
index 572569df5dd4..4059862d4b3d 100644
--- a/include/asm-alpha/mmzone.h
+++ b/include/asm-alpha/mmzone.h
@@ -36,18 +36,14 @@ extern plat_pg_data_t *plat_node_data[];
#ifdef CONFIG_ALPHA_WILDFIRE
# define ALPHA_PA_TO_NID(pa) ((pa) >> 36) /* 16 nodes max due 43bit kseg */
-#define NODE_MAX_MEM_SIZE (64L * 1024L * 1024L * 1024L) /* 64 GB */
-#define MAX_NUMNODES WILDFIRE_MAX_QBB
+# define NODE_MAX_MEM_SIZE (64L * 1024L * 1024L * 1024L) /* 64 GB */
#else
# define ALPHA_PA_TO_NID(pa) (0)
-#define NODE_MAX_MEM_SIZE (~0UL)
-#define MAX_NUMNODES 1
+# define NODE_MAX_MEM_SIZE (~0UL)
#endif
#define PHYSADDR_TO_NID(pa) ALPHA_PA_TO_NID(pa)
#define PLAT_NODE_DATA(n) (plat_node_data[(n)])
-#define PLAT_NODE_DATA_STARTNR(n) \
- (PLAT_NODE_DATA(n)->gendata.node_start_mapnr)
#define PLAT_NODE_DATA_SIZE(n) (PLAT_NODE_DATA(n)->gendata.node_size)
#if 1
diff --git a/include/asm-alpha/numnodes.h b/include/asm-alpha/numnodes.h
new file mode 100644
index 000000000000..4ff6b3ecfbed
--- /dev/null
+++ b/include/asm-alpha/numnodes.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_MAX_NUMNODES_H
+#define _ASM_MAX_NUMNODES_H
+
+/*
+ * Currently the Wildfire is the only discontigmem/NUMA capable Alpha core.
+ */
+#if defined(CONFIG_ALPHA_WILDFIRE) || defined(CONFIG_ALPHA_GENERIC)
+# include <asm/core_wildfire.h>
+# define MAX_NUMNODES WILDFIRE_MAX_QBB
+#endif
+
+#endif /* _ASM_MAX_NUMNODES_H */
diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index d2994f116f03..00a5d7ffbed9 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -6,12 +6,13 @@
#ifndef _ASM_MMZONE_H_
#define _ASM_MMZONE_H_
+#include <asm/smp.h>
+
#ifdef CONFIG_DISCONTIGMEM
#ifdef CONFIG_X86_NUMAQ
#include <asm/numaq.h>
#else
-#define pa_to_nid(pa) (0)
#define pfn_to_nid(pfn) (0)
#ifdef CONFIG_NUMA
#define _cpu_to_node(cpu) 0
@@ -44,7 +45,6 @@ extern struct pglist_data *node_data[];
#define alloc_bootmem_low_pages_node(ignore, x) \
__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
-#define node_startnr(nid) (node_data[nid]->node_start_mapnr)
#define node_size(nid) (node_data[nid]->node_size)
#define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn)
@@ -55,7 +55,7 @@ extern struct pglist_data *node_data[];
/*
* Given a kernel address, find the home node of the underlying memory.
*/
-#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr))
+#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
/*
* Return a pointer to the node data for node n.
@@ -64,6 +64,8 @@ extern struct pglist_data *node_data[];
#define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map)
#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
+#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
+ NODE_DATA(nid)->node_size)
#define local_mapnr(kvaddr) \
( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) )
@@ -74,5 +76,13 @@ extern struct pglist_data *node_data[];
#define pfn_to_page(pfn) (node_mem_map(pfn_to_nid(pfn)) + node_localnr(pfn, pfn_to_nid(pfn)))
#define page_to_pfn(page) ((page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn)
#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
+/*
+ * pfn_valid should be made as fast as possible, and the current definition
+ * is valid for machines that are NUMA, but still contiguous, which is what
+ * is currently supported. A more generalised, but slower definition would
+ * be something like this - mbligh:
+ * ( pfn_to_pgdat(pfn) && (pfn < node_end_pfn(pfn_to_nid(pfn))) )
+ */
+#define pfn_valid(pfn) (pfn < num_physpages)
#endif /* CONFIG_DISCONTIGMEM */
#endif /* _ASM_MMZONE_H_ */
diff --git a/include/asm-i386/numaq.h b/include/asm-i386/numaq.h
index ed10442f1dcc..b32b28c12c73 100644
--- a/include/asm-i386/numaq.h
+++ b/include/asm-i386/numaq.h
@@ -32,17 +32,18 @@
/*
* for now assume that 64Gb is max amount of RAM for whole system
- * 64Gb * 1024Mb/Gb = 65536 Mb
- * 65536 Mb / 256Mb = 256
+ * 64Gb / 4096bytes/page = 16777216 pages
*/
+#define MAX_NR_PAGES 16777216
#define MAX_ELEMENTS 256
-#define ELEMENT_REPRESENTS 8 /* 256 Mb */
+#define PAGES_PER_ELEMENT (16777216/256)
+#define pfn_to_pgdat(pfn) NODE_DATA(pfn_to_nid(pfn))
+#define PHYSADDR_TO_NID(pa) pfn_to_nid(pa >> PAGE_SHIFT)
#define MAX_NUMNODES 8
#ifdef CONFIG_NUMA
#define _cpu_to_node(cpu) (cpu_to_logical_apicid(cpu) >> 4)
#endif /* CONFIG_NUMA */
-extern int pa_to_nid(u64);
extern int pfn_to_nid(unsigned long);
extern void get_memcfg_numaq(void);
#define get_memcfg_numa() get_memcfg_numaq()
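[Editorial note on the arithmetic behind the new constants: 64 GB = 64 * 2^30 bytes, and with 4096-byte pages that is 64 * 2^30 / 2^12 = 16777216 pages (MAX_NR_PAGES). Spreading those over MAX_ELEMENTS = 256 map entries gives PAGES_PER_ELEMENT = 16777216 / 256 = 65536 pages, so each physnode_map element still covers 256 MB of physical address space, exactly what the old ELEMENT_REPRESENTS = 8 (2^8 MB) encoding expressed.]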
diff --git a/include/asm-i386/max_numnodes.h b/include/asm-i386/numnodes.h
index 2b63299604ef..2b63299604ef 100644
--- a/include/asm-i386/max_numnodes.h
+++ b/include/asm-i386/numnodes.h
diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h
index 5a09fd4b72f1..f9fe284b9057 100644
--- a/include/asm-i386/page.h
+++ b/include/asm-i386/page.h
@@ -145,10 +145,10 @@ static __inline__ int get_order(unsigned long size)
#ifndef CONFIG_DISCONTIGMEM
#define pfn_to_page(pfn) (mem_map + (pfn))
#define page_to_pfn(page) ((unsigned long)((page) - mem_map))
+#define pfn_valid(pfn) ((pfn) < max_mapnr)
#endif /* !CONFIG_DISCONTIGMEM */
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
-#define pfn_valid(pfn) ((pfn) < max_mapnr)
#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \
diff --git a/include/asm-mips64/mmzone.h b/include/asm-mips64/mmzone.h
index 5e643b114269..d60ad12acd75 100644
--- a/include/asm-mips64/mmzone.h
+++ b/include/asm-mips64/mmzone.h
@@ -24,7 +24,6 @@ extern plat_pg_data_t *plat_node_data[];
#define PHYSADDR_TO_NID(pa) NASID_TO_COMPACT_NODEID(NASID_GET(pa))
#define PLAT_NODE_DATA(n) (plat_node_data[n])
-#define PLAT_NODE_DATA_STARTNR(n) (PLAT_NODE_DATA(n)->gendata.node_start_mapnr)
#define PLAT_NODE_DATA_SIZE(n) (PLAT_NODE_DATA(n)->gendata.node_size)
#define PLAT_NODE_DATA_LOCALNR(p, n) \
(((p) >> PAGE_SHIFT) - PLAT_NODE_DATA(n)->gendata.node_start_pfn)
diff --git a/include/asm-mips64/pgtable.h b/include/asm-mips64/pgtable.h
index ded7d0a0a986..b32768e57d16 100644
--- a/include/asm-mips64/pgtable.h
+++ b/include/asm-mips64/pgtable.h
@@ -373,10 +373,10 @@ extern inline void pgd_clear(pgd_t *pgdp)
#ifndef CONFIG_DISCONTIGMEM
#define pte_page(x) (mem_map+(unsigned long)((pte_val(x) >> PAGE_SHIFT)))
#else
-#define mips64_pte_pagenr(x) \
- (PLAT_NODE_DATA_STARTNR(PHYSADDR_TO_NID(pte_val(x))) + \
- PLAT_NODE_DATA_LOCALNR(pte_val(x), PHYSADDR_TO_NID(pte_val(x))))
-#define pte_page(x) (mem_map+mips64_pte_pagenr(x))
+
+#define pte_page(x) ( NODE_MEM_MAP(PHYSADDR_TO_NID(pte_val(x))) + \
+ PLAT_NODE_DATA_LOCALNR(pte_val(x), PHYSADDR_TO_NID(pte_val(x))) )
+
#endif
/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 804ea47301f5..56f2bab87d7f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -279,6 +279,7 @@ struct iattr {
*/
struct page;
struct address_space;
+struct writeback_control;
struct address_space_operations {
int (*writepage)(struct page *);
@@ -286,10 +287,10 @@ struct address_space_operations {
int (*sync_page)(struct page *);
/* Write back some dirty pages from this mapping. */
- int (*writepages)(struct address_space *, int *nr_to_write);
+ int (*writepages)(struct address_space *, struct writeback_control *);
/* Perform a writeback as a memory-freeing operation. */
- int (*vm_writeback)(struct page *, int *nr_to_write);
+ int (*vm_writeback)(struct page *, struct writeback_control *);
/* Set a page dirty */
int (*set_page_dirty)(struct page *page);
@@ -1259,7 +1260,8 @@ extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
extern int generic_file_open(struct inode * inode, struct file * filp);
-extern int generic_vm_writeback(struct page *page, int *nr_to_write);
+extern int generic_vm_writeback(struct page *page,
+ struct writeback_control *wbc);
extern struct file_operations generic_ro_fops;
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 10021357c093..437572e2240b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -39,18 +39,25 @@
* can allocate highmem pages, the *get*page*() variants return
* virtual kernel addresses to the allocated page(s).
*/
-extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned int order));
extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, struct zonelist *zonelist));
extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order);
+/*
+ * We get the zone list from the current node and the gfp_mask.
+ * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones.
+ *
+ * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets
+ * optimized to &contig_page_data at compile-time.
+ */
static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order)
{
- /*
- * Gets optimized away by the compiler.
- */
- if (order >= MAX_ORDER)
+ pg_data_t *pgdat = NODE_DATA(numa_node_id());
+ unsigned int idx = (gfp_mask & GFP_ZONEMASK);
+
+ if (unlikely(order >= MAX_ORDER))
return NULL;
- return _alloc_pages(gfp_mask, order);
+
+ return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + idx);
}
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
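[Editorial sketch of the caller's view of the reworked allocator entry points; the node id 1 is an arbitrary example and assumes a NUMA configuration with more than one node.]

    struct page *page;

    /* allocate one page from the local node's zonelist */
    page = alloc_pages(GFP_KERNEL, 0);
    if (page)
            __free_pages(page, 0);

    /* explicitly allocate 2^2 pages from node 1's zonelists */
    page = alloc_pages_node(1, GFP_KERNEL, 2);
    if (page)
            __free_pages(page, 2);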
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7483c39e28dd..c63e4947387f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -15,7 +15,10 @@
#include <linux/rbtree.h>
#include <linux/fs.h>
+#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */
extern unsigned long max_mapnr;
+#endif
+
extern unsigned long num_physpages;
extern void * high_memory;
extern int page_cluster;
@@ -345,8 +348,10 @@ static inline int page_mapped(struct page *page)
#define VM_FAULT_MINOR 1
#define VM_FAULT_MAJOR 2
-/* The array of struct pages */
+#ifndef CONFIG_DISCONTIGMEM
+/* The array of struct pages - for discontigmem use pgdat->lmem_map */
extern struct page *mem_map;
+#endif
extern void show_free_areas(void);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8ebf441bdb47..580c39c4dcc1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -10,11 +10,14 @@
#include <linux/wait.h>
#include <linux/cache.h>
#include <asm/atomic.h>
+#ifdef CONFIG_DISCONTIGMEM
+#include <asm/numnodes.h>
+#endif
+#ifndef MAX_NUMNODES
+#define MAX_NUMNODES 1
+#endif
-/*
- * Free memory management - zoned buddy allocator.
- */
-
+/* Free memory management - zoned buddy allocator. */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
@@ -112,7 +115,6 @@ struct zone {
struct page *zone_mem_map;
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
- unsigned long zone_start_mapnr;
/*
* rarely used fields:
@@ -138,7 +140,7 @@ struct zone {
* footprint of this construct is very small.
*/
struct zonelist {
- struct zone *zones[MAX_NR_ZONES+1]; // NULL delimited
+ struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
};
#define GFP_ZONEMASK 0x0f
@@ -163,7 +165,6 @@ typedef struct pglist_data {
unsigned long *valid_addr_bitmap;
struct bootmem_data *bdata;
unsigned long node_start_pfn;
- unsigned long node_start_mapnr;
unsigned long node_size;
int node_id;
struct pglist_data *pgdat_next;
@@ -187,10 +188,12 @@ memclass(struct zone *pgzone, struct zone *classzone)
* prototypes for the discontig memory code.
*/
struct page;
-void free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
- unsigned long *zones_size, unsigned long paddr, unsigned long *zholes_size,
- struct page *pmap);
+extern void calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size,
+ unsigned long *zholes_size);
+extern void free_area_init_core(pg_data_t *pgdat, unsigned long *zones_size,
+ unsigned long *zholes_size);
void get_zone_counts(unsigned long *active, unsigned long *inactive);
+extern void build_all_zonelists(void);
extern pg_data_t contig_page_data;
diff --git a/include/linux/mpage.h b/include/linux/mpage.h
index 52253d90f55d..86aa7b676274 100644
--- a/include/linux/mpage.h
+++ b/include/linux/mpage.h
@@ -10,14 +10,16 @@
* nested includes. Get it right in the .c file).
*/
+struct writeback_control;
+
int mpage_readpages(struct address_space *mapping, struct list_head *pages,
unsigned nr_pages, get_block_t get_block);
int mpage_readpage(struct page *page, get_block_t get_block);
int mpage_writepages(struct address_space *mapping,
- int *nr_to_write, get_block_t get_block);
+ struct writeback_control *wbc, get_block_t get_block);
static inline int
-generic_writepages(struct address_space *mapping, int *nr_to_write)
+generic_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
- return mpage_writepages(mapping, nr_to_write, NULL);
+ return mpage_writepages(mapping, wbc, NULL);
}
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 3127165e7c13..9fd7d5c05605 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -128,7 +128,6 @@ enum
KERN_TAINTED=53, /* int: various kernel tainted flags */
KERN_CADPID=54, /* int: PID of the process to notify on CAD */
KERN_PIDMAX=55, /* int: PID # limit */
- KERN_HUGETLB_PAGE_NUM=56, /* int: Number of available Huge Pages */
};
@@ -147,12 +146,12 @@ enum
VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */
VM_DIRTY_BACKGROUND=11, /* dirty_background_ratio */
VM_DIRTY_ASYNC=12, /* dirty_async_ratio */
- VM_DIRTY_SYNC=13, /* dirty_sync_ratio */
- VM_DIRTY_WB_CS=14, /* dirty_writeback_centisecs */
- VM_DIRTY_EXPIRE_CS=15, /* dirty_expire_centisecs */
- VM_NR_PDFLUSH_THREADS=16, /* nr_pdflush_threads */
- VM_OVERCOMMIT_RATIO=17, /* percent of RAM to allow overcommit in */
- VM_PAGEBUF=18 /* struct: Control pagebuf parameters */
+ VM_DIRTY_WB_CS=13, /* dirty_writeback_centisecs */
+ VM_DIRTY_EXPIRE_CS=14, /* dirty_expire_centisecs */
+ VM_NR_PDFLUSH_THREADS=15, /* nr_pdflush_threads */
+ VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */
+ VM_PAGEBUF=17, /* struct: Control pagebuf parameters */
+ VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */
};
diff --git a/include/linux/uio.h b/include/linux/uio.h
index ec098c8e6793..85b2f0ec9d3f 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -35,7 +35,11 @@ struct iovec
#endif
/*
- * Total number of bytes covered by an iovec
+ * Total number of bytes covered by an iovec.
+ *
+ * NOTE that it is not safe to use this function until all the iovec's
+ * segment lengths have been validated. Because the individual lengths can
+ * overflow a size_t when added together.
*/
static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
{
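[Editorial sketch: the warning above is why this patch stops trusting iov_length() up front in mm/filemap.c. The replacement pattern, taken in shape from the filemap.c hunks below, validates each segment while accumulating the total and rejects any negative or wrapping length.]

    size_t count = 0;
    unsigned long seg;

    for (seg = 0; seg < nr_segs; seg++) {
            count += iov[seg].iov_len;
            /* fails if a segment length, or the running total, goes
             * negative when viewed as ssize_t */
            if (unlikely((ssize_t)(count | iov[seg].iov_len) < 0))
                    return -EINVAL;
    }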
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 5de884cd6a7c..c35b96eb6a90 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -27,22 +27,29 @@ static inline int current_is_pdflush(void)
* fs/fs-writeback.c
*/
enum writeback_sync_modes {
- WB_SYNC_NONE = 0, /* Don't wait on anything */
- WB_SYNC_LAST = 1, /* Wait on the last-written mapping */
- WB_SYNC_ALL = 2, /* Wait on every mapping */
- WB_SYNC_HOLD = 3, /* Hold the inode on sb_dirty for sys_sync() */
+ WB_SYNC_NONE, /* Don't wait on anything */
+ WB_SYNC_ALL, /* Wait on every mapping */
+ WB_SYNC_HOLD, /* Hold the inode on sb_dirty for sys_sync() */
};
-void writeback_unlocked_inodes(int *nr_to_write,
- enum writeback_sync_modes sync_mode,
- unsigned long *older_than_this);
+/*
+ * A control structure which tells the writeback code what to do
+ */
+struct writeback_control {
+ struct backing_dev_info *bdi; /* If !NULL, only write back this
+ queue */
+ enum writeback_sync_modes sync_mode;
+ unsigned long *older_than_this; /* If !NULL, only write back inodes
+ older than this */
+ long nr_to_write; /* Write this many pages, and decrement
+ this for each page written */
+};
+
+void writeback_inodes(struct writeback_control *wbc);
void wake_up_inode(struct inode *inode);
void __wait_on_inode(struct inode * inode);
void sync_inodes_sb(struct super_block *, int wait);
void sync_inodes(int wait);
-void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write,
- enum writeback_sync_modes sync_mode,
- unsigned long *older_than_this);
/* writeback.h requires fs.h; it, too, is not included from here. */
static inline void wait_on_inode(struct inode *inode)
@@ -57,7 +64,6 @@ static inline void wait_on_inode(struct inode *inode)
/* These 5 are exported to sysctl. */
extern int dirty_background_ratio;
extern int dirty_async_ratio;
-extern int dirty_sync_ratio;
extern int dirty_writeback_centisecs;
extern int dirty_expire_centisecs;
@@ -65,7 +71,7 @@ extern int dirty_expire_centisecs;
void balance_dirty_pages(struct address_space *mapping);
void balance_dirty_pages_ratelimited(struct address_space *mapping);
int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
-int do_writepages(struct address_space *mapping, int *nr_to_write);
+int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
/* pdflush.c */
extern int nr_pdflush_threads; /* Global so it can be exported to sysctl
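[Editorial sketch of the calling convention the rest of the patch converts to: callers describe the writeback they want in a struct writeback_control and pass it down, instead of threading four separate arguments. The nr_to_write budget of 1024 is an arbitrary example.]

    struct writeback_control wbc = {
            .bdi             = NULL,          /* all queues */
            .sync_mode       = WB_SYNC_NONE,  /* don't wait on writeout */
            .older_than_this = NULL,          /* no age filter */
            .nr_to_write     = 1024,          /* page budget */
    };

    writeback_inodes(&wbc);
    /* wbc.nr_to_write has now been decremented by the pages written */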
diff --git a/init/main.c b/init/main.c
index 9c38da7a9bd0..b47b623aa6a0 100644
--- a/init/main.c
+++ b/init/main.c
@@ -393,6 +393,7 @@ asmlinkage void __init start_kernel(void)
printk(linux_banner);
setup_arch(&command_line);
setup_per_cpu_areas();
+ build_all_zonelists();
printk("Kernel command line: %s\n", saved_command_line);
parse_options(command_line);
trap_init();
diff --git a/kernel/ksyms.c b/kernel/ksyms.c
index 4dc1840a8b70..4931e909724f 100644
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -91,7 +91,6 @@ EXPORT_SYMBOL(do_brk);
EXPORT_SYMBOL(exit_mm);
/* internal kernel memory management */
-EXPORT_SYMBOL(_alloc_pages);
EXPORT_SYMBOL(__alloc_pages);
EXPORT_SYMBOL(alloc_pages_node);
EXPORT_SYMBOL(__get_free_pages);
@@ -116,9 +115,12 @@ EXPORT_SYMBOL(vmalloc_32);
EXPORT_SYMBOL(vmap);
EXPORT_SYMBOL(vunmap);
EXPORT_SYMBOL(vmalloc_to_page);
-EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(remap_page_range);
+#ifndef CONFIG_DISCONTIGMEM
+EXPORT_SYMBOL(contig_page_data);
+EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(max_mapnr);
+#endif
EXPORT_SYMBOL(high_memory);
EXPORT_SYMBOL(vmtruncate);
EXPORT_SYMBOL(find_vma);
diff --git a/kernel/printk.c b/kernel/printk.c
index ca1cd3fea625..a3d23302ae5b 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -525,11 +525,11 @@ void release_console_sem(void)
{
unsigned long flags;
unsigned long _con_start, _log_end;
- unsigned long must_wake_klogd = 0;
+ unsigned long wake_klogd = 0;
for ( ; ; ) {
spin_lock_irqsave(&logbuf_lock, flags);
- must_wake_klogd |= log_start - log_end;
+ wake_klogd |= log_start - log_end;
if (con_start == log_end)
break; /* Nothing to print */
_con_start = con_start;
@@ -541,7 +541,7 @@ void release_console_sem(void)
console_may_schedule = 0;
up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
- if (must_wake_klogd && !oops_in_progress)
+ if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait))
wake_up_interruptible(&log_wait);
}
diff --git a/kernel/suspend.c b/kernel/suspend.c
index 2d7eeaabe127..419490900ff6 100644
--- a/kernel/suspend.c
+++ b/kernel/suspend.c
@@ -471,10 +471,12 @@ static int count_and_copy_data_pages(struct pbe *pagedir_p)
int nr_copy_pages = 0;
int pfn;
struct page *page;
-
+
+#ifndef CONFIG_DISCONTIGMEM
if (max_mapnr != num_physpages)
panic("mapnr is not expected");
- for (pfn = 0; pfn < max_mapnr; pfn++) {
+#endif
+ for (pfn = 0; pfn < num_physpages; pfn++) {
page = pfn_to_page(pfn);
if (PageHighMem(page))
panic("Swsusp not supported on highmem boxes. Send 1GB of RAM to <pavel@ucw.cz> and try again ;-).");
@@ -514,19 +516,20 @@ static int count_and_copy_data_pages(struct pbe *pagedir_p)
static void free_suspend_pagedir(unsigned long this_pagedir)
{
- struct page *page = mem_map;
- int i;
+ struct page *page;
+ int pfn;
unsigned long this_pagedir_end = this_pagedir +
(PAGE_SIZE << pagedir_order);
- for(i=0; i < num_physpages; i++, page++) {
+ for(pfn = 0; pfn < num_physpages; pfn++) {
+ page = pfn_to_page(pfn);
if (!TestClearPageNosave(page))
continue;
- if (ADDRESS(i) >= this_pagedir && ADDRESS(i) < this_pagedir_end)
+ if (ADDRESS(pfn) >= this_pagedir && ADDRESS(pfn) < this_pagedir_end)
continue; /* old pagedir gets freed in one */
- free_page(ADDRESS(i));
+ free_page(ADDRESS(pfn));
}
free_pages(this_pagedir, pagedir_order);
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6f92068e3f29..1a63d254ab80 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -99,8 +99,8 @@ extern int acct_parm[];
#endif
#ifdef CONFIG_HUGETLB_PAGE
-extern int htlbpage_max;
-extern int set_hugetlb_mem_size(int);
+extern int htlbpage_max;
+extern int set_hugetlb_mem_size(int);
#endif
static int parse_table(int *, int, void *, size_t *, void *, size_t,
@@ -263,10 +263,6 @@ static ctl_table kern_table[] = {
#endif
{KERN_PIDMAX, "pid_max", &pid_max, sizeof (int),
0600, NULL, &proc_dointvec},
-#ifdef CONFIG_HUGETLB_PAGE
- {KERN_HUGETLB_PAGE_NUM, "numhugepages", &htlbpage_max, sizeof(int), 0644, NULL,
- &proc_dointvec},
-#endif
{0}
};
@@ -292,9 +288,6 @@ static ctl_table vm_table[] = {
{VM_DIRTY_ASYNC, "dirty_async_ratio", &dirty_async_ratio,
sizeof(dirty_async_ratio), 0644, NULL, &proc_dointvec_minmax,
&sysctl_intvec, NULL, &zero, &one_hundred },
- {VM_DIRTY_SYNC, "dirty_sync_ratio", &dirty_sync_ratio,
- sizeof(dirty_sync_ratio), 0644, NULL, &proc_dointvec_minmax,
- &sysctl_intvec, NULL, &zero, &one_hundred },
{VM_DIRTY_WB_CS, "dirty_writeback_centisecs",
&dirty_writeback_centisecs, sizeof(dirty_writeback_centisecs), 0644,
NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL,
@@ -317,6 +310,10 @@ static ctl_table vm_table[] = {
{ VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads",
&nr_pdflush_threads, sizeof nr_pdflush_threads,
0444 /* read-only*/, NULL, &proc_dointvec},
+#ifdef CONFIG_HUGETLB_PAGE
+ {VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644, NULL,
+ &proc_dointvec},
+#endif
{0}
};
diff --git a/mm/filemap.c b/mm/filemap.c
index 483699da95d0..3aa685fdcf25 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -487,9 +487,13 @@ EXPORT_SYMBOL(fail_writepage);
int filemap_fdatawrite(struct address_space *mapping)
{
int ret;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = mapping->nrpages * 2,
+ };
current->flags |= PF_SYNC;
- ret = do_writepages(mapping, NULL);
+ ret = do_writepages(mapping, &wbc);
current->flags &= ~PF_SYNC;
return ret;
}
@@ -1130,10 +1134,26 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
struct file *filp = iocb->ki_filp;
ssize_t retval;
unsigned long seg;
- size_t count = iov_length(iov, nr_segs);
+ size_t count;
- if ((ssize_t) count < 0)
- return -EINVAL;
+ count = 0;
+ for (seg = 0; seg < nr_segs; seg++) {
+ const struct iovec *iv = &iov[seg];
+
+ /*
+ * If any segment has a negative length, or the cumulative
+ * length ever wraps negative then return -EINVAL.
+ */
+ count += iv->iov_len;
+ if (unlikely((ssize_t)(count|iv->iov_len) < 0))
+ return -EINVAL;
+ if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
+ continue;
+ if (seg == 0)
+ return -EFAULT;
+ nr_segs = seg;
+ break;
+ }
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
if (filp->f_flags & O_DIRECT) {
@@ -1162,11 +1182,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
goto out;
}
- for (seg = 0; seg < nr_segs; seg++) {
- if (!access_ok(VERIFY_WRITE,iov[seg].iov_base,iov[seg].iov_len))
- return -EFAULT;
- }
-
retval = 0;
if (count) {
for (seg = 0; seg < nr_segs; seg++) {
@@ -1626,6 +1641,63 @@ filemap_copy_from_user(struct page *page, unsigned long offset,
return left;
}
+static inline int
+__filemap_copy_from_user_iovec(char *vaddr,
+ const struct iovec *iov, size_t base, unsigned bytes)
+{
+ int left = 0;
+
+ while (bytes) {
+ char *buf = iov->iov_base + base;
+ int copy = min(bytes, iov->iov_len - base);
+ base = 0;
+ if ((left = __copy_from_user(vaddr, buf, copy)))
+ break;
+ bytes -= copy;
+ vaddr += copy;
+ iov++;
+ }
+ return left;
+}
+
+static inline int
+filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
+ const struct iovec *iov, size_t base, unsigned bytes)
+{
+ char *kaddr;
+ int left;
+
+ kaddr = kmap_atomic(page, KM_USER0);
+ left = __filemap_copy_from_user_iovec(kaddr + offset, iov, base, bytes);
+ kunmap_atomic(kaddr, KM_USER0);
+ if (left != 0) {
+ kaddr = kmap(page);
+ left = __filemap_copy_from_user_iovec(kaddr + offset, iov, base, bytes);
+ kunmap(page);
+ }
+ return left;
+}
+
+static inline void
+filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, unsigned bytes)
+{
+ const struct iovec *iov = *iovp;
+ size_t base = *basep;
+
+ while (bytes) {
+ int copy = min(bytes, iov->iov_len - base);
+ bytes -= copy;
+ base += copy;
+ if (iov->iov_len == base) {
+ iov++;
+ base = 0;
+ }
+ }
+ *iovp = iov;
+ *basep = base;
+}
+
+
/*
* Write to a file through the page cache.
*
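
filemap_set_next_iovec() advances an (iovec, offset) cursor by however many bytes commit_write() actually accepted, so the next copy resumes at the right place in the caller's vector. A small worked example of the cursor arithmetic (the buffer lengths are illustrative):

	/*
	 * Suppose the caller's vector is:
	 *	iov[0].iov_len = 10
	 *	iov[1].iov_len = 20
	 * with cur_iov = &iov[0] and iov_base = 0.  After 15 bytes commit:
	 */
	filemap_set_next_iovec(&cur_iov, &iov_base, 15);
	/*
	 * The first segment is fully consumed (10 bytes) plus 5 bytes of the
	 * second, so cur_iov == &iov[1] and iov_base == 5 afterwards, which
	 * is exactly where filemap_copy_from_user_iovec() must resume.
	 */
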
@@ -1641,8 +1713,8 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
{
struct address_space * mapping = file->f_dentry->d_inode->i_mapping;
struct address_space_operations *a_ops = mapping->a_ops;
- const size_t ocount = iov_length(iov, nr_segs);
- size_t count = ocount;
+ size_t ocount; /* original count */
+ size_t count; /* after file limit checks */
struct inode *inode = mapping->host;
unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
long status = 0;
@@ -1654,19 +1726,30 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
unsigned bytes;
time_t time_now;
struct pagevec lru_pvec;
- struct iovec *cur_iov;
- unsigned iov_bytes; /* Cumulative count to the end of the
- current iovec */
+ const struct iovec *cur_iov = iov; /* current iovec */
+ unsigned iov_base = 0; /* offset in the current iovec */
unsigned long seg;
char *buf;
- if (unlikely((ssize_t)count < 0))
- return -EINVAL;
-
+ ocount = 0;
for (seg = 0; seg < nr_segs; seg++) {
- if (!access_ok(VERIFY_READ,iov[seg].iov_base,iov[seg].iov_len))
+ const struct iovec *iv = &iov[seg];
+
+ /*
+ * If any segment has a negative length, or the cumulative
+ * length ever wraps negative then return -EINVAL.
+ */
+ ocount += iv->iov_len;
+ if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
+ return -EINVAL;
+ if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
+ continue;
+ if (seg == 0)
return -EFAULT;
+ nr_segs = seg;
+ break;
}
+ count = ocount;
pos = *ppos;
if (unlikely(pos < 0))
@@ -1788,9 +1871,7 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
goto out_status;
}
- cur_iov = (struct iovec *)iov;
- iov_bytes = cur_iov->iov_len;
- buf = cur_iov->iov_base;
+ buf = iov->iov_base;
do {
unsigned long index;
unsigned long offset;
@@ -1801,8 +1882,6 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
bytes = PAGE_CACHE_SIZE - offset;
if (bytes > count)
bytes = count;
- if (bytes + written > iov_bytes)
- bytes = iov_bytes - written;
/*
* Bring in the user page that we will copy from _first_.
@@ -1830,7 +1909,12 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
vmtruncate(inode, inode->i_size);
break;
}
- page_fault = filemap_copy_from_user(page, offset, buf, bytes);
+ if (likely(nr_segs == 1))
+ page_fault = filemap_copy_from_user(page, offset,
+ buf, bytes);
+ else
+ page_fault = filemap_copy_from_user_iovec(page, offset,
+ cur_iov, iov_base, bytes);
flush_dcache_page(page);
status = a_ops->commit_write(file, page, offset, offset+bytes);
if (unlikely(page_fault)) {
@@ -1844,11 +1928,9 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov,
count -= status;
pos += status;
buf += status;
- if (written == iov_bytes && count) {
- cur_iov++;
- iov_bytes += cur_iov->iov_len;
- buf = cur_iov->iov_base;
- }
+ if (unlikely(nr_segs > 1))
+ filemap_set_next_iovec(&cur_iov,
+ &iov_base, status);
}
}
if (!PageReferenced(page))
diff --git a/mm/memory.c b/mm/memory.c
index c886e849231b..e58e9dee7bfc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -40,7 +40,6 @@
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
-#include <linux/smp_lock.h>
#include <linux/iobuf.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
@@ -53,7 +52,12 @@
#include <linux/swapops.h>
+#ifndef CONFIG_DISCONTIGMEM
+/* use the per-pgdat data instead for discontigmem - mbligh */
unsigned long max_mapnr;
+struct page *mem_map;
+#endif
+
unsigned long num_physpages;
void * high_memory;
struct page *highmem_start_page;
@@ -72,8 +76,6 @@ static inline void copy_cow_page(struct page * from, struct page * to, unsigned
copy_user_highpage(to, from, address);
}
-struct page *mem_map;
-
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
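
With mem_map and max_mapnr compiled out under CONFIG_DISCONTIGMEM, code that wants to visit every struct page can no longer assume one global array; it has to walk the per-node maps instead. A sketch of the discontig-safe walk, using only fields this patch itself relies on (pgdat_list, node_size, node_mem_map):

	/* Sketch: visit every struct page without touching the global mem_map. */
	pg_data_t *pgdat;
	unsigned long i;

	for (pgdat = pgdat_list; pgdat; pgdat = pgdat->pgdat_next) {
		for (i = 0; i < pgdat->node_size; i++) {
			struct page *page = pgdat->node_mem_map + i;
			/* ... per-page work ... */
		}
	}
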
diff --git a/mm/mempool.c b/mm/mempool.c
index b92e72b211d3..a201059c1264 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -187,11 +187,12 @@ void * mempool_alloc(mempool_t *pool, int gfp_mask)
int curr_nr;
DECLARE_WAITQUEUE(wait, current);
int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
+ int pf_flags = current->flags;
repeat_alloc:
current->flags |= PF_NOWARN;
element = pool->alloc(gfp_nowait, pool->pool_data);
- current->flags &= ~PF_NOWARN;
+ current->flags = pf_flags;
if (likely(element != NULL))
return element;
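
The mempool change fixes a subtle flag-clobbering bug: the old code cleared PF_NOWARN unconditionally after the allocation attempt, stripping the flag from any caller that already had it set. Saving and restoring the whole flags word preserves the caller's state. The pattern in isolation:

	/* Set a per-task flag for the duration of one call without
	 * disturbing a caller that already had it set. */
	int pf_flags = current->flags;

	current->flags |= PF_NOWARN;
	element = pool->alloc(gfp_nowait, pool->pool_data);
	current->flags = pf_flags;	/* PF_NOWARN returns to its prior state */
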
diff --git a/mm/mmap.c b/mm/mmap.c
index 7b621c7166bf..0038ed6bf5e2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -11,7 +11,6 @@
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
-#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
@@ -444,6 +443,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
*/
vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+ if (flags & MAP_LOCKED) {
+ if (!capable(CAP_IPC_LOCK))
+ return -EPERM;
+ vm_flags |= VM_LOCKED;
+ }
/* mlock MCL_FUTURE? */
if (vm_flags & VM_LOCKED) {
unsigned long locked = mm->locked_vm << PAGE_SHIFT;
@@ -1073,7 +1077,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
/* Munmap is split into 2 main parts -- this part which finds
* what needs doing, and the areas themselves, which do the
* work. This now handles partial unmappings.
- * Jeremy Fitzhardine <jeremy@sw.oz.au>
+ * Jeremy Fitzhardinge <jeremy@goop.org>
*/
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
{
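
The do_mmap_pgoff() hunk teaches mmap() itself about MAP_LOCKED: the mapping is created with VM_LOCKED already set, and the request is refused with -EPERM unless the task has CAP_IPC_LOCK (the existing RLIMIT_MEMLOCK accounting just below still applies). A runnable userspace sketch of the new behaviour:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* Without CAP_IPC_LOCK this now fails with EPERM. */
	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap(MAP_LOCKED)");
		return 1;
	}
	/* The mapping carries VM_LOCKED, so its pages are not paged out. */
	munmap(p, 4096);
	return 0;
}
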
diff --git a/mm/mremap.c b/mm/mremap.c
index 0d22f3d6c20f..6b1d44bd114c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -9,7 +9,6 @@
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/swap.h>
diff --git a/mm/numa.c b/mm/numa.c
index c293d9ae2df0..a36769c95390 100644
--- a/mm/numa.c
+++ b/mm/numa.c
@@ -22,11 +22,21 @@ pg_data_t contig_page_data = { .bdata = &contig_bootmem_data };
* Should be invoked with paramters (0, 0, unsigned long *[], start_paddr).
*/
void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
- unsigned long *zones_size, unsigned long zone_start_pfn,
+ unsigned long *zones_size, unsigned long node_start_pfn,
unsigned long *zholes_size)
{
- free_area_init_core(0, &contig_page_data, &mem_map, zones_size,
- zone_start_pfn, zholes_size, pmap);
+ unsigned long size;
+
+ contig_page_data.node_id = 0;
+ contig_page_data.node_start_pfn = node_start_pfn;
+ calculate_totalpages (&contig_page_data, zones_size, zholes_size);
+ if (pmap == (struct page *)0) {
+ size = (pgdat->node_size + 1) * sizeof(struct page);
+ pmap = (struct page *) alloc_bootmem_node(pgdat, size);
+ }
+ contig_page_data.node_mem_map = pmap;
+ free_area_init_core(&contig_page_data, zones_size, zholes_size);
+ mem_map = contig_page_data.node_mem_map;
}
#endif /* !CONFIG_DISCONTIGMEM */
@@ -48,22 +58,26 @@ struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int orde
* Nodes can be initialized parallely, in no particular order.
*/
void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
- unsigned long *zones_size, unsigned long zone_start_pfn,
+ unsigned long *zones_size, unsigned long node_start_pfn,
unsigned long *zholes_size)
{
- int i, size = 0;
- struct page *discard;
-
- if (mem_map == NULL)
- mem_map = (struct page *)PAGE_OFFSET;
+ int i;
+ unsigned long size;
- free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_pfn,
- zholes_size, pmap);
pgdat->node_id = nid;
+ pgdat->node_start_pfn = node_start_pfn;
+ calculate_totalpages (pgdat, zones_size, zholes_size);
+ if (pmap == (struct page *)0) {
+ size = (pgdat->node_size + 1) * sizeof(struct page);
+ pmap = (struct page *) alloc_bootmem_node(pgdat, size);
+ }
+ pgdat->node_mem_map = pmap;
+ free_area_init_core(pgdat, zones_size, zholes_size);
/*
* Get space for the valid bitmap.
*/
+ size = 0;
for (i = 0; i < MAX_NR_ZONES; i++)
size += zones_size[i];
size = LONG_ALIGN((size + 7) >> 3);
@@ -71,48 +85,4 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
memset(pgdat->valid_addr_bitmap, 0, size);
}
-static struct page * alloc_pages_pgdat(pg_data_t *pgdat, unsigned int gfp_mask,
- unsigned int order)
-{
- return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK));
-}
-
-/*
- * This can be refined. Currently, tries to do round robin, instead
- * should do concentratic circle search, starting from current node.
- */
-struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order)
-{
- struct page *ret = 0;
- pg_data_t *start, *temp;
-#ifndef CONFIG_NUMA
- unsigned long flags;
- static pg_data_t *next = 0;
-#endif
-
- if (order >= MAX_ORDER)
- return NULL;
-#ifdef CONFIG_NUMA
- temp = NODE_DATA(numa_node_id());
-#else
- if (!next)
- next = pgdat_list;
- temp = next;
- next = next->pgdat_next;
-#endif
- start = temp;
- while (temp) {
- if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
- return(ret);
- temp = temp->pgdat_next;
- }
- temp = pgdat_list;
- while (temp != start) {
- if ((ret = alloc_pages_pgdat(temp, gfp_mask, order)))
- return(ret);
- temp = temp->pgdat_next;
- }
- return(0);
-}
-
#endif /* CONFIG_DISCONTIGMEM */
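
free_area_init_node() now owns the per-node bookkeeping in both the contig and discontig variants: it records node_id and node_start_pfn, sizes the node via calculate_totalpages(), bootmem-allocates the node's mem_map when the caller passes none, and only then calls the slimmed-down free_area_init_core(). A sketch of how an architecture's init path might hand over a node under the new interface; nid, node_start_pfn and the *_pages counts are placeholders, not taken from any particular arch:

	unsigned long zones_size[MAX_NR_ZONES] = { 0, 0, 0 };

	zones_size[ZONE_DMA]	 = dma_pages;		/* placeholder counts */
	zones_size[ZONE_NORMAL]	 = normal_pages;
	zones_size[ZONE_HIGHMEM] = highmem_pages;

	/* NULL pmap: let free_area_init_node() bootmem-allocate the node map.
	 * NULL zholes_size: the node has no holes. */
	free_area_init_node(nid, NODE_DATA(nid), NULL, zones_size,
			    node_start_pfn, NULL);
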
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index fb201b6ca0b2..a8afd3699509 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -51,7 +51,7 @@ static long total_pages;
* It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
* large amounts of I/O are submitted.
*/
-static inline int sync_writeback_pages(void)
+static inline long sync_writeback_pages(void)
{
return ratelimit_pages + ratelimit_pages / 2;
}
@@ -73,11 +73,6 @@ int dirty_background_ratio = 10;
int dirty_async_ratio = 40;
/*
- * The generator of dirty data performs sync writeout at this level
- */
-int dirty_sync_ratio = 50;
-
-/*
* The interval between `kupdate'-style writebacks, in centiseconds
* (hundredths of a second)
*/
@@ -105,15 +100,11 @@ static void background_writeout(unsigned long _min_pages);
* - Does nothing at all.
*
* balance_dirty_pages() can sleep.
- *
- * FIXME: WB_SYNC_LAST doesn't actually work. It waits on the last dirty
- * inode on the superblock list. It should wait when nr_to_write is
- * exhausted. Doesn't seem to matter.
*/
void balance_dirty_pages(struct address_space *mapping)
{
struct page_state ps;
- long background_thresh, async_thresh, sync_thresh;
+ long background_thresh, async_thresh;
unsigned long dirty_and_writeback;
struct backing_dev_info *bdi;
@@ -122,18 +113,17 @@ void balance_dirty_pages(struct address_space *mapping)
background_thresh = (dirty_background_ratio * total_pages) / 100;
async_thresh = (dirty_async_ratio * total_pages) / 100;
- sync_thresh = (dirty_sync_ratio * total_pages) / 100;
bdi = mapping->backing_dev_info;
- if (dirty_and_writeback > sync_thresh) {
- int nr_to_write = sync_writeback_pages();
+ if (dirty_and_writeback > async_thresh) {
+ struct writeback_control wbc = {
+ .bdi = bdi,
+ .sync_mode = WB_SYNC_NONE,
+ .older_than_this = NULL,
+ .nr_to_write = sync_writeback_pages(),
+ };
- writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_LAST, NULL);
- get_page_state(&ps);
- } else if (dirty_and_writeback > async_thresh) {
- int nr_to_write = sync_writeback_pages();
-
- writeback_backing_dev(bdi, &nr_to_write, WB_SYNC_NONE, NULL);
+ writeback_inodes(&wbc);
get_page_state(&ps);
}
@@ -177,7 +167,12 @@ static void background_writeout(unsigned long _min_pages)
{
long min_pages = _min_pages;
long background_thresh;
- int nr_to_write;
+ struct writeback_control wbc = {
+ .bdi = NULL,
+ .sync_mode = WB_SYNC_NONE,
+ .older_than_this = NULL,
+ .nr_to_write = 0,
+ };
CHECK_EMERGENCY_SYNC
@@ -185,14 +180,13 @@ static void background_writeout(unsigned long _min_pages)
do {
struct page_state ps;
-
get_page_state(&ps);
if (ps.nr_dirty < background_thresh && min_pages <= 0)
break;
- nr_to_write = MAX_WRITEBACK_PAGES;
- writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
- min_pages -= MAX_WRITEBACK_PAGES - nr_to_write;
- } while (nr_to_write <= 0);
+ wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+ writeback_inodes(&wbc);
+ min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+ } while (wbc.nr_to_write <= 0);
blk_run_queues();
}
@@ -230,7 +224,12 @@ static void wb_kupdate(unsigned long arg)
unsigned long start_jif;
unsigned long next_jif;
struct page_state ps;
- int nr_to_write;
+ struct writeback_control wbc = {
+ .bdi = NULL,
+ .sync_mode = WB_SYNC_NONE,
+ .older_than_this = &oldest_jif,
+ .nr_to_write = 0,
+ };
sync_supers();
get_page_state(&ps);
@@ -238,8 +237,8 @@ static void wb_kupdate(unsigned long arg)
oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
start_jif = jiffies;
next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
- nr_to_write = ps.nr_dirty;
- writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, &oldest_jif);
+ wbc.nr_to_write = ps.nr_dirty;
+ writeback_inodes(&wbc);
blk_run_queues();
yield();
@@ -312,8 +311,6 @@ static int __init page_writeback_init(void)
dirty_background_ratio /= 100;
dirty_async_ratio *= correction;
dirty_async_ratio /= 100;
- dirty_sync_ratio *= correction;
- dirty_sync_ratio /= 100;
}
init_timer(&wb_timer);
@@ -351,7 +348,7 @@ module_init(page_writeback_init);
* So. The proper fix is to leave the page locked-and-dirty and to pass
* it all the way down.
*/
-int generic_vm_writeback(struct page *page, int *nr_to_write)
+int generic_vm_writeback(struct page *page, struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
@@ -363,7 +360,7 @@ int generic_vm_writeback(struct page *page, int *nr_to_write)
unlock_page(page);
if (inode) {
- do_writepages(inode->i_mapping, nr_to_write);
+ do_writepages(inode->i_mapping, wbc);
/*
* This iput() will internally call ext2_discard_prealloc(),
@@ -392,11 +389,11 @@ int generic_vm_writeback(struct page *page, int *nr_to_write)
}
EXPORT_SYMBOL(generic_vm_writeback);
-int do_writepages(struct address_space *mapping, int *nr_to_write)
+int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
if (mapping->a_ops->writepages)
- return mapping->a_ops->writepages(mapping, nr_to_write);
- return generic_writepages(mapping, nr_to_write);
+ return mapping->a_ops->writepages(mapping, wbc);
+ return generic_writepages(mapping, wbc);
}
/**
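
do_writepages() now hands the whole writeback_control to the filesystem, so ->writepages() implementations see the sync mode, the remaining page budget and the age cutoff together rather than a lone counter. A sketch of a filesystem-side hook under the new signature; examplefs_* is a hypothetical name, and a filesystem with no special policy can simply forward to generic_writepages(), which consumes wbc->nr_to_write as it writes:

	static int examplefs_writepages(struct address_space *mapping,
					struct writeback_control *wbc)
	{
		/* wbc carries sync_mode, nr_to_write and older_than_this */
		return generic_writepages(mapping, wbc);
	}
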
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a806031113fc..435a12dd1574 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -256,14 +256,6 @@ int is_head_of_free_region(struct page *page)
}
#endif /* CONFIG_SOFTWARE_SUSPEND */
-#ifndef CONFIG_DISCONTIGMEM
-struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
-{
- return __alloc_pages(gfp_mask, order,
- contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
-}
-#endif
-
static /* inline */ struct page *
balance_classzone(struct zone* classzone, unsigned int gfp_mask,
unsigned int order, int * freed)
@@ -680,13 +672,41 @@ void show_free_areas(void)
/*
* Builds allocation fallback zone lists.
*/
-static inline void build_zonelists(pg_data_t *pgdat)
+static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
{
- int i, j, k;
+ switch (k) {
+ struct zone *zone;
+ default:
+ BUG();
+ case ZONE_HIGHMEM:
+ zone = pgdat->node_zones + ZONE_HIGHMEM;
+ if (zone->size) {
+#ifndef CONFIG_HIGHMEM
+ BUG();
+#endif
+ zonelist->zones[j++] = zone;
+ }
+ case ZONE_NORMAL:
+ zone = pgdat->node_zones + ZONE_NORMAL;
+ if (zone->size)
+ zonelist->zones[j++] = zone;
+ case ZONE_DMA:
+ zone = pgdat->node_zones + ZONE_DMA;
+ if (zone->size)
+ zonelist->zones[j++] = zone;
+ }
+ return j;
+}
+
+static void __init build_zonelists(pg_data_t *pgdat)
+{
+ int i, j, k, node, local_node;
+
+ local_node = pgdat->node_id;
+ printk("Building zonelist for node : %d\n", local_node);
for (i = 0; i <= GFP_ZONEMASK; i++) {
struct zonelist *zonelist;
- struct zone *zone;
zonelist = pgdat->node_zonelists + i;
memset(zonelist, 0, sizeof(*zonelist));
@@ -698,33 +718,49 @@ static inline void build_zonelists(pg_data_t *pgdat)
if (i & __GFP_DMA)
k = ZONE_DMA;
- switch (k) {
- default:
- BUG();
- /*
- * fallthrough:
- */
- case ZONE_HIGHMEM:
- zone = pgdat->node_zones + ZONE_HIGHMEM;
- if (zone->size) {
-#ifndef CONFIG_HIGHMEM
- BUG();
-#endif
- zonelist->zones[j++] = zone;
- }
- case ZONE_NORMAL:
- zone = pgdat->node_zones + ZONE_NORMAL;
- if (zone->size)
- zonelist->zones[j++] = zone;
- case ZONE_DMA:
- zone = pgdat->node_zones + ZONE_DMA;
- if (zone->size)
- zonelist->zones[j++] = zone;
- }
+ j = build_zonelists_node(pgdat, zonelist, j, k);
+ /*
+ * Now we build the zonelist so that it contains the zones
+ * of all the other nodes.
+ * We don't want to pressure a particular node, so when
+ * building the zones for node N, we make sure that the
+ * zones coming right after the local ones are those from
+ * node N+1 (modulo numnodes)
+ */
+ for (node = local_node + 1; node < numnodes; node++)
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+ for (node = 0; node < local_node; node++)
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
+
zonelist->zones[j++] = NULL;
}
}
+void __init build_all_zonelists(void)
+{
+ int i;
+
+ for(i = 0 ; i < numnodes ; i++)
+ build_zonelists(NODE_DATA(i));
+}
+
+void __init calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size,
+ unsigned long *zholes_size)
+{
+ unsigned long realtotalpages, totalpages = 0;
+ int i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ totalpages += zones_size[i];
+ pgdat->node_size = totalpages;
+
+ realtotalpages = totalpages;
+ if (zholes_size)
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ realtotalpages -= zholes_size[i];
+ printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
+}
+
/*
* Helper functions to size the waitqueue hash table.
* Essentially these want to choose hash table sizes sufficiently
@@ -775,46 +811,18 @@ static inline unsigned long wait_table_bits(unsigned long size)
* - mark all memory queues empty
* - clear the memory bitmaps
*/
-void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
- unsigned long *zones_size, unsigned long zone_start_pfn,
- unsigned long *zholes_size, struct page *lmem_map)
+void __init free_area_init_core(pg_data_t *pgdat,
+ unsigned long *zones_size, unsigned long *zholes_size)
{
unsigned long i, j;
- unsigned long map_size;
- unsigned long totalpages, offset, realtotalpages;
+ unsigned long local_offset;
const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
+ int nid = pgdat->node_id;
+ struct page *lmem_map = pgdat->node_mem_map;
+ unsigned long zone_start_pfn = pgdat->node_start_pfn;
- totalpages = 0;
- for (i = 0; i < MAX_NR_ZONES; i++)
- totalpages += zones_size[i];
-
- realtotalpages = totalpages;
- if (zholes_size)
- for (i = 0; i < MAX_NR_ZONES; i++)
- realtotalpages -= zholes_size[i];
-
- printk("On node %d totalpages: %lu\n", nid, realtotalpages);
-
- /*
- * Some architectures (with lots of mem and discontinous memory
- * maps) have to search for a good mem_map area:
- * For discontigmem, the conceptual mem map array starts from
- * PAGE_OFFSET, we need to align the actual array onto a mem map
- * boundary, so that MAP_NR works.
- */
- map_size = (totalpages + 1)*sizeof(struct page);
- if (lmem_map == (struct page *)0) {
- lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
- lmem_map = (struct page *)(PAGE_OFFSET +
- MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
- }
- *gmap = pgdat->node_mem_map = lmem_map;
- pgdat->node_size = totalpages;
- pgdat->node_start_pfn = zone_start_pfn;
- pgdat->node_start_mapnr = (lmem_map - mem_map);
pgdat->nr_zones = 0;
-
- offset = lmem_map - mem_map;
+ local_offset = 0; /* offset within lmem_map */
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long mask;
@@ -866,8 +874,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
zone->pages_low = mask*2;
zone->pages_high = mask*3;
- zone->zone_mem_map = mem_map + offset;
- zone->zone_start_mapnr = offset;
+ zone->zone_mem_map = lmem_map + local_offset;
zone->zone_start_pfn = zone_start_pfn;
if ((zone_start_pfn) & (zone_required_alignment-1))
@@ -879,7 +886,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
* done. Non-atomic initialization, single-pass.
*/
for (i = 0; i < size; i++) {
- struct page *page = mem_map + offset + i;
+ struct page *page = lmem_map + local_offset + i;
set_page_zone(page, nid * MAX_NR_ZONES + j);
set_page_count(page, 0);
SetPageReserved(page);
@@ -893,7 +900,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
zone_start_pfn++;
}
- offset += size;
+ local_offset += size;
for (i = 0; ; i++) {
unsigned long bitmap_size;
@@ -932,13 +939,15 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
(unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
}
}
- build_zonelists(pgdat);
}
+#ifndef CONFIG_DISCONTIGMEM
void __init free_area_init(unsigned long *zones_size)
{
- free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
+ free_area_init_node(0, &contig_page_data, NULL, zones_size, 0, NULL);
+ mem_map = contig_page_data.node_mem_map;
}
+#endif
static int __init setup_mem_frac(char *str)
{
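
build_zonelists() now interleaves every other node's zones after the local ones, so a node that runs dry falls back to its neighbours in a fixed rotation rather than relying on the removed round-robin _alloc_pages(). A worked example of the resulting order (the node count is illustrative): with numnodes == 3, node 1's zonelist for a highmem-capable allocation (k == ZONE_HIGHMEM) comes out as

	node1/HIGHMEM, node1/NORMAL, node1/DMA,
	node2/HIGHMEM, node2/NORMAL, node2/DMA,
	node0/HIGHMEM, node0/NORMAL, node0/DMA, NULL

because build_zonelists_node() falls through HIGHMEM -> NORMAL -> DMA for each node, and the outer loops visit nodes local_node + 1 .. numnodes - 1 and then 0 .. local_node - 1 (zones with size == 0 are simply skipped).
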
diff --git a/mm/page_io.c b/mm/page_io.c
index ced005c65001..47de394d5576 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -131,12 +131,12 @@ out:
* Swap pages are !PageLocked and PageWriteback while under writeout so that
* memory allocators will throttle against them.
*/
-static int swap_vm_writeback(struct page *page, int *nr_to_write)
+static int swap_vm_writeback(struct page *page, struct writeback_control *wbc)
{
struct address_space *mapping = page->mapping;
unlock_page(page);
- return generic_writepages(mapping, nr_to_write);
+ return generic_writepages(mapping, wbc);
}
struct address_space_operations swap_aops = {
diff --git a/mm/shmem.c b/mm/shmem.c
index 53a5defb4436..496659e341f4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,7 +28,6 @@
#include <linux/pagemap.h>
#include <linux/string.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
diff --git a/mm/swap.c b/mm/swap.c
index 4e88784e2045..4528369df084 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -124,9 +124,9 @@ void release_pages(struct page **pages, int nr)
if (page_count(page) == 0) {
if (!pagevec_add(&pages_to_free, page)) {
spin_unlock_irq(&zone->lru_lock);
- pagevec_free(&pages_to_free);
+ __pagevec_free(&pages_to_free);
pagevec_init(&pages_to_free);
- spin_lock_irq(&zone->lru_lock);
+ zone = NULL; /* No lock is held */
}
}
}
@@ -165,8 +165,8 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
}
/*
- * Move all the inactive pages to the head of the inactive list
- * and release them. Reinitialises the caller's pagevec.
+ * Move all the inactive pages to the head of the inactive list and release
+ * them. Reinitialises the caller's pagevec.
*/
void pagevec_deactivate_inactive(struct pagevec *pvec)
{
@@ -180,8 +180,6 @@ void pagevec_deactivate_inactive(struct pagevec *pvec)
struct zone *pagezone = page_zone(page);
if (pagezone != zone) {
- if (PageActive(page) || !PageLRU(page))
- continue;
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
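
Both mm/swap.c hunks above follow the same locking discipline: remember which zone's lru_lock is currently held, switch only when a page from a different zone appears, and after the lock has been dropped (to free a full pagevec) reset the cached zone to NULL so the next iteration reacquires the right lock instead of assuming the old one is still held. The skeleton of that pattern, pulled out of release_pages() for clarity (loop bounds are illustrative):

	struct zone *zone = NULL;	/* whose lru_lock we hold, if any */
	int i;

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];
		struct zone *pagezone = page_zone(page);

		if (pagezone != zone) {
			if (zone)
				spin_unlock_irq(&zone->lru_lock);
			zone = pagezone;
			spin_lock_irq(&zone->lru_lock);
		}
		/* ... LRU bookkeeping; if the lock must be dropped here,
		 * set zone = NULL so the next pass retakes the right lock ... */
	}
	if (zone)
		spin_unlock_irq(&zone->lru_lock);
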
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d07f8db1f7c7..d936aadcbf92 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -12,7 +12,6 @@
#include <linux/swap.h>
#include <linux/init.h>
#include <linux/pagemap.h>
-#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h> /* block_sync_page() */
@@ -119,7 +118,7 @@ void __delete_from_swap_cache(struct page *page)
int add_to_swap(struct page * page)
{
swp_entry_t entry;
- int flags;
+ int pf_flags;
if (!PageLocked(page))
BUG();
@@ -142,7 +141,7 @@ int add_to_swap(struct page * page)
* just not all of them.
*/
- flags = current->flags;
+ pf_flags = current->flags;
current->flags &= ~PF_MEMALLOC;
current->flags |= PF_NOWARN;
ClearPageUptodate(page); /* why? */
@@ -154,20 +153,20 @@ int add_to_swap(struct page * page)
*/
switch (add_to_swap_cache(page, entry)) {
case 0: /* Success */
- current->flags = flags;
+ current->flags = pf_flags;
SetPageUptodate(page);
set_page_dirty(page);
swap_free(entry);
return 1;
case -ENOMEM: /* radix-tree allocation */
- current->flags = flags;
+ current->flags = pf_flags;
swap_free(entry);
return 0;
default: /* ENOENT: raced */
break;
}
/* Raced with "speculative" read_swap_cache_async */
- current->flags = flags;
+ current->flags = pf_flags;
swap_free(entry);
}
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 330c94cef787..000ed1583dc5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -7,7 +7,6 @@
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f220b40fc9c1..a8b2c1911c9a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -15,7 +15,6 @@
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
-#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
@@ -145,6 +144,7 @@ shrink_list(struct list_head *page_list, int nr_pages,
if (!add_to_swap(page))
goto activate_locked;
pte_chain_lock(page);
+ mapping = page->mapping;
}
/*
@@ -174,15 +174,18 @@ shrink_list(struct list_head *page_list, int nr_pages,
*/
if (PageDirty(page) && is_page_cache_freeable(page) &&
mapping && may_enter_fs) {
- int (*writeback)(struct page *, int *);
+ int (*writeback)(struct page *,
+ struct writeback_control *);
const int cluster_size = SWAP_CLUSTER_MAX;
- int nr_to_write = cluster_size;
+ struct writeback_control wbc = {
+ .nr_to_write = cluster_size,
+ };
writeback = mapping->a_ops->vm_writeback;
if (writeback == NULL)
writeback = generic_vm_writeback;
- (*writeback)(page, &nr_to_write);
- *max_scan -= (cluster_size - nr_to_write);
+ (*writeback)(page, &wbc);
+ *max_scan -= (cluster_size - wbc.nr_to_write);
goto keep;
}