From 967e6864e6e171a2c1053fe1d0c55937f71d1665 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Sep 2002 08:35:46 -0700 Subject: [PATCH] clean up argument passing in writeback paths The writeback code paths which walk the superblocks and inodes are getting an increasing arguments passed to them. The patch wraps those args into the new `struct writeback_control', and uses that instead. There is no functional change. The new writeback_control structure is passed down through the writeback paths in the place where the old `nr_to_write' pointer used to be. writeback_control will be used to pass new information up and down the writeback paths. Such as whether the writeback should be non-blocking, and whether queue congestion was encountered. --- include/linux/fs.h | 8 +++++--- include/linux/mpage.h | 8 +++++--- include/linux/writeback.h | 22 +++++++++++++++------- 3 files changed, 25 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 622481a00115..e421e95b9320 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -279,6 +279,7 @@ struct iattr { */ struct page; struct address_space; +struct writeback_control; struct address_space_operations { int (*writepage)(struct page *); @@ -286,10 +287,10 @@ struct address_space_operations { int (*sync_page)(struct page *); /* Write back some dirty pages from this mapping. */ - int (*writepages)(struct address_space *, int *nr_to_write); + int (*writepages)(struct address_space *, struct writeback_control *); /* Perform a writeback as a memory-freeing operation. */ - int (*vm_writeback)(struct page *, int *nr_to_write); + int (*vm_writeback)(struct page *, struct writeback_control *); /* Set a page dirty */ int (*set_page_dirty)(struct page *page); @@ -1261,7 +1262,8 @@ extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); extern loff_t remote_llseek(struct file *file, loff_t offset, int origin); extern int generic_file_open(struct inode * inode, struct file * filp); -extern int generic_vm_writeback(struct page *page, int *nr_to_write); +extern int generic_vm_writeback(struct page *page, + struct writeback_control *wbc); extern struct file_operations generic_ro_fops; diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 52253d90f55d..86aa7b676274 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -10,14 +10,16 @@ * nested includes. Get it right in the .c file). 
*/ +struct writeback_control; + int mpage_readpages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages, get_block_t get_block); int mpage_readpage(struct page *page, get_block_t get_block); int mpage_writepages(struct address_space *mapping, - int *nr_to_write, get_block_t get_block); + struct writeback_control *wbc, get_block_t get_block); static inline int -generic_writepages(struct address_space *mapping, int *nr_to_write) +generic_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return mpage_writepages(mapping, nr_to_write, NULL); + return mpage_writepages(mapping, wbc, NULL); } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 5de884cd6a7c..b12971f8eeb0 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -33,16 +33,24 @@ enum writeback_sync_modes { WB_SYNC_HOLD = 3, /* Hold the inode on sb_dirty for sys_sync() */ }; -void writeback_unlocked_inodes(int *nr_to_write, - enum writeback_sync_modes sync_mode, - unsigned long *older_than_this); +/* + * A control structure which tells the writeback code what to do + */ +struct writeback_control { + struct backing_dev_info *bdi; /* If !NULL, only write back this + queue */ + enum writeback_sync_modes sync_mode; + unsigned long *older_than_this; /* If !NULL, only write back inodes + older than this */ + long nr_to_write; /* Write this many pages, and decrement + this for each page written */ +}; + +void writeback_inodes(struct writeback_control *wbc); void wake_up_inode(struct inode *inode); void __wait_on_inode(struct inode * inode); void sync_inodes_sb(struct super_block *, int wait); void sync_inodes(int wait); -void writeback_backing_dev(struct backing_dev_info *bdi, int *nr_to_write, - enum writeback_sync_modes sync_mode, - unsigned long *older_than_this); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) @@ -65,7 +73,7 @@ extern int dirty_expire_centisecs; void balance_dirty_pages(struct address_space *mapping); void balance_dirty_pages_ratelimited(struct address_space *mapping); int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0); -int do_writepages(struct address_space *mapping, int *nr_to_write); +int do_writepages(struct address_space *mapping, struct writeback_control *wbc); /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl -- cgit v1.2.3 From e07316f9c849b0fe92eb273e7ada4652053d32d1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Sep 2002 08:35:54 -0700 Subject: [PATCH] free_area_init cleanup Patch from Martin Bligh. It should only affect machines using discontigmem. "This patch cleans up free_area_init stuff, and undefines mem_map and max_mapnr for discontigmem, where they were horrible kludges anyway ... We just use the lmem_maps instead, which makes much more sense. It also kills pgdat->node_start_mapnr, which is tarred with the same brush. It breaks free_area_init_core into a couple of sections, pulls the allocation of the lmem_map back into the next higher function, and passes more things via the pgdat. But that's not very interesting, the objective was to kill mem_map for discontigmem, which seems to attract bugs like flypaper. This brings any misuses to obvious compile-time errors rather than wierd oopses, which I can't help but feel is a good thing. It does break other discontigmem architectures, but in a very obvious way (they won't compile) and it's easy to fix. 
I think that's a small price to pay ... ;-) At some point soon I will follow up with a patch to remove free_area_init_node for the contig mem case, or at the very least rename it to something more sensible, like __free_area_init. Christoph has grander plans to kill mem_map more extensively in addition to the attatched, but I've heard nobody disagree that it should die for the discontigmem case at least. Oh, and I renamed mem_map in drivers/pcmcia/sa1100 to pc_mem_map because my tiny little brain (and cscope) find it confusing like that. Tested on 16-way NUMA-Q with discontigmem + NUMA support and on a standard PC (well, boots and appears functional). On top of 2.5.33-mm4" --- arch/alpha/mm/numa.c | 6 ---- arch/i386/kernel/cpu/amd.c | 2 +- arch/i386/kernel/i386_ksyms.c | 6 +++- arch/i386/kernel/numaq.c | 20 ++++------- arch/i386/mm/discontig.c | 15 ++------ arch/i386/mm/init.c | 4 ++- arch/i386/mm/pgtable.c | 29 +++++++++------- arch/mips64/sgi-ip27/ip27-memory.c | 12 ++----- drivers/pcmcia/sa1100.h | 2 +- drivers/pcmcia/sa1100_generic.c | 4 +-- include/asm-alpha/mmzone.h | 2 -- include/asm-i386/mmzone.h | 14 ++++++-- include/asm-i386/numaq.h | 9 ++--- include/asm-i386/page.h | 2 +- include/asm-mips64/mmzone.h | 1 - include/asm-mips64/pgtable.h | 8 ++--- include/linux/mm.h | 7 +++- include/linux/mmzone.h | 9 +++-- kernel/ksyms.c | 4 ++- kernel/suspend.c | 17 +++++---- mm/memory.c | 7 ++-- mm/numa.c | 36 +++++++++++++------ mm/page_alloc.c | 71 +++++++++++++++++--------------------- 23 files changed, 144 insertions(+), 143 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index 2458576ec8ae..5071c14da059 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -286,7 +286,6 @@ void __init paging_init(void) for (nid = 0; nid < numnodes; nid++) { unsigned long start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT; unsigned long end_pfn = plat_node_bdata[nid].node_low_pfn; - unsigned long lmax_mapnr; if (dma_local_pfn >= end_pfn - start_pfn) zones_size[ZONE_DMA] = end_pfn - start_pfn; @@ -295,11 +294,6 @@ void __init paging_init(void) zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn; } free_area_init_node(nid, NODE_DATA(nid), NULL, zones_size, start_pfn, NULL); - lmax_mapnr = PLAT_NODE_DATA_STARTNR(nid) + PLAT_NODE_DATA_SIZE(nid); - if (lmax_mapnr > max_mapnr) { - max_mapnr = lmax_mapnr; - DBGDCONT("Grow max_mapnr to %ld\n", max_mapnr); - } } /* Initialize the kernel's ZERO_PGE. 
*/ diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 00b09cc403a2..991024118fe6 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -25,7 +25,7 @@ __asm__(".align 4\nvide: ret"); static void __init init_amd(struct cpuinfo_x86 *c) { u32 l, h; - int mbytes = max_mapnr >> (20-PAGE_SHIFT); + int mbytes = num_physpages >> (20-PAGE_SHIFT); int r; /* diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index 0be81aed6747..8b447ebcb591 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -58,7 +58,11 @@ EXPORT_SYMBOL(boot_cpu_data); EXPORT_SYMBOL(EISA_bus); #endif EXPORT_SYMBOL(MCA_bus); -#ifdef CONFIG_MULTIQUAD +#ifdef CONFIG_DISCONTIGMEM +EXPORT_SYMBOL(node_data); +EXPORT_SYMBOL(pfn_to_nid); +#endif +#ifdef CONFIG_X86_NUMAQ EXPORT_SYMBOL(xquad_portio); #endif EXPORT_SYMBOL(__verify_write); diff --git a/arch/i386/kernel/numaq.c b/arch/i386/kernel/numaq.c index ffd27c7d2d81..07cf91d92dd9 100644 --- a/arch/i386/kernel/numaq.c +++ b/arch/i386/kernel/numaq.c @@ -82,27 +82,19 @@ static void __init smp_dump_qct(void) */ int physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1}; -#define MB_TO_ELEMENT(x) (x >> ELEMENT_REPRESENTS) -#define PA_TO_MB(pa) (pa >> 20) /* assumption: a physical address is in bytes */ +#define PFN_TO_ELEMENT(pfn) (pfn / PAGES_PER_ELEMENT) +#define PA_TO_ELEMENT(pa) (PFN_TO_ELEMENT(pa >> PAGE_SHIFT)) -int pa_to_nid(u64 pa) +int pfn_to_nid(unsigned long pfn) { - int nid; - - nid = physnode_map[MB_TO_ELEMENT(PA_TO_MB(pa))]; + int nid = physnode_map[PFN_TO_ELEMENT(pfn)]; - /* the physical address passed in is not in the map for the system */ if (nid == -1) - BUG(); + BUG(); /* address is not present */ return nid; } -int pfn_to_nid(unsigned long pfn) -{ - return pa_to_nid(((u64)pfn) << PAGE_SHIFT); -} - /* * for each node mark the regions * TOPOFMEM = hi_shrd_mem_start + hi_shrd_mem_size @@ -132,7 +124,7 @@ static void __init initialize_physnode_map(void) topofmem = eq->hi_shrd_mem_start + eq->hi_shrd_mem_size; while (cur < topofmem) { physnode_map[cur >> 8] = nid; - cur += (ELEMENT_REPRESENTS - 1); + cur ++; } } } diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index de811d22ac09..eab75722398a 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -275,20 +275,9 @@ void __init set_highmem_pages_init(int bad_ppro) void __init set_max_mapnr_init(void) { #ifdef CONFIG_HIGHMEM - unsigned long lmax_mapnr; - int nid; - - highmem_start_page = mem_map + NODE_DATA(0)->node_zones[ZONE_HIGHMEM].zone_start_mapnr; + highmem_start_page = NODE_DATA(0)->node_zones[ZONE_HIGHMEM].zone_mem_map; num_physpages = highend_pfn; - - for (nid = 0; nid < numnodes; nid++) { - lmax_mapnr = node_startnr(nid) + node_size(nid); - if (lmax_mapnr > max_mapnr) { - max_mapnr = lmax_mapnr; - } - } - #else - max_mapnr = num_physpages = max_low_pfn; + num_physpages = max_low_pfn; #endif } diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 5d73c07fd726..c2042aee2b65 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -440,8 +440,10 @@ void __init mem_init(void) int tmp; int bad_ppro; +#ifndef CONFIG_DISCONTIGMEM if (!mem_map) BUG(); +#endif bad_ppro = ppro_with_ram_bug(); @@ -471,7 +473,7 @@ void __init mem_init(void) printk("Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), - max_mapnr << (PAGE_SHIFT-10), + num_physpages << (PAGE_SHIFT-10), codesize >> 10, 
reservedpages << (PAGE_SHIFT-10), datasize >> 10, diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 18a7664b115b..1f59100e77bf 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -22,26 +22,29 @@ void show_mem(void) { - int pfn, total = 0, reserved = 0; + int total = 0, reserved = 0; int shared = 0, cached = 0; int highmem = 0; struct page *page; + pg_data_t *pgdat; + unsigned long i; printk("Mem-info:\n"); show_free_areas(); printk("Free swap: %6dkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); - pfn = max_mapnr; - while (pfn-- > 0) { - page = pfn_to_page(pfn); - total++; - if (PageHighMem(page)) - highmem++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; + for_each_pgdat(pgdat) { + for (i = 0; i < pgdat->node_size; ++i) { + page = pgdat->node_mem_map + i; + total++; + if (PageHighMem(page)) + highmem++; + if (PageReserved(page)) + reserved++; + else if (PageSwapCache(page)) + cached++; + else if (page_count(page)) + shared += page_count(page) - 1; + } } printk("%d pages of RAM\n", total); printk("%d pages of HIGHMEM\n",highmem); diff --git a/arch/mips64/sgi-ip27/ip27-memory.c b/arch/mips64/sgi-ip27/ip27-memory.c index e5f79e031816..f46fa89f145f 100644 --- a/arch/mips64/sgi-ip27/ip27-memory.c +++ b/arch/mips64/sgi-ip27/ip27-memory.c @@ -254,10 +254,6 @@ void __init paging_init(void) zones_size[ZONE_DMA] = end_pfn + 1 - start_pfn; free_area_init_node(node, NODE_DATA(node), 0, zones_size, start_pfn, 0); - if ((PLAT_NODE_DATA_STARTNR(node) + - PLAT_NODE_DATA_SIZE(node)) > pagenr) - pagenr = PLAT_NODE_DATA_STARTNR(node) + - PLAT_NODE_DATA_SIZE(node); } } @@ -271,7 +267,6 @@ void __init mem_init(void) unsigned long codesize, datasize, initsize; int slot, numslots; struct page *pg, *pslot; - pfn_t pgnr; num_physpages = numpages; /* memory already sized by szmem */ max_mapnr = pagenr; /* already found during paging_init */ @@ -293,7 +288,6 @@ void __init mem_init(void) * We need to manually do the other slots. */ pg = NODE_DATA(nid)->node_mem_map + slot_getsize(nid, 0); - pgnr = PLAT_NODE_DATA_STARTNR(nid) + slot_getsize(nid, 0); numslots = node_getlastslot(nid); for (slot = 1; slot <= numslots; slot++) { pslot = NODE_DATA(nid)->node_mem_map + @@ -304,7 +298,7 @@ void __init mem_init(void) * free up the pages that hold the memmap entries. 
*/ while (pg < pslot) { - pg++; pgnr++; + pg++; } /* @@ -312,8 +306,8 @@ void __init mem_init(void) */ pslot += slot_getsize(nid, slot); while (pg < pslot) { - if (!page_is_ram(pgnr)) - continue; + /* if (!page_is_ram(pgnr)) continue; */ + /* commented out until page_is_ram works */ ClearPageReserved(pg); atomic_set(&pg->count, 1); __free_page(pg); diff --git a/drivers/pcmcia/sa1100.h b/drivers/pcmcia/sa1100.h index 713f5b49cf34..53716e9dcf63 100644 --- a/drivers/pcmcia/sa1100.h +++ b/drivers/pcmcia/sa1100.h @@ -160,7 +160,7 @@ struct sa1100_pcmcia_socket { */ socket_state_t cs_state; pccard_io_map io_map[MAX_IO_WIN]; - pccard_mem_map mem_map[MAX_WIN]; + pccard_mem_map pc_mem_map[MAX_WIN]; void (*handler)(void *, unsigned int); void *handler_info; diff --git a/drivers/pcmcia/sa1100_generic.c b/drivers/pcmcia/sa1100_generic.c index ef238c0f90b7..12dc9270e402 100644 --- a/drivers/pcmcia/sa1100_generic.c +++ b/drivers/pcmcia/sa1100_generic.c @@ -686,7 +686,7 @@ sa1100_pcmcia_get_mem_map(unsigned int sock, struct pccard_mem_map *map) DEBUG(2, "%s() for sock %u\n", __FUNCTION__, sock); if (map->map < MAX_WIN) { - *map = skt->mem_map[map->map]; + *map = skt->pc_mem_map[map->map]; ret = 0; } @@ -754,7 +754,7 @@ sa1100_pcmcia_set_mem_map(unsigned int sock, struct pccard_mem_map *map) map->sys_stop += start; map->sys_start = start; - skt->mem_map[map->map] = *map; + skt->pc_mem_map[map->map] = *map; return 0; } /* sa1100_pcmcia_set_mem_map() */ diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h index 572569df5dd4..98fdbae58aeb 100644 --- a/include/asm-alpha/mmzone.h +++ b/include/asm-alpha/mmzone.h @@ -46,8 +46,6 @@ extern plat_pg_data_t *plat_node_data[]; #define PHYSADDR_TO_NID(pa) ALPHA_PA_TO_NID(pa) #define PLAT_NODE_DATA(n) (plat_node_data[(n)]) -#define PLAT_NODE_DATA_STARTNR(n) \ - (PLAT_NODE_DATA(n)->gendata.node_start_mapnr) #define PLAT_NODE_DATA_SIZE(n) (PLAT_NODE_DATA(n)->gendata.node_size) #if 1 diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index d2994f116f03..cc18193f2f65 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h @@ -11,7 +11,6 @@ #ifdef CONFIG_X86_NUMAQ #include #else -#define pa_to_nid(pa) (0) #define pfn_to_nid(pfn) (0) #ifdef CONFIG_NUMA #define _cpu_to_node(cpu) 0 @@ -44,7 +43,6 @@ extern struct pglist_data *node_data[]; #define alloc_bootmem_low_pages_node(ignore, x) \ __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0) -#define node_startnr(nid) (node_data[nid]->node_start_mapnr) #define node_size(nid) (node_data[nid]->node_size) #define node_localnr(pfn, nid) ((pfn) - node_data[nid]->node_start_pfn) @@ -55,7 +53,7 @@ extern struct pglist_data *node_data[]; /* * Given a kernel address, find the home node of the underlying memory. */ -#define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) +#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT) /* * Return a pointer to the node data for node n. 
@@ -64,6 +62,8 @@ extern struct pglist_data *node_data[]; #define node_mem_map(nid) (NODE_DATA(nid)->node_mem_map) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) +#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ + NODE_DATA(nid)->node_size) #define local_mapnr(kvaddr) \ ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) ) @@ -74,5 +74,13 @@ extern struct pglist_data *node_data[]; #define pfn_to_page(pfn) (node_mem_map(pfn_to_nid(pfn)) + node_localnr(pfn, pfn_to_nid(pfn))) #define page_to_pfn(page) ((page - page_zone(page)->zone_mem_map) + page_zone(page)->zone_start_pfn) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) +/* + * pfn_valid should be made as fast as possible, and the current definition + * is valid for machines that are NUMA, but still contiguous, which is what + * is currently supported. A more generalised, but slower definition would + * be something like this - mbligh: + * ( pfn_to_pgdat(pfn) && (pfn < node_end_pfn(pfn_to_nid(pfn))) ) + */ +#define pfn_valid(pfn) (pfn < num_physpages) #endif /* CONFIG_DISCONTIGMEM */ #endif /* _ASM_MMZONE_H_ */ diff --git a/include/asm-i386/numaq.h b/include/asm-i386/numaq.h index ed10442f1dcc..b32b28c12c73 100644 --- a/include/asm-i386/numaq.h +++ b/include/asm-i386/numaq.h @@ -32,17 +32,18 @@ /* * for now assume that 64Gb is max amount of RAM for whole system - * 64Gb * 1024Mb/Gb = 65536 Mb - * 65536 Mb / 256Mb = 256 + * 64Gb / 4096bytes/page = 16777216 pages */ +#define MAX_NR_PAGES 16777216 #define MAX_ELEMENTS 256 -#define ELEMENT_REPRESENTS 8 /* 256 Mb */ +#define PAGES_PER_ELEMENT (16777216/256) +#define pfn_to_pgdat(pfn) NODE_DATA(pfn_to_nid(pfn)) +#define PHYSADDR_TO_NID(pa) pfn_to_nid(pa >> PAGE_SHIFT) #define MAX_NUMNODES 8 #ifdef CONFIG_NUMA #define _cpu_to_node(cpu) (cpu_to_logical_apicid(cpu) >> 4) #endif /* CONFIG_NUMA */ -extern int pa_to_nid(u64); extern int pfn_to_nid(unsigned long); extern void get_memcfg_numaq(void); #define get_memcfg_numa() get_memcfg_numaq() diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 5a09fd4b72f1..f9fe284b9057 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -145,10 +145,10 @@ static __inline__ int get_order(unsigned long size) #ifndef CONFIG_DISCONTIGMEM #define pfn_to_page(pfn) (mem_map + (pfn)) #define page_to_pfn(page) ((unsigned long)((page) - mem_map)) +#define pfn_valid(pfn) ((pfn) < max_mapnr) #endif /* !CONFIG_DISCONTIGMEM */ #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) -#define pfn_valid(pfn) ((pfn) < max_mapnr) #define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) #define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ diff --git a/include/asm-mips64/mmzone.h b/include/asm-mips64/mmzone.h index 5e643b114269..d60ad12acd75 100644 --- a/include/asm-mips64/mmzone.h +++ b/include/asm-mips64/mmzone.h @@ -24,7 +24,6 @@ extern plat_pg_data_t *plat_node_data[]; #define PHYSADDR_TO_NID(pa) NASID_TO_COMPACT_NODEID(NASID_GET(pa)) #define PLAT_NODE_DATA(n) (plat_node_data[n]) -#define PLAT_NODE_DATA_STARTNR(n) (PLAT_NODE_DATA(n)->gendata.node_start_mapnr) #define PLAT_NODE_DATA_SIZE(n) (PLAT_NODE_DATA(n)->gendata.node_size) #define PLAT_NODE_DATA_LOCALNR(p, n) \ (((p) >> PAGE_SHIFT) - PLAT_NODE_DATA(n)->gendata.node_start_pfn) diff --git a/include/asm-mips64/pgtable.h b/include/asm-mips64/pgtable.h index ded7d0a0a986..b32768e57d16 100644 --- a/include/asm-mips64/pgtable.h +++ b/include/asm-mips64/pgtable.h @@ -373,10 +373,10 @@ extern inline void 
pgd_clear(pgd_t *pgdp) #ifndef CONFIG_DISCONTIGMEM #define pte_page(x) (mem_map+(unsigned long)((pte_val(x) >> PAGE_SHIFT))) #else -#define mips64_pte_pagenr(x) \ - (PLAT_NODE_DATA_STARTNR(PHYSADDR_TO_NID(pte_val(x))) + \ - PLAT_NODE_DATA_LOCALNR(pte_val(x), PHYSADDR_TO_NID(pte_val(x)))) -#define pte_page(x) (mem_map+mips64_pte_pagenr(x)) + +#define pte_page(x) ( NODE_MEM_MAP(PHYSADDR_TO_NID(pte_val(x))) + + PLAT_NODE_DATA_LOCALNR(pte_val(x), PHYSADDR_TO_NID(pte_val(x))) ) + #endif /* diff --git a/include/linux/mm.h b/include/linux/mm.h index e33d3f2bd080..1bab087776c5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -15,7 +15,10 @@ #include #include +#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; +#endif + extern unsigned long num_physpages; extern void * high_memory; extern int page_cluster; @@ -345,8 +348,10 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_MINOR 1 #define VM_FAULT_MAJOR 2 -/* The array of struct pages */ +#ifndef CONFIG_DISCONTIGMEM +/* The array of struct pages - for discontigmem use pgdat->lmem_map */ extern struct page *mem_map; +#endif extern void show_free_areas(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8ebf441bdb47..5cdd464992da 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -112,7 +112,6 @@ struct zone { struct page *zone_mem_map; /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; - unsigned long zone_start_mapnr; /* * rarely used fields: @@ -163,7 +162,6 @@ typedef struct pglist_data { unsigned long *valid_addr_bitmap; struct bootmem_data *bdata; unsigned long node_start_pfn; - unsigned long node_start_mapnr; unsigned long node_size; int node_id; struct pglist_data *pgdat_next; @@ -187,9 +185,10 @@ memclass(struct zone *pgzone, struct zone *classzone) * prototypes for the discontig memory code. */ struct page; -void free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, - unsigned long *zones_size, unsigned long paddr, unsigned long *zholes_size, - struct page *pmap); +extern void calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size, + unsigned long *zholes_size); +extern void free_area_init_core(pg_data_t *pgdat, unsigned long *zones_size, + unsigned long *zholes_size); void get_zone_counts(unsigned long *active, unsigned long *inactive); extern pg_data_t contig_page_data; diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 62c4188ae932..8731cbffb5e4 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -115,9 +115,11 @@ EXPORT_SYMBOL(vmalloc_32); EXPORT_SYMBOL(vmap); EXPORT_SYMBOL(vunmap); EXPORT_SYMBOL(vmalloc_to_page); -EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(remap_page_range); +#ifndef CONFIG_DISCONTIGMEM +EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(max_mapnr); +#endif EXPORT_SYMBOL(high_memory); EXPORT_SYMBOL(vmtruncate); EXPORT_SYMBOL(find_vma); diff --git a/kernel/suspend.c b/kernel/suspend.c index 2d7eeaabe127..419490900ff6 100644 --- a/kernel/suspend.c +++ b/kernel/suspend.c @@ -471,10 +471,12 @@ static int count_and_copy_data_pages(struct pbe *pagedir_p) int nr_copy_pages = 0; int pfn; struct page *page; - + +#ifndef CONFIG_DISCONTIGMEM if (max_mapnr != num_physpages) panic("mapnr is not expected"); - for (pfn = 0; pfn < max_mapnr; pfn++) { +#endif + for (pfn = 0; pfn < num_physpages; pfn++) { page = pfn_to_page(pfn); if (PageHighMem(page)) panic("Swsusp not supported on highmem boxes. 
Send 1GB of RAM to and try again ;-)."); @@ -514,19 +516,20 @@ static int count_and_copy_data_pages(struct pbe *pagedir_p) static void free_suspend_pagedir(unsigned long this_pagedir) { - struct page *page = mem_map; - int i; + struct page *page; + int pfn; unsigned long this_pagedir_end = this_pagedir + (PAGE_SIZE << pagedir_order); - for(i=0; i < num_physpages; i++, page++) { + for(pfn = 0; pfn < num_physpages; pfn++) { + page = pfn_to_page(pfn); if (!TestClearPageNosave(page)) continue; - if (ADDRESS(i) >= this_pagedir && ADDRESS(i) < this_pagedir_end) + if (ADDRESS(pfn) >= this_pagedir && ADDRESS(pfn) < this_pagedir_end) continue; /* old pagedir gets freed in one */ - free_page(ADDRESS(i)); + free_page(ADDRESS(pfn)); } free_pages(this_pagedir, pagedir_order); } diff --git a/mm/memory.c b/mm/memory.c index c886e849231b..ce55a3279406 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -53,7 +53,12 @@ #include +#ifndef CONFIG_DISCONTIGMEM +/* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; +struct page *mem_map; +#endif + unsigned long num_physpages; void * high_memory; struct page *highmem_start_page; @@ -72,8 +77,6 @@ static inline void copy_cow_page(struct page * from, struct page * to, unsigned copy_user_highpage(to, from, address); } -struct page *mem_map; - /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. diff --git a/mm/numa.c b/mm/numa.c index c293d9ae2df0..f8df7313d115 100644 --- a/mm/numa.c +++ b/mm/numa.c @@ -22,11 +22,21 @@ pg_data_t contig_page_data = { .bdata = &contig_bootmem_data }; * Should be invoked with paramters (0, 0, unsigned long *[], start_paddr). */ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, - unsigned long *zones_size, unsigned long zone_start_pfn, + unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { - free_area_init_core(0, &contig_page_data, &mem_map, zones_size, - zone_start_pfn, zholes_size, pmap); + unsigned long size; + + contig_page_data.node_id = 0; + contig_page_data.node_start_pfn = node_start_pfn; + calculate_totalpages (&contig_page_data, zones_size, zholes_size); + if (pmap == (struct page *)0) { + size = (pgdat->node_size + 1) * sizeof(struct page); + pmap = (struct page *) alloc_bootmem_node(pgdat, size); + } + contig_page_data.node_mem_map = pmap; + free_area_init_core(&contig_page_data, zones_size, zholes_size); + mem_map = contig_page_data.node_mem_map; } #endif /* !CONFIG_DISCONTIGMEM */ @@ -48,22 +58,26 @@ struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int orde * Nodes can be initialized parallely, in no particular order. 
*/ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, - unsigned long *zones_size, unsigned long zone_start_pfn, + unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { - int i, size = 0; - struct page *discard; - - if (mem_map == NULL) - mem_map = (struct page *)PAGE_OFFSET; + int i; + unsigned long size; - free_area_init_core(nid, pgdat, &discard, zones_size, zone_start_pfn, - zholes_size, pmap); pgdat->node_id = nid; + pgdat->node_start_pfn = node_start_pfn; + calculate_totalpages (pgdat, zones_size, zholes_size); + if (pmap == (struct page *)0) { + size = (pgdat->node_size + 1) * sizeof(struct page); + pmap = (struct page *) alloc_bootmem_node(pgdat, size); + } + pgdat->node_mem_map = pmap; + free_area_init_core(pgdat, zones_size, zholes_size); /* * Get space for the valid bitmap. */ + size = 0; for (i = 0; i < MAX_NR_ZONES; i++) size += zones_size[i]; size = LONG_ALIGN((size + 7) >> 3); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a54548bbf27b..721b6e5e572c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -724,6 +724,23 @@ static inline void build_zonelists(pg_data_t *pgdat) } } +void __init calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size, + unsigned long *zholes_size) +{ + unsigned long realtotalpages, totalpages = 0; + int i; + + for (i = 0; i < MAX_NR_ZONES; i++) + totalpages += zones_size[i]; + pgdat->node_size = totalpages; + + realtotalpages = totalpages; + if (zholes_size) + for (i = 0; i < MAX_NR_ZONES; i++) + realtotalpages -= zholes_size[i]; + printk("On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); +} + /* * Helper functions to size the waitqueue hash table. * Essentially these want to choose hash table sizes sufficiently @@ -774,46 +791,18 @@ static inline unsigned long wait_table_bits(unsigned long size) * - mark all memory queues empty * - clear the memory bitmaps */ -void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, - unsigned long *zones_size, unsigned long zone_start_pfn, - unsigned long *zholes_size, struct page *lmem_map) +void __init free_area_init_core(pg_data_t *pgdat, + unsigned long *zones_size, unsigned long *zholes_size) { unsigned long i, j; - unsigned long map_size; - unsigned long totalpages, offset, realtotalpages; + unsigned long local_offset; const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); + int nid = pgdat->node_id; + struct page *lmem_map = pgdat->node_mem_map; + unsigned long zone_start_pfn = pgdat->node_start_pfn; - totalpages = 0; - for (i = 0; i < MAX_NR_ZONES; i++) - totalpages += zones_size[i]; - - realtotalpages = totalpages; - if (zholes_size) - for (i = 0; i < MAX_NR_ZONES; i++) - realtotalpages -= zholes_size[i]; - - printk("On node %d totalpages: %lu\n", nid, realtotalpages); - - /* - * Some architectures (with lots of mem and discontinous memory - * maps) have to search for a good mem_map area: - * For discontigmem, the conceptual mem map array starts from - * PAGE_OFFSET, we need to align the actual array onto a mem map - * boundary, so that MAP_NR works. 
- */ - map_size = (totalpages + 1)*sizeof(struct page); - if (lmem_map == (struct page *)0) { - lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size); - lmem_map = (struct page *)(PAGE_OFFSET + - MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET)); - } - *gmap = pgdat->node_mem_map = lmem_map; - pgdat->node_size = totalpages; - pgdat->node_start_pfn = zone_start_pfn; - pgdat->node_start_mapnr = (lmem_map - mem_map); pgdat->nr_zones = 0; - - offset = lmem_map - mem_map; + local_offset = 0; /* offset within lmem_map */ for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long mask; @@ -865,8 +854,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, zone->pages_low = mask*2; zone->pages_high = mask*3; - zone->zone_mem_map = mem_map + offset; - zone->zone_start_mapnr = offset; + zone->zone_mem_map = lmem_map + local_offset; zone->zone_start_pfn = zone_start_pfn; if ((zone_start_pfn) & (zone_required_alignment-1)) @@ -878,7 +866,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, * done. Non-atomic initialization, single-pass. */ for (i = 0; i < size; i++) { - struct page *page = mem_map + offset + i; + struct page *page = lmem_map + local_offset + i; set_page_zone(page, nid * MAX_NR_ZONES + j); set_page_count(page, 0); SetPageReserved(page); @@ -892,7 +880,7 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, zone_start_pfn++; } - offset += size; + local_offset += size; for (i = 0; ; i++) { unsigned long bitmap_size; @@ -934,10 +922,13 @@ void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap, build_zonelists(pgdat); } +#ifndef CONFIG_DISCONTIGMEM void __init free_area_init(unsigned long *zones_size) { - free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0); + free_area_init_node(0, &contig_page_data, NULL, zones_size, 0, NULL); + mem_map = contig_page_data.node_mem_map; } +#endif static int __init setup_mem_frac(char *str) { -- cgit v1.2.3 From ccc98a67de98c912840e0a35a24115ad64ae426d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Sep 2002 08:36:22 -0700 Subject: [PATCH] _alloc_pages cleanup Patch from Martin Bligh. It should only affect machines using discontigmem. "This patch is was originally from Andrea's tree (from SGI??), and has been tweaked since by both Christoph (who cleaned up all the code), and myself (who just hit it until it worked). It removes _alloc_pages, and adds all nodes to the zonelists directly, which also changes the fallback zone order to something more sensible ... instead of: "foreach (node) { foreach (zone) }" we now do something more like "foreach (zone_type) { foreach (node) }" Christoph has a more recent version that's fancier and does a couple more cleanups, but it seems to have a bug in it that I can't track down easily, so I propose we do the simple thing for now, and take the rest of the cleanups when it works ... it seems to build nicely on top of this seperately to me. Tested on 16-way NUMA-Q with discontigmem + NUMA support." 
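For illustration only, here is a small user-space sketch (not kernel code; the node count and zone names are invented for the example) of the ordering that the new build_zonelists() produces. Each node's zonelist starts with that node's own zones and then wraps through the remaining nodes, so no single remote node absorbs everyone's fallback allocations:

	#include <stdio.h>

	#define NUMNODES 4	/* assumed node count, example only */

	static const char *zone_name[] = { "DMA", "Normal", "HighMem" };

	/* append one node's zones, highest permitted class first */
	static void add_node_zones(int node, int highest)
	{
		int k;

		for (k = highest; k >= 0; k--)
			printf("  node %d, zone %s\n", node, zone_name[k]);
	}

	int main(void)
	{
		int local_node = 2;	/* build the fallback list for node 2 */
		int highest = 2;	/* a GFP_HIGHMEM-capable allocation */
		int node;

		printf("fallback order for node %d:\n", local_node);
		add_node_zones(local_node, highest);
		for (node = local_node + 1; node < NUMNODES; node++)
			add_node_zones(node, highest);
		for (node = 0; node < local_node; node++)
			add_node_zones(node, highest);
		return 0;
	}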
--- arch/sparc64/mm/init.c | 2 +- include/asm-alpha/mmzone.h | 6 +-- include/asm-alpha/numnodes.h | 12 ++++++ include/asm-i386/max_numnodes.h | 12 ------ include/asm-i386/mmzone.h | 2 + include/asm-i386/numnodes.h | 12 ++++++ include/linux/gfp.h | 19 ++++++--- include/linux/mmzone.h | 14 ++++--- init/main.c | 1 + kernel/ksyms.c | 2 +- mm/numa.c | 44 --------------------- mm/page_alloc.c | 88 +++++++++++++++++++++++++---------------- 12 files changed, 106 insertions(+), 108 deletions(-) create mode 100644 include/asm-alpha/numnodes.h delete mode 100644 include/asm-i386/max_numnodes.h create mode 100644 include/asm-i386/numnodes.h (limited to 'include/linux') diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index b3233175628b..ba0adc3cdf21 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -1726,7 +1726,7 @@ void __init mem_init(void) * Set up the zero page, mark it reserved, so that page count * is not manipulated when freeing the page from user ptes. */ - mem_map_zero = _alloc_pages(GFP_KERNEL, 0); + mem_map_zero = alloc_pages(GFP_KERNEL, 0); if (mem_map_zero == NULL) { prom_printf("paging_init: Cannot alloc zero page.\n"); prom_halt(); diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h index 98fdbae58aeb..4059862d4b3d 100644 --- a/include/asm-alpha/mmzone.h +++ b/include/asm-alpha/mmzone.h @@ -36,12 +36,10 @@ extern plat_pg_data_t *plat_node_data[]; #ifdef CONFIG_ALPHA_WILDFIRE # define ALPHA_PA_TO_NID(pa) ((pa) >> 36) /* 16 nodes max due 43bit kseg */ -#define NODE_MAX_MEM_SIZE (64L * 1024L * 1024L * 1024L) /* 64 GB */ -#define MAX_NUMNODES WILDFIRE_MAX_QBB +# define NODE_MAX_MEM_SIZE (64L * 1024L * 1024L * 1024L) /* 64 GB */ #else # define ALPHA_PA_TO_NID(pa) (0) -#define NODE_MAX_MEM_SIZE (~0UL) -#define MAX_NUMNODES 1 +# define NODE_MAX_MEM_SIZE (~0UL) #endif #define PHYSADDR_TO_NID(pa) ALPHA_PA_TO_NID(pa) diff --git a/include/asm-alpha/numnodes.h b/include/asm-alpha/numnodes.h new file mode 100644 index 000000000000..4ff6b3ecfbed --- /dev/null +++ b/include/asm-alpha/numnodes.h @@ -0,0 +1,12 @@ +#ifndef _ASM_MAX_NUMNODES_H +#define _ASM_MAX_NUMNODES_H + +/* + * Currently the Wildfire is the only discontigmem/NUMA capable Alpha core. 
+ */ +#if defined(CONFIG_ALPHA_WILDFIRE) || defined(CONFIG_ALPHA_GENERIC) +# include +# define MAX_NUMNODES WILDFIRE_MAX_QBB +#endif + +#endif /* _ASM_MAX_NUMNODES_H */ diff --git a/include/asm-i386/max_numnodes.h b/include/asm-i386/max_numnodes.h deleted file mode 100644 index 2b63299604ef..000000000000 --- a/include/asm-i386/max_numnodes.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _ASM_MAX_NUMNODES_H -#define _ASM_MAX_NUMNODES_H - -#include - -#ifdef CONFIG_X86_NUMAQ -#include -#else -#define MAX_NUMNODES 1 -#endif /* CONFIG_X86_NUMAQ */ - -#endif /* _ASM_MAX_NUMNODES_H */ diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index cc18193f2f65..00a5d7ffbed9 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h @@ -6,6 +6,8 @@ #ifndef _ASM_MMZONE_H_ #define _ASM_MMZONE_H_ +#include + #ifdef CONFIG_DISCONTIGMEM #ifdef CONFIG_X86_NUMAQ diff --git a/include/asm-i386/numnodes.h b/include/asm-i386/numnodes.h new file mode 100644 index 000000000000..2b63299604ef --- /dev/null +++ b/include/asm-i386/numnodes.h @@ -0,0 +1,12 @@ +#ifndef _ASM_MAX_NUMNODES_H +#define _ASM_MAX_NUMNODES_H + +#include + +#ifdef CONFIG_X86_NUMAQ +#include +#else +#define MAX_NUMNODES 1 +#endif /* CONFIG_X86_NUMAQ */ + +#endif /* _ASM_MAX_NUMNODES_H */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 10021357c093..437572e2240b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -39,18 +39,25 @@ * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). */ -extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned int order)); extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, struct zonelist *zonelist)); extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order); +/* + * We get the zone list from the current node and the gfp_mask. + * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones. + * + * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets + * optimized to &contig_page_data at compile-time. + */ static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order) { - /* - * Gets optimized away by the compiler. - */ - if (order >= MAX_ORDER) + pg_data_t *pgdat = NODE_DATA(numa_node_id()); + unsigned int idx = (gfp_mask & GFP_ZONEMASK); + + if (unlikely(order >= MAX_ORDER)) return NULL; - return _alloc_pages(gfp_mask, order); + + return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + idx); } #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5cdd464992da..580c39c4dcc1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -10,11 +10,14 @@ #include #include #include +#ifdef CONFIG_DISCONTIGMEM +#include +#endif +#ifndef MAX_NUMNODES +#define MAX_NUMNODES 1 +#endif -/* - * Free memory management - zoned buddy allocator. - */ - +/* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_FORCE_MAX_ZONEORDER #define MAX_ORDER 11 #else @@ -137,7 +140,7 @@ struct zone { * footprint of this construct is very small. 
*/ struct zonelist { - struct zone *zones[MAX_NR_ZONES+1]; // NULL delimited + struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited }; #define GFP_ZONEMASK 0x0f @@ -190,6 +193,7 @@ extern void calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size, extern void free_area_init_core(pg_data_t *pgdat, unsigned long *zones_size, unsigned long *zholes_size); void get_zone_counts(unsigned long *active, unsigned long *inactive); +extern void build_all_zonelists(void); extern pg_data_t contig_page_data; diff --git a/init/main.c b/init/main.c index 7229f7aae2ea..790634231533 100644 --- a/init/main.c +++ b/init/main.c @@ -392,6 +392,7 @@ asmlinkage void __init start_kernel(void) printk(linux_banner); setup_arch(&command_line); setup_per_cpu_areas(); + build_all_zonelists(); printk("Kernel command line: %s\n", saved_command_line); parse_options(command_line); trap_init(); diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 8731cbffb5e4..6daaa2b0cb73 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -90,7 +90,6 @@ EXPORT_SYMBOL(do_brk); EXPORT_SYMBOL(exit_mm); /* internal kernel memory management */ -EXPORT_SYMBOL(_alloc_pages); EXPORT_SYMBOL(__alloc_pages); EXPORT_SYMBOL(alloc_pages_node); EXPORT_SYMBOL(__get_free_pages); @@ -117,6 +116,7 @@ EXPORT_SYMBOL(vunmap); EXPORT_SYMBOL(vmalloc_to_page); EXPORT_SYMBOL(remap_page_range); #ifndef CONFIG_DISCONTIGMEM +EXPORT_SYMBOL(contig_page_data); EXPORT_SYMBOL(mem_map); EXPORT_SYMBOL(max_mapnr); #endif diff --git a/mm/numa.c b/mm/numa.c index f8df7313d115..a36769c95390 100644 --- a/mm/numa.c +++ b/mm/numa.c @@ -85,48 +85,4 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, memset(pgdat->valid_addr_bitmap, 0, size); } -static struct page * alloc_pages_pgdat(pg_data_t *pgdat, unsigned int gfp_mask, - unsigned int order) -{ - return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK)); -} - -/* - * This can be refined. Currently, tries to do round robin, instead - * should do concentratic circle search, starting from current node. - */ -struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order) -{ - struct page *ret = 0; - pg_data_t *start, *temp; -#ifndef CONFIG_NUMA - unsigned long flags; - static pg_data_t *next = 0; -#endif - - if (order >= MAX_ORDER) - return NULL; -#ifdef CONFIG_NUMA - temp = NODE_DATA(numa_node_id()); -#else - if (!next) - next = pgdat_list; - temp = next; - next = next->pgdat_next; -#endif - start = temp; - while (temp) { - if ((ret = alloc_pages_pgdat(temp, gfp_mask, order))) - return(ret); - temp = temp->pgdat_next; - } - temp = pgdat_list; - while (temp != start) { - if ((ret = alloc_pages_pgdat(temp, gfp_mask, order))) - return(ret); - temp = temp->pgdat_next; - } - return(0); -} - #endif /* CONFIG_DISCONTIGMEM */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 721b6e5e572c..755216f7bc5a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -256,14 +256,6 @@ int is_head_of_free_region(struct page *page) } #endif /* CONFIG_SOFTWARE_SUSPEND */ -#ifndef CONFIG_DISCONTIGMEM -struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order) -{ - return __alloc_pages(gfp_mask, order, - contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK)); -} -#endif - static /* inline */ struct page * balance_classzone(struct zone* classzone, unsigned int gfp_mask, unsigned int order, int * freed) @@ -679,13 +671,41 @@ void show_free_areas(void) /* * Builds allocation fallback zone lists. 
*/ -static inline void build_zonelists(pg_data_t *pgdat) +static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) +{ + switch (k) { + struct zone *zone; + default: + BUG(); + case ZONE_HIGHMEM: + zone = pgdat->node_zones + ZONE_HIGHMEM; + if (zone->size) { +#ifndef CONFIG_HIGHMEM + BUG(); +#endif + zonelist->zones[j++] = zone; + } + case ZONE_NORMAL: + zone = pgdat->node_zones + ZONE_NORMAL; + if (zone->size) + zonelist->zones[j++] = zone; + case ZONE_DMA: + zone = pgdat->node_zones + ZONE_DMA; + if (zone->size) + zonelist->zones[j++] = zone; + } + + return j; +} + +static void __init build_zonelists(pg_data_t *pgdat) { - int i, j, k; + int i, j, k, node, local_node; + local_node = pgdat->node_id; + printk("Building zonelist for node : %d\n", local_node); for (i = 0; i <= GFP_ZONEMASK; i++) { struct zonelist *zonelist; - struct zone *zone; zonelist = pgdat->node_zonelists + i; memset(zonelist, 0, sizeof(*zonelist)); @@ -697,33 +717,32 @@ static inline void build_zonelists(pg_data_t *pgdat) if (i & __GFP_DMA) k = ZONE_DMA; - switch (k) { - default: - BUG(); - /* - * fallthrough: - */ - case ZONE_HIGHMEM: - zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->size) { -#ifndef CONFIG_HIGHMEM - BUG(); -#endif - zonelist->zones[j++] = zone; - } - case ZONE_NORMAL: - zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->size) - zonelist->zones[j++] = zone; - case ZONE_DMA: - zone = pgdat->node_zones + ZONE_DMA; - if (zone->size) - zonelist->zones[j++] = zone; - } + j = build_zonelists_node(pgdat, zonelist, j, k); + /* + * Now we build the zonelist so that it contains the zones + * of all the other nodes. + * We don't want to pressure a particular node, so when + * building the zones for node N, we make sure that the + * zones coming right after the local ones are those from + * node N+1 (modulo N) + */ + for (node = local_node + 1; node < numnodes; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + for (node = 0; node < local_node; node++) + j = build_zonelists_node(NODE_DATA(node), zonelist, j, k); + zonelist->zones[j++] = NULL; } } +void __init build_all_zonelists(void) +{ + int i; + + for(i = 0 ; i < numnodes ; i++) + build_zonelists(NODE_DATA(i)); +} + void __init calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { @@ -919,7 +938,6 @@ void __init free_area_init_core(pg_data_t *pgdat, (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size); } } - build_zonelists(pgdat); } #ifndef CONFIG_DISCONTIGMEM -- cgit v1.2.3 From da1eca6061198690ea08118d78fde70216dcf093 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Sep 2002 08:36:34 -0700 Subject: [PATCH] remove /proc/sys/vm/dirty_sync_thresh This was designed to be a really sterm throttling threshold: if dirty memory reaches this level then perform writeback and actually wait on it. It doesn't work. Because memory dirtiers are required to perform writeback if the amount of dirty AND writeback memory exceeds dirty_async_ratio. So kill it, and rely just on the request queues being appropriately scaled to the machine size (they are). This is basically what 2.4 does. 
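As a toy illustration (a user-space sketch with made-up numbers, not the real function), this is all that remains of the throttling decision in balance_dirty_pages() once the sync threshold is gone - a single async threshold that the dirtying process enforces on itself:

	#include <stdio.h>

	static long total_pages = 262144;	/* example: 1GB of 4K pages */
	static int dirty_async_ratio = 40;	/* /proc/sys/vm/dirty_async_ratio */

	/* nonzero: the dirtier must write back some pages itself (WB_SYNC_NONE) */
	static int must_throttle(long dirty_and_writeback)
	{
		long async_thresh = (dirty_async_ratio * total_pages) / 100;

		return dirty_and_writeback > async_thresh;
	}

	int main(void)
	{
		printf("30%% dirty -> throttle=%d\n", must_throttle(total_pages * 30 / 100));
		printf("50%% dirty -> throttle=%d\n", must_throttle(total_pages * 50 / 100));
		return 0;
	}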
--- Documentation/filesystems/proc.txt | 7 ------- Documentation/sysctl/vm.txt | 3 +-- fs/fs-writeback.c | 3 --- include/linux/sysctl.h | 11 +++++------ include/linux/writeback.h | 8 +++----- kernel/sysctl.c | 3 --- mm/page-writeback.c | 26 ++------------------------ 7 files changed, 11 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 57597335536d..81c04ff87b4a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -963,13 +963,6 @@ Contains, as a percentage of total system memory, the number of pages at which a process which is generating disk writes will itself start writing out dirty data. -dirty_sync_ratio ----------------- - -Contains, as a percentage of total system memory, the number of pages at which -a process which is generating disk writes will itself start writing out dirty -data and waiting upon completion of that writeout. - dirty_writeback_centisecs ------------------------- diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 6ff0af89ae77..ed6ccff766f4 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -21,13 +21,12 @@ Currently, these files are in /proc/sys/vm: - dirty_async_ratio - dirty_background_ratio - dirty_expire_centisecs -- dirty_sync_ratio - dirty_writeback_centisecs ============================================================== dirty_async_ratio, dirty_background_ratio, dirty_expire_centisecs, -dirty_sync_ratio dirty_writeback_centisecs: +dirty_writeback_centisecs: See Documentation/filesystems/proc.txt diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 5f01ae6f8e6b..e306a31f46b5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -262,9 +262,6 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) break; really_sync = (wbc->sync_mode == WB_SYNC_ALL); - if ((wbc->sync_mode == WB_SYNC_LAST) && (head->prev == head)) - really_sync = 1; - BUG_ON(inode->i_state & I_FREEING); __iget(inode); list_move(&inode->i_list, &sb->s_dirty); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 3127165e7c13..606f1385a37e 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -147,12 +147,11 @@ enum VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ VM_DIRTY_BACKGROUND=11, /* dirty_background_ratio */ VM_DIRTY_ASYNC=12, /* dirty_async_ratio */ - VM_DIRTY_SYNC=13, /* dirty_sync_ratio */ - VM_DIRTY_WB_CS=14, /* dirty_writeback_centisecs */ - VM_DIRTY_EXPIRE_CS=15, /* dirty_expire_centisecs */ - VM_NR_PDFLUSH_THREADS=16, /* nr_pdflush_threads */ - VM_OVERCOMMIT_RATIO=17, /* percent of RAM to allow overcommit in */ - VM_PAGEBUF=18 /* struct: Control pagebuf parameters */ + VM_DIRTY_WB_CS=13, /* dirty_writeback_centisecs */ + VM_DIRTY_EXPIRE_CS=14, /* dirty_expire_centisecs */ + VM_NR_PDFLUSH_THREADS=15, /* nr_pdflush_threads */ + VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ + VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ }; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b12971f8eeb0..c35b96eb6a90 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -27,10 +27,9 @@ static inline int current_is_pdflush(void) * fs/fs-writeback.c */ enum writeback_sync_modes { - WB_SYNC_NONE = 0, /* Don't wait on anything */ - WB_SYNC_LAST = 1, /* Wait on the last-written mapping */ - WB_SYNC_ALL = 2, /* Wait on every mapping */ - WB_SYNC_HOLD = 3, /* Hold the inode 
on sb_dirty for sys_sync() */ + WB_SYNC_NONE, /* Don't wait on anything */ + WB_SYNC_ALL, /* Wait on every mapping */ + WB_SYNC_HOLD, /* Hold the inode on sb_dirty for sys_sync() */ }; /* @@ -65,7 +64,6 @@ static inline void wait_on_inode(struct inode *inode) /* These 5 are exported to sysctl. */ extern int dirty_background_ratio; extern int dirty_async_ratio; -extern int dirty_sync_ratio; extern int dirty_writeback_centisecs; extern int dirty_expire_centisecs; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6f92068e3f29..cca0ba4ee052 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -292,9 +292,6 @@ static ctl_table vm_table[] = { {VM_DIRTY_ASYNC, "dirty_async_ratio", &dirty_async_ratio, sizeof(dirty_async_ratio), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &zero, &one_hundred }, - {VM_DIRTY_SYNC, "dirty_sync_ratio", &dirty_sync_ratio, - sizeof(dirty_sync_ratio), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, &zero, &one_hundred }, {VM_DIRTY_WB_CS, "dirty_writeback_centisecs", &dirty_writeback_centisecs, sizeof(dirty_writeback_centisecs), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 915f55f6e179..a8afd3699509 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -72,11 +72,6 @@ int dirty_background_ratio = 10; */ int dirty_async_ratio = 40; -/* - * The generator of dirty data performs sync writeout at this level - */ -int dirty_sync_ratio = 50; - /* * The interval between `kupdate'-style writebacks, in centiseconds * (hundredths of a second) @@ -105,15 +100,11 @@ static void background_writeout(unsigned long _min_pages); * - Does nothing at all. * * balance_dirty_pages() can sleep. - * - * FIXME: WB_SYNC_LAST doesn't actually work. It waits on the last dirty - * inode on the superblock list. It should wait when nr_to_write is - * exhausted. Doesn't seem to matter. */ void balance_dirty_pages(struct address_space *mapping) { struct page_state ps; - long background_thresh, async_thresh, sync_thresh; + long background_thresh, async_thresh; unsigned long dirty_and_writeback; struct backing_dev_info *bdi; @@ -122,20 +113,9 @@ void balance_dirty_pages(struct address_space *mapping) background_thresh = (dirty_background_ratio * total_pages) / 100; async_thresh = (dirty_async_ratio * total_pages) / 100; - sync_thresh = (dirty_sync_ratio * total_pages) / 100; bdi = mapping->backing_dev_info; - if (dirty_and_writeback > sync_thresh) { - struct writeback_control wbc = { - .bdi = bdi, - .sync_mode = WB_SYNC_LAST, - .older_than_this = NULL, - .nr_to_write = sync_writeback_pages(), - }; - - writeback_inodes(&wbc); - get_page_state(&ps); - } else if (dirty_and_writeback > async_thresh) { + if (dirty_and_writeback > async_thresh) { struct writeback_control wbc = { .bdi = bdi, .sync_mode = WB_SYNC_NONE, @@ -331,8 +311,6 @@ static int __init page_writeback_init(void) dirty_background_ratio /= 100; dirty_async_ratio *= correction; dirty_async_ratio /= 100; - dirty_sync_ratio *= correction; - dirty_sync_ratio /= 100; } init_timer(&wb_timer); -- cgit v1.2.3 From d4872de38e4c74dd5c56facbd986da46ca551b65 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Sep 2002 08:36:47 -0700 Subject: [PATCH] readv/writev bounds checking fixes - writev currently returns -EFAULT if _any_ of the segments has an invalid address. We should only return -EFAULT if the first segment has a bad address. If some of the first segments have valid addresses we need to write them and return a partial result. 
- The current code only checks if the sum-of-lengths is negative. If individual segments have a negative length but the result is positive we miss that. So rework the code to detect this, and to be immune to odd wrapping situations. As a bonus, we save one pass across the iovec. - ditto for readv. The check for "does any segment have a negative length" has already been performed in do_readv_writev(), but it's basically free here, and we need to do it for generic_file_read/write anyway. This all means that the iov_length() function is unsafe because of wrap/overflow isues. It should only be used after the generic_file_read/write or do_readv_writev() checking has been performed. Its callers have been reviewed and they are OK. The code now passes LTP testing and has been QA'd by Janet's team. --- include/linux/uio.h | 6 +++++- mm/filemap.c | 51 +++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 42 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index ec098c8e6793..85b2f0ec9d3f 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -35,7 +35,11 @@ struct iovec #endif /* - * Total number of bytes covered by an iovec + * Total number of bytes covered by an iovec. + * + * NOTE that it is not safe to use this function until all the iovec's + * segment lengths have been validated. Because the individual lengths can + * overflow a size_t when added together. */ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) { diff --git a/mm/filemap.c b/mm/filemap.c index ea37b24135aa..4cfb5939082e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1134,10 +1134,26 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, struct file *filp = iocb->ki_filp; ssize_t retval; unsigned long seg; - size_t count = iov_length(iov, nr_segs); + size_t count; - if ((ssize_t) count < 0) - return -EINVAL; + count = 0; + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. 
+ */ + count += iv->iov_len; + if (unlikely((ssize_t)(count|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + nr_segs = seg; + break; + } /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { @@ -1166,11 +1182,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, goto out; } - for (seg = 0; seg < nr_segs; seg++) { - if (!access_ok(VERIFY_WRITE,iov[seg].iov_base,iov[seg].iov_len)) - return -EFAULT; - } - retval = 0; if (count) { for (seg = 0; seg < nr_segs; seg++) { @@ -2032,8 +2043,8 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, { struct address_space * mapping = file->f_dentry->d_inode->i_mapping; struct address_space_operations *a_ops = mapping->a_ops; - const size_t ocount = iov_length(iov, nr_segs); - size_t count = ocount; + size_t ocount; /* original count */ + size_t count; /* after file limit checks */ struct inode *inode = mapping->host; unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; long status = 0; @@ -2050,13 +2061,25 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, unsigned long seg; char *buf; - if (unlikely((ssize_t)count < 0)) - return -EINVAL; - + ocount = 0; for (seg = 0; seg < nr_segs; seg++) { - if (!access_ok(VERIFY_READ,iov[seg].iov_base,iov[seg].iov_len)) + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. + */ + ocount += iv->iov_len; + if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) return -EFAULT; + nr_segs = seg; + break; } + count = ocount; pos = *ppos; if (unlikely(pos < 0)) -- cgit v1.2.3 From a7d2851c9fd9b9317e1e1204badf27cbebb57eca Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 Sep 2002 08:37:08 -0700 Subject: [PATCH] hugetlbpages cleanup From Christoph Hellwig, acked by Rohit. 
- fix config.in description: we know we're on i386 and we also know that a feature can only be enabled if the hw supports it, the code alone is not enough - the sysctl is VM-releated, so move it from /proc/sys/kernel tp /proc/sys/vm - adopt to standard sysctl names --- arch/i386/config.in | 2 +- include/linux/sysctl.h | 2 +- kernel/sysctl.c | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/config.in b/arch/i386/config.in index b385d8215b1f..2bc0abf75171 100644 --- a/arch/i386/config.in +++ b/arch/i386/config.in @@ -154,7 +154,7 @@ if [ "$CONFIG_MWINCHIP3D" = "y" ]; then define_bool CONFIG_X86_OOSTORE y fi -bool 'IA-32 Huge TLB Page Support (if available on processor)' CONFIG_HUGETLB_PAGE +bool 'Huge TLB Page Support' CONFIG_HUGETLB_PAGE bool 'Symmetric multi-processing support' CONFIG_SMP bool 'Preemptible Kernel' CONFIG_PREEMPT diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 606f1385a37e..9fd7d5c05605 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -128,7 +128,6 @@ enum KERN_TAINTED=53, /* int: various kernel tainted flags */ KERN_CADPID=54, /* int: PID of the process to notify on CAD */ KERN_PIDMAX=55, /* int: PID # limit */ - KERN_HUGETLB_PAGE_NUM=56, /* int: Number of available Huge Pages */ }; @@ -152,6 +151,7 @@ enum VM_NR_PDFLUSH_THREADS=15, /* nr_pdflush_threads */ VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */ VM_PAGEBUF=17, /* struct: Control pagebuf parameters */ + VM_HUGETLB_PAGES=18, /* int: Number of available Huge Pages */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cca0ba4ee052..1a63d254ab80 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -99,8 +99,8 @@ extern int acct_parm[]; #endif #ifdef CONFIG_HUGETLB_PAGE -extern int htlbpage_max; -extern int set_hugetlb_mem_size(int); +extern int htlbpage_max; +extern int set_hugetlb_mem_size(int); #endif static int parse_table(int *, int, void *, size_t *, void *, size_t, @@ -263,10 +263,6 @@ static ctl_table kern_table[] = { #endif {KERN_PIDMAX, "pid_max", &pid_max, sizeof (int), 0600, NULL, &proc_dointvec}, -#ifdef CONFIG_HUGETLB_PAGE - {KERN_HUGETLB_PAGE_NUM, "numhugepages", &htlbpage_max, sizeof(int), 0644, NULL, - &proc_dointvec}, -#endif {0} }; @@ -314,6 +310,10 @@ static ctl_table vm_table[] = { { VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads", &nr_pdflush_threads, sizeof nr_pdflush_threads, 0444 /* read-only*/, NULL, &proc_dointvec}, +#ifdef CONFIG_HUGETLB_PAGE + {VM_HUGETLB_PAGES, "nr_hugepages", &htlbpage_max, sizeof(int), 0644, NULL, + &proc_dointvec}, +#endif {0} }; -- cgit v1.2.3
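For reference, the hugetlb patch above means the hugepage count is now read and set through /proc/sys/vm/nr_hugepages rather than /proc/sys/kernel/numhugepages. A minimal user-space check of the new location (path taken from the vm_table entry added above; only meaningful on a kernel built with CONFIG_HUGETLB_PAGE):

	#include <stdio.h>

	int main(void)
	{
		char buf[32];
		FILE *f = fopen("/proc/sys/vm/nr_hugepages", "r");

		if (!f) {
			perror("/proc/sys/vm/nr_hugepages");
			return 1;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("nr_hugepages = %s", buf);
		fclose(f);
		return 0;
	}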