| field | value | date |
|---|---|---|
| author | Andrew Morton <akpm@digeo.com> | 2002-10-29 23:35:53 -0800 |
| committer | Linus Torvalds <torvalds@penguin.transmeta.com> | 2002-10-29 23:35:53 -0800 |
| commit | a206231bbe6ffb988cdf9fcbdfd98e49abaf4819 (patch) | |
| tree | 640f23350e83ff491f5cc970e79e9cd619704f6f | |
| parent | 1d2652dd2c3e942e75dc3137b3cb1774b43ae377 (diff) | |
[PATCH] hot-n-cold pages: page allocator core
Hot/Cold pages and zone->lock amortisation
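The mechanism is easiest to see outside the kernel. The sketch below is a minimal userspace model of the per-CPU pools this patch introduces: a `per_cpu_pages` structure with `count`/`low`/`high`/`batch` watermarks, the drain rule of `free_hot_cold_page()`, and the order-0 path of `buffered_rmqueue()`. The buddy allocator is reduced to a counter, and `refill_from_buddy()`/`drain_to_buddy()` are illustrative stand-ins for `rmqueue_bulk()`/`free_pages_bulk()`, which in the real patch take `zone->lock` once per batch. Nothing here is kernel code.

```c
#include <stdio.h>

/* Mirrors the watermark fields of the patch's struct per_cpu_pages. */
struct per_cpu_pages {
	int count;	/* pages currently queued on this CPU's list */
	int low;	/* refill from the buddy lists below this */
	int high;	/* drain back to the buddy lists above this */
	int batch;	/* pages moved per zone->lock round trip */
};

static int buddy_free_pages = 100000;	/* stand-in for zone->free_pages */
static int lock_round_trips;		/* times we would have taken zone->lock */

/* Stand-in for free_pages_bulk(): one lock acquisition, 'count' pages back. */
static int drain_to_buddy(int count)
{
	lock_round_trips++;
	buddy_free_pages += count;
	return count;
}

/* Stand-in for rmqueue_bulk(): one lock acquisition, 'count' pages out. */
static int refill_from_buddy(int count)
{
	lock_round_trips++;
	buddy_free_pages -= count;
	return count;
}

/* free_hot_cold_page(): drain a batch only once the high watermark is hit. */
static void free_one_page(struct per_cpu_pages *pcp)
{
	if (pcp->count >= pcp->high)
		pcp->count -= drain_to_buddy(pcp->batch);
	pcp->count++;
}

/* buffered_rmqueue(), order-0 path: refill a batch only below the low mark. */
static void alloc_one_page(struct per_cpu_pages *pcp)
{
	if (pcp->count <= pcp->low)
		pcp->count += refill_from_buddy(pcp->batch);
	if (pcp->count)
		pcp->count--;
}

int main(void)
{
	/* Hot-list watermarks as initialised by the patch, with batch = 16. */
	struct per_cpu_pages hot = { .count = 0, .low = 32, .high = 96, .batch = 16 };
	int i;

	for (i = 0; i < 200; i++)
		alloc_one_page(&hot);
	for (i = 0; i < 200; i++)
		free_one_page(&hot);

	printf("queued on pcp: %d, buddy free: %d, lock round trips: %d\n",
	       hot.count, buddy_free_pages, lock_round_trips);
	return 0;
}
```

With `batch = 16`, the 400 page operations above touch the (simulated) buddy lists only a few dozen times rather than once each, which is the zone->lock amortisation the commit title refers to; the hot/cold split on top of that lets callers keep cache-warm pages separate from cache-cold ones.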
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | include/linux/gfp.h | 7 |
| -rw-r--r-- | include/linux/mm.h | 1 |
| -rw-r--r-- | include/linux/mmzone.h | 17 |
| -rw-r--r-- | mm/page_alloc.c | 160 |
| -rw-r--r-- | mm/swap.c | 5 |

5 files changed, 151 insertions, 39 deletions
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c340b447a963..8e093813e4f7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -17,6 +17,7 @@
 #define __GFP_IO	0x40	/* Can start low memory physical IO? */
 #define __GFP_HIGHIO	0x80	/* Can start high mem physical IO? */
 #define __GFP_FS	0x100	/* Can call down to low-level FS? */
+#define __GFP_COLD	0x200	/* Cache-cold page required */
 
 #define GFP_NOHIGHIO	( __GFP_WAIT | __GFP_IO)
 #define GFP_NOIO	( __GFP_WAIT)
@@ -32,6 +33,7 @@
 #define GFP_DMA		__GFP_DMA
 
+
 /*
  * There is only one page-allocator function, and two main namespaces to
  * it. The alloc_page*() variants return 'struct page *' and as such
@@ -77,11 +79,10 @@ extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask));
 #define __get_dma_pages(gfp_mask, order) \
 		__get_free_pages((gfp_mask) | GFP_DMA,(order))
 
-/*
- * There is only one 'core' page-freeing function.
- */
 extern void FASTCALL(__free_pages(struct page *page, unsigned int order));
 extern void FASTCALL(free_pages(unsigned long addr, unsigned int order));
+extern void FASTCALL(free_hot_page(struct page *page));
+extern void FASTCALL(free_cold_page(struct page *page));
 
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr),0)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cab2c4342047..d9d2f20732d4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -211,7 +211,6 @@ struct page {
 #define set_page_count(p,v)	atomic_set(&(p)->count, v)
 
 extern void FASTCALL(__page_cache_release(struct page *));
-void FASTCALL(__free_pages_ok(struct page *page, unsigned int order));
 
 static inline void put_page(struct page *page)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 10c4ee968020..d80490b1265c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -9,6 +9,7 @@
 #include <linux/list.h>
 #include <linux/wait.h>
 #include <linux/cache.h>
+#include <linux/threads.h>
 #include <asm/atomic.h>
 #ifdef CONFIG_DISCONTIGMEM
 #include <asm/numnodes.h>
@@ -46,6 +47,18 @@ struct zone_padding {
 #define ZONE_PADDING(name)
 #endif
 
+struct per_cpu_pages {
+	int count;		/* number of pages in the list */
+	int low;		/* low watermark, refill needed */
+	int high;		/* high watermark, emptying needed */
+	int batch;		/* chunk size for buddy add/remove */
+	struct list_head list;	/* the list of pages */
+};
+
+struct per_cpu_pageset {
+	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
+} ____cacheline_aligned_in_smp;
+
 /*
  * On machines where it is needed (eg PCs) we divide physical memory
  * into multiple physical zones. On a PC we have 3 zones:
@@ -107,6 +120,10 @@ struct zone {
 	unsigned long		wait_table_size;
 	unsigned long		wait_table_bits;
 
+	ZONE_PADDING(_pad3_)
+
+	struct per_cpu_pageset	pageset[NR_CPUS];
+
 	/*
 	 * Discontig memory support fields.
 	 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd35f4d7ac49..f46471b25586 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -10,6 +10,8 @@
  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
+ *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
  */
 
 #include <linux/config.h>
@@ -151,13 +153,14 @@ static inline void free_pages_check(const char *function, struct page *page)
  * Assumes all pages on list are in same zone, and of same order.
  * count is the number of pages to free, or 0 for all on the list.
  */
-static void
+static int
 free_pages_bulk(struct zone *zone, int count,
 		struct list_head *list, unsigned int order)
 {
 	unsigned long mask, flags;
 	struct free_area *area;
 	struct page *base, *page = NULL;
+	int ret = 0;
 
 	mask = (~0UL) << order;
 	base = zone->zone_mem_map;
@@ -169,8 +172,10 @@ free_pages_bulk(struct zone *zone, int count,
 		list_del(&page->list);
 		__free_pages_bulk(page, base, zone, area, mask, order);
 		mod_page_state(pgfree, count<<order);
+		ret++;
 	}
 	spin_unlock_irqrestore(&zone->lock, flags);
+	return ret;
 }
 
 void __free_pages_ok(struct page *page, unsigned int order)
@@ -201,14 +206,13 @@ expand(struct zone *zone, struct page *page,
 		index += size;
 		page += size;
 	}
-	BUG_ON(bad_range(zone, page));
 	return page;
 }
 
 /*
  * This page is about to be returned from the page allocator
  */
-static inline void prep_new_page(struct page *page)
+static void prep_new_page(struct page *page)
 {
 	if (	page->mapping ||
 		page_mapped(page) ||
@@ -248,36 +252,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
 			continue;
 
 		page = list_entry(curr, struct page, list);
-		BUG_ON(bad_range(zone, page));
 		list_del(curr);
 		index = page - zone->zone_mem_map;
 		if (current_order != MAX_ORDER-1)
 			MARK_USED(index, current_order, area);
 		zone->free_pages -= 1UL << order;
-		page = expand(zone, page, index, order, current_order, area);
-		return page;
+		return expand(zone, page, index, order, current_order, area);
 	}
 
 	return NULL;
 }
 
-/* Obtain a single element from the buddy allocator */
-static struct page *rmqueue(struct zone *zone, unsigned int order)
-{
-	unsigned long flags;
-	struct page *page;
-
-	spin_lock_irqsave(&zone->lock, flags);
-	page = __rmqueue(zone, order);
-	spin_unlock_irqrestore(&zone->lock, flags);
-
-	if (page != NULL) {
-		BUG_ON(bad_range(zone, page));
-		prep_new_page(page);
-	}
-	return page;
-}
-
 /*
  * Obtain a specified number of elements from the buddy allocator, all under
  * a single hold of the lock, for efficiency.  Add them to the supplied list.
@@ -341,6 +326,72 @@ int is_head_of_free_region(struct page *page)
 #endif	/* CONFIG_SOFTWARE_SUSPEND */
 
 /*
+ * Free a 0-order page
+ */
+static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
+static void free_hot_cold_page(struct page *page, int cold)
+{
+	struct zone *zone = page_zone(page);
+	struct per_cpu_pages *pcp;
+	unsigned long flags;
+
+	free_pages_check(__FUNCTION__, page);
+	pcp = &zone->pageset[get_cpu()].pcp[cold];
+	local_irq_save(flags);
+	if (pcp->count >= pcp->high)
+		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+	list_add(&page->list, &pcp->list);
+	pcp->count++;
+	local_irq_restore(flags);
+	put_cpu();
+}
+
+void free_hot_page(struct page *page)
+{
+	free_hot_cold_page(page, 0);
+}
+
+void free_cold_page(struct page *page)
+{
+	free_hot_cold_page(page, 1);
+}
+
+static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
+{
+	unsigned long flags;
+	struct page *page = NULL;
+
+	if (order == 0) {
+		struct per_cpu_pages *pcp;
+
+		pcp = &zone->pageset[get_cpu()].pcp[cold];
+		local_irq_save(flags);
+		if (pcp->count <= pcp->low)
+			pcp->count += rmqueue_bulk(zone, 0,
+						pcp->batch, &pcp->list);
+		if (pcp->count) {
+			page = list_entry(pcp->list.next, struct page, list);
+			list_del(&page->list);
+			pcp->count--;
+		}
+		local_irq_restore(flags);
+		put_cpu();
+	}
+
+	if (page == NULL) {
+		spin_lock_irqsave(&zone->lock, flags);
+		page = __rmqueue(zone, order);
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+
+	if (page != NULL) {
+		BUG_ON(bad_range(zone, page));
+		prep_new_page(page);
+	}
+	return page;
+}
+
+/*
  * This is the 'heart' of the zoned buddy allocator:
  */
 struct page *
@@ -349,13 +400,18 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 {
 	unsigned long min;
 	struct zone **zones, *classzone;
-	struct page * page;
+	struct page *page;
 	int cflags;
 	int i;
+	int cold;
 
 	if (gfp_mask & __GFP_WAIT)
 		might_sleep();
 
+	cold = 0;
+	if (gfp_mask & __GFP_COLD)
+		cold = 1;
+
 	mod_page_state(pgalloc, 1<<order);
 
 	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
@@ -371,7 +427,7 @@
 		/* the incremental min is allegedly to discourage fallback */
 		min += z->pages_low;
 		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
+			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				return page;
 		}
@@ -396,7 +452,7 @@
 			local_min >>= 2;
 		min += local_min;
 		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
+			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				return page;
 		}
@@ -410,7 +466,7 @@ rebalance:
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
 
-		page = rmqueue(z, order);
+		page = buffered_rmqueue(z, order, cold);
 		if (page)
 			return page;
 	}
@@ -440,7 +496,7 @@ nopage:
 
 		min += z->pages_min;
 		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
+			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				return page;
 		}
@@ -492,13 +548,17 @@ void __pagevec_free(struct pagevec *pvec)
 	int i = pagevec_count(pvec);
 
 	while (--i >= 0)
-		__free_pages_ok(pvec->pages[i], 0);
+		free_hot_page(pvec->pages[i]);
 }
 
 void __free_pages(struct page *page, unsigned int order)
 {
-	if (!PageReserved(page) && put_page_testzero(page))
-		__free_pages_ok(page, order);
+	if (!PageReserved(page) && put_page_testzero(page)) {
+		if (order == 0)
+			free_hot_page(page);
+		else
+			__free_pages_ok(page, order);
+	}
 }
 
 void
 free_pages(unsigned long addr, unsigned int order)
@@ -899,7 +959,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 	unsigned long i, j;
 	unsigned long local_offset;
 	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
-	int nid = pgdat->node_id;
+	int cpu, nid = pgdat->node_id;
 	struct page *lmem_map = pgdat->node_mem_map;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
@@ -911,13 +971,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long mask;
 		unsigned long size, realsize;
+		unsigned long batch;
 
 		zone_table[nid * MAX_NR_ZONES + j] = zone;
 		realsize = size = zones_size[j];
 		if (zholes_size)
 			realsize -= zholes_size[j];
 
-		printk("  %s zone: %lu pages\n", zone_names[j], realsize);
 		zone->spanned_pages = size;
 		zone->present_pages = realsize;
 		zone->name = zone_names[j];
@@ -925,6 +985,40 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		spin_lock_init(&zone->lru_lock);
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
+
+		/*
+		 * The per-cpu-pages pools are set to around 1000th of the
+		 * size of the zone.  But no more than 1/4 of a meg - there's
+		 * no point in going beyond the size of L2 cache.
+		 *
+		 * OK, so we don't know how big the cache is.  So guess.
+		 */
+		batch = zone->present_pages / 1024;
+		if (batch * PAGE_SIZE > 256 * 1024)
+			batch = (256 * 1024) / PAGE_SIZE;
+		batch /= 4;		/* We effectively *= 4 below */
+		if (batch < 1)
+			batch = 1;
+
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			struct per_cpu_pages *pcp;
+
+			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
+			pcp->count = 0;
+			pcp->low = 2 * batch;
+			pcp->high = 6 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+
+			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
+			pcp->count = 0;
+			pcp->low = 0;
+			pcp->high = 2 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+		}
+		printk("  %s zone: %lu pages, LIFO batch:%lu\n",
+				zone_names[j], realsize, batch);
 		INIT_LIST_HEAD(&zone->active_list);
 		INIT_LIST_HEAD(&zone->inactive_list);
 		atomic_set(&zone->refill_counter, 0);
diff --git a/mm/swap.c b/mm/swap.c
index 72f4c9cdd5c4..225e24f1973d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -69,7 +69,8 @@ void lru_add_drain(void)
 }
 
 /*
- * This path almost never happens - pages are normally freed via pagevecs.
+ * This path almost never happens for VM activity - pages are normally
+ * freed via pagevecs.  But it gets used by networking.
  */
 void __page_cache_release(struct page *page)
 {
@@ -83,7 +84,7 @@ void __page_cache_release(struct page *page)
 		page = NULL;
 	spin_unlock_irqrestore(&zone->lru_lock, flags);
 	if (page)
-		__free_pages_ok(page, 0);
+		free_hot_page(page);
 }
 
 /*
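For reference, the pool sizing in `free_area_init_core()` above is a guess-at-the-L2-size heuristic: roughly 1/1000th of the zone, capped at 256KB worth of pages, then divided by 4 because the watermarks scale it back up (hot list: `low = 2 * batch`, `high = 6 * batch`; cold list: `low = 0`, `high = 2 * batch`). The standalone sketch below just replays that arithmetic so the resulting per-CPU watermarks can be eyeballed; the 4KB page size and the example zone sizes are assumptions for illustration, not values taken from the patch.

```c
#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed page size for the example */

/* Replays the batch calculation from free_area_init_core() in this patch. */
static unsigned long pcp_batch(unsigned long present_pages)
{
	unsigned long batch = present_pages / 1024;

	if (batch * PAGE_SIZE > 256 * 1024)	/* cap at 1/4 MB worth of pages */
		batch = (256 * 1024) / PAGE_SIZE;
	batch /= 4;				/* the watermarks scale it back up */
	if (batch < 1)
		batch = 1;
	return batch;
}

int main(void)
{
	/* Hypothetical zone sizes in pages: roughly 16MB, 880MB, 3GB. */
	unsigned long zones[] = { 4096, 225280, 786432 };
	unsigned long i;

	for (i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
		unsigned long batch = pcp_batch(zones[i]);

		printf("zone of %7lu pages: batch=%2lu  hot low=%3lu high=%3lu  "
		       "cold low=0 high=%3lu\n",
		       zones[i], batch, 2 * batch, 6 * batch, 2 * batch);
	}
	return 0;
}
```

The cold list gets tighter watermarks than the hot list because cold pages are only worth buffering enough to amortise the lock, not to preserve cache warmth.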
