| field | value | date |
|---|---|---|
| author | Andrew Morton <akpm@digeo.com> | 2002-10-29 23:35:53 -0800 |
| committer | Linus Torvalds <torvalds@penguin.transmeta.com> | 2002-10-29 23:35:53 -0800 |
| commit | a206231bbe6ffb988cdf9fcbdfd98e49abaf4819 (patch) | |
| tree | 640f23350e83ff491f5cc970e79e9cd619704f6f | |
| parent | 1d2652dd2c3e942e75dc3137b3cb1774b43ae377 (diff) | |
[PATCH] hot-n-cold pages: page allocator core
Hot/Cold pages and zone->lock amortisation
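The mechanism is easiest to see outside the kernel. The sketch below is a minimal userspace model of the per-CPU pools this patch introduces: a `per_cpu_pages` structure with `count`/`low`/`high`/`batch` watermarks, the drain rule of `free_hot_cold_page()`, and the order-0 path of `buffered_rmqueue()`. The buddy allocator is reduced to a counter, and `refill_from_buddy()`/`drain_to_buddy()` are illustrative stand-ins for `rmqueue_bulk()`/`free_pages_bulk()`, which in the real patch take `zone->lock` once per batch. Nothing here is kernel code.

```c
#include <stdio.h>

/* Mirrors the watermark fields of the patch's struct per_cpu_pages. */
struct per_cpu_pages {
	int count;	/* pages currently queued on this CPU's list */
	int low;	/* refill from the buddy lists below this */
	int high;	/* drain back to the buddy lists above this */
	int batch;	/* pages moved per zone->lock round trip */
};

static int buddy_free_pages = 100000;	/* stand-in for zone->free_pages */
static int lock_round_trips;		/* times we would have taken zone->lock */

/* Stand-in for free_pages_bulk(): one lock acquisition, 'count' pages back. */
static int drain_to_buddy(int count)
{
	lock_round_trips++;
	buddy_free_pages += count;
	return count;
}

/* Stand-in for rmqueue_bulk(): one lock acquisition, 'count' pages out. */
static int refill_from_buddy(int count)
{
	lock_round_trips++;
	buddy_free_pages -= count;
	return count;
}

/* free_hot_cold_page(): drain a batch only once the high watermark is hit. */
static void free_one_page(struct per_cpu_pages *pcp)
{
	if (pcp->count >= pcp->high)
		pcp->count -= drain_to_buddy(pcp->batch);
	pcp->count++;
}

/* buffered_rmqueue(), order-0 path: refill a batch only below the low mark. */
static void alloc_one_page(struct per_cpu_pages *pcp)
{
	if (pcp->count <= pcp->low)
		pcp->count += refill_from_buddy(pcp->batch);
	if (pcp->count)
		pcp->count--;
}

int main(void)
{
	/* Hot-list watermarks as initialised by the patch, with batch = 16. */
	struct per_cpu_pages hot = { .count = 0, .low = 32, .high = 96, .batch = 16 };
	int i;

	for (i = 0; i < 200; i++)
		alloc_one_page(&hot);
	for (i = 0; i < 200; i++)
		free_one_page(&hot);

	printf("queued on pcp: %d, buddy free: %d, lock round trips: %d\n",
	       hot.count, buddy_free_pages, lock_round_trips);
	return 0;
}
```

With `batch = 16`, the 400 page operations above touch the (simulated) buddy lists only a few dozen times rather than once each, which is the zone->lock amortisation the commit title refers to; the hot/cold split on top of that lets callers keep cache-warm pages separate from cache-cold ones.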
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | include/linux/gfp.h | 7 |
| -rw-r--r-- | include/linux/mm.h | 1 |
| -rw-r--r-- | include/linux/mmzone.h | 17 |
| -rw-r--r-- | mm/page_alloc.c | 160 |
| -rw-r--r-- | mm/swap.c | 5 |

5 files changed, 151 insertions, 39 deletions
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c340b447a963..8e093813e4f7 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -17,6 +17,7 @@
 #define __GFP_IO	0x40	/* Can start low memory physical IO? */
 #define __GFP_HIGHIO	0x80	/* Can start high mem physical IO? */
 #define __GFP_FS	0x100	/* Can call down to low-level FS? */
+#define __GFP_COLD	0x200	/* Cache-cold page required */
 
 #define GFP_NOHIGHIO	( __GFP_WAIT | __GFP_IO)
 #define GFP_NOIO	( __GFP_WAIT)
@@ -32,6 +33,7 @@
 #define GFP_DMA		__GFP_DMA
 
+
 /*
  * There is only one page-allocator function, and two main namespaces to
  * it. The alloc_page*() variants return 'struct page *' and as such
@@ -77,11 +79,10 @@ extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask));
 #define __get_dma_pages(gfp_mask, order) \
 		__get_free_pages((gfp_mask) | GFP_DMA,(order))
 
-/*
- * There is only one 'core' page-freeing function.
- */
 extern void FASTCALL(__free_pages(struct page *page, unsigned int order));
 extern void FASTCALL(free_pages(unsigned long addr, unsigned int order));
+extern void FASTCALL(free_hot_page(struct page *page));
+extern void FASTCALL(free_cold_page(struct page *page));
 
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr),0)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cab2c4342047..d9d2f20732d4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -211,7 +211,6 @@ struct page {
 #define set_page_count(p,v)	atomic_set(&(p)->count, v)
 
 extern void FASTCALL(__page_cache_release(struct page *));
-void FASTCALL(__free_pages_ok(struct page *page, unsigned int order));
 
 static inline void put_page(struct page *page)
 {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 10c4ee968020..d80490b1265c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -9,6 +9,7 @@
 #include <linux/list.h>
 #include <linux/wait.h>
 #include <linux/cache.h>
+#include <linux/threads.h>
 #include <asm/atomic.h>
 #ifdef CONFIG_DISCONTIGMEM
 #include <asm/numnodes.h>
@@ -46,6 +47,18 @@ struct zone_padding {
 #define ZONE_PADDING(name)
 #endif
 
+struct per_cpu_pages {
+	int count;		/* number of pages in the list */
+	int low;		/* low watermark, refill needed */
+	int high;		/* high watermark, emptying needed */
+	int batch;		/* chunk size for buddy add/remove */
+	struct list_head list;	/* the list of pages */
+};
+
+struct per_cpu_pageset {
+	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
+} ____cacheline_aligned_in_smp;
+
 /*
  * On machines where it is needed (eg PCs) we divide physical memory
  * into multiple physical zones. On a PC we have 3 zones:
@@ -107,6 +120,10 @@ struct zone {
 	unsigned long		wait_table_size;
 	unsigned long		wait_table_bits;
 
+	ZONE_PADDING(_pad3_)
+
+	struct per_cpu_pageset	pageset[NR_CPUS];
+
 	/*
 	 * Discontig memory support fields.
 	 */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dd35f4d7ac49..f46471b25586 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -10,6 +10,8 @@
  *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
  *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
+ *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
+ *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
  */
 
 #include <linux/config.h>
@@ -151,13 +153,14 @@ static inline void free_pages_check(const char *function, struct page *page)
  * Assumes all pages on list are in same zone, and of same order.
  * count is the number of pages to free, or 0 for all on the list.
  */
-static void
+static int
 free_pages_bulk(struct zone *zone, int count,
 		struct list_head *list, unsigned int order)
 {
 	unsigned long mask, flags;
 	struct free_area *area;
 	struct page *base, *page = NULL;
+	int ret = 0;
 
 	mask = (~0UL) << order;
 	base = zone->zone_mem_map;
@@ -169,8 +172,10 @@ free_pages_bulk(struct zone *zone, int count,
 		list_del(&page->list);
 		__free_pages_bulk(page, base, zone, area, mask, order);
 		mod_page_state(pgfree, count<<order);
+		ret++;
 	}
 	spin_unlock_irqrestore(&zone->lock, flags);
+	return ret;
 }
 
 void __free_pages_ok(struct page *page, unsigned int order)
@@ -201,14 +206,13 @@ expand(struct zone *zone, struct page *page,
 		index += size;
 		page += size;
 	}
-	BUG_ON(bad_range(zone, page));
 	return page;
 }
 
 /*
  * This page is about to be returned from the page allocator
  */
-static inline void prep_new_page(struct page *page)
+static void prep_new_page(struct page *page)
 {
 	if (	page->mapping ||
 		page_mapped(page) ||
@@ -248,36 +252,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
 			continue;
 
 		page = list_entry(curr, struct page, list);
-		BUG_ON(bad_range(zone, page));
 		list_del(curr);
 		index = page - zone->zone_mem_map;
 		if (current_order != MAX_ORDER-1)
 			MARK_USED(index, current_order, area);
 		zone->free_pages -= 1UL << order;
-		page = expand(zone, page, index, order, current_order, area);
-		return page;
+		return expand(zone, page, index, order, current_order, area);
 	}
 
 	return NULL;
 }
 
-/* Obtain a single element from the buddy allocator */
-static struct page *rmqueue(struct zone *zone, unsigned int order)
-{
-	unsigned long flags;
-	struct page *page;
-
-	spin_lock_irqsave(&zone->lock, flags);
-	page = __rmqueue(zone, order);
-	spin_unlock_irqrestore(&zone->lock, flags);
-
-	if (page != NULL) {
-		BUG_ON(bad_range(zone, page));
-		prep_new_page(page);
-	}
-	return page;
-}
-
 /*
  * Obtain a specified number of elements from the buddy allocator, all under
  * a single hold of the lock, for efficiency.  Add them to the supplied list.
@@ -341,6 +326,72 @@ int is_head_of_free_region(struct page *page)
 #endif	/* CONFIG_SOFTWARE_SUSPEND */
 
 /*
+ * Free a 0-order page
+ */
+static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
+static void free_hot_cold_page(struct page *page, int cold)
+{
+	struct zone *zone = page_zone(page);
+	struct per_cpu_pages *pcp;
+	unsigned long flags;
+
+	free_pages_check(__FUNCTION__, page);
+	pcp = &zone->pageset[get_cpu()].pcp[cold];
+	local_irq_save(flags);
+	if (pcp->count >= pcp->high)
+		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+	list_add(&page->list, &pcp->list);
+	pcp->count++;
+	local_irq_restore(flags);
+	put_cpu();
+}
+
+void free_hot_page(struct page *page)
+{
+	free_hot_cold_page(page, 0);
+}
+
+void free_cold_page(struct page *page)
+{
+	free_hot_cold_page(page, 1);
+}
+
+static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
+{
+	unsigned long flags;
+	struct page *page = NULL;
+
+	if (order == 0) {
+		struct per_cpu_pages *pcp;
+
+		pcp = &zone->pageset[get_cpu()].pcp[cold];
+		local_irq_save(flags);
+		if (pcp->count <= pcp->low)
+			pcp->count += rmqueue_bulk(zone, 0,
+						pcp->batch, &pcp->list);
+		if (pcp->count) {
+			page = list_entry(pcp->list.next, struct page, list);
+			list_del(&page->list);
+			pcp->count--;
+		}
+		local_irq_restore(flags);
+		put_cpu();
+	}
+
+	if (page == NULL) {
+		spin_lock_irqsave(&zone->lock, flags);
+		page = __rmqueue(zone, order);
+		spin_unlock_irqrestore(&zone->lock, flags);
+	}
+
+	if (page != NULL) {
+		BUG_ON(bad_range(zone, page));
+		prep_new_page(page);
+	}
+	return page;
+}
+
+/*
  * This is the 'heart' of the zoned buddy allocator:
  */
 struct page *
@@ -349,13 +400,18 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 {
 	unsigned long min;
 	struct zone **zones, *classzone;
-	struct page * page;
+	struct page *page;
 	int cflags;
 	int i;
+	int cold;
 
 	if (gfp_mask & __GFP_WAIT)
 		might_sleep();
 
+	cold = 0;
+	if (gfp_mask & __GFP_COLD)
+		cold = 1;
+
 	mod_page_state(pgalloc, 1<<order);
 
 	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
@@ -371,7 +427,7 @@
 		/* the incremental min is allegedly to discourage fallback */
 		min += z->pages_low;
 		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
+			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				return page;
 		}
@@ -396,7 +452,7 @@
 			local_min >>= 2;
 		min += local_min;
 		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
+			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				return page;
 		}
@@ -410,7 +466,7 @@ rebalance:
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
 
-		page = rmqueue(z, order);
+		page = buffered_rmqueue(z, order, cold);
 		if (page)
 			return page;
 	}
@@ -440,7 +496,7 @@ nopage:
 
 		min += z->pages_min;
 		if (z->free_pages > min || z->free_pages >= z->pages_high) {
-			page = rmqueue(z, order);
+			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				return page;
 		}
@@ -492,13 +548,17 @@ void __pagevec_free(struct pagevec *pvec)
 	int i = pagevec_count(pvec);
 
 	while (--i >= 0)
-		__free_pages_ok(pvec->pages[i], 0);
+		free_hot_page(pvec->pages[i]);
 }
 
 void __free_pages(struct page *page, unsigned int order)
 {
-	if (!PageReserved(page) && put_page_testzero(page))
-		__free_pages_ok(page, order);
+	if (!PageReserved(page) && put_page_testzero(page)) {
+		if (order == 0)
+			free_hot_page(page);
+		else
+			__free_pages_ok(page, order);
+	}
 }
 
 void
 free_pages(unsigned long addr, unsigned int order)
@@ -899,7 +959,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 	unsigned long i, j;
 	unsigned long local_offset;
 	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);
-	int nid = pgdat->node_id;
+	int cpu, nid = pgdat->node_id;
 	struct page *lmem_map = pgdat->node_mem_map;
 	unsigned long zone_start_pfn = pgdat->node_start_pfn;
@@ -911,13 +971,13 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long mask;
 		unsigned long size, realsize;
+		unsigned long batch;
 
 		zone_table[nid * MAX_NR_ZONES + j] = zone;
 		realsize = size = zones_size[j];
 		if (zholes_size)
 			realsize -= zholes_size[j];
 
-		printk("  %s zone: %lu pages\n", zone_names[j], realsize);
 		zone->spanned_pages = size;
 		zone->present_pages = realsize;
 		zone->name = zone_names[j];
@@ -925,6 +985,40 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		spin_lock_init(&zone->lru_lock);
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
+
+		/*
+		 * The per-cpu-pages pools are set to around 1000th of the
+		 * size of the zone.  But no more than 1/4 of a meg - there's
+		 * no point in going beyond the size of L2 cache.
+		 *
+		 * OK, so we don't know how big the cache is.  So guess.
+		 */
+		batch = zone->present_pages / 1024;
+		if (batch * PAGE_SIZE > 256 * 1024)
+			batch = (256 * 1024) / PAGE_SIZE;
+		batch /= 4;		/* We effectively *= 4 below */
+		if (batch < 1)
+			batch = 1;
+
+		for (cpu = 0; cpu < NR_CPUS; cpu++) {
+			struct per_cpu_pages *pcp;
+
+			pcp = &zone->pageset[cpu].pcp[0];	/* hot */
+			pcp->count = 0;
+			pcp->low = 2 * batch;
+			pcp->high = 6 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+
+			pcp = &zone->pageset[cpu].pcp[1];	/* cold */
+			pcp->count = 0;
+			pcp->low = 0;
+			pcp->high = 2 * batch;
+			pcp->batch = 1 * batch;
+			INIT_LIST_HEAD(&pcp->list);
+		}
+		printk("  %s zone: %lu pages, LIFO batch:%lu\n",
+				zone_names[j], realsize, batch);
 		INIT_LIST_HEAD(&zone->active_list);
 		INIT_LIST_HEAD(&zone->inactive_list);
 		atomic_set(&zone->refill_counter, 0);
diff --git a/mm/swap.c b/mm/swap.c
index 72f4c9cdd5c4..225e24f1973d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -69,7 +69,8 @@ void lru_add_drain(void)
 }
 
 /*
- * This path almost never happens - pages are normally freed via pagevecs.
+ * This path almost never happens for VM activity - pages are normally
+ * freed via pagevecs.  But it gets used by networking.
  */
 void __page_cache_release(struct page *page)
 {
@@ -83,7 +84,7 @@ void __page_cache_release(struct page *page)
 		page = NULL;
 	spin_unlock_irqrestore(&zone->lru_lock, flags);
 	if (page)
-		__free_pages_ok(page, 0);
+		free_hot_page(page);
 }
 
 /*
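For reference, the pool sizing in `free_area_init_core()` above is a guess-at-the-L2-size heuristic: roughly 1/1000th of the zone, capped at 256KB worth of pages, then divided by 4 because the watermarks scale it back up (hot list: `low = 2 * batch`, `high = 6 * batch`; cold list: `low = 0`, `high = 2 * batch`). The standalone sketch below just replays that arithmetic so the resulting per-CPU watermarks can be eyeballed; the 4KB page size and the example zone sizes are assumptions for illustration, not values taken from the patch.

```c
#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed page size for the example */

/* Replays the batch calculation from free_area_init_core() in this patch. */
static unsigned long pcp_batch(unsigned long present_pages)
{
	unsigned long batch = present_pages / 1024;

	if (batch * PAGE_SIZE > 256 * 1024)	/* cap at 1/4 MB worth of pages */
		batch = (256 * 1024) / PAGE_SIZE;
	batch /= 4;				/* the watermarks scale it back up */
	if (batch < 1)
		batch = 1;
	return batch;
}

int main(void)
{
	/* Hypothetical zone sizes in pages: roughly 16MB, 880MB, 3GB. */
	unsigned long zones[] = { 4096, 225280, 786432 };
	unsigned long i;

	for (i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
		unsigned long batch = pcp_batch(zones[i]);

		printf("zone of %7lu pages: batch=%2lu  hot low=%3lu high=%3lu  "
		       "cold low=0 high=%3lu\n",
		       zones[i], batch, 2 * batch, 6 * batch, 2 * batch);
	}
	return 0;
}
```

The cold list gets tighter watermarks than the hot list because cold pages are only worth buffering enough to amortise the lock, not to preserve cache warmth.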
