diff options
Diffstat (limited to 'mm/page_alloc.c')
| -rw-r--r-- | mm/page_alloc.c | 380 | 
1 files changed, 239 insertions, 141 deletions
| diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e2ef1c17942f..a919ba5cb3c8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -20,7 +20,6 @@  #include <linux/interrupt.h>  #include <linux/pagemap.h>  #include <linux/jiffies.h> -#include <linux/bootmem.h>  #include <linux/memblock.h>  #include <linux/compiler.h>  #include <linux/kernel.h> @@ -66,6 +65,7 @@  #include <linux/ftrace.h>  #include <linux/lockdep.h>  #include <linux/nmi.h> +#include <linux/psi.h>  #include <asm/sections.h>  #include <asm/tlbflush.h> @@ -306,24 +306,33 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)  }  /* - * Returns false when the remaining initialisation should be deferred until + * Returns true when the remaining initialisation should be deferred until   * later in the boot cycle when it can be parallelised.   */ -static inline bool update_defer_init(pg_data_t *pgdat, -				unsigned long pfn, unsigned long zone_end, -				unsigned long *nr_initialised) +static bool __meminit +defer_init(int nid, unsigned long pfn, unsigned long end_pfn)  { +	static unsigned long prev_end_pfn, nr_initialised; + +	/* +	 * prev_end_pfn static that contains the end of previous zone +	 * No need to protect because called very early in boot before smp_init. +	 */ +	if (prev_end_pfn != end_pfn) { +		prev_end_pfn = end_pfn; +		nr_initialised = 0; +	} +  	/* Always populate low zones for address-constrained allocations */ -	if (zone_end < pgdat_end_pfn(pgdat)) -		return true; -	(*nr_initialised)++; -	if ((*nr_initialised > pgdat->static_init_pgcnt) && -	    (pfn & (PAGES_PER_SECTION - 1)) == 0) { -		pgdat->first_deferred_pfn = pfn; +	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))  		return false; +	nr_initialised++; +	if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) && +	    (pfn & (PAGES_PER_SECTION - 1)) == 0) { +		NODE_DATA(nid)->first_deferred_pfn = pfn; +		return true;  	} - -	return true; +	return false;  }  #else  static inline bool early_page_uninitialised(unsigned long pfn) @@ -331,11 +340,9 @@ static inline bool early_page_uninitialised(unsigned long pfn)  	return false;  } -static inline bool update_defer_init(pg_data_t *pgdat, -				unsigned long pfn, unsigned long zone_end, -				unsigned long *nr_initialised) +static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)  { -	return true; +	return false;  }  #endif @@ -1231,7 +1238,12 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)  			/* Avoid false-positive PageTail() */  			INIT_LIST_HEAD(&page->lru); -			SetPageReserved(page); +			/* +			 * no need for atomic set_bit because the struct +			 * page is not visible yet so nobody should +			 * access it yet. +			 */ +			__SetPageReserved(page);  		}  	}  } @@ -1326,7 +1338,7 @@ meminit_pfn_in_nid(unsigned long pfn, int node,  #endif -void __init __free_pages_bootmem(struct page *page, unsigned long pfn, +void __init memblock_free_pages(struct page *page, unsigned long pfn,  							unsigned int order)  {  	if (early_page_uninitialised(pfn)) @@ -2015,10 +2027,6 @@ static int move_freepages(struct zone *zone,  	          pfn_valid(page_to_pfn(end_page)) &&  	          page_zone(start_page) != page_zone(end_page));  #endif - -	if (num_movable) -		*num_movable = 0; -  	for (page = start_page; page <= end_page;) {  		if (!pfn_valid_within(page_to_pfn(page))) {  			page++; @@ -2058,6 +2066,9 @@ int move_freepages_block(struct zone *zone, struct page *page,  	unsigned long start_pfn, end_pfn;  	struct page *start_page, *end_page; +	if (num_movable) +		*num_movable = 0; +  	start_pfn = page_to_pfn(page);  	start_pfn = start_pfn & ~(pageblock_nr_pages-1);  	start_page = pfn_to_page(start_pfn); @@ -3366,26 +3377,12 @@ try_this_zone:  	return NULL;  } -/* - * Large machines with many possible nodes should not always dump per-node - * meminfo in irq context. - */ -static inline bool should_suppress_show_mem(void) -{ -	bool ret = false; - -#if NODES_SHIFT > 8 -	ret = in_interrupt(); -#endif -	return ret; -} -  static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)  {  	unsigned int filter = SHOW_MEM_FILTER_NODES;  	static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); -	if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) +	if (!__ratelimit(&show_mem_rs))  		return;  	/* @@ -3549,15 +3546,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,  		enum compact_priority prio, enum compact_result *compact_result)  {  	struct page *page; +	unsigned long pflags;  	unsigned int noreclaim_flag;  	if (!order)  		return NULL; +	psi_memstall_enter(&pflags);  	noreclaim_flag = memalloc_noreclaim_save(); +  	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,  									prio); +  	memalloc_noreclaim_restore(noreclaim_flag); +	psi_memstall_leave(&pflags);  	if (*compact_result <= COMPACT_INACTIVE)  		return NULL; @@ -3756,11 +3758,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,  	struct reclaim_state reclaim_state;  	int progress;  	unsigned int noreclaim_flag; +	unsigned long pflags;  	cond_resched();  	/* We now go into synchronous reclaim */  	cpuset_memory_pressure_bump(); +	psi_memstall_enter(&pflags);  	fs_reclaim_acquire(gfp_mask);  	noreclaim_flag = memalloc_noreclaim_save();  	reclaim_state.reclaimed_slab = 0; @@ -3772,6 +3776,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,  	current->reclaim_state = NULL;  	memalloc_noreclaim_restore(noreclaim_flag);  	fs_reclaim_release(gfp_mask); +	psi_memstall_leave(&pflags);  	cond_resched(); @@ -3922,6 +3927,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,  {  	struct zone *zone;  	struct zoneref *z; +	bool ret = false;  	/*  	 * Costly allocations might have made a progress but this doesn't mean @@ -3985,25 +3991,24 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,  				}  			} -			/* -			 * Memory allocation/reclaim might be called from a WQ -			 * context and the current implementation of the WQ -			 * concurrency control doesn't recognize that -			 * a particular WQ is congested if the worker thread is -			 * looping without ever sleeping. Therefore we have to -			 * do a short sleep here rather than calling -			 * cond_resched(). -			 */ -			if (current->flags & PF_WQ_WORKER) -				schedule_timeout_uninterruptible(1); -			else -				cond_resched(); - -			return true; +			ret = true; +			goto out;  		}  	} -	return false; +out: +	/* +	 * Memory allocation/reclaim might be called from a WQ context and the +	 * current implementation of the WQ concurrency control doesn't +	 * recognize that a particular WQ is congested if the worker thread is +	 * looping without ever sleeping. Therefore we have to do a short sleep +	 * here rather than calling cond_resched(). +	 */ +	if (current->flags & PF_WQ_WORKER) +		schedule_timeout_uninterruptible(1); +	else +		cond_resched(); +	return ret;  }  static inline bool @@ -4701,6 +4706,7 @@ long si_mem_available(void)  	unsigned long pagecache;  	unsigned long wmark_low = 0;  	unsigned long pages[NR_LRU_LISTS]; +	unsigned long reclaimable;  	struct zone *zone;  	int lru; @@ -4726,19 +4732,13 @@ long si_mem_available(void)  	available += pagecache;  	/* -	 * Part of the reclaimable slab consists of items that are in use, -	 * and cannot be freed. Cap this estimate at the low watermark. +	 * Part of the reclaimable slab and other kernel memory consists of +	 * items that are in use, and cannot be freed. Cap this estimate at the +	 * low watermark.  	 */ -	available += global_node_page_state(NR_SLAB_RECLAIMABLE) - -		     min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, -			 wmark_low); - -	/* -	 * Part of the kernel memory, which can be released under memory -	 * pressure. -	 */ -	available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> -		PAGE_SHIFT; +	reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) + +			global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); +	available += reclaimable - min(reclaimable / 2, wmark_low);  	if (available < 0)  		available = 0; @@ -5449,76 +5449,151 @@ void __ref build_all_zonelists(pg_data_t *pgdat)  #endif  } +/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */ +static bool __meminit +overlap_memmap_init(unsigned long zone, unsigned long *pfn) +{ +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +	static struct memblock_region *r; + +	if (mirrored_kernelcore && zone == ZONE_MOVABLE) { +		if (!r || *pfn >= memblock_region_memory_end_pfn(r)) { +			for_each_memblock(memory, r) { +				if (*pfn < memblock_region_memory_end_pfn(r)) +					break; +			} +		} +		if (*pfn >= memblock_region_memory_base_pfn(r) && +		    memblock_is_mirror(r)) { +			*pfn = memblock_region_memory_end_pfn(r); +			return true; +		} +	} +#endif +	return false; +} +  /*   * Initially all pages are reserved - free ones are freed - * up by free_all_bootmem() once the early boot process is + * up by memblock_free_all() once the early boot process is   * done. Non-atomic initialization, single-pass.   */  void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,  		unsigned long start_pfn, enum memmap_context context,  		struct vmem_altmap *altmap)  { -	unsigned long end_pfn = start_pfn + size; -	pg_data_t *pgdat = NODE_DATA(nid); -	unsigned long pfn; -	unsigned long nr_initialised = 0; +	unsigned long pfn, end_pfn = start_pfn + size;  	struct page *page; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -	struct memblock_region *r = NULL, *tmp; -#endif  	if (highest_memmap_pfn < end_pfn - 1)  		highest_memmap_pfn = end_pfn - 1; +#ifdef CONFIG_ZONE_DEVICE  	/*  	 * Honor reservation requested by the driver for this ZONE_DEVICE -	 * memory +	 * memory. We limit the total number of pages to initialize to just +	 * those that might contain the memory mapping. We will defer the +	 * ZONE_DEVICE page initialization until after we have released +	 * the hotplug lock.  	 */ -	if (altmap && start_pfn == altmap->base_pfn) -		start_pfn += altmap->reserve; +	if (zone == ZONE_DEVICE) { +		if (!altmap) +			return; + +		if (start_pfn == altmap->base_pfn) +			start_pfn += altmap->reserve; +		end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); +	} +#endif  	for (pfn = start_pfn; pfn < end_pfn; pfn++) {  		/*  		 * There can be holes in boot-time mem_map[]s handed to this  		 * function.  They do not exist on hotplugged memory.  		 */ -		if (context != MEMMAP_EARLY) -			goto not_early; - -		if (!early_pfn_valid(pfn)) -			continue; -		if (!early_pfn_in_nid(pfn, nid)) -			continue; -		if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) -			break; - -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -		/* -		 * Check given memblock attribute by firmware which can affect -		 * kernel memory layout.  If zone==ZONE_MOVABLE but memory is -		 * mirrored, it's an overlapped memmap init. skip it. -		 */ -		if (mirrored_kernelcore && zone == ZONE_MOVABLE) { -			if (!r || pfn >= memblock_region_memory_end_pfn(r)) { -				for_each_memblock(memory, tmp) -					if (pfn < memblock_region_memory_end_pfn(tmp)) -						break; -				r = tmp; -			} -			if (pfn >= memblock_region_memory_base_pfn(r) && -			    memblock_is_mirror(r)) { -				/* already initialized as NORMAL */ -				pfn = memblock_region_memory_end_pfn(r); +		if (context == MEMMAP_EARLY) { +			if (!early_pfn_valid(pfn))  				continue; -			} +			if (!early_pfn_in_nid(pfn, nid)) +				continue; +			if (overlap_memmap_init(zone, &pfn)) +				continue; +			if (defer_init(nid, pfn, end_pfn)) +				break;  		} -#endif -not_early:  		page = pfn_to_page(pfn);  		__init_single_page(page, pfn, zone, nid);  		if (context == MEMMAP_HOTPLUG) -			SetPageReserved(page); +			__SetPageReserved(page); + +		/* +		 * Mark the block movable so that blocks are reserved for +		 * movable at startup. This will force kernel allocations +		 * to reserve their blocks rather than leaking throughout +		 * the address space during boot when many long-lived +		 * kernel allocations are made. +		 * +		 * bitmap is created for zone's valid pfn range. but memmap +		 * can be created for invalid pages (for alignment) +		 * check here not to call set_pageblock_migratetype() against +		 * pfn out of zone. +		 */ +		if (!(pfn & (pageblock_nr_pages - 1))) { +			set_pageblock_migratetype(page, MIGRATE_MOVABLE); +			cond_resched(); +		} +	} +} + +#ifdef CONFIG_ZONE_DEVICE +void __ref memmap_init_zone_device(struct zone *zone, +				   unsigned long start_pfn, +				   unsigned long size, +				   struct dev_pagemap *pgmap) +{ +	unsigned long pfn, end_pfn = start_pfn + size; +	struct pglist_data *pgdat = zone->zone_pgdat; +	unsigned long zone_idx = zone_idx(zone); +	unsigned long start = jiffies; +	int nid = pgdat->node_id; + +	if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone))) +		return; + +	/* +	 * The call to memmap_init_zone should have already taken care +	 * of the pages reserved for the memmap, so we can just jump to +	 * the end of that region and start processing the device pages. +	 */ +	if (pgmap->altmap_valid) { +		struct vmem_altmap *altmap = &pgmap->altmap; + +		start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); +		size = end_pfn - start_pfn; +	} + +	for (pfn = start_pfn; pfn < end_pfn; pfn++) { +		struct page *page = pfn_to_page(pfn); + +		__init_single_page(page, pfn, zone_idx, nid); + +		/* +		 * Mark page reserved as it will need to wait for onlining +		 * phase for it to be fully associated with a zone. +		 * +		 * We can use the non-atomic __set_bit operation for setting +		 * the flag as we are still initializing the pages. +		 */ +		__SetPageReserved(page); + +		/* +		 * ZONE_DEVICE pages union ->lru with a ->pgmap back +		 * pointer and hmm_data.  It is a bug if a ZONE_DEVICE +		 * page is ever freed or placed on a driver-private list. +		 */ +		page->pgmap = pgmap; +		page->hmm_data = 0;  		/*  		 * Mark the block movable so that blocks are reserved for @@ -5540,8 +5615,12 @@ not_early:  			cond_resched();  		}  	} + +	pr_info("%s initialised, %lu pages in %ums\n", dev_name(pgmap->dev), +		size, jiffies_to_msecs(jiffies - start));  } +#endif  static void __meminit zone_init_free_lists(struct zone *zone)  {  	unsigned int order, t; @@ -5551,10 +5630,11 @@ static void __meminit zone_init_free_lists(struct zone *zone)  	}  } -#ifndef __HAVE_ARCH_MEMMAP_INIT -#define memmap_init(size, nid, zone, start_pfn) \ -	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL) -#endif +void __meminit __weak memmap_init(unsigned long size, int nid, +				  unsigned long zone, unsigned long start_pfn) +{ +	memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL); +}  static int zone_batchsize(struct zone *zone)  { @@ -6128,7 +6208,7 @@ static void __ref setup_usemap(struct pglist_data *pgdat,  	zone->pageblock_flags = NULL;  	if (usemapsize)  		zone->pageblock_flags = -			memblock_virt_alloc_node_nopanic(usemapsize, +			memblock_alloc_node_nopanic(usemapsize,  							 pgdat->node_id);  }  #else @@ -6358,7 +6438,7 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)  		end = pgdat_end_pfn(pgdat);  		end = ALIGN(end, MAX_ORDER_NR_PAGES);  		size =  (end - start) * sizeof(struct page); -		map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); +		map = memblock_alloc_node_nopanic(size, pgdat->node_id);  		pgdat->node_mem_map = map + offset;  	}  	pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", @@ -6427,48 +6507,67 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,  	free_area_init_core(pgdat);  } -#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP) +#if !defined(CONFIG_FLAT_NODE_MEM_MAP) +/* + * Zero all valid struct pages in range [spfn, epfn), return number of struct + * pages zeroed + */ +static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn) +{ +	unsigned long pfn; +	u64 pgcnt = 0; + +	for (pfn = spfn; pfn < epfn; pfn++) { +		if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { +			pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) +				+ pageblock_nr_pages - 1; +			continue; +		} +		mm_zero_struct_page(pfn_to_page(pfn)); +		pgcnt++; +	} + +	return pgcnt; +} +  /*   * Only struct pages that are backed by physical memory are zeroed and   * initialized by going through __init_single_page(). But, there are some   * struct pages which are reserved in memblock allocator and their fields   * may be accessed (for example page_to_pfn() on some configuration accesses   * flags). We must explicitly zero those struct pages. + * + * This function also addresses a similar issue where struct pages are left + * uninitialized because the physical address range is not covered by + * memblock.memory or memblock.reserved. That could happen when memblock + * layout is manually configured via memmap=.   */  void __init zero_resv_unavail(void)  {  	phys_addr_t start, end; -	unsigned long pfn;  	u64 i, pgcnt; +	phys_addr_t next = 0;  	/* -	 * Loop through ranges that are reserved, but do not have reported -	 * physical memory backing. +	 * Loop through unavailable ranges not covered by memblock.memory.  	 */  	pgcnt = 0; -	for_each_resv_unavail_range(i, &start, &end) { -		for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { -			if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { -				pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) -					+ pageblock_nr_pages - 1; -				continue; -			} -			mm_zero_struct_page(pfn_to_page(pfn)); -			pgcnt++; -		} +	for_each_mem_range(i, &memblock.memory, NULL, +			NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) { +		if (next < start) +			pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start)); +		next = end;  	} +	pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);  	/*  	 * Struct pages that do not have backing memory. This could be because  	 * firmware is using some of this memory, or for some other reasons. -	 * Once memblock is changed so such behaviour is not allowed: i.e. -	 * list of "reserved" memory must be a subset of list of "memory", then -	 * this code can be removed.  	 */  	if (pgcnt) -		pr_info("Reserved but unavailable: %lld pages", pgcnt); +		pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);  } -#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */ +#endif /* !CONFIG_FLAT_NODE_MEM_MAP */  #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -6803,15 +6902,12 @@ static void check_for_memory(pg_data_t *pgdat, int nid)  {  	enum zone_type zone_type; -	if (N_MEMORY == N_NORMAL_MEMORY) -		return; -  	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {  		struct zone *zone = &pgdat->node_zones[zone_type];  		if (populated_zone(zone)) { -			node_set_state(nid, N_HIGH_MEMORY); -			if (N_NORMAL_MEMORY != N_HIGH_MEMORY && -			    zone_type <= ZONE_NORMAL) +			if (IS_ENABLED(CONFIG_HIGHMEM)) +				node_set_state(nid, N_HIGH_MEMORY); +			if (zone_type <= ZONE_NORMAL)  				node_set_state(nid, N_NORMAL_MEMORY);  			break;  		} @@ -7614,9 +7710,11 @@ void *__init alloc_large_system_hash(const char *tablename,  		size = bucketsize << log2qty;  		if (flags & HASH_EARLY) {  			if (flags & HASH_ZERO) -				table = memblock_virt_alloc_nopanic(size, 0); +				table = memblock_alloc_nopanic(size, +							       SMP_CACHE_BYTES);  			else -				table = memblock_virt_alloc_raw(size, 0); +				table = memblock_alloc_raw(size, +							   SMP_CACHE_BYTES);  		} else if (hashdist) {  			table = __vmalloc(size, gfp_flags, PAGE_KERNEL);  		} else { | 
