Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--   mm/page_alloc.c   243
1 file changed, 147 insertions, 96 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 600d9e981c23..822e05f1a964 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -99,9 +99,12 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
 /*
  * On SMP, spin_trylock is sufficient protection.
  * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ * Pass flags to a no-op inline function to typecheck and silence the unused
+ * variable warning.
  */
-#define pcp_trylock_prepare(flags)      do { } while (0)
-#define pcp_trylock_finish(flag)        do { } while (0)
+static inline void __pcp_trylock_noop(unsigned long *flags) { }
+#define pcp_trylock_prepare(flags)      __pcp_trylock_noop(&(flags))
+#define pcp_trylock_finish(flags)       __pcp_trylock_noop(&(flags))
 #else
 
 /* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
@@ -129,15 +132,6 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
  * Generic helper to lookup and a per-cpu variable with an embedded spinlock.
  * Return value should be used with equivalent unlock helper.
  */
-#define pcpu_spin_lock(type, member, ptr)                       \
-({                                                              \
-        type *_ret;                                             \
-        pcpu_task_pin();                                        \
-        _ret = this_cpu_ptr(ptr);                               \
-        spin_lock(&_ret->member);                               \
-        _ret;                                                   \
-})
-
 #define pcpu_spin_trylock(type, member, ptr)                    \
 ({                                                              \
         type *_ret;                                             \
@@ -157,14 +151,21 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
 })
 
 /* struct per_cpu_pages specific helpers. */
-#define pcp_spin_lock(ptr)                              \
-        pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
-
-#define pcp_spin_trylock(ptr)                           \
-        pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)
+#define pcp_spin_trylock(ptr, UP_flags)                         \
+({                                                              \
+        struct per_cpu_pages *__ret;                            \
+        pcp_trylock_prepare(UP_flags);                          \
+        __ret = pcpu_spin_trylock(struct per_cpu_pages, lock, ptr); \
+        if (!__ret)                                             \
+                pcp_trylock_finish(UP_flags);                   \
+        __ret;                                                  \
+})
 
-#define pcp_spin_unlock(ptr)                            \
-        pcpu_spin_unlock(lock, ptr)
+#define pcp_spin_unlock(ptr, UP_flags)                          \
+({                                                              \
+        pcpu_spin_unlock(lock, ptr);                            \
+        pcp_trylock_finish(UP_flags);                           \
+})
 
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
@@ -1822,14 +1823,9 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
          * If memory tags should be zeroed
          * (which happens only when memory should be initialized as well).
          */
-        if (zero_tags) {
-                /* Initialize both memory and memory tags. */
-                for (i = 0; i != 1 << order; ++i)
-                        tag_clear_highpage(page + i);
+        if (zero_tags)
+                init = !tag_clear_highpages(page, 1 << order);
 
-                /* Take note that memory was initialized by the loop above. */
-                init = false;
-        }
         if (!should_skip_kasan_unpoison(gfp_flags) &&
             kasan_unpoison_pages(page, order, init)) {
                 /* Take note that memory was initialized by KASAN. */
@@ -2557,10 +2553,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  * Called from the vmstat counter updater to decay the PCP high.
  * Return whether there are addition works to do.
  */
-int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
+bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
 {
-        int high_min, to_drain, batch;
-        int todo = 0;
+        int high_min, to_drain, to_drain_batched, batch;
+        bool todo = false;
 
         high_min = READ_ONCE(pcp->high_min);
         batch = READ_ONCE(pcp->batch);
@@ -2573,15 +2569,18 @@ int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
                 pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
                                  pcp->high - (pcp->high >> 3), high_min);
                 if (pcp->high > high_min)
-                        todo++;
+                        todo = true;
         }
 
         to_drain = pcp->count - pcp->high;
-        if (to_drain > 0) {
+        while (to_drain > 0) {
+                to_drain_batched = min(to_drain, batch);
                 spin_lock(&pcp->lock);
-                free_pcppages_bulk(zone, to_drain, pcp, 0);
+                free_pcppages_bulk(zone, to_drain_batched, pcp, 0);
                 spin_unlock(&pcp->lock);
-                todo++;
+                todo = true;
+
+                to_drain -= to_drain_batched;
         }
 
         return todo;
@@ -2815,12 +2814,22 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
         return high;
 }
 
-static void free_frozen_page_commit(struct zone *zone,
+/*
+ * Tune pcp alloc factor and adjust count & free_count. Free pages to bring the
+ * pcp's watermarks below high.
+ *
+ * May return a freed pcp, if during page freeing the pcp spinlock cannot be
+ * reacquired. Return true if pcp is locked, false otherwise.
+ */
+static bool free_frozen_page_commit(struct zone *zone,
                 struct per_cpu_pages *pcp, struct page *page, int migratetype,
-                unsigned int order, fpi_t fpi_flags)
+                unsigned int order, fpi_t fpi_flags, unsigned long *UP_flags)
 {
         int high, batch;
+        int to_free, to_free_batched;
         int pindex;
+        int cpu = smp_processor_id();
+        int ret = true;
         bool free_high = false;
 
         /*
@@ -2858,15 +2867,42 @@ static void free_frozen_page_commit(struct zone *zone,
                  * Do not attempt to take a zone lock. Let pcp->count get
                  * over high mark temporarily.
                  */
-                return;
+                return true;
         }
         high = nr_pcp_high(pcp, zone, batch, free_high);
         if (pcp->count < high)
-                return;
+                return true;
+
+        to_free = nr_pcp_free(pcp, batch, high, free_high);
+        while (to_free > 0 && pcp->count > 0) {
+                to_free_batched = min(to_free, batch);
+                free_pcppages_bulk(zone, to_free_batched, pcp, pindex);
+                to_free -= to_free_batched;
+
+                if (to_free == 0 || pcp->count == 0)
+                        break;
+
+                pcp_spin_unlock(pcp, *UP_flags);
+
+                pcp = pcp_spin_trylock(zone->per_cpu_pageset, *UP_flags);
+                if (!pcp) {
+                        ret = false;
+                        break;
+                }
+
+                /*
+                 * Check if this thread has been migrated to a different CPU.
+                 * If that is the case, give up and indicate that the pcp is
+                 * returned in an unlocked state.
+                 */
+                if (smp_processor_id() != cpu) {
+                        pcp_spin_unlock(pcp, *UP_flags);
+                        ret = false;
+                        break;
+                }
+        }
 
-        free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
-                           pcp, pindex);
         if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
             zone_watermark_ok(zone, 0, high_wmark_pages(zone),
                               ZONE_MOVABLE, 0)) {
@@ -2884,6 +2920,7 @@ static void free_frozen_page_commit(struct zone *zone,
                     next_memory_node(pgdat->node_id) < MAX_NUMNODES)
                         atomic_set(&pgdat->kswapd_failures, 0);
         }
+        return ret;
 }
 
 /*
@@ -2892,7 +2929,7 @@ static void free_frozen_page_commit(struct zone *zone,
 static void __free_frozen_pages(struct page *page, unsigned int order,
                                 fpi_t fpi_flags)
 {
-        unsigned long __maybe_unused UP_flags;
+        unsigned long UP_flags;
         struct per_cpu_pages *pcp;
         struct zone *zone;
         unsigned long pfn = page_to_pfn(page);
@@ -2928,15 +2965,15 @@ static void __free_frozen_pages(struct page *page, unsigned int order,
                         add_page_to_zone_llist(zone, page, order);
                 return;
         }
-        pcp_trylock_prepare(UP_flags);
-        pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+        pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
         if (pcp) {
-                free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
-                pcp_spin_unlock(pcp);
+                if (!free_frozen_page_commit(zone, pcp, page, migratetype,
+                                             order, fpi_flags, &UP_flags))
+                        return;
+                pcp_spin_unlock(pcp, UP_flags);
         } else {
                 free_one_page(zone, page, pfn, order, fpi_flags);
         }
-        pcp_trylock_finish(UP_flags);
 }
 
 void free_frozen_pages(struct page *page, unsigned int order)
@@ -2949,7 +2986,7 @@ void free_frozen_pages(struct page *page, unsigned int order)
  */
 void free_unref_folios(struct folio_batch *folios)
 {
-        unsigned long __maybe_unused UP_flags;
+        unsigned long UP_flags;
         struct per_cpu_pages *pcp = NULL;
         struct zone *locked_zone = NULL;
         int i, j;
@@ -2992,8 +3029,7 @@ void free_unref_folios(struct folio_batch *folios)
                 if (zone != locked_zone ||
                     is_migrate_isolate(migratetype)) {
                         if (pcp) {
-                                pcp_spin_unlock(pcp);
-                                pcp_trylock_finish(UP_flags);
+                                pcp_spin_unlock(pcp, UP_flags);
                                 locked_zone = NULL;
                                 pcp = NULL;
                         }
@@ -3012,10 +3048,8 @@ void free_unref_folios(struct folio_batch *folios)
                          * trylock is necessary as folios may be getting freed
                          * from IRQ or SoftIRQ context after an IO completion.
                          */
-                        pcp_trylock_prepare(UP_flags);
-                        pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+                        pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
                         if (unlikely(!pcp)) {
-                                pcp_trylock_finish(UP_flags);
                                 free_one_page(zone, &folio->page, pfn,
                                               order, FPI_NONE);
                                 continue;
@@ -3031,14 +3065,15 @@ void free_unref_folios(struct folio_batch *folios)
                         migratetype = MIGRATE_MOVABLE;
 
                 trace_mm_page_free_batched(&folio->page);
-                free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
-                                order, FPI_NONE);
+                if (!free_frozen_page_commit(zone, pcp, &folio->page,
+                                             migratetype, order, FPI_NONE, &UP_flags)) {
+                        pcp = NULL;
+                        locked_zone = NULL;
+                }
         }
 
-        if (pcp) {
-                pcp_spin_unlock(pcp);
-                pcp_trylock_finish(UP_flags);
-        }
+        if (pcp)
+                pcp_spin_unlock(pcp, UP_flags);
         folio_batch_reinit(folios);
 }
 
@@ -3289,15 +3324,12 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
         struct per_cpu_pages *pcp;
         struct list_head *list;
         struct page *page;
-        unsigned long __maybe_unused UP_flags;
+        unsigned long UP_flags;
 
         /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
-        pcp_trylock_prepare(UP_flags);
-        pcp = pcp_spin_trylock(zone->per_cpu_pageset);
-        if (!pcp) {
-                pcp_trylock_finish(UP_flags);
+        pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
+        if (!pcp)
                 return NULL;
-        }
 
         /*
          * On allocation, reduce the number of pages that are batch freed.
@@ -3307,8 +3339,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
         pcp->free_count >>= 1;
         list = &pcp->lists[order_to_pindex(migratetype, order)];
         page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
-        pcp_spin_unlock(pcp);
-        pcp_trylock_finish(UP_flags);
+        pcp_spin_unlock(pcp, UP_flags);
         if (page) {
                 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
                 zone_statistics(preferred_zone, zone, 1);
@@ -3941,6 +3972,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
                 filter &= ~SHOW_MEM_FILTER_NODES;
 
         __show_mem(filter, nodemask, gfp_zone(gfp_mask));
+        mem_cgroup_show_protected_memory(NULL);
 }
 
 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
@@ -4648,11 +4680,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 
                 if (unlikely(nofail)) {
                         /*
-                         * We most definitely don't want callers attempting to
-                         * allocate greater than order-1 page units with __GFP_NOFAIL.
-                         */
-                        WARN_ON_ONCE(order > 1);
-                        /*
                          * Also we don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM,
                          * otherwise, we may result in lockup.
                          */
@@ -4982,20 +5009,25 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
  * @nr_pages: The number of pages desired in the array
  * @page_array: Array to store the pages
  *
- * This is a batched version of the page allocator that attempts to
- * allocate nr_pages quickly. Pages are added to the page_array.
+ * This is a batched version of the page allocator that attempts to allocate
+ * @nr_pages quickly. Pages are added to @page_array.
  *
- * Note that only NULL elements are populated with pages and nr_pages
- * is the maximum number of pages that will be stored in the array.
+ * Note that only the elements in @page_array that were cleared to %NULL on
+ * entry are populated with newly allocated pages. @nr_pages is the maximum
+ * number of pages that will be stored in the array.
 *
- * Returns the number of pages in the array.
+ * Returns the number of pages in @page_array, including ones already
+ * allocated on entry. This can be less than the number requested in @nr_pages,
+ * but all empty slots are filled from the beginning. I.e., if all slots in
+ * @page_array were set to %NULL on entry, the slots from 0 to the return value
+ * - 1 will be filled.
  */
 unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
                         nodemask_t *nodemask, int nr_pages,
                         struct page **page_array)
 {
         struct page *page;
-        unsigned long __maybe_unused UP_flags;
+        unsigned long UP_flags;
         struct zone *zone;
         struct zoneref *z;
         struct per_cpu_pages *pcp;
@@ -5089,10 +5121,9 @@ retry_this_zone:
                 goto failed;
 
         /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. */
-        pcp_trylock_prepare(UP_flags);
-        pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+        pcp = pcp_spin_trylock(zone->per_cpu_pageset, UP_flags);
         if (!pcp)
-                goto failed_irq;
+                goto failed;
 
         /* Attempt the batch allocation */
         pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
@@ -5109,8 +5140,8 @@ retry_this_zone:
                 if (unlikely(!page)) {
                         /* Try and allocate at least one page */
                         if (!nr_account) {
-                                pcp_spin_unlock(pcp);
-                                goto failed_irq;
+                                pcp_spin_unlock(pcp, UP_flags);
+                                goto failed;
                         }
                         break;
                 }
@@ -5121,8 +5152,7 @@ retry_this_zone:
                 page_array[nr_populated++] = page;
         }
 
-        pcp_spin_unlock(pcp);
-        pcp_trylock_finish(UP_flags);
+        pcp_spin_unlock(pcp, UP_flags);
 
         __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
         zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account);
@@ -5130,9 +5160,6 @@ retry_this_zone:
 out:
         return nr_populated;
 
-failed_irq:
-        pcp_trylock_finish(UP_flags);
-
 failed:
         page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask);
         if (page)
@@ -5860,15 +5887,14 @@ static int zone_batchsize(struct zone *zone)
         int batch;
 
         /*
-         * The number of pages to batch allocate is either ~0.1%
-         * of the zone or 1MB, whichever is smaller. The batch
+         * The number of pages to batch allocate is either ~0.025%
+         * of the zone or 256KB, whichever is smaller. The batch
          * size is striking a balance between allocation latency
         * and zone lock contention.
         */
-        batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE);
-        batch /= 4;             /* We effectively *= 4 below */
-        if (batch < 1)
-                batch = 1;
+        batch = min(zone_managed_pages(zone) >> 12, SZ_256K / PAGE_SIZE);
+        if (batch <= 1)
+                return 1;
 
         /*
          * Clamp the batch to a 2^n - 1 value. Having a power
@@ -6019,7 +6045,7 @@ static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
 {
         int new_high_min, new_high_max, new_batch;
 
-        new_batch = max(1, zone_batchsize(zone));
+        new_batch = zone_batchsize(zone);
         if (percpu_pagelist_high_fraction) {
                 new_high_min = zone_highsize(zone, new_batch, cpu_online,
                                              percpu_pagelist_high_fraction);
@@ -6285,10 +6311,21 @@ static void calculate_totalreserve_pages(void)
                         long max = 0;
                         unsigned long managed_pages = zone_managed_pages(zone);
 
-                        /* Find valid and maximum lowmem_reserve in the zone */
-                        for (j = i; j < MAX_NR_ZONES; j++)
-                                max = max(max, zone->lowmem_reserve[j]);
+                        /*
+                         * lowmem_reserve[j] is monotonically non-decreasing
+                         * in j for a given zone (see
+                         * setup_per_zone_lowmem_reserve()). The maximum
+                         * valid reserve lives at the highest index with a
+                         * non-zero value, so scan backwards and stop at the
+                         * first hit.
+                         */
+                        for (j = MAX_NR_ZONES - 1; j > i; j--) {
+                                if (!zone->lowmem_reserve[j])
+                                        continue;
+                                max = zone->lowmem_reserve[j];
+                                break;
+                        }
 
                         /* we treat the high watermark as reserved pages. */
                         max += high_wmark_pages(zone);
@@ -6313,7 +6350,21 @@ static void setup_per_zone_lowmem_reserve(void)
 {
         struct pglist_data *pgdat;
         enum zone_type i, j;
-
+        /*
+         * For a given zone node_zones[i], lowmem_reserve[j] (j > i)
+         * represents how many pages in zone i must effectively be kept
+         * in reserve when deciding whether an allocation class that is
+         * allowed to allocate from zones up to j may fall back into
+         * zone i.
+         *
+         * As j increases, the allocation class can use a strictly larger
+         * set of fallback zones and therefore must not be allowed to
+         * deplete low zones more aggressively than a less flexible one.
+         * As a result, lowmem_reserve[j] is required to be monotonically
+         * non-decreasing in j for each zone i. Callers such as
+         * calculate_totalreserve_pages() rely on this monotonicity when
+         * selecting the maximum reserve entry.
+         */
         for_each_online_pgdat(pgdat) {
                 for (i = 0; i < MAX_NR_ZONES - 1; i++) {
                         struct zone *zone = &pgdat->node_zones[i];
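A note on the recurring pattern in the hunks above: pcp_spin_trylock() now takes the UP_flags argument itself, does pcp_trylock_prepare() internally, returns NULL on failure (undoing the prepare), and pcp_spin_unlock() releases both the spinlock and the flags. The following is a minimal userspace sketch of that contract built on POSIX spinlocks, not kernel code; the names pcp_like, trylock_prepare, trylock_finish, pcp_trylock_sketch and pcp_unlock_sketch are illustrative stand-ins for the kernel helpers.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct pcp_like {
        pthread_spinlock_t lock;
        int count;
};

/* Stand-ins for pcp_trylock_prepare()/pcp_trylock_finish(); no-ops here. */
static inline void trylock_prepare(unsigned long *flags) { (void)flags; }
static inline void trylock_finish(unsigned long *flags) { (void)flags; }

/* Combined helper: returns the locked structure or NULL, like the new macro. */
static struct pcp_like *pcp_trylock_sketch(struct pcp_like *pcp,
                                           unsigned long *flags)
{
        trylock_prepare(flags);
        if (pthread_spin_trylock(&pcp->lock) == 0)
                return pcp;
        trylock_finish(flags);  /* the failure path releases the flags too */
        return NULL;
}

static void pcp_unlock_sketch(struct pcp_like *pcp, unsigned long *flags)
{
        pthread_spin_unlock(&pcp->lock);
        trylock_finish(flags);
}

int main(void)
{
        struct pcp_like pcp = { .count = 3 };
        unsigned long flags;
        struct pcp_like *locked;

        pthread_spin_init(&pcp.lock, PTHREAD_PROCESS_PRIVATE);

        locked = pcp_trylock_sketch(&pcp, &flags);
        if (!locked) {
                /* Fallback path, analogous to free_one_page() in the patch. */
                puts("trylock failed, taking the slow path");
                return EXIT_FAILURE;
        }
        locked->count--;
        pcp_unlock_sketch(locked, &flags);
        printf("count now %d\n", pcp.count);
        return EXIT_SUCCESS;
}

Build with: cc -o sketch sketch.c -lpthread. The point is only that every successful trylock is paired with exactly one unlock that also releases the flags, which is what lets the patch drop the separate pcp_trylock_prepare()/pcp_trylock_finish() calls at every call site.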

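The other recurring change is bounding the work done per lock hold: decay_pcp_high() and free_frozen_page_commit() now free at most batch pages at a time and drop the pcp lock between chunks. Below is a toy sketch of just that chunking arithmetic using a plain counter; drain_bulk() and decay_sketch() are hypothetical stand-ins, and the locking (including the give-up path when the task migrates to another CPU) is only indicated in comments.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for free_pcppages_bulk(): just decrements a counter. */
static void drain_bulk(int *count, int nr)
{
        *count -= nr;
        printf("drained %d pages, %d left on the pcp list\n", nr, *count);
}

/*
 * Chunked drain in the spirit of the reworked decay_pcp_high(): never free
 * more than batch pages per iteration, so the (elided) pcp lock would only be
 * held for a bounded critical section no matter how far count exceeds high.
 */
static bool decay_sketch(int *count, int high, int batch)
{
        int to_drain = *count - high;
        bool todo = false;

        while (to_drain > 0) {
                int chunk = to_drain < batch ? to_drain : batch;

                /* spin_lock(&pcp->lock) would go here ... */
                drain_bulk(count, chunk);
                /* ... and spin_unlock(&pcp->lock) here, between chunks. */

                to_drain -= chunk;
                todo = true;
        }
        return todo;
}

int main(void)
{
        int count = 1000;

        /* e.g. high = 100, batch = 63: the excess is drained in 63-page chunks. */
        return decay_sketch(&count, 100, 63) ? 0 : 1;
}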