Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	212
1 file changed, 138 insertions(+), 74 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d1d037f97c5f..600d9e981c23 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -355,7 +355,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn)
static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit)
{
- return pb_bit > PB_migrate_end && pb_bit < __NR_PAGEBLOCK_BITS;
+ return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS;
}
static __always_inline void
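The rewritten check assumes the standalone bits sit directly after the migratetype bits in enum pageblock_bits. A minimal sketch of that assumed layout (simplified; the real enum lives in include/linux/pageblock-flags.h and varies with CONFIG_MEMORY_ISOLATION):

    /* Sketch of the assumed bit layout, not the verbatim kernel enum. */
    enum pageblock_bits {
        PB_migrate_0,
        PB_migrate_1,
        PB_migrate_2,                   /* 3 bits encode the migratetype */
        PB_migrate_end = PB_migrate_2,
        PB_compact_skip,                /* first standalone bit */
        PB_migrate_isolate,             /* standalone, CONFIG_MEMORY_ISOLATION */
        __NR_PAGEBLOCK_BITS
    };

With PB_compact_skip == PB_migrate_end + 1, `pb_bit >= PB_compact_skip` accepts exactly the same values as the old `pb_bit > PB_migrate_end`, while naming the first standalone bit explicitly.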
@@ -370,7 +370,7 @@ get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn,
#else
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
#endif
- BUILD_BUG_ON(__MIGRATE_TYPE_END >= (1 << PB_migratetype_bits));
+ BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK);
VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
bitmap = get_pageblock_bitmap(page, pfn);
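The two BUILD_BUG_ON forms are equivalent provided MIGRATETYPE_MASK is the all-ones mask of the migratetype field; a sketch under that assumption about its definition:

    #define PB_migratetype_bits 3                           /* assumed width */
    #define MIGRATETYPE_MASK    ((1 << PB_migratetype_bits) - 1)

    /* For integers, x >= (1 << bits) <=> x > ((1 << bits) - 1), so the
     * rewritten BUILD_BUG_ON rejects exactly the same range while
     * reusing the mask the lookup code already works with. */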
@@ -538,8 +538,7 @@ static void set_pageblock_migratetype(struct page *page,
"Use set_pageblock_isolate() for pageblock isolation");
return;
}
- VM_WARN_ONCE(get_pfnblock_bit(page, page_to_pfn(page),
- PB_migrate_isolate),
+ VM_WARN_ONCE(get_pageblock_isolate(page),
"Use clear_pageblock_isolate() to unisolate pageblock");
/* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */
#endif
@@ -797,7 +796,7 @@ static inline void account_freepages(struct zone *zone, int nr_pages,
if (is_migrate_cma(migratetype))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
- else if (is_migrate_highatomic(migratetype))
+ else if (migratetype == MIGRATE_HIGHATOMIC)
WRITE_ONCE(zone->nr_free_highatomic,
zone->nr_free_highatomic + nr_pages);
}
@@ -950,7 +949,7 @@ static inline void __free_one_page(struct page *page,
bool to_tail;
VM_BUG_ON(!zone_is_initialized(zone));
- VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
+ VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
@@ -1043,7 +1042,7 @@ static inline bool page_expected_state(struct page *page,
page->memcg_data |
#endif
page_pool_page_is_pp(page) |
- (page->flags & check_flags)))
+ (page->flags.f & check_flags)))
return false;
return true;
@@ -1059,7 +1058,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _refcount";
- if (unlikely(page->flags & flags)) {
+ if (unlikely(page->flags.f & flags)) {
if (flags == PAGE_FLAGS_CHECK_AT_PREP)
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
else
@@ -1358,7 +1357,7 @@ __always_inline bool free_pages_prepare(struct page *page,
int i;
if (compound) {
- page[1].flags &= ~PAGE_FLAGS_SECOND;
+ page[1].flags.f &= ~PAGE_FLAGS_SECOND;
#ifdef NR_PAGES_IN_LARGE_FOLIO
folio->_nr_pages = 0;
#endif
@@ -1372,7 +1371,7 @@ __always_inline bool free_pages_prepare(struct page *page,
continue;
}
}
- (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ (page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
if (folio_test_anon(folio)) {
@@ -1391,7 +1390,7 @@ __always_inline bool free_pages_prepare(struct page *page,
}
page_cpupid_reset_last(page);
- page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
page_table_check_free(page, order);
pgalloc_tag_sub(page, 1 << order);
@@ -1521,7 +1520,7 @@ static void add_page_to_zone_llist(struct zone *zone, struct page *page,
unsigned int order)
{
/* Remember the order */
- page->order = order;
+ page->private = order;
/* Add the page to the free list */
llist_add(&page->pcp_llist, &zone->trylock_free_pages);
}
@@ -1550,7 +1549,7 @@ static void free_one_page(struct zone *zone, struct page *page,
llnode = llist_del_all(llhead);
llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
- unsigned int p_order = p->order;
+ unsigned int p_order = p->private;
split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
__count_vm_events(PGFREE, 1 << p_order);
@@ -2034,7 +2033,13 @@ static int move_freepages_block(struct zone *zone, struct page *page,
/* Look for a buddy that straddles start_pfn */
static unsigned long find_large_buddy(unsigned long start_pfn)
{
- int order = 0;
+ /*
+ * If start_pfn is not an order-0 PageBuddy, the next PageBuddy containing
+ * start_pfn has a minimal order of __ffs(start_pfn) + 1, so start checking
+ * at order __ffs(start_pfn). If start_pfn is an order-0 PageBuddy, the
+ * starting order does not matter.
+ */
+ int order = start_pfn ? __ffs(start_pfn) : MAX_PAGE_ORDER;
struct page *page;
unsigned long pfn = start_pfn;
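Why __ffs(start_pfn) is a safe starting order, as a hedged usermode demo (__builtin_ctzl stands in for the kernel's __ffs(); the MAX_PAGE_ORDER value is assumed):

    #include <stdio.h>

    #define MAX_PAGE_ORDER 10   /* assumed default */

    /* An order-k buddy starts on a 2^k-aligned pfn. For every
     * k <= __ffs(pfn), the order-k block containing pfn begins at pfn
     * itself, so a buddy that straddles pfn without starting there
     * must have an order of at least __ffs(pfn) + 1. */
    static int start_order(unsigned long start_pfn)
    {
        return start_pfn ? __builtin_ctzl(start_pfn) : MAX_PAGE_ORDER;
    }

    int main(void)
    {
        printf("%d\n", start_order(20));    /* 20 = 0b10100 -> order 2 */
        printf("%d\n", start_order(0));     /* pfn 0: start at the top */
        return 0;
    }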
@@ -2058,9 +2063,9 @@ static unsigned long find_large_buddy(unsigned long start_pfn)
static inline void toggle_pageblock_isolate(struct page *page, bool isolate)
{
if (isolate)
- set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+ set_pageblock_isolate(page);
else
- clear_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate);
+ clear_pageblock_isolate(page);
}
/**
@@ -2085,9 +2090,10 @@ static inline void toggle_pageblock_isolate(struct page *page, bool isolate)
static bool __move_freepages_block_isolate(struct zone *zone,
struct page *page, bool isolate)
{
- unsigned long start_pfn, pfn;
+ unsigned long start_pfn, buddy_pfn;
int from_mt;
int to_mt;
+ struct page *buddy;
if (isolate == get_pageblock_isolate(page)) {
VM_WARN_ONCE(1, "%s a pageblock that is already in that state",
@@ -2102,29 +2108,19 @@ static bool __move_freepages_block_isolate(struct zone *zone,
if (pageblock_order == MAX_PAGE_ORDER)
goto move;
- /* We're a tail block in a larger buddy */
- pfn = find_large_buddy(start_pfn);
- if (pfn != start_pfn) {
- struct page *buddy = pfn_to_page(pfn);
+ buddy_pfn = find_large_buddy(start_pfn);
+ buddy = pfn_to_page(buddy_pfn);
+ /* We're a part of a larger buddy */
+ if (PageBuddy(buddy) && buddy_order(buddy) > pageblock_order) {
int order = buddy_order(buddy);
del_page_from_free_list(buddy, zone, order,
- get_pfnblock_migratetype(buddy, pfn));
+ get_pfnblock_migratetype(buddy, buddy_pfn));
toggle_pageblock_isolate(page, isolate);
- split_large_buddy(zone, buddy, pfn, order, FPI_NONE);
+ split_large_buddy(zone, buddy, buddy_pfn, order, FPI_NONE);
return true;
}
- /* We're the starting block of a larger buddy */
- if (PageBuddy(page) && buddy_order(page) > pageblock_order) {
- int order = buddy_order(page);
-
- del_page_from_free_list(page, zone, order,
- get_pfnblock_migratetype(page, pfn));
- toggle_pageblock_isolate(page, isolate);
- split_large_buddy(zone, page, pfn, order, FPI_NONE);
- return true;
- }
move:
/* Use MIGRATETYPE_MASK to get non-isolate migratetype */
if (isolate) {
@@ -2864,14 +2860,29 @@ static void free_frozen_page_commit(struct zone *zone,
*/
return;
}
+
high = nr_pcp_high(pcp, zone, batch, free_high);
- if (pcp->count >= high) {
- free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
- pcp, pindex);
- if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
- zone_watermark_ok(zone, 0, high_wmark_pages(zone),
- ZONE_MOVABLE, 0))
- clear_bit(ZONE_BELOW_HIGH, &zone->flags);
+ if (pcp->count < high)
+ return;
+
+ free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
+ pcp, pindex);
+ if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
+ zone_watermark_ok(zone, 0, high_wmark_pages(zone),
+ ZONE_MOVABLE, 0)) {
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ clear_bit(ZONE_BELOW_HIGH, &zone->flags);
+
+ /*
+ * Assume that memory pressure on this node is gone and that it may
+ * be back in a reclaimable state. If a fallback memory node exists,
+ * direct reclaim may never have been triggered, leaving a 'hopeless
+ * node' stuck in that state for a while. Let kswapd work again by
+ * resetting kswapd_failures.
+ */
+ if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES &&
+ next_memory_node(pgdat->node_id) < MAX_NUMNODES)
+ atomic_set(&pgdat->kswapd_failures, 0);
}
}
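For context, the reset above assumes the usual 'hopeless node' behavior: kswapd wakeups are suppressed once kswapd_failures reaches MAX_RECLAIM_RETRIES. A hypothetical predicate naming that state (a sketch, not an upstream helper):

    /* Hypothetical predicate for the state cleared in the hunk above. */
    static bool kswapd_is_hopeless(struct pglist_data *pgdat)
    {
        /* Wakeups stop once the failure count hits the limit; resetting
         * the counter re-arms background reclaim on this node. */
        return atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES;
    }

The next_memory_node() test simply checks that a fallback memory node exists, since only then could allocations have succeeded elsewhere without ever triggering direct reclaim on this node.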
@@ -3724,6 +3735,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
struct pglist_data *last_pgdat = NULL;
bool last_pgdat_dirty_ok = false;
bool no_fallback;
+ bool skip_kswapd_nodes = nr_online_nodes > 1;
+ bool skipped_kswapd_nodes = false;
retry:
/*
@@ -3786,6 +3799,19 @@ retry:
}
}
+ /*
+ * If kswapd is already active on a node, keep looking
+ * for other nodes that might be idle. This can happen
+ * if another process has NUMA bindings and is causing
+ * kswapd wakeups on only some nodes. Avoid accidental
+ * "node_reclaim_mode"-like behavior in this case.
+ */
+ if (skip_kswapd_nodes &&
+ !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) {
+ skipped_kswapd_nodes = true;
+ continue;
+ }
+
cond_accept_memory(zone, order, alloc_flags);
/*
@@ -3878,6 +3904,15 @@ try_this_zone:
}
/*
+ * If we skipped over nodes with active kswapds and found no
+ * idle nodes, retry and place anywhere the watermarks permit.
+ */
+ if (skip_kswapd_nodes && skipped_kswapd_nodes) {
+ skip_kswapd_nodes = false;
+ goto retry;
+ }
+
+ /*
* It's possible on a UMA machine to get through all zones that are
* fragmented. If avoiding fragmentation, reset and try again.
*/
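Taken together, the two hunks above form a two-pass placement loop. A hedged, MM-free sketch of the control flow (kswapd_busy() and try_alloc() are hypothetical stand-ins):

    #include <stdbool.h>

    extern bool kswapd_busy(int nid);   /* hypothetical */
    extern bool try_alloc(int nid);     /* hypothetical */

    static int place(int nr_nodes)
    {
        /* Pass 1 skips nodes whose kswapd is currently running; if that
         * filtered out every candidate, pass 2 retries unfiltered. */
        bool skip_kswapd_nodes = nr_nodes > 1;
        bool skipped_kswapd_nodes = false;

    retry:
        for (int nid = 0; nid < nr_nodes; nid++) {
            if (skip_kswapd_nodes && kswapd_busy(nid)) {
                skipped_kswapd_nodes = true;
                continue;
            }
            if (try_alloc(nid))
                return nid;
        }
        if (skip_kswapd_nodes && skipped_kswapd_nodes) {
            skip_kswapd_nodes = false;
            goto retry;
        }
        return -1;  /* no node admitted the allocation */
    }

Note the polarity of the real check: kswapd sleeps on kswapd_wait, so !waitqueue_active() means kswapd is awake and the node is already under pressure.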
@@ -4182,7 +4217,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
}
static inline bool
-should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
enum compact_result compact_result,
enum compact_priority *compact_priority,
int *compaction_retries)
@@ -4408,7 +4443,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
alloc_flags |= ALLOC_NON_BLOCK;
- if (order > 0)
+ if (order > 0 && (alloc_flags & ALLOC_MIN_RESERVE))
alloc_flags |= ALLOC_HIGHATOMIC;
}
@@ -5229,9 +5264,16 @@ static void ___free_pages(struct page *page, unsigned int order,
__free_frozen_pages(page, order, fpi_flags);
else if (!head) {
pgalloc_tag_sub_pages(tag, (1 << order) - 1);
- while (order-- > 0)
+ while (order-- > 0) {
+ /*
+ * The "tail" pages of this non-compound high-order
+ * page carry no code tags, so mark them as empty
+ * to avoid warnings.
+ */
+ clear_page_tag_ref(page + (1 << order));
__free_frozen_pages(page + (1 << order), order,
fpi_flags);
+ }
}
}
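The loop frees the upper half of the remaining range at each step, leaving only page 0 for a separate order-0 free. A worked usermode trace of the offsets (order = 3 chosen as an example):

    #include <stdio.h>

    int main(void)
    {
        unsigned int order = 3; /* example: 8-page allocation */

        /* Mirrors the `while (order-- > 0)` split in ___free_pages(). */
        while (order-- > 0)
            printf("free page + %u at order %u\n", 1u << order, order);
        /* Output:
         *   free page + 4 at order 2   (pages 4..7)
         *   free page + 2 at order 1   (pages 2..3)
         *   free page + 1 at order 0   (page 1)
         * Page 0 itself is freed when its own refcount drops. */
        return 0;
    }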
@@ -5270,6 +5312,15 @@ void free_pages_nolock(struct page *page, unsigned int order)
___free_pages(page, order, FPI_TRYLOCK);
}
+/**
+ * free_pages - Free pages allocated with __get_free_pages().
+ * @addr: The virtual address tied to a page returned from __get_free_pages().
+ * @order: The order of the allocation.
+ *
+ * This function behaves the same as __free_pages(). Use this function
+ * to free pages when you only have a valid virtual address. If you have
+ * the page, call __free_pages() instead.
+ */
void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
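A typical pairing, sketched (error paths elided):

    /* Allocate four pages and free them again by virtual address. */
    unsigned long addr = __get_free_pages(GFP_KERNEL, 2);

    if (addr) {
        /* ... use the 4-page buffer at (void *)addr ... */
        free_pages(addr, 2);    /* order must match the allocation */
    }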
@@ -5946,7 +5997,6 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
pcp->high_min = BOOT_PAGESET_HIGH;
pcp->high_max = BOOT_PAGESET_HIGH;
pcp->batch = BOOT_PAGESET_BATCH;
- pcp->free_count = 0;
}
static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min,
@@ -6236,16 +6286,13 @@ static void calculate_totalreserve_pages(void)
unsigned long managed_pages = zone_managed_pages(zone);
/* Find valid and maximum lowmem_reserve in the zone */
- for (j = i; j < MAX_NR_ZONES; j++) {
- if (zone->lowmem_reserve[j] > max)
- max = zone->lowmem_reserve[j];
- }
+ for (j = i; j < MAX_NR_ZONES; j++)
+ max = max(max, zone->lowmem_reserve[j]);
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
- if (max > managed_pages)
- max = managed_pages;
+ max = min_t(unsigned long, max, managed_pages);
pgdat->totalreserve_pages += max;
@@ -6837,6 +6884,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask)
int alloc_contig_range_noprof(unsigned long start, unsigned long end,
acr_flags_t alloc_flags, gfp_t gfp_mask)
{
+ const unsigned int order = ilog2(end - start);
unsigned long outer_start, outer_end;
int ret = 0;
@@ -6854,6 +6902,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
PB_ISOLATE_MODE_CMA_ALLOC :
PB_ISOLATE_MODE_OTHER;
+ /*
+ * In contrast to the buddy, we allow for orders here that exceed
+ * MAX_PAGE_ORDER, so we must manually make sure that we are not
+ * exceeding the maximum folio order.
+ */
+ if (WARN_ON_ONCE((gfp_mask & __GFP_COMP) && order > MAX_FOLIO_ORDER))
+ return -EINVAL;
+
gfp_mask = current_gfp_context(gfp_mask);
if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask))
return -EINVAL;
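Since start and end are pfns, ilog2(end - start) is the folio order of the whole range. A hedged usermode illustration (4 KiB pages and a power-of-two range assumed; __builtin_clzl stands in for the kernel's ilog2()):

    #include <stdio.h>

    /* Usermode stand-in for ilog2() on a nonzero value. */
    static unsigned int ilog2_ul(unsigned long x)
    {
        return 8 * sizeof(unsigned long) - 1 - __builtin_clzl(x);
    }

    int main(void)
    {
        /* Hypothetical 1 GiB contiguous range: 1 << 18 4-KiB pfns. */
        unsigned long start = 0x100000, end = start + (1UL << 18);

        /* Order 18 exceeds MAX_PAGE_ORDER (default 10), which is legal
         * here, but with __GFP_COMP it must stay within MAX_FOLIO_ORDER. */
        printf("order = %u\n", ilog2_ul(end - start));  /* 18 */
        return 0;
    }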
@@ -6951,7 +7007,6 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
free_contig_range(end, outer_end - end);
} else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) {
struct page *head = pfn_to_page(start);
- int order = ilog2(end - start);
check_new_pages(head, order);
prep_new_page(head, order, gfp_mask, 0);
@@ -7478,22 +7533,7 @@ static bool __free_unaccepted(struct page *page)
#endif /* CONFIG_UNACCEPTED_MEMORY */
-/**
- * alloc_pages_nolock - opportunistic reentrant allocation from any context
- * @nid: node to allocate from
- * @order: allocation order size
- *
- * Allocates pages of a given order from the given node. This is safe to
- * call from any context (from atomic, NMI, and also reentrant
- * allocator -> tracepoint -> alloc_pages_nolock_noprof).
- * Allocation is best effort and to be expected to fail easily so nobody should
- * rely on the success. Failures are not reported via warn_alloc().
- * See always fail conditions below.
- *
- * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
- * It means ENOMEM. There is no reason to call it again and expect !NULL.
- */
-struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
+struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order)
{
/*
* Do not specify __GFP_DIRECT_RECLAIM, since direct reclaim is not allowed.
@@ -7515,12 +7555,13 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
* specify it here to highlight that alloc_pages_nolock()
* doesn't want to deplete reserves.
*/
- gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
- | __GFP_ACCOUNT;
+ gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC | __GFP_COMP
+ | gfp_flags;
unsigned int alloc_flags = ALLOC_TRYLOCK;
struct alloc_context ac = { };
struct page *page;
+ VM_WARN_ON_ONCE(gfp_flags & ~__GFP_ACCOUNT);
/*
* In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
* unsafe in NMI. If spin_trylock() is called from hard IRQ the current
@@ -7555,15 +7596,38 @@ struct page *alloc_pages_nolock_noprof(int nid, unsigned int order)
/* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
- if (page)
- set_page_refcounted(page);
-
- if (memcg_kmem_online() && page &&
+ if (memcg_kmem_online() && page && (gfp_flags & __GFP_ACCOUNT) &&
unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
- free_pages_nolock(page, order);
+ __free_frozen_pages(page, order, FPI_TRYLOCK);
page = NULL;
}
trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
kmsan_alloc_page(page, order, alloc_gfp);
return page;
}
+/**
+ * alloc_pages_nolock - opportunistic reentrant allocation from any context
+ * @gfp_flags: GFP flags. Only __GFP_ACCOUNT allowed.
+ * @nid: node to allocate from
+ * @order: allocation order size
+ *
+ * Allocates pages of a given order from the given node. This is safe to
+ * call from any context (from atomic, NMI, and also reentrant
+ * allocator -> tracepoint -> alloc_pages_nolock_noprof).
+ * Allocation is best effort and expected to fail easily, so nobody should
+ * rely on its success. Failures are not reported via warn_alloc().
+ * See the always-fail conditions below.
+ *
+ * Return: allocated page or NULL on failure. NULL does not mean EBUSY or EAGAIN.
+ * It means ENOMEM. There is no reason to call it again and expect !NULL.
+ */
+struct page *alloc_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order)
+{
+ struct page *page;
+
+ page = alloc_frozen_pages_nolock_noprof(gfp_flags, nid, order);
+ if (page)
+ set_page_refcounted(page);
+ return page;
+}
+EXPORT_SYMBOL_GPL(alloc_pages_nolock_noprof);
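A sketch of a caller from restricted context (any flag other than __GFP_ACCOUNT would trip the VM_WARN_ON_ONCE in alloc_frozen_pages_nolock_noprof(), and failure must be tolerated):

    /* Opportunistic order-0 allocation, e.g. from a tracepoint handler. */
    struct page *page = alloc_pages_nolock(__GFP_ACCOUNT, numa_node_id(), 0);

    if (page) {
        void *buf = page_address(page);

        /* ... fill buf ... */
        free_pages_nolock(page, 0);
    }
    /* NULL means ENOMEM; retrying immediately will not help. */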