From 82b212f40059bffd6808c07266a942d444d5558a Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 26 Nov 2012 16:29:45 -0800 Subject: Revert "mm: remove __GFP_NO_KSWAPD" With "mm: vmscan: scale number of pages reclaimed by reclaim/compaction based on failures" reverted, Zdenek Kabelac reported the following Hmm, so it's just took longer to hit the problem and observe kswapd0 spinning on my CPU again - it's not as endless like before - but still it easily eats minutes - it helps to turn off Firefox or TB (memory hungry apps) so kswapd0 stops soon - and restart those apps again. (And I still have like >1GB of cached memory) kswapd0 R running task 0 30 2 0x00000000 Call Trace: preempt_schedule+0x42/0x60 _raw_spin_unlock+0x55/0x60 put_super+0x31/0x40 drop_super+0x22/0x30 prune_super+0x149/0x1b0 shrink_slab+0xba/0x510 The sysrq+m indicates the system has no swap so it'll never reclaim anonymous pages as part of reclaim/compaction. That is one part of the problem but not the root cause as file-backed pages could also be reclaimed. The likely underlying problem is that kswapd is woken up or kept awake for each THP allocation request in the page allocator slow path. If compaction fails for the requesting process then compaction will be deferred for a time and direct reclaim is avoided. However, if there is a storm of THP requests that are simply rejected, it will still be the case that kswapd is awake for a prolonged period of time as pgdat->kswapd_max_order is updated each time. This is noticed by the main kswapd() loop and it will not call kswapd_try_to_sleep(). Instead it will loop, shrinking a small number of pages and calling shrink_slab() on each iteration. The temptation is to supply a patch that checks if kswapd was woken for THP and if so ignore pgdat->kswapd_max_order but it'll be a hack and not backed up by proper testing. 
As 3.7 is very close to release and this is not a bug we should release with, a safer path is to revert "mm: remove __GFP_NO_KSWAPD" for now and revisit it with the view to ironing out the balance_pgdat() logic in general. Signed-off-by: Mel Gorman Cc: Zdenek Kabelac Cc: Seth Jennings Cc: Valdis Kletnieks Cc: Jiri Slaby Cc: Rik van Riel Cc: Robert Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux/gfp.h') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 02c1c9710be0..d0a79678f169 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -31,6 +31,7 @@ struct vm_area_struct; #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u #define ___GFP_NOTRACK 0x200000u +#define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u #define ___GFP_WRITE 0x1000000u @@ -85,6 +86,7 @@ struct vm_area_struct; #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ +#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ @@ -114,7 +116,8 @@ struct vm_area_struct; __GFP_MOVABLE) #define GFP_IOFS (__GFP_IO | __GFP_FS) #define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ - __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NO_KSWAPD) #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) -- cgit v1.2.3 From a50915394f1fc02c2861d3b7ce7014788aa5066e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 29 Nov 2012 13:54:27 -0800 Subject: revert "Revert "mm: remove __GFP_NO_KSWAPD"" It appears that this patch was innocent, and we 
hope that "mm: avoid waking kswapd for THP allocations when compaction is deferred or contended" will fix the final kswapd-spinning cause. Cc: Zdenek Kabelac Cc: Seth Jennings Cc: Valdis Kletnieks Cc: Jiri Slaby Cc: Rik van Riel Cc: Robert Jennings Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/mtd/mtdcore.c | 6 ++---- include/linux/gfp.h | 13 +++++-------- include/trace/events/gfpflags.h | 1 - mm/page_alloc.c | 7 +++---- 4 files changed, 10 insertions(+), 17 deletions(-) (limited to 'include/linux/gfp.h') diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index ec794a72975d..374c46dff7dd 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -1077,8 +1077,7 @@ EXPORT_SYMBOL_GPL(mtd_writev); * until the request succeeds or until the allocation size falls below * the system page size. This attempts to make sure it does not adversely * impact system performance, so when allocating more than one page, we - * ask the memory allocator to avoid re-trying, swapping, writing back - * or performing I/O. + * ask the memory allocator to avoid re-trying. * * Note, this function also makes sure that the allocated buffer is aligned to * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value. 
@@ -1092,8 +1091,7 @@ EXPORT_SYMBOL_GPL(mtd_writev); */ void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size) { - gfp_t flags = __GFP_NOWARN | __GFP_WAIT | - __GFP_NORETRY | __GFP_NO_KSWAPD; + gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY; size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE); void *kbuf; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index d0a79678f169..76e1aa206f57 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -30,10 +30,9 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u -#define ___GFP_NOTRACK 0x200000u -#define ___GFP_NO_KSWAPD 0x400000u -#define ___GFP_OTHER_NODE 0x800000u -#define ___GFP_WRITE 0x1000000u +#define ___GFP_NOTRACK 0x100000u +#define ___GFP_OTHER_NODE 0x200000u +#define ___GFP_WRITE 0x400000u /* * GFP bitmasks.. @@ -86,7 +85,6 @@ struct vm_area_struct; #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ -#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ @@ -96,7 +94,7 @@ struct vm_area_struct; */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 23 /* Room for N __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ @@ -116,8 +114,7 @@ struct vm_area_struct; __GFP_MOVABLE) #define GFP_IOFS (__GFP_IO | __GFP_FS) #define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ - __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ - __GFP_NO_KSWAPD) + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) #ifdef CONFIG_NUMA 
#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index d6fd8e5b14b7..9391706e9254 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -36,7 +36,6 @@ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ - {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ ) : "GFP_NOWAIT" diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7e208f0ad68c..8193809f3de0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2416,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background @@ -2495,7 +2494,7 @@ rebalance: * system then fail the allocation instead of entering direct reclaim. */ if ((deferred_compaction || contended_compaction) && - (gfp_mask & __GFP_NO_KSWAPD)) + (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) goto nopage; /* Try direct reclaim and then allocating */ -- cgit v1.2.3 From caf491916b1c1e939a2c7575efb7a77f11fc9bdf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 10 Dec 2012 10:51:16 -0800 Subject: Revert "revert "Revert "mm: remove __GFP_NO_KSWAPD""" and associated damage This reverts commits a50915394f1fc02c2861d3b7ce7014788aa5066e and d7c3b937bdf45f0b844400b7bf6fd3ed50bac604. This is a revert of a revert of a revert. In addition, it reverts the even older i915 change to stop using the __GFP_NO_KSWAPD flag due to the original commits in linux-next. 
It turns out that the original patch really was bogus, and that the original revert was the correct thing to do after all. We thought we had fixed the problem, and then reverted the revert, but the problem really is fundamental: waking up kswapd simply isn't the right thing to do, and direct reclaim sometimes simply _is_ the right thing to do. When certain allocations fail, we simply should try some direct reclaim, and if that fails, fail the allocation. That's the right thing to do for THP allocations, which can easily fail, and the GPU allocations want to do that too. So starting kswapd is sometimes simply wrong, and removing the flag that said "don't start kswapd" was a mistake. Let's hope we never revisit this mistake again - and certainly not this many times ;) Acked-by: Mel Gorman Acked-by: Johannes Weiner Cc: Rik van Riel Cc: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/i915_gem.c | 6 +++--- drivers/mtd/mtdcore.c | 6 ++++-- include/linux/gfp.h | 13 ++++++++----- include/trace/events/gfpflags.h | 1 + mm/page_alloc.c | 7 ++++--- 5 files changed, 20 insertions(+), 13 deletions(-) (limited to 'include/linux/gfp.h') diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 107f09befe92..9b285da4449b 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1796,7 +1796,7 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) */ mapping = obj->base.filp->f_path.dentry->d_inode->i_mapping; gfp = mapping_gfp_mask(mapping); - gfp |= __GFP_NORETRY | __GFP_NOWARN; + gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD; gfp &= ~(__GFP_IO | __GFP_WAIT); for_each_sg(st->sgl, sg, page_count, i) { page = shmem_read_mapping_page_gfp(mapping, i, gfp); @@ -1809,7 +1809,7 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) * our own buffer, now let the real VM do its job and * go down in flames if truly OOM. 
*/ - gfp &= ~(__GFP_NORETRY | __GFP_NOWARN); + gfp &= ~(__GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD); gfp |= __GFP_IO | __GFP_WAIT; i915_gem_shrink_all(dev_priv); @@ -1817,7 +1817,7 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) if (IS_ERR(page)) goto err_pages; - gfp |= __GFP_NORETRY | __GFP_NOWARN; + gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD; gfp &= ~(__GFP_IO | __GFP_WAIT); } diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 374c46dff7dd..ec794a72975d 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -1077,7 +1077,8 @@ EXPORT_SYMBOL_GPL(mtd_writev); * until the request succeeds or until the allocation size falls below * the system page size. This attempts to make sure it does not adversely * impact system performance, so when allocating more than one page, we - * ask the memory allocator to avoid re-trying. + * ask the memory allocator to avoid re-trying, swapping, writing back + * or performing I/O. * * Note, this function also makes sure that the allocated buffer is aligned to * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value. 
@@ -1091,7 +1092,8 @@ EXPORT_SYMBOL_GPL(mtd_writev); */ void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size) { - gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY; + gfp_t flags = __GFP_NOWARN | __GFP_WAIT | + __GFP_NORETRY | __GFP_NO_KSWAPD; size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE); void *kbuf; diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 76e1aa206f57..d0a79678f169 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -30,9 +30,10 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u -#define ___GFP_NOTRACK 0x100000u -#define ___GFP_OTHER_NODE 0x200000u -#define ___GFP_WRITE 0x400000u +#define ___GFP_NOTRACK 0x200000u +#define ___GFP_NO_KSWAPD 0x400000u +#define ___GFP_OTHER_NODE 0x800000u +#define ___GFP_WRITE 0x1000000u /* * GFP bitmasks.. @@ -85,6 +86,7 @@ struct vm_area_struct; #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ +#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ @@ -94,7 +96,7 @@ struct vm_area_struct; */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 23 /* Room for N __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ @@ -114,7 +116,8 @@ struct vm_area_struct; __GFP_MOVABLE) #define GFP_IOFS (__GFP_IO | __GFP_FS) #define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ - __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NO_KSWAPD) #ifdef CONFIG_NUMA 
#define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index 9391706e9254..d6fd8e5b14b7 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -36,6 +36,7 @@ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ + {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ {(unsigned long)__GFP_OTHER_NODE, "GFP_OTHER_NODE"} \ ) : "GFP_NOWAIT" diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8193809f3de0..7e208f0ad68c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2416,8 +2416,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background @@ -2494,7 +2495,7 @@ rebalance: * system then fail the allocation instead of entering direct reclaim. */ if ((deferred_compaction || contended_compaction) && - (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) + (gfp_mask & __GFP_NO_KSWAPD)) goto nopage; /* Try direct reclaim and then allocating */ -- cgit v1.2.3 From e5adfffc857788c8b7eca0e98cf1e26f1964b292 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 11 Dec 2012 16:00:29 -0800 Subject: mm: use IS_ENABLED(CONFIG_NUMA) instead of NUMA_BUILD We don't need custom NUMA_BUILD anymore, since we have handy IS_ENABLED(). Signed-off-by: Kirill A. 
Shutemov Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- include/linux/kernel.h | 7 ------- mm/page_alloc.c | 18 ++++++++++-------- mm/vmalloc.c | 4 ++-- 4 files changed, 13 insertions(+), 18 deletions(-) (limited to 'include/linux/gfp.h') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index d0a79678f169..31e8041274f6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -266,7 +266,7 @@ static inline enum zone_type gfp_zone(gfp_t flags) static inline int gfp_zonelist(gfp_t flags) { - if (NUMA_BUILD && unlikely(flags & __GFP_THISNODE)) + if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE)) return 1; return 0; diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7d8dfc7392f1..815e5845d954 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -687,13 +687,6 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } /* Trap pasters of __FUNCTION__ at compile-time */ #define __FUNCTION__ (__func__) -/* This helps us to avoid #ifdef CONFIG_NUMA */ -#ifdef CONFIG_NUMA -#define NUMA_BUILD 1 -#else -#define NUMA_BUILD 0 -#endif - /* This helps us avoid #ifdef CONFIG_COMPACTION */ #ifdef CONFIG_COMPACTION #define COMPACTION_BUILD 1 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dc018b486b74..a49b0ea3cc2f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1871,7 +1871,7 @@ zonelist_scan: */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { - if (NUMA_BUILD && zlc_active && + if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; if ((alloc_flags & ALLOC_CPUSET) && @@ -1917,7 +1917,8 @@ zonelist_scan: classzone_idx, alloc_flags)) goto try_this_zone; - if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { + if (IS_ENABLED(CONFIG_NUMA) && + !did_zlc_setup && nr_online_nodes > 1) { /* * we do zlc_setup if there are multiple 
nodes * and before considering the first zone allowed @@ -1936,7 +1937,7 @@ zonelist_scan: * As we may have just activated ZLC, check if the first * eligible zone has failed zone_reclaim recently. */ - if (NUMA_BUILD && zlc_active && + if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; @@ -1962,11 +1963,11 @@ try_this_zone: if (page) break; this_zone_full: - if (NUMA_BUILD) + if (IS_ENABLED(CONFIG_NUMA)) zlc_mark_zone_full(zonelist, z); } - if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { + if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { /* Disable zlc cache for second zonelist scan */ zlc_active = 0; goto zonelist_scan; @@ -2266,7 +2267,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, return NULL; /* After successful reclaim, reconsider all zones for allocation */ - if (NUMA_BUILD) + if (IS_ENABLED(CONFIG_NUMA)) zlc_clear_zones_full(zonelist); retry: @@ -2412,7 +2413,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * allowed per node queues are empty and that nodes are * over allocated. 
*/ - if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + if (IS_ENABLED(CONFIG_NUMA) && + (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; restart: @@ -2819,7 +2821,7 @@ unsigned int nr_free_pagecache_pages(void) static inline void show_node(struct zone *zone) { - if (NUMA_BUILD) + if (IS_ENABLED(CONFIG_NUMA)) printk("Node %d ", zone_to_nid(zone)); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 78e08300db21..5123a169ab7b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p) static void show_numa_info(struct seq_file *m, struct vm_struct *v) { - if (NUMA_BUILD) { + if (IS_ENABLED(CONFIG_NUMA)) { unsigned int nr, *counters = m->private; if (!counters) @@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file) unsigned int *ptr = NULL; int ret; - if (NUMA_BUILD) { + if (IS_ENABLED(CONFIG_NUMA)) { ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); if (ptr == NULL) return -ENOMEM; -- cgit v1.2.3 From 05b0afd73d04109d87f00ccd39f099e217c37263 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 12 Dec 2012 13:51:56 -0800 Subject: mm: add a reminder comment for __GFP_BITS_SHIFT Cc: Glauber Costa Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/gfp.h') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 31e8041274f6..f74856e17e48 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -34,6 +34,7 @@ struct vm_area_struct; #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u #define ___GFP_WRITE 0x1000000u +/* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* * GFP bitmasks.. 
-- cgit v1.2.3 From 7a64bf05b2a6fe3703062d13d389e3eb904741c6 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:21:51 -0800 Subject: mm: add a __GFP_KMEMCG flag This flag is used to indicate to the callees that this allocation is a kernel allocation in process context, and should be accounted to current's memcg. Signed-off-by: Glauber Costa Acked-by: Johannes Weiner Acked-by: Rik van Riel Acked-by: Mel Gorman Acked-by: Kamezawa Hiroyuki Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: Suleiman Souhlal Cc: Tejun Heo Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 ++ include/trace/events/gfpflags.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux/gfp.h') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f74856e17e48..643c9a6f7f34 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -30,6 +30,7 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u +#define ___GFP_KMEMCG 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u @@ -89,6 +90,7 @@ struct vm_area_struct; #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ +#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ /* diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index d6fd8e5b14b7..1eddbf1557f2 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -34,6 +34,7 @@ {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ {(unsigned long)__GFP_RECLAIMABLE, 
"GFP_RECLAIMABLE"}, \ + {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ -- cgit v1.2.3 From 6a1a0d3b625a4091e7a0eb249aefc6a644385149 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 18 Dec 2012 14:22:00 -0800 Subject: mm: allocate kernel pages to the right memcg When a process tries to allocate a page with the __GFP_KMEMCG flag, the page allocator will call the corresponding memcg functions to validate the allocation. Tasks in the root memcg can always proceed. To avoid adding markers to the page - and a kmem flag that would necessarily follow, as much as doing page_cgroup lookups for no reason, whoever is marking its allocations with __GFP_KMEMCG flag is responsible for telling the page allocator that this is such an allocation at free_pages() time. This is done by the invocation of __free_memcg_kmem_pages() and free_memcg_kmem_pages(). 
Signed-off-by: Glauber Costa Acked-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Kamezawa Hiroyuki Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Johannes Weiner Cc: Suleiman Souhlal Cc: Tejun Heo Cc: Frederic Weisbecker Cc: Greg Thelen Cc: JoonSoo Kim Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 3 +++ mm/page_alloc.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include/linux/gfp.h') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 643c9a6f7f34..0f615eb23d05 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -367,6 +367,9 @@ extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, int cold); extern void free_hot_cold_page_list(struct list_head *list, int cold); +extern void __free_memcg_kmem_pages(struct page *page, unsigned int order); +extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order); + #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 62496edbd8dd..2ad2ad168efe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2612,6 +2612,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; + struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2630,6 +2631,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + /* + * Will only have any effect when __GFP_KMEMCG is set. 
This is + * verified in the (always inline) callee + */ + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + retry_cpuset: cpuset_mems_cookie = get_mems_allowed(); @@ -2665,6 +2673,8 @@ out: if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; + memcg_kmem_commit_charge(page, memcg, order); + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2717,6 +2727,31 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); +/* + * __free_memcg_kmem_pages and free_memcg_kmem_pages will free + * pages allocated with __GFP_KMEMCG. + * + * Those pages are accounted to a particular memcg, embedded in the + * corresponding page_cgroup. To avoid adding a hit in the allocator to search + * for that information only to find out that it is NULL for users who have no + * interest in that whatsoever, we provide these functions. + * + * The caller knows better which flags it relies on. + */ +void __free_memcg_kmem_pages(struct page *page, unsigned int order) +{ + memcg_kmem_uncharge_pages(page, order); + __free_pages(page, order); +} + +void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +{ + if (addr != 0) { + VM_BUG_ON(!virt_addr_valid((void *)addr)); + __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + } +} + static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) { if (addr) { -- cgit v1.2.3