From 8f4fc071b1926d0b20336e2b3f8ab85c94c734c5 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Thu, 14 May 2015 15:16:55 -0700
Subject: gfp: add __GFP_NOACCOUNT

Not all kmem allocations should be accounted to memcg.  The following
patch gives an example when accounting of a certain type of allocations to
memcg can effectively result in a memory leak.  This patch adds the
__GFP_NOACCOUNT flag which if passed to kmalloc and friends will force the
allocation to go through the root cgroup.  It will be used by the next
patch.

Note, since in case of kmemleak enabled each kmalloc implies yet another
allocation from the kmemleak_object cache, we add __GFP_NOACCOUNT to
gfp_kmemleak_mask.

Alternatively, we could introduce a per kmem cache flag disabling
accounting for all allocations of a particular kind, but (a) we would not
be able to bypass accounting for kmalloc then and (b) a kmem cache with
this flag set could not be merged with a kmem cache without this flag,
which would increase the number of global caches and therefore
fragmentation even if the memory cgroup controller is not used.

Despite its generic name, currently __GFP_NOACCOUNT disables accounting
only for kmem allocations while user page allocations are always charged.
To catch abusing of this flag, a warning is issued on an attempt of
passing it to mem_cgroup_try_charge.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: <stable@vger.kernel.org>	[4.0.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/kmemleak.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5405aff5a590..f0fe4f2c1fa7 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -115,7 +115,8 @@
 #define BYTES_PER_POINTER	sizeof(void *)
 
 /* GFP bitmask for kmemleak internal allocations */
-#define gfp_kmemleak_mask(gfp)	(((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
+#define gfp_kmemleak_mask(gfp)	(((gfp) & (GFP_KERNEL | GFP_ATOMIC | \
+					   __GFP_NOACCOUNT)) | \
 				 __GFP_NORETRY | __GFP_NOMEMALLOC | \
 				 __GFP_NOWARN)
 
-- 
cgit v1.2.3


From 1ae7013dfa4281644a9a9591105c24d639216b22 Mon Sep 17 00:00:00 2001
From: Hui Zhu <zhuhui@xiaomi.com>
Date: Thu, 14 May 2015 15:17:04 -0700
Subject: CMA: page_isolation: check buddy before accessing it

I had an issue:

    Unable to handle kernel NULL pointer dereference at virtual address 0000082a
    pgd = cc970000
    [0000082a] *pgd=00000000
    Internal error: Oops: 5 [#1] PREEMPT SMP ARM
    PC is at get_pageblock_flags_group+0x5c/0xb0
    LR is at unset_migratetype_isolate+0x148/0x1b0
    pc : [<c00cc9a0>]    lr : [<c0109874>]    psr: 80000093
    sp : c7029d00  ip : 00000105  fp : c7029d1c
    r10: 00000001  r9 : 0000000a  r8 : 00000004
    r7 : 60000013  r6 : 000000a4  r5 : c0a357e4  r4 : 00000000
    r3 : 00000826  r2 : 00000002  r1 : 00000000  r0 : 0000003f
    Flags: Nzcv  IRQs off  FIQs on  Mode SVC_32  ISA ARM  Segment user
    Control: 10c5387d  Table: 2cb7006a  DAC: 00000015
    Backtrace:
        get_pageblock_flags_group+0x0/0xb0
        unset_migratetype_isolate+0x0/0x1b0
        undo_isolate_page_range+0x0/0xdc
        __alloc_contig_range+0x0/0x34c
        alloc_contig_range+0x0/0x18

This issue is because when calling unset_migratetype_isolate() to unset
a part of CMA memory, it try to access the buddy page to get its status:

		if (order >= pageblock_order) {
			page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
			buddy_idx = __find_buddy_index(page_idx, order);
			buddy = page + (buddy_idx - page_idx);

			if (!is_migrate_isolate_page(buddy)) {

But the begin addr of this part of CMA memory is very close to a part of
memory that is reserved at boot time (not in buddy system).  So add a
check before accessing it.

[akpm@linux-foundation.org: use conventional code layout]
Signed-off-by: Hui Zhu <zhuhui@xiaomi.com>
Suggested-by: Laura Abbott <labbott@redhat.com>
Suggested-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_isolation.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 755a42c76eb4..303c908790ef 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -101,7 +101,8 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 			buddy_idx = __find_buddy_index(page_idx, order);
 			buddy = page + (buddy_idx - page_idx);
 
-			if (!is_migrate_isolate_page(buddy)) {
+			if (pfn_valid_within(page_to_pfn(buddy)) &&
+			    !is_migrate_isolate_page(buddy)) {
 				__isolate_free_page(page, order);
 				kernel_map_pages(page, (1 << order), 1);
 				set_page_refcounted(page);
-- 
cgit v1.2.3


From b0dc2b9bb4ab782115b964310518ee0b17784277 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Thu, 14 May 2015 15:17:09 -0700
Subject: mm, numa: really disable NUMA balancing by default on single node
 machines

NUMA balancing is meant to be disabled by default on UMA machines but
the check is using nr_node_ids (highest node) instead of
num_online_nodes (online nodes).

The consequences are that a UMA machine with a node ID of 1 or higher
will enable NUMA balancing.  This will incur useless overhead due to
minor faults with the impact depending on the workload.  These are the
impact on the stats when running a kernel build on a single node machine
whose node ID happened to be 1:

  			       vanilla     patched
  NUMA base PTE updates          5113158           0
  NUMA huge PMD updates              643           0
  NUMA page range updates        5442374           0
  NUMA hint faults               2109622           0
  NUMA hint local faults         2109622           0
  NUMA hint local percent            100         100
  NUMA pages migrated                  0           0

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: <stable@vger.kernel.org>	[3.8+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ede26291d4aa..747743237d9f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2518,7 +2518,7 @@ static void __init check_numabalancing_enable(void)
 	if (numabalancing_override)
 		set_numabalancing_state(numabalancing_override == 1);
 
-	if (nr_node_ids > 1 && !numabalancing_override) {
+	if (num_online_nodes() > 1 && !numabalancing_override) {
 		pr_info("%s automatic NUMA balancing. "
 			"Configure with numa_balancing= or the "
 			"kernel.numa_balancing sysctl",
-- 
cgit v1.2.3


From aad653a0bc09dd4ebcb5579f9f835bbae9ef2ba3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 19 May 2015 15:58:37 +1000
Subject: block: discard bdi_unregister() in favour of bdi_destroy()

bdi_unregister() now contains very little functionality.

It contains a "WARN_ON" if bdi->dev is NULL.  This warning is of no
real consequence as bdi->dev isn't needed by anything else in the function,
and it triggers if
   blk_cleanup_queue() -> bdi_destroy()
is called before bdi_unregister, which happens since
  Commit: 6cd18e711dd8 ("block: destroy bdi before blockdev is unregistered.")

So this isn't wanted.

It also calls bdi_set_min_ratio().  This needs to be called after
writes through the bdi have all been flushed, and before the bdi is destroyed.
Calling it early is better than calling it late as it frees up a global
resource.

Calling it immediately after bdi_wb_shutdown() in bdi_destroy()
perfectly fits these requirements.

So bdi_unregister() can be discarded with the important content moved to
bdi_destroy(), as can the
  writeback_bdi_unregister
event which is already not used.

Reported-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org (v4.0)
Fixes: c4db59d31e39 ("fs: don't reassign dirty inodes to default_backing_dev_info")
Fixes: 6cd18e711dd8 ("block: destroy bdi before blockdev is unregistered.")
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Nicholas Moulin <nicholas.w.moulin@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/genhd.c                    |  1 -
 include/linux/backing-dev.h      |  1 -
 include/trace/events/writeback.h |  1 -
 mm/backing-dev.c                 | 18 +-----------------
 4 files changed, 1 insertion(+), 20 deletions(-)

(limited to 'mm')

diff --git a/block/genhd.c b/block/genhd.c
index 0a536dc05f3b..666e11b83983 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -653,7 +653,6 @@ void del_gendisk(struct gendisk *disk)
 	disk->flags &= ~GENHD_FL_UP;
 
 	sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
-	bdi_unregister(&disk->queue->backing_dev_info);
 	blk_unregister_queue(disk);
 	blk_unregister_region(disk_devt(disk), disk->minors);
 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index aff923ae8c4b..d87d8eced064 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -116,7 +116,6 @@ __printf(3, 4)
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 		const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
-void bdi_unregister(struct backing_dev_info *bdi);
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 			enum wb_reason reason);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 880dd7437172..c178d13d6f4c 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -250,7 +250,6 @@ DEFINE_EVENT(writeback_class, name, \
 DEFINE_WRITEBACK_EVENT(writeback_nowork);
 DEFINE_WRITEBACK_EVENT(writeback_wake_background);
 DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
-DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
 
 DECLARE_EVENT_CLASS(wbc_class,
 	TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 6dc4580df2af..000e7b3b9896 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -359,23 +359,6 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 	flush_delayed_work(&bdi->wb.dwork);
 }
 
-/*
- * Called when the device behind @bdi has been removed or ejected.
- *
- * We can't really do much here except for reducing the dirty ratio at
- * the moment.  In the future we should be able to set a flag so that
- * the filesystem can handle errors at mark_inode_dirty time instead
- * of only at writeback time.
- */
-void bdi_unregister(struct backing_dev_info *bdi)
-{
-	if (WARN_ON_ONCE(!bdi->dev))
-		return;
-
-	bdi_set_min_ratio(bdi, 0);
-}
-EXPORT_SYMBOL(bdi_unregister);
-
 static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 {
 	memset(wb, 0, sizeof(*wb));
@@ -443,6 +426,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
 	int i;
 
 	bdi_wb_shutdown(bdi);
+	bdi_set_min_ratio(bdi, 0);
 
 	WARN_ON(!list_empty(&bdi->work_list));
 	WARN_ON(delayed_work_pending(&bdi->wb.dwork));
-- 
cgit v1.2.3