From d2709c7ce4c513ab7f4ca9a106a930621811f2d3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 19 Nov 2012 22:21:03 +0000
Subject: perf: Make perf build for x86 with UAPI disintegration applied

Make perf build for x86 once the UAPI disintegration patches for that arch
have been applied by adding the appropriate -I flags - in the right order -
and then converting some #includes that use ../.. notation to find main kernel
headerfiles to use <asm/foo.h> and <linux/foo.h> instead.

Note that -Iarch/foo/include/uapi is present _before_ -Iarch/foo/include.
This makes sure we get the userspace version of the pt_regs struct.  Ideally,
we wouldn't have the latter -I flag at all, but unfortunately we want
asm/svm.h and asm/vmx.h in builtin-kvm.c and these aren't part of the UAPI -
at least not for x86.  I wonder if the bits outside of the __KERNEL__ guards
*should* be transferred there.

I note also that perf seems to do its dependency handling manually by listing
all the header files it might want to use in LIB_H in the Makefile.  Can this
be changed to use -MD?

Note that to do make this work, we need to export and UAPI disintegrate
linux/hw_breakpoint.h, which I think should've been exported previously so that
perf can access the bits.  We have to do this in the same patch to maintain
bisectability.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 include/linux/hw_breakpoint.h | 31 +------------------------------
 1 file changed, 1 insertion(+), 30 deletions(-)

(limited to 'include/linux')
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 6ae9c631a1be..0464c85e63fd 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -1,35 +1,8 @@
 #ifndef _LINUX_HW_BREAKPOINT_H
 #define _LINUX_HW_BREAKPOINT_H
 
-enum {
-	HW_BREAKPOINT_LEN_1 = 1,
-	HW_BREAKPOINT_LEN_2 = 2,
-	HW_BREAKPOINT_LEN_4 = 4,
-	HW_BREAKPOINT_LEN_8 = 8,
-};
-
-enum {
-	HW_BREAKPOINT_EMPTY	= 0,
-	HW_BREAKPOINT_R		= 1,
-	HW_BREAKPOINT_W		= 2,
-	HW_BREAKPOINT_RW	= HW_BREAKPOINT_R | HW_BREAKPOINT_W,
-	HW_BREAKPOINT_X		= 4,
-	HW_BREAKPOINT_INVALID   = HW_BREAKPOINT_RW | HW_BREAKPOINT_X,
-};
-
-enum bp_type_idx {
-	TYPE_INST 	= 0,
-#ifdef CONFIG_HAVE_MIXED_BREAKPOINTS_REGS
-	TYPE_DATA	= 0,
-#else
-	TYPE_DATA	= 1,
-#endif
-	TYPE_MAX
-};
-
-#ifdef __KERNEL__
-
 #include <linux/perf_event.h>
+#include <uapi/linux/hw_breakpoint.h>
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 
@@ -151,6 +124,4 @@ static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp)
 }
 
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
-#endif /* __KERNEL__ */
-
 #endif /* _LINUX_HW_BREAKPOINT_H */
-- 
cgit v1.2.3


From c5782e9f5a535af09d7834693a52afdbcc6e5f3f Mon Sep 17 00:00:00 2001
From: Tushar Behera <tushar.behera@linaro.org>
Date: Mon, 26 Nov 2012 16:29:38 -0800
Subject: include/linux/bug.h: fix sparse warning related to
 BUILD_BUG_ON_INVALID

Commit baf05aa9271b ("bug: introduce BUILD_BUG_ON_INVALID() macro")
introduces this macro only when _CHECKER_ is not defined.  Define a
silent macro in the else condition to fix following sparse warning:

  mm/filemap.c:395:9: error: undefined identifier 'BUILD_BUG_ON_INVALID'
  mm/filemap.c:396:9: error: undefined identifier 'BUILD_BUG_ON_INVALID'
  mm/filemap.c:397:9: error: undefined identifier 'BUILD_BUG_ON_INVALID'
  include/linux/mm.h:419:9: error: undefined identifier 'BUILD_BUG_ON_INVALID'
  include/linux/mm.h:419:9: error: not a function <noident>

Signed-off-by: Tushar Behera <tushar.behera@linaro.org>
Acked-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bug.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/bug.h b/include/linux/bug.h
index aaac4bba6f5c..b1cf40de847e 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -15,6 +15,7 @@ struct pt_regs;
 #define BUILD_BUG_ON_NOT_POWER_OF_2(n)
 #define BUILD_BUG_ON_ZERO(e) (0)
 #define BUILD_BUG_ON_NULL(e) ((void*)0)
+#define BUILD_BUG_ON_INVALID(e) (0)
 #define BUILD_BUG_ON(condition)
 #define BUILD_BUG() (0)
 #else /* __CHECKER__ */
-- 
cgit v1.2.3


From 82b212f40059bffd6808c07266a942d444d5558a Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Mon, 26 Nov 2012 16:29:45 -0800
Subject: Revert "mm: remove __GFP_NO_KSWAPD"

With "mm: vmscan: scale number of pages reclaimed by reclaim/compaction
based on failures" reverted, Zdenek Kabelac reported the following

  Hmm,  so it's just took longer to hit the problem and observe
  kswapd0 spinning on my CPU again - it's not as endless like before -
  but still it easily eats minutes - it helps to	turn off  Firefox
  or TB  (memory hungry apps) so kswapd0 stops soon - and restart
  those apps again.  (And I still have like >1GB of cached memory)

  kswapd0         R  running task        0    30      2 0x00000000
  Call Trace:
    preempt_schedule+0x42/0x60
    _raw_spin_unlock+0x55/0x60
    put_super+0x31/0x40
    drop_super+0x22/0x30
    prune_super+0x149/0x1b0
    shrink_slab+0xba/0x510

The sysrq+m indicates the system has no swap so it'll never reclaim
anonymous pages as part of reclaim/compaction.  That is one part of the
problem but not the root cause as file-backed pages could also be
reclaimed.

The likely underlying problem is that kswapd is woken up or kept awake
for each THP allocation request in the page allocator slow path.

If compaction fails for the requesting process then compaction will be
deferred for a time and direct reclaim is avoided.  However, if there
are a storm of THP requests that are simply rejected, it will still be
the the case that kswapd is awake for a prolonged period of time as
pgdat->kswapd_max_order is updated each time.  This is noticed by the
main kswapd() loop and it will not call kswapd_try_to_sleep().  Instead
it will loopp, shrinking a small number of pages and calling
shrink_slab() on each iteration.

The temptation is to supply a patch that checks if kswapd was woken for
THP and if so ignore pgdat->kswapd_max_order but it'll be a hack and not
backed up by proper testing.  As 3.7 is very close to release and this
is not a bug we should release with, a safer path is to revert "mm:
remove __GFP_NO_KSWAPD" for now and revisit it with the view to ironing
out the balance_pgdat() logic in general.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Zdenek Kabelac <zkabelac@redhat.com>
Cc: Seth Jennings <sjenning@linux.vnet.ibm.com>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: Jiri Slaby <jirislaby@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Robert Jennings <rcj@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mtd/mtdcore.c           | 6 ++++--
 include/linux/gfp.h             | 5 ++++-
 include/trace/events/gfpflags.h | 1 +
 mm/page_alloc.c                 | 7 ++++---
 4 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 374c46dff7dd..ec794a72975d 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1077,7 +1077,8 @@ EXPORT_SYMBOL_GPL(mtd_writev);
  * until the request succeeds or until the allocation size falls below
  * the system page size. This attempts to make sure it does not adversely
  * impact system performance, so when allocating more than one page, we
- * ask the memory allocator to avoid re-trying.
+ * ask the memory allocator to avoid re-trying, swapping, writing back
+ * or performing I/O.
  *
  * Note, this function also makes sure that the allocated buffer is aligned to
  * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value.
@@ -1091,7 +1092,8 @@ EXPORT_SYMBOL_GPL(mtd_writev);
  */
 void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size)
 {
-	gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY;
+	gfp_t flags = __GFP_NOWARN | __GFP_WAIT |
+		       __GFP_NORETRY | __GFP_NO_KSWAPD;
 	size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE);
 	void *kbuf;
 
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 02c1c9710be0..d0a79678f169 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -31,6 +31,7 @@ struct vm_area_struct;
 #define ___GFP_THISNODE		0x40000u
 #define ___GFP_RECLAIMABLE	0x80000u
 #define ___GFP_NOTRACK		0x200000u
+#define ___GFP_NO_KSWAPD	0x400000u
 #define ___GFP_OTHER_NODE	0x800000u
 #define ___GFP_WRITE		0x1000000u
 
@@ -85,6 +86,7 @@ struct vm_area_struct;
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
 #define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
 
+#define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
 #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)	/* Allocator intends to dirty page */
 
@@ -114,7 +116,8 @@ struct vm_area_struct;
 				 __GFP_MOVABLE)
 #define GFP_IOFS	(__GFP_IO | __GFP_FS)
 #define GFP_TRANSHUGE	(GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
-			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN)
+			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
+			 __GFP_NO_KSWAPD)
 
 #ifdef CONFIG_NUMA
 #define GFP_THISNODE	(__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index 9391706e9254..d6fd8e5b14b7 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -36,6 +36,7 @@
 	{(unsigned long)__GFP_RECLAIMABLE,	"GFP_RECLAIMABLE"},	\
 	{(unsigned long)__GFP_MOVABLE,		"GFP_MOVABLE"},		\
 	{(unsigned long)__GFP_NOTRACK,		"GFP_NOTRACK"},		\
+	{(unsigned long)__GFP_NO_KSWAPD,	"GFP_NO_KSWAPD"},	\
 	{(unsigned long)__GFP_OTHER_NODE,	"GFP_OTHER_NODE"}	\
 	) : "GFP_NOWAIT"
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bcb72c6e2b2d..92871579cbee 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2416,8 +2416,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 restart:
-	wake_all_kswapd(order, zonelist, high_zoneidx,
-					zone_idx(preferred_zone));
+	if (!(gfp_mask & __GFP_NO_KSWAPD))
+		wake_all_kswapd(order, zonelist, high_zoneidx,
+						zone_idx(preferred_zone));
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -2494,7 +2495,7 @@ rebalance:
 	 * system then fail the allocation instead of entering direct reclaim.
 	 */
 	if ((deferred_compaction || contended_compaction) &&
-	    (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
+						(gfp_mask & __GFP_NO_KSWAPD))
 		goto nopage;
 
 	/* Try direct reclaim and then allocating */
-- 
cgit v1.2.3


From 4b05a1c74d1cfae37cf6ff293ee928350f031418 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 27 Nov 2012 22:59:52 -0500
Subject: percpu-rwsem: use synchronize_sched_expedited

Use synchronize_sched_expedited() instead of synchronize_sched()
to improve mount speed.

This patch improves mount time from 0.500s to 0.013s for Jeff's
test-case.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Reported-and-tested-by: Jeff Chua <jeff.chua.linux@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/percpu-rwsem.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 250a4acddb2b..bd1e86071e57 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -13,7 +13,7 @@ struct percpu_rw_semaphore {
 };
 
 #define light_mb()	barrier()
-#define heavy_mb()	synchronize_sched()
+#define heavy_mb()	synchronize_sched_expedited()
 
 static inline void percpu_down_read(struct percpu_rw_semaphore *p)
 {
@@ -51,7 +51,7 @@ static inline void percpu_down_write(struct percpu_rw_semaphore *p)
 {
 	mutex_lock(&p->mtx);
 	p->locked = true;
-	synchronize_sched(); /* make sure that all readers exit the rcu_read_lock_sched region */
+	synchronize_sched_expedited(); /* make sure that all readers exit the rcu_read_lock_sched region */
 	while (__percpu_count(p->counters))
 		msleep(1);
 	heavy_mb(); /* C, between read of p->counter and write to data, paired with B */
-- 
cgit v1.2.3


From 1e8b33328a5407b447ff80953655a47014a6dcb9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 29 Nov 2012 10:49:50 -0800
Subject: blockdev: remove bd_block_size_semaphore again

This reverts the block-device direct access code to the previous
unlocked code, now that fs/buffer.c no longer needs external locking.

With this, fs/block_dev.c is back to the original version, apart from a
whitespace cleanup that I didn't want to revert.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/raw.c |   2 +-
 fs/block_dev.c     | 105 ++---------------------------------------------------
 include/linux/fs.h |   4 --
 3 files changed, 5 insertions(+), 106 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index 0bb207eaef2f..54a3a6d09819 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -285,7 +285,7 @@ static long raw_ctl_compat_ioctl(struct file *file, unsigned int cmd,
 
 static const struct file_operations raw_fops = {
 	.read		= do_sync_read,
-	.aio_read	= blkdev_aio_read,
+	.aio_read	= generic_file_aio_read,
 	.write		= do_sync_write,
 	.aio_write	= blkdev_aio_write,
 	.fsync		= blkdev_fsync,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1a1e5e3b1eaf..47a949d8a07e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -116,8 +116,6 @@ EXPORT_SYMBOL(invalidate_bdev);
 
 int set_blocksize(struct block_device *bdev, int size)
 {
-	struct address_space *mapping;
-
 	/* Size must be a power of two, and between 512 and PAGE_SIZE */
 	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
 		return -EINVAL;
@@ -126,19 +124,6 @@ int set_blocksize(struct block_device *bdev, int size)
 	if (size < bdev_logical_block_size(bdev))
 		return -EINVAL;
 
-	/* Prevent starting I/O or mapping the device */
-	percpu_down_write(&bdev->bd_block_size_semaphore);
-
-	/* Check that the block device is not memory mapped */
-	mapping = bdev->bd_inode->i_mapping;
-	mutex_lock(&mapping->i_mmap_mutex);
-	if (mapping_mapped(mapping)) {
-		mutex_unlock(&mapping->i_mmap_mutex);
-		percpu_up_write(&bdev->bd_block_size_semaphore);
-		return -EBUSY;
-	}
-	mutex_unlock(&mapping->i_mmap_mutex);
-
 	/* Don't change the size if it is same as current */
 	if (bdev->bd_block_size != size) {
 		sync_blockdev(bdev);
@@ -146,9 +131,6 @@ int set_blocksize(struct block_device *bdev, int size)
 		bdev->bd_inode->i_blkbits = blksize_bits(size);
 		kill_bdev(bdev);
 	}
-
-	percpu_up_write(&bdev->bd_block_size_semaphore);
-
 	return 0;
 }
 
@@ -459,12 +441,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
 	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
-
-	if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
-		kmem_cache_free(bdev_cachep, ei);
-		return NULL;
-	}
-
 	return &ei->vfs_inode;
 }
 
@@ -473,8 +449,6 @@ static void bdev_i_callback(struct rcu_head *head)
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct bdev_inode *bdi = BDEV_I(inode);
 
-	percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
-
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
@@ -1593,22 +1567,6 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
-ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
-			unsigned long nr_segs, loff_t pos)
-{
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(blkdev_aio_read);
-
 /*
  * Write data to the block device.  Only intended for the block device itself
  * and the raw driver which basically is a fake block device.
@@ -1620,16 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			 unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	blk_start_plug(&plug);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
@@ -1638,62 +1592,11 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
 	blk_finish_plug(&plug);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);
 
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	int ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_mmap(file, vma);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-
-static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos,
-				  struct pipe_inode_info *pipe, size_t len,
-				  unsigned int flags)
-{
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_splice_read(file, ppos, pipe, len, flags);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-
-static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe,
-				   struct file *file, loff_t *ppos, size_t len,
-				   unsigned int flags)
-{
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_splice_write(pipe, file, ppos, len, flags);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-
-
 /*
  * Try to release a page associated with block device when the system
  * is under memory pressure.
@@ -1724,16 +1627,16 @@ const struct file_operations def_blk_fops = {
 	.llseek		= block_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
-  	.aio_read	= blkdev_aio_read,
+	.aio_read	= generic_file_aio_read,
 	.aio_write	= blkdev_aio_write,
-	.mmap		= blkdev_mmap,
+	.mmap		= generic_file_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
 #endif
-	.splice_read	= blkdev_splice_read,
-	.splice_write	= blkdev_splice_write,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
 };
 
 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b33cfc97b9ca..44f288e9726d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -462,8 +462,6 @@ struct block_device {
 	int			bd_fsfreeze_count;
 	/* Mutex for freeze */
 	struct mutex		bd_fsfreeze_mutex;
-	/* A semaphore that prevents I/O while block size is being changed */
-	struct percpu_rw_semaphore	bd_block_size_semaphore;
 };
 
 /*
@@ -2379,8 +2377,6 @@ extern int generic_segment_checks(const struct iovec *iov,
 		unsigned long *nr_segs, size_t *count, int access_flags);
 
 /* fs/block_dev.c */
-extern ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
-			       unsigned long nr_segs, loff_t pos);
 extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos);
 extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
-- 
cgit v1.2.3


From bbec0270bdd887f96377065ee38b8848b5afa395 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 29 Nov 2012 12:31:52 -0800
Subject: blkdev_max_block: make private to fs/buffer.c

We really don't want to look at the block size for the raw block device
accesses in fs/block-dev.c, because it may be changing from under us.
So get rid of the max_block logic entirely, since the caller should
already have done it anyway.

That leaves the only user of this function in fs/buffer.c, so move the
whole function there and make it static.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c     | 55 +-----------------------------------------------------
 fs/buffer.c        | 14 +++++++++++++-
 include/linux/fs.h |  1 -
 3 files changed, 14 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 47a949d8a07e..a1e09b4fe1ba 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -70,19 +70,6 @@ static void bdev_inode_switch_bdi(struct inode *inode,
 	spin_unlock(&dst->wb.list_lock);
 }
 
-sector_t blkdev_max_block(struct block_device *bdev)
-{
-	sector_t retval = ~((sector_t)0);
-	loff_t sz = i_size_read(bdev->bd_inode);
-
-	if (sz) {
-		unsigned int size = block_size(bdev);
-		unsigned int sizebits = blksize_bits(size);
-		retval = (sz >> sizebits);
-	}
-	return retval;
-}
-
 /* Kill _all_ buffers and pagecache , dirty or not.. */
 void kill_bdev(struct block_device *bdev)
 {
@@ -163,52 +150,12 @@ static int
 blkdev_get_block(struct inode *inode, sector_t iblock,
 		struct buffer_head *bh, int create)
 {
-	if (iblock >= blkdev_max_block(I_BDEV(inode))) {
-		if (create)
-			return -EIO;
-
-		/*
-		 * for reads, we're just trying to fill a partial page.
-		 * return a hole, they will have to call get_block again
-		 * before they can fill it, and they will get -EIO at that
-		 * time
-		 */
-		return 0;
-	}
 	bh->b_bdev = I_BDEV(inode);
 	bh->b_blocknr = iblock;
 	set_buffer_mapped(bh);
 	return 0;
 }
 
-static int
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
-		struct buffer_head *bh, int create)
-{
-	sector_t end_block = blkdev_max_block(I_BDEV(inode));
-	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
-
-	if ((iblock + max_blocks) > end_block) {
-		max_blocks = end_block - iblock;
-		if ((long)max_blocks <= 0) {
-			if (create)
-				return -EIO;	/* write fully beyond EOF */
-			/*
-			 * It is a read which is fully beyond EOF.  We return
-			 * a !buffer_mapped buffer
-			 */
-			max_blocks = 0;
-		}
-	}
-
-	bh->b_bdev = I_BDEV(inode);
-	bh->b_blocknr = iblock;
-	bh->b_size = max_blocks << inode->i_blkbits;
-	if (max_blocks)
-		set_buffer_mapped(bh);
-	return 0;
-}
-
 static ssize_t
 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs)
@@ -217,7 +164,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct inode *inode = file->f_mapping->host;
 
 	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
-				    nr_segs, blkdev_get_blocks, NULL, NULL, 0);
+				    nr_segs, blkdev_get_block, NULL, NULL, 0);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
diff --git a/fs/buffer.c b/fs/buffer.c
index 28a74ff5324b..3586fb05c8ce 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -911,6 +911,18 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
 	attach_page_buffers(page, head);
 }
 
+static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
+{
+	sector_t retval = ~((sector_t)0);
+	loff_t sz = i_size_read(bdev->bd_inode);
+
+	if (sz) {
+		unsigned int sizebits = blksize_bits(size);
+		retval = (sz >> sizebits);
+	}
+	return retval;
+}
+
 /*
  * Initialise the state of a blockdev page's buffers.
  */ 
@@ -921,7 +933,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
 	struct buffer_head *head = page_buffers(page);
 	struct buffer_head *bh = head;
 	int uptodate = PageUptodate(page);
-	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode));
+	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
 
 	do {
 		if (!buffer_mapped(bh)) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 44f288e9726d..75fe9a134803 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2047,7 +2047,6 @@ extern void unregister_blkdev(unsigned int, const char *);
 extern struct block_device *bdget(dev_t);
 extern struct block_device *bdgrab(struct block_device *bdev);
 extern void bd_set_size(struct block_device *, loff_t size);
-extern sector_t blkdev_max_block(struct block_device *bdev);
 extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
 extern void invalidate_bdev(struct block_device *);
-- 
cgit v1.2.3


From a50915394f1fc02c2861d3b7ce7014788aa5066e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 29 Nov 2012 13:54:27 -0800
Subject: revert "Revert "mm: remove __GFP_NO_KSWAPD""

It apepars that this patch was innocent, and we hope that "mm: avoid
waking kswapd for THP allocations when compaction is deferred or
contended" will fix the final kswapd-spinning cause.

Cc: Zdenek Kabelac <zkabelac@redhat.com>
Cc: Seth Jennings <sjenning@linux.vnet.ibm.com>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: Jiri Slaby <jirislaby@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Robert Jennings <rcj@linux.vnet.ibm.com>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mtd/mtdcore.c           |  6 ++----
 include/linux/gfp.h             | 13 +++++--------
 include/trace/events/gfpflags.h |  1 -
 mm/page_alloc.c                 |  7 +++----
 4 files changed, 10 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index ec794a72975d..374c46dff7dd 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1077,8 +1077,7 @@ EXPORT_SYMBOL_GPL(mtd_writev);
  * until the request succeeds or until the allocation size falls below
  * the system page size. This attempts to make sure it does not adversely
  * impact system performance, so when allocating more than one page, we
- * ask the memory allocator to avoid re-trying, swapping, writing back
- * or performing I/O.
+ * ask the memory allocator to avoid re-trying.
  *
  * Note, this function also makes sure that the allocated buffer is aligned to
  * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value.
@@ -1092,8 +1091,7 @@ EXPORT_SYMBOL_GPL(mtd_writev);
  */
 void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size)
 {
-	gfp_t flags = __GFP_NOWARN | __GFP_WAIT |
-		       __GFP_NORETRY | __GFP_NO_KSWAPD;
+	gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY;
 	size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE);
 	void *kbuf;
 
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index d0a79678f169..76e1aa206f57 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -30,10 +30,9 @@ struct vm_area_struct;
 #define ___GFP_HARDWALL		0x20000u
 #define ___GFP_THISNODE		0x40000u
 #define ___GFP_RECLAIMABLE	0x80000u
-#define ___GFP_NOTRACK		0x200000u
-#define ___GFP_NO_KSWAPD	0x400000u
-#define ___GFP_OTHER_NODE	0x800000u
-#define ___GFP_WRITE		0x1000000u
+#define ___GFP_NOTRACK		0x100000u
+#define ___GFP_OTHER_NODE	0x200000u
+#define ___GFP_WRITE		0x400000u
 
 /*
  * GFP bitmasks..
@@ -86,7 +85,6 @@ struct vm_area_struct;
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
 #define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
 
-#define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
 #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)	/* Allocator intends to dirty page */
 
@@ -96,7 +94,7 @@ struct vm_area_struct;
  */
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
 
-#define __GFP_BITS_SHIFT 25	/* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 23	/* Room for N __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /* This equals 0, but use constants in case they ever change */
@@ -116,8 +114,7 @@ struct vm_area_struct;
 				 __GFP_MOVABLE)
 #define GFP_IOFS	(__GFP_IO | __GFP_FS)
 #define GFP_TRANSHUGE	(GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
-			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
-			 __GFP_NO_KSWAPD)
+			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN)
 
 #ifdef CONFIG_NUMA
 #define GFP_THISNODE	(__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index d6fd8e5b14b7..9391706e9254 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -36,7 +36,6 @@
 	{(unsigned long)__GFP_RECLAIMABLE,	"GFP_RECLAIMABLE"},	\
 	{(unsigned long)__GFP_MOVABLE,		"GFP_MOVABLE"},		\
 	{(unsigned long)__GFP_NOTRACK,		"GFP_NOTRACK"},		\
-	{(unsigned long)__GFP_NO_KSWAPD,	"GFP_NO_KSWAPD"},	\
 	{(unsigned long)__GFP_OTHER_NODE,	"GFP_OTHER_NODE"}	\
 	) : "GFP_NOWAIT"
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e208f0ad68c..8193809f3de0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2416,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 restart:
-	if (!(gfp_mask & __GFP_NO_KSWAPD))
-		wake_all_kswapd(order, zonelist, high_zoneidx,
-						zone_idx(preferred_zone));
+	wake_all_kswapd(order, zonelist, high_zoneidx,
+					zone_idx(preferred_zone));
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -2495,7 +2494,7 @@ rebalance:
 	 * system then fail the allocation instead of entering direct reclaim.
 	 */
 	if ((deferred_compaction || contended_compaction) &&
-						(gfp_mask & __GFP_NO_KSWAPD))
+	    (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
 		goto nopage;
 
 	/* Try direct reclaim and then allocating */
-- 
cgit v1.2.3


From cbdbf2abb7844548a7d7a6a2ae7af6b6fbcea401 Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 23 Nov 2012 12:08:15 +0000
Subject: linux/kernel.h: define SYMBOL_PREFIX

Define SYMBOL_PREFIX to be the same as CONFIG_SYMBOL_PREFIX if set by
the architecture, or "" otherwise. This avoids the need for ugly #ifdefs
whenever symbols are referenced in asm blocks.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Joe Perches <joe@perches.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/kernel.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index a123b13b70fd..7d8dfc7392f1 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -701,6 +701,13 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #define COMPACTION_BUILD 0
 #endif
 
+/* This helps us to avoid #ifdef CONFIG_SYMBOL_PREFIX */
+#ifdef CONFIG_SYMBOL_PREFIX
+#define SYMBOL_PREFIX CONFIG_SYMBOL_PREFIX
+#else
+#define SYMBOL_PREFIX ""
+#endif
+
 /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
-- 
cgit v1.2.3


From 18a2f371f5edf41810f6469cb9be39931ef9deb9 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 5 Dec 2012 14:01:41 -0800
Subject: tmpfs: fix shared mempolicy leak

This fixes a regression in 3.7-rc, which has since gone into stable.

Commit 00442ad04a5e ("mempolicy: fix a memory corruption by refcount
imbalance in alloc_pages_vma()") changed get_vma_policy() to raise the
refcount on a shmem shared mempolicy; whereas shmem_alloc_page() went
on expecting alloc_page_vma() to drop the refcount it had acquired.
This deserves a rework: but for now fix the leak in shmem_alloc_page().

Hugh: shmem_swapin() did not need a fix, but surely it's clearer to use
the same refcounting there as in shmem_alloc_page(), delete its onstack
mempolicy, and the strange mpol_cond_copy() and __mpol_cond_copy() -
those were invented to let swapin_readahead() make an unknown number of
calls to alloc_pages_vma() with one mempolicy; but since 00442ad04a5e,
alloc_pages_vma() has kept refcount in balance, so now no problem.

Reported-and-tested-by: Tommi Rantala <tt.rantala@gmail.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h | 16 ----------------
 mm/mempolicy.c            | 22 ----------------------
 mm/shmem.c                | 26 ++++++++++++++++----------
 3 files changed, 16 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index e5ccb9ddd90e..dbd212723b74 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -82,16 +82,6 @@ static inline void mpol_cond_put(struct mempolicy *pol)
 		__mpol_put(pol);
 }
 
-extern struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
-					  struct mempolicy *frompol);
-static inline struct mempolicy *mpol_cond_copy(struct mempolicy *tompol,
-						struct mempolicy *frompol)
-{
-	if (!frompol)
-		return frompol;
-	return __mpol_cond_copy(tompol, frompol);
-}
-
 extern struct mempolicy *__mpol_dup(struct mempolicy *pol);
 static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
 {
@@ -215,12 +205,6 @@ static inline void mpol_cond_put(struct mempolicy *pol)
 {
 }
 
-static inline struct mempolicy *mpol_cond_copy(struct mempolicy *to,
-						struct mempolicy *from)
-{
-	return from;
-}
-
 static inline void mpol_get(struct mempolicy *pol)
 {
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d04a8a54c294..4ea600da8940 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2037,28 +2037,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 	return new;
 }
 
-/*
- * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
- * eliminate the * MPOL_F_* flags that require conditional ref and
- * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
- * after return.  Use the returned value.
- *
- * Allows use of a mempolicy for, e.g., multiple allocations with a single
- * policy lookup, even if the policy needs/has extra ref on lookup.
- * shmem_readahead needs this.
- */
-struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
-						struct mempolicy *frompol)
-{
-	if (!mpol_needs_cond_ref(frompol))
-		return frompol;
-
-	*tompol = *frompol;
-	tompol->flags &= ~MPOL_F_SHARED;	/* copy doesn't need unref */
-	__mpol_put(frompol);
-	return tompol;
-}
-
 /* Slow path of a mempolicy comparison */
 bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
diff --git a/mm/shmem.c b/mm/shmem.c
index 89341b658bd0..50c5b8f3a359 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -910,25 +910,29 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 			struct shmem_inode_info *info, pgoff_t index)
 {
-	struct mempolicy mpol, *spol;
 	struct vm_area_struct pvma;
-
-	spol = mpol_cond_copy(&mpol,
-			mpol_shared_policy_lookup(&info->policy, index));
+	struct page *page;
 
 	/* Create a pseudo vma that just contains the policy */
 	pvma.vm_start = 0;
 	/* Bias interleave by inode number to distribute better across nodes */
 	pvma.vm_pgoff = index + info->vfs_inode.i_ino;
 	pvma.vm_ops = NULL;
-	pvma.vm_policy = spol;
-	return swapin_readahead(swap, gfp, &pvma, 0);
+	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
+
+	page = swapin_readahead(swap, gfp, &pvma, 0);
+
+	/* Drop reference taken by mpol_shared_policy_lookup() */
+	mpol_cond_put(pvma.vm_policy);
+
+	return page;
 }
 
 static struct page *shmem_alloc_page(gfp_t gfp,
 			struct shmem_inode_info *info, pgoff_t index)
 {
 	struct vm_area_struct pvma;
+	struct page *page;
 
 	/* Create a pseudo vma that just contains the policy */
 	pvma.vm_start = 0;
@@ -937,10 +941,12 @@ static struct page *shmem_alloc_page(gfp_t gfp,
 	pvma.vm_ops = NULL;
 	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
 
-	/*
-	 * alloc_page_vma() will drop the shared policy reference
-	 */
-	return alloc_page_vma(gfp, &pvma, 0);
+	page = alloc_page_vma(gfp, &pvma, 0);
+
+	/* Drop reference taken by mpol_shared_policy_lookup() */
+	mpol_cond_put(pvma.vm_policy);
+
+	return page;
 }
 #else /* !CONFIG_NUMA */
 #ifdef CONFIG_TMPFS
-- 
cgit v1.2.3


From c3c7c254b2e8cd99b0adf288c2a1bddacd7ba255 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 6 Dec 2012 13:54:59 +0000
Subject: net: gro: fix possible panic in skb_gro_receive()

commit 2e71a6f8084e (net: gro: selective flush of packets) added
a bug for skbs using frag_list. This part of the GRO stack is rarely
used, as it needs skb not using a page fragment for their skb->head.

Most drivers do use a page fragment, but some of them use GFP_KERNEL
allocations for the initial fill of their RX ring buffer.

napi_gro_flush() overwrite skb->prev that was used for these skb to
point to the last skb in frag_list.

Fix this using a separate field in struct napi_gro_cb to point to the
last fragment.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 3 +++
 net/core/dev.c            | 2 ++
 net/core/skbuff.c         | 6 +++---
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f8eda0276f03..a848ffc327f4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1488,6 +1488,9 @@ struct napi_gro_cb {
 
 	/* Used in ipv6_gro_receive() */
 	int	proto;
+
+	/* used in skb_gro_receive() slow path */
+	struct sk_buff *last;
 };
 
 #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
diff --git a/net/core/dev.c b/net/core/dev.c
index c0946cb2b354..e5942bf45a6d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3451,6 +3451,8 @@ static int napi_gro_complete(struct sk_buff *skb)
 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
 	int err = -ENOENT;
 
+	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
+
 	if (NAPI_GRO_CB(skb)->count == 1) {
 		skb_shinfo(skb)->gso_size = 0;
 		goto out;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4007c1437fda..3f0636cd76cd 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3004,7 +3004,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 	skb_shinfo(nskb)->gso_size = pinfo->gso_size;
 	pinfo->gso_size = 0;
 	skb_header_release(p);
-	nskb->prev = p;
+	NAPI_GRO_CB(nskb)->last = p;
 
 	nskb->data_len += p->len;
 	nskb->truesize += p->truesize;
@@ -3030,8 +3030,8 @@ merge:
 
 	__skb_pull(skb, offset);
 
-	p->prev->next = skb;
-	p->prev = skb;
+	NAPI_GRO_CB(p)->last->next = skb;
+	NAPI_GRO_CB(p)->last = skb;
 	skb_header_release(skb);
 
 done:
-- 
cgit v1.2.3


From caf491916b1c1e939a2c7575efb7a77f11fc9bdf Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 10 Dec 2012 10:51:16 -0800
Subject: Revert "revert "Revert "mm: remove __GFP_NO_KSWAPD""" and associated
 damage

This reverts commits a50915394f1fc02c2861d3b7ce7014788aa5066e and
d7c3b937bdf45f0b844400b7bf6fd3ed50bac604.

This is a revert of a revert of a revert.  In addition, it reverts the
even older i915 change to stop using the __GFP_NO_KSWAPD flag due to the
original commits in linux-next.

It turns out that the original patch really was bogus, and that the
original revert was the correct thing to do after all.  We thought we
had fixed the problem, and then reverted the revert, but the problem
really is fundamental: waking up kswapd simply isn't the right thing to
do, and direct reclaim sometimes simply _is_ the right thing to do.

When certain allocations fail, we simply should try some direct reclaim,
and if that fails, fail the allocation.  That's the right thing to do
for THP allocations, which can easily fail, and the GPU allocations want
to do that too.

So starting kswapd is sometimes simply wrong, and removing the flag that
said "don't start kswapd" was a mistake.  Let's hope we never revisit
this mistake again - and certainly not this many times ;)

Acked-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpu/drm/i915/i915_gem.c |  6 +++---
 drivers/mtd/mtdcore.c           |  6 ++++--
 include/linux/gfp.h             | 13 ++++++++-----
 include/trace/events/gfpflags.h |  1 +
 mm/page_alloc.c                 |  7 ++++---
 5 files changed, 20 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 107f09befe92..9b285da4449b 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1796,7 +1796,7 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
 	 */
 	mapping = obj->base.filp->f_path.dentry->d_inode->i_mapping;
 	gfp = mapping_gfp_mask(mapping);
-	gfp |= __GFP_NORETRY | __GFP_NOWARN;
+	gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD;
 	gfp &= ~(__GFP_IO | __GFP_WAIT);
 	for_each_sg(st->sgl, sg, page_count, i) {
 		page = shmem_read_mapping_page_gfp(mapping, i, gfp);
@@ -1809,7 +1809,7 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
 			 * our own buffer, now let the real VM do its job and
 			 * go down in flames if truly OOM.
 			 */
-			gfp &= ~(__GFP_NORETRY | __GFP_NOWARN);
+			gfp &= ~(__GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD);
 			gfp |= __GFP_IO | __GFP_WAIT;
 
 			i915_gem_shrink_all(dev_priv);
@@ -1817,7 +1817,7 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
 			if (IS_ERR(page))
 				goto err_pages;
 
-			gfp |= __GFP_NORETRY | __GFP_NOWARN;
+			gfp |= __GFP_NORETRY | __GFP_NOWARN | __GFP_NO_KSWAPD;
 			gfp &= ~(__GFP_IO | __GFP_WAIT);
 		}
 
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 374c46dff7dd..ec794a72975d 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1077,7 +1077,8 @@ EXPORT_SYMBOL_GPL(mtd_writev);
  * until the request succeeds or until the allocation size falls below
  * the system page size. This attempts to make sure it does not adversely
  * impact system performance, so when allocating more than one page, we
- * ask the memory allocator to avoid re-trying.
+ * ask the memory allocator to avoid re-trying, swapping, writing back
+ * or performing I/O.
  *
  * Note, this function also makes sure that the allocated buffer is aligned to
  * the MTD device's min. I/O unit, i.e. the "mtd->writesize" value.
@@ -1091,7 +1092,8 @@ EXPORT_SYMBOL_GPL(mtd_writev);
  */
 void *mtd_kmalloc_up_to(const struct mtd_info *mtd, size_t *size)
 {
-	gfp_t flags = __GFP_NOWARN | __GFP_WAIT | __GFP_NORETRY;
+	gfp_t flags = __GFP_NOWARN | __GFP_WAIT |
+		       __GFP_NORETRY | __GFP_NO_KSWAPD;
 	size_t min_alloc = max_t(size_t, mtd->writesize, PAGE_SIZE);
 	void *kbuf;
 
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 76e1aa206f57..d0a79678f169 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -30,9 +30,10 @@ struct vm_area_struct;
 #define ___GFP_HARDWALL		0x20000u
 #define ___GFP_THISNODE		0x40000u
 #define ___GFP_RECLAIMABLE	0x80000u
-#define ___GFP_NOTRACK		0x100000u
-#define ___GFP_OTHER_NODE	0x200000u
-#define ___GFP_WRITE		0x400000u
+#define ___GFP_NOTRACK		0x200000u
+#define ___GFP_NO_KSWAPD	0x400000u
+#define ___GFP_OTHER_NODE	0x800000u
+#define ___GFP_WRITE		0x1000000u
 
 /*
  * GFP bitmasks..
@@ -85,6 +86,7 @@ struct vm_area_struct;
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
 #define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
 
+#define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
 #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)	/* Allocator intends to dirty page */
 
@@ -94,7 +96,7 @@ struct vm_area_struct;
  */
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
 
-#define __GFP_BITS_SHIFT 23	/* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 25	/* Room for N __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /* This equals 0, but use constants in case they ever change */
@@ -114,7 +116,8 @@ struct vm_area_struct;
 				 __GFP_MOVABLE)
 #define GFP_IOFS	(__GFP_IO | __GFP_FS)
 #define GFP_TRANSHUGE	(GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
-			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN)
+			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
+			 __GFP_NO_KSWAPD)
 
 #ifdef CONFIG_NUMA
 #define GFP_THISNODE	(__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h
index 9391706e9254..d6fd8e5b14b7 100644
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -36,6 +36,7 @@
 	{(unsigned long)__GFP_RECLAIMABLE,	"GFP_RECLAIMABLE"},	\
 	{(unsigned long)__GFP_MOVABLE,		"GFP_MOVABLE"},		\
 	{(unsigned long)__GFP_NOTRACK,		"GFP_NOTRACK"},		\
+	{(unsigned long)__GFP_NO_KSWAPD,	"GFP_NO_KSWAPD"},	\
 	{(unsigned long)__GFP_OTHER_NODE,	"GFP_OTHER_NODE"}	\
 	) : "GFP_NOWAIT"
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8193809f3de0..7e208f0ad68c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2416,8 +2416,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 restart:
-	wake_all_kswapd(order, zonelist, high_zoneidx,
-					zone_idx(preferred_zone));
+	if (!(gfp_mask & __GFP_NO_KSWAPD))
+		wake_all_kswapd(order, zonelist, high_zoneidx,
+						zone_idx(preferred_zone));
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -2494,7 +2495,7 @@ rebalance:
 	 * system then fail the allocation instead of entering direct reclaim.
 	 */
 	if ((deferred_compaction || contended_compaction) &&
-	    (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
+						(gfp_mask & __GFP_NO_KSWAPD))
 		goto nopage;
 
 	/* Try direct reclaim and then allocating */
-- 
cgit v1.2.3