From 322cea2f41adb62c975f46a3242f4e3b43226fa1 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 25 Mar 2016 00:30:25 +0100
Subject: bpf: add missing map_flags to bpf_map_show_fdinfo

Add map_flags attribute to bpf_map_show_fdinfo(), so that tools like
tc can check for them when loading objects from a pinned entry, e.g.
if user intent wrt allocation (BPF_F_NO_PREALLOC) is different to the
pinned object, it can bail out. Follow-up to 6c9059817432 ("bpf:
pre-allocate hash map elements"), so that tc can still support this
with v4.6.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2a2efe1bc76c..adc5e4bd74f8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -137,11 +137,13 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   "map_type:\t%u\n"
 		   "key_size:\t%u\n"
 		   "value_size:\t%u\n"
-		   "max_entries:\t%u\n",
+		   "max_entries:\t%u\n"
+		   "map_flags:\t%#x\n",
 		   map->map_type,
 		   map->key_size,
 		   map->value_size,
-		   map->max_entries);
+		   map->max_entries,
+		   map->map_flags);
 }
 #endif
 
-- 
cgit v1.2.3


From 0c93b7d85d40b690f04786ea0f18798b73182e4f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 25 Mar 2016 12:06:51 -0400
Subject: bpf: reject invalid names right in ->lookup()

... and other methods won't see them at all

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/bpf/inode.c | 37 ++++++++-----------------------------
 1 file changed, 8 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index f2ece3c174a5..35d21c189bb0 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -119,18 +119,10 @@ static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
 	return 0;
 }
 
-static bool bpf_dname_reserved(const struct dentry *dentry)
-{
-	return strchr(dentry->d_name.name, '.');
-}
-
 static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
 
-	if (bpf_dname_reserved(dentry))
-		return -EPERM;
-
 	inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -152,9 +144,6 @@ static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
 {
 	struct inode *inode;
 
-	if (bpf_dname_reserved(dentry))
-		return -EPERM;
-
 	inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -187,31 +176,21 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
 	}
 }
 
-static int bpf_link(struct dentry *old_dentry, struct inode *dir,
-		    struct dentry *new_dentry)
+static struct dentry *
+bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
 {
-	if (bpf_dname_reserved(new_dentry))
-		return -EPERM;
-
-	return simple_link(old_dentry, dir, new_dentry);
-}
-
-static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry,
-		      struct inode *new_dir, struct dentry *new_dentry)
-{
-	if (bpf_dname_reserved(new_dentry))
-		return -EPERM;
-
-	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+	if (strchr(dentry->d_name.name, '.'))
+		return ERR_PTR(-EPERM);
+	return simple_lookup(dir, dentry, flags);
 }
 
 static const struct inode_operations bpf_dir_iops = {
-	.lookup		= simple_lookup,
+	.lookup		= bpf_lookup,
 	.mknod		= bpf_mkobj,
 	.mkdir		= bpf_mkdir,
 	.rmdir		= simple_rmdir,
-	.rename		= bpf_rename,
-	.link		= bpf_link,
+	.rename		= simple_rename,
+	.link		= simple_link,
 	.unlink		= simple_unlink,
 };
 
-- 
cgit v1.2.3


From f009a7a767e792d5ab0b46c08d46236ea5271dd9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 24 Mar 2016 15:38:00 +0100
Subject: timers/nohz: Convert tick dependency mask to atomic_t

The tick dependency mask was intially unsigned long because this is the
type on which clear_bit() operates on and fetch_or() accepts it.

But now that we have atomic_fetch_or(), we can instead use
atomic_andnot() to clear the bit. This consolidates the type of our
tick dependency mask, reduce its size on structures and benefit from
possible architecture optimizations on atomic_t operations.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1458830281-4255-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h    |  4 ++--
 kernel/time/tick-sched.c | 61 ++++++++++++++++++++++++------------------------
 kernel/time/tick-sched.h |  2 +-
 3 files changed, 33 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 60bba7e032dc..52c4847b05e2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -720,7 +720,7 @@ struct signal_struct {
 	struct task_cputime cputime_expires;
 
 #ifdef CONFIG_NO_HZ_FULL
-	unsigned long tick_dep_mask;
+	atomic_t tick_dep_mask;
 #endif
 
 	struct list_head cpu_timers[3];
@@ -1549,7 +1549,7 @@ struct task_struct {
 #endif
 
 #ifdef CONFIG_NO_HZ_FULL
-	unsigned long tick_dep_mask;
+	atomic_t tick_dep_mask;
 #endif
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	u64 start_time;		/* monotonic time in nsec */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 084b79f5917e..58e3310c9b21 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -157,52 +157,50 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 cpumask_var_t tick_nohz_full_mask;
 cpumask_var_t housekeeping_mask;
 bool tick_nohz_full_running;
-static unsigned long tick_dep_mask;
+static atomic_t tick_dep_mask;
 
-static void trace_tick_dependency(unsigned long dep)
+static bool check_tick_dependency(atomic_t *dep)
 {
-	if (dep & TICK_DEP_MASK_POSIX_TIMER) {
+	int val = atomic_read(dep);
+
+	if (val & TICK_DEP_MASK_POSIX_TIMER) {
 		trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
-		return;
+		return true;
 	}
 
-	if (dep & TICK_DEP_MASK_PERF_EVENTS) {
+	if (val & TICK_DEP_MASK_PERF_EVENTS) {
 		trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
-		return;
+		return true;
 	}
 
-	if (dep & TICK_DEP_MASK_SCHED) {
+	if (val & TICK_DEP_MASK_SCHED) {
 		trace_tick_stop(0, TICK_DEP_MASK_SCHED);
-		return;
+		return true;
 	}
 
-	if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE)
+	if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) {
 		trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
+		return true;
+	}
+
+	return false;
 }
 
 static bool can_stop_full_tick(struct tick_sched *ts)
 {
 	WARN_ON_ONCE(!irqs_disabled());
 
-	if (tick_dep_mask) {
-		trace_tick_dependency(tick_dep_mask);
+	if (check_tick_dependency(&tick_dep_mask))
 		return false;
-	}
 
-	if (ts->tick_dep_mask) {
-		trace_tick_dependency(ts->tick_dep_mask);
+	if (check_tick_dependency(&ts->tick_dep_mask))
 		return false;
-	}
 
-	if (current->tick_dep_mask) {
-		trace_tick_dependency(current->tick_dep_mask);
+	if (check_tick_dependency(&current->tick_dep_mask))
 		return false;
-	}
 
-	if (current->signal->tick_dep_mask) {
-		trace_tick_dependency(current->signal->tick_dep_mask);
+	if (check_tick_dependency(&current->signal->tick_dep_mask))
 		return false;
-	}
 
 	return true;
 }
@@ -259,12 +257,12 @@ static void tick_nohz_full_kick_all(void)
 	preempt_enable();
 }
 
-static void tick_nohz_dep_set_all(unsigned long *dep,
+static void tick_nohz_dep_set_all(atomic_t *dep,
 				  enum tick_dep_bits bit)
 {
-	unsigned long prev;
+	int prev;
 
-	prev = fetch_or(dep, BIT_MASK(bit));
+	prev = atomic_fetch_or(dep, BIT(bit));
 	if (!prev)
 		tick_nohz_full_kick_all();
 }
@@ -280,7 +278,7 @@ void tick_nohz_dep_set(enum tick_dep_bits bit)
 
 void tick_nohz_dep_clear(enum tick_dep_bits bit)
 {
-	clear_bit(bit, &tick_dep_mask);
+	atomic_andnot(BIT(bit), &tick_dep_mask);
 }
 
 /*
@@ -289,12 +287,12 @@ void tick_nohz_dep_clear(enum tick_dep_bits bit)
  */
 void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
 {
-	unsigned long prev;
+	int prev;
 	struct tick_sched *ts;
 
 	ts = per_cpu_ptr(&tick_cpu_sched, cpu);
 
-	prev = fetch_or(&ts->tick_dep_mask, BIT_MASK(bit));
+	prev = atomic_fetch_or(&ts->tick_dep_mask, BIT(bit));
 	if (!prev) {
 		preempt_disable();
 		/* Perf needs local kick that is NMI safe */
@@ -313,7 +311,7 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
 {
 	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
 
-	clear_bit(bit, &ts->tick_dep_mask);
+	atomic_andnot(BIT(bit), &ts->tick_dep_mask);
 }
 
 /*
@@ -331,7 +329,7 @@ void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
 
 void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
 {
-	clear_bit(bit, &tsk->tick_dep_mask);
+	atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
 }
 
 /*
@@ -345,7 +343,7 @@ void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
 
 void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
 {
-	clear_bit(bit, &sig->tick_dep_mask);
+	atomic_andnot(BIT(bit), &sig->tick_dep_mask);
 }
 
 /*
@@ -366,7 +364,8 @@ void __tick_nohz_task_switch(void)
 	ts = this_cpu_ptr(&tick_cpu_sched);
 
 	if (ts->tick_stopped) {
-		if (current->tick_dep_mask || current->signal->tick_dep_mask)
+		if (atomic_read(&current->tick_dep_mask) ||
+		    atomic_read(&current->signal->tick_dep_mask))
 			tick_nohz_full_kick();
 	}
 out:
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index eb4e32566a83..bf38226e5c17 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -60,7 +60,7 @@ struct tick_sched {
 	u64				next_timer;
 	ktime_t				idle_expires;
 	int				do_timer_last;
-	unsigned long			tick_dep_mask;
+	atomic_t			tick_dep_mask;
 };
 
 extern struct tick_sched *tick_get_tick_sched(int cpu);
-- 
cgit v1.2.3


From 5529578a27288d11d4d15635c258c6dde0f0fb10 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 24 Mar 2016 15:38:01 +0100
Subject: locking/atomic, sched: Unexport fetch_or()

This patch functionally reverts:

  5fd7a09cfb8c ("atomic: Export fetch_or()")

During the merge Linus observed that the generic version of fetch_or()
was messy:

  " This makes the ugly "fetch_or()" macro that the scheduler used
    internally a new generic helper, and does a bad job at it. "

  e23604edac2a Merge branch 'timers-nohz-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Now that we have introduced atomic_fetch_or(), fetch_or() is only used
by the scheduler in order to deal with thread_info flags which type
can vary across architectures.

Lets confine fetch_or() back to the scheduler so that we encourage
future users to use the more robust and well typed atomic_t version
instead.

While at it, fetch_or() gets robustified, pasting improvements from a
previous patch by Ingo Molnar that avoids needless expression
re-evaluations in the loop.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1458830281-4255-4-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/atomic.h | 21 ---------------------
 kernel/sched/core.c    | 18 ++++++++++++++++++
 2 files changed, 18 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 3d64c0852164..506c3531832e 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -579,27 +579,6 @@ static inline int atomic_fetch_or(atomic_t *p, int mask)
 }
 #endif
 
-/**
- * fetch_or - perform *ptr |= mask and return old value of *ptr
- * @ptr: pointer to value
- * @mask: mask to OR on the value
- *
- * cmpxchg based fetch_or, macro so it works for different integer types
- */
-#ifndef fetch_or
-#define fetch_or(ptr, mask)						\
-({	typeof(*(ptr)) __old, __val = *(ptr);				\
-	for (;;) {							\
-		__old = cmpxchg((ptr), __val, __val | (mask));		\
-		if (__old == __val)					\
-			break;						\
-		__val = __old;						\
-	}								\
-	__old;								\
-})
-#endif
-
-
 #ifdef CONFIG_GENERIC_ATOMIC64
 #include <asm-generic/atomic64.h>
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8465eeab8b3..8b489fcac37b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -321,6 +321,24 @@ static inline void init_hrtick(void)
 }
 #endif	/* CONFIG_SCHED_HRTICK */
 
+/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, mask)						\
+	({								\
+		typeof(ptr) _ptr = (ptr);				\
+		typeof(mask) _mask = (mask);				\
+		typeof(*_ptr) _old, _val = *_ptr;			\
+									\
+		for (;;) {						\
+			_old = cmpxchg(_ptr, _val, _val | _mask);	\
+			if (_old == _val)				\
+				break;					\
+			_val = _old;					\
+		}							\
+	_old;								\
+})
+
 #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
 /*
  * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
-- 
cgit v1.2.3


From 8fdc65391c6ad16461526a685f03262b3b01bfde Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 29 Mar 2016 09:26:44 +0200
Subject: perf/core: Fix time tracking bug with multiplexing

Stephane reported that commit:

  3cbaa5906967 ("perf: Fix ctx time tracking by introducing EVENT_TIME")

introduced a regression wrt. time tracking, as easily observed by:

> This patch introduce a bug in the time tracking of events when
> multiplexing is used.
>
> The issue is easily reproducible with the following perf run:
>
>  $ perf stat -a -C 0 -e branches,branches,branches,branches,branches,branches -I 1000
>      1.000730239            652,394      branches   (66.41%)
>      1.000730239            597,809      branches   (66.41%)
>      1.000730239            593,870      branches   (66.63%)
>      1.000730239            651,440      branches   (67.03%)
>      1.000730239            656,725      branches   (66.96%)
>      1.000730239      <not counted>      branches
>
> One branches event is shown as not having run. Yet, with
> multiplexing, all events should run especially with a 1s (-I 1000)
> interval. The delta for time_running comes out to 0. Yet, the event
> has run because the kernel is actually multiplexing the events. The
> problem is that the time tracking is the kernel and especially in
> ctx_sched_out() is wrong now.
>
> The problem is that in case that the kernel enters ctx_sched_out() with the
> following state:
>    ctx->is_active=0x7 event_type=0x1
>    Call Trace:
>     [<ffffffff813ddd41>] dump_stack+0x63/0x82
>     [<ffffffff81182bdc>] ctx_sched_out+0x2bc/0x2d0
>     [<ffffffff81183896>] perf_mux_hrtimer_handler+0xf6/0x2c0
>     [<ffffffff811837a0>] ? __perf_install_in_context+0x130/0x130
>     [<ffffffff810f5818>] __hrtimer_run_queues+0xf8/0x2f0
>     [<ffffffff810f6097>] hrtimer_interrupt+0xb7/0x1d0
>     [<ffffffff810509a8>] local_apic_timer_interrupt+0x38/0x60
>     [<ffffffff8175ca9d>] smp_apic_timer_interrupt+0x3d/0x50
>     [<ffffffff8175ac7c>] apic_timer_interrupt+0x8c/0xa0
>
> In that case, the test:
>       if (is_active & EVENT_TIME)
>
> will be false and the time will not be updated. Time must always be updated on
> sched out.

Fix this by always updating time if EVENT_TIME was set, as opposed to
only updating time when EVENT_TIME changed.

Reported-by: Stephane Eranian <eranian@google.com>
Tested-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: kan.liang@intel.com
Cc: namhyung@kernel.org
Fixes: 3cbaa5906967 ("perf: Fix ctx time tracking by introducing EVENT_TIME")
Link: http://lkml.kernel.org/r/20160329072644.GB3408@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index de24fbce5277..8c11388e92a5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2417,14 +2417,24 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 			cpuctx->task_ctx = NULL;
 	}
 
-	is_active ^= ctx->is_active; /* changed bits */
-
+	/*
+	 * Always update time if it was set; not only when it changes.
+	 * Otherwise we can 'forget' to update time for any but the last
+	 * context we sched out. For example:
+	 *
+	 *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
+	 *   ctx_sched_out(.event_type = EVENT_PINNED)
+	 *
+	 * would only update time for the pinned events.
+	 */
 	if (is_active & EVENT_TIME) {
 		/* update (and stop) ctx time */
 		update_context_time(ctx);
 		update_cgrp_time_from_cpuctx(cpuctx);
 	}
 
+	is_active ^= ctx->is_active; /* changed bits */
+
 	if (!ctx->nr_active || !(is_active & EVENT_ALL))
 		return;
 
-- 
cgit v1.2.3


From 201c2f85bd0bc13b712d9c0b3d11251b182e06ae Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Mon, 21 Mar 2016 10:02:42 +0200
Subject: perf/core: Don't leak event in the syscall error path

In the error path, event_file not being NULL is used to determine
whether the event itself still needs to be free'd, so fix it up to
avoid leaking.

Reported-by: Leon Yu <chianglungyu@gmail.com>
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Fixes: 130056275ade ("perf: Do not double free")
Link: http://lkml.kernel.org/r/87twk06yxp.fsf@ashishki-desk.ger.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8c11388e92a5..52bedc5a5aaa 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8542,6 +8542,7 @@ SYSCALL_DEFINE5(perf_event_open,
 					f_flags);
 	if (IS_ERR(event_file)) {
 		err = PTR_ERR(event_file);
+		event_file = NULL;
 		goto err_context;
 	}
 
-- 
cgit v1.2.3


From 63cc787e71ae12976b2116f09677d1f2805df83e Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@ingics.com>
Date: Thu, 17 Mar 2016 12:00:31 +0800
Subject: irqdomain: Export irq_domain_free_irqs_common

Export irq_domain_free_irqs_common so it can be used by modules.

Signed-off-by: Axel Lin <axel.lin@ingics.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 kernel/irq/irqdomain.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3a519a01118b..245a485ffb61 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1099,6 +1099,7 @@ void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq,
 	}
 	irq_domain_free_irqs_parent(domain, virq, nr_irqs);
 }
+EXPORT_SYMBOL_GPL(irq_domain_free_irqs_common);
 
 /**
  * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent
-- 
cgit v1.2.3


From 26657848502b78474a5f17f9ce2ae6dc8d8d6262 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 22 Mar 2016 22:09:18 +0100
Subject: perf/core: Verify we have a single perf_hw_context PMU

There should (and can) only be a single PMU for perf_hw_context
events.

This is because of how we schedule events: once a hardware event fails to
schedule (the PMU is 'full') we stop trying to add more. The trivial
'fix' would break the Round-Robin scheduling we do.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 52bedc5a5aaa..525d11c59287 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7693,6 +7693,15 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 	}
 
 skip_type:
+	if (pmu->task_ctx_nr == perf_hw_context) {
+		static int hw_context_taken = 0;
+
+		if (WARN_ON_ONCE(hw_context_taken))
+			pmu->task_ctx_nr = perf_invalid_context;
+
+		hw_context_taken = 1;
+	}
+
 	pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
 	if (pmu->pmu_cpu_context)
 		goto got_cpu_context;
-- 
cgit v1.2.3


From dcb10a967ce82d5ad20570693091139ae716ff76 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Fri, 4 Mar 2016 15:42:45 +0200
Subject: perf/ring_buffer: Refuse to begin AUX transaction after
 rb->aux_mmap_count drops

When ring buffer's AUX area is unmapped and rb->aux_mmap_count drops to
zero, new AUX transactions into this buffer can still be started,
even though the buffer in en route to deallocation.

This patch adds a check to perf_aux_output_begin() for rb->aux_mmap_count
being zero, in which case there is no point starting new transactions,
in other words, the ring buffers that pass a certain point in
perf_mmap_close will not have their events sending new data, which
clears path for freeing those buffers' pages right there and then,
provided that no active transactions are holding the AUX reference.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/1457098969-21595-2-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/ring_buffer.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index c61f0cbd308b..89abf623e93c 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -287,6 +287,13 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
 	if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
 		goto err;
 
+	/*
+	 * If rb::aux_mmap_count is zero (and rb_has_aux() above went through),
+	 * the aux buffer is in perf_mmap_close(), about to get freed.
+	 */
+	if (!atomic_read(&rb->aux_mmap_count))
+		goto err;
+
 	/*
 	 * Nesting is not supported for AUX area, make sure nested
 	 * writers are caught early
-- 
cgit v1.2.3


From 95ff4ca26c492fc1ed7751f5dd7ab7674b54f4e0 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 2 Dec 2015 18:41:11 +0200
Subject: perf/core: Free AUX pages in unmap path

Now that we can ensure that when ring buffer's AUX area is on the way
to getting unmapped new transactions won't start, we only need to stop
all events that can potentially be writing aux data to our ring buffer.

Having done that, we can safely free the AUX pages and corresponding
PMU data, as this time it is guaranteed to be the last aux reference
holder.

This partially reverts:

  57ffc5ca679 ("perf: Fix AUX buffer refcounting")

... which was made to defer deallocation that was otherwise possible
from an NMI context. Now it is no longer the case; the last call to
rb_free_aux() that drops the last AUX reference has to happen in
perf_mmap_close() on that AUX area.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/87d1qtz23d.fsf@ashishki-desk.ger.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c        | 120 +++++++++++++++++++++++++++++++++++++++++++-
 kernel/events/internal.h    |   1 -
 kernel/events/ring_buffer.c |  37 ++++----------
 3 files changed, 129 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 525d11c59287..243df4b62870 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1925,8 +1925,13 @@ event_sched_in(struct perf_event *event,
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
-	event->state = PERF_EVENT_STATE_ACTIVE;
-	event->oncpu = smp_processor_id();
+	WRITE_ONCE(event->oncpu, smp_processor_id());
+	/*
+	 * Order event::oncpu write to happen before the ACTIVE state
+	 * is visible.
+	 */
+	smp_wmb();
+	WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
 
 	/*
 	 * Unthrottle events, since we scheduled we might have missed several
@@ -2358,6 +2363,29 @@ void perf_event_enable(struct perf_event *event)
 }
 EXPORT_SYMBOL_GPL(perf_event_enable);
 
+static int __perf_event_stop(void *info)
+{
+	struct perf_event *event = info;
+
+	/* for AUX events, our job is done if the event is already inactive */
+	if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+		return 0;
+
+	/* matches smp_wmb() in event_sched_in() */
+	smp_rmb();
+
+	/*
+	 * There is a window with interrupts enabled before we get here,
+	 * so we need to check again lest we try to stop another CPU's event.
+	 */
+	if (READ_ONCE(event->oncpu) != smp_processor_id())
+		return -EAGAIN;
+
+	event->pmu->stop(event, PERF_EF_UPDATE);
+
+	return 0;
+}
+
 static int _perf_event_refresh(struct perf_event *event, int refresh)
 {
 	/*
@@ -4667,6 +4695,8 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 		event->pmu->event_mapped(event);
 }
 
+static void perf_pmu_output_stop(struct perf_event *event);
+
 /*
  * A buffer can be mmap()ed multiple times; either directly through the same
  * event, or through other events by use of perf_event_set_output().
@@ -4694,10 +4724,22 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	 */
 	if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
 	    atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+		/*
+		 * Stop all AUX events that are writing to this buffer,
+		 * so that we can free its AUX pages and corresponding PMU
+		 * data. Note that after rb::aux_mmap_count dropped to zero,
+		 * they won't start any more (see perf_aux_output_begin()).
+		 */
+		perf_pmu_output_stop(event);
+
+		/* now it's safe to free the pages */
 		atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
 		vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
 
+		/* this has to be the last one */
 		rb_free_aux(rb);
+		WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
+
 		mutex_unlock(&event->mmap_mutex);
 	}
 
@@ -5768,6 +5810,80 @@ next:
 	rcu_read_unlock();
 }
 
+struct remote_output {
+	struct ring_buffer	*rb;
+	int			err;
+};
+
+static void __perf_event_output_stop(struct perf_event *event, void *data)
+{
+	struct perf_event *parent = event->parent;
+	struct remote_output *ro = data;
+	struct ring_buffer *rb = ro->rb;
+
+	if (!has_aux(event))
+		return;
+
+	if (!parent)
+		parent = event;
+
+	/*
+	 * In case of inheritance, it will be the parent that links to the
+	 * ring-buffer, but it will be the child that's actually using it:
+	 */
+	if (rcu_dereference(parent->rb) == rb)
+		ro->err = __perf_event_stop(event);
+}
+
+static int __perf_pmu_output_stop(void *info)
+{
+	struct perf_event *event = info;
+	struct pmu *pmu = event->pmu;
+	struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+	struct remote_output ro = {
+		.rb	= event->rb,
+	};
+
+	rcu_read_lock();
+	perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro);
+	if (cpuctx->task_ctx)
+		perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+				   &ro);
+	rcu_read_unlock();
+
+	return ro.err;
+}
+
+static void perf_pmu_output_stop(struct perf_event *event)
+{
+	struct perf_event *iter;
+	int err, cpu;
+
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
+		/*
+		 * For per-CPU events, we need to make sure that neither they
+		 * nor their children are running; for cpu==-1 events it's
+		 * sufficient to stop the event itself if it's active, since
+		 * it can't have children.
+		 */
+		cpu = iter->cpu;
+		if (cpu == -1)
+			cpu = READ_ONCE(iter->oncpu);
+
+		if (cpu == -1)
+			continue;
+
+		err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
+		if (err == -EAGAIN) {
+			rcu_read_unlock();
+			goto restart;
+		}
+	}
+	rcu_read_unlock();
+}
+
 /*
  * task tracking -- fork/exit
  *
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 2bbad9c1274c..2b229fdcfc09 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -11,7 +11,6 @@
 struct ring_buffer {
 	atomic_t			refcount;
 	struct rcu_head			rcu_head;
-	struct irq_work			irq_work;
 #ifdef CONFIG_PERF_USE_VMALLOC
 	struct work_struct		work;
 	int				page_order;	/* allocation order  */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 89abf623e93c..367e9c56ec0b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -221,8 +221,6 @@ void perf_output_end(struct perf_output_handle *handle)
 	rcu_read_unlock();
 }
 
-static void rb_irq_work(struct irq_work *work);
-
 static void
 ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
 {
@@ -243,16 +241,6 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
 
 	INIT_LIST_HEAD(&rb->event_list);
 	spin_lock_init(&rb->event_lock);
-	init_irq_work(&rb->irq_work, rb_irq_work);
-}
-
-static void ring_buffer_put_async(struct ring_buffer *rb)
-{
-	if (!atomic_dec_and_test(&rb->refcount))
-		return;
-
-	rb->rcu_head.next = (void *)rb;
-	irq_work_queue(&rb->irq_work);
 }
 
 /*
@@ -292,7 +280,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
 	 * the aux buffer is in perf_mmap_close(), about to get freed.
 	 */
 	if (!atomic_read(&rb->aux_mmap_count))
-		goto err;
+		goto err_put;
 
 	/*
 	 * Nesting is not supported for AUX area, make sure nested
@@ -338,7 +326,7 @@ err_put:
 	rb_free_aux(rb);
 
 err:
-	ring_buffer_put_async(rb);
+	ring_buffer_put(rb);
 	handle->event = NULL;
 
 	return NULL;
@@ -389,7 +377,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
 
 	local_set(&rb->aux_nest, 0);
 	rb_free_aux(rb);
-	ring_buffer_put_async(rb);
+	ring_buffer_put(rb);
 }
 
 /*
@@ -470,6 +458,14 @@ static void __rb_free_aux(struct ring_buffer *rb)
 {
 	int pg;
 
+	/*
+	 * Should never happen, the last reference should be dropped from
+	 * perf_mmap_close() path, which first stops aux transactions (which
+	 * in turn are the atomic holders of aux_refcount) and then does the
+	 * last rb_free_aux().
+	 */
+	WARN_ON_ONCE(in_atomic());
+
 	if (rb->aux_priv) {
 		rb->free_aux(rb->aux_priv);
 		rb->free_aux = NULL;
@@ -581,18 +577,7 @@ out:
 void rb_free_aux(struct ring_buffer *rb)
 {
 	if (atomic_dec_and_test(&rb->aux_refcount))
-		irq_work_queue(&rb->irq_work);
-}
-
-static void rb_irq_work(struct irq_work *work)
-{
-	struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work);
-
-	if (!atomic_read(&rb->aux_refcount))
 		__rb_free_aux(rb);
-
-	if (rb->rcu_head.next == (void *)rb)
-		call_rcu(&rb->rcu_head, rb_free_rcu);
 }
 
 #ifndef CONFIG_PERF_USE_VMALLOC
-- 
cgit v1.2.3


From af5bb4ed1254a378b6028c09e58bdcc1cd9bf5b3 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Fri, 4 Mar 2016 15:42:47 +0200
Subject: perf/ring_buffer: Document AUX API usage

In order to ensure safe AUX buffer management, we rely on the assumption
that pmu::stop() stops its ongoing AUX transaction and not just the hw.

This patch documents this requirement for the perf_aux_output_{begin,end}()
APIs.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/1457098969-21595-4-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/ring_buffer.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 367e9c56ec0b..0ed4555309bd 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -252,6 +252,10 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
  * The ordering is similar to that of perf_output_{begin,end}, with
  * the exception of (B), which should be taken care of by the pmu
  * driver, since ordering rules will differ depending on hardware.
+ *
+ * Call this from pmu::start(); see the comment in perf_aux_output_end()
+ * about its use in pmu callbacks. Both can also be called from the PMI
+ * handler if needed.
  */
 void *perf_aux_output_begin(struct perf_output_handle *handle,
 			    struct perf_event *event)
@@ -323,6 +327,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
 	return handle->rb->aux_priv;
 
 err_put:
+	/* can't be last */
 	rb_free_aux(rb);
 
 err:
@@ -337,6 +342,10 @@ err:
  * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
  * pmu driver's responsibility to observe ordering rules of the hardware,
  * so that all the data is externally visible before this is called.
+ *
+ * Note: this has to be called from pmu::stop() callback, as the assumption
+ * of the AUX buffer management code is that after pmu::stop(), the AUX
+ * transaction must be stopped and therefore drop the AUX reference count.
  */
 void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
 			 bool truncated)
@@ -376,6 +385,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
 	handle->event = NULL;
 
 	local_set(&rb->aux_nest, 0);
+	/* can't be last */
 	rb_free_aux(rb);
 	ring_buffer_put(rb);
 }
-- 
cgit v1.2.3


From 0a74c5b3d20d2a8693848b6ae4f1a97624f5b781 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 16 Mar 2016 15:34:29 +0100
Subject: ftrace/perf: Check sample types only for sampling events

Currently we check sample type for ftrace:function events
even if it's not created as a sampling event. That prevents
creating ftrace_function event in counting mode.

Make sure we check sample types only for sampling events.

Before:
  $ sudo perf stat -e ftrace:function ls
  ...

   Performance counter stats for 'ls':

     <not supported>      ftrace:function

         0.001983662 seconds time elapsed

After:
  $ sudo perf stat -e ftrace:function ls
  ...

   Performance counter stats for 'ls':

              44,498      ftrace:function

         0.037534722 seconds time elapsed

Suggested-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1458138873-1553-2-git-send-email-jolsa@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/trace/trace_event_perf.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 00df25fd86ef..e11108f1d197 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -47,6 +47,9 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
 		if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
+		if (!is_sampling_event(p_event))
+			return 0;
+
 		/*
 		 * We don't allow user space callchains for  function trace
 		 * event, due to issues with page faults while tracing page
-- 
cgit v1.2.3


From 86e7972f690c1017fd086cdfe53d8524e68c661c Mon Sep 17 00:00:00 2001
From: Wang Nan <wangnan0@huawei.com>
Date: Mon, 28 Mar 2016 06:41:29 +0000
Subject: perf/ring_buffer: Introduce new ioctl options to pause and resume the
 ring-buffer

Add new ioctl() to pause/resume ring-buffer output.

In some situations we want to read from the ring-buffer only when we
ensure nothing can write to the ring-buffer during reading. Without
this patch we have to turn off all events attached to this ring-buffer
to achieve this.

This patch is a prerequisite to enable overwrite support for the
perf ring-buffer support. Following commits will introduce new methods
support reading from overwrite ring buffer. Before reading, caller
must ensure the ring buffer is frozen, or the reading is unreliable.

Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <pi3orama@163.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/1459147292-239310-2-git-send-email-wangnan0@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/perf_event.h |  1 +
 kernel/events/core.c            | 13 +++++++++++++
 kernel/events/internal.h        |  9 +++++++++
 kernel/events/ring_buffer.c     | 12 +++++++++++-
 4 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1afe9623c1a7..a3c19034d5f8 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -401,6 +401,7 @@ struct perf_event_attr {
 #define PERF_EVENT_IOC_SET_FILTER	_IOW('$', 6, char *)
 #define PERF_EVENT_IOC_ID		_IOR('$', 7, __u64 *)
 #define PERF_EVENT_IOC_SET_BPF		_IOW('$', 8, __u32)
+#define PERF_EVENT_IOC_PAUSE_OUTPUT	_IOW('$', 9, __u32)
 
 enum perf_event_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 243df4b62870..51386e84293e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4379,6 +4379,19 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
 	case PERF_EVENT_IOC_SET_BPF:
 		return perf_event_set_bpf_prog(event, arg);
 
+	case PERF_EVENT_IOC_PAUSE_OUTPUT: {
+		struct ring_buffer *rb;
+
+		rcu_read_lock();
+		rb = rcu_dereference(event->rb);
+		if (!rb || !rb->nr_pages) {
+			rcu_read_unlock();
+			return -EINVAL;
+		}
+		rb_toggle_paused(rb, !!arg);
+		rcu_read_unlock();
+		return 0;
+	}
 	default:
 		return -ENOTTY;
 	}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 2b229fdcfc09..2d67327d9ad9 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -17,6 +17,7 @@ struct ring_buffer {
 #endif
 	int				nr_pages;	/* nr of data pages  */
 	int				overwrite;	/* can overwrite itself */
+	int				paused;		/* can write into ring buffer */
 
 	atomic_t			poll;		/* POLL_ for wakeups */
 
@@ -64,6 +65,14 @@ static inline void rb_free_rcu(struct rcu_head *rcu_head)
 	rb_free(rb);
 }
 
+static inline void rb_toggle_paused(struct ring_buffer *rb, bool pause)
+{
+	if (!pause && rb->nr_pages)
+		rb->paused = 0;
+	else
+		rb->paused = 1;
+}
+
 extern struct ring_buffer *
 rb_alloc(int nr_pages, long watermark, int cpu, int flags);
 extern void perf_event_wakeup(struct perf_event *event);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 0ed4555309bd..72d8127bb8fd 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -125,8 +125,11 @@ int perf_output_begin(struct perf_output_handle *handle,
 	if (unlikely(!rb))
 		goto out;
 
-	if (unlikely(!rb->nr_pages))
+	if (unlikely(rb->paused)) {
+		if (rb->nr_pages)
+			local_inc(&rb->lost);
 		goto out;
+	}
 
 	handle->rb    = rb;
 	handle->event = event;
@@ -241,6 +244,13 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
 
 	INIT_LIST_HEAD(&rb->event_list);
 	spin_lock_init(&rb->event_lock);
+
+	/*
+	 * perf_output_begin() only checks rb->paused, therefore
+	 * rb->paused must be true if we have no pages for output.
+	 */
+	if (!rb->nr_pages)
+		rb->paused = 1;
 }
 
 /*
-- 
cgit v1.2.3


From 1879445dfa7bbd6fe21b09c5cc72f4934798afed Mon Sep 17 00:00:00 2001
From: Wang Nan <wangnan0@huawei.com>
Date: Mon, 28 Mar 2016 06:41:30 +0000
Subject: perf/core: Set event's default ::overflow_handler()

Set a default event->overflow_handler in perf_event_alloc() so don't
need to check event->overflow_handler in __perf_event_overflow().
Following commits can give a different default overflow_handler.

Initial idea comes from Peter:

  http://lkml.kernel.org/r/20130708121557.GA17211@twins.programming.kicks-ass.net

Since the default value of event->overflow_handler is not NULL, existing
'if (!overflow_handler)' checks need to be changed.

is_default_overflow_handler() is introduced for this.

No extra performance overhead is introduced into the hot path because in the
original code we still need to read this handler from memory. A conditional
branch is avoided so actually we remove some instructions.

Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <pi3orama@163.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/1459147292-239310-3-git-send-email-wangnan0@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/kernel/hw_breakpoint.c   |  4 ++--
 arch/arm64/kernel/hw_breakpoint.c |  4 ++--
 include/linux/perf_event.h        |  6 ++++++
 kernel/events/core.c              | 14 ++++++++------
 4 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index 6284779d64ee..b8df45883cf7 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -631,7 +631,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
 	info->address &= ~alignment_mask;
 	info->ctrl.len <<= offset;
 
-	if (!bp->overflow_handler) {
+	if (is_default_overflow_handler(bp)) {
 		/*
 		 * Mismatch breakpoints are required for single-stepping
 		 * breakpoints.
@@ -754,7 +754,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
 		 * mismatch breakpoint so we can single-step over the
 		 * watchpoint trigger.
 		 */
-		if (!wp->overflow_handler)
+		if (is_default_overflow_handler(wp))
 			enable_single_step(wp, instruction_pointer(regs));
 
 unlock:
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index b45c95d34b83..4ef5373f9a76 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -616,7 +616,7 @@ static int breakpoint_handler(unsigned long unused, unsigned int esr,
 		perf_bp_event(bp, regs);
 
 		/* Do we need to handle the stepping? */
-		if (!bp->overflow_handler)
+		if (is_default_overflow_handler(bp))
 			step = 1;
 unlock:
 		rcu_read_unlock();
@@ -712,7 +712,7 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr,
 		perf_bp_event(wp, regs);
 
 		/* Do we need to handle the stepping? */
-		if (!wp->overflow_handler)
+		if (is_default_overflow_handler(wp))
 			step = 1;
 
 unlock:
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 15588d4c581d..4065ca2d7149 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -838,6 +838,12 @@ extern void perf_event_output(struct perf_event *event,
 				struct perf_sample_data *data,
 				struct pt_regs *regs);
 
+static inline bool
+is_default_overflow_handler(struct perf_event *event)
+{
+	return (event->overflow_handler == perf_event_output);
+}
+
 extern void
 perf_event_header__init_id(struct perf_event_header *header,
 			   struct perf_sample_data *data,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 51386e84293e..8c3b35f2a269 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6628,10 +6628,7 @@ static int __perf_event_overflow(struct perf_event *event,
 		irq_work_queue(&event->pending);
 	}
 
-	if (event->overflow_handler)
-		event->overflow_handler(event, data, regs);
-	else
-		perf_event_output(event, data, regs);
+	event->overflow_handler(event, data, regs);
 
 	if (*perf_event_fasync(event) && event->pending_kill) {
 		event->pending_wakeup = 1;
@@ -8152,8 +8149,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		context = parent_event->overflow_handler_context;
 	}
 
-	event->overflow_handler	= overflow_handler;
-	event->overflow_handler_context = context;
+	if (overflow_handler) {
+		event->overflow_handler	= overflow_handler;
+		event->overflow_handler_context = context;
+	} else {
+		event->overflow_handler = perf_event_output;
+		event->overflow_handler_context = NULL;
+	}
 
 	perf_event__state_init(event);
 
-- 
cgit v1.2.3


From d1b26c70246bc72922ae61d9f972d5c2588409e7 Mon Sep 17 00:00:00 2001
From: Wang Nan <wangnan0@huawei.com>
Date: Mon, 28 Mar 2016 06:41:31 +0000
Subject: perf/ring_buffer: Prepare writing into the ring-buffer from the end

Convert perf_output_begin() to __perf_output_begin() and make the later
function able to write records from the end of the ring-buffer.

Following commits will utilize the 'backward' flag.

This is the core patch to support writing to the ring-buffer backwards,
which will be introduced by upcoming patches to support reading from
overwritable ring-buffers.

In theory, this patch should not introduce any extra performance
overhead since we use always_inline, but it does not hurt to double
check that assumption:

When CONFIG_OPTIMIZE_INLINING is disabled, the output object is nearly
identical to original one. See:

   http://lkml.kernel.org/g/56F52E83.70409@huawei.com

When CONFIG_OPTIMIZE_INLINING is enabled, the resuling object file becomes
smaller:

 $ size kernel/events/ring_buffer.o*
   text       data        bss        dec        hex    filename
   4641          4          8       4653       122d kernel/events/ring_buffer.o.old
   4545          4          8       4557       11cd kernel/events/ring_buffer.o.new

Performance testing results:

Calling 3000000 times of 'close(-1)', use gettimeofday() to check
duration.  Use 'perf record -o /dev/null -e raw_syscalls:*' to capture
system calls. In ns.

Testing environment:

 CPU    : Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
 Kernel : v4.5.0

                     MEAN         STDVAR
  BASE            800214.950    2853.083
  PRE            2253846.700    9997.014
  POST           2257495.540    8516.293

Where 'BASE' is pure performance without capturing. 'PRE' is test
result of pure 'v4.5.0' kernel. 'POST' is test result after this
patch.

Considering the stdvar, this patch doesn't hurt performance, within
noise margin.

For testing details, see:

  http://lkml.kernel.org/g/56F89DCD.1040202@huawei.com

Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <pi3orama@163.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/1459147292-239310-4-git-send-email-wangnan0@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/ring_buffer.c | 42 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 72d8127bb8fd..60be55a64040 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -102,8 +102,21 @@ out:
 	preempt_enable();
 }
 
-int perf_output_begin(struct perf_output_handle *handle,
-		      struct perf_event *event, unsigned int size)
+static bool __always_inline
+ring_buffer_has_space(unsigned long head, unsigned long tail,
+		      unsigned long data_size, unsigned int size,
+		      bool backward)
+{
+	if (!backward)
+		return CIRC_SPACE(head, tail, data_size) >= size;
+	else
+		return CIRC_SPACE(tail, head, data_size) >= size;
+}
+
+static int __always_inline
+__perf_output_begin(struct perf_output_handle *handle,
+		    struct perf_event *event, unsigned int size,
+		    bool backward)
 {
 	struct ring_buffer *rb;
 	unsigned long tail, offset, head;
@@ -146,9 +159,12 @@ int perf_output_begin(struct perf_output_handle *handle,
 	do {
 		tail = READ_ONCE(rb->user_page->data_tail);
 		offset = head = local_read(&rb->head);
-		if (!rb->overwrite &&
-		    unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
-			goto fail;
+		if (!rb->overwrite) {
+			if (unlikely(!ring_buffer_has_space(head, tail,
+							    perf_data_size(rb),
+							    size, backward)))
+				goto fail;
+		}
 
 		/*
 		 * The above forms a control dependency barrier separating the
@@ -162,9 +178,17 @@ int perf_output_begin(struct perf_output_handle *handle,
 		 * See perf_output_put_handle().
 		 */
 
-		head += size;
+		if (!backward)
+			head += size;
+		else
+			head -= size;
 	} while (local_cmpxchg(&rb->head, offset, head) != offset);
 
+	if (backward) {
+		offset = head;
+		head = (u64)(-head);
+	}
+
 	/*
 	 * We rely on the implied barrier() by local_cmpxchg() to ensure
 	 * none of the data stores below can be lifted up by the compiler.
@@ -206,6 +230,12 @@ out:
 	return -ENOSPC;
 }
 
+int perf_output_begin(struct perf_output_handle *handle,
+		      struct perf_event *event, unsigned int size)
+{
+	return __perf_output_begin(handle, event, size, false);
+}
+
 unsigned int perf_output_copy(struct perf_output_handle *handle,
 		      const void *buf, unsigned int len)
 {
-- 
cgit v1.2.3


From 5ca3726af7f66a8cc71ce4414cfeb86deb784491 Mon Sep 17 00:00:00 2001
From: Zhao Lei <zhaolei@cn.fujitsu.com>
Date: Tue, 22 Mar 2016 16:37:07 +0800
Subject: sched/cpuacct: Show all possible CPUs in cpuacct output

Current code show stats of online CPUs in cpuacct.statcpus,
show stats of present cpus in cpuacct.usage(_percpu), and using
present CPUs for setting cpuacct.usage.

It will cause inconsistent result when a CPU is online or offline
or hotpluged.

We should always use possible CPUs to avoid above problem.

Here are the contents of a cpuacct.usage_percpu sysfs file,
on a 4 CPU system with maxcpus=32:

Before the patch:
 # cat cpuacct.usage_percpu
 2456565 411435 1052897 832584

After the patch:
 # cat cpuacct.usage_percpu
 2456565 411435 1052897 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Tejun Heo <htejun@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/a11d56cef12d0b4807f8be3a46bf9798c3014d59.1458635566.git.zhaolei@cn.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cpuacct.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 4a811203c04a..e39bd4faf2a8 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -138,7 +138,7 @@ static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
 	u64 totalcpuusage = 0;
 	int i;
 
-	for_each_present_cpu(i)
+	for_each_possible_cpu(i)
 		totalcpuusage += cpuacct_cpuusage_read(ca, i);
 
 	return totalcpuusage;
@@ -159,7 +159,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
 		goto out;
 	}
 
-	for_each_present_cpu(i)
+	for_each_possible_cpu(i)
 		cpuacct_cpuusage_write(ca, i, 0);
 
 out:
@@ -172,7 +172,7 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
 	u64 percpu;
 	int i;
 
-	for_each_present_cpu(i) {
+	for_each_possible_cpu(i) {
 		percpu = cpuacct_cpuusage_read(ca, i);
 		seq_printf(m, "%llu ", (unsigned long long) percpu);
 	}
@@ -191,7 +191,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
 	int cpu;
 	s64 val = 0;
 
-	for_each_online_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
 		val += kcpustat->cpustat[CPUTIME_USER];
 		val += kcpustat->cpustat[CPUTIME_NICE];
@@ -200,7 +200,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
 	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
 
 	val = 0;
-	for_each_online_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
 		val += kcpustat->cpustat[CPUTIME_SYSTEM];
 		val += kcpustat->cpustat[CPUTIME_IRQ];
-- 
cgit v1.2.3


From d740037fac7052e49450f6fa1454f1144a103b55 Mon Sep 17 00:00:00 2001
From: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Date: Tue, 22 Mar 2016 16:37:08 +0800
Subject: sched/cpuacct: Split usage accounting into user_usage and sys_usage

Sometimes, cpuacct.usage is not detailed enough to see how much CPU
usage a group had. We want to know how much time it used in user mode
and how much in kernel mode.

This patch introduces more files to give this information:

 # ls /sys/fs/cgroup/cpuacct/cpuacct.usage*
 /sys/fs/cgroup/cpuacct/cpuacct.usage
 /sys/fs/cgroup/cpuacct/cpuacct.usage_percpu
 /sys/fs/cgroup/cpuacct/cpuacct.usage_user
 /sys/fs/cgroup/cpuacct/cpuacct.usage_percpu_user
 /sys/fs/cgroup/cpuacct/cpuacct.usage_sys
 /sys/fs/cgroup/cpuacct/cpuacct.usage_percpu_sys

... while keeping the ABI with the existing counter.

Signed-off-by: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
[ Ported to newer kernels. ]
Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <htejun@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/aa171da036b520b51c79549e9b3215d29473f19d.1458635566.git.zhaolei@cn.fujitsu.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cpuacct.c | 140 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 113 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index e39bd4faf2a8..df947e07aac1 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -25,11 +25,22 @@ enum cpuacct_stat_index {
 	CPUACCT_STAT_NSTATS,
 };
 
+enum cpuacct_usage_index {
+	CPUACCT_USAGE_USER,	/* ... user mode */
+	CPUACCT_USAGE_SYSTEM,	/* ... kernel mode */
+
+	CPUACCT_USAGE_NRUSAGE,
+};
+
+struct cpuacct_usage {
+	u64	usages[CPUACCT_USAGE_NRUSAGE];
+};
+
 /* track cpu usage of a group of tasks and its child groups */
 struct cpuacct {
 	struct cgroup_subsys_state css;
 	/* cpuusage holds pointer to a u64-type object on every cpu */
-	u64 __percpu *cpuusage;
+	struct cpuacct_usage __percpu *cpuusage;
 	struct kernel_cpustat __percpu *cpustat;
 };
 
@@ -49,7 +60,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
 	return css_ca(ca->css.parent);
 }
 
-static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
+static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
 static struct cpuacct root_cpuacct = {
 	.cpustat	= &kernel_cpustat,
 	.cpuusage	= &root_cpuacct_cpuusage,
@@ -68,7 +79,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (!ca)
 		goto out;
 
-	ca->cpuusage = alloc_percpu(u64);
+	ca->cpuusage = alloc_percpu(struct cpuacct_usage);
 	if (!ca->cpuusage)
 		goto out_free_ca;
 
@@ -96,20 +107,37 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
 	kfree(ca);
 }
 
-static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
+				 enum cpuacct_usage_index index)
 {
-	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 	u64 data;
 
+	/*
+	 * We allow index == CPUACCT_USAGE_NRUSAGE here to read
+	 * the sum of suages.
+	 */
+	BUG_ON(index > CPUACCT_USAGE_NRUSAGE);
+
 #ifndef CONFIG_64BIT
 	/*
 	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
 	 */
 	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-	data = *cpuusage;
+#endif
+
+	if (index == CPUACCT_USAGE_NRUSAGE) {
+		int i = 0;
+
+		data = 0;
+		for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+			data += cpuusage->usages[i];
+	} else {
+		data = cpuusage->usages[index];
+	}
+
+#ifndef CONFIG_64BIT
 	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-	data = *cpuusage;
 #endif
 
 	return data;
@@ -117,69 +145,103 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 
 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 {
-	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+	int i;
 
 #ifndef CONFIG_64BIT
 	/*
 	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
 	 */
 	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
-	*cpuusage = val;
+#endif
+
+	for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+		cpuusage->usages[i] = val;
+
+#ifndef CONFIG_64BIT
 	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
-	*cpuusage = val;
 #endif
 }
 
 /* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
+static u64 __cpuusage_read(struct cgroup_subsys_state *css,
+			   enum cpuacct_usage_index index)
 {
 	struct cpuacct *ca = css_ca(css);
 	u64 totalcpuusage = 0;
 	int i;
 
 	for_each_possible_cpu(i)
-		totalcpuusage += cpuacct_cpuusage_read(ca, i);
+		totalcpuusage += cpuacct_cpuusage_read(ca, i, index);
 
 	return totalcpuusage;
 }
 
+static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
+			      struct cftype *cft)
+{
+	return __cpuusage_read(css, CPUACCT_USAGE_USER);
+}
+
+static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
+			     struct cftype *cft)
+{
+	return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM);
+}
+
+static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+	return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE);
+}
+
 static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
 			  u64 val)
 {
 	struct cpuacct *ca = css_ca(css);
-	int err = 0;
-	int i;
+	int cpu;
 
 	/*
 	 * Only allow '0' here to do a reset.
 	 */
-	if (val) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (val)
+		return -EINVAL;
 
-	for_each_possible_cpu(i)
-		cpuacct_cpuusage_write(ca, i, 0);
+	for_each_possible_cpu(cpu)
+		cpuacct_cpuusage_write(ca, cpu, 0);
 
-out:
-	return err;
+	return 0;
 }
 
-static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
+static int __cpuacct_percpu_seq_show(struct seq_file *m,
+				     enum cpuacct_usage_index index)
 {
 	struct cpuacct *ca = css_ca(seq_css(m));
 	u64 percpu;
 	int i;
 
 	for_each_possible_cpu(i) {
-		percpu = cpuacct_cpuusage_read(ca, i);
+		percpu = cpuacct_cpuusage_read(ca, i, index);
 		seq_printf(m, "%llu ", (unsigned long long) percpu);
 	}
 	seq_printf(m, "\n");
 	return 0;
 }
 
+static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
+{
+	return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER);
+}
+
+static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
+{
+	return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM);
+}
+
+static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
+{
+	return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE);
+}
+
 static const char * const cpuacct_stat_desc[] = {
 	[CPUACCT_STAT_USER] = "user",
 	[CPUACCT_STAT_SYSTEM] = "system",
@@ -219,10 +281,26 @@ static struct cftype files[] = {
 		.read_u64 = cpuusage_read,
 		.write_u64 = cpuusage_write,
 	},
+	{
+		.name = "usage_user",
+		.read_u64 = cpuusage_user_read,
+	},
+	{
+		.name = "usage_sys",
+		.read_u64 = cpuusage_sys_read,
+	},
 	{
 		.name = "usage_percpu",
 		.seq_show = cpuacct_percpu_seq_show,
 	},
+	{
+		.name = "usage_percpu_user",
+		.seq_show = cpuacct_percpu_user_seq_show,
+	},
+	{
+		.name = "usage_percpu_sys",
+		.seq_show = cpuacct_percpu_sys_seq_show,
+	},
 	{
 		.name = "stat",
 		.seq_show = cpuacct_stats_show,
@@ -238,10 +316,18 @@ static struct cftype files[] = {
 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
 	struct cpuacct *ca;
+	int index;
+
+	if (user_mode(task_pt_regs(tsk)))
+		index = CPUACCT_USAGE_USER;
+	else
+		index = CPUACCT_USAGE_SYSTEM;
 
 	rcu_read_lock();
+
 	for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
-		*this_cpu_ptr(ca->cpuusage) += cputime;
+		this_cpu_ptr(ca->cpuusage)->usages[index] += cputime;
+
 	rcu_read_unlock();
 }
 
-- 
cgit v1.2.3


From d02c071183e1c01a76811c878c8a52322201f81f Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Wed, 23 Mar 2016 17:54:44 +0530
Subject: sched/fair: Reset nr_balance_failed after active balancing

To force a task migration during active balancing, nr_balance_failed is set
to cache_nice_tries + 1. However nr_balance_failed is not reset. As a side
effect, the next regular load balance under the same sd, a cache hot task
might be migrated, just because nr_balance_failed count is high.

Resetting nr_balance_failed after a successful active balance ensures
that a hot task is not unreasonably migrated. This can be verified by
looking at othe number of hot task migrations reported by /proc/schedstat.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1458735884-30105-1-git-send-email-srikar@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0fe30e66aff1..cbb075e46b2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7399,10 +7399,7 @@ more_balance:
 					&busiest->active_balance_work);
 			}
 
-			/*
-			 * We've kicked active balancing, reset the failure
-			 * counter.
-			 */
+			/* We've kicked active balancing, force task migration. */
 			sd->nr_balance_failed = sd->cache_nice_tries+1;
 		}
 	} else
@@ -7637,10 +7634,13 @@ static int active_load_balance_cpu_stop(void *data)
 		schedstat_inc(sd, alb_count);
 
 		p = detach_one_task(&env);
-		if (p)
+		if (p) {
 			schedstat_inc(sd, alb_pushed);
-		else
+			/* Active balancing done, reset the failure counter. */
+			sd->nr_balance_failed = 0;
+		} else {
 			schedstat_inc(sd, alb_failed);
+		}
 	}
 	rcu_read_unlock();
 out_unlock:
-- 
cgit v1.2.3


From bfdb198ccd99472c5bded689699eb30dd06316bb Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Mon, 1 Feb 2016 14:47:59 -0800
Subject: sched/numa: Remove unnecessary NUMA dequeue update from non-SMP
 kernels

In account_entity_enqueue(), we do not do account_numa_enqueue()
as NUMA balancing is not needed for UP kernels.

Hence, we should remove the account_numa_dequeue() call from
account_entity_dequeue() for UP kernels.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1454366879.21738.29.camel@schen9-desk2.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cbb075e46b2c..1fe9916b4557 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2437,10 +2437,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
+#ifdef CONFIG_SMP
 	if (entity_is_task(se)) {
 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
 		list_del_init(&se->group_node);
 	}
+#endif
 	cfs_rq->nr_running--;
 }
 
-- 
cgit v1.2.3


From 47252cfbac03644ee4a3adfa50c77896aa94f2bb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 21 Mar 2016 11:23:39 -0400
Subject: sched/core: Add preempt checks in preempt_schedule() code

While testing the tracer preemptoff, I hit this strange trace:

   <...>-259     0...1    0us : schedule <-worker_thread
   <...>-259     0d..1    0us : rcu_note_context_switch <-__schedule
   <...>-259     0d..1    0us : rcu_sched_qs <-rcu_note_context_switch
   <...>-259     0d..1    0us : rcu_preempt_qs <-rcu_note_context_switch
   <...>-259     0d..1    0us : _raw_spin_lock <-__schedule
   <...>-259     0d..1    0us : preempt_count_add <-_raw_spin_lock
   <...>-259     0d..2    0us : do_raw_spin_lock <-_raw_spin_lock
   <...>-259     0d..2    1us : deactivate_task <-__schedule
   <...>-259     0d..2    1us : update_rq_clock.part.84 <-deactivate_task
   <...>-259     0d..2    1us : dequeue_task_fair <-deactivate_task
   <...>-259     0d..2    1us : dequeue_entity <-dequeue_task_fair
   <...>-259     0d..2    1us : update_curr <-dequeue_entity
   <...>-259     0d..2    1us : update_min_vruntime <-update_curr
   <...>-259     0d..2    1us : cpuacct_charge <-update_curr
   <...>-259     0d..2    1us : __rcu_read_lock <-cpuacct_charge
   <...>-259     0d..2    1us : __rcu_read_unlock <-cpuacct_charge
   <...>-259     0d..2    1us : clear_buddies <-dequeue_entity
   <...>-259     0d..2    1us : account_entity_dequeue <-dequeue_entity
   <...>-259     0d..2    2us : update_min_vruntime <-dequeue_entity
   <...>-259     0d..2    2us : update_cfs_shares <-dequeue_entity
   <...>-259     0d..2    2us : hrtick_update <-dequeue_task_fair
   <...>-259     0d..2    2us : wq_worker_sleeping <-__schedule
   <...>-259     0d..2    2us : kthread_data <-wq_worker_sleeping
   <...>-259     0d..2    2us : pick_next_task_fair <-__schedule
   <...>-259     0d..2    2us : check_cfs_rq_runtime <-pick_next_task_fair
   <...>-259     0d..2    2us : pick_next_entity <-pick_next_task_fair
   <...>-259     0d..2    2us : clear_buddies <-pick_next_entity
   <...>-259     0d..2    2us : pick_next_entity <-pick_next_task_fair
   <...>-259     0d..2    2us : clear_buddies <-pick_next_entity
   <...>-259     0d..2    2us : set_next_entity <-pick_next_task_fair
   <...>-259     0d..2    3us : put_prev_entity <-pick_next_task_fair
   <...>-259     0d..2    3us : check_cfs_rq_runtime <-put_prev_entity
   <...>-259     0d..2    3us : set_next_entity <-pick_next_task_fair
gnome-sh-1031    0d..2    3us : finish_task_switch <-__schedule
gnome-sh-1031    0d..2    3us : _raw_spin_unlock_irq <-finish_task_switch
gnome-sh-1031    0d..2    3us : do_raw_spin_unlock <-_raw_spin_unlock_irq
gnome-sh-1031    0...2    3us!: preempt_count_sub <-_raw_spin_unlock_irq
gnome-sh-1031    0...1  582us : do_raw_spin_lock <-_raw_spin_lock
gnome-sh-1031    0...1  583us : _raw_spin_unlock <-drm_gem_object_lookup
gnome-sh-1031    0...1  583us : do_raw_spin_unlock <-_raw_spin_unlock
gnome-sh-1031    0...1  583us : preempt_count_sub <-_raw_spin_unlock
gnome-sh-1031    0...1  584us : _raw_spin_unlock <-drm_gem_object_lookup
gnome-sh-1031    0...1  584us+: trace_preempt_on <-drm_gem_object_lookup
gnome-sh-1031    0...1  603us : <stack trace>
 => preempt_count_sub
 => _raw_spin_unlock
 => drm_gem_object_lookup
 => i915_gem_madvise_ioctl
 => drm_ioctl
 => do_vfs_ioctl
 => SyS_ioctl
 => entry_SYSCALL_64_fastpath

As I'm tracing preemption disabled, it seemed incorrect that the trace
would go across a schedule and report not being in the scheduler.
Looking into this I discovered the problem.

schedule() calls preempt_disable() but the preempt_schedule() calls
preempt_enable_notrace(). What happened above was that the gnome-shell
task was preempted on another CPU, migrated over to the idle cpu. The
tracer stared with idle calling schedule(), which called
preempt_disable(), but then gnome-shell finished, and it enabled
preemption with preempt_enable_notrace() that does stop the trace, even
though preemption was enabled.

The purpose of the preempt_disable_notrace() in the preempt_schedule()
is to prevent function tracing from going into an infinite loop.
Because function tracing can trace the preempt_enable/disable() calls
that are traced. The problem with function tracing is:

  NEED_RESCHED set
  preempt_schedule()
    preempt_disable()
      preempt_count_inc()
        function trace (before incrementing preempt count)
          preempt_disable_notrace()
          preempt_enable_notrace()
            sees NEED_RESCHED set
               preempt_schedule() (repeat)

Now by breaking out the preempt off/on tracing into their own code:
preempt_disable_check() and preempt_enable_check(), we can add these to
the preempt_schedule() code. As preemption would then be disabled, even
if they were to be traced by the function tracer, the disabled
preemption would prevent the recursion.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160321112339.6dc78ad6@gandalf.local.home
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 59 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b489fcac37b..b5cf01d2368e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2958,6 +2958,20 @@ u64 scheduler_tick_max_deferment(void)
 
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
 				defined(CONFIG_PREEMPT_TRACER))
+/*
+ * If the value passed in is equal to the current preempt count
+ * then we just disabled preemption. Start timing the latency.
+ */
+static inline void preempt_latency_start(int val)
+{
+	if (preempt_count() == val) {
+		unsigned long ip = get_lock_parent_ip();
+#ifdef CONFIG_DEBUG_PREEMPT
+		current->preempt_disable_ip = ip;
+#endif
+		trace_preempt_off(CALLER_ADDR0, ip);
+	}
+}
 
 void preempt_count_add(int val)
 {
@@ -2976,17 +2990,21 @@ void preempt_count_add(int val)
 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
 				PREEMPT_MASK - 10);
 #endif
-	if (preempt_count() == val) {
-		unsigned long ip = get_lock_parent_ip();
-#ifdef CONFIG_DEBUG_PREEMPT
-		current->preempt_disable_ip = ip;
-#endif
-		trace_preempt_off(CALLER_ADDR0, ip);
-	}
+	preempt_latency_start(val);
 }
 EXPORT_SYMBOL(preempt_count_add);
 NOKPROBE_SYMBOL(preempt_count_add);
 
+/*
+ * If the value passed in equals to the current preempt count
+ * then we just enabled preemption. Stop timing the latency.
+ */
+static inline void preempt_latency_stop(int val)
+{
+	if (preempt_count() == val)
+		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+}
+
 void preempt_count_sub(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
@@ -3003,13 +3021,15 @@ void preempt_count_sub(int val)
 		return;
 #endif
 
-	if (preempt_count() == val)
-		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+	preempt_latency_stop(val);
 	__preempt_count_sub(val);
 }
 EXPORT_SYMBOL(preempt_count_sub);
 NOKPROBE_SYMBOL(preempt_count_sub);
 
+#else
+static inline void preempt_latency_start(int val) { }
+static inline void preempt_latency_stop(int val) { }
 #endif
 
 /*
@@ -3284,8 +3304,23 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
 	do {
+		/*
+		 * Because the function tracer can trace preempt_count_sub()
+		 * and it also uses preempt_enable/disable_notrace(), if
+		 * NEED_RESCHED is set, the preempt_enable_notrace() called
+		 * by the function tracer will call this function again and
+		 * cause infinite recursion.
+		 *
+		 * Preemption must be disabled here before the function
+		 * tracer can trace. Break up preempt_disable() into two
+		 * calls. One to disable preemption without fear of being
+		 * traced. The other to still record the preemption latency,
+		 * which can also be traced by the function tracer.
+		 */
 		preempt_disable_notrace();
+		preempt_latency_start(1);
 		__schedule(true);
+		preempt_latency_stop(1);
 		preempt_enable_no_resched_notrace();
 
 		/*
@@ -3337,7 +3372,21 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 		return;
 
 	do {
+		/*
+		 * Because the function tracer can trace preempt_count_sub()
+		 * and it also uses preempt_enable/disable_notrace(), if
+		 * NEED_RESCHED is set, the preempt_enable_notrace() called
+		 * by the function tracer will call this function again and
+		 * cause infinite recursion.
+		 *
+		 * Preemption must be disabled here before the function
+		 * tracer can trace. Break up preempt_disable() into two
+		 * calls. One to disable preemption without fear of being
+		 * traced. The other to still record the preemption latency,
+		 * which can also be traced by the function tracer.
+		 */
 		preempt_disable_notrace();
+		preempt_latency_start(1);
 		/*
 		 * Needs preempt disabled in case user_exit() is traced
 		 * and the tracer calls preempt_enable_notrace() causing
@@ -3347,6 +3396,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 		__schedule(true);
 		exception_exit(prev_ctx);
 
+		preempt_latency_stop(1);
 		preempt_enable_no_resched_notrace();
 	} while (need_resched());
 }
-- 
cgit v1.2.3


From 1c3de5e19fc96206dd086e634129d08e5f7b1000 Mon Sep 17 00:00:00 2001
From: Yuyang Du <yuyang.du@intel.com>
Date: Wed, 30 Mar 2016 07:07:51 +0800
Subject: sched/fair: Update comments after a variable rename

The following commit:

  ed82b8a1ff76 ("sched/core: Move the sched_to_prio[] arrays out of line")

renamed prio_to_weight to sched_prio_to_weight, but the old name was not
updated in comments.

Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1459292871-22531-1-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1fe9916b4557..4bb5ace60dc8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -204,7 +204,7 @@ static void __update_inv_weight(struct load_weight *lw)
  *   OR
  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
  *
- * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
+ * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
  * we're guaranteed shift stays positive because inv_weight is guaranteed to
  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
  *
@@ -5656,7 +5656,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  *   W_i,0 = \Sum_j w_i,j                                             (2)
  *
  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
- * is derived from the nice value as per prio_to_weight[].
+ * is derived from the nice value as per sched_prio_to_weight[].
  *
  * The weight average is an exponential decay average of the instantaneous
  * weight:
-- 
cgit v1.2.3


From 2b8c41daba327c633228169e8bd8ec067ab443f8 Mon Sep 17 00:00:00 2001
From: Yuyang Du <yuyang.du@intel.com>
Date: Wed, 30 Mar 2016 04:30:56 +0800
Subject: sched/fair: Initiate a new task's util avg to a bounded value

A new task's util_avg is set to full utilization of a CPU (100% time
running). This accelerates a new task's utilization ramp-up, useful to
boost its execution in early time. However, it may result in
(insanely) high utilization for a transient time period when a flood
of tasks are spawned. Importantly, it violates the "fundamentally
bounded" CPU utilization, and its side effect is negative if we don't
take any measure to bound it.

This patch proposes an algorithm to address this issue. It has
two methods to approach a sensible initial util_avg:

(1) An expected (or average) util_avg based on its cfs_rq's util_avg:

  util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight

(2) A trajectory of how successive new tasks' util develops, which
gives 1/2 of the left utilization budget to a new task such that
the additional util is noticeably large (when overall util is low) or
unnoticeably small (when overall util is high enough). In the meantime,
the aggregate utilization is well bounded:

  util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n

where n denotes the nth task.

If util_avg is larger than util_avg_cap, then the effective util is
clamped to the util_avg_cap.

Reported-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: steve.muckle@linaro.org
Link: http://lkml.kernel.org/r/1459283456-21682-1-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  |  2 ++
 kernel/sched/fair.c  | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/sched.h |  1 +
 3 files changed, 57 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b5cf01d2368e..11594230ef4d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2431,6 +2431,8 @@ void wake_up_new_task(struct task_struct *p)
 	 */
 	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
+	/* Post initialize new task's util average when its cfs_rq is set */
+	post_init_entity_util_avg(&p->se);
 
 	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4bb5ace60dc8..b8cc1c35cd7c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -682,17 +682,68 @@ void init_entity_runnable_average(struct sched_entity *se)
 	sa->period_contrib = 1023;
 	sa->load_avg = scale_load_down(se->load.weight);
 	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
-	sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
-	sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+	/*
+	 * At this point, util_avg won't be used in select_task_rq_fair anyway
+	 */
+	sa->util_avg = 0;
+	sa->util_sum = 0;
 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 
+/*
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *
+ * However, in many cases, the above util_avg does not give a desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the left utilization budget:
+ *
+ *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task.
+ *
+ * For example, a simplest series from the beginning would be like:
+ *
+ *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
+ */
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	struct sched_avg *sa = &se->avg;
+	long cap = (long)(scale_load_down(SCHED_LOAD_SCALE) - cfs_rq->avg.util_avg) / 2;
+
+	if (cap > 0) {
+		if (cfs_rq->avg.util_avg != 0) {
+			sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
+			sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+			if (sa->util_avg > cap)
+				sa->util_avg = cap;
+		} else {
+			sa->util_avg = cap;
+		}
+		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+	}
+}
+
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
 #else
 void init_entity_runnable_average(struct sched_entity *se)
 {
 }
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+}
 #endif
 
 /*
@@ -8384,6 +8435,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 		init_entity_runnable_average(se);
+		post_init_entity_util_avg(se);
 	}
 
 	return 1;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec2e8d23527e..a7cbad7b3ad2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1313,6 +1313,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
 unsigned long to_ratio(u64 period, u64 runtime);
 
 extern void init_entity_runnable_average(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct sched_entity *se);
 
 #ifdef CONFIG_NO_HZ_FULL
 extern bool sched_can_stop_tick(struct rq *rq);
-- 
cgit v1.2.3


From 39e2e173fb1f900959d3a25c21c65fa88b06c6ee Mon Sep 17 00:00:00 2001
From: Alfredo Alvarez Fernandez <alfredoalvarezfernandez@gmail.com>
Date: Wed, 30 Mar 2016 19:03:36 +0200
Subject: locking/lockdep: Print chain_key collision information

A sequence of pairs [class_idx -> corresponding chain_key iteration]
is printed for both the current held_lock chain and the cached chain.

That exposes the two different class_idx sequences that led to that
particular hash value.

This helps with debugging hash chain collision reports.

Signed-off-by: Alfredo Alvarez Fernandez <alfredoalvarezfernandez@gmail.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-fsdevel@vger.kernel.org
Cc: sedat.dilek@gmail.com
Cc: tytso@mit.edu
Link: http://lkml.kernel.org/r/1459357416-19190-1-git-send-email-alfredoalvarezernandez@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 77 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 53ab2f85d77e..2324ba5310db 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1999,6 +1999,77 @@ static inline int get_first_held_lock(struct task_struct *curr,
 	return ++i;
 }
 
+/*
+ * Returns the next chain_key iteration
+ */
+static u64 print_chain_key_iteration(int class_idx, u64 chain_key)
+{
+	u64 new_chain_key = iterate_chain_key(chain_key, class_idx);
+
+	printk(" class_idx:%d -> chain_key:%016Lx",
+		class_idx,
+		(unsigned long long)new_chain_key);
+	return new_chain_key;
+}
+
+static void
+print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next)
+{
+	struct held_lock *hlock;
+	u64 chain_key = 0;
+	int depth = curr->lockdep_depth;
+	int i;
+
+	printk("depth: %u\n", depth + 1);
+	for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) {
+		hlock = curr->held_locks + i;
+		chain_key = print_chain_key_iteration(hlock->class_idx, chain_key);
+
+		print_lock(hlock);
+	}
+
+	print_chain_key_iteration(hlock_next->class_idx, chain_key);
+	print_lock(hlock_next);
+}
+
+static void print_chain_keys_chain(struct lock_chain *chain)
+{
+	int i;
+	u64 chain_key = 0;
+	int class_id;
+
+	printk("depth: %u\n", chain->depth);
+	for (i = 0; i < chain->depth; i++) {
+		class_id = chain_hlocks[chain->base + i];
+		chain_key = print_chain_key_iteration(class_id + 1, chain_key);
+
+		print_lock_name(lock_classes + class_id);
+		printk("\n");
+	}
+}
+
+static void print_collision(struct task_struct *curr,
+			struct held_lock *hlock_next,
+			struct lock_chain *chain)
+{
+	printk("\n");
+	printk("======================\n");
+	printk("[chain_key collision ]\n");
+	print_kernel_ident();
+	printk("----------------------\n");
+	printk("%s/%d: ", current->comm, task_pid_nr(current));
+	printk("Hash chain already cached but the contents don't match!\n");
+
+	printk("Held locks:");
+	print_chain_keys_held_locks(curr, hlock_next);
+
+	printk("Locks in cached chain:");
+	print_chain_keys_chain(chain);
+
+	printk("\nstack backtrace:\n");
+	dump_stack();
+}
+
 /*
  * Checks whether the chain and the current held locks are consistent
  * in depth and also in content. If they are not it most likely means
@@ -2014,14 +2085,18 @@ static int check_no_collision(struct task_struct *curr,
 
 	i = get_first_held_lock(curr, hlock);
 
-	if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1)))
+	if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1))) {
+		print_collision(curr, hlock, chain);
 		return 0;
+	}
 
 	for (j = 0; j < chain->depth - 1; j++, i++) {
 		id = curr->held_locks[i].class_idx - 1;
 
-		if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id))
+		if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
+			print_collision(curr, hlock, chain);
 			return 0;
+		}
 	}
 #endif
 	return 1;
-- 
cgit v1.2.3


From 274529ba9bda86c91c2c06da3a641aaf617dd30f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 21 Mar 2016 19:46:04 -0700
Subject: rcu: Consolidate dumping of ftrace buffer

This commit consolidates a couple definitions and several calls for
single-shot ftrace-buffer dumping.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 13 +++++++++++++
 kernel/rcu/rcutorture.c  | 17 +++--------------
 kernel/rcu/tree.c        |  4 ++--
 3 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 2657aff2725b..45de591657a6 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1144,4 +1144,17 @@ static inline void rcu_sysidle_force_exit(void)
 #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 
 
+/*
+ * Dump the ftrace buffer, but only one time per callsite per boot.
+ */
+#define rcu_ftrace_dump(oops_dump_mode) \
+do { \
+	static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
+	\
+	if (!atomic_read(&___rfd_beenhere) && \
+	    !atomic_xchg(&___rfd_beenhere, 1)) \
+		ftrace_dump(oops_dump_mode); \
+} while (0)
+
+
 #endif /* __LINUX_RCUPDATE_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 250ea67c1615..463867c43221 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1082,17 +1082,6 @@ rcu_torture_fakewriter(void *arg)
 	return 0;
 }
 
-static void rcutorture_trace_dump(void)
-{
-	static atomic_t beenhere = ATOMIC_INIT(0);
-
-	if (atomic_read(&beenhere))
-		return;
-	if (atomic_xchg(&beenhere, 1) != 0)
-		return;
-	ftrace_dump(DUMP_ALL);
-}
-
 /*
  * RCU torture reader from timer handler.  Dereferences rcu_torture_current,
  * incrementing the corresponding element of the pipeline array.  The
@@ -1142,7 +1131,7 @@ static void rcu_torture_timer(unsigned long unused)
 	if (pipe_count > 1) {
 		do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts,
 					  started, completed);
-		rcutorture_trace_dump();
+		rcu_ftrace_dump(DUMP_ALL);
 	}
 	__this_cpu_inc(rcu_torture_count[pipe_count]);
 	completed = completed - started;
@@ -1215,7 +1204,7 @@ rcu_torture_reader(void *arg)
 		if (pipe_count > 1) {
 			do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu,
 						  ts, started, completed);
-			rcutorture_trace_dump();
+			rcu_ftrace_dump(DUMP_ALL);
 		}
 		__this_cpu_inc(rcu_torture_count[pipe_count]);
 		completed = completed - started;
@@ -1333,7 +1322,7 @@ rcu_torture_stats_print(void)
 			 rcu_torture_writer_state,
 			 gpnum, completed, flags);
 		show_rcu_gp_kthreads();
-		rcutorture_trace_dump();
+		rcu_ftrace_dump(DUMP_ALL);
 	}
 	rtcv_snap = rcu_torture_current_version;
 }
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9a535a86e732..531a328076bd 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -637,7 +637,7 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
 			idle_task(smp_processor_id());
 
 		trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
-		ftrace_dump(DUMP_ORIG);
+		rcu_ftrace_dump(DUMP_ORIG);
 		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
 			  current->pid, current->comm,
 			  idle->pid, idle->comm); /* must be idle task! */
@@ -799,7 +799,7 @@ static void rcu_eqs_exit_common(long long oldval, int user)
 
 		trace_rcu_dyntick(TPS("Error on exit: not idle task"),
 				  oldval, rdtp->dynticks_nesting);
-		ftrace_dump(DUMP_ORIG);
+		rcu_ftrace_dump(DUMP_ORIG);
 		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
 			  current->pid, current->comm,
 			  idle->pid, idle->comm); /* must be idle task! */
-- 
cgit v1.2.3


From 28728dd310d48834cd486dac3cac9ae96b9deb96 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 12 Jan 2016 08:33:37 -0800
Subject: rcu: Make expedited RCU-sched grace period immediately detect idle

Currently, sync_sched_exp_handler() will force a reschedule unless
this CPU has already checked in or unless a reschedule has already
been called for.  This is clearly wasteful if sync_sched_exp_handler()
interrupted an idle CPU, so this commit immediately reports the
quiescent state in that case.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 531a328076bd..5f4336fadc28 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3649,6 +3649,11 @@ static void sync_sched_exp_handler(void *data)
 	if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
 	    __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
 		return;
+	if (rcu_is_cpu_rrupt_from_idle()) {
+		rcu_report_exp_rdp(&rcu_sched_state,
+				   this_cpu_ptr(&rcu_sched_data), true);
+		return;
+	}
 	__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true);
 	resched_cpu(smp_processor_id());
 }
-- 
cgit v1.2.3


From 251c617c75f48e03523c43c4ce1dff44bc3ae2bd Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 13 Jan 2016 10:52:35 -0800
Subject: rcu: Make expedited RCU-preempt stall warnings count accurately

Currently, synchronize_sched_expedited_wait() simply sets the ndetected
variable to the rcu_print_task_exp_stall() return value.  This means
that if the last rcu_node structure has no stalled tasks, record of
any stalled tasks in previous rcu_node structures is lost, which can
in turn result in failure to dump out the blocking rcu_node structures.
Or could, had the test been correct.

This commit therefore adds the return value of rcu_print_task_exp_stall()
to ndetected and corrects the later test for ndetected.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5f4336fadc28..687d8a5f35c7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3778,7 +3778,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 		       rsp->name);
 		ndetected = 0;
 		rcu_for_each_leaf_node(rsp, rnp) {
-			ndetected = rcu_print_task_exp_stall(rnp);
+			ndetected += rcu_print_task_exp_stall(rnp);
 			mask = 1;
 			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
 				struct rcu_data *rdp;
@@ -3797,7 +3797,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
 			jiffies - jiffies_start, rsp->expedited_sequence,
 			rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
-		if (!ndetected) {
+		if (ndetected) {
 			pr_err("blocking rcu_node structures:");
 			rcu_for_each_node_breadth_first(rsp, rnp) {
 				if (rnp == rnp_root)
-- 
cgit v1.2.3


From a1e1224849d9610b50fd1dd7d6f44308a59e46af Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 13 Jan 2016 13:57:54 -0800
Subject: rcu: Make cond_resched_rcu_qs() supply RCU-sched expedited QS

Although cond_resched_rcu_qs() supplies quiescent states to all flavors
of normal RCU grace periods, it does nothing for expedited RCU-sched
grace periods.  This commit therefore adds a check for a need for a
quiescent state from the current CPU by an expedited RCU-sched grace
period, and invokes rcu_sched_qs() to supply that quiescent state if so.

Note that the check is racy in that we might be migrated to some other
CPU just after checking the per-CPU variable.  This is OK because the
act of migration will do a context switch, which will supply the needed
quiescent state.  The only downside is that we might do an unnecessary
call to rcu_sched_qs(), but the probability is low and the overhead
is small.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 687d8a5f35c7..178575c01d09 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -370,6 +370,21 @@ void rcu_all_qs(void)
 		rcu_momentary_dyntick_idle();
 		local_irq_restore(flags);
 	}
+	if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) {
+		/*
+		 * Yes, we just checked a per-CPU variable with preemption
+		 * enabled, so we might be migrated to some other CPU at
+		 * this point.  That is OK because in that case, the
+		 * migration will supply the needed quiescent state.
+		 * We might end up needlessly disabling preemption and
+		 * invoking rcu_sched_qs() on the destination CPU, but
+		 * the probability and cost are both quite low, so this
+		 * should not be a problem in practice.
+		 */
+		preempt_disable();
+		rcu_sched_qs();
+		preempt_enable();
+	}
 	this_cpu_inc(rcu_qs_ctr);
 	barrier(); /* Avoid RCU read-side critical sections leaking up. */
 }
-- 
cgit v1.2.3


From 26ece8ef6eca97f19eb5ad5186b8c1a29ab25d76 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 28 Jan 2016 18:48:37 -0800
Subject: rcu: Fix synchronize_rcu_expedited() header comment

This commit brings the synchronize_rcu_expedited() function's header
comment into line with the new implementation.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_plugin.h | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index efdf7b61ce12..a2ac2628ef8e 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -722,13 +722,19 @@ static void sync_rcu_exp_handler(void *info)
  * synchronize_rcu_expedited - Brute-force RCU grace period
  *
  * Wait for an RCU-preempt grace period, but expedite it.  The basic
- * idea is to invoke synchronize_sched_expedited() to push all the tasks to
- * the ->blkd_tasks lists and wait for this list to drain.  This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code.
- * In fact, if you are using synchronize_rcu_expedited() in a loop,
- * please restructure your code to batch your updates, and then Use a
- * single synchronize_rcu() instead.
+ * idea is to IPI all non-idle non-nohz online CPUs.  The IPI handler
+ * checks whether the CPU is in an RCU-preempt critical section, and
+ * if so, it sets a flag that causes the outermost rcu_read_unlock()
+ * to report the quiescent state.  On the other hand, if the CPU is
+ * not in an RCU read-side critical section, the IPI handler reports
+ * the quiescent state immediately.
+ *
+ * Although this is a greate improvement over previous expedited
+ * implementations, it is still unfriendly to real-time workloads, so is
+ * thus not recommended for any sort of common-case code.  In fact, if
+ * you are using synchronize_rcu_expedited() in a loop, please restructure
+ * your code to batch your updates, and then Use a single synchronize_rcu()
+ * instead.
  */
 void synchronize_rcu_expedited(void)
 {
-- 
cgit v1.2.3


From bea2de44ae647698dc848a671fdee6e53c192423 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 28 Jan 2016 20:30:06 -0800
Subject: rcu: Add funnel-locking tracing for expedited grace periods

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        | 31 +++++++++++++++++++++++++++----
 kernel/rcu/tree_plugin.h |  3 +++
 2 files changed, 30 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 178575c01d09..79e9206a7b11 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3584,10 +3584,18 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
 			       atomic_long_t *stat, unsigned long s)
 {
 	if (rcu_exp_gp_seq_done(rsp, s)) {
-		if (rnp)
+		if (rnp) {
 			mutex_unlock(&rnp->exp_funnel_mutex);
-		else if (rdp)
+			trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
+						  rnp->grplo, rnp->grphi,
+						  TPS("rel"));
+		} else if (rdp) {
 			mutex_unlock(&rdp->exp_funnel_mutex);
+			trace_rcu_exp_funnel_lock(rsp->name,
+						  rdp->mynode->level + 1,
+						  rdp->cpu, rdp->cpu,
+						  TPS("rel"));
+		}
 		/* Ensure test happens before caller kfree(). */
 		smp_mb__before_atomic(); /* ^^^ */
 		atomic_long_inc(stat);
@@ -3619,6 +3627,9 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 			if (sync_exp_work_done(rsp, rnp0, NULL,
 					       &rdp->expedited_workdone0, s))
 				return NULL;
+			trace_rcu_exp_funnel_lock(rsp->name, rnp0->level,
+						  rnp0->grplo, rnp0->grphi,
+						  TPS("acq"));
 			return rnp0;
 		}
 	}
@@ -3634,16 +3645,28 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 	if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s))
 		return NULL;
 	mutex_lock(&rdp->exp_funnel_mutex);
+	trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1,
+				  rdp->cpu, rdp->cpu, TPS("acq"));
 	rnp0 = rdp->mynode;
 	for (; rnp0 != NULL; rnp0 = rnp0->parent) {
 		if (sync_exp_work_done(rsp, rnp1, rdp,
 				       &rdp->expedited_workdone2, s))
 			return NULL;
 		mutex_lock(&rnp0->exp_funnel_mutex);
-		if (rnp1)
+		trace_rcu_exp_funnel_lock(rsp->name, rnp0->level,
+					  rnp0->grplo, rnp0->grphi, TPS("acq"));
+		if (rnp1) {
 			mutex_unlock(&rnp1->exp_funnel_mutex);
-		else
+			trace_rcu_exp_funnel_lock(rsp->name, rnp1->level,
+						  rnp1->grplo, rnp1->grphi,
+						  TPS("rel"));
+		} else {
 			mutex_unlock(&rdp->exp_funnel_mutex);
+			trace_rcu_exp_funnel_lock(rsp->name,
+						  rdp->mynode->level + 1,
+						  rdp->cpu, rdp->cpu,
+						  TPS("rel"));
+		}
 		rnp1 = rnp0;
 	}
 	if (sync_exp_work_done(rsp, rnp1, rdp,
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index a2ac2628ef8e..cd2dae43ff48 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -767,6 +767,9 @@ void synchronize_rcu_expedited(void)
 	/* Clean up and exit. */
 	rcu_exp_gp_seq_end(rsp);
 	mutex_unlock(&rnp_unlock->exp_funnel_mutex);
+	trace_rcu_exp_funnel_lock(rsp->name, rnp_unlock->level,
+				  rnp_unlock->grplo, rnp_unlock->grphi,
+				  TPS("rel"));
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 
-- 
cgit v1.2.3


From 4f41530245c7fd4837152e264d120d05ae940eb0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 28 Jan 2016 20:49:49 -0800
Subject: rcu: Add expedited-grace-period event tracing

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        | 20 +++++++++++++-------
 kernel/rcu/tree_plugin.h |  3 +++
 2 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 79e9206a7b11..524026fd9dd7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3584,17 +3584,18 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
 			       atomic_long_t *stat, unsigned long s)
 {
 	if (rcu_exp_gp_seq_done(rsp, s)) {
+		trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
 		if (rnp) {
-			mutex_unlock(&rnp->exp_funnel_mutex);
 			trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
 						  rnp->grplo, rnp->grphi,
 						  TPS("rel"));
+			mutex_unlock(&rnp->exp_funnel_mutex);
 		} else if (rdp) {
-			mutex_unlock(&rdp->exp_funnel_mutex);
 			trace_rcu_exp_funnel_lock(rsp->name,
 						  rdp->mynode->level + 1,
 						  rdp->cpu, rdp->cpu,
 						  TPS("rel"));
+			mutex_unlock(&rdp->exp_funnel_mutex);
 		}
 		/* Ensure test happens before caller kfree(). */
 		smp_mb__before_atomic(); /* ^^^ */
@@ -3624,12 +3625,12 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 	rnp0 = rcu_get_root(rsp);
 	if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
 		if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
-			if (sync_exp_work_done(rsp, rnp0, NULL,
-					       &rdp->expedited_workdone0, s))
-				return NULL;
 			trace_rcu_exp_funnel_lock(rsp->name, rnp0->level,
 						  rnp0->grplo, rnp0->grphi,
 						  TPS("acq"));
+			if (sync_exp_work_done(rsp, rnp0, NULL,
+					       &rdp->expedited_workdone0, s))
+				return NULL;
 			return rnp0;
 		}
 	}
@@ -3656,16 +3657,16 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 		trace_rcu_exp_funnel_lock(rsp->name, rnp0->level,
 					  rnp0->grplo, rnp0->grphi, TPS("acq"));
 		if (rnp1) {
-			mutex_unlock(&rnp1->exp_funnel_mutex);
 			trace_rcu_exp_funnel_lock(rsp->name, rnp1->level,
 						  rnp1->grplo, rnp1->grphi,
 						  TPS("rel"));
+			mutex_unlock(&rnp1->exp_funnel_mutex);
 		} else {
-			mutex_unlock(&rdp->exp_funnel_mutex);
 			trace_rcu_exp_funnel_lock(rsp->name,
 						  rdp->mynode->level + 1,
 						  rdp->cpu, rdp->cpu,
 						  TPS("rel"));
+			mutex_unlock(&rdp->exp_funnel_mutex);
 		}
 		rnp1 = rnp0;
 	}
@@ -3895,16 +3896,21 @@ void synchronize_sched_expedited(void)
 
 	/* Take a snapshot of the sequence number.  */
 	s = rcu_exp_gp_seq_snap(rsp);
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
 
 	rnp = exp_funnel_lock(rsp, s);
 	if (rnp == NULL)
 		return;  /* Someone else did our work for us. */
 
 	rcu_exp_gp_seq_start(rsp);
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
 	sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
 	synchronize_sched_expedited_wait(rsp);
 
 	rcu_exp_gp_seq_end(rsp);
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
+	trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
+				  rnp->grplo, rnp->grphi, TPS("rel"));
 	mutex_unlock(&rnp->exp_funnel_mutex);
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index cd2dae43ff48..36e94aed38a7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -750,12 +750,14 @@ void synchronize_rcu_expedited(void)
 	}
 
 	s = rcu_exp_gp_seq_snap(rsp);
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
 
 	rnp_unlock = exp_funnel_lock(rsp, s);
 	if (rnp_unlock == NULL)
 		return;  /* Someone else did our work for us. */
 
 	rcu_exp_gp_seq_start(rsp);
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
 
 	/* Initialize the rcu_node tree in preparation for the wait. */
 	sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
@@ -766,6 +768,7 @@ void synchronize_rcu_expedited(void)
 
 	/* Clean up and exit. */
 	rcu_exp_gp_seq_end(rsp);
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
 	mutex_unlock(&rnp_unlock->exp_funnel_mutex);
 	trace_rcu_exp_funnel_lock(rsp->name, rnp_unlock->level,
 				  rnp_unlock->grplo, rnp_unlock->grphi,
-- 
cgit v1.2.3


From e2fd9d35847d1936398d44c4df68dceb3d7f64e7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 30 Jan 2016 17:23:19 -0800
Subject: rcu: Remove expedited GP funnel-lock bypass

Commit #cdacbe1f91264 ("rcu: Add fastpath bypassing funnel locking")
turns out to be a pessimization at high load because it forces a tree
full of tasks to wait for an expedited grace period that they probably
do not need.  This commit therefore removes this optimization.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/RCU/trace.txt | 10 +++++-----
 kernel/rcu/tree.c           | 19 -------------------
 kernel/rcu/tree.h           |  1 -
 kernel/rcu/tree_trace.c     |  7 +++----
 4 files changed, 8 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index ec6998b1b6d0..00a3a38b375a 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -237,17 +237,17 @@ o	"ktl" is the low-order 16 bits (in hexadecimal) of the count of
 
 The output of "cat rcu/rcu_preempt/rcuexp" looks as follows:
 
-s=21872 wd0=0 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872
+s=21872 wd1=0 wd2=0 wd3=5 n=0 enq=0 sc=21872
 
 These fields are as follows:
 
 o	"s" is the sequence number, with an odd number indicating that
 	an expedited grace period is in progress.
 
-o	"wd0", "wd1", "wd2", and "wd3" are the number of times that an
-	attempt to start an expedited grace period found that someone
-	else had completed an expedited grace period that satisfies the
-	attempted request.  "Our work is done."
+o	"wd1", "wd2", and "wd3" are the number of times that an attempt
+	to start an expedited grace period found that someone else had
+	completed an expedited grace period that satisfies the attempted
+	request.  "Our work is done."
 
 o	"n" is number of times that a concurrent CPU-hotplug operation
 	forced a fallback to a normal grace period.
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 524026fd9dd7..62e73e0a929f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3616,25 +3616,6 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 	struct rcu_node *rnp0;
 	struct rcu_node *rnp1 = NULL;
 
-	/*
-	 * First try directly acquiring the root lock in order to reduce
-	 * latency in the common case where expedited grace periods are
-	 * rare.  We check mutex_is_locked() to avoid pathological levels of
-	 * memory contention on ->exp_funnel_mutex in the heavy-load case.
-	 */
-	rnp0 = rcu_get_root(rsp);
-	if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
-		if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
-			trace_rcu_exp_funnel_lock(rsp->name, rnp0->level,
-						  rnp0->grplo, rnp0->grphi,
-						  TPS("acq"));
-			if (sync_exp_work_done(rsp, rnp0, NULL,
-					       &rdp->expedited_workdone0, s))
-				return NULL;
-			return rnp0;
-		}
-	}
-
 	/*
 	 * Each pass through the following loop works its way
 	 * up the rcu_node tree, returning if others have done the
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index df668c0f9e64..ac9a7b0c36ae 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -388,7 +388,6 @@ struct rcu_data {
 	struct rcu_head oom_head;
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 	struct mutex exp_funnel_mutex;
-	atomic_long_t expedited_workdone0;	/* # done by others #0. */
 	atomic_long_t expedited_workdone1;	/* # done by others #1. */
 	atomic_long_t expedited_workdone2;	/* # done by others #2. */
 	atomic_long_t expedited_workdone3;	/* # done by others #3. */
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 1088e64f01ad..d149c412a4e5 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -185,17 +185,16 @@ static int show_rcuexp(struct seq_file *m, void *v)
 	int cpu;
 	struct rcu_state *rsp = (struct rcu_state *)m->private;
 	struct rcu_data *rdp;
-	unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
+	unsigned long s1 = 0, s2 = 0, s3 = 0;
 
 	for_each_possible_cpu(cpu) {
 		rdp = per_cpu_ptr(rsp->rda, cpu);
-		s0 += atomic_long_read(&rdp->expedited_workdone0);
 		s1 += atomic_long_read(&rdp->expedited_workdone1);
 		s2 += atomic_long_read(&rdp->expedited_workdone2);
 		s3 += atomic_long_read(&rdp->expedited_workdone3);
 	}
-	seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
-		   rsp->expedited_sequence, s0, s1, s2, s3,
+	seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
+		   rsp->expedited_sequence, s1, s2, s3,
 		   atomic_long_read(&rsp->expedited_normal),
 		   atomic_read(&rsp->expedited_need_qs),
 		   rsp->expedited_sequence / 2);
-- 
cgit v1.2.3


From ec3833ed02ae6ef2a933ece9de7cbab0c64c699e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 11 Jan 2016 16:29:29 -0800
Subject: rcu: Force boolean subscript for expedited stall warnings

The cpu_online() function can return values other than 0 and 1, which
can result in subscript overflow when applied to a two-element array.
This commit allows for this behavior by using "!!" on the return value
from cpu_online() when used as a subscript.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 62e73e0a929f..64c2e3288551 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3808,7 +3808,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 				ndetected++;
 				rdp = per_cpu_ptr(rsp->rda, cpu);
 				pr_cont(" %d-%c%c%c", cpu,
-					"O."[cpu_online(cpu)],
+					"O."[!!cpu_online(cpu)],
 					"o."[!!(rdp->grpmask & rnp->expmaskinit)],
 					"N."[!!(rdp->grpmask & rnp->expmaskinitnext)]);
 			}
-- 
cgit v1.2.3


From d40a4f09a448382961fa9b1a2f7d4f34813f0273 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 8 Mar 2016 14:43:44 -0800
Subject: rcu: Shorten expedited_workdone* to exp_workdone*

Just a name change to save a few lines and a bit of typing.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c       | 8 +++-----
 kernel/rcu/tree.h       | 6 +++---
 kernel/rcu/tree_trace.c | 6 +++---
 3 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 64c2e3288551..89f028767765 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3624,15 +3624,14 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 	 * can be inexact, as it is just promoting locality and is not
 	 * strictly needed for correctness.
 	 */
-	if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s))
+	if (sync_exp_work_done(rsp, NULL, NULL, &rdp->exp_workdone1, s))
 		return NULL;
 	mutex_lock(&rdp->exp_funnel_mutex);
 	trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1,
 				  rdp->cpu, rdp->cpu, TPS("acq"));
 	rnp0 = rdp->mynode;
 	for (; rnp0 != NULL; rnp0 = rnp0->parent) {
-		if (sync_exp_work_done(rsp, rnp1, rdp,
-				       &rdp->expedited_workdone2, s))
+		if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone2, s))
 			return NULL;
 		mutex_lock(&rnp0->exp_funnel_mutex);
 		trace_rcu_exp_funnel_lock(rsp->name, rnp0->level,
@@ -3651,8 +3650,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 		}
 		rnp1 = rnp0;
 	}
-	if (sync_exp_work_done(rsp, rnp1, rdp,
-			       &rdp->expedited_workdone3, s))
+	if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone3, s))
 		return NULL;
 	return rnp1;
 }
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index ac9a7b0c36ae..6a8f09446924 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -388,9 +388,9 @@ struct rcu_data {
 	struct rcu_head oom_head;
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
 	struct mutex exp_funnel_mutex;
-	atomic_long_t expedited_workdone1;	/* # done by others #1. */
-	atomic_long_t expedited_workdone2;	/* # done by others #2. */
-	atomic_long_t expedited_workdone3;	/* # done by others #3. */
+	atomic_long_t exp_workdone1;	/* # done by others #1. */
+	atomic_long_t exp_workdone2;	/* # done by others #2. */
+	atomic_long_t exp_workdone3;	/* # done by others #3. */
 
 	/* 7) Callback offloading. */
 #ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index d149c412a4e5..86782f9a4604 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -189,9 +189,9 @@ static int show_rcuexp(struct seq_file *m, void *v)
 
 	for_each_possible_cpu(cpu) {
 		rdp = per_cpu_ptr(rsp->rda, cpu);
-		s1 += atomic_long_read(&rdp->expedited_workdone1);
-		s2 += atomic_long_read(&rdp->expedited_workdone2);
-		s3 += atomic_long_read(&rdp->expedited_workdone3);
+		s1 += atomic_long_read(&rdp->exp_workdone1);
+		s2 += atomic_long_read(&rdp->exp_workdone2);
+		s3 += atomic_long_read(&rdp->exp_workdone3);
 	}
 	seq_printf(m, "s=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
 		   rsp->expedited_sequence, s1, s2, s3,
-- 
cgit v1.2.3


From f6a12f34a448cc8a624070fd365c29c890138a48 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 30 Jan 2016 17:57:35 -0800
Subject: rcu: Enforce expedited-GP fairness via funnel wait queue

The current mutex-based funnel-locking approach used by expedited grace
periods is subject to severe unfairness.  The problem arises when a
few tasks, making a path from leaves to root, all wake up before other
tasks do.  A new task can then follow this path all the way to the root,
which needlessly delays tasks whose grace period is done, but who do
not happen to acquire the lock quickly enough.

This commit avoids this problem by maintaining per-rcu_node wait queues,
along with a per-rcu_node counter that tracks the latest grace period
sought by an earlier task to visit this node.  If that grace period
would satisfy the current task, instead of proceeding up the tree,
it waits on the current rcu_node structure using a pair of wait queues
provided for that purpose.  This decouples awakening of old tasks from
the arrival of new tasks.

If the wakeups prove to be a bottleneck, additional kthreads can be
brought to bear for that purpose.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/trace/events/rcu.h |   5 +-
 kernel/rcu/tree.c          | 155 +++++++++++++++++++++++----------------------
 kernel/rcu/tree.h          |  10 ++-
 kernel/rcu/tree_plugin.h   |  16 ++---
 4 files changed, 93 insertions(+), 93 deletions(-)

(limited to 'kernel')

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index aacc172eba7e..d3e756539d44 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -179,6 +179,7 @@ TRACE_EVENT(rcu_grace_period_init,
  *	"snap": Captured snapshot of expedited grace period sequence number.
  *	"start": Started a real expedited grace period.
  *	"end": Ended a real expedited grace period.
+ *	"endwake": Woke piggybackers up.
  *	"done": Someone else did the expedited grace period for us.
  */
 TRACE_EVENT(rcu_exp_grace_period,
@@ -210,8 +211,8 @@ TRACE_EVENT(rcu_exp_grace_period,
  * and highest-numbered CPU associated with the current rcu_node structure,
  * and a string.  identifying the grace-period-related event as follows:
  *
- *	"acq": Acquired a level of funnel lock
- *	"rel": Released a level of funnel lock
+ *	"nxtlvl": Advance to next level of rcu_node funnel
+ *	"wait": Wait for someone else to do expedited GP
  */
 TRACE_EVENT(rcu_exp_funnel_lock,
 
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 89f028767765..bd2658edce00 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -102,6 +102,7 @@ struct rcu_state sname##_state = { \
 	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
 	.name = RCU_STATE_NAME(sname), \
 	.abbr = sabbr, \
+	.exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \
 }
 
 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
@@ -3484,7 +3485,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
  * for the current expedited grace period.  Works only for preemptible
  * RCU -- other RCU implementation use other means.
  *
- * Caller must hold the root rcu_node's exp_funnel_mutex.
+ * Caller must hold the rcu_state's exp_mutex.
  */
 static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
 {
@@ -3500,8 +3501,8 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
  * recursively up the tree.  (Calm down, calm down, we do the recursion
  * iteratively!)
  *
- * Caller must hold the root rcu_node's exp_funnel_mutex and the
- * specified rcu_node structure's ->lock.
+ * Caller must hold the rcu_state's exp_mutex and the specified rcu_node
+ * structure's ->lock.
  */
 static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 				 bool wake, unsigned long flags)
@@ -3538,7 +3539,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
  * Report expedited quiescent state for specified node.  This is a
  * lock-acquisition wrapper function for __rcu_report_exp_rnp().
  *
- * Caller must hold the root rcu_node's exp_funnel_mutex.
+ * Caller must hold the rcu_state's exp_mutex.
  */
 static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
 					      struct rcu_node *rnp, bool wake)
@@ -3551,8 +3552,8 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
 
 /*
  * Report expedited quiescent state for multiple CPUs, all covered by the
- * specified leaf rcu_node structure.  Caller must hold the root
- * rcu_node's exp_funnel_mutex.
+ * specified leaf rcu_node structure.  Caller must hold the rcu_state's
+ * exp_mutex.
  */
 static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
 				    unsigned long mask, bool wake)
@@ -3570,7 +3571,6 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
 
 /*
  * Report expedited quiescent state for specified rcu_data (CPU).
- * Caller must hold the root rcu_node's exp_funnel_mutex.
  */
 static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
 			       bool wake)
@@ -3579,24 +3579,11 @@ static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp,
 }
 
 /* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
-static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
-			       struct rcu_data *rdp,
-			       atomic_long_t *stat, unsigned long s)
+static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat,
+			       unsigned long s)
 {
 	if (rcu_exp_gp_seq_done(rsp, s)) {
 		trace_rcu_exp_grace_period(rsp->name, s, TPS("done"));
-		if (rnp) {
-			trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
-						  rnp->grplo, rnp->grphi,
-						  TPS("rel"));
-			mutex_unlock(&rnp->exp_funnel_mutex);
-		} else if (rdp) {
-			trace_rcu_exp_funnel_lock(rsp->name,
-						  rdp->mynode->level + 1,
-						  rdp->cpu, rdp->cpu,
-						  TPS("rel"));
-			mutex_unlock(&rdp->exp_funnel_mutex);
-		}
 		/* Ensure test happens before caller kfree(). */
 		smp_mb__before_atomic(); /* ^^^ */
 		atomic_long_inc(stat);
@@ -3606,53 +3593,53 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
 }
 
 /*
- * Funnel-lock acquisition for expedited grace periods.  Returns a
- * pointer to the root rcu_node structure, or NULL if some other
- * task did the expedited grace period for us.
+ * Funnel-lock acquisition for expedited grace periods.  Returns true
+ * if some other task completed an expedited grace period that this task
+ * can piggy-back on, and with no mutex held.  Otherwise, returns false
+ * with the mutex held, indicating that the caller must actually do the
+ * expedited grace period.
  */
-static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
+static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 {
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
-	struct rcu_node *rnp0;
-	struct rcu_node *rnp1 = NULL;
+	struct rcu_node *rnp = rdp->mynode;
 
 	/*
-	 * Each pass through the following loop works its way
-	 * up the rcu_node tree, returning if others have done the
-	 * work or otherwise falls through holding the root rnp's
-	 * ->exp_funnel_mutex.  The mapping from CPU to rcu_node structure
-	 * can be inexact, as it is just promoting locality and is not
-	 * strictly needed for correctness.
+	 * Each pass through the following loop works its way up
+	 * the rcu_node tree, returning if others have done the work or
+	 * otherwise falls through to acquire rsp->exp_mutex.  The mapping
+	 * from CPU to rcu_node structure can be inexact, as it is just
+	 * promoting locality and is not strictly needed for correctness.
 	 */
-	if (sync_exp_work_done(rsp, NULL, NULL, &rdp->exp_workdone1, s))
-		return NULL;
-	mutex_lock(&rdp->exp_funnel_mutex);
-	trace_rcu_exp_funnel_lock(rsp->name, rdp->mynode->level + 1,
-				  rdp->cpu, rdp->cpu, TPS("acq"));
-	rnp0 = rdp->mynode;
-	for (; rnp0 != NULL; rnp0 = rnp0->parent) {
-		if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone2, s))
-			return NULL;
-		mutex_lock(&rnp0->exp_funnel_mutex);
-		trace_rcu_exp_funnel_lock(rsp->name, rnp0->level,
-					  rnp0->grplo, rnp0->grphi, TPS("acq"));
-		if (rnp1) {
-			trace_rcu_exp_funnel_lock(rsp->name, rnp1->level,
-						  rnp1->grplo, rnp1->grphi,
-						  TPS("rel"));
-			mutex_unlock(&rnp1->exp_funnel_mutex);
-		} else {
-			trace_rcu_exp_funnel_lock(rsp->name,
-						  rdp->mynode->level + 1,
-						  rdp->cpu, rdp->cpu,
-						  TPS("rel"));
-			mutex_unlock(&rdp->exp_funnel_mutex);
+	for (; rnp != NULL; rnp = rnp->parent) {
+		if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s))
+			return true;
+
+		/* Work not done, either wait here or go up. */
+		spin_lock(&rnp->exp_lock);
+		if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) {
+
+			/* Someone else doing GP, so wait for them. */
+			spin_unlock(&rnp->exp_lock);
+			trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
+						  rnp->grplo, rnp->grphi,
+						  TPS("wait"));
+			wait_event(rnp->exp_wq[(s >> 1) & 0x1],
+				   sync_exp_work_done(rsp,
+						      &rdp->exp_workdone2, s));
+			return true;
 		}
-		rnp1 = rnp0;
+		rnp->exp_seq_rq = s; /* Followers can wait on us. */
+		spin_unlock(&rnp->exp_lock);
+		trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo,
+					  rnp->grphi, TPS("nxtlvl"));
 	}
-	if (sync_exp_work_done(rsp, rnp1, rdp, &rdp->exp_workdone3, s))
-		return NULL;
-	return rnp1;
+	mutex_lock(&rsp->exp_mutex);
+	if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
+		mutex_unlock(&rsp->exp_mutex);
+		return true;
+	}
+	return false;
 }
 
 /* Invoked on each online non-idle CPU for expedited quiescent state. */
@@ -3841,6 +3828,27 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 	}
 }
 
+/*
+ * Wake up everyone who piggybacked on the just-completed expedited
+ * grace period.  Also update all the ->exp_seq_rq counters as needed
+ * in order to avoid counter-wrap problems.
+ */
+static void rcu_exp_wake(struct rcu_state *rsp, unsigned long s)
+{
+	struct rcu_node *rnp;
+
+	rcu_for_each_node_breadth_first(rsp, rnp) {
+		if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
+			spin_lock(&rnp->exp_lock);
+			/* Recheck, avoid hang in case someone just arrived. */
+			if (ULONG_CMP_LT(rnp->exp_seq_rq, s))
+				rnp->exp_seq_rq = s;
+			spin_unlock(&rnp->exp_lock);
+		}
+		wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x1]);
+	}
+}
+
 /**
  * synchronize_sched_expedited - Brute-force RCU-sched grace period
  *
@@ -3860,7 +3868,6 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 void synchronize_sched_expedited(void)
 {
 	unsigned long s;
-	struct rcu_node *rnp;
 	struct rcu_state *rsp = &rcu_sched_state;
 
 	/* If only one CPU, this is automatically a grace period. */
@@ -3877,20 +3884,23 @@ void synchronize_sched_expedited(void)
 	s = rcu_exp_gp_seq_snap(rsp);
 	trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
 
-	rnp = exp_funnel_lock(rsp, s);
-	if (rnp == NULL)
+	if (exp_funnel_lock(rsp, s))
 		return;  /* Someone else did our work for us. */
 
 	rcu_exp_gp_seq_start(rsp);
 	trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
+
+	/* Initialize the rcu_node tree in preparation for the wait. */
 	sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
-	synchronize_sched_expedited_wait(rsp);
 
+	/* Wait and clean up, including waking everyone. */
+	synchronize_sched_expedited_wait(rsp);
 	rcu_exp_gp_seq_end(rsp);
 	trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
-	trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
-				  rnp->grplo, rnp->grphi, TPS("rel"));
-	mutex_unlock(&rnp->exp_funnel_mutex);
+	rcu_exp_wake(rsp, s);
+
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
+	mutex_unlock(&rsp->exp_mutex);
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
@@ -4190,7 +4200,6 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 	WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
 	rdp->cpu = cpu;
 	rdp->rsp = rsp;
-	mutex_init(&rdp->exp_funnel_mutex);
 	rcu_boot_init_nocb_percpu_data(rdp);
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 }
@@ -4448,10 +4457,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 {
 	static const char * const buf[] = RCU_NODE_NAME_INIT;
 	static const char * const fqs[] = RCU_FQS_NAME_INIT;
-	static const char * const exp[] = RCU_EXP_NAME_INIT;
 	static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
 	static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
-	static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
 	static u8 fl_mask = 0x1;
 
 	int levelcnt[RCU_NUM_LVLS];		/* # nodes in each level. */
@@ -4510,9 +4517,9 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 			rnp->level = i;
 			INIT_LIST_HEAD(&rnp->blkd_tasks);
 			rcu_init_one_nocb(rnp);
-			mutex_init(&rnp->exp_funnel_mutex);
-			lockdep_set_class_and_name(&rnp->exp_funnel_mutex,
-						   &rcu_exp_class[i], exp[i]);
+			init_waitqueue_head(&rnp->exp_wq[0]);
+			init_waitqueue_head(&rnp->exp_wq[1]);
+			spin_lock_init(&rnp->exp_lock);
 		}
 	}
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 6a8f09446924..f9d4fbb1e014 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -70,7 +70,6 @@
 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0 }
 #  define RCU_NODE_NAME_INIT  { "rcu_node_0" }
 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0" }
-#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0" }
 #elif NR_CPUS <= RCU_FANOUT_2
 #  define RCU_NUM_LVLS	      2
 #  define NUM_RCU_LVL_0	      1
@@ -79,7 +78,6 @@
 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1" }
 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1" }
-#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1" }
 #elif NR_CPUS <= RCU_FANOUT_3
 #  define RCU_NUM_LVLS	      3
 #  define NUM_RCU_LVL_0	      1
@@ -89,7 +87,6 @@
 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
-#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
 #elif NR_CPUS <= RCU_FANOUT_4
 #  define RCU_NUM_LVLS	      4
 #  define NUM_RCU_LVL_0	      1
@@ -100,7 +97,6 @@
 #  define NUM_RCU_LVL_INIT    { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
 #  define RCU_NODE_NAME_INIT  { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
 #  define RCU_FQS_NAME_INIT   { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
-#  define RCU_EXP_NAME_INIT   { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
 #else
 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
 #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
@@ -252,7 +248,9 @@ struct rcu_node {
 				/* Counts of upcoming no-CB GP requests. */
 	raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
 
-	struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
+	spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
+	unsigned long exp_seq_rq;
+	wait_queue_head_t exp_wq[2];
 } ____cacheline_internodealigned_in_smp;
 
 /*
@@ -387,7 +385,6 @@ struct rcu_data {
 #ifdef CONFIG_RCU_FAST_NO_HZ
 	struct rcu_head oom_head;
 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
-	struct mutex exp_funnel_mutex;
 	atomic_long_t exp_workdone1;	/* # done by others #1. */
 	atomic_long_t exp_workdone2;	/* # done by others #2. */
 	atomic_long_t exp_workdone3;	/* # done by others #3. */
@@ -504,6 +501,7 @@ struct rcu_state {
 						/*  _rcu_barrier(). */
 	/* End of fields guarded by barrier_mutex. */
 
+	struct mutex exp_mutex;			/* Serialize expedited GP. */
 	unsigned long expedited_sequence;	/* Take a ticket. */
 	atomic_long_t expedited_normal;		/* # fallbacks to normal. */
 	atomic_t expedited_need_qs;		/* # CPUs left to check in. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 36e94aed38a7..c82c3640493f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -738,8 +738,6 @@ static void sync_rcu_exp_handler(void *info)
  */
 void synchronize_rcu_expedited(void)
 {
-	struct rcu_node *rnp;
-	struct rcu_node *rnp_unlock;
 	struct rcu_state *rsp = rcu_state_p;
 	unsigned long s;
 
@@ -752,8 +750,7 @@ void synchronize_rcu_expedited(void)
 	s = rcu_exp_gp_seq_snap(rsp);
 	trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
 
-	rnp_unlock = exp_funnel_lock(rsp, s);
-	if (rnp_unlock == NULL)
+	if (exp_funnel_lock(rsp, s))
 		return;  /* Someone else did our work for us. */
 
 	rcu_exp_gp_seq_start(rsp);
@@ -763,16 +760,13 @@ void synchronize_rcu_expedited(void)
 	sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
 
 	/* Wait for snapshotted ->blkd_tasks lists to drain. */
-	rnp = rcu_get_root(rsp);
 	synchronize_sched_expedited_wait(rsp);
-
-	/* Clean up and exit. */
 	rcu_exp_gp_seq_end(rsp);
 	trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
-	mutex_unlock(&rnp_unlock->exp_funnel_mutex);
-	trace_rcu_exp_funnel_lock(rsp->name, rnp_unlock->level,
-				  rnp_unlock->grplo, rnp_unlock->grphi,
-				  TPS("rel"));
+	rcu_exp_wake(rsp, s);
+
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
+	mutex_unlock(&rsp->exp_mutex);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 
-- 
cgit v1.2.3


From 356051e1de3cf65575da4ee92d1f5cee86677ee2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 16 Mar 2016 13:22:53 -0700
Subject: rcu: Add exp_funnel_lock() fastpath

This commit speeds up the low-contention case, especially for systems
with large rcu_node trees, by attempting to directly acquire the
->exp_mutex.  This fastpath checks the leaves and root first in
order to avoid excessive memory contention on the mutex itself.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index bd2658edce00..892a140ae7b6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3603,6 +3603,15 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 {
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
 	struct rcu_node *rnp = rdp->mynode;
+	struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+	/* Low-contention fastpath. */
+	if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) &&
+	    (rnp == rnp_root ||
+	     ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) &&
+	    !mutex_is_locked(&rsp->exp_mutex) &&
+	    mutex_trylock(&rsp->exp_mutex))
+		goto fastpath;
 
 	/*
 	 * Each pass through the following loop works its way up
@@ -3635,6 +3644,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 					  rnp->grphi, TPS("nxtlvl"));
 	}
 	mutex_lock(&rsp->exp_mutex);
+fastpath:
 	if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) {
 		mutex_unlock(&rsp->exp_mutex);
 		return true;
-- 
cgit v1.2.3


From 4ea3e85b113ab37a2d55cfabf0d709ddec088bb3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 16 Mar 2016 16:22:25 -0700
Subject: rcu: Consolidate expedited GP code into rcu_exp_wait_wake()

Currently, synchronize_rcu_expedited() and rcu_sched_expedited() have
significant duplicate code.  This commit therefore consolidates some of
this code into rcu_exp_wake(), which is now renamed to rcu_exp_wait_wake()
in recognition of its added responsibilities.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        | 18 +++++++++---------
 kernel/rcu/tree_plugin.h | 10 ++--------
 2 files changed, 11 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 892a140ae7b6..fd86eca9478e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3839,14 +3839,18 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 }
 
 /*
- * Wake up everyone who piggybacked on the just-completed expedited
+ * Wait for the current expedited grace period to complete, and then
+ * wake up everyone who piggybacked on the just-completed expedited
  * grace period.  Also update all the ->exp_seq_rq counters as needed
  * in order to avoid counter-wrap problems.
  */
-static void rcu_exp_wake(struct rcu_state *rsp, unsigned long s)
+static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
 {
 	struct rcu_node *rnp;
 
+	synchronize_sched_expedited_wait(rsp);
+	rcu_exp_gp_seq_end(rsp);
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
 	rcu_for_each_node_breadth_first(rsp, rnp) {
 		if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
 			spin_lock(&rnp->exp_lock);
@@ -3857,6 +3861,8 @@ static void rcu_exp_wake(struct rcu_state *rsp, unsigned long s)
 		}
 		wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x1]);
 	}
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
+	mutex_unlock(&rsp->exp_mutex);
 }
 
 /**
@@ -3904,13 +3910,7 @@ void synchronize_sched_expedited(void)
 	sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
 
 	/* Wait and clean up, including waking everyone. */
-	synchronize_sched_expedited_wait(rsp);
-	rcu_exp_gp_seq_end(rsp);
-	trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
-	rcu_exp_wake(rsp, s);
-
-	trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
-	mutex_unlock(&rsp->exp_mutex);
+	rcu_exp_wait_wake(rsp, s);
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c82c3640493f..b6d5dde6eab9 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -759,14 +759,8 @@ void synchronize_rcu_expedited(void)
 	/* Initialize the rcu_node tree in preparation for the wait. */
 	sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
 
-	/* Wait for snapshotted ->blkd_tasks lists to drain. */
-	synchronize_sched_expedited_wait(rsp);
-	rcu_exp_gp_seq_end(rsp);
-	trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
-	rcu_exp_wake(rsp, s);
-
-	trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
-	mutex_unlock(&rsp->exp_mutex);
+	/* Wait for ->blkd_tasks lists to drain, then wake everyone up. */
+	rcu_exp_wait_wake(rsp, s);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 
-- 
cgit v1.2.3


From 179e5dcd1e5bdfac1128431d131b31322aedd2bc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 16 Mar 2016 16:27:44 -0700
Subject: rcu: Consolidate expedited GP tracing into rcu_exp_gp_seq_snap()

This commit moves some duplicate code from synchronize_rcu_expedited()
and synchronize_sched_expedited() into rcu_exp_gp_seq_snap().  This
doesn't save lines of code, but does eliminate a "tell me twice" issue.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        | 8 +++++---
 kernel/rcu/tree_plugin.h | 2 --
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index fd86eca9478e..5b1c8fd89af0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3392,8 +3392,12 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
 }
 static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
 {
+	unsigned long s;
+
 	smp_mb(); /* Caller's modifications seen first by other CPUs. */
-	return rcu_seq_snap(&rsp->expedited_sequence);
+	s = rcu_seq_snap(&rsp->expedited_sequence);
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
+	return s;
 }
 static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
 {
@@ -3898,8 +3902,6 @@ void synchronize_sched_expedited(void)
 
 	/* Take a snapshot of the sequence number.  */
 	s = rcu_exp_gp_seq_snap(rsp);
-	trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
-
 	if (exp_funnel_lock(rsp, s))
 		return;  /* Someone else did our work for us. */
 
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index b6d5dde6eab9..529a44085a63 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -748,8 +748,6 @@ void synchronize_rcu_expedited(void)
 	}
 
 	s = rcu_exp_gp_seq_snap(rsp);
-	trace_rcu_exp_grace_period(rsp->name, s, TPS("snap"));
-
 	if (exp_funnel_lock(rsp, s))
 		return;  /* Someone else did our work for us. */
 
-- 
cgit v1.2.3


From aff12cdf86e6fa891d1c30c0fad112d138bd7b10 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 16 Mar 2016 16:32:24 -0700
Subject: rcu: Consolidate expedited GP code into exp_funnel_lock()

This commit pulls the grace-period-start counter adjustment and tracing
from synchronize_rcu_expedited() and synchronize_sched_expedited()
into exp_funnel_lock(), thus eliminating some code duplication.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        | 5 ++---
 kernel/rcu/tree_plugin.h | 3 ---
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5b1c8fd89af0..e8fff14e417b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3653,6 +3653,8 @@ fastpath:
 		mutex_unlock(&rsp->exp_mutex);
 		return true;
 	}
+	rcu_exp_gp_seq_start(rsp);
+	trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
 	return false;
 }
 
@@ -3905,9 +3907,6 @@ void synchronize_sched_expedited(void)
 	if (exp_funnel_lock(rsp, s))
 		return;  /* Someone else did our work for us. */
 
-	rcu_exp_gp_seq_start(rsp);
-	trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
-
 	/* Initialize the rcu_node tree in preparation for the wait. */
 	sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler);
 
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 529a44085a63..ff1cd4e1188d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -751,9 +751,6 @@ void synchronize_rcu_expedited(void)
 	if (exp_funnel_lock(rsp, s))
 		return;  /* Someone else did our work for us. */
 
-	rcu_exp_gp_seq_start(rsp);
-	trace_rcu_exp_grace_period(rsp->name, s, TPS("start"));
-
 	/* Initialize the rcu_node tree in preparation for the wait. */
 	sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler);
 
-- 
cgit v1.2.3


From 3b5f668e715bc19610ad967ef97a7e8c55a186ec Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 16 Mar 2016 16:47:55 -0700
Subject: rcu: Overlap wakeups with next expedited grace period

The current expedited grace-period implementation makes subsequent grace
periods wait on wakeups for the prior grace period.  This does not fit
the dictionary definition of "expedited", so this commit allows these two
phases to overlap.  Doing this requires four waitqueues rather than two
because tasks can now be waiting on the previous, current, and next grace
periods.  The fourth waitqueue makes the bit masking work out nicely.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 17 ++++++++++++++---
 kernel/rcu/tree.h |  3 ++-
 2 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e8fff14e417b..1df100cb7a62 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -103,6 +103,7 @@ struct rcu_state sname##_state = { \
 	.name = RCU_STATE_NAME(sname), \
 	.abbr = sabbr, \
 	.exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \
+	.exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \
 }
 
 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
@@ -3637,7 +3638,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 			trace_rcu_exp_funnel_lock(rsp->name, rnp->level,
 						  rnp->grplo, rnp->grphi,
 						  TPS("wait"));
-			wait_event(rnp->exp_wq[(s >> 1) & 0x1],
+			wait_event(rnp->exp_wq[(s >> 1) & 0x3],
 				   sync_exp_work_done(rsp,
 						      &rdp->exp_workdone2, s));
 			return true;
@@ -3857,6 +3858,14 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
 	synchronize_sched_expedited_wait(rsp);
 	rcu_exp_gp_seq_end(rsp);
 	trace_rcu_exp_grace_period(rsp->name, s, TPS("end"));
+
+	/*
+	 * Switch over to wakeup mode, allowing the next GP, but -only- the
+	 * next GP, to proceed.
+	 */
+	mutex_lock(&rsp->exp_wake_mutex);
+	mutex_unlock(&rsp->exp_mutex);
+
 	rcu_for_each_node_breadth_first(rsp, rnp) {
 		if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) {
 			spin_lock(&rnp->exp_lock);
@@ -3865,10 +3874,10 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s)
 				rnp->exp_seq_rq = s;
 			spin_unlock(&rnp->exp_lock);
 		}
-		wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x1]);
+		wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]);
 	}
 	trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake"));
-	mutex_unlock(&rsp->exp_mutex);
+	mutex_unlock(&rsp->exp_wake_mutex);
 }
 
 /**
@@ -4530,6 +4539,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 			rcu_init_one_nocb(rnp);
 			init_waitqueue_head(&rnp->exp_wq[0]);
 			init_waitqueue_head(&rnp->exp_wq[1]);
+			init_waitqueue_head(&rnp->exp_wq[2]);
+			init_waitqueue_head(&rnp->exp_wq[3]);
 			spin_lock_init(&rnp->exp_lock);
 		}
 	}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index f9d4fbb1e014..1194ab0da56a 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -250,7 +250,7 @@ struct rcu_node {
 
 	spinlock_t exp_lock ____cacheline_internodealigned_in_smp;
 	unsigned long exp_seq_rq;
-	wait_queue_head_t exp_wq[2];
+	wait_queue_head_t exp_wq[4];
 } ____cacheline_internodealigned_in_smp;
 
 /*
@@ -502,6 +502,7 @@ struct rcu_state {
 	/* End of fields guarded by barrier_mutex. */
 
 	struct mutex exp_mutex;			/* Serialize expedited GP. */
+	struct mutex exp_wake_mutex;		/* Serialize wakeup. */
 	unsigned long expedited_sequence;	/* Take a ticket. */
 	atomic_long_t expedited_normal;		/* # fallbacks to normal. */
 	atomic_t expedited_need_qs;		/* # CPUs left to check in. */
-- 
cgit v1.2.3


From 86057b80ae31d37fcbdb5f57d15aaf1148c69f96 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 31 Dec 2015 08:48:36 -0800
Subject: rcu: Awaken grace-period kthread when stalled

Recent kernels can fail to awaken the grace-period kthread for
quiescent-state forcing.  This commit is a crude hack that does
a wakeup any time a stall is detected.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 531a328076bd..a327a253c178 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1224,8 +1224,10 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
 		       rsp->gp_flags,
 		       gp_state_getname(rsp->gp_state), rsp->gp_state,
 		       rsp->gp_kthread ? rsp->gp_kthread->state : ~0);
-		if (rsp->gp_kthread)
+		if (rsp->gp_kthread) {
 			sched_show_task(rsp->gp_kthread);
+			wake_up_process(rsp->gp_kthread);
+		}
 	}
 }
 
-- 
cgit v1.2.3


From fcfd0a237bfcf0c314005007e9d76e55a25e2bad Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 3 Jan 2016 16:42:18 -0800
Subject: rcu: Make FQS schedule advance only if FQS happened

Currently, the force-quiescent-state (FQS) code in rcu_gp_kthread() can
advance the next FQS even if one was not executed last time.  This can
happen due timeout-duration uncertainty.  This commit therefore avoids
advancing the FQS schedule unless an FQS was just executed.  In the
corner case where an FQS was not executed, but is due now, the code does
a one-jiffy wait.

This change prepares for kthread kicking.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a327a253c178..6116cfad18ff 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2146,6 +2146,15 @@ static int __noreturn rcu_gp_kthread(void *arg)
 						       TPS("fqsend"));
 				cond_resched_rcu_qs();
 				WRITE_ONCE(rsp->gp_activity, jiffies);
+				ret = 0; /* Force full wait till next FQS. */
+				j = jiffies_till_next_fqs;
+				if (j > HZ) {
+					j = HZ;
+					jiffies_till_next_fqs = HZ;
+				} else if (j < 1) {
+					j = 1;
+					jiffies_till_next_fqs = 1;
+				}
 			} else {
 				/* Deal with stray signal. */
 				cond_resched_rcu_qs();
@@ -2154,14 +2163,12 @@ static int __noreturn rcu_gp_kthread(void *arg)
 				trace_rcu_grace_period(rsp->name,
 						       READ_ONCE(rsp->gpnum),
 						       TPS("fqswaitsig"));
-			}
-			j = jiffies_till_next_fqs;
-			if (j > HZ) {
-				j = HZ;
-				jiffies_till_next_fqs = HZ;
-			} else if (j < 1) {
-				j = 1;
-				jiffies_till_next_fqs = 1;
+				ret = 1; /* Keep old FQS timing. */
+				j = jiffies;
+				if (time_after(jiffies, rsp->jiffies_force_qs))
+					j = 1;
+				else
+					j = rsp->jiffies_force_qs - j;
 			}
 		}
 
-- 
cgit v1.2.3


From 8c7c4829a81c1838f18c12ce5a3a5c29a08bf0a8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 3 Jan 2016 20:29:57 -0800
Subject: rcu: Awaken grace-period kthread if too long since FQS

Recent kernels can fail to awaken the grace-period kthread for
quiescent-state forcing.  This commit is a crude hack that does
a wakeup if a scheduling-clock interrupt sees that it has been
too long since force-quiescent-state (FQS) processing.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 39 +++++++++++++++++++++++++++++++++++++--
 kernel/rcu/tree.h |  2 ++
 2 files changed, 39 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6116cfad18ff..a739292be605 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -385,9 +385,11 @@ module_param(qlowmark, long, 0444);
 
 static ulong jiffies_till_first_fqs = ULONG_MAX;
 static ulong jiffies_till_next_fqs = ULONG_MAX;
+static bool rcu_kick_kthreads;
 
 module_param(jiffies_till_first_fqs, ulong, 0644);
 module_param(jiffies_till_next_fqs, ulong, 0644);
+module_param(rcu_kick_kthreads, bool, 0644);
 
 /*
  * How long the grace period must be before we start recruiting
@@ -1251,6 +1253,24 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
 	}
 }
 
+/*
+ * If too much time has passed in the current grace period, and if
+ * so configured, go kick the relevant kthreads.
+ */
+static void rcu_stall_kick_kthreads(struct rcu_state *rsp)
+{
+	unsigned long j;
+
+	if (!rcu_kick_kthreads)
+		return;
+	j = READ_ONCE(rsp->jiffies_kick_kthreads);
+	if (time_after(jiffies, j) && rsp->gp_kthread) {
+		WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name);
+		wake_up_process(rsp->gp_kthread);
+		WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ);
+	}
+}
+
 static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 {
 	int cpu;
@@ -1262,6 +1282,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 	struct rcu_node *rnp = rcu_get_root(rsp);
 	long totqlen = 0;
 
+	/* Kick and suppress, if so configured. */
+	rcu_stall_kick_kthreads(rsp);
+	if (rcu_cpu_stall_suppress)
+		return;
+
 	/* Only let one CPU complain about others per time interval. */
 
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -1335,6 +1360,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
 	struct rcu_node *rnp = rcu_get_root(rsp);
 	long totqlen = 0;
 
+	/* Kick and suppress, if so configured. */
+	rcu_stall_kick_kthreads(rsp);
+	if (rcu_cpu_stall_suppress)
+		return;
+
 	/*
 	 * OK, time to rat on ourselves...
 	 * See Documentation/RCU/stallwarn.txt for info on how to debug
@@ -1379,8 +1409,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 	unsigned long js;
 	struct rcu_node *rnp;
 
-	if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
+	if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
+	    !rcu_gp_in_progress(rsp))
 		return;
+	rcu_stall_kick_kthreads(rsp);
 	j = jiffies;
 
 	/*
@@ -2119,8 +2151,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
 		}
 		ret = 0;
 		for (;;) {
-			if (!ret)
+			if (!ret) {
 				rsp->jiffies_force_qs = jiffies + j;
+				WRITE_ONCE(rsp->jiffies_kick_kthreads,
+					   jiffies + 3 * j);
+			}
 			trace_rcu_grace_period(rsp->name,
 					       READ_ONCE(rsp->gpnum),
 					       TPS("fqswait"));
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index df668c0f9e64..34d3973f7223 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -513,6 +513,8 @@ struct rcu_state {
 
 	unsigned long jiffies_force_qs;		/* Time at which to invoke */
 						/*  force_quiescent_state(). */
+	unsigned long jiffies_kick_kthreads;	/* Time at which to kick */
+						/*  kthreads, if configured. */
 	unsigned long n_force_qs;		/* Number of calls to */
 						/*  force_quiescent_state(). */
 	unsigned long n_force_qs_lh;		/* ~Number of calls leaving */
-- 
cgit v1.2.3


From 293e2421fe25839500207eda123cc4475f8d17b8 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Wed, 23 Mar 2016 23:11:48 +0800
Subject: rcu: Remove superfluous versions of rcu_read_lock_sched_held()

Currently, we have four versions of rcu_read_lock_sched_held(), depending
on the combined choices on PREEMPT_COUNT and DEBUG_LOCK_ALLOC.  However,
there is an existing function preemptible() that already distinguishes
between the PREEMPT_COUNT=y and PREEMPT_COUNT=n cases, and allows these
four implementations to be consolidated down to two.

This commit therefore uses preemptible() to achieve this consolidation.

Note that there could be a small performance regression in the case
of CONFIG_DEBUG_LOCK_ALLOC=y && PREEMPT_COUNT=n.  However, given the
overhead associated with CONFIG_DEBUG_LOCK_ALLOC=y, this should be
down in the noise.

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 17 +----------------
 kernel/rcu/update.c      |  4 ++--
 2 files changed, 3 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 45de591657a6..5f1533e3d032 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -508,14 +508,7 @@ int rcu_read_lock_bh_held(void);
  * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
  * critical section unless it can prove otherwise.
  */
-#ifdef CONFIG_PREEMPT_COUNT
 int rcu_read_lock_sched_held(void);
-#else /* #ifdef CONFIG_PREEMPT_COUNT */
-static inline int rcu_read_lock_sched_held(void)
-{
-	return 1;
-}
-#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */
 
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
@@ -532,18 +525,10 @@ static inline int rcu_read_lock_bh_held(void)
 	return 1;
 }
 
-#ifdef CONFIG_PREEMPT_COUNT
 static inline int rcu_read_lock_sched_held(void)
 {
-	return preempt_count() != 0 || irqs_disabled();
+	return !preemptible();
 }
-#else /* #ifdef CONFIG_PREEMPT_COUNT */
-static inline int rcu_read_lock_sched_held(void)
-{
-	return 1;
-}
-#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */
-
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 #ifdef CONFIG_PROVE_RCU
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index ca828b41c938..3ccdc8eebc5a 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -67,7 +67,7 @@ static int rcu_normal_after_boot;
 module_param(rcu_normal_after_boot, int, 0);
 #endif /* #ifndef CONFIG_TINY_RCU */
 
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT)
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
 /**
  * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
  *
@@ -111,7 +111,7 @@ int rcu_read_lock_sched_held(void)
 		return 0;
 	if (debug_locks)
 		lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
-	return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
+	return lockdep_opinion || !preemptible();
 }
 EXPORT_SYMBOL(rcu_read_lock_sched_held);
 #endif
-- 
cgit v1.2.3


From 5dffed1e5721f6deae4fd67d32386ef037c5fc56 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 17 Feb 2016 11:54:28 -0800
Subject: rcu: Dump ftrace buffer when kicking grace-period kthread

If it is necessary to kick the grace-period kthread, that is a good
time to dump the trace buffer in order to learn why kicking was needed.
This commit therefore does the dump.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a739292be605..86edb92276d3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1266,6 +1266,7 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp)
 	j = READ_ONCE(rsp->jiffies_kick_kthreads);
 	if (time_after(jiffies, j) && rsp->gp_kthread) {
 		WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name);
+		rcu_ftrace_dump(DUMP_ALL);
 		wake_up_process(rsp->gp_kthread);
 		WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ);
 	}
-- 
cgit v1.2.3


From 9efafb8849f732a3497f46f178b350c9ff7cfe27 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 31 Dec 2015 18:11:47 -0800
Subject: rcutorture: Allow for rcupdate.rcu_normal

Currently, rcu_torture_writer() checks only for rcu_gp_is_expedited()
when deciding whether or not to do dynamic control of RCU expediting.
This means that if rcupdate.rcu_normal is specified, rcu_torture_writer()
will attempt to dynamically control RCU expediting, but will nonetheless
only test normal RCU grace periods.  This commit therefore adds a check
for !rcu_gp_is_normal(), and prints a message and desists from testing
dynamic control of RCU expediting when doing so is futile.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 463867c43221..9234e75b106a 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -916,7 +916,7 @@ rcu_torture_fqs(void *arg)
 static int
 rcu_torture_writer(void *arg)
 {
-	bool can_expedite = !rcu_gp_is_expedited();
+	bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal();
 	int expediting = 0;
 	unsigned long gp_snap;
 	bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
@@ -932,7 +932,7 @@ rcu_torture_writer(void *arg)
 	VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
 	if (!can_expedite) {
 		pr_alert("%s" TORTURE_FLAG
-			 " Grace periods expedited from boot/sysfs for %s,\n",
+			 " GP expediting controlled from boot/sysfs for %s,\n",
 			 torture_type, cur_ops->name);
 		pr_alert("%s" TORTURE_FLAG
 			 " Disabled dynamic grace-period expediting.\n",
-- 
cgit v1.2.3


From 291783b8ad77a83a6fdf91d55eee7f1ad72ed4d1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 12 Jan 2016 13:43:30 -0800
Subject: rcutorture: Expedited-GP batch progress access to torturing

This commit provides rcu_exp_batches_completed() and
rcu_exp_batches_completed_sched() functions to allow torture-test modules
to check how many expedited grace period batches have completed.
These are analogous to the existing rcu_batches_completed(),
rcu_batches_completed_bh(), and rcu_batches_completed_sched() functions.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcutiny.h | 16 ++++++++++++++++
 include/linux/rcutree.h |  2 ++
 kernel/rcu/tree.c       | 22 ++++++++++++++++++++++
 3 files changed, 40 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 64809aea661c..93aea75029fb 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -149,6 +149,22 @@ static inline unsigned long rcu_batches_completed_sched(void)
 	return 0;
 }
 
+/*
+ * Return the number of expedited grace periods completed.
+ */
+static inline unsigned long rcu_exp_batches_completed(void)
+{
+	return 0;
+}
+
+/*
+ * Return the number of expedited sched grace periods completed.
+ */
+static inline unsigned long rcu_exp_batches_completed_sched(void)
+{
+	return 0;
+}
+
 static inline void rcu_force_quiescent_state(void)
 {
 }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index ad1eda9fa4da..5043cb823fb2 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -87,6 +87,8 @@ unsigned long rcu_batches_started_sched(void);
 unsigned long rcu_batches_completed(void);
 unsigned long rcu_batches_completed_bh(void);
 unsigned long rcu_batches_completed_sched(void);
+unsigned long rcu_exp_batches_completed(void);
+unsigned long rcu_exp_batches_completed_sched(void);
 void show_rcu_gp_kthreads(void);
 
 void rcu_force_quiescent_state(void);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 531a328076bd..88df64087dfe 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -459,6 +459,28 @@ unsigned long rcu_batches_completed_bh(void)
 }
 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 
+/*
+ * Return the number of RCU expedited batches completed thus far for
+ * debug & stats.  Odd numbers mean that a batch is in progress, even
+ * numbers mean idle.  The value returned will thus be roughly double
+ * the cumulative batches since boot.
+ */
+unsigned long rcu_exp_batches_completed(void)
+{
+	return rcu_state_p->expedited_sequence;
+}
+EXPORT_SYMBOL_GPL(rcu_exp_batches_completed);
+
+/*
+ * Return the number of RCU-sched expedited batches completed thus far
+ * for debug & stats.  Similar to rcu_exp_batches_completed().
+ */
+unsigned long rcu_exp_batches_completed_sched(void)
+{
+	return rcu_sched_state.expedited_sequence;
+}
+EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched);
+
 /*
  * Force a quiescent state.
  */
-- 
cgit v1.2.3


From 8704baab9bc848b58c129fed6b591bb84ec02f41 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 31 Dec 2015 18:33:22 -0800
Subject: rcutorture: Add RCU grace-period performance tests

This commit adds a new rcuperf module that carries out simple performance
tests of RCU grace periods.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/Makefile  |   1 +
 kernel/rcu/rcuperf.c | 637 +++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug    |  33 +++
 3 files changed, 671 insertions(+)
 create mode 100644 kernel/rcu/rcuperf.c

(limited to 'kernel')

diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 032b2c015beb..18dfc485225c 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -5,6 +5,7 @@ KCOV_INSTRUMENT := n
 obj-y += update.o sync.o
 obj-$(CONFIG_SRCU) += srcu.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
 obj-$(CONFIG_TREE_RCU) += tree.o
 obj-$(CONFIG_PREEMPT_RCU) += tree.o
 obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
new file mode 100644
index 000000000000..9d54a57bee7d
--- /dev/null
+++ b/kernel/rcu/rcuperf.c
@@ -0,0 +1,637 @@
+/*
+ * Read-Copy Update module-based performance-test facility
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2015
+ *
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+#include <linux/vmalloc.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
+
+#define PERF_FLAG "-perf:"
+#define PERFOUT_STRING(s) \
+	pr_alert("%s" PERF_FLAG s "\n", perf_type)
+#define VERBOSE_PERFOUT_STRING(s) \
+	do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0)
+#define VERBOSE_PERFOUT_ERRSTRING(s) \
+	do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
+
+torture_param(bool, gp_exp, true, "Use expedited GP wait primitives");
+torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, nwriters, -1, "Number of RCU updater threads");
+torture_param(bool, shutdown, false, "Shutdown at end of performance tests.");
+torture_param(bool, verbose, true, "Enable verbose debugging printk()s");
+
+static char *perf_type = "rcu";
+module_param(perf_type, charp, 0444);
+MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, ...)");
+
+static int nrealreaders;
+static int nrealwriters;
+static struct task_struct **writer_tasks;
+static struct task_struct **reader_tasks;
+static struct task_struct *shutdown_task;
+
+static u64 **writer_durations;
+static int *writer_n_durations;
+static atomic_t n_rcu_perf_reader_started;
+static atomic_t n_rcu_perf_writer_started;
+static atomic_t n_rcu_perf_writer_finished;
+static wait_queue_head_t shutdown_wq;
+static u64 t_rcu_perf_writer_started;
+static u64 t_rcu_perf_writer_finished;
+static unsigned long b_rcu_perf_writer_started;
+static unsigned long b_rcu_perf_writer_finished;
+
+static int rcu_perf_writer_state;
+#define RTWS_INIT		0
+#define RTWS_EXP_SYNC		1
+#define RTWS_SYNC		2
+#define RTWS_IDLE		2
+#define RTWS_STOPPING		3
+
+#define MAX_MEAS 10000
+#define MIN_MEAS 100
+
+#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE)
+#define RCUPERF_RUNNABLE_INIT 1
+#else
+#define RCUPERF_RUNNABLE_INIT 0
+#endif
+static int perf_runnable = RCUPERF_RUNNABLE_INIT;
+module_param(perf_runnable, int, 0444);
+MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot");
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+
+struct rcu_perf_ops {
+	int ptype;
+	void (*init)(void);
+	void (*cleanup)(void);
+	int (*readlock)(void);
+	void (*readunlock)(int idx);
+	unsigned long (*started)(void);
+	unsigned long (*completed)(void);
+	unsigned long (*exp_completed)(void);
+	void (*sync)(void);
+	void (*exp_sync)(void);
+	const char *name;
+};
+
+static struct rcu_perf_ops *cur_ops;
+
+/*
+ * Definitions for rcu perf testing.
+ */
+
+static int rcu_perf_read_lock(void) __acquires(RCU)
+{
+	rcu_read_lock();
+	return 0;
+}
+
+static void rcu_perf_read_unlock(int idx) __releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+static unsigned long __maybe_unused rcu_no_completed(void)
+{
+	return 0;
+}
+
+static void rcu_sync_perf_init(void)
+{
+}
+
+static struct rcu_perf_ops rcu_ops = {
+	.ptype		= RCU_FLAVOR,
+	.init		= rcu_sync_perf_init,
+	.readlock	= rcu_perf_read_lock,
+	.readunlock	= rcu_perf_read_unlock,
+	.started	= rcu_batches_started,
+	.completed	= rcu_batches_completed,
+	.exp_completed	= rcu_exp_batches_completed,
+	.sync		= synchronize_rcu,
+	.exp_sync	= synchronize_rcu_expedited,
+	.name		= "rcu"
+};
+
+/*
+ * Definitions for rcu_bh perf testing.
+ */
+
+static int rcu_bh_perf_read_lock(void) __acquires(RCU_BH)
+{
+	rcu_read_lock_bh();
+	return 0;
+}
+
+static void rcu_bh_perf_read_unlock(int idx) __releases(RCU_BH)
+{
+	rcu_read_unlock_bh();
+}
+
+static struct rcu_perf_ops rcu_bh_ops = {
+	.ptype		= RCU_BH_FLAVOR,
+	.init		= rcu_sync_perf_init,
+	.readlock	= rcu_bh_perf_read_lock,
+	.readunlock	= rcu_bh_perf_read_unlock,
+	.started	= rcu_batches_started_bh,
+	.completed	= rcu_batches_completed_bh,
+	.exp_completed	= rcu_exp_batches_completed_sched,
+	.sync		= synchronize_rcu_bh,
+	.exp_sync	= synchronize_rcu_bh_expedited,
+	.name		= "rcu_bh"
+};
+
+/*
+ * Definitions for srcu perf testing.
+ */
+
+DEFINE_STATIC_SRCU(srcu_ctl_perf);
+static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf;
+
+static int srcu_perf_read_lock(void) __acquires(srcu_ctlp)
+{
+	return srcu_read_lock(srcu_ctlp);
+}
+
+static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp)
+{
+	srcu_read_unlock(srcu_ctlp, idx);
+}
+
+static unsigned long srcu_perf_completed(void)
+{
+	return srcu_batches_completed(srcu_ctlp);
+}
+
+static void srcu_perf_synchronize(void)
+{
+	synchronize_srcu(srcu_ctlp);
+}
+
+static void srcu_perf_synchronize_expedited(void)
+{
+	synchronize_srcu_expedited(srcu_ctlp);
+}
+
+static struct rcu_perf_ops srcu_ops = {
+	.ptype		= SRCU_FLAVOR,
+	.init		= rcu_sync_perf_init,
+	.readlock	= srcu_perf_read_lock,
+	.readunlock	= srcu_perf_read_unlock,
+	.started	= NULL,
+	.completed	= srcu_perf_completed,
+	.exp_completed	= srcu_perf_completed,
+	.sync		= srcu_perf_synchronize,
+	.exp_sync	= srcu_perf_synchronize_expedited,
+	.name		= "srcu"
+};
+
+/*
+ * Definitions for sched perf testing.
+ */
+
+static int sched_perf_read_lock(void)
+{
+	preempt_disable();
+	return 0;
+}
+
+static void sched_perf_read_unlock(int idx)
+{
+	preempt_enable();
+}
+
+static struct rcu_perf_ops sched_ops = {
+	.ptype		= RCU_SCHED_FLAVOR,
+	.init		= rcu_sync_perf_init,
+	.readlock	= sched_perf_read_lock,
+	.readunlock	= sched_perf_read_unlock,
+	.started	= rcu_batches_started_sched,
+	.completed	= rcu_batches_completed_sched,
+	.exp_completed	= rcu_exp_batches_completed_sched,
+	.sync		= synchronize_sched,
+	.exp_sync	= synchronize_sched_expedited,
+	.name		= "sched"
+};
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Definitions for RCU-tasks perf testing.
+ */
+
+static int tasks_perf_read_lock(void)
+{
+	return 0;
+}
+
+static void tasks_perf_read_unlock(int idx)
+{
+}
+
+static struct rcu_perf_ops tasks_ops = {
+	.ptype		= RCU_TASKS_FLAVOR,
+	.init		= rcu_sync_perf_init,
+	.readlock	= tasks_perf_read_lock,
+	.readunlock	= tasks_perf_read_unlock,
+	.started	= rcu_no_completed,
+	.completed	= rcu_no_completed,
+	.sync		= synchronize_rcu_tasks,
+	.exp_sync	= synchronize_rcu_tasks,
+	.name		= "tasks"
+};
+
+#define RCUPERF_TASKS_OPS &tasks_ops,
+
+static bool __maybe_unused torturing_tasks(void)
+{
+	return cur_ops == &tasks_ops;
+}
+
+#else /* #ifdef CONFIG_TASKS_RCU */
+
+#define RCUPERF_TASKS_OPS
+
+static bool __maybe_unused torturing_tasks(void)
+{
+	return false;
+}
+
+#endif /* #else #ifdef CONFIG_TASKS_RCU */
+
+/*
+ * If performance tests complete, wait for shutdown to commence.
+ */
+static void rcu_perf_wait_shutdown(void)
+{
+	cond_resched_rcu_qs();
+	if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters)
+		return;
+	while (!torture_must_stop())
+		schedule_timeout_uninterruptible(1);
+}
+
+/*
+ * RCU perf reader kthread.  Repeatedly does empty RCU read-side
+ * critical section, minimizing update-side interference.
+ */
+static int
+rcu_perf_reader(void *arg)
+{
+	unsigned long flags;
+	int idx;
+
+	VERBOSE_PERFOUT_STRING("rcu_perf_reader task started");
+	set_user_nice(current, MAX_NICE);
+	atomic_inc(&n_rcu_perf_reader_started);
+
+	do {
+		local_irq_save(flags);
+		idx = cur_ops->readlock();
+		cur_ops->readunlock(idx);
+		local_irq_restore(flags);
+		rcu_perf_wait_shutdown();
+	} while (!torture_must_stop());
+	torture_kthread_stopping("rcu_perf_reader");
+	return 0;
+}
+
+/*
+ * RCU perf writer kthread.  Repeatedly does a grace period.
+ */
+static int
+rcu_perf_writer(void *arg)
+{
+	int i = 0;
+	int i_max;
+	long me = (long)arg;
+	bool started = false, done = false, alldone = false;
+	u64 t;
+	u64 *wdp;
+	u64 *wdpp = writer_durations[me];
+
+	VERBOSE_PERFOUT_STRING("rcu_perf_writer task started");
+	WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp);
+	WARN_ON(rcu_gp_is_normal() && gp_exp);
+	WARN_ON(!wdpp);
+	t = ktime_get_mono_fast_ns();
+	if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
+		t_rcu_perf_writer_started = t;
+		if (gp_exp) {
+			b_rcu_perf_writer_started =
+				cur_ops->exp_completed() / 2;
+		} else {
+			b_rcu_perf_writer_started =
+				cur_ops->completed();
+		}
+	}
+
+	do {
+		wdp = &wdpp[i];
+		*wdp = ktime_get_mono_fast_ns();
+		if (gp_exp) {
+			rcu_perf_writer_state = RTWS_EXP_SYNC;
+			cur_ops->exp_sync();
+		} else {
+			rcu_perf_writer_state = RTWS_SYNC;
+			cur_ops->sync();
+		}
+		rcu_perf_writer_state = RTWS_IDLE;
+		t = ktime_get_mono_fast_ns();
+		*wdp = t - *wdp;
+		i_max = i;
+		if (!started &&
+		    atomic_read(&n_rcu_perf_writer_started) >= nrealwriters)
+			started = true;
+		if (!done && i >= MIN_MEAS) {
+			done = true;
+			pr_alert("%s" PERF_FLAG
+				 "rcu_perf_writer %ld has %d measurements\n",
+				 perf_type, me, MIN_MEAS);
+			if (atomic_inc_return(&n_rcu_perf_writer_finished) >=
+			    nrealwriters) {
+				PERFOUT_STRING("Test complete");
+				t_rcu_perf_writer_finished = t;
+				if (gp_exp) {
+					b_rcu_perf_writer_finished =
+						cur_ops->exp_completed() / 2;
+				} else {
+					b_rcu_perf_writer_finished =
+						cur_ops->completed();
+				}
+				smp_mb(); /* Assign before wake. */
+				wake_up(&shutdown_wq);
+			}
+		}
+		if (done && !alldone &&
+		    atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters)
+			alldone = true;
+		if (started && !alldone && i < MAX_MEAS - 1)
+			i++;
+		rcu_perf_wait_shutdown();
+	} while (!torture_must_stop());
+	rcu_perf_writer_state = RTWS_STOPPING;
+	writer_n_durations[me] = i_max;
+	torture_kthread_stopping("rcu_perf_writer");
+	return 0;
+}
+
+static inline void
+rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag)
+{
+	pr_alert("%s" PERF_FLAG
+		 "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n",
+		 perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown);
+}
+
+static void
+rcu_perf_cleanup(void)
+{
+	int i;
+	int j;
+	int ngps = 0;
+	u64 *wdp;
+	u64 *wdpp;
+
+	if (torture_cleanup_begin())
+		return;
+
+	if (reader_tasks) {
+		for (i = 0; i < nrealreaders; i++)
+			torture_stop_kthread(rcu_perf_reader,
+					     reader_tasks[i]);
+		kfree(reader_tasks);
+	}
+
+	if (writer_tasks) {
+		for (i = 0; i < nrealwriters; i++) {
+			torture_stop_kthread(rcu_perf_writer,
+					     writer_tasks[i]);
+			if (!writer_n_durations)
+				continue;
+			j = writer_n_durations[i];
+			pr_alert("%s%s writer %d gps: %d\n",
+				 perf_type, PERF_FLAG, i, j);
+			ngps += j;
+		}
+		pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n",
+			 perf_type, PERF_FLAG,
+			 t_rcu_perf_writer_started, t_rcu_perf_writer_finished,
+			 t_rcu_perf_writer_finished -
+			 t_rcu_perf_writer_started,
+			 ngps,
+			 b_rcu_perf_writer_finished -
+			 b_rcu_perf_writer_started);
+		for (i = 0; i < nrealwriters; i++) {
+			if (!writer_durations)
+				break;
+			if (!writer_n_durations)
+				continue;
+			wdpp = writer_durations[i];
+			if (!wdpp)
+				continue;
+			for (j = 0; j <= writer_n_durations[i]; j++) {
+				wdp = &wdpp[j];
+				pr_alert("%s%s %4d writer-duration: %5d %llu\n",
+					perf_type, PERF_FLAG,
+					i, j, *wdp);
+				if (j % 100 == 0)
+					schedule_timeout_uninterruptible(1);
+			}
+			kfree(writer_durations[i]);
+		}
+		kfree(writer_tasks);
+		kfree(writer_durations);
+		kfree(writer_n_durations);
+	}
+
+	/* Do flavor-specific cleanup operations.  */
+	if (cur_ops->cleanup != NULL)
+		cur_ops->cleanup();
+
+	torture_cleanup_end();
+}
+
+/*
+ * Return the number if non-negative.  If -1, the number of CPUs.
+ * If less than -1, that much less than the number of CPUs, but
+ * at least one.
+ */
+static int compute_real(int n)
+{
+	int nr;
+
+	if (n >= 0) {
+		nr = n;
+	} else {
+		nr = num_online_cpus() + 1 + n;
+		if (nr <= 0)
+			nr = 1;
+	}
+	return nr;
+}
+
+/*
+ * RCU perf shutdown kthread.  Just waits to be awakened, then shuts
+ * down system.
+ */
+static int
+rcu_perf_shutdown(void *arg)
+{
+	do {
+		wait_event(shutdown_wq,
+			   atomic_read(&n_rcu_perf_writer_finished) >=
+			   nrealwriters);
+	} while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters);
+	smp_mb(); /* Wake before output. */
+	rcu_perf_cleanup();
+	kernel_power_off();
+	return -EINVAL;
+}
+
+static int __init
+rcu_perf_init(void)
+{
+	long i;
+	int firsterr = 0;
+	static struct rcu_perf_ops *perf_ops[] = {
+		&rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
+		RCUPERF_TASKS_OPS
+	};
+
+	if (!torture_init_begin(perf_type, verbose, &perf_runnable))
+		return -EBUSY;
+
+	/* Process args and tell the world that the perf'er is on the job. */
+	for (i = 0; i < ARRAY_SIZE(perf_ops); i++) {
+		cur_ops = perf_ops[i];
+		if (strcmp(perf_type, cur_ops->name) == 0)
+			break;
+	}
+	if (i == ARRAY_SIZE(perf_ops)) {
+		pr_alert("rcu-perf: invalid perf type: \"%s\"\n",
+			 perf_type);
+		pr_alert("rcu-perf types:");
+		for (i = 0; i < ARRAY_SIZE(perf_ops); i++)
+			pr_alert(" %s", perf_ops[i]->name);
+		pr_alert("\n");
+		firsterr = -EINVAL;
+		goto unwind;
+	}
+	if (cur_ops->init)
+		cur_ops->init();
+
+	nrealwriters = compute_real(nwriters);
+	nrealreaders = compute_real(nreaders);
+	atomic_set(&n_rcu_perf_reader_started, 0);
+	atomic_set(&n_rcu_perf_writer_started, 0);
+	atomic_set(&n_rcu_perf_writer_finished, 0);
+	rcu_perf_print_module_parms(cur_ops, "Start of test");
+
+	/* Start up the kthreads. */
+
+	if (shutdown) {
+		init_waitqueue_head(&shutdown_wq);
+		firsterr = torture_create_kthread(rcu_perf_shutdown, NULL,
+						  shutdown_task);
+		if (firsterr)
+			goto unwind;
+		schedule_timeout_uninterruptible(1);
+	}
+	reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
+			       GFP_KERNEL);
+	if (reader_tasks == NULL) {
+		VERBOSE_PERFOUT_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+	for (i = 0; i < nrealreaders; i++) {
+		firsterr = torture_create_kthread(rcu_perf_reader, NULL,
+						  reader_tasks[i]);
+		if (firsterr)
+			goto unwind;
+	}
+	while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders)
+		schedule_timeout_uninterruptible(1);
+	writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]),
+			       GFP_KERNEL);
+	writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations),
+				   GFP_KERNEL);
+	writer_n_durations =
+		kcalloc(nrealwriters, sizeof(*writer_n_durations),
+			GFP_KERNEL);
+	if (!writer_tasks || !writer_durations || !writer_n_durations) {
+		VERBOSE_PERFOUT_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+	for (i = 0; i < nrealwriters; i++) {
+		writer_durations[i] =
+			kcalloc(MAX_MEAS, sizeof(*writer_durations[i]),
+				GFP_KERNEL);
+		if (!writer_durations[i])
+			goto unwind;
+		firsterr = torture_create_kthread(rcu_perf_writer, (void *)i,
+						  writer_tasks[i]);
+		if (firsterr)
+			goto unwind;
+	}
+	torture_init_end();
+	return 0;
+
+unwind:
+	torture_init_end();
+	rcu_perf_cleanup();
+	return firsterr;
+}
+
+module_init(rcu_perf_init);
+module_exit(rcu_perf_cleanup);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1e9a607534ca..f4b797a690ba 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1289,6 +1289,39 @@ config TORTURE_TEST
 	tristate
 	default n
 
+config RCU_PERF_TEST
+	tristate "performance tests for RCU"
+	depends on DEBUG_KERNEL
+	select TORTURE_TEST
+	select SRCU
+	select TASKS_RCU
+	default n
+	help
+	  This option provides a kernel module that runs performance
+	  tests on the RCU infrastructure.  The kernel module may be built
+	  after the fact on the running kernel to be tested, if desired.
+
+	  Say Y here if you want RCU performance tests to be built into
+	  the kernel.
+	  Say M if you want the RCU performance tests to build as a module.
+	  Say N if you are unsure.
+
+config RCU_PERF_TEST_RUNNABLE
+	bool "performance tests for RCU runnable by default"
+	depends on RCU_PERF_TEST = y
+	default n
+	help
+	  This option provides a way to build the RCU performance tests
+	  directly into the kernel without them starting up at boot time.
+	  You can use /sys/module to manually override this setting.
+	  This /proc file is available only when the RCU performance
+	  tests have been built into the kernel.
+
+	  Say Y here if you want the RCU performance tests to start during
+	  boot (you probably don't).
+	  Say N here if you want the RCU performance tests to start only
+	  after being manually enabled via /sys/module.
+
 config RCU_TORTURE_TEST
 	tristate "torture tests for RCU"
 	depends on DEBUG_KERNEL
-- 
cgit v1.2.3


From 6b558c4c7a4ba410e39dbcb9d4c2b6e928c09308 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 12 Jan 2016 14:15:40 -0800
Subject: rcutorture: Bind rcuperf reader/writer kthreads to CPUs

This commit forces more deterministic behavior by binding rcuperf's
rcu_perf_reader() and rcu_perf_writer() kthreads to their respective
CPUs.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcuperf.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 9d54a57bee7d..7a1edf417d18 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -328,8 +328,10 @@ rcu_perf_reader(void *arg)
 {
 	unsigned long flags;
 	int idx;
+	long me = (long)arg;
 
 	VERBOSE_PERFOUT_STRING("rcu_perf_reader task started");
+	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
 	set_user_nice(current, MAX_NICE);
 	atomic_inc(&n_rcu_perf_reader_started);
 
@@ -362,6 +364,7 @@ rcu_perf_writer(void *arg)
 	WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp);
 	WARN_ON(rcu_gp_is_normal() && gp_exp);
 	WARN_ON(!wdpp);
+	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
 	t = ktime_get_mono_fast_ns();
 	if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
 		t_rcu_perf_writer_started = t;
@@ -594,7 +597,7 @@ rcu_perf_init(void)
 		goto unwind;
 	}
 	for (i = 0; i < nrealreaders; i++) {
-		firsterr = torture_create_kthread(rcu_perf_reader, NULL,
+		firsterr = torture_create_kthread(rcu_perf_reader, (void *)i,
 						  reader_tasks[i]);
 		if (firsterr)
 			goto unwind;
-- 
cgit v1.2.3


From 2094c99558d9e9374210898f65f5862f7a2e8bed Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 12 Jan 2016 15:17:21 -0800
Subject: rcutorture: Set rcuperf writer kthreads to real-time priority

This commit forces more deterministic update-side behavior by setting
rcuperf's rcu_perf_writer() kthreads to real-time priority.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcuperf.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 7a1edf417d18..e18d016a9888 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -355,6 +355,7 @@ rcu_perf_writer(void *arg)
 	int i = 0;
 	int i_max;
 	long me = (long)arg;
+	struct sched_param sp;
 	bool started = false, done = false, alldone = false;
 	u64 t;
 	u64 *wdp;
@@ -365,6 +366,8 @@ rcu_perf_writer(void *arg)
 	WARN_ON(rcu_gp_is_normal() && gp_exp);
 	WARN_ON(!wdpp);
 	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+	sp.sched_priority = 1;
+	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
 	t = ktime_get_mono_fast_ns();
 	if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
 		t_rcu_perf_writer_started = t;
-- 
cgit v1.2.3


From ac2bb275e8e5abddb0815ff2b7aa383ed6d007a4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 29 Jan 2016 14:58:17 -0800
Subject: rcutorture: Make rcuperf collect expedited event-trace data

This commit enables ftrace in the rcuperf TREE kernel build and adds
an ftrace_dump() at the end of rcuperf processing.  This data will be
used to measure the actual durations of the expedited grace periods
without the added delays inherent in the kernel-module measurements.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcuperf.c                                    | 1 +
 tools/testing/selftests/rcutorture/configs/rcuperf/TREE | 1 +
 2 files changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index e18d016a9888..12561f96f0a2 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -404,6 +404,7 @@ rcu_perf_writer(void *arg)
 				 perf_type, me, MIN_MEAS);
 			if (atomic_inc_return(&n_rcu_perf_writer_finished) >=
 			    nrealwriters) {
+				rcu_ftrace_dump(DUMP_ALL);
 				PERFOUT_STRING("Test complete");
 				t_rcu_perf_writer_finished = t;
 				if (gp_exp) {
diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE
index 614e107f6db5..a312f671a29a 100644
--- a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE
+++ b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE
@@ -17,3 +17,4 @@ CONFIG_PROVE_LOCKING=n
 CONFIG_RCU_BOOST=n
 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
 CONFIG_RCU_EXPERT=y
+CONFIG_RCU_TRACE=y
-- 
cgit v1.2.3


From df37e66bfdbb57e8cae7dbf39a0c66b1b8701338 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 30 Jan 2016 20:56:38 -0800
Subject: rcutorture: Add rcuperf holdoff boot parameter to reduce interference

Boot-time activity can legitimately grab CPUs for extended time periods,
so the commit adds a boot parameter to delay the start of the performance
test until boot has completed.  Defaults to 10 seconds.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/kernel-parameters.txt | 6 ++++++
 kernel/rcu/rcuperf.c                | 5 +++++
 2 files changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 951af481da5a..da9ee466789b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3288,6 +3288,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			Measure performance of expedited synchronous
 			grace-period primitives.
 
+	rcuperf.holdoff= [KNL]
+			Set test-start holdoff period.  The purpose of
+			this parameter is to delay the start of the
+			test until boot completes in order to avoid
+			interference.
+
 	rcuperf.nreaders= [KNL]
 			Set number of RCU readers.  The value -1 selects
 			N, where N is the number of CPUs.  A value
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 12561f96f0a2..278600143bb6 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -59,6 +59,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>");
 	do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0)
 
 torture_param(bool, gp_exp, true, "Use expedited GP wait primitives");
+torture_param(int, holdoff, 10, "Holdoff time before test start (s)");
 torture_param(int, nreaders, -1, "Number of RCU reader threads");
 torture_param(int, nwriters, -1, "Number of RCU updater threads");
 torture_param(bool, shutdown, false, "Shutdown at end of performance tests.");
@@ -368,6 +369,10 @@ rcu_perf_writer(void *arg)
 	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
 	sp.sched_priority = 1;
 	sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+
+	if (holdoff)
+		schedule_timeout_uninterruptible(holdoff * HZ);
+
 	t = ktime_get_mono_fast_ns();
 	if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
 		t_rcu_perf_writer_started = t;
-- 
cgit v1.2.3


From 620316e52a923811fe9a77ceb43eebf5f507d375 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 30 Jan 2016 21:32:09 -0800
Subject: rcutorture: Avoid RCU CPU stall warning and RT throttling

Running rcuperf can result in RCU CPU stall warnings and RT throttling.
These occur because on of the real-time writer processes does
ftrace_dump() while still running at real-time priority.  This commit
therefore prevents these problems by setting the writer thread back to
SCHED_NORMAL (AKA SCHED_OTHER) before doing ftrace_dump().

In addition, this commit adds a small fixed delay before dumping ftrace
buffer in order to decrease the probability that this dumping will
interfere with other writers' grace periods.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcuperf.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 278600143bb6..4c0572859ff0 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -404,11 +404,15 @@ rcu_perf_writer(void *arg)
 			started = true;
 		if (!done && i >= MIN_MEAS) {
 			done = true;
+			sp.sched_priority = 0;
+			sched_setscheduler_nocheck(current,
+						   SCHED_NORMAL, &sp);
 			pr_alert("%s" PERF_FLAG
 				 "rcu_perf_writer %ld has %d measurements\n",
 				 perf_type, me, MIN_MEAS);
 			if (atomic_inc_return(&n_rcu_perf_writer_finished) >=
 			    nrealwriters) {
+				schedule_timeout_interruptible(10);
 				rcu_ftrace_dump(DUMP_ALL);
 				PERFOUT_STRING("Test complete");
 				t_rcu_perf_writer_finished = t;
-- 
cgit v1.2.3


From e6fb1fc1085e5b5155bc8f3d3385c48b8bdde95e Mon Sep 17 00:00:00 2001
From: Artem Savkov <artem.savkov@gmail.com>
Date: Sun, 7 Feb 2016 13:31:39 +0100
Subject: rcuperf: Do not wake up shutdown wait queue if "shutdown" is false.

After finishing its tests rcuperf tries to wake up shutdown_wq even if
"shutdown" param is set to false, resulting in a wake_up() call on an
unitialized wait_queue_head_t which leads to "BUG: spinlock bad magic" and
"BUG: unable to handle kernel NULL pointer dereference".

Fix by checking "shutdown" param before waking up the queue.

Signed-off-by: Artem Savkov <artem.savkov@gmail.com>
---
 kernel/rcu/rcuperf.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 4c0572859ff0..3cee0d8393ed 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -423,8 +423,10 @@ rcu_perf_writer(void *arg)
 					b_rcu_perf_writer_finished =
 						cur_ops->completed();
 				}
-				smp_mb(); /* Assign before wake. */
-				wake_up(&shutdown_wq);
+				if (shutdown) {
+					smp_mb(); /* Assign before wake. */
+					wake_up(&shutdown_wq);
+				}
 			}
 		}
 		if (done && !alldone &&
-- 
cgit v1.2.3


From 67522beecfc75d133514dda64107ee19125a74b9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 1 Mar 2016 08:52:19 -0800
Subject: rcutorture: Remove redundant initialization to zero

The current code initializes the global per-CPU variables
rcu_torture_count and rcu_torture_batch to zero.  However, C does this
initialization by default, and explicit initialization of per-CPU
variables now needs a different syntax if "make tags" is to work.
This commit therefore removes the initialization.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 9234e75b106a..52b49fe90919 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -130,8 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current;
 static unsigned long rcu_torture_current_version;
 static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
 static DEFINE_SPINLOCK(rcu_torture_lock);
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 };
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count);
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch);
 static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
 static atomic_t n_rcu_torture_alloc;
 static atomic_t n_rcu_torture_alloc_fail;
-- 
cgit v1.2.3


From de26ca19a530d2d822a6816834d22022e94b2e53 Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Thu, 17 Mar 2016 11:14:35 +0100
Subject: rcutorture: Consider FROZEN hotplug notifier transitions

The hotplug notifier rcutorture_cpu_notify() doesn't consider the
corresponding CPU_XXX_FROZEN transitions. They occur on
suspend/resume and are usually handled the same way as the
corresponding non frozen transitions.

Mask the switch case action argument with '~CPU_TASKS_FROZEN' to map
CPU_XXX_FROZEN hotplug transitions on corresponding non-frozen
transitions.

Cc: Josh Triplett <josh@joshtriplett.org>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 52b49fe90919..633a68a09440 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1585,7 +1585,7 @@ static int rcutorture_cpu_notify(struct notifier_block *self,
 {
 	long cpu = (long)hcpu;
 
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
 		(void)rcutorture_booster_init(cpu);
-- 
cgit v1.2.3


From 9eb5188a0704bd21eb7e4aef83b904fad43d3ec8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 21 Mar 2016 15:36:40 -0700
Subject: torture: Clarify refusal to run more than one torture test

This commit clarifies error messages -- you only get to run one torture
test at a time!

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/torture.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/torture.c b/kernel/torture.c
index 44aa462d033f..e912ccd960f0 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -602,8 +602,9 @@ bool torture_init_begin(char *ttype, bool v, int *runnable)
 {
 	mutex_lock(&fullstop_mutex);
 	if (torture_type != NULL) {
-		pr_alert("torture_init_begin: refusing %s init: %s running",
+		pr_alert("torture_init_begin: Refusing %s init: %s running.\n",
 			 ttype, torture_type);
+		pr_alert("torture_init_begin: One torture test at a time!\n");
 		mutex_unlock(&fullstop_mutex);
 		return false;
 	}
-- 
cgit v1.2.3


From 1ce15ef4f60529cf1313f80f4338c88bd65cc572 Mon Sep 17 00:00:00 2001
From: Jessica Yu <jeyu@redhat.com>
Date: Tue, 22 Mar 2016 20:03:16 -0400
Subject: module: preserve Elf information for livepatch modules

For livepatch modules, copy Elf section, symbol, and string information
from the load_info struct in the module loader. Persist copies of the
original symbol table and string table.

Livepatch manages its own relocation sections in order to reuse module
loader code to write relocations. Livepatch modules must preserve Elf
information such as section indices in order to apply livepatch relocation
sections using the module loader's apply_relocate_add() function.

In order to apply livepatch relocation sections, livepatch modules must
keep a complete copy of their original symbol table in memory. Normally, a
stripped down copy of a module's symbol table (containing only "core"
symbols) is made available through module->core_symtab. But for livepatch
modules, the symbol table copied into memory on module load must be exactly
the same as the symbol table produced when the patch module was compiled.
This is because the relocations in each livepatch relocation section refer
to their respective symbols with their symbol indices, and the original
symbol indices (and thus the symtab ordering) must be preserved in order
for apply_relocate_add() to find the right symbol.

Signed-off-by: Jessica Yu <jeyu@redhat.com>
Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Reviewed-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/module.h |  25 ++++++++++
 kernel/module.c        | 125 +++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 147 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index 2bb0c3085706..3daf2b3a09d2 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -330,6 +330,15 @@ struct mod_kallsyms {
 	char *strtab;
 };
 
+#ifdef CONFIG_LIVEPATCH
+struct klp_modinfo {
+	Elf_Ehdr hdr;
+	Elf_Shdr *sechdrs;
+	char *secstrings;
+	unsigned int symndx;
+};
+#endif
+
 struct module {
 	enum module_state state;
 
@@ -456,7 +465,11 @@ struct module {
 #endif
 
 #ifdef CONFIG_LIVEPATCH
+	bool klp; /* Is this a livepatch module? */
 	bool klp_alive;
+
+	/* Elf information */
+	struct klp_modinfo *klp_info;
 #endif
 
 #ifdef CONFIG_MODULE_UNLOAD
@@ -630,6 +643,18 @@ static inline bool module_requested_async_probing(struct module *module)
 	return module && module->async_probe_requested;
 }
 
+#ifdef CONFIG_LIVEPATCH
+static inline bool is_livepatch_module(struct module *mod)
+{
+	return mod->klp;
+}
+#else /* !CONFIG_LIVEPATCH */
+static inline bool is_livepatch_module(struct module *mod)
+{
+	return false;
+}
+#endif /* CONFIG_LIVEPATCH */
+
 #else /* !CONFIG_MODULES... */
 
 /* Given an address, look for it in the exception tables. */
diff --git a/kernel/module.c b/kernel/module.c
index 041200ca4a2d..5f71aa63ed2a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1973,6 +1973,83 @@ static void module_enable_nx(const struct module *mod) { }
 static void module_disable_nx(const struct module *mod) { }
 #endif
 
+#ifdef CONFIG_LIVEPATCH
+/*
+ * Persist Elf information about a module. Copy the Elf header,
+ * section header table, section string table, and symtab section
+ * index from info to mod->klp_info.
+ */
+static int copy_module_elf(struct module *mod, struct load_info *info)
+{
+	unsigned int size, symndx;
+	int ret;
+
+	size = sizeof(*mod->klp_info);
+	mod->klp_info = kmalloc(size, GFP_KERNEL);
+	if (mod->klp_info == NULL)
+		return -ENOMEM;
+
+	/* Elf header */
+	size = sizeof(mod->klp_info->hdr);
+	memcpy(&mod->klp_info->hdr, info->hdr, size);
+
+	/* Elf section header table */
+	size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
+	mod->klp_info->sechdrs = kmalloc(size, GFP_KERNEL);
+	if (mod->klp_info->sechdrs == NULL) {
+		ret = -ENOMEM;
+		goto free_info;
+	}
+	memcpy(mod->klp_info->sechdrs, info->sechdrs, size);
+
+	/* Elf section name string table */
+	size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
+	mod->klp_info->secstrings = kmalloc(size, GFP_KERNEL);
+	if (mod->klp_info->secstrings == NULL) {
+		ret = -ENOMEM;
+		goto free_sechdrs;
+	}
+	memcpy(mod->klp_info->secstrings, info->secstrings, size);
+
+	/* Elf symbol section index */
+	symndx = info->index.sym;
+	mod->klp_info->symndx = symndx;
+
+	/*
+	 * For livepatch modules, core_kallsyms.symtab is a complete
+	 * copy of the original symbol table. Adjust sh_addr to point
+	 * to core_kallsyms.symtab since the copy of the symtab in module
+	 * init memory is freed at the end of do_init_module().
+	 */
+	mod->klp_info->sechdrs[symndx].sh_addr = \
+		(unsigned long) mod->core_kallsyms.symtab;
+
+	return 0;
+
+free_sechdrs:
+	kfree(mod->klp_info->sechdrs);
+free_info:
+	kfree(mod->klp_info);
+	return ret;
+}
+
+static void free_module_elf(struct module *mod)
+{
+	kfree(mod->klp_info->sechdrs);
+	kfree(mod->klp_info->secstrings);
+	kfree(mod->klp_info);
+}
+#else /* !CONFIG_LIVEPATCH */
+static int copy_module_elf(struct module *mod, struct load_info *info)
+{
+	return 0;
+}
+
+static void free_module_elf(struct module *mod)
+{
+}
+#endif /* CONFIG_LIVEPATCH */
+
 void __weak module_memfree(void *module_region)
 {
 	vfree(module_region);
@@ -2011,6 +2088,9 @@ static void free_module(struct module *mod)
 	/* Free any allocated parameters. */
 	destroy_params(mod->kp, mod->num_kp);
 
+	if (is_livepatch_module(mod))
+		free_module_elf(mod);
+
 	/* Now we can delete it from the lists */
 	mutex_lock(&module_mutex);
 	/* Unlink carefully: kallsyms could be walking list. */
@@ -2126,6 +2206,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
 			       (long)sym[i].st_value);
 			break;
 
+		case SHN_LIVEPATCH:
+			/* Livepatch symbols are resolved by livepatch */
+			break;
+
 		case SHN_UNDEF:
 			ksym = resolve_symbol_wait(mod, info, name);
 			/* Ok if resolved.  */
@@ -2174,6 +2258,10 @@ static int apply_relocations(struct module *mod, const struct load_info *info)
 		if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC))
 			continue;
 
+		/* Livepatch relocation sections are applied by livepatch */
+		if (info->sechdrs[i].sh_flags & SHF_RELA_LIVEPATCH)
+			continue;
+
 		if (info->sechdrs[i].sh_type == SHT_REL)
 			err = apply_relocate(info->sechdrs, info->strtab,
 					     info->index.sym, i, mod);
@@ -2469,7 +2557,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
 
 	/* Compute total space required for the core symbols' strtab. */
 	for (ndst = i = 0; i < nsrc; i++) {
-		if (i == 0 ||
+		if (i == 0 || is_livepatch_module(mod) ||
 		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
 				   info->index.pcpu)) {
 			strtab_size += strlen(&info->strtab[src[i].st_name])+1;
@@ -2528,7 +2616,7 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
 	mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
 	src = mod->kallsyms->symtab;
 	for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
-		if (i == 0 ||
+		if (i == 0 || is_livepatch_module(mod) ||
 		    is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
 				   info->index.pcpu)) {
 			dst[ndst] = src[i];
@@ -2667,6 +2755,26 @@ static int copy_chunked_from_user(void *dst, const void __user *usrc, unsigned l
 	return 0;
 }
 
+#ifdef CONFIG_LIVEPATCH
+static int find_livepatch_modinfo(struct module *mod, struct load_info *info)
+{
+	mod->klp = get_modinfo(info, "livepatch") ? true : false;
+
+	return 0;
+}
+#else /* !CONFIG_LIVEPATCH */
+static int find_livepatch_modinfo(struct module *mod, struct load_info *info)
+{
+	if (get_modinfo(info, "livepatch")) {
+		pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
+		       mod->name);
+		return -ENOEXEC;
+	}
+
+	return 0;
+}
+#endif /* CONFIG_LIVEPATCH */
+
 /* Sets info->hdr and info->len. */
 static int copy_module_from_user(const void __user *umod, unsigned long len,
 				  struct load_info *info)
@@ -2821,6 +2929,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
 			"is unknown, you have been warned.\n", mod->name);
 	}
 
+	err = find_livepatch_modinfo(mod, info);
+	if (err)
+		return err;
+
 	/* Set up license info based on the info section */
 	set_license(mod, get_modinfo(info, "license"));
 
@@ -3494,6 +3606,12 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	if (err < 0)
 		goto coming_cleanup;
 
+	if (is_livepatch_module(mod)) {
+		err = copy_module_elf(mod, info);
+		if (err < 0)
+			goto sysfs_cleanup;
+	}
+
 	/* Get rid of temporary copy. */
 	free_copy(info);
 
@@ -3502,11 +3620,12 @@ static int load_module(struct load_info *info, const char __user *uargs,
 
 	return do_init_module(mod);
 
+ sysfs_cleanup:
+	mod_sysfs_teardown(mod);
  coming_cleanup:
 	blocking_notifier_call_chain(&module_notify_list,
 				     MODULE_STATE_GOING, mod);
 	klp_module_going(mod);
-
  bug_cleanup:
 	/* module_bug_cleanup needs module_mutex protection */
 	mutex_lock(&module_mutex);
-- 
cgit v1.2.3


From 425595a7fc2096ab46c741b5ed5372c5ab5bbeac Mon Sep 17 00:00:00 2001
From: Jessica Yu <jeyu@redhat.com>
Date: Tue, 22 Mar 2016 20:03:18 -0400
Subject: livepatch: reuse module loader code to write relocations

Reuse module loader code to write relocations, thereby eliminating the need
for architecture specific relocation code in livepatch. Specifically, reuse
the apply_relocate_add() function in the module loader to write relocations
instead of duplicating functionality in livepatch's arch-dependent
klp_write_module_reloc() function.

In order to accomplish this, livepatch modules manage their own relocation
sections (marked with the SHF_RELA_LIVEPATCH section flag) and
livepatch-specific symbols (marked with SHN_LIVEPATCH symbol section
index). To apply livepatch relocation sections, livepatch symbols
referenced by relocs are resolved and then apply_relocate_add() is called
to apply those relocations.

In addition, remove x86 livepatch relocation code and the s390
klp_write_module_reloc() function stub. They are no longer needed since
relocation work has been offloaded to module loader.

Lastly, mark the module as a livepatch module so that the module loader
canappropriately identify and initialize it.

Signed-off-by: Jessica Yu <jeyu@redhat.com>
Reviewed-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>   # for s390 changes
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/s390/include/asm/livepatch.h    |   7 --
 arch/x86/include/asm/livepatch.h     |   2 -
 arch/x86/kernel/Makefile             |   1 -
 arch/x86/kernel/livepatch.c          |  70 -----------------
 include/linux/livepatch.h            |  20 -----
 kernel/livepatch/core.c              | 148 ++++++++++++++++++++++-------------
 samples/livepatch/livepatch-sample.c |   1 +
 7 files changed, 95 insertions(+), 154 deletions(-)
 delete mode 100644 arch/x86/kernel/livepatch.c

(limited to 'kernel')

diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h
index d5427c78b1b3..2c1213785892 100644
--- a/arch/s390/include/asm/livepatch.h
+++ b/arch/s390/include/asm/livepatch.h
@@ -24,13 +24,6 @@ static inline int klp_check_compiler_support(void)
 	return 0;
 }
 
-static inline int klp_write_module_reloc(struct module *mod, unsigned long
-		type, unsigned long loc, unsigned long value)
-{
-	/* not supported yet */
-	return -ENOSYS;
-}
-
 static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
 {
 	regs->psw.addr = ip;
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index 7e68f9558552..a7f9181f63f3 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -32,8 +32,6 @@ static inline int klp_check_compiler_support(void)
 #endif
 	return 0;
 }
-int klp_write_module_reloc(struct module *mod, unsigned long type,
-			   unsigned long loc, unsigned long value);
 
 static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
 {
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b1b78ffe01d0..c5e9a5cf976b 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -67,7 +67,6 @@ obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-y				+= apic/
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_LIVEPATCH)		+= livepatch.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
 obj-$(CONFIG_FTRACE_SYSCALLS)	+= ftrace.o
 obj-$(CONFIG_X86_TSC)		+= trace_clock.o
diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c
deleted file mode 100644
index 92fc1a51f994..000000000000
--- a/arch/x86/kernel/livepatch.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * livepatch.c - x86-specific Kernel Live Patching Core
- *
- * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
- * Copyright (C) 2014 SUSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <asm/elf.h>
-#include <asm/livepatch.h>
-
-/**
- * klp_write_module_reloc() - write a relocation in a module
- * @mod:	module in which the section to be modified is found
- * @type:	ELF relocation type (see asm/elf.h)
- * @loc:	address that the relocation should be written to
- * @value:	relocation value (sym address + addend)
- *
- * This function writes a relocation to the specified location for
- * a particular module.
- */
-int klp_write_module_reloc(struct module *mod, unsigned long type,
-			   unsigned long loc, unsigned long value)
-{
-	size_t size = 4;
-	unsigned long val;
-	unsigned long core = (unsigned long)mod->core_layout.base;
-	unsigned long core_size = mod->core_layout.size;
-
-	switch (type) {
-	case R_X86_64_NONE:
-		return 0;
-	case R_X86_64_64:
-		val = value;
-		size = 8;
-		break;
-	case R_X86_64_32:
-		val = (u32)value;
-		break;
-	case R_X86_64_32S:
-		val = (s32)value;
-		break;
-	case R_X86_64_PC32:
-		val = (u32)(value - loc);
-		break;
-	default:
-		/* unsupported relocation type */
-		return -EINVAL;
-	}
-
-	if (loc < core || loc >= core + core_size)
-		/* loc does not point to any symbol inside the module */
-		return -EINVAL;
-
-	return probe_kernel_write((void *)loc, &val, size);
-}
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index bd830d590465..0933ca47791c 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -64,28 +64,9 @@ struct klp_func {
 	struct list_head stack_node;
 };
 
-/**
- * struct klp_reloc - relocation structure for live patching
- * @loc:	address where the relocation will be written
- * @sympos:	position in kallsyms to disambiguate symbols (optional)
- * @type:	ELF relocation type
- * @name:	name of the referenced symbol (for lookup/verification)
- * @addend:	offset from the referenced symbol
- * @external:	symbol is either exported or within the live patch module itself
- */
-struct klp_reloc {
-	unsigned long loc;
-	unsigned long sympos;
-	unsigned long type;
-	const char *name;
-	int addend;
-	int external;
-};
-
 /**
  * struct klp_object - kernel object structure for live patching
  * @name:	module name (or NULL for vmlinux)
- * @relocs:	relocation entries to be applied at load time
  * @funcs:	function entries for functions to be patched in the object
  * @kobj:	kobject for sysfs resources
  * @mod:	kernel module associated with the patched object
@@ -95,7 +76,6 @@ struct klp_reloc {
 struct klp_object {
 	/* external */
 	const char *name;
-	struct klp_reloc *relocs;
 	struct klp_func *funcs;
 
 	/* internal */
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index d68fbf63b083..eb5db6e837aa 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -28,6 +28,8 @@
 #include <linux/list.h>
 #include <linux/kallsyms.h>
 #include <linux/livepatch.h>
+#include <linux/elf.h>
+#include <linux/moduleloader.h>
 #include <asm/cacheflush.h>
 
 /**
@@ -204,75 +206,109 @@ static int klp_find_object_symbol(const char *objname, const char *name,
 	return -EINVAL;
 }
 
-/*
- * external symbols are located outside the parent object (where the parent
- * object is either vmlinux or the kmod being patched).
- */
-static int klp_find_external_symbol(struct module *pmod, const char *name,
-				    unsigned long *addr)
+static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod)
 {
-	const struct kernel_symbol *sym;
-
-	/* first, check if it's an exported symbol */
-	preempt_disable();
-	sym = find_symbol(name, NULL, NULL, true, true);
-	if (sym) {
-		*addr = sym->value;
-		preempt_enable();
-		return 0;
-	}
-	preempt_enable();
+	int i, cnt, vmlinux, ret;
+	char objname[MODULE_NAME_LEN];
+	char symname[KSYM_NAME_LEN];
+	char *strtab = pmod->core_kallsyms.strtab;
+	Elf_Rela *relas;
+	Elf_Sym *sym;
+	unsigned long sympos, addr;
 
 	/*
-	 * Check if it's in another .o within the patch module. This also
-	 * checks that the external symbol is unique.
+	 * Since the field widths for objname and symname in the sscanf()
+	 * call are hard-coded and correspond to MODULE_NAME_LEN and
+	 * KSYM_NAME_LEN respectively, we must make sure that MODULE_NAME_LEN
+	 * and KSYM_NAME_LEN have the values we expect them to have.
+	 *
+	 * Because the value of MODULE_NAME_LEN can differ among architectures,
+	 * we use the smallest/strictest upper bound possible (56, based on
+	 * the current definition of MODULE_NAME_LEN) to prevent overflows.
 	 */
-	return klp_find_object_symbol(pmod->name, name, 0, addr);
+	BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
+
+	relas = (Elf_Rela *) relasec->sh_addr;
+	/* For each rela in this klp relocation section */
+	for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
+		sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info);
+		if (sym->st_shndx != SHN_LIVEPATCH) {
+			pr_err("symbol %s is not marked as a livepatch symbol",
+			       strtab + sym->st_name);
+			return -EINVAL;
+		}
+
+		/* Format: .klp.sym.objname.symname,sympos */
+		cnt = sscanf(strtab + sym->st_name,
+			     ".klp.sym.%55[^.].%127[^,],%lu",
+			     objname, symname, &sympos);
+		if (cnt != 3) {
+			pr_err("symbol %s has an incorrectly formatted name",
+			       strtab + sym->st_name);
+			return -EINVAL;
+		}
+
+		/* klp_find_object_symbol() treats a NULL objname as vmlinux */
+		vmlinux = !strcmp(objname, "vmlinux");
+		ret = klp_find_object_symbol(vmlinux ? NULL : objname,
+					     symname, sympos, &addr);
+		if (ret)
+			return ret;
+
+		sym->st_value = addr;
+	}
+
+	return 0;
 }
 
 static int klp_write_object_relocations(struct module *pmod,
 					struct klp_object *obj)
 {
-	int ret = 0;
-	unsigned long val;
-	struct klp_reloc *reloc;
+	int i, cnt, ret = 0;
+	const char *objname, *secname;
+	char sec_objname[MODULE_NAME_LEN];
+	Elf_Shdr *sec;
 
 	if (WARN_ON(!klp_is_object_loaded(obj)))
 		return -EINVAL;
 
-	if (WARN_ON(!obj->relocs))
-		return -EINVAL;
+	objname = klp_is_module(obj) ? obj->name : "vmlinux";
 
 	module_disable_ro(pmod);
+	/* For each klp relocation section */
+	for (i = 1; i < pmod->klp_info->hdr.e_shnum; i++) {
+		sec = pmod->klp_info->sechdrs + i;
+		secname = pmod->klp_info->secstrings + sec->sh_name;
+		if (!(sec->sh_flags & SHF_RELA_LIVEPATCH))
+			continue;
 
-	for (reloc = obj->relocs; reloc->name; reloc++) {
-		/* discover the address of the referenced symbol */
-		if (reloc->external) {
-			if (reloc->sympos > 0) {
-				pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n",
-				       reloc->name);
-				ret = -EINVAL;
-				goto out;
-			}
-			ret = klp_find_external_symbol(pmod, reloc->name, &val);
-		} else
-			ret = klp_find_object_symbol(obj->name,
-						     reloc->name,
-						     reloc->sympos,
-						     &val);
+		/*
+		 * Format: .klp.rela.sec_objname.section_name
+		 * See comment in klp_resolve_symbols() for an explanation
+		 * of the selected field width value.
+		 */
+		cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname);
+		if (cnt != 1) {
+			pr_err("section %s has an incorrectly formatted name",
+			       secname);
+			ret = -EINVAL;
+			break;
+		}
+
+		if (strcmp(objname, sec_objname))
+			continue;
+
+		ret = klp_resolve_symbols(sec, pmod);
 		if (ret)
-			goto out;
+			break;
 
-		ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc,
-					     val + reloc->addend);
-		if (ret) {
-			pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n",
-			       reloc->name, val, ret);
-			goto out;
-		}
+		ret = apply_relocate_add(pmod->klp_info->sechdrs,
+					 pmod->core_kallsyms.strtab,
+					 pmod->klp_info->symndx, i, pmod);
+		if (ret)
+			break;
 	}
 
-out:
 	module_enable_ro(pmod);
 	return ret;
 }
@@ -703,11 +739,9 @@ static int klp_init_object_loaded(struct klp_patch *patch,
 	struct klp_func *func;
 	int ret;
 
-	if (obj->relocs) {
-		ret = klp_write_object_relocations(patch->mod, obj);
-		if (ret)
-			return ret;
-	}
+	ret = klp_write_object_relocations(patch->mod, obj);
+	if (ret)
+		return ret;
 
 	klp_for_each_func(obj, func) {
 		ret = klp_find_object_symbol(obj->name, func->old_name,
@@ -842,6 +876,12 @@ int klp_register_patch(struct klp_patch *patch)
 {
 	int ret;
 
+	if (!is_livepatch_module(patch->mod)) {
+		pr_err("module %s is not marked as a livepatch module",
+		       patch->mod->name);
+		return -EINVAL;
+	}
+
 	if (!klp_initialized())
 		return -ENODEV;
 
diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c
index fb8c8614e728..e34f871e69b1 100644
--- a/samples/livepatch/livepatch-sample.c
+++ b/samples/livepatch/livepatch-sample.c
@@ -89,3 +89,4 @@ static void livepatch_exit(void)
 module_init(livepatch_init);
 module_exit(livepatch_exit);
 MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
-- 
cgit v1.2.3


From 0bed612be638e41456cd8cb270a2b411a5b43d63 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sat, 2 Apr 2016 01:08:43 +0200
Subject: cpufreq: sched: Helpers to add and remove update_util hooks

Replace the single helper for adding and removing cpufreq utilization
update hooks, cpufreq_set_update_util_data(), with a pair of helpers,
cpufreq_add_update_util_hook() and cpufreq_remove_update_util_hook(),
and modify the users of cpufreq_set_update_util_data() accordingly.

With the new helpers, the code using them doesn't need to worry
about the internals of struct update_util_data and in particular
it doesn't need to worry about populating the func field in it
properly upfront.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 drivers/cpufreq/cpufreq_governor.c | 76 +++++++++++++++++++-------------------
 drivers/cpufreq/intel_pstate.c     |  7 ++--
 include/linux/sched.h              |  5 ++-
 kernel/sched/cpufreq.c             | 48 ++++++++++++++++++------
 4 files changed, 82 insertions(+), 54 deletions(-)

(limited to 'kernel')

diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 10a5cfeae8c5..3a0312f46027 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -258,43 +258,6 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
 }
 EXPORT_SYMBOL_GPL(dbs_update);
 
-static void gov_set_update_util(struct policy_dbs_info *policy_dbs,
-				unsigned int delay_us)
-{
-	struct cpufreq_policy *policy = policy_dbs->policy;
-	int cpu;
-
-	gov_update_sample_delay(policy_dbs, delay_us);
-	policy_dbs->last_sample_time = 0;
-
-	for_each_cpu(cpu, policy->cpus) {
-		struct cpu_dbs_info *cdbs = &per_cpu(cpu_dbs, cpu);
-
-		cpufreq_set_update_util_data(cpu, &cdbs->update_util);
-	}
-}
-
-static inline void gov_clear_update_util(struct cpufreq_policy *policy)
-{
-	int i;
-
-	for_each_cpu(i, policy->cpus)
-		cpufreq_set_update_util_data(i, NULL);
-
-	synchronize_sched();
-}
-
-static void gov_cancel_work(struct cpufreq_policy *policy)
-{
-	struct policy_dbs_info *policy_dbs = policy->governor_data;
-
-	gov_clear_update_util(policy_dbs->policy);
-	irq_work_sync(&policy_dbs->irq_work);
-	cancel_work_sync(&policy_dbs->work);
-	atomic_set(&policy_dbs->work_count, 0);
-	policy_dbs->work_in_progress = false;
-}
-
 static void dbs_work_handler(struct work_struct *work)
 {
 	struct policy_dbs_info *policy_dbs;
@@ -382,6 +345,44 @@ static void dbs_update_util_handler(struct update_util_data *data, u64 time,
 	irq_work_queue(&policy_dbs->irq_work);
 }
 
+static void gov_set_update_util(struct policy_dbs_info *policy_dbs,
+				unsigned int delay_us)
+{
+	struct cpufreq_policy *policy = policy_dbs->policy;
+	int cpu;
+
+	gov_update_sample_delay(policy_dbs, delay_us);
+	policy_dbs->last_sample_time = 0;
+
+	for_each_cpu(cpu, policy->cpus) {
+		struct cpu_dbs_info *cdbs = &per_cpu(cpu_dbs, cpu);
+
+		cpufreq_add_update_util_hook(cpu, &cdbs->update_util,
+					     dbs_update_util_handler);
+	}
+}
+
+static inline void gov_clear_update_util(struct cpufreq_policy *policy)
+{
+	int i;
+
+	for_each_cpu(i, policy->cpus)
+		cpufreq_remove_update_util_hook(i);
+
+	synchronize_sched();
+}
+
+static void gov_cancel_work(struct cpufreq_policy *policy)
+{
+	struct policy_dbs_info *policy_dbs = policy->governor_data;
+
+	gov_clear_update_util(policy_dbs->policy);
+	irq_work_sync(&policy_dbs->irq_work);
+	cancel_work_sync(&policy_dbs->work);
+	atomic_set(&policy_dbs->work_count, 0);
+	policy_dbs->work_in_progress = false;
+}
+
 static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *policy,
 						     struct dbs_governor *gov)
 {
@@ -404,7 +405,6 @@ static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *poli
 		struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j);
 
 		j_cdbs->policy_dbs = policy_dbs;
-		j_cdbs->update_util.func = dbs_update_util_handler;
 	}
 	return policy_dbs;
 }
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 9ae159631f52..a38219527637 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1107,8 +1107,6 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
 
 	intel_pstate_busy_pid_reset(cpu);
 
-	cpu->update_util.func = intel_pstate_update_util;
-
 	pr_debug("intel_pstate: controlling: cpu %d\n", cpunum);
 
 	return 0;
@@ -1132,12 +1130,13 @@ static void intel_pstate_set_update_util_hook(unsigned int cpu_num)
 
 	/* Prevent intel_pstate_update_util() from using stale data. */
 	cpu->sample.time = 0;
-	cpufreq_set_update_util_data(cpu_num, &cpu->update_util);
+	cpufreq_add_update_util_hook(cpu_num, &cpu->update_util,
+				     intel_pstate_update_util);
 }
 
 static void intel_pstate_clear_update_util_hook(unsigned int cpu)
 {
-	cpufreq_set_update_util_data(cpu, NULL);
+	cpufreq_remove_update_util_hook(cpu);
 	synchronize_sched();
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 60bba7e032dc..1b8825922597 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3240,7 +3240,10 @@ struct update_util_data {
 		     u64 time, unsigned long util, unsigned long max);
 };
 
-void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
+void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
+			void (*func)(struct update_util_data *data, u64 time,
+				     unsigned long util, unsigned long max));
+void cpufreq_remove_update_util_hook(int cpu);
 #endif /* CONFIG_CPU_FREQ */
 
 #endif
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 928c4ba32f68..1141954e73b4 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -14,24 +14,50 @@
 DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
 
 /**
- * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer.
+ * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
  * @cpu: The CPU to set the pointer for.
  * @data: New pointer value.
+ * @func: Callback function to set for the CPU.
  *
- * Set and publish the update_util_data pointer for the given CPU.  That pointer
- * points to a struct update_util_data object containing a callback function
- * to call from cpufreq_update_util().  That function will be called from an RCU
- * read-side critical section, so it must not sleep.
+ * Set and publish the update_util_data pointer for the given CPU.
  *
- * Callers must use RCU-sched callbacks to free any memory that might be
- * accessed via the old update_util_data pointer or invoke synchronize_sched()
- * right after this function to avoid use-after-free.
+ * The update_util_data pointer of @cpu is set to @data and the callback
+ * function pointer in the target struct update_util_data is set to @func.
+ * That function will be called by cpufreq_update_util() from RCU-sched
+ * read-side critical sections, so it must not sleep.  @data will always be
+ * passed to it as the first argument which allows the function to get to the
+ * target update_util_data structure and its container.
+ *
+ * The update_util_data pointer of @cpu must be NULL when this function is
+ * called or it will WARN() and return with no effect.
  */
-void cpufreq_set_update_util_data(int cpu, struct update_util_data *data)
+void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
+			void (*func)(struct update_util_data *data, u64 time,
+				     unsigned long util, unsigned long max))
 {
-	if (WARN_ON(data && !data->func))
+	if (WARN_ON(!data || !func))
 		return;
 
+	if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
+		return;
+
+	data->func = func;
 	rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
 }
-EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data);
+EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
+
+/**
+ * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer.
+ * @cpu: The CPU to clear the pointer for.
+ *
+ * Clear the update_util_data pointer for the given CPU.
+ *
+ * Callers must use RCU-sched callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_sched()
+ * right after this function to avoid use-after-free.
+ */
+void cpufreq_remove_update_util_hook(int cpu)
+{
+	rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
+}
+EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
-- 
cgit v1.2.3


From 9bdcb44e391da5c41b98573bf0305a0e0b1c9569 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sat, 2 Apr 2016 01:09:12 +0200
Subject: cpufreq: schedutil: New governor based on scheduler utilization data

Add a new cpufreq scaling governor, called "schedutil", that uses
scheduler-provided CPU utilization information as input for making
its decisions.

Doing that is possible after commit 34e2c555f3e1 (cpufreq: Add
mechanism for registering utilization update callbacks) that
introduced cpufreq_update_util() called by the scheduler on
utilization changes (from CFS) and RT/DL task status updates.
In particular, CPU frequency scaling decisions may be based on
the the utilization data passed to cpufreq_update_util() by CFS.

The new governor is relatively simple.

The frequency selection formula used by it depends on whether or not
the utilization is frequency-invariant.  In the frequency-invariant
case the new CPU frequency is given by

	next_freq = 1.25 * max_freq * util / max

where util and max are the last two arguments of cpufreq_update_util().
In turn, if util is not frequency-invariant, the maximum frequency in
the above formula is replaced with the current frequency of the CPU:

	next_freq = 1.25 * curr_freq * util / max

The coefficient 1.25 corresponds to the frequency tipping point at
(util / max) = 0.8.

All of the computations are carried out in the utilization update
handlers provided by the new governor.  One of those handlers is
used for cpufreq policies shared between multiple CPUs and the other
one is for policies with one CPU only (and therefore it doesn't need
to use any extra synchronization means).

The governor supports fast frequency switching if that is supported
by the cpufreq driver in use and possible for the given policy.
In the fast switching case, all operations of the governor take
place in its utilization update handlers.  If fast switching cannot
be used, the frequency switch operations are carried out with the
help of a work item which only calls __cpufreq_driver_target()
(under a mutex) to trigger a frequency update (to a value already
computed beforehand in one of the utilization update handlers).

Currently, the governor treats all of the RT and DL tasks as
"unknown utilization" and sets the frequency to the allowed
maximum when updated from the RT or DL sched classes.  That
heavy-handed approach should be replaced with something more
subtle and specifically targeted at RT and DL tasks.

The governor shares some tunables management code with the
"ondemand" and "conservative" governors and uses some common
definitions from cpufreq_governor.h, but apart from that it
is stand-alone.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 drivers/cpufreq/Kconfig          |  30 +++
 kernel/sched/Makefile            |   1 +
 kernel/sched/cpufreq_schedutil.c | 528 +++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h             |   8 +
 kernel/trace/power-traces.c      |   1 +
 5 files changed, 568 insertions(+)
 create mode 100644 kernel/sched/cpufreq_schedutil.c

(limited to 'kernel')

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 7fd87106ba46..5d74826d75be 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -107,6 +107,16 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
 	  Be aware that not all cpufreq drivers support the conservative
 	  governor. If unsure have a look at the help section of the
 	  driver. Fallback governor will be the performance governor.
+
+config CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
+	bool "schedutil"
+	select CPU_FREQ_GOV_SCHEDUTIL
+	select CPU_FREQ_GOV_PERFORMANCE
+	help
+	  Use the 'schedutil' CPUFreq governor by default. If unsure,
+	  have a look at the help section of that governor. The fallback
+	  governor will be 'performance'.
+
 endchoice
 
 config CPU_FREQ_GOV_PERFORMANCE
@@ -188,6 +198,26 @@ config CPU_FREQ_GOV_CONSERVATIVE
 
 	  If in doubt, say N.
 
+config CPU_FREQ_GOV_SCHEDUTIL
+	tristate "'schedutil' cpufreq policy governor"
+	depends on CPU_FREQ
+	select CPU_FREQ_GOV_ATTR_SET
+	select IRQ_WORK
+	help
+	  This governor makes decisions based on the utilization data provided
+	  by the scheduler.  It sets the CPU frequency to be proportional to
+	  the utilization/capacity ratio coming from the scheduler.  If the
+	  utilization is frequency-invariant, the new frequency is also
+	  proportional to the maximum available frequency.  If that is not the
+	  case, it is proportional to the current frequency of the CPU.  The
+	  frequency tipping point is at utilization/capacity equal to 80% in
+	  both cases.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called cpufreq_schedutil.
+
+	  If in doubt, say N.
+
 comment "CPU frequency scaling drivers"
 
 config CPUFREQ_DT
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 414d9c16da42..5e59b832ae2b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
new file mode 100644
index 000000000000..d27ae064b476
--- /dev/null
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -0,0 +1,528 @@
+/*
+ * CPUFreq governor based on scheduler-provided CPU utilization data.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <trace/events/power.h>
+
+#include "sched.h"
+
+struct sugov_tunables {
+	struct gov_attr_set attr_set;
+	unsigned int rate_limit_us;
+};
+
+struct sugov_policy {
+	struct cpufreq_policy *policy;
+
+	struct sugov_tunables *tunables;
+	struct list_head tunables_hook;
+
+	raw_spinlock_t update_lock;  /* For shared policies */
+	u64 last_freq_update_time;
+	s64 freq_update_delay_ns;
+	unsigned int next_freq;
+
+	/* The next fields are only needed if fast switch cannot be used. */
+	struct irq_work irq_work;
+	struct work_struct work;
+	struct mutex work_lock;
+	bool work_in_progress;
+
+	bool need_freq_update;
+};
+
+struct sugov_cpu {
+	struct update_util_data update_util;
+	struct sugov_policy *sg_policy;
+
+	/* The fields below are only needed when sharing a policy. */
+	unsigned long util;
+	unsigned long max;
+	u64 last_update;
+};
+
+static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
+
+/************************ Governor internals ***********************/
+
+static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
+{
+	s64 delta_ns;
+
+	if (sg_policy->work_in_progress)
+		return false;
+
+	if (unlikely(sg_policy->need_freq_update)) {
+		sg_policy->need_freq_update = false;
+		/*
+		 * This happens when limits change, so forget the previous
+		 * next_freq value and force an update.
+		 */
+		sg_policy->next_freq = UINT_MAX;
+		return true;
+	}
+
+	delta_ns = time - sg_policy->last_freq_update_time;
+	return delta_ns >= sg_policy->freq_update_delay_ns;
+}
+
+static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
+				unsigned int next_freq)
+{
+	struct cpufreq_policy *policy = sg_policy->policy;
+
+	sg_policy->last_freq_update_time = time;
+
+	if (policy->fast_switch_enabled) {
+		if (sg_policy->next_freq == next_freq) {
+			trace_cpu_frequency(policy->cur, smp_processor_id());
+			return;
+		}
+		sg_policy->next_freq = next_freq;
+		next_freq = cpufreq_driver_fast_switch(policy, next_freq);
+		if (next_freq == CPUFREQ_ENTRY_INVALID)
+			return;
+
+		policy->cur = next_freq;
+		trace_cpu_frequency(next_freq, smp_processor_id());
+	} else if (sg_policy->next_freq != next_freq) {
+		sg_policy->next_freq = next_freq;
+		sg_policy->work_in_progress = true;
+		irq_work_queue(&sg_policy->irq_work);
+	}
+}
+
+/**
+ * get_next_freq - Compute a new frequency for a given cpufreq policy.
+ * @policy: cpufreq policy object to compute the new frequency for.
+ * @util: Current CPU utilization.
+ * @max: CPU capacity.
+ *
+ * If the utilization is frequency-invariant, choose the new frequency to be
+ * proportional to it, that is
+ *
+ * next_freq = C * max_freq * util / max
+ *
+ * Otherwise, approximate the would-be frequency-invariant utilization by
+ * util_raw * (curr_freq / max_freq) which leads to
+ *
+ * next_freq = C * curr_freq * util_raw / max
+ *
+ * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
+ */
+static unsigned int get_next_freq(struct cpufreq_policy *policy,
+				  unsigned long util, unsigned long max)
+{
+	unsigned int freq = arch_scale_freq_invariant() ?
+				policy->cpuinfo.max_freq : policy->cur;
+
+	return (freq + (freq >> 2)) * util / max;
+}
+
+static void sugov_update_single(struct update_util_data *hook, u64 time,
+				unsigned long util, unsigned long max)
+{
+	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+	struct cpufreq_policy *policy = sg_policy->policy;
+	unsigned int next_f;
+
+	if (!sugov_should_update_freq(sg_policy, time))
+		return;
+
+	next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq :
+			get_next_freq(policy, util, max);
+	sugov_update_commit(sg_policy, time, next_f);
+}
+
+static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
+					   unsigned long util, unsigned long max)
+{
+	struct cpufreq_policy *policy = sg_policy->policy;
+	unsigned int max_f = policy->cpuinfo.max_freq;
+	u64 last_freq_update_time = sg_policy->last_freq_update_time;
+	unsigned int j;
+
+	if (util == ULONG_MAX)
+		return max_f;
+
+	for_each_cpu(j, policy->cpus) {
+		struct sugov_cpu *j_sg_cpu;
+		unsigned long j_util, j_max;
+		s64 delta_ns;
+
+		if (j == smp_processor_id())
+			continue;
+
+		j_sg_cpu = &per_cpu(sugov_cpu, j);
+		/*
+		 * If the CPU utilization was last updated before the previous
+		 * frequency update and the time elapsed between the last update
+		 * of the CPU utilization and the last frequency update is long
+		 * enough, don't take the CPU into account as it probably is
+		 * idle now.
+		 */
+		delta_ns = last_freq_update_time - j_sg_cpu->last_update;
+		if (delta_ns > TICK_NSEC)
+			continue;
+
+		j_util = j_sg_cpu->util;
+		if (j_util == ULONG_MAX)
+			return max_f;
+
+		j_max = j_sg_cpu->max;
+		if (j_util * max > j_max * util) {
+			util = j_util;
+			max = j_max;
+		}
+	}
+
+	return get_next_freq(policy, util, max);
+}
+
+static void sugov_update_shared(struct update_util_data *hook, u64 time,
+				unsigned long util, unsigned long max)
+{
+	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+	unsigned int next_f;
+
+	raw_spin_lock(&sg_policy->update_lock);
+
+	sg_cpu->util = util;
+	sg_cpu->max = max;
+	sg_cpu->last_update = time;
+
+	if (sugov_should_update_freq(sg_policy, time)) {
+		next_f = sugov_next_freq_shared(sg_policy, util, max);
+		sugov_update_commit(sg_policy, time, next_f);
+	}
+
+	raw_spin_unlock(&sg_policy->update_lock);
+}
+
+static void sugov_work(struct work_struct *work)
+{
+	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
+
+	mutex_lock(&sg_policy->work_lock);
+	__cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
+				CPUFREQ_RELATION_L);
+	mutex_unlock(&sg_policy->work_lock);
+
+	sg_policy->work_in_progress = false;
+}
+
+static void sugov_irq_work(struct irq_work *irq_work)
+{
+	struct sugov_policy *sg_policy;
+
+	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
+	schedule_work_on(smp_processor_id(), &sg_policy->work);
+}
+
+/************************** sysfs interface ************************/
+
+static struct sugov_tunables *global_tunables;
+static DEFINE_MUTEX(global_tunables_lock);
+
+static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
+{
+	return container_of(attr_set, struct sugov_tunables, attr_set);
+}
+
+static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+
+	return sprintf(buf, "%u\n", tunables->rate_limit_us);
+}
+
+static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
+				   size_t count)
+{
+	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+	struct sugov_policy *sg_policy;
+	unsigned int rate_limit_us;
+
+	if (kstrtouint(buf, 10, &rate_limit_us))
+		return -EINVAL;
+
+	tunables->rate_limit_us = rate_limit_us;
+
+	list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
+		sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
+
+	return count;
+}
+
+static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
+
+static struct attribute *sugov_attributes[] = {
+	&rate_limit_us.attr,
+	NULL
+};
+
+static struct kobj_type sugov_tunables_ktype = {
+	.default_attrs = sugov_attributes,
+	.sysfs_ops = &governor_sysfs_ops,
+};
+
+/********************** cpufreq governor interface *********************/
+
+static struct cpufreq_governor schedutil_gov;
+
+static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
+{
+	struct sugov_policy *sg_policy;
+
+	sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
+	if (!sg_policy)
+		return NULL;
+
+	sg_policy->policy = policy;
+	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+	INIT_WORK(&sg_policy->work, sugov_work);
+	mutex_init(&sg_policy->work_lock);
+	raw_spin_lock_init(&sg_policy->update_lock);
+	return sg_policy;
+}
+
+static void sugov_policy_free(struct sugov_policy *sg_policy)
+{
+	mutex_destroy(&sg_policy->work_lock);
+	kfree(sg_policy);
+}
+
+static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
+{
+	struct sugov_tunables *tunables;
+
+	tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
+	if (tunables) {
+		gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
+		if (!have_governor_per_policy())
+			global_tunables = tunables;
+	}
+	return tunables;
+}
+
+static void sugov_tunables_free(struct sugov_tunables *tunables)
+{
+	if (!have_governor_per_policy())
+		global_tunables = NULL;
+
+	kfree(tunables);
+}
+
+static int sugov_init(struct cpufreq_policy *policy)
+{
+	struct sugov_policy *sg_policy;
+	struct sugov_tunables *tunables;
+	unsigned int lat;
+	int ret = 0;
+
+	/* State should be equivalent to EXIT */
+	if (policy->governor_data)
+		return -EBUSY;
+
+	sg_policy = sugov_policy_alloc(policy);
+	if (!sg_policy)
+		return -ENOMEM;
+
+	mutex_lock(&global_tunables_lock);
+
+	if (global_tunables) {
+		if (WARN_ON(have_governor_per_policy())) {
+			ret = -EINVAL;
+			goto free_sg_policy;
+		}
+		policy->governor_data = sg_policy;
+		sg_policy->tunables = global_tunables;
+
+		gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
+		goto out;
+	}
+
+	tunables = sugov_tunables_alloc(sg_policy);
+	if (!tunables) {
+		ret = -ENOMEM;
+		goto free_sg_policy;
+	}
+
+	tunables->rate_limit_us = LATENCY_MULTIPLIER;
+	lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
+	if (lat)
+		tunables->rate_limit_us *= lat;
+
+	policy->governor_data = sg_policy;
+	sg_policy->tunables = tunables;
+
+	ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
+				   get_governor_parent_kobj(policy), "%s",
+				   schedutil_gov.name);
+	if (ret)
+		goto fail;
+
+ out:
+	mutex_unlock(&global_tunables_lock);
+
+	cpufreq_enable_fast_switch(policy);
+	return 0;
+
+ fail:
+	policy->governor_data = NULL;
+	sugov_tunables_free(tunables);
+
+ free_sg_policy:
+	mutex_unlock(&global_tunables_lock);
+
+	sugov_policy_free(sg_policy);
+	pr_err("cpufreq: schedutil governor initialization failed (error %d)\n", ret);
+	return ret;
+}
+
+static int sugov_exit(struct cpufreq_policy *policy)
+{
+	struct sugov_policy *sg_policy = policy->governor_data;
+	struct sugov_tunables *tunables = sg_policy->tunables;
+	unsigned int count;
+
+	mutex_lock(&global_tunables_lock);
+
+	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
+	policy->governor_data = NULL;
+	if (!count)
+		sugov_tunables_free(tunables);
+
+	mutex_unlock(&global_tunables_lock);
+
+	sugov_policy_free(sg_policy);
+	return 0;
+}
+
+static int sugov_start(struct cpufreq_policy *policy)
+{
+	struct sugov_policy *sg_policy = policy->governor_data;
+	unsigned int cpu;
+
+	sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
+	sg_policy->last_freq_update_time = 0;
+	sg_policy->next_freq = UINT_MAX;
+	sg_policy->work_in_progress = false;
+	sg_policy->need_freq_update = false;
+
+	for_each_cpu(cpu, policy->cpus) {
+		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+
+		sg_cpu->sg_policy = sg_policy;
+		if (policy_is_shared(policy)) {
+			sg_cpu->util = ULONG_MAX;
+			sg_cpu->max = 0;
+			sg_cpu->last_update = 0;
+			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+						     sugov_update_shared);
+		} else {
+			cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+						     sugov_update_single);
+		}
+	}
+	return 0;
+}
+
+static int sugov_stop(struct cpufreq_policy *policy)
+{
+	struct sugov_policy *sg_policy = policy->governor_data;
+	unsigned int cpu;
+
+	for_each_cpu(cpu, policy->cpus)
+		cpufreq_remove_update_util_hook(cpu);
+
+	synchronize_sched();
+
+	irq_work_sync(&sg_policy->irq_work);
+	cancel_work_sync(&sg_policy->work);
+	return 0;
+}
+
+static int sugov_limits(struct cpufreq_policy *policy)
+{
+	struct sugov_policy *sg_policy = policy->governor_data;
+
+	if (!policy->fast_switch_enabled) {
+		mutex_lock(&sg_policy->work_lock);
+
+		if (policy->max < policy->cur)
+			__cpufreq_driver_target(policy, policy->max,
+						CPUFREQ_RELATION_H);
+		else if (policy->min > policy->cur)
+			__cpufreq_driver_target(policy, policy->min,
+						CPUFREQ_RELATION_L);
+
+		mutex_unlock(&sg_policy->work_lock);
+	}
+
+	sg_policy->need_freq_update = true;
+	return 0;
+}
+
+int sugov_governor(struct cpufreq_policy *policy, unsigned int event)
+{
+	if (event == CPUFREQ_GOV_POLICY_INIT) {
+		return sugov_init(policy);
+	} else if (policy->governor_data) {
+		switch (event) {
+		case CPUFREQ_GOV_POLICY_EXIT:
+			return sugov_exit(policy);
+		case CPUFREQ_GOV_START:
+			return sugov_start(policy);
+		case CPUFREQ_GOV_STOP:
+			return sugov_stop(policy);
+		case CPUFREQ_GOV_LIMITS:
+			return sugov_limits(policy);
+		}
+	}
+	return -EINVAL;
+}
+
+static struct cpufreq_governor schedutil_gov = {
+	.name = "schedutil",
+	.governor = sugov_governor,
+	.owner = THIS_MODULE,
+};
+
+static int __init sugov_module_init(void)
+{
+	return cpufreq_register_governor(&schedutil_gov);
+}
+
+static void __exit sugov_module_exit(void)
+{
+	cpufreq_unregister_governor(&schedutil_gov);
+}
+
+MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>");
+MODULE_DESCRIPTION("Utilization-based CPU frequency selection");
+MODULE_LICENSE("GPL");
+
+#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
+struct cpufreq_governor *cpufreq_default_governor(void)
+{
+	return &schedutil_gov;
+}
+
+fs_initcall(sugov_module_init);
+#else
+module_init(sugov_module_init);
+#endif
+module_exit(sugov_module_exit);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec2e8d23527e..921d6e5d33b7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1842,6 +1842,14 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo
 static inline void cpufreq_trigger_update(u64 time) {}
 #endif /* CONFIG_CPU_FREQ */
 
+#ifdef arch_scale_freq_capacity
+#ifndef arch_scale_freq_invariant
+#define arch_scale_freq_invariant()	(true)
+#endif
+#else /* arch_scale_freq_capacity */
+#define arch_scale_freq_invariant()	(false)
+#endif
+
 static inline void account_reset_rq(struct rq *rq)
 {
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 81b87451c0ea..0c7dee221dca 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -15,5 +15,6 @@
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
 EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
+EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency);
 EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle);
 
-- 
cgit v1.2.3


From c269cba35b061181bc23c470809c00e8f71e535a Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Mon, 22 Feb 2016 15:02:07 +0100
Subject: memremap: add arch specific hook for MEMREMAP_WB mappings

Currently, the memremap code serves MEMREMAP_WB mappings directly from
the kernel direct mapping, unless the region is in high memory, in which
case it falls back to using ioremap_cache(). However, the semantics of
ioremap_cache() are not unambiguously defined, and on ARM, it will
actually result in a mapping type that differs from the attributes used
for the linear mapping, and for this reason, the ioremap_cache() call
fails if the region is part of the memory managed by the kernel.

So instead, implement an optional hook 'arch_memremap_wb' whose default
implementation calls ioremap_cache() as before, but which can be
overridden by the architecture to do what is appropriate for it.

Acked-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
 kernel/memremap.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/memremap.c b/kernel/memremap.c
index a6d382312e6f..017532193fb1 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -27,6 +27,13 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
 }
 #endif
 
+#ifndef arch_memremap_wb
+static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
+{
+	return (__force void *)ioremap_cache(offset, size);
+}
+#endif
+
 static void *try_ram_remap(resource_size_t offset, size_t size)
 {
 	unsigned long pfn = PHYS_PFN(offset);
@@ -34,7 +41,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
 	/* In the simple case just return the existing linear address */
 	if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
 		return __va(offset);
-	return NULL; /* fallback to ioremap_cache */
+	return NULL; /* fallback to arch_memremap_wb */
 }
 
 /**
@@ -90,7 +97,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
 		if (is_ram == REGION_INTERSECTS)
 			addr = try_ram_remap(offset, size);
 		if (!addr)
-			addr = ioremap_cache(offset, size);
+			addr = arch_memremap_wb(offset, size);
 	}
 
 	/*
-- 
cgit v1.2.3


From 5c8a010c2411729a07cb1b90c09fa978ac0ac6c0 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Mon, 4 Apr 2016 10:42:07 +0200
Subject: locking/lockdep: Fix print_collision() unused warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix this:

  kernel/locking/lockdep.c:2051:13: warning: ‘print_collision’ defined but not used [-Wunused-function]
  static void print_collision(struct task_struct *curr,
              ^

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1459759327-2880-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 2324ba5310db..ed9410936a22 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1999,6 +1999,7 @@ static inline int get_first_held_lock(struct task_struct *curr,
 	return ++i;
 }
 
+#ifdef CONFIG_DEBUG_LOCKDEP
 /*
  * Returns the next chain_key iteration
  */
@@ -2069,6 +2070,7 @@ static void print_collision(struct task_struct *curr,
 	printk("\nstack backtrace:\n");
 	dump_stack();
 }
+#endif
 
 /*
  * Checks whether the chain and the current held locks are consistent
-- 
cgit v1.2.3


From 0bf676d1fd5144e66ac06939fa3c7c12086c3b18 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Thu, 31 Mar 2016 10:49:28 +0200
Subject: audit: cleanup prune_tree_thread

We can use kthread_run instead of kthread_create+wake_up_process for
creating the thread.

We do not need to set the task state to TASK_RUNNING after schedule(),
the process is in that state already.

And we do not need to set the state to TASK_INTERRUPTIBLE when not
doing schedule() as we set the state to TASK_RUNNING immediately
afterwards.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: Paul Moore <paul@paul-moore.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: <linux-audit@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit_tree.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 5efe9b299a12..25772476fa4a 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -661,10 +661,10 @@ static int tag_mount(struct vfsmount *mnt, void *arg)
 static int prune_tree_thread(void *unused)
 {
 	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (list_empty(&prune_list))
+		if (list_empty(&prune_list)) {
+			set_current_state(TASK_INTERRUPTIBLE);
 			schedule();
-		__set_current_state(TASK_RUNNING);
+		}
 
 		mutex_lock(&audit_cmd_mutex);
 		mutex_lock(&audit_filter_mutex);
@@ -693,16 +693,14 @@ static int audit_launch_prune(void)
 {
 	if (prune_thread)
 		return 0;
-	prune_thread = kthread_create(prune_tree_thread, NULL,
+	prune_thread = kthread_run(prune_tree_thread, NULL,
 				"audit_prune_tree");
 	if (IS_ERR(prune_thread)) {
 		pr_err("cannot start thread audit_prune_tree");
 		prune_thread = NULL;
 		return -ENOMEM;
-	} else {
-		wake_up_process(prune_thread);
-		return 0;
 	}
+	return 0;
 }
 
 /* called with audit_filter_mutex */
-- 
cgit v1.2.3


From 09cbfeaf1a5a67bfb3201e0c83c810cecb2efa5a Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 1 Apr 2016 15:29:47 +0300
Subject: mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros

PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.

This promise never materialized.  And unlikely will.

We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE.  And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.

Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.

Let's stop pretending that pages in page cache are special.  They are
not.

The changes are pretty straight-forward:

 - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;

 - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;

 - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};

 - page_cache_get() -> get_page();

 - page_cache_release() -> put_page();

This patch contains automated changes generated with coccinelle using
script below.  For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.

The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.

There are few places in the code where coccinelle didn't reach.  I'll
fix them manually in a separate patch.  Comments and documentation also
will be addressed with the separate patch.

virtual patch

@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E

@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E

@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT

@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE

@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK

@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)

@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)

@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arc/mm/cache.c                                |   2 +-
 arch/arm/mm/flush.c                                |   4 +-
 arch/parisc/kernel/cache.c                         |   2 +-
 arch/powerpc/platforms/cell/spufs/inode.c          |   4 +-
 arch/s390/hypfs/inode.c                            |   4 +-
 block/bio.c                                        |   8 +-
 block/blk-core.c                                   |   2 +-
 block/blk-settings.c                               |  12 +-
 block/blk-sysfs.c                                  |   8 +-
 block/cfq-iosched.c                                |   2 +-
 block/compat_ioctl.c                               |   4 +-
 block/ioctl.c                                      |   4 +-
 block/partition-generic.c                          |   8 +-
 drivers/block/aoe/aoeblk.c                         |   2 +-
 drivers/block/brd.c                                |   2 +-
 drivers/block/drbd/drbd_nl.c                       |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c            |   2 +-
 drivers/gpu/drm/armada/armada_gem.c                |   4 +-
 drivers/gpu/drm/drm_gem.c                          |   4 +-
 drivers/gpu/drm/i915/i915_gem.c                    |   8 +-
 drivers/gpu/drm/i915/i915_gem_userptr.c            |   2 +-
 drivers/gpu/drm/radeon/radeon_ttm.c                |   2 +-
 drivers/gpu/drm/ttm/ttm_tt.c                       |   4 +-
 drivers/gpu/drm/via/via_dmablit.c                  |   2 +-
 drivers/md/bitmap.c                                |   2 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c          |   2 +-
 drivers/misc/ibmasm/ibmasmfs.c                     |   4 +-
 drivers/misc/vmw_vmci/vmci_queue_pair.c            |   2 +-
 drivers/mmc/core/host.c                            |   6 +-
 drivers/mmc/host/sh_mmcif.c                        |   2 +-
 drivers/mmc/host/tmio_mmc_dma.c                    |   4 +-
 drivers/mmc/host/tmio_mmc_pio.c                    |   2 +-
 drivers/mmc/host/usdhi6rol0.c                      |   2 +-
 drivers/mtd/devices/block2mtd.c                    |   6 +-
 drivers/mtd/nand/nandsim.c                         |   6 +-
 drivers/nvdimm/btt.c                               |   2 +-
 drivers/nvdimm/pmem.c                              |   2 +-
 drivers/oprofile/oprofilefs.c                      |   4 +-
 drivers/scsi/sd.c                                  |   2 +-
 drivers/scsi/st.c                                  |   4 +-
 .../lustre/include/linux/libcfs/libcfs_private.h   |   2 +-
 .../lustre/include/linux/libcfs/linux/linux-mem.h  |   4 +-
 .../lustre/lnet/klnds/socklnd/socklnd_lib.c        |   2 +-
 drivers/staging/lustre/lnet/libcfs/debug.c         |   2 +-
 drivers/staging/lustre/lnet/libcfs/tracefile.c     |  16 +-
 drivers/staging/lustre/lnet/libcfs/tracefile.h     |   6 +-
 drivers/staging/lustre/lnet/lnet/lib-md.c          |   2 +-
 drivers/staging/lustre/lnet/lnet/lib-move.c        |   6 +-
 drivers/staging/lustre/lnet/lnet/lib-socket.c      |   4 +-
 drivers/staging/lustre/lnet/lnet/router.c          |   6 +-
 drivers/staging/lustre/lnet/selftest/brw_test.c    |  20 +-
 drivers/staging/lustre/lnet/selftest/conctl.c      |   4 +-
 drivers/staging/lustre/lnet/selftest/conrpc.c      |  10 +-
 drivers/staging/lustre/lnet/selftest/framework.c   |   2 +-
 drivers/staging/lustre/lnet/selftest/rpc.c         |   2 +-
 drivers/staging/lustre/lnet/selftest/selftest.h    |   2 +-
 .../lustre/include/linux/lustre_patchless_compat.h |   2 +-
 .../lustre/lustre/include/lustre/lustre_idl.h      |   2 +-
 drivers/staging/lustre/lustre/include/lustre_mdc.h |   4 +-
 drivers/staging/lustre/lustre/include/lustre_net.h |   6 +-
 drivers/staging/lustre/lustre/include/obd.h        |   2 +-
 .../staging/lustre/lustre/include/obd_support.h    |   2 +-
 drivers/staging/lustre/lustre/lclient/lcommon_cl.c |   4 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_lib.c      |  12 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_pool.c     |   2 +-
 drivers/staging/lustre/lustre/ldlm/ldlm_request.c  |   2 +-
 drivers/staging/lustre/lustre/llite/dir.c          |  18 +-
 .../staging/lustre/lustre/llite/llite_internal.h   |   8 +-
 drivers/staging/lustre/lustre/llite/llite_lib.c    |   8 +-
 drivers/staging/lustre/lustre/llite/llite_mmap.c   |   8 +-
 drivers/staging/lustre/lustre/llite/lloop.c        |  12 +-
 drivers/staging/lustre/lustre/llite/lproc_llite.c  |  18 +-
 drivers/staging/lustre/lustre/llite/rw.c           |  22 +-
 drivers/staging/lustre/lustre/llite/rw26.c         |  28 +--
 drivers/staging/lustre/lustre/llite/vvp_io.c       |   8 +-
 drivers/staging/lustre/lustre/llite/vvp_page.c     |   8 +-
 drivers/staging/lustre/lustre/lmv/lmv_obd.c        |   4 +-
 drivers/staging/lustre/lustre/mdc/mdc_request.c    |   6 +-
 drivers/staging/lustre/lustre/mgc/mgc_request.c    |  22 +-
 drivers/staging/lustre/lustre/obdclass/cl_page.c   |   6 +-
 drivers/staging/lustre/lustre/obdclass/class_obd.c |   6 +-
 .../lustre/lustre/obdclass/linux/linux-obdo.c      |   4 +-
 .../lustre/lustre/obdclass/linux/linux-sysctl.c    |   6 +-
 drivers/staging/lustre/lustre/obdclass/lu_object.c |   6 +-
 .../staging/lustre/lustre/obdecho/echo_client.c    |  30 +--
 drivers/staging/lustre/lustre/osc/lproc_osc.c      |  16 +-
 drivers/staging/lustre/lustre/osc/osc_cache.c      |  42 ++--
 drivers/staging/lustre/lustre/osc/osc_page.c       |   6 +-
 drivers/staging/lustre/lustre/osc/osc_request.c    |  26 +-
 drivers/staging/lustre/lustre/ptlrpc/client.c      |   6 +-
 drivers/staging/lustre/lustre/ptlrpc/import.c      |   2 +-
 .../staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c    |   4 +-
 drivers/staging/lustre/lustre/ptlrpc/recover.c     |   2 +-
 drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c    |   2 +-
 drivers/usb/gadget/function/f_fs.c                 |   4 +-
 drivers/usb/gadget/legacy/inode.c                  |   4 +-
 drivers/usb/storage/scsiglue.c                     |   2 +-
 drivers/video/fbdev/pvr2fb.c                       |   2 +-
 fs/9p/vfs_addr.c                                   |  18 +-
 fs/9p/vfs_file.c                                   |   4 +-
 fs/9p/vfs_super.c                                  |   2 +-
 fs/affs/file.c                                     |  26 +-
 fs/afs/dir.c                                       |   2 +-
 fs/afs/file.c                                      |   4 +-
 fs/afs/mntpt.c                                     |   6 +-
 fs/afs/super.c                                     |   4 +-
 fs/afs/write.c                                     |  26 +-
 fs/binfmt_elf.c                                    |   2 +-
 fs/binfmt_elf_fdpic.c                              |   2 +-
 fs/block_dev.c                                     |   4 +-
 fs/btrfs/check-integrity.c                         |  60 ++---
 fs/btrfs/compression.c                             |  84 +++----
 fs/btrfs/disk-io.c                                 |  14 +-
 fs/btrfs/extent-tree.c                             |   4 +-
 fs/btrfs/extent_io.c                               | 262 ++++++++++-----------
 fs/btrfs/extent_io.h                               |   6 +-
 fs/btrfs/file-item.c                               |   4 +-
 fs/btrfs/file.c                                    |  40 ++--
 fs/btrfs/free-space-cache.c                        |  30 +--
 fs/btrfs/inode-map.c                               |  10 +-
 fs/btrfs/inode.c                                   | 104 ++++----
 fs/btrfs/ioctl.c                                   |  84 +++----
 fs/btrfs/lzo.c                                     |  32 +--
 fs/btrfs/raid56.c                                  |  28 +--
 fs/btrfs/reada.c                                   |  30 +--
 fs/btrfs/relocation.c                              |  16 +-
 fs/btrfs/scrub.c                                   |  24 +-
 fs/btrfs/send.c                                    |  16 +-
 fs/btrfs/tests/extent-io-tests.c                   |  42 ++--
 fs/btrfs/tests/free-space-tests.c                  |   2 +-
 fs/btrfs/volumes.c                                 |  14 +-
 fs/btrfs/zlib.c                                    |  38 +--
 fs/buffer.c                                        | 100 ++++----
 fs/cachefiles/rdwr.c                               |  38 +--
 fs/ceph/addr.c                                     | 114 ++++-----
 fs/ceph/caps.c                                     |   2 +-
 fs/ceph/dir.c                                      |   4 +-
 fs/ceph/file.c                                     |  32 +--
 fs/ceph/inode.c                                    |   6 +-
 fs/ceph/mds_client.c                               |   2 +-
 fs/ceph/mds_client.h                               |   2 +-
 fs/ceph/super.c                                    |   8 +-
 fs/cifs/cifsfs.c                                   |   2 +-
 fs/cifs/cifssmb.c                                  |  16 +-
 fs/cifs/connect.c                                  |   2 +-
 fs/cifs/file.c                                     |  94 ++++----
 fs/cifs/inode.c                                    |  10 +-
 fs/configfs/mount.c                                |   4 +-
 fs/cramfs/inode.c                                  |  30 +--
 fs/crypto/crypto.c                                 |   8 +-
 fs/dax.c                                           |  30 +--
 fs/direct-io.c                                     |  26 +-
 fs/dlm/lowcomms.c                                  |   8 +-
 fs/ecryptfs/crypto.c                               |  22 +-
 fs/ecryptfs/inode.c                                |   4 +-
 fs/ecryptfs/keystore.c                             |   2 +-
 fs/ecryptfs/main.c                                 |   8 +-
 fs/ecryptfs/mmap.c                                 |  44 ++--
 fs/ecryptfs/read_write.c                           |  14 +-
 fs/efivarfs/super.c                                |   4 +-
 fs/exofs/dir.c                                     |  30 +--
 fs/exofs/inode.c                                   |  34 +--
 fs/exofs/namei.c                                   |   4 +-
 fs/ext2/dir.c                                      |  32 +--
 fs/ext2/namei.c                                    |   6 +-
 fs/ext4/crypto.c                                   |   8 +-
 fs/ext4/dir.c                                      |   4 +-
 fs/ext4/file.c                                     |   4 +-
 fs/ext4/inline.c                                   |  18 +-
 fs/ext4/inode.c                                    | 116 ++++-----
 fs/ext4/mballoc.c                                  |  36 +--
 fs/ext4/move_extent.c                              |  16 +-
 fs/ext4/page-io.c                                  |   4 +-
 fs/ext4/readpage.c                                 |  10 +-
 fs/ext4/super.c                                    |   4 +-
 fs/ext4/symlink.c                                  |   4 +-
 fs/f2fs/data.c                                     |  52 ++--
 fs/f2fs/debug.c                                    |   6 +-
 fs/f2fs/dir.c                                      |   4 +-
 fs/f2fs/f2fs.h                                     |   2 +-
 fs/f2fs/file.c                                     |  74 +++---
 fs/f2fs/inline.c                                   |  10 +-
 fs/f2fs/namei.c                                    |   4 +-
 fs/f2fs/node.c                                     |  10 +-
 fs/f2fs/recovery.c                                 |   2 +-
 fs/f2fs/segment.c                                  |  16 +-
 fs/f2fs/super.c                                    |   4 +-
 fs/freevxfs/vxfs_immed.c                           |   4 +-
 fs/freevxfs/vxfs_lookup.c                          |  12 +-
 fs/freevxfs/vxfs_subr.c                            |   2 +-
 fs/fs-writeback.c                                  |   2 +-
 fs/fscache/page.c                                  |  10 +-
 fs/fuse/dev.c                                      |  26 +-
 fs/fuse/file.c                                     |  72 +++---
 fs/fuse/inode.c                                    |  16 +-
 fs/gfs2/aops.c                                     |  44 ++--
 fs/gfs2/bmap.c                                     |  12 +-
 fs/gfs2/file.c                                     |  16 +-
 fs/gfs2/meta_io.c                                  |   4 +-
 fs/gfs2/quota.c                                    |  14 +-
 fs/gfs2/rgrp.c                                     |   5 +-
 fs/hfs/bnode.c                                     |  12 +-
 fs/hfs/btree.c                                     |  20 +-
 fs/hfs/inode.c                                     |   8 +-
 fs/hfsplus/bitmap.c                                |   2 +-
 fs/hfsplus/bnode.c                                 |  90 +++----
 fs/hfsplus/btree.c                                 |  22 +-
 fs/hfsplus/inode.c                                 |   8 +-
 fs/hfsplus/super.c                                 |   2 +-
 fs/hfsplus/xattr.c                                 |   6 +-
 fs/hostfs/hostfs_kern.c                            |  18 +-
 fs/hugetlbfs/inode.c                               |   8 +-
 fs/isofs/compress.c                                |  36 +--
 fs/isofs/inode.c                                   |   2 +-
 fs/jbd2/commit.c                                   |   4 +-
 fs/jbd2/journal.c                                  |   2 +-
 fs/jbd2/transaction.c                              |   4 +-
 fs/jffs2/debug.c                                   |   8 +-
 fs/jffs2/file.c                                    |  23 +-
 fs/jffs2/fs.c                                      |   8 +-
 fs/jffs2/gc.c                                      |   8 +-
 fs/jffs2/nodelist.c                                |   8 +-
 fs/jffs2/write.c                                   |   7 +-
 fs/jfs/jfs_metapage.c                              |  42 ++--
 fs/jfs/jfs_metapage.h                              |   4 +-
 fs/jfs/super.c                                     |   2 +-
 fs/kernfs/mount.c                                  |   4 +-
 fs/libfs.c                                         |  24 +-
 fs/logfs/dev_bdev.c                                |   2 +-
 fs/logfs/dev_mtd.c                                 |  10 +-
 fs/logfs/dir.c                                     |  12 +-
 fs/logfs/file.c                                    |  26 +-
 fs/logfs/readwrite.c                               |  20 +-
 fs/logfs/segment.c                                 |  28 +--
 fs/logfs/super.c                                   |  16 +-
 fs/minix/dir.c                                     |  18 +-
 fs/minix/namei.c                                   |   4 +-
 fs/mpage.c                                         |  20 +-
 fs/ncpfs/dir.c                                     |  10 +-
 fs/ncpfs/ncplib_kernel.h                           |   2 +-
 fs/nfs/blocklayout/blocklayout.c                   |  24 +-
 fs/nfs/blocklayout/blocklayout.h                   |   4 +-
 fs/nfs/client.c                                    |   8 +-
 fs/nfs/dir.c                                       |   4 +-
 fs/nfs/direct.c                                    |   8 +-
 fs/nfs/file.c                                      |  20 +-
 fs/nfs/internal.h                                  |   6 +-
 fs/nfs/nfs4xdr.c                                   |   2 +-
 fs/nfs/objlayout/objio_osd.c                       |   2 +-
 fs/nfs/pagelist.c                                  |   6 +-
 fs/nfs/pnfs.c                                      |   6 +-
 fs/nfs/read.c                                      |  16 +-
 fs/nfs/write.c                                     |   4 +-
 fs/nilfs2/bmap.c                                   |   2 +-
 fs/nilfs2/btnode.c                                 |  10 +-
 fs/nilfs2/dir.c                                    |  32 +--
 fs/nilfs2/gcinode.c                                |   2 +-
 fs/nilfs2/inode.c                                  |   4 +-
 fs/nilfs2/mdt.c                                    |  14 +-
 fs/nilfs2/namei.c                                  |   4 +-
 fs/nilfs2/page.c                                   |  18 +-
 fs/nilfs2/recovery.c                               |   4 +-
 fs/nilfs2/segment.c                                |   2 +-
 fs/ntfs/aops.c                                     |  48 ++--
 fs/ntfs/aops.h                                     |   2 +-
 fs/ntfs/attrib.c                                   |  28 +--
 fs/ntfs/bitmap.c                                   |  10 +-
 fs/ntfs/compress.c                                 |  56 ++---
 fs/ntfs/dir.c                                      |  40 ++--
 fs/ntfs/file.c                                     |  54 ++---
 fs/ntfs/index.c                                    |  12 +-
 fs/ntfs/inode.c                                    |   8 +-
 fs/ntfs/lcnalloc.c                                 |   6 +-
 fs/ntfs/logfile.c                                  |  16 +-
 fs/ntfs/mft.c                                      |  34 +--
 fs/ntfs/ntfs.h                                     |   2 +-
 fs/ntfs/super.c                                    |  58 ++---
 fs/ocfs2/alloc.c                                   |  28 +--
 fs/ocfs2/aops.c                                    |  48 ++--
 fs/ocfs2/cluster/heartbeat.c                       |  10 +-
 fs/ocfs2/dlmfs/dlmfs.c                             |   4 +-
 fs/ocfs2/file.c                                    |  14 +-
 fs/ocfs2/mmap.c                                    |   6 +-
 fs/ocfs2/ocfs2.h                                   |  20 +-
 fs/ocfs2/refcounttree.c                            |  22 +-
 fs/ocfs2/super.c                                   |   4 +-
 fs/orangefs/inode.c                                |  10 +-
 fs/orangefs/orangefs-bufmap.c                      |   4 +-
 fs/orangefs/orangefs-utils.c                       |   2 +-
 fs/pipe.c                                          |   6 +-
 fs/proc/task_mmu.c                                 |   2 +-
 fs/proc/vmcore.c                                   |   4 +-
 fs/pstore/inode.c                                  |   4 +-
 fs/qnx6/dir.c                                      |  16 +-
 fs/qnx6/inode.c                                    |   4 +-
 fs/qnx6/qnx6.h                                     |   2 +-
 fs/ramfs/inode.c                                   |   4 +-
 fs/reiserfs/file.c                                 |   4 +-
 fs/reiserfs/inode.c                                |  44 ++--
 fs/reiserfs/ioctl.c                                |   4 +-
 fs/reiserfs/journal.c                              |   4 +-
 fs/reiserfs/stree.c                                |   4 +-
 fs/reiserfs/tail_conversion.c                      |   4 +-
 fs/reiserfs/xattr.c                                |  18 +-
 fs/splice.c                                        |  32 +--
 fs/squashfs/block.c                                |   4 +-
 fs/squashfs/cache.c                                |  14 +-
 fs/squashfs/decompressor.c                         |   2 +-
 fs/squashfs/file.c                                 |  22 +-
 fs/squashfs/file_direct.c                          |  22 +-
 fs/squashfs/lz4_wrapper.c                          |   8 +-
 fs/squashfs/lzo_wrapper.c                          |   8 +-
 fs/squashfs/page_actor.c                           |   4 +-
 fs/squashfs/page_actor.h                           |   2 +-
 fs/squashfs/super.c                                |   2 +-
 fs/squashfs/symlink.c                              |   6 +-
 fs/squashfs/xz_wrapper.c                           |   4 +-
 fs/squashfs/zlib_wrapper.c                         |   4 +-
 fs/sync.c                                          |   4 +-
 fs/sysv/dir.c                                      |  18 +-
 fs/sysv/namei.c                                    |   4 +-
 fs/ubifs/file.c                                    |  52 ++--
 fs/ubifs/super.c                                   |   4 +-
 fs/ubifs/ubifs.h                                   |   4 +-
 fs/udf/file.c                                      |   6 +-
 fs/udf/inode.c                                     |   4 +-
 fs/ufs/balloc.c                                    |   6 +-
 fs/ufs/dir.c                                       |  32 +--
 fs/ufs/inode.c                                     |   4 +-
 fs/ufs/namei.c                                     |   6 +-
 fs/ufs/util.c                                      |   4 +-
 fs/ufs/util.h                                      |   2 +-
 fs/xfs/libxfs/xfs_bmap.c                           |   4 +-
 fs/xfs/xfs_aops.c                                  |  18 +-
 fs/xfs/xfs_bmap_util.c                             |   4 +-
 fs/xfs/xfs_file.c                                  |  12 +-
 fs/xfs/xfs_linux.h                                 |   2 +-
 fs/xfs/xfs_mount.c                                 |   2 +-
 fs/xfs/xfs_mount.h                                 |   4 +-
 fs/xfs/xfs_pnfs.c                                  |   4 +-
 fs/xfs/xfs_super.c                                 |   4 +-
 include/linux/bio.h                                |   2 +-
 include/linux/blkdev.h                             |   2 +-
 include/linux/buffer_head.h                        |   4 +-
 include/linux/ceph/libceph.h                       |   4 +-
 include/linux/f2fs_fs.h                            |   4 +-
 include/linux/fs.h                                 |   4 +-
 include/linux/nfs_page.h                           |   2 +-
 include/linux/pagemap.h                            |  14 +-
 include/linux/swap.h                               |   2 +-
 ipc/mqueue.c                                       |   4 +-
 kernel/events/uprobes.c                            |   8 +-
 mm/fadvise.c                                       |   8 +-
 mm/filemap.c                                       | 126 +++++-----
 mm/hugetlb.c                                       |   8 +-
 mm/madvise.c                                       |   6 +-
 mm/memory-failure.c                                |   2 +-
 mm/memory.c                                        |  54 ++---
 mm/mincore.c                                       |   4 +-
 mm/nommu.c                                         |   2 +-
 mm/page-writeback.c                                |  12 +-
 mm/page_io.c                                       |   2 +-
 mm/readahead.c                                     |  20 +-
 mm/rmap.c                                          |   2 +-
 mm/shmem.c                                         | 130 +++++-----
 mm/swap.c                                          |  12 +-
 mm/swap_state.c                                    |  12 +-
 mm/swapfile.c                                      |  12 +-
 mm/truncate.c                                      |  40 ++--
 mm/userfaultfd.c                                   |   4 +-
 mm/zswap.c                                         |   4 +-
 net/ceph/messenger.c                               |   6 +-
 net/ceph/pagelist.c                                |   4 +-
 net/ceph/pagevec.c                                 |  30 +--
 net/sunrpc/auth_gss/auth_gss.c                     |   8 +-
 net/sunrpc/auth_gss/gss_krb5_crypto.c              |   2 +-
 net/sunrpc/auth_gss/gss_krb5_wrap.c                |   4 +-
 net/sunrpc/cache.c                                 |   4 +-
 net/sunrpc/rpc_pipe.c                              |   4 +-
 net/sunrpc/socklib.c                               |   6 +-
 net/sunrpc/xdr.c                                   |  48 ++--
 381 files changed, 2722 insertions(+), 2721 deletions(-)

(limited to 'kernel')

diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index d7709e3930a3..9e5eddbb856f 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -628,7 +628,7 @@ void flush_dcache_page(struct page *page)
 
 		/* kernel reading from page with U-mapping */
 		phys_addr_t paddr = (unsigned long)page_address(page);
-		unsigned long vaddr = page->index << PAGE_CACHE_SHIFT;
+		unsigned long vaddr = page->index << PAGE_SHIFT;
 
 		if (addr_not_cache_congruent(paddr, vaddr))
 			__flush_dcache_page(paddr, vaddr);
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index d0ba3551d49a..3cced8455727 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -235,7 +235,7 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page)
 	 */
 	if (mapping && cache_is_vipt_aliasing())
 		flush_pfn_alias(page_to_pfn(page),
-				page->index << PAGE_CACHE_SHIFT);
+				page->index << PAGE_SHIFT);
 }
 
 static void __flush_dcache_aliases(struct address_space *mapping, struct page *page)
@@ -250,7 +250,7 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p
 	 *   data in the current VM view associated with this page.
 	 * - aliasing VIPT: we only need to find one mapping of this page.
 	 */
-	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	pgoff = page->index;
 
 	flush_dcache_mmap_lock(mapping);
 	vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) {
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index 91c2a39cd5aa..67001277256c 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -319,7 +319,7 @@ void flush_dcache_page(struct page *page)
 	if (!mapping)
 		return;
 
-	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	pgoff = page->index;
 
 	/* We have carefully arranged in arch_get_unmapped_area() that
 	 * *any* mappings of a file are always congruently mapped (whether
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index dfa863876778..6ca5f0525e57 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -732,8 +732,8 @@ spufs_fill_super(struct super_block *sb, void *data, int silent)
 		return -ENOMEM;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = SPUFS_MAGIC;
 	sb->s_op = &s_ops;
 	sb->s_fs_info = info;
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index 0f3da2cb2bd6..255c7eec4481 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -278,8 +278,8 @@ static int hypfs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->uid = current_uid();
 	sbi->gid = current_gid();
 	sb->s_fs_info = sbi;
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = HYPFS_MAGIC;
 	sb->s_op = &hypfs_s_ops;
 	if (hypfs_parse_options(data, sb))
diff --git a/block/bio.c b/block/bio.c
index f124a0a624fc..168531517694 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1339,7 +1339,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
 		 * release the pages we didn't map into the bio, if any
 		 */
 		while (j < page_limit)
-			page_cache_release(pages[j++]);
+			put_page(pages[j++]);
 	}
 
 	kfree(pages);
@@ -1365,7 +1365,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
 	for (j = 0; j < nr_pages; j++) {
 		if (!pages[j])
 			break;
-		page_cache_release(pages[j]);
+		put_page(pages[j]);
 	}
  out:
 	kfree(pages);
@@ -1385,7 +1385,7 @@ static void __bio_unmap_user(struct bio *bio)
 		if (bio_data_dir(bio) == READ)
 			set_page_dirty_lock(bvec->bv_page);
 
-		page_cache_release(bvec->bv_page);
+		put_page(bvec->bv_page);
 	}
 
 	bio_put(bio);
@@ -1658,7 +1658,7 @@ void bio_check_pages_dirty(struct bio *bio)
 		struct page *page = bvec->bv_page;
 
 		if (PageDirty(page) || PageCompound(page)) {
-			page_cache_release(page);
+			put_page(page);
 			bvec->bv_page = NULL;
 		} else {
 			nr_clean_pages++;
diff --git a/block/blk-core.c b/block/blk-core.c
index 827f8badd143..b60537b2c35b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -706,7 +706,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 		goto fail_id;
 
 	q->backing_dev_info.ra_pages =
-			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+			(VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
 	q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
 	q->backing_dev_info.name = "block";
 	q->node = node_id;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index c7bb666aafd1..331e4eee0dda 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -239,8 +239,8 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
 	struct queue_limits *limits = &q->limits;
 	unsigned int max_sectors;
 
-	if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) {
-		max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
+	if ((max_hw_sectors << 9) < PAGE_SIZE) {
+		max_hw_sectors = 1 << (PAGE_SHIFT - 9);
 		printk(KERN_INFO "%s: set to minimum %d\n",
 		       __func__, max_hw_sectors);
 	}
@@ -329,8 +329,8 @@ EXPORT_SYMBOL(blk_queue_max_segments);
  **/
 void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
 {
-	if (max_size < PAGE_CACHE_SIZE) {
-		max_size = PAGE_CACHE_SIZE;
+	if (max_size < PAGE_SIZE) {
+		max_size = PAGE_SIZE;
 		printk(KERN_INFO "%s: set to minimum %d\n",
 		       __func__, max_size);
 	}
@@ -760,8 +760,8 @@ EXPORT_SYMBOL_GPL(blk_queue_dma_drain);
  **/
 void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
 {
-	if (mask < PAGE_CACHE_SIZE - 1) {
-		mask = PAGE_CACHE_SIZE - 1;
+	if (mask < PAGE_SIZE - 1) {
+		mask = PAGE_SIZE - 1;
 		printk(KERN_INFO "%s: set to minimum %lx\n",
 		       __func__, mask);
 	}
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index dd93763057ce..995b58d46ed1 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -76,7 +76,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 static ssize_t queue_ra_show(struct request_queue *q, char *page)
 {
 	unsigned long ra_kb = q->backing_dev_info.ra_pages <<
-					(PAGE_CACHE_SHIFT - 10);
+					(PAGE_SHIFT - 10);
 
 	return queue_var_show(ra_kb, (page));
 }
@@ -90,7 +90,7 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
 	if (ret < 0)
 		return ret;
 
-	q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
+	q->backing_dev_info.ra_pages = ra_kb >> (PAGE_SHIFT - 10);
 
 	return ret;
 }
@@ -117,7 +117,7 @@ static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
 	if (blk_queue_cluster(q))
 		return queue_var_show(queue_max_segment_size(q), (page));
 
-	return queue_var_show(PAGE_CACHE_SIZE, (page));
+	return queue_var_show(PAGE_SIZE, (page));
 }
 
 static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page)
@@ -198,7 +198,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
 	unsigned long max_sectors_kb,
 		max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1,
-			page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
+			page_kb = 1 << (PAGE_SHIFT - 10);
 	ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
 
 	if (ret < 0)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index e3c591dd8f19..4a349787bc62 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4075,7 +4075,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		 * idle timer unplug to continue working.
 		 */
 		if (cfq_cfqq_wait_request(cfqq)) {
-			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
+			if (blk_rq_bytes(rq) > PAGE_SIZE ||
 			    cfqd->busy_queues > 1) {
 				cfq_del_timer(cfqd, cfqq);
 				cfq_clear_cfqq_wait_request(cfqq);
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index f678c733df40..556826ac7cb4 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -710,7 +710,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 			return -EINVAL;
 		bdi = blk_get_backing_dev_info(bdev);
 		return compat_put_long(arg,
-				       (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
+				       (bdi->ra_pages * PAGE_SIZE) / 512);
 	case BLKROGET: /* compatible */
 		return compat_put_int(arg, bdev_read_only(bdev) != 0);
 	case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
@@ -729,7 +729,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		if (!capable(CAP_SYS_ADMIN))
 			return -EACCES;
 		bdi = blk_get_backing_dev_info(bdev);
-		bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
+		bdi->ra_pages = (arg * 512) / PAGE_SIZE;
 		return 0;
 	case BLKGETSIZE:
 		size = i_size_read(bdev->bd_inode);
diff --git a/block/ioctl.c b/block/ioctl.c
index d8996bbd7f12..4ff1f92f89ca 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -550,7 +550,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		if (!arg)
 			return -EINVAL;
 		bdi = blk_get_backing_dev_info(bdev);
-		return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512);
+		return put_long(arg, (bdi->ra_pages * PAGE_SIZE) / 512);
 	case BLKROGET:
 		return put_int(arg, bdev_read_only(bdev) != 0);
 	case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
@@ -578,7 +578,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		if(!capable(CAP_SYS_ADMIN))
 			return -EACCES;
 		bdi = blk_get_backing_dev_info(bdev);
-		bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE;
+		bdi->ra_pages = (arg * 512) / PAGE_SIZE;
 		return 0;
 	case BLKBSZSET:
 		return blkdev_bszset(bdev, mode, argp);
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 5d8701941054..2c6ae2aed2c4 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -566,8 +566,8 @@ static struct page *read_pagecache_sector(struct block_device *bdev, sector_t n)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-	return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_CACHE_SHIFT-9)),
-			NULL);
+	return read_mapping_page(mapping, (pgoff_t)(n >> (PAGE_SHIFT-9)),
+				 NULL);
 }
 
 unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
@@ -584,9 +584,9 @@ unsigned char *read_dev_sector(struct block_device *bdev, sector_t n, Sector *p)
 		if (PageError(page))
 			goto fail;
 		p->v = page;
-		return (unsigned char *)page_address(page) +  ((n & ((1 << (PAGE_CACHE_SHIFT - 9)) - 1)) << 9);
+		return (unsigned char *)page_address(page) +  ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << 9);
 fail:
-		page_cache_release(page);
+		put_page(page);
 	}
 	p->v = NULL;
 	return NULL;
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index dd73e1ff1759..ec9d8610b25f 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -397,7 +397,7 @@ aoeblk_gdalloc(void *vp)
 	WARN_ON(d->flags & DEVFL_UP);
 	blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS);
 	q->backing_dev_info.name = "aoe";
-	q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
+	q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_SIZE;
 	d->bufpool = mp;
 	d->blkq = gd->queue = q;
 	q->queuedata = d;
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index f7ecc287d733..51a071e32221 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -374,7 +374,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 		       struct page *page, int rw)
 {
 	struct brd_device *brd = bdev->bd_disk->private_data;
-	int err = brd_do_bvec(brd, page, PAGE_CACHE_SIZE, 0, rw, sector);
+	int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, rw, sector);
 	page_endio(page, rw & WRITE, err);
 	return err;
 }
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 226eb0c9f0fb..1fd1dccebb6b 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1178,7 +1178,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
 	blk_queue_max_hw_sectors(q, max_hw_sectors);
 	/* This is the workaround for "bio would need to, but cannot, be split" */
 	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
-	blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
+	blk_queue_segment_boundary(q, PAGE_SIZE-1);
 
 	if (b) {
 		struct drbd_connection *connection = first_peer_device(device)->connection;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index ab34190859a8..c018b021707c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -616,7 +616,7 @@ static void amdgpu_ttm_tt_unpin_userptr(struct ttm_tt *ttm)
 			set_page_dirty(page);
 
 		mark_page_accessed(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 
 	sg_free_table(ttm->sg);
diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
index 6e731db31aa4..aca7f9cc6109 100644
--- a/drivers/gpu/drm/armada/armada_gem.c
+++ b/drivers/gpu/drm/armada/armada_gem.c
@@ -481,7 +481,7 @@ armada_gem_prime_map_dma_buf(struct dma_buf_attachment *attach,
 
  release:
 	for_each_sg(sgt->sgl, sg, num, i)
-		page_cache_release(sg_page(sg));
+		put_page(sg_page(sg));
  free_table:
 	sg_free_table(sgt);
  free_sgt:
@@ -502,7 +502,7 @@ static void armada_gem_prime_unmap_dma_buf(struct dma_buf_attachment *attach,
 	if (dobj->obj.filp) {
 		struct scatterlist *sg;
 		for_each_sg(sgt->sgl, sg, sgt->nents, i)
-			page_cache_release(sg_page(sg));
+			put_page(sg_page(sg));
 	}
 
 	sg_free_table(sgt);
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 2e8c77e71e1f..da0c5320789f 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -534,7 +534,7 @@ struct page **drm_gem_get_pages(struct drm_gem_object *obj)
 
 fail:
 	while (i--)
-		page_cache_release(pages[i]);
+		put_page(pages[i]);
 
 	drm_free_large(pages);
 	return ERR_CAST(p);
@@ -569,7 +569,7 @@ void drm_gem_put_pages(struct drm_gem_object *obj, struct page **pages,
 			mark_page_accessed(pages[i]);
 
 		/* Undo the reference we took when populating the table */
-		page_cache_release(pages[i]);
+		put_page(pages[i]);
 	}
 
 	drm_free_large(pages);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 3d31d3ac589e..dabc08987b5e 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -177,7 +177,7 @@ i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
 		drm_clflush_virt_range(vaddr, PAGE_SIZE);
 		kunmap_atomic(src);
 
-		page_cache_release(page);
+		put_page(page);
 		vaddr += PAGE_SIZE;
 	}
 
@@ -243,7 +243,7 @@ i915_gem_object_put_pages_phys(struct drm_i915_gem_object *obj)
 			set_page_dirty(page);
 			if (obj->madv == I915_MADV_WILLNEED)
 				mark_page_accessed(page);
-			page_cache_release(page);
+			put_page(page);
 			vaddr += PAGE_SIZE;
 		}
 		obj->dirty = 0;
@@ -2206,7 +2206,7 @@ i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj)
 		if (obj->madv == I915_MADV_WILLNEED)
 			mark_page_accessed(page);
 
-		page_cache_release(page);
+		put_page(page);
 	}
 	obj->dirty = 0;
 
@@ -2346,7 +2346,7 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj)
 err_pages:
 	sg_mark_end(sg);
 	for_each_sg_page(st->sgl, &sg_iter, st->nents, 0)
-		page_cache_release(sg_page_iter_page(&sg_iter));
+		put_page(sg_page_iter_page(&sg_iter));
 	sg_free_table(st);
 	kfree(st);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 6be40f3ba2c7..18ba8139e922 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -683,7 +683,7 @@ i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj)
 			set_page_dirty(page);
 
 		mark_page_accessed(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	obj->dirty = 0;
 
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index 6d8c32377c6f..0deb7f047be0 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -609,7 +609,7 @@ static void radeon_ttm_tt_unpin_userptr(struct ttm_tt *ttm)
 			set_page_dirty(page);
 
 		mark_page_accessed(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 
 	sg_free_table(ttm->sg);
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 4e19d0f9cc30..077ae9b2865d 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -311,7 +311,7 @@ int ttm_tt_swapin(struct ttm_tt *ttm)
 			goto out_err;
 
 		copy_highpage(to_page, from_page);
-		page_cache_release(from_page);
+		put_page(from_page);
 	}
 
 	if (!(ttm->page_flags & TTM_PAGE_FLAG_PERSISTENT_SWAP))
@@ -361,7 +361,7 @@ int ttm_tt_swapout(struct ttm_tt *ttm, struct file *persistent_swap_storage)
 		copy_highpage(to_page, from_page);
 		set_page_dirty(to_page);
 		mark_page_accessed(to_page);
-		page_cache_release(to_page);
+		put_page(to_page);
 	}
 
 	ttm_tt_unpopulate(ttm);
diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c
index e797dfc07ae3..7e2a12c4fed2 100644
--- a/drivers/gpu/drm/via/via_dmablit.c
+++ b/drivers/gpu/drm/via/via_dmablit.c
@@ -188,7 +188,7 @@ via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg)
 			if (NULL != (page = vsg->pages[i])) {
 				if (!PageReserved(page) && (DMA_FROM_DEVICE == vsg->direction))
 					SetPageDirty(page);
-				page_cache_release(page);
+				put_page(page);
 			}
 		}
 	case dr_via_pages_alloc:
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 7df6b4f1548a..bef71751aade 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -322,7 +322,7 @@ __clear_page_buffers(struct page *page)
 {
 	ClearPagePrivate(page);
 	set_page_private(page, 0);
-	page_cache_release(page);
+	put_page(page);
 }
 static void free_buffers(struct page *page)
 {
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
index df4c052c6bd6..f300f060b3f3 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -349,7 +349,7 @@ int videobuf_dma_free(struct videobuf_dmabuf *dma)
 
 	if (dma->pages) {
 		for (i = 0; i < dma->nr_pages; i++)
-			page_cache_release(dma->pages[i]);
+			put_page(dma->pages[i]);
 		kfree(dma->pages);
 		dma->pages = NULL;
 	}
diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index e8b933111e0d..9c677f3f3c26 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -116,8 +116,8 @@ static int ibmasmfs_fill_super (struct super_block *sb, void *data, int silent)
 {
 	struct inode *root;
 
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = IBMASMFS_MAGIC;
 	sb->s_op = &ibmasmfs_s_ops;
 	sb->s_time_gran = 1;
diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c
index f42d9c4e4561..f84a4275ca29 100644
--- a/drivers/misc/vmw_vmci/vmci_queue_pair.c
+++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c
@@ -728,7 +728,7 @@ static void qp_release_pages(struct page **pages,
 		if (dirty)
 			set_page_dirty(pages[i]);
 
-		page_cache_release(pages[i]);
+		put_page(pages[i]);
 		pages[i] = NULL;
 	}
 }
diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 1d94607611d8..6e4c55a4aab5 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -356,11 +356,11 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev)
 	 * They have to set these according to their abilities.
 	 */
 	host->max_segs = 1;
-	host->max_seg_size = PAGE_CACHE_SIZE;
+	host->max_seg_size = PAGE_SIZE;
 
-	host->max_req_size = PAGE_CACHE_SIZE;
+	host->max_req_size = PAGE_SIZE;
 	host->max_blk_size = 512;
-	host->max_blk_count = PAGE_CACHE_SIZE / 512;
+	host->max_blk_count = PAGE_SIZE / 512;
 
 	return host;
 }
diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c
index 8d870ce9f944..d9a655f47d41 100644
--- a/drivers/mmc/host/sh_mmcif.c
+++ b/drivers/mmc/host/sh_mmcif.c
@@ -1513,7 +1513,7 @@ static int sh_mmcif_probe(struct platform_device *pdev)
 		mmc->caps |= pd->caps;
 	mmc->max_segs = 32;
 	mmc->max_blk_size = 512;
-	mmc->max_req_size = PAGE_CACHE_SIZE * mmc->max_segs;
+	mmc->max_req_size = PAGE_SIZE * mmc->max_segs;
 	mmc->max_blk_count = mmc->max_req_size / mmc->max_blk_size;
 	mmc->max_seg_size = mmc->max_req_size;
 
diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c
index 675435873823..7fb0c034dcb6 100644
--- a/drivers/mmc/host/tmio_mmc_dma.c
+++ b/drivers/mmc/host/tmio_mmc_dma.c
@@ -63,7 +63,7 @@ static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host)
 		}
 	}
 
-	if ((!aligned && (host->sg_len > 1 || sg->length > PAGE_CACHE_SIZE ||
+	if ((!aligned && (host->sg_len > 1 || sg->length > PAGE_SIZE ||
 			  (align & PAGE_MASK))) || !multiple) {
 		ret = -EINVAL;
 		goto pio;
@@ -133,7 +133,7 @@ static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host)
 		}
 	}
 
-	if ((!aligned && (host->sg_len > 1 || sg->length > PAGE_CACHE_SIZE ||
+	if ((!aligned && (host->sg_len > 1 || sg->length > PAGE_SIZE ||
 			  (align & PAGE_MASK))) || !multiple) {
 		ret = -EINVAL;
 		goto pio;
diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c
index 03f6e74c1906..0521b4662748 100644
--- a/drivers/mmc/host/tmio_mmc_pio.c
+++ b/drivers/mmc/host/tmio_mmc_pio.c
@@ -1125,7 +1125,7 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host,
 	mmc->caps2 |= pdata->capabilities2;
 	mmc->max_segs = 32;
 	mmc->max_blk_size = 512;
-	mmc->max_blk_count = (PAGE_CACHE_SIZE / mmc->max_blk_size) *
+	mmc->max_blk_count = (PAGE_SIZE / mmc->max_blk_size) *
 		mmc->max_segs;
 	mmc->max_req_size = mmc->max_blk_size * mmc->max_blk_count;
 	mmc->max_seg_size = mmc->max_req_size;
diff --git a/drivers/mmc/host/usdhi6rol0.c b/drivers/mmc/host/usdhi6rol0.c
index b2752fe711f2..807c06e203c3 100644
--- a/drivers/mmc/host/usdhi6rol0.c
+++ b/drivers/mmc/host/usdhi6rol0.c
@@ -1789,7 +1789,7 @@ static int usdhi6_probe(struct platform_device *pdev)
 	/* Set .max_segs to some random number. Feel free to adjust. */
 	mmc->max_segs = 32;
 	mmc->max_blk_size = 512;
-	mmc->max_req_size = PAGE_CACHE_SIZE * mmc->max_segs;
+	mmc->max_req_size = PAGE_SIZE * mmc->max_segs;
 	mmc->max_blk_count = mmc->max_req_size / mmc->max_blk_size;
 	/*
 	 * Setting .max_seg_size to 1 page would simplify our page-mapping code,
diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index e2c0057737e6..7c887f111a7d 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -75,7 +75,7 @@ static int _block2mtd_erase(struct block2mtd_dev *dev, loff_t to, size_t len)
 				break;
 			}
 
-		page_cache_release(page);
+		put_page(page);
 		pages--;
 		index++;
 	}
@@ -124,7 +124,7 @@ static int block2mtd_read(struct mtd_info *mtd, loff_t from, size_t len,
 			return PTR_ERR(page);
 
 		memcpy(buf, page_address(page) + offset, cpylen);
-		page_cache_release(page);
+		put_page(page);
 
 		if (retlen)
 			*retlen += cpylen;
@@ -164,7 +164,7 @@ static int _block2mtd_write(struct block2mtd_dev *dev, const u_char *buf,
 			unlock_page(page);
 			balance_dirty_pages_ratelimited(mapping);
 		}
-		page_cache_release(page);
+		put_page(page);
 
 		if (retlen)
 			*retlen += cpylen;
diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
index 1fd519503bb1..a58169a28741 100644
--- a/drivers/mtd/nand/nandsim.c
+++ b/drivers/mtd/nand/nandsim.c
@@ -1339,7 +1339,7 @@ static void put_pages(struct nandsim *ns)
 	int i;
 
 	for (i = 0; i < ns->held_cnt; i++)
-		page_cache_release(ns->held_pages[i]);
+		put_page(ns->held_pages[i]);
 }
 
 /* Get page cache pages in advance to provide NOFS memory allocation */
@@ -1349,8 +1349,8 @@ static int get_pages(struct nandsim *ns, struct file *file, size_t count, loff_t
 	struct page *page;
 	struct address_space *mapping = file->f_mapping;
 
-	start_index = pos >> PAGE_CACHE_SHIFT;
-	end_index = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+	start_index = pos >> PAGE_SHIFT;
+	end_index = (pos + count - 1) >> PAGE_SHIFT;
 	if (end_index - start_index + 1 > NS_MAX_HELD_PAGES)
 		return -EINVAL;
 	ns->held_cnt = 0;
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index c32cbb593600..f068b6513cd2 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1204,7 +1204,7 @@ static int btt_rw_page(struct block_device *bdev, sector_t sector,
 {
 	struct btt *btt = bdev->bd_disk->private_data;
 
-	btt_do_bvec(btt, NULL, page, PAGE_CACHE_SIZE, 0, rw, sector);
+	btt_do_bvec(btt, NULL, page, PAGE_SIZE, 0, rw, sector);
 	page_endio(page, rw & WRITE, 0);
 	return 0;
 }
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index cc31c6f1f88e..12c86fa80c5f 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -151,7 +151,7 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector,
 	struct pmem_device *pmem = bdev->bd_disk->private_data;
 	int rc;
 
-	rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
+	rc = pmem_do_bvec(pmem, page, PAGE_SIZE, 0, rw, sector);
 	if (rw & WRITE)
 		wmb_pmem();
 
diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c
index b48ac6300c79..a0e5260bd006 100644
--- a/drivers/oprofile/oprofilefs.c
+++ b/drivers/oprofile/oprofilefs.c
@@ -239,8 +239,8 @@ static int oprofilefs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct inode *root_inode;
 
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = OPROFILEFS_MAGIC;
 	sb->s_op = &s_ops;
 	sb->s_time_gran = 1;
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 5a5457ac9cdb..1bd0753f678a 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -2891,7 +2891,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
 	if (sdkp->opt_xfer_blocks &&
 	    sdkp->opt_xfer_blocks <= dev_max &&
 	    sdkp->opt_xfer_blocks <= SD_DEF_XFER_BLOCKS &&
-	    sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_CACHE_SIZE)
+	    sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE)
 		rw_max = q->limits.io_opt =
 			sdkp->opt_xfer_blocks * sdp->sector_size;
 	else
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 71c5138ddf94..dbf1882cfbac 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -4941,7 +4941,7 @@ static int sgl_map_user_pages(struct st_buffer *STbp,
  out_unmap:
 	if (res > 0) {
 		for (j=0; j < res; j++)
-			page_cache_release(pages[j]);
+			put_page(pages[j]);
 		res = 0;
 	}
 	kfree(pages);
@@ -4963,7 +4963,7 @@ static int sgl_unmap_user_pages(struct st_buffer *STbp,
 		/* FIXME: cache flush missing for rw==READ
 		 * FIXME: call the correct reference counting function
 		 */
-		page_cache_release(page);
+		put_page(page);
 	}
 	kfree(STbp->mapped_pages);
 	STbp->mapped_pages = NULL;
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
index dab486261154..13335437c69c 100644
--- a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
@@ -88,7 +88,7 @@ do {								    \
 } while (0)
 
 #ifndef LIBCFS_VMALLOC_SIZE
-#define LIBCFS_VMALLOC_SIZE	(2 << PAGE_CACHE_SHIFT) /* 2 pages */
+#define LIBCFS_VMALLOC_SIZE	(2 << PAGE_SHIFT) /* 2 pages */
 #endif
 
 #define LIBCFS_ALLOC_PRE(size, mask)					    \
diff --git a/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h b/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h
index 0f2fd79e5ec8..837eb22749c3 100644
--- a/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h
+++ b/drivers/staging/lustre/include/linux/libcfs/linux/linux-mem.h
@@ -57,7 +57,7 @@
 #include "../libcfs_cpu.h"
 #endif
 
-#define CFS_PAGE_MASK		   (~((__u64)PAGE_CACHE_SIZE-1))
+#define CFS_PAGE_MASK		   (~((__u64)PAGE_SIZE-1))
 #define page_index(p)       ((p)->index)
 
 #define memory_pressure_get() (current->flags & PF_MEMALLOC)
@@ -67,7 +67,7 @@
 #if BITS_PER_LONG == 32
 /* limit to lowmem on 32-bit systems */
 #define NUM_CACHEPAGES \
-	min(totalram_pages, 1UL << (30 - PAGE_CACHE_SHIFT) * 3 / 4)
+	min(totalram_pages, 1UL << (30 - PAGE_SHIFT) * 3 / 4)
 #else
 #define NUM_CACHEPAGES totalram_pages
 #endif
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c
index 3e1f24e77f64..d4ce06d0aeeb 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c
@@ -291,7 +291,7 @@ ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,
 
 	for (nob = i = 0; i < niov; i++) {
 		if ((kiov[i].kiov_offset && i > 0) ||
-		    (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE && i < niov - 1))
+		    (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_SIZE && i < niov - 1))
 			return NULL;
 
 		pages[i] = kiov[i].kiov_page;
diff --git a/drivers/staging/lustre/lnet/libcfs/debug.c b/drivers/staging/lustre/lnet/libcfs/debug.c
index c90e5102fe06..c3d628bac5b8 100644
--- a/drivers/staging/lustre/lnet/libcfs/debug.c
+++ b/drivers/staging/lustre/lnet/libcfs/debug.c
@@ -517,7 +517,7 @@ int libcfs_debug_init(unsigned long bufsize)
 		max = TCD_MAX_PAGES;
 	} else {
 		max = max / num_possible_cpus();
-		max <<= (20 - PAGE_CACHE_SHIFT);
+		max <<= (20 - PAGE_SHIFT);
 	}
 	rc = cfs_tracefile_init(max);
 
diff --git a/drivers/staging/lustre/lnet/libcfs/tracefile.c b/drivers/staging/lustre/lnet/libcfs/tracefile.c
index ec3bc04bd89f..244eb89eef68 100644
--- a/drivers/staging/lustre/lnet/libcfs/tracefile.c
+++ b/drivers/staging/lustre/lnet/libcfs/tracefile.c
@@ -182,7 +182,7 @@ cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len)
 	if (tcd->tcd_cur_pages > 0) {
 		__LASSERT(!list_empty(&tcd->tcd_pages));
 		tage = cfs_tage_from_list(tcd->tcd_pages.prev);
-		if (tage->used + len <= PAGE_CACHE_SIZE)
+		if (tage->used + len <= PAGE_SIZE)
 			return tage;
 	}
 
@@ -260,7 +260,7 @@ static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd,
 	 * from here: this will lead to infinite recursion.
 	 */
 
-	if (len > PAGE_CACHE_SIZE) {
+	if (len > PAGE_SIZE) {
 		pr_err("cowardly refusing to write %lu bytes in a page\n", len);
 		return NULL;
 	}
@@ -349,7 +349,7 @@ int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata,
 	for (i = 0; i < 2; i++) {
 		tage = cfs_trace_get_tage(tcd, needed + known_size + 1);
 		if (!tage) {
-			if (needed + known_size > PAGE_CACHE_SIZE)
+			if (needed + known_size > PAGE_SIZE)
 				mask |= D_ERROR;
 
 			cfs_trace_put_tcd(tcd);
@@ -360,7 +360,7 @@ int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata,
 		string_buf = (char *)page_address(tage->page) +
 					tage->used + known_size;
 
-		max_nob = PAGE_CACHE_SIZE - tage->used - known_size;
+		max_nob = PAGE_SIZE - tage->used - known_size;
 		if (max_nob <= 0) {
 			printk(KERN_EMERG "negative max_nob: %d\n",
 			       max_nob);
@@ -424,7 +424,7 @@ int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata,
 	__LASSERT(debug_buf == string_buf);
 
 	tage->used += needed;
-	__LASSERT(tage->used <= PAGE_CACHE_SIZE);
+	__LASSERT(tage->used <= PAGE_SIZE);
 
 console:
 	if ((mask & libcfs_printk) == 0) {
@@ -835,7 +835,7 @@ EXPORT_SYMBOL(cfs_trace_copyout_string);
 
 int cfs_trace_allocate_string_buffer(char **str, int nob)
 {
-	if (nob > 2 * PAGE_CACHE_SIZE)	    /* string must be "sensible" */
+	if (nob > 2 * PAGE_SIZE)	    /* string must be "sensible" */
 		return -EINVAL;
 
 	*str = kmalloc(nob, GFP_KERNEL | __GFP_ZERO);
@@ -951,7 +951,7 @@ int cfs_trace_set_debug_mb(int mb)
 	}
 
 	mb /= num_possible_cpus();
-	pages = mb << (20 - PAGE_CACHE_SHIFT);
+	pages = mb << (20 - PAGE_SHIFT);
 
 	cfs_tracefile_write_lock();
 
@@ -977,7 +977,7 @@ int cfs_trace_get_debug_mb(void)
 
 	cfs_tracefile_read_unlock();
 
-	return (total_pages >> (20 - PAGE_CACHE_SHIFT)) + 1;
+	return (total_pages >> (20 - PAGE_SHIFT)) + 1;
 }
 
 static int tracefiled(void *arg)
diff --git a/drivers/staging/lustre/lnet/libcfs/tracefile.h b/drivers/staging/lustre/lnet/libcfs/tracefile.h
index 4c77f9044dd3..ac84e7f4c859 100644
--- a/drivers/staging/lustre/lnet/libcfs/tracefile.h
+++ b/drivers/staging/lustre/lnet/libcfs/tracefile.h
@@ -87,7 +87,7 @@ void libcfs_unregister_panic_notifier(void);
 extern int  libcfs_panic_in_progress;
 int cfs_trace_max_debug_mb(void);
 
-#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT))
+#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT))
 #define TCD_STOCK_PAGES (TCD_MAX_PAGES)
 #define CFS_TRACEFILE_SIZE (500 << 20)
 
@@ -96,7 +96,7 @@ int cfs_trace_max_debug_mb(void);
 /*
  * Private declare for tracefile
  */
-#define TCD_MAX_PAGES (5 << (20 - PAGE_CACHE_SHIFT))
+#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT))
 #define TCD_STOCK_PAGES (TCD_MAX_PAGES)
 
 #define CFS_TRACEFILE_SIZE (500 << 20)
@@ -257,7 +257,7 @@ do {								    \
 do {								    \
 	__LASSERT(tage);					\
 	__LASSERT(tage->page);				  \
-	__LASSERT(tage->used <= PAGE_CACHE_SIZE);			 \
+	__LASSERT(tage->used <= PAGE_SIZE);			 \
 	__LASSERT(page_count(tage->page) > 0);		      \
 } while (0)
 
diff --git a/drivers/staging/lustre/lnet/lnet/lib-md.c b/drivers/staging/lustre/lnet/lnet/lib-md.c
index c74514f99f90..75d31217bf92 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-md.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-md.c
@@ -139,7 +139,7 @@ lnet_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink)
 		for (i = 0; i < (int)niov; i++) {
 			/* We take the page pointer on trust */
 			if (lmd->md_iov.kiov[i].kiov_offset +
-			    lmd->md_iov.kiov[i].kiov_len > PAGE_CACHE_SIZE)
+			    lmd->md_iov.kiov[i].kiov_len > PAGE_SIZE)
 				return -EINVAL; /* invalid length */
 
 			total_length += lmd->md_iov.kiov[i].kiov_len;
diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c
index 0009a8de77d5..f19aa9320e34 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-move.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-move.c
@@ -549,12 +549,12 @@ lnet_extract_kiov(int dst_niov, lnet_kiov_t *dst,
 		if (len <= frag_len) {
 			dst->kiov_len = len;
 			LASSERT(dst->kiov_offset + dst->kiov_len
-					<= PAGE_CACHE_SIZE);
+					<= PAGE_SIZE);
 			return niov;
 		}
 
 		dst->kiov_len = frag_len;
-		LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_CACHE_SIZE);
+		LASSERT(dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
 
 		len -= frag_len;
 		dst++;
@@ -887,7 +887,7 @@ lnet_msg2bufpool(lnet_msg_t *msg)
 	rbp = &the_lnet.ln_rtrpools[cpt][0];
 
 	LASSERT(msg->msg_len <= LNET_MTU);
-	while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_CACHE_SIZE) {
+	while (msg->msg_len > (unsigned int)rbp->rbp_npages * PAGE_SIZE) {
 		rbp++;
 		LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]);
 	}
diff --git a/drivers/staging/lustre/lnet/lnet/lib-socket.c b/drivers/staging/lustre/lnet/lnet/lib-socket.c
index cc0c2753dd63..891fd59401d7 100644
--- a/drivers/staging/lustre/lnet/lnet/lib-socket.c
+++ b/drivers/staging/lustre/lnet/lnet/lib-socket.c
@@ -166,9 +166,9 @@ lnet_ipif_enumerate(char ***namesp)
 	nalloc = 16;	/* first guess at max interfaces */
 	toobig = 0;
 	for (;;) {
-		if (nalloc * sizeof(*ifr) > PAGE_CACHE_SIZE) {
+		if (nalloc * sizeof(*ifr) > PAGE_SIZE) {
 			toobig = 1;
-			nalloc = PAGE_CACHE_SIZE / sizeof(*ifr);
+			nalloc = PAGE_SIZE / sizeof(*ifr);
 			CWARN("Too many interfaces: only enumerating first %d\n",
 			      nalloc);
 		}
diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c
index 61459cf9d58f..b01dc424c514 100644
--- a/drivers/staging/lustre/lnet/lnet/router.c
+++ b/drivers/staging/lustre/lnet/lnet/router.c
@@ -27,8 +27,8 @@
 #define LNET_NRB_SMALL_PAGES	1
 #define LNET_NRB_LARGE_MIN	256	/* min value for each CPT */
 #define LNET_NRB_LARGE		(LNET_NRB_LARGE_MIN * 4)
-#define LNET_NRB_LARGE_PAGES   ((LNET_MTU + PAGE_CACHE_SIZE - 1) >> \
-				 PAGE_CACHE_SHIFT)
+#define LNET_NRB_LARGE_PAGES   ((LNET_MTU + PAGE_SIZE - 1) >> \
+				 PAGE_SHIFT)
 
 static char *forwarding = "";
 module_param(forwarding, charp, 0444);
@@ -1338,7 +1338,7 @@ lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp, int cpt)
 			return NULL;
 		}
 
-		rb->rb_kiov[i].kiov_len = PAGE_CACHE_SIZE;
+		rb->rb_kiov[i].kiov_len = PAGE_SIZE;
 		rb->rb_kiov[i].kiov_offset = 0;
 		rb->rb_kiov[i].kiov_page = page;
 	}
diff --git a/drivers/staging/lustre/lnet/selftest/brw_test.c b/drivers/staging/lustre/lnet/selftest/brw_test.c
index eebc92412061..dcb6e506f592 100644
--- a/drivers/staging/lustre/lnet/selftest/brw_test.c
+++ b/drivers/staging/lustre/lnet/selftest/brw_test.c
@@ -90,7 +90,7 @@ brw_client_init(sfw_test_instance_t *tsi)
 		 * NB: this is not going to work for variable page size,
 		 * but we have to keep it for compatibility
 		 */
-		len = npg * PAGE_CACHE_SIZE;
+		len = npg * PAGE_SIZE;
 
 	} else {
 		test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1;
@@ -104,7 +104,7 @@ brw_client_init(sfw_test_instance_t *tsi)
 		opc = breq->blk_opc;
 		flags = breq->blk_flags;
 		len = breq->blk_len;
-		npg = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		npg = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	}
 
 	if (npg > LNET_MAX_IOV || npg <= 0)
@@ -167,13 +167,13 @@ brw_fill_page(struct page *pg, int pattern, __u64 magic)
 
 	if (pattern == LST_BRW_CHECK_SIMPLE) {
 		memcpy(addr, &magic, BRW_MSIZE);
-		addr += PAGE_CACHE_SIZE - BRW_MSIZE;
+		addr += PAGE_SIZE - BRW_MSIZE;
 		memcpy(addr, &magic, BRW_MSIZE);
 		return;
 	}
 
 	if (pattern == LST_BRW_CHECK_FULL) {
-		for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++)
+		for (i = 0; i < PAGE_SIZE / BRW_MSIZE; i++)
 			memcpy(addr + i * BRW_MSIZE, &magic, BRW_MSIZE);
 		return;
 	}
@@ -198,7 +198,7 @@ brw_check_page(struct page *pg, int pattern, __u64 magic)
 		if (data != magic)
 			goto bad_data;
 
-		addr += PAGE_CACHE_SIZE - BRW_MSIZE;
+		addr += PAGE_SIZE - BRW_MSIZE;
 		data = *((__u64 *)addr);
 		if (data != magic)
 			goto bad_data;
@@ -207,7 +207,7 @@ brw_check_page(struct page *pg, int pattern, __u64 magic)
 	}
 
 	if (pattern == LST_BRW_CHECK_FULL) {
-		for (i = 0; i < PAGE_CACHE_SIZE / BRW_MSIZE; i++) {
+		for (i = 0; i < PAGE_SIZE / BRW_MSIZE; i++) {
 			data = *(((__u64 *)addr) + i);
 			if (data != magic)
 				goto bad_data;
@@ -278,7 +278,7 @@ brw_client_prep_rpc(sfw_test_unit_t *tsu,
 		opc = breq->blk_opc;
 		flags = breq->blk_flags;
 		npg = breq->blk_npg;
-		len = npg * PAGE_CACHE_SIZE;
+		len = npg * PAGE_SIZE;
 
 	} else {
 		test_bulk_req_v1_t *breq = &tsi->tsi_u.bulk_v1;
@@ -292,7 +292,7 @@ brw_client_prep_rpc(sfw_test_unit_t *tsu,
 		opc = breq->blk_opc;
 		flags = breq->blk_flags;
 		len = breq->blk_len;
-		npg = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		npg = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	}
 
 	rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc);
@@ -463,10 +463,10 @@ brw_server_handle(struct srpc_server_rpc *rpc)
 			reply->brw_status = EINVAL;
 			return 0;
 		}
-		npg = reqst->brw_len >> PAGE_CACHE_SHIFT;
+		npg = reqst->brw_len >> PAGE_SHIFT;
 
 	} else {
-		npg = (reqst->brw_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		npg = (reqst->brw_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	}
 
 	replymsg->msg_ses_feats = reqstmsg->msg_ses_feats;
diff --git a/drivers/staging/lustre/lnet/selftest/conctl.c b/drivers/staging/lustre/lnet/selftest/conctl.c
index 5c7cb72eac9a..79ee6c0bf7c1 100644
--- a/drivers/staging/lustre/lnet/selftest/conctl.c
+++ b/drivers/staging/lustre/lnet/selftest/conctl.c
@@ -743,7 +743,7 @@ static int lst_test_add_ioctl(lstio_test_args_t *args)
 	if (args->lstio_tes_param &&
 	    (args->lstio_tes_param_len <= 0 ||
 	     args->lstio_tes_param_len >
-	     PAGE_CACHE_SIZE - sizeof(lstcon_test_t)))
+	     PAGE_SIZE - sizeof(lstcon_test_t)))
 		return -EINVAL;
 
 	LIBCFS_ALLOC(batch_name, args->lstio_tes_bat_nmlen + 1);
@@ -819,7 +819,7 @@ lstcon_ioctl_entry(unsigned int cmd, struct libcfs_ioctl_hdr *hdr)
 
 	opc = data->ioc_u32[0];
 
-	if (data->ioc_plen1 > PAGE_CACHE_SIZE)
+	if (data->ioc_plen1 > PAGE_SIZE)
 		return -EINVAL;
 
 	LIBCFS_ALLOC(buf, data->ioc_plen1);
diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.c b/drivers/staging/lustre/lnet/selftest/conrpc.c
index bcd78888f9cc..35a227d0c657 100644
--- a/drivers/staging/lustre/lnet/selftest/conrpc.c
+++ b/drivers/staging/lustre/lnet/selftest/conrpc.c
@@ -786,8 +786,8 @@ lstcon_bulkrpc_v0_prep(lst_test_bulk_param_t *param, srpc_test_reqst_t *req)
 	test_bulk_req_t *brq = &req->tsr_u.bulk_v0;
 
 	brq->blk_opc = param->blk_opc;
-	brq->blk_npg = (param->blk_size + PAGE_CACHE_SIZE - 1) /
-			PAGE_CACHE_SIZE;
+	brq->blk_npg = (param->blk_size + PAGE_SIZE - 1) /
+			PAGE_SIZE;
 	brq->blk_flags = param->blk_flags;
 
 	return 0;
@@ -822,7 +822,7 @@ lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
 	if (transop == LST_TRANS_TSBCLIADD) {
 		npg = sfw_id_pages(test->tes_span);
 		nob = !(feats & LST_FEAT_BULK_LEN) ?
-		      npg * PAGE_CACHE_SIZE :
+		      npg * PAGE_SIZE :
 		      sizeof(lnet_process_id_packed_t) * test->tes_span;
 	}
 
@@ -851,8 +851,8 @@ lstcon_testrpc_prep(lstcon_node_t *nd, int transop, unsigned feats,
 			LASSERT(nob > 0);
 
 			len = !(feats & LST_FEAT_BULK_LEN) ?
-			      PAGE_CACHE_SIZE :
-			      min_t(int, nob, PAGE_CACHE_SIZE);
+			      PAGE_SIZE :
+			      min_t(int, nob, PAGE_SIZE);
 			nob -= len;
 
 			bulk->bk_iovs[i].kiov_offset = 0;
diff --git a/drivers/staging/lustre/lnet/selftest/framework.c b/drivers/staging/lustre/lnet/selftest/framework.c
index 926c3970c498..e2c532399366 100644
--- a/drivers/staging/lustre/lnet/selftest/framework.c
+++ b/drivers/staging/lustre/lnet/selftest/framework.c
@@ -1161,7 +1161,7 @@ sfw_add_test(struct srpc_server_rpc *rpc)
 		int len;
 
 		if (!(sn->sn_features & LST_FEAT_BULK_LEN)) {
-			len = npg * PAGE_CACHE_SIZE;
+			len = npg * PAGE_SIZE;
 
 		} else {
 			len = sizeof(lnet_process_id_packed_t) *
diff --git a/drivers/staging/lustre/lnet/selftest/rpc.c b/drivers/staging/lustre/lnet/selftest/rpc.c
index 69be7d6f48fa..7d7748d96332 100644
--- a/drivers/staging/lustre/lnet/selftest/rpc.c
+++ b/drivers/staging/lustre/lnet/selftest/rpc.c
@@ -90,7 +90,7 @@ void srpc_set_counters(const srpc_counters_t *cnt)
 static int
 srpc_add_bulk_page(srpc_bulk_t *bk, struct page *pg, int i, int nob)
 {
-	nob = min_t(int, nob, PAGE_CACHE_SIZE);
+	nob = min_t(int, nob, PAGE_SIZE);
 
 	LASSERT(nob > 0);
 	LASSERT(i >= 0 && i < bk->bk_niov);
diff --git a/drivers/staging/lustre/lnet/selftest/selftest.h b/drivers/staging/lustre/lnet/selftest/selftest.h
index 288522d4d7b9..5321ddec9580 100644
--- a/drivers/staging/lustre/lnet/selftest/selftest.h
+++ b/drivers/staging/lustre/lnet/selftest/selftest.h
@@ -393,7 +393,7 @@ typedef struct sfw_test_instance {
 /* XXX: trailing (PAGE_CACHE_SIZE % sizeof(lnet_process_id_t)) bytes at
  * the end of pages are not used */
 #define SFW_MAX_CONCUR	   LST_MAX_CONCUR
-#define SFW_ID_PER_PAGE    (PAGE_CACHE_SIZE / sizeof(lnet_process_id_packed_t))
+#define SFW_ID_PER_PAGE    (PAGE_SIZE / sizeof(lnet_process_id_packed_t))
 #define SFW_MAX_NDESTS	   (LNET_MAX_IOV * SFW_ID_PER_PAGE)
 #define sfw_id_pages(n)    (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE)
 
diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
index 33e0b99e1fb4..c6c7f54637fb 100644
--- a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
+++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
@@ -52,7 +52,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 		return;
 
 	if (PagePrivate(page))
-		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+		page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
 
 	cancel_dirty_page(page);
 	ClearPageMappedToDisk(page);
diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
index da8bc6eadd13..1e2ebe5cc998 100644
--- a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
+++ b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h
@@ -1031,7 +1031,7 @@ static inline int lu_dirent_size(struct lu_dirent *ent)
 #define LU_PAGE_SIZE  (1UL << LU_PAGE_SHIFT)
 #define LU_PAGE_MASK  (~(LU_PAGE_SIZE - 1))
 
-#define LU_PAGE_COUNT (1 << (PAGE_CACHE_SHIFT - LU_PAGE_SHIFT))
+#define LU_PAGE_COUNT (1 << (PAGE_SHIFT - LU_PAGE_SHIFT))
 
 /** @} lu_dir */
 
diff --git a/drivers/staging/lustre/lustre/include/lustre_mdc.h b/drivers/staging/lustre/lustre/include/lustre_mdc.h
index df94f9f3bef2..af77eb359c43 100644
--- a/drivers/staging/lustre/lustre/include/lustre_mdc.h
+++ b/drivers/staging/lustre/lustre/include/lustre_mdc.h
@@ -155,12 +155,12 @@ static inline void mdc_update_max_ea_from_body(struct obd_export *exp,
 		if (cli->cl_max_mds_easize < body->max_mdsize) {
 			cli->cl_max_mds_easize = body->max_mdsize;
 			cli->cl_default_mds_easize =
-			    min_t(__u32, body->max_mdsize, PAGE_CACHE_SIZE);
+			    min_t(__u32, body->max_mdsize, PAGE_SIZE);
 		}
 		if (cli->cl_max_mds_cookiesize < body->max_cookiesize) {
 			cli->cl_max_mds_cookiesize = body->max_cookiesize;
 			cli->cl_default_mds_cookiesize =
-			    min_t(__u32, body->max_cookiesize, PAGE_CACHE_SIZE);
+			    min_t(__u32, body->max_cookiesize, PAGE_SIZE);
 		}
 	}
 }
diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h b/drivers/staging/lustre/lustre/include/lustre_net.h
index 4fa1a18b7d15..a5e9095fbf36 100644
--- a/drivers/staging/lustre/lustre/include/lustre_net.h
+++ b/drivers/staging/lustre/lustre/include/lustre_net.h
@@ -99,13 +99,13 @@
  */
 #define PTLRPC_MAX_BRW_BITS	(LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS)
 #define PTLRPC_MAX_BRW_SIZE	(1 << PTLRPC_MAX_BRW_BITS)
-#define PTLRPC_MAX_BRW_PAGES	(PTLRPC_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define PTLRPC_MAX_BRW_PAGES	(PTLRPC_MAX_BRW_SIZE >> PAGE_SHIFT)
 
 #define ONE_MB_BRW_SIZE		(1 << LNET_MTU_BITS)
 #define MD_MAX_BRW_SIZE		(1 << LNET_MTU_BITS)
-#define MD_MAX_BRW_PAGES	(MD_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define MD_MAX_BRW_PAGES	(MD_MAX_BRW_SIZE >> PAGE_SHIFT)
 #define DT_MAX_BRW_SIZE		PTLRPC_MAX_BRW_SIZE
-#define DT_MAX_BRW_PAGES	(DT_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define DT_MAX_BRW_PAGES	(DT_MAX_BRW_SIZE >> PAGE_SHIFT)
 #define OFD_MAX_BRW_SIZE	(1 << LNET_MTU_BITS)
 
 /* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */
diff --git a/drivers/staging/lustre/lustre/include/obd.h b/drivers/staging/lustre/lustre/include/obd.h
index 4a0f2e8b19f6..f4167db65b5d 100644
--- a/drivers/staging/lustre/lustre/include/obd.h
+++ b/drivers/staging/lustre/lustre/include/obd.h
@@ -1318,7 +1318,7 @@ bad_format:
 
 static inline int cli_brw_size(struct obd_device *obd)
 {
-	return obd->u.cli.cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+	return obd->u.cli.cl_max_pages_per_rpc << PAGE_SHIFT;
 }
 
 #endif /* __OBD_H */
diff --git a/drivers/staging/lustre/lustre/include/obd_support.h b/drivers/staging/lustre/lustre/include/obd_support.h
index 225262fa67b6..f8ee3a3254ba 100644
--- a/drivers/staging/lustre/lustre/include/obd_support.h
+++ b/drivers/staging/lustre/lustre/include/obd_support.h
@@ -500,7 +500,7 @@ extern char obd_jobid_var[];
 
 #ifdef POISON_BULK
 #define POISON_PAGE(page, val) do {		  \
-	memset(kmap(page), val, PAGE_CACHE_SIZE); \
+	memset(kmap(page), val, PAGE_SIZE); \
 	kunmap(page);				  \
 } while (0)
 #else
diff --git a/drivers/staging/lustre/lustre/lclient/lcommon_cl.c b/drivers/staging/lustre/lustre/lclient/lcommon_cl.c
index aced41ab93a1..96141d17d07f 100644
--- a/drivers/staging/lustre/lustre/lclient/lcommon_cl.c
+++ b/drivers/staging/lustre/lustre/lclient/lcommon_cl.c
@@ -758,9 +758,9 @@ int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
 				 * --bug 17336
 				 */
 				loff_t size = cl_isize_read(inode);
-				loff_t cur_index = start >> PAGE_CACHE_SHIFT;
+				loff_t cur_index = start >> PAGE_SHIFT;
 				loff_t size_index = (size - 1) >>
-						    PAGE_CACHE_SHIFT;
+						    PAGE_SHIFT;
 
 				if ((size == 0 && cur_index != 0) ||
 				    size_index < cur_index)
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
index b586d5a88d00..7dd7df59aa1f 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c
@@ -307,8 +307,8 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
 	cli->cl_avail_grant = 0;
 	/* FIXME: Should limit this for the sum of all cl_dirty_max. */
 	cli->cl_dirty_max = OSC_MAX_DIRTY_DEFAULT * 1024 * 1024;
-	if (cli->cl_dirty_max >> PAGE_CACHE_SHIFT > totalram_pages / 8)
-		cli->cl_dirty_max = totalram_pages << (PAGE_CACHE_SHIFT - 3);
+	if (cli->cl_dirty_max >> PAGE_SHIFT > totalram_pages / 8)
+		cli->cl_dirty_max = totalram_pages << (PAGE_SHIFT - 3);
 	INIT_LIST_HEAD(&cli->cl_cache_waiters);
 	INIT_LIST_HEAD(&cli->cl_loi_ready_list);
 	INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list);
@@ -353,15 +353,15 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
 	 * In the future this should likely be increased. LU-1431
 	 */
 	cli->cl_max_pages_per_rpc = min_t(int, PTLRPC_MAX_BRW_PAGES,
-					  LNET_MTU >> PAGE_CACHE_SHIFT);
+					  LNET_MTU >> PAGE_SHIFT);
 
 	if (!strcmp(name, LUSTRE_MDC_NAME)) {
 		cli->cl_max_rpcs_in_flight = MDC_MAX_RIF_DEFAULT;
-	} else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 128 /* MB */) {
+	} else if (totalram_pages >> (20 - PAGE_SHIFT) <= 128 /* MB */) {
 		cli->cl_max_rpcs_in_flight = 2;
-	} else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 256 /* MB */) {
+	} else if (totalram_pages >> (20 - PAGE_SHIFT) <= 256 /* MB */) {
 		cli->cl_max_rpcs_in_flight = 3;
-	} else if (totalram_pages >> (20 - PAGE_CACHE_SHIFT) <= 512 /* MB */) {
+	} else if (totalram_pages >> (20 - PAGE_SHIFT) <= 512 /* MB */) {
 		cli->cl_max_rpcs_in_flight = 4;
 	} else {
 		cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT;
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c
index 3e937b050203..b913ba9cf97c 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c
@@ -107,7 +107,7 @@
 /*
  * 50 ldlm locks for 1MB of RAM.
  */
-#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_CACHE_SHIFT)) * 50)
+#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_SHIFT)) * 50)
 
 /*
  * Maximal possible grant step plan in %.
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
index c7904a96f9af..74e193e52cd6 100644
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
+++ b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c
@@ -546,7 +546,7 @@ static inline int ldlm_req_handles_avail(int req_size, int off)
 {
 	int avail;
 
-	avail = min_t(int, LDLM_MAXREQSIZE, PAGE_CACHE_SIZE - 512) - req_size;
+	avail = min_t(int, LDLM_MAXREQSIZE, PAGE_SIZE - 512) - req_size;
 	if (likely(avail >= 0))
 		avail /= (int)sizeof(struct lustre_handle);
 	else
diff --git a/drivers/staging/lustre/lustre/llite/dir.c b/drivers/staging/lustre/lustre/llite/dir.c
index 4e0a3e583330..a7c02e07fd75 100644
--- a/drivers/staging/lustre/lustre/llite/dir.c
+++ b/drivers/staging/lustre/lustre/llite/dir.c
@@ -153,7 +153,7 @@ static int ll_dir_filler(void *_hash, struct page *page0)
 	struct page **page_pool;
 	struct page *page;
 	struct lu_dirpage *dp;
-	int max_pages = ll_i2sbi(inode)->ll_md_brw_size >> PAGE_CACHE_SHIFT;
+	int max_pages = ll_i2sbi(inode)->ll_md_brw_size >> PAGE_SHIFT;
 	int nrdpgs = 0; /* number of pages read actually */
 	int npages;
 	int i;
@@ -193,8 +193,8 @@ static int ll_dir_filler(void *_hash, struct page *page0)
 		if (body->valid & OBD_MD_FLSIZE)
 			cl_isize_write(inode, body->size);
 
-		nrdpgs = (request->rq_bulk->bd_nob_transferred+PAGE_CACHE_SIZE-1)
-			 >> PAGE_CACHE_SHIFT;
+		nrdpgs = (request->rq_bulk->bd_nob_transferred+PAGE_SIZE-1)
+			 >> PAGE_SHIFT;
 		SetPageUptodate(page0);
 	}
 	unlock_page(page0);
@@ -209,7 +209,7 @@ static int ll_dir_filler(void *_hash, struct page *page0)
 		page = page_pool[i];
 
 		if (rc < 0 || i >= nrdpgs) {
-			page_cache_release(page);
+			put_page(page);
 			continue;
 		}
 
@@ -230,7 +230,7 @@ static int ll_dir_filler(void *_hash, struct page *page0)
 			CDEBUG(D_VFSTRACE, "page %lu add to page cache failed: %d\n",
 			       offset, ret);
 		}
-		page_cache_release(page);
+		put_page(page);
 	}
 
 	if (page_pool != &page0)
@@ -247,7 +247,7 @@ void ll_release_page(struct page *page, int remove)
 			truncate_complete_page(page->mapping, page);
 		unlock_page(page);
 	}
-	page_cache_release(page);
+	put_page(page);
 }
 
 /*
@@ -273,7 +273,7 @@ static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash,
 	if (found > 0 && !radix_tree_exceptional_entry(page)) {
 		struct lu_dirpage *dp;
 
-		page_cache_get(page);
+		get_page(page);
 		spin_unlock_irq(&mapping->tree_lock);
 		/*
 		 * In contrast to find_lock_page() we are sure that directory
@@ -313,7 +313,7 @@ static struct page *ll_dir_page_locate(struct inode *dir, __u64 *hash,
 				page = NULL;
 			}
 		} else {
-			page_cache_release(page);
+			put_page(page);
 			page = ERR_PTR(-EIO);
 		}
 
@@ -1507,7 +1507,7 @@ skip_lmm:
 			st.st_gid     = body->gid;
 			st.st_rdev    = body->rdev;
 			st.st_size    = body->size;
-			st.st_blksize = PAGE_CACHE_SIZE;
+			st.st_blksize = PAGE_SIZE;
 			st.st_blocks  = body->blocks;
 			st.st_atime   = body->atime;
 			st.st_mtime   = body->mtime;
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
index 3e1572cb457b..e3c0f1dd4d31 100644
--- a/drivers/staging/lustre/lustre/llite/llite_internal.h
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -310,10 +310,10 @@ static inline struct ll_inode_info *ll_i2info(struct inode *inode)
 /* default to about 40meg of readahead on a given system.  That much tied
  * up in 512k readahead requests serviced at 40ms each is about 1GB/s.
  */
-#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_CACHE_SHIFT))
+#define SBI_DEFAULT_READAHEAD_MAX (40UL << (20 - PAGE_SHIFT))
 
 /* default to read-ahead full files smaller than 2MB on the second read */
-#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_CACHE_SHIFT))
+#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_SHIFT))
 
 enum ra_stat {
 	RA_STAT_HIT = 0,
@@ -975,13 +975,13 @@ struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
 static inline void ll_invalidate_page(struct page *vmpage)
 {
 	struct address_space *mapping = vmpage->mapping;
-	loff_t offset = vmpage->index << PAGE_CACHE_SHIFT;
+	loff_t offset = vmpage->index << PAGE_SHIFT;
 
 	LASSERT(PageLocked(vmpage));
 	if (!mapping)
 		return;
 
-	ll_teardown_mmaps(mapping, offset, offset + PAGE_CACHE_SIZE);
+	ll_teardown_mmaps(mapping, offset, offset + PAGE_SIZE);
 	truncate_complete_page(mapping, vmpage);
 }
 
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
index 6d6bb33e3655..b57a992688a8 100644
--- a/drivers/staging/lustre/lustre/llite/llite_lib.c
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -85,7 +85,7 @@ static struct ll_sb_info *ll_init_sbi(struct super_block *sb)
 
 	si_meminfo(&si);
 	pages = si.totalram - si.totalhigh;
-	if (pages >> (20 - PAGE_CACHE_SHIFT) < 512)
+	if (pages >> (20 - PAGE_SHIFT) < 512)
 		lru_page_max = pages / 2;
 	else
 		lru_page_max = (pages / 4) * 3;
@@ -272,12 +272,12 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
 	    valid != CLIENT_CONNECT_MDT_REQD) {
 		char *buf;
 
-		buf = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+		buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
 		if (!buf) {
 			err = -ENOMEM;
 			goto out_md_fid;
 		}
-		obd_connect_flags2str(buf, PAGE_CACHE_SIZE,
+		obd_connect_flags2str(buf, PAGE_SIZE,
 				      valid ^ CLIENT_CONNECT_MDT_REQD, ",");
 		LCONSOLE_ERROR_MSG(0x170, "Server %s does not support feature(s) needed for correct operation of this client (%s). Please upgrade server or downgrade client.\n",
 				   sbi->ll_md_exp->exp_obd->obd_name, buf);
@@ -335,7 +335,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt,
 	if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
 		sbi->ll_md_brw_size = data->ocd_brw_size;
 	else
-		sbi->ll_md_brw_size = PAGE_CACHE_SIZE;
+		sbi->ll_md_brw_size = PAGE_SIZE;
 
 	if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) {
 		LCONSOLE_INFO("Layout lock feature supported.\n");
diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c
index 69445a9f2011..5b484e62ffd0 100644
--- a/drivers/staging/lustre/lustre/llite/llite_mmap.c
+++ b/drivers/staging/lustre/lustre/llite/llite_mmap.c
@@ -58,7 +58,7 @@ void policy_from_vma(ldlm_policy_data_t *policy,
 		     size_t count)
 {
 	policy->l_extent.start = ((addr - vma->vm_start) & CFS_PAGE_MASK) +
-				 (vma->vm_pgoff << PAGE_CACHE_SHIFT);
+				 (vma->vm_pgoff << PAGE_SHIFT);
 	policy->l_extent.end = (policy->l_extent.start + count - 1) |
 			       ~CFS_PAGE_MASK;
 }
@@ -321,7 +321,7 @@ static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 		vmpage = vio->u.fault.ft_vmpage;
 		if (result != 0 && vmpage) {
-			page_cache_release(vmpage);
+			put_page(vmpage);
 			vmf->page = NULL;
 		}
 	}
@@ -360,7 +360,7 @@ restart:
 		lock_page(vmpage);
 		if (unlikely(!vmpage->mapping)) { /* unlucky */
 			unlock_page(vmpage);
-			page_cache_release(vmpage);
+			put_page(vmpage);
 			vmf->page = NULL;
 
 			if (!printed && ++count > 16) {
@@ -457,7 +457,7 @@ int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last)
 	LASSERTF(last > first, "last %llu first %llu\n", last, first);
 	if (mapping_mapped(mapping)) {
 		rc = 0;
-		unmap_mapping_range(mapping, first + PAGE_CACHE_SIZE - 1,
+		unmap_mapping_range(mapping, first + PAGE_SIZE - 1,
 				    last - first + 1, 0);
 	}
 
diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c
index b725fc16cf49..f169c0db63b4 100644
--- a/drivers/staging/lustre/lustre/llite/lloop.c
+++ b/drivers/staging/lustre/lustre/llite/lloop.c
@@ -218,7 +218,7 @@ static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head)
 		offset = (pgoff_t)(bio->bi_iter.bi_sector << 9) + lo->lo_offset;
 		bio_for_each_segment(bvec, bio, iter) {
 			BUG_ON(bvec.bv_offset != 0);
-			BUG_ON(bvec.bv_len != PAGE_CACHE_SIZE);
+			BUG_ON(bvec.bv_len != PAGE_SIZE);
 
 			pages[page_count] = bvec.bv_page;
 			offsets[page_count] = offset;
@@ -232,7 +232,7 @@ static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head)
 			(rw == WRITE) ? LPROC_LL_BRW_WRITE : LPROC_LL_BRW_READ,
 			page_count);
 
-	pvec->ldp_size = page_count << PAGE_CACHE_SHIFT;
+	pvec->ldp_size = page_count << PAGE_SHIFT;
 	pvec->ldp_nr = page_count;
 
 	/* FIXME: in ll_direct_rw_pages, it has to allocate many cl_page{}s to
@@ -507,7 +507,7 @@ static int loop_set_fd(struct lloop_device *lo, struct file *unused,
 
 	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
 
-	lo->lo_blocksize = PAGE_CACHE_SIZE;
+	lo->lo_blocksize = PAGE_SIZE;
 	lo->lo_device = bdev;
 	lo->lo_flags = lo_flags;
 	lo->lo_backing_file = file;
@@ -525,11 +525,11 @@ static int loop_set_fd(struct lloop_device *lo, struct file *unused,
 	lo->lo_queue->queuedata = lo;
 
 	/* queue parameters */
-	CLASSERT(PAGE_CACHE_SIZE < (1 << (sizeof(unsigned short) * 8)));
+	CLASSERT(PAGE_SIZE < (1 << (sizeof(unsigned short) * 8)));
 	blk_queue_logical_block_size(lo->lo_queue,
-				     (unsigned short)PAGE_CACHE_SIZE);
+				     (unsigned short)PAGE_SIZE);
 	blk_queue_max_hw_sectors(lo->lo_queue,
-				 LLOOP_MAX_SEGMENTS << (PAGE_CACHE_SHIFT - 9));
+				 LLOOP_MAX_SEGMENTS << (PAGE_SHIFT - 9));
 	blk_queue_max_segments(lo->lo_queue, LLOOP_MAX_SEGMENTS);
 
 	set_capacity(disks[lo->lo_number], size);
diff --git a/drivers/staging/lustre/lustre/llite/lproc_llite.c b/drivers/staging/lustre/lustre/llite/lproc_llite.c
index 45941a6600fe..27ab1261400e 100644
--- a/drivers/staging/lustre/lustre/llite/lproc_llite.c
+++ b/drivers/staging/lustre/lustre/llite/lproc_llite.c
@@ -233,7 +233,7 @@ static ssize_t max_read_ahead_mb_show(struct kobject *kobj,
 	pages_number = sbi->ll_ra_info.ra_max_pages;
 	spin_unlock(&sbi->ll_lock);
 
-	mult = 1 << (20 - PAGE_CACHE_SHIFT);
+	mult = 1 << (20 - PAGE_SHIFT);
 	return lprocfs_read_frac_helper(buf, PAGE_SIZE, pages_number, mult);
 }
 
@@ -251,12 +251,12 @@ static ssize_t max_read_ahead_mb_store(struct kobject *kobj,
 	if (rc)
 		return rc;
 
-	pages_number *= 1 << (20 - PAGE_CACHE_SHIFT); /* MB -> pages */
+	pages_number *= 1 << (20 - PAGE_SHIFT); /* MB -> pages */
 
 	if (pages_number > totalram_pages / 2) {
 
 		CERROR("can't set file readahead more than %lu MB\n",
-		       totalram_pages >> (20 - PAGE_CACHE_SHIFT + 1)); /*1/2 of RAM*/
+		       totalram_pages >> (20 - PAGE_SHIFT + 1)); /*1/2 of RAM*/
 		return -ERANGE;
 	}
 
@@ -281,7 +281,7 @@ static ssize_t max_read_ahead_per_file_mb_show(struct kobject *kobj,
 	pages_number = sbi->ll_ra_info.ra_max_pages_per_file;
 	spin_unlock(&sbi->ll_lock);
 
-	mult = 1 << (20 - PAGE_CACHE_SHIFT);
+	mult = 1 << (20 - PAGE_SHIFT);
 	return lprocfs_read_frac_helper(buf, PAGE_SIZE, pages_number, mult);
 }
 
@@ -326,7 +326,7 @@ static ssize_t max_read_ahead_whole_mb_show(struct kobject *kobj,
 	pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages;
 	spin_unlock(&sbi->ll_lock);
 
-	mult = 1 << (20 - PAGE_CACHE_SHIFT);
+	mult = 1 << (20 - PAGE_SHIFT);
 	return lprocfs_read_frac_helper(buf, PAGE_SIZE, pages_number, mult);
 }
 
@@ -349,7 +349,7 @@ static ssize_t max_read_ahead_whole_mb_store(struct kobject *kobj,
 	 */
 	if (pages_number > sbi->ll_ra_info.ra_max_pages_per_file) {
 		CERROR("can't set max_read_ahead_whole_mb more than max_read_ahead_per_file_mb: %lu\n",
-		       sbi->ll_ra_info.ra_max_pages_per_file >> (20 - PAGE_CACHE_SHIFT));
+		       sbi->ll_ra_info.ra_max_pages_per_file >> (20 - PAGE_SHIFT));
 		return -ERANGE;
 	}
 
@@ -366,7 +366,7 @@ static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v)
 	struct super_block     *sb    = m->private;
 	struct ll_sb_info      *sbi   = ll_s2sbi(sb);
 	struct cl_client_cache *cache = &sbi->ll_cache;
-	int shift = 20 - PAGE_CACHE_SHIFT;
+	int shift = 20 - PAGE_SHIFT;
 	int max_cached_mb;
 	int unused_mb;
 
@@ -405,7 +405,7 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file,
 		return -EFAULT;
 	kernbuf[count] = 0;
 
-	mult = 1 << (20 - PAGE_CACHE_SHIFT);
+	mult = 1 << (20 - PAGE_SHIFT);
 	buffer += lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count) -
 		  kernbuf;
 	rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
@@ -415,7 +415,7 @@ static ssize_t ll_max_cached_mb_seq_write(struct file *file,
 	if (pages_number < 0 || pages_number > totalram_pages) {
 		CERROR("%s: can't set max cache more than %lu MB\n",
 		       ll_get_fsname(sb, NULL, 0),
-		       totalram_pages >> (20 - PAGE_CACHE_SHIFT));
+		       totalram_pages >> (20 - PAGE_SHIFT));
 		return -ERANGE;
 	}
 
diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c
index 34614acf3f8e..4c7250ab54e6 100644
--- a/drivers/staging/lustre/lustre/llite/rw.c
+++ b/drivers/staging/lustre/lustre/llite/rw.c
@@ -146,10 +146,10 @@ static struct ll_cl_context *ll_cl_init(struct file *file,
 		 */
 		io->ci_lockreq = CILR_NEVER;
 
-		pos = vmpage->index << PAGE_CACHE_SHIFT;
+		pos = vmpage->index << PAGE_SHIFT;
 
 		/* Create a temp IO to serve write. */
-		result = cl_io_rw_init(env, io, CIT_WRITE, pos, PAGE_CACHE_SIZE);
+		result = cl_io_rw_init(env, io, CIT_WRITE, pos, PAGE_SIZE);
 		if (result == 0) {
 			cio->cui_fd = LUSTRE_FPRIVATE(file);
 			cio->cui_iter = NULL;
@@ -498,7 +498,7 @@ static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io,
 		}
 		if (rc != 1)
 			unlock_page(vmpage);
-		page_cache_release(vmpage);
+		put_page(vmpage);
 	} else {
 		which = RA_STAT_FAILED_GRAB_PAGE;
 		msg   = "g_c_p_n failed";
@@ -527,7 +527,7 @@ static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io,
  * and max_read_ahead_per_file_mb otherwise the readahead budget can be used
  * up quickly which will affect read performance significantly. See LU-2816
  */
-#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_CACHE_SHIFT)
+#define RAS_INCREASE_STEP(inode) (ONE_MB_BRW_SIZE >> PAGE_SHIFT)
 
 static inline int stride_io_mode(struct ll_readahead_state *ras)
 {
@@ -739,7 +739,7 @@ int ll_readahead(const struct lu_env *env, struct cl_io *io,
 			end = rpc_boundary;
 
 		/* Truncate RA window to end of file */
-		end = min(end, (unsigned long)((kms - 1) >> PAGE_CACHE_SHIFT));
+		end = min(end, (unsigned long)((kms - 1) >> PAGE_SHIFT));
 
 		ras->ras_next_readahead = max(end, end + 1);
 		RAS_CDEBUG(ras);
@@ -776,7 +776,7 @@ int ll_readahead(const struct lu_env *env, struct cl_io *io,
 	if (reserved != 0)
 		ll_ra_count_put(ll_i2sbi(inode), reserved);
 
-	if (ra_end == end + 1 && ra_end == (kms >> PAGE_CACHE_SHIFT))
+	if (ra_end == end + 1 && ra_end == (kms >> PAGE_SHIFT))
 		ll_ra_stats_inc(mapping, RA_STAT_EOF);
 
 	/* if we didn't get to the end of the region we reserved from
@@ -985,8 +985,8 @@ void ras_update(struct ll_sb_info *sbi, struct inode *inode,
 	if (ras->ras_requests == 2 && !ras->ras_request_index) {
 		__u64 kms_pages;
 
-		kms_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
-			    PAGE_CACHE_SHIFT;
+		kms_pages = (i_size_read(inode) + PAGE_SIZE - 1) >>
+			    PAGE_SHIFT;
 
 		CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages,
 		       ra->ra_max_read_ahead_whole_pages, ra->ra_max_pages_per_file);
@@ -1173,7 +1173,7 @@ int ll_writepage(struct page *vmpage, struct writeback_control *wbc)
 		 * PageWriteback or clean the page.
 		 */
 		result = cl_sync_file_range(inode, offset,
-					    offset + PAGE_CACHE_SIZE - 1,
+					    offset + PAGE_SIZE - 1,
 					    CL_FSYNC_LOCAL, 1);
 		if (result > 0) {
 			/* actually we may have written more than one page.
@@ -1211,7 +1211,7 @@ int ll_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	int ignore_layout = 0;
 
 	if (wbc->range_cyclic) {
-		start = mapping->writeback_index << PAGE_CACHE_SHIFT;
+		start = mapping->writeback_index << PAGE_SHIFT;
 		end = OBD_OBJECT_EOF;
 	} else {
 		start = wbc->range_start;
@@ -1241,7 +1241,7 @@ int ll_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) {
 		if (end == OBD_OBJECT_EOF)
 			end = i_size_read(inode);
-		mapping->writeback_index = (end >> PAGE_CACHE_SHIFT) + 1;
+		mapping->writeback_index = (end >> PAGE_SHIFT) + 1;
 	}
 	return result;
 }
diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c
index 7a5db67bc680..69aa15e8e3ef 100644
--- a/drivers/staging/lustre/lustre/llite/rw26.c
+++ b/drivers/staging/lustre/lustre/llite/rw26.c
@@ -87,7 +87,7 @@ static void ll_invalidatepage(struct page *vmpage, unsigned int offset,
 	 * below because they are run with page locked and all our io is
 	 * happening with locked page too
 	 */
-	if (offset == 0 && length == PAGE_CACHE_SIZE) {
+	if (offset == 0 && length == PAGE_SIZE) {
 		env = cl_env_get(&refcheck);
 		if (!IS_ERR(env)) {
 			inode = vmpage->mapping->host;
@@ -193,8 +193,8 @@ static inline int ll_get_user_pages(int rw, unsigned long user_addr,
 		return -EFBIG;
 	}
 
-	*max_pages = (user_addr + size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	*max_pages -= user_addr >> PAGE_CACHE_SHIFT;
+	*max_pages = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	*max_pages -= user_addr >> PAGE_SHIFT;
 
 	*pages = libcfs_kvzalloc(*max_pages * sizeof(**pages), GFP_NOFS);
 	if (*pages) {
@@ -217,7 +217,7 @@ static void ll_free_user_pages(struct page **pages, int npages, int do_dirty)
 	for (i = 0; i < npages; i++) {
 		if (do_dirty)
 			set_page_dirty_lock(pages[i]);
-		page_cache_release(pages[i]);
+		put_page(pages[i]);
 	}
 	kvfree(pages);
 }
@@ -357,7 +357,7 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
  * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc.
  */
 #define MAX_DIO_SIZE ((KMALLOC_MAX_SIZE / sizeof(struct brw_page) *	  \
-		       PAGE_CACHE_SIZE) & ~(DT_MAX_BRW_SIZE - 1))
+		       PAGE_SIZE) & ~(DT_MAX_BRW_SIZE - 1))
 static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter,
 			       loff_t file_offset)
 {
@@ -382,8 +382,8 @@ static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter,
 	CDEBUG(D_VFSTRACE,
 	       "VFS Op:inode=%lu/%u(%p), size=%zd (max %lu), offset=%lld=%llx, pages %zd (max %lu)\n",
 	       inode->i_ino, inode->i_generation, inode, count, MAX_DIO_SIZE,
-	       file_offset, file_offset, count >> PAGE_CACHE_SHIFT,
-	       MAX_DIO_SIZE >> PAGE_CACHE_SHIFT);
+	       file_offset, file_offset, count >> PAGE_SHIFT,
+	       MAX_DIO_SIZE >> PAGE_SHIFT);
 
 	/* Check that all user buffers are aligned as well */
 	if (iov_iter_alignment(iter) & ~CFS_PAGE_MASK)
@@ -432,8 +432,8 @@ static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter,
 			 * page worth of page pointers = 4MB on i386.
 			 */
 			if (result == -ENOMEM &&
-			    size > (PAGE_CACHE_SIZE / sizeof(*pages)) *
-				   PAGE_CACHE_SIZE) {
+			    size > (PAGE_SIZE / sizeof(*pages)) *
+			    PAGE_SIZE) {
 				size = ((((size / 2) - 1) |
 					 ~CFS_PAGE_MASK) + 1) &
 					CFS_PAGE_MASK;
@@ -474,10 +474,10 @@ static int ll_write_begin(struct file *file, struct address_space *mapping,
 			  loff_t pos, unsigned len, unsigned flags,
 			  struct page **pagep, void **fsdata)
 {
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t index = pos >> PAGE_SHIFT;
 	struct page *page;
 	int rc;
-	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned from = pos & (PAGE_SIZE - 1);
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
@@ -488,7 +488,7 @@ static int ll_write_begin(struct file *file, struct address_space *mapping,
 	rc = ll_prepare_write(file, page, from, from + len);
 	if (rc) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	return rc;
 }
@@ -497,12 +497,12 @@ static int ll_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
 {
-	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned from = pos & (PAGE_SIZE - 1);
 	int rc;
 
 	rc = ll_commit_write(file, page, from, from + copied);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	return rc ?: copied;
 }
diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c
index fb0c26ee7ff3..75d4df71cab8 100644
--- a/drivers/staging/lustre/lustre/llite/vvp_io.c
+++ b/drivers/staging/lustre/lustre/llite/vvp_io.c
@@ -514,7 +514,7 @@ static int vvp_io_read_start(const struct lu_env *env,
 		/*
 		 * XXX: explicit PAGE_CACHE_SIZE
 		 */
-		bead->lrr_count = cl_index(obj, tot + PAGE_CACHE_SIZE - 1);
+		bead->lrr_count = cl_index(obj, tot + PAGE_SIZE - 1);
 		ll_ra_read_in(file, bead);
 	}
 
@@ -959,7 +959,7 @@ static int vvp_io_prepare_write(const struct lu_env *env,
 		 * We're completely overwriting an existing page, so _don't_
 		 * set it up to date until commit_write
 		 */
-		if (from == 0 && to == PAGE_CACHE_SIZE) {
+		if (from == 0 && to == PAGE_SIZE) {
 			CL_PAGE_HEADER(D_PAGE, env, pg, "full page write\n");
 			POISON_PAGE(page, 0x11);
 		} else
@@ -1022,7 +1022,7 @@ static int vvp_io_commit_write(const struct lu_env *env,
 			set_page_dirty(vmpage);
 			vvp_write_pending(cl2ccc(obj), cp);
 		} else if (result == -EDQUOT) {
-			pgoff_t last_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
+			pgoff_t last_index = i_size_read(inode) >> PAGE_SHIFT;
 			bool need_clip = true;
 
 			/*
@@ -1040,7 +1040,7 @@ static int vvp_io_commit_write(const struct lu_env *env,
 			 * being.
 			 */
 			if (last_index > pg->cp_index) {
-				to = PAGE_CACHE_SIZE;
+				to = PAGE_SIZE;
 				need_clip = false;
 			} else if (last_index == pg->cp_index) {
 				int size_to = i_size_read(inode) & ~CFS_PAGE_MASK;
diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c
index 850bae734075..33ca3eb34965 100644
--- a/drivers/staging/lustre/lustre/llite/vvp_page.c
+++ b/drivers/staging/lustre/lustre/llite/vvp_page.c
@@ -57,7 +57,7 @@ static void vvp_page_fini_common(struct ccc_page *cp)
 	struct page *vmpage = cp->cpg_page;
 
 	LASSERT(vmpage);
-	page_cache_release(vmpage);
+	put_page(vmpage);
 }
 
 static void vvp_page_fini(const struct lu_env *env,
@@ -164,12 +164,12 @@ static int vvp_page_unmap(const struct lu_env *env,
 	LASSERT(vmpage);
 	LASSERT(PageLocked(vmpage));
 
-	offset = vmpage->index << PAGE_CACHE_SHIFT;
+	offset = vmpage->index << PAGE_SHIFT;
 
 	/*
 	 * XXX is it safe to call this with the page lock held?
 	 */
-	ll_teardown_mmaps(vmpage->mapping, offset, offset + PAGE_CACHE_SIZE);
+	ll_teardown_mmaps(vmpage->mapping, offset, offset + PAGE_SIZE);
 	return 0;
 }
 
@@ -537,7 +537,7 @@ int vvp_page_init(const struct lu_env *env, struct cl_object *obj,
 	CLOBINVRNT(env, obj, ccc_object_invariant(obj));
 
 	cpg->cpg_page = vmpage;
-	page_cache_get(vmpage);
+	get_page(vmpage);
 
 	INIT_LIST_HEAD(&cpg->cpg_pending_linkage);
 	if (page->cp_type == CPT_CACHEABLE) {
diff --git a/drivers/staging/lustre/lustre/lmv/lmv_obd.c b/drivers/staging/lustre/lustre/lmv/lmv_obd.c
index 0f776cf8a5aa..ce7e8b70dd44 100644
--- a/drivers/staging/lustre/lustre/lmv/lmv_obd.c
+++ b/drivers/staging/lustre/lustre/lmv/lmv_obd.c
@@ -2129,8 +2129,8 @@ static int lmv_readpage(struct obd_export *exp, struct md_op_data *op_data,
 	if (rc != 0)
 		return rc;
 
-	ncfspgs = ((*request)->rq_bulk->bd_nob_transferred + PAGE_CACHE_SIZE - 1)
-		 >> PAGE_CACHE_SHIFT;
+	ncfspgs = ((*request)->rq_bulk->bd_nob_transferred + PAGE_SIZE - 1)
+		 >> PAGE_SHIFT;
 	nlupgs = (*request)->rq_bulk->bd_nob_transferred >> LU_PAGE_SHIFT;
 	LASSERT(!((*request)->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK));
 	LASSERT(ncfspgs > 0 && ncfspgs <= op_data->op_npages);
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c
index 55dd8ef9525b..b91d3ff18b02 100644
--- a/drivers/staging/lustre/lustre/mdc/mdc_request.c
+++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c
@@ -1002,10 +1002,10 @@ restart_bulk:
 
 	/* NB req now owns desc and will free it when it gets freed */
 	for (i = 0; i < op_data->op_npages; i++)
-		ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE);
+		ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_SIZE);
 
 	mdc_readdir_pack(req, op_data->op_offset,
-			 PAGE_CACHE_SIZE * op_data->op_npages,
+			 PAGE_SIZE * op_data->op_npages,
 			 &op_data->op_fid1);
 
 	ptlrpc_request_set_replen(req);
@@ -1037,7 +1037,7 @@ restart_bulk:
 	if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) {
 		CERROR("Unexpected # bytes transferred: %d (%ld expected)\n",
 		       req->rq_bulk->bd_nob_transferred,
-		       PAGE_CACHE_SIZE * op_data->op_npages);
+		       PAGE_SIZE * op_data->op_npages);
 		ptlrpc_req_finished(req);
 		return -EPROTO;
 	}
diff --git a/drivers/staging/lustre/lustre/mgc/mgc_request.c b/drivers/staging/lustre/lustre/mgc/mgc_request.c
index b7dc87248032..3924b095bfb0 100644
--- a/drivers/staging/lustre/lustre/mgc/mgc_request.c
+++ b/drivers/staging/lustre/lustre/mgc/mgc_request.c
@@ -1113,7 +1113,7 @@ static int mgc_import_event(struct obd_device *obd,
 }
 
 enum {
-	CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_CACHE_SHIFT),
+	CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_SHIFT),
 	CONFIG_READ_NRPAGES      = 4
 };
 
@@ -1137,19 +1137,19 @@ static int mgc_apply_recover_logs(struct obd_device *mgc,
 	LASSERT(cfg->cfg_instance);
 	LASSERT(cfg->cfg_sb == cfg->cfg_instance);
 
-	inst = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+	inst = kzalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!inst)
 		return -ENOMEM;
 
-	pos = snprintf(inst, PAGE_CACHE_SIZE, "%p", cfg->cfg_instance);
-	if (pos >= PAGE_CACHE_SIZE) {
+	pos = snprintf(inst, PAGE_SIZE, "%p", cfg->cfg_instance);
+	if (pos >= PAGE_SIZE) {
 		kfree(inst);
 		return -E2BIG;
 	}
 
 	++pos;
 	buf   = inst + pos;
-	bufsz = PAGE_CACHE_SIZE - pos;
+	bufsz = PAGE_SIZE - pos;
 
 	while (datalen > 0) {
 		int   entry_len = sizeof(*entry);
@@ -1181,7 +1181,7 @@ static int mgc_apply_recover_logs(struct obd_device *mgc,
 		/* Keep this swab for normal mixed endian handling. LU-1644 */
 		if (mne_swab)
 			lustre_swab_mgs_nidtbl_entry(entry);
-		if (entry->mne_length > PAGE_CACHE_SIZE) {
+		if (entry->mne_length > PAGE_SIZE) {
 			CERROR("MNE too large (%u)\n", entry->mne_length);
 			break;
 		}
@@ -1371,7 +1371,7 @@ again:
 	}
 	body->mcb_offset = cfg->cfg_last_idx + 1;
 	body->mcb_type   = cld->cld_type;
-	body->mcb_bits   = PAGE_CACHE_SHIFT;
+	body->mcb_bits   = PAGE_SHIFT;
 	body->mcb_units  = nrpages;
 
 	/* allocate bulk transfer descriptor */
@@ -1383,7 +1383,7 @@ again:
 	}
 
 	for (i = 0; i < nrpages; i++)
-		ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_CACHE_SIZE);
+		ptlrpc_prep_bulk_page_pin(desc, pages[i], 0, PAGE_SIZE);
 
 	ptlrpc_request_set_replen(req);
 	rc = ptlrpc_queue_wait(req);
@@ -1411,7 +1411,7 @@ again:
 		goto out;
 	}
 
-	if (ealen > nrpages << PAGE_CACHE_SHIFT) {
+	if (ealen > nrpages << PAGE_SHIFT) {
 		rc = -EINVAL;
 		goto out;
 	}
@@ -1439,7 +1439,7 @@ again:
 
 		ptr = kmap(pages[i]);
 		rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, ptr,
-					     min_t(int, ealen, PAGE_CACHE_SIZE),
+					     min_t(int, ealen, PAGE_SIZE),
 					     mne_swab);
 		kunmap(pages[i]);
 		if (rc2 < 0) {
@@ -1448,7 +1448,7 @@ again:
 			break;
 		}
 
-		ealen -= PAGE_CACHE_SIZE;
+		ealen -= PAGE_SIZE;
 	}
 
 out:
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_page.c b/drivers/staging/lustre/lustre/obdclass/cl_page.c
index 231a2f26c693..394580016638 100644
--- a/drivers/staging/lustre/lustre/obdclass/cl_page.c
+++ b/drivers/staging/lustre/lustre/obdclass/cl_page.c
@@ -1477,7 +1477,7 @@ loff_t cl_offset(const struct cl_object *obj, pgoff_t idx)
 	/*
 	 * XXX for now.
 	 */
-	return (loff_t)idx << PAGE_CACHE_SHIFT;
+	return (loff_t)idx << PAGE_SHIFT;
 }
 EXPORT_SYMBOL(cl_offset);
 
@@ -1489,13 +1489,13 @@ pgoff_t cl_index(const struct cl_object *obj, loff_t offset)
 	/*
 	 * XXX for now.
 	 */
-	return offset >> PAGE_CACHE_SHIFT;
+	return offset >> PAGE_SHIFT;
 }
 EXPORT_SYMBOL(cl_index);
 
 int cl_page_size(const struct cl_object *obj)
 {
-	return 1 << PAGE_CACHE_SHIFT;
+	return 1 << PAGE_SHIFT;
 }
 EXPORT_SYMBOL(cl_page_size);
 
diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c
index 1a938e1376f9..c2cf015962dd 100644
--- a/drivers/staging/lustre/lustre/obdclass/class_obd.c
+++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c
@@ -461,9 +461,9 @@ static int obd_init_checks(void)
 		CWARN("LPD64 wrong length! strlen(%s)=%d != 2\n", buf, len);
 		ret = -EINVAL;
 	}
-	if ((u64val & ~CFS_PAGE_MASK) >= PAGE_CACHE_SIZE) {
+	if ((u64val & ~CFS_PAGE_MASK) >= PAGE_SIZE) {
 		CWARN("mask failed: u64val %llu >= %llu\n", u64val,
-		      (__u64)PAGE_CACHE_SIZE);
+		      (__u64)PAGE_SIZE);
 		ret = -EINVAL;
 	}
 
@@ -509,7 +509,7 @@ static int __init obdclass_init(void)
 	 * For clients with less memory, a larger fraction is needed
 	 * for other purposes (mostly for BGL).
 	 */
-	if (totalram_pages <= 512 << (20 - PAGE_CACHE_SHIFT))
+	if (totalram_pages <= 512 << (20 - PAGE_SHIFT))
 		obd_max_dirty_pages = totalram_pages / 4;
 	else
 		obd_max_dirty_pages = totalram_pages / 2;
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c
index 9496c09b2b69..4a2baaff3dc1 100644
--- a/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c
@@ -71,8 +71,8 @@ void obdo_refresh_inode(struct inode *dst, struct obdo *src, u32 valid)
 	if (valid & OBD_MD_FLBLKSZ && src->o_blksize > (1 << dst->i_blkbits))
 		dst->i_blkbits = ffs(src->o_blksize) - 1;
 
-	if (dst->i_blkbits < PAGE_CACHE_SHIFT)
-		dst->i_blkbits = PAGE_CACHE_SHIFT;
+	if (dst->i_blkbits < PAGE_SHIFT)
+		dst->i_blkbits = PAGE_SHIFT;
 
 	/* allocation of space */
 	if (valid & OBD_MD_FLBLOCKS && src->o_blocks > dst->i_blocks)
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c
index fd333b9e968c..e6bf414a4444 100644
--- a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c
@@ -100,7 +100,7 @@ static ssize_t max_dirty_mb_show(struct kobject *kobj, struct attribute *attr,
 				 char *buf)
 {
 	return sprintf(buf, "%ul\n",
-			obd_max_dirty_pages / (1 << (20 - PAGE_CACHE_SHIFT)));
+			obd_max_dirty_pages / (1 << (20 - PAGE_SHIFT)));
 }
 
 static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr,
@@ -113,14 +113,14 @@ static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr,
 	if (rc)
 		return rc;
 
-	val *= 1 << (20 - PAGE_CACHE_SHIFT); /* convert to pages */
+	val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */
 
 	if (val > ((totalram_pages / 10) * 9)) {
 		/* Somebody wants to assign too much memory to dirty pages */
 		return -EINVAL;
 	}
 
-	if (val < 4 << (20 - PAGE_CACHE_SHIFT)) {
+	if (val < 4 << (20 - PAGE_SHIFT)) {
 		/* Less than 4 Mb for dirty cache is also bad */
 		return -EINVAL;
 	}
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c
index 65a4746c89ca..978568ada8e9 100644
--- a/drivers/staging/lustre/lustre/obdclass/lu_object.c
+++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c
@@ -840,8 +840,8 @@ static int lu_htable_order(void)
 
 #if BITS_PER_LONG == 32
 	/* limit hashtable size for lowmem systems to low RAM */
-	if (cache_size > 1 << (30 - PAGE_CACHE_SHIFT))
-		cache_size = 1 << (30 - PAGE_CACHE_SHIFT) * 3 / 4;
+	if (cache_size > 1 << (30 - PAGE_SHIFT))
+		cache_size = 1 << (30 - PAGE_SHIFT) * 3 / 4;
 #endif
 
 	/* clear off unreasonable cache setting. */
@@ -853,7 +853,7 @@ static int lu_htable_order(void)
 		lu_cache_percent = LU_CACHE_PERCENT_DEFAULT;
 	}
 	cache_size = cache_size / 100 * lu_cache_percent *
-		(PAGE_CACHE_SIZE / 1024);
+		(PAGE_SIZE / 1024);
 
 	for (bits = 1; (1 << bits) < cache_size; ++bits) {
 		;
diff --git a/drivers/staging/lustre/lustre/obdecho/echo_client.c b/drivers/staging/lustre/lustre/obdecho/echo_client.c
index 64ffe243f870..1e83669c204d 100644
--- a/drivers/staging/lustre/lustre/obdecho/echo_client.c
+++ b/drivers/staging/lustre/lustre/obdecho/echo_client.c
@@ -278,7 +278,7 @@ static void echo_page_fini(const struct lu_env *env,
 	struct page *vmpage      = ep->ep_vmpage;
 
 	atomic_dec(&eco->eo_npages);
-	page_cache_release(vmpage);
+	put_page(vmpage);
 }
 
 static int echo_page_prep(const struct lu_env *env,
@@ -373,7 +373,7 @@ static int echo_page_init(const struct lu_env *env, struct cl_object *obj,
 	struct echo_object *eco = cl2echo_obj(obj);
 
 	ep->ep_vmpage = vmpage;
-	page_cache_get(vmpage);
+	get_page(vmpage);
 	mutex_init(&ep->ep_lock);
 	cl_page_slice_add(page, &ep->ep_cl, obj, &echo_page_ops);
 	atomic_inc(&eco->eo_npages);
@@ -1138,7 +1138,7 @@ static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset,
 	LASSERT(rc == 0);
 
 	rc = cl_echo_enqueue0(env, eco, offset,
-			      offset + npages * PAGE_CACHE_SIZE - 1,
+			      offset + npages * PAGE_SIZE - 1,
 			      rw == READ ? LCK_PR : LCK_PW, &lh.cookie,
 			      CEF_NEVER);
 	if (rc < 0)
@@ -1311,11 +1311,11 @@ echo_client_page_debug_setup(struct page *page, int rw, u64 id,
 	int      delta;
 
 	/* no partial pages on the client */
-	LASSERT(count == PAGE_CACHE_SIZE);
+	LASSERT(count == PAGE_SIZE);
 
 	addr = kmap(page);
 
-	for (delta = 0; delta < PAGE_CACHE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) {
+	for (delta = 0; delta < PAGE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) {
 		if (rw == OBD_BRW_WRITE) {
 			stripe_off = offset + delta;
 			stripe_id = id;
@@ -1341,11 +1341,11 @@ static int echo_client_page_debug_check(struct page *page, u64 id,
 	int     rc2;
 
 	/* no partial pages on the client */
-	LASSERT(count == PAGE_CACHE_SIZE);
+	LASSERT(count == PAGE_SIZE);
 
 	addr = kmap(page);
 
-	for (rc = delta = 0; delta < PAGE_CACHE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) {
+	for (rc = delta = 0; delta < PAGE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) {
 		stripe_off = offset + delta;
 		stripe_id = id;
 
@@ -1391,7 +1391,7 @@ static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa,
 		return -EINVAL;
 
 	/* XXX think again with misaligned I/O */
-	npages = count >> PAGE_CACHE_SHIFT;
+	npages = count >> PAGE_SHIFT;
 
 	if (rw == OBD_BRW_WRITE)
 		brw_flags = OBD_BRW_ASYNC;
@@ -1408,7 +1408,7 @@ static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa,
 
 	for (i = 0, pgp = pga, off = offset;
 	     i < npages;
-	     i++, pgp++, off += PAGE_CACHE_SIZE) {
+	     i++, pgp++, off += PAGE_SIZE) {
 
 		LASSERT(!pgp->pg);      /* for cleanup */
 
@@ -1418,7 +1418,7 @@ static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa,
 			goto out;
 
 		pages[i] = pgp->pg;
-		pgp->count = PAGE_CACHE_SIZE;
+		pgp->count = PAGE_SIZE;
 		pgp->off = off;
 		pgp->flag = brw_flags;
 
@@ -1473,8 +1473,8 @@ static int echo_client_prep_commit(const struct lu_env *env,
 	if (count <= 0 || (count & (~CFS_PAGE_MASK)) != 0)
 		return -EINVAL;
 
-	npages = batch >> PAGE_CACHE_SHIFT;
-	tot_pages = count >> PAGE_CACHE_SHIFT;
+	npages = batch >> PAGE_SHIFT;
+	tot_pages = count >> PAGE_SHIFT;
 
 	lnb = kcalloc(npages, sizeof(struct niobuf_local), GFP_NOFS);
 	rnb = kcalloc(npages, sizeof(struct niobuf_remote), GFP_NOFS);
@@ -1497,9 +1497,9 @@ static int echo_client_prep_commit(const struct lu_env *env,
 		if (tot_pages < npages)
 			npages = tot_pages;
 
-		for (i = 0; i < npages; i++, off += PAGE_CACHE_SIZE) {
+		for (i = 0; i < npages; i++, off += PAGE_SIZE) {
 			rnb[i].offset = off;
-			rnb[i].len = PAGE_CACHE_SIZE;
+			rnb[i].len = PAGE_SIZE;
 			rnb[i].flags = brw_flags;
 		}
 
@@ -1878,7 +1878,7 @@ static int __init obdecho_init(void)
 {
 	LCONSOLE_INFO("Echo OBD driver; http://www.lustre.org/\n");
 
-	LASSERT(PAGE_CACHE_SIZE % OBD_ECHO_BLOCK_SIZE == 0);
+	LASSERT(PAGE_SIZE % OBD_ECHO_BLOCK_SIZE == 0);
 
 	return echo_client_init();
 }
diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c
index 57c43c506ef2..a3358c39b2f1 100644
--- a/drivers/staging/lustre/lustre/osc/lproc_osc.c
+++ b/drivers/staging/lustre/lustre/osc/lproc_osc.c
@@ -162,15 +162,15 @@ static ssize_t max_dirty_mb_store(struct kobject *kobj,
 	if (rc)
 		return rc;
 
-	pages_number *= 1 << (20 - PAGE_CACHE_SHIFT); /* MB -> pages */
+	pages_number *= 1 << (20 - PAGE_SHIFT); /* MB -> pages */
 
 	if (pages_number <= 0 ||
-	    pages_number > OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_CACHE_SHIFT) ||
+	    pages_number > OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_SHIFT) ||
 	    pages_number > totalram_pages / 4) /* 1/4 of RAM */
 		return -ERANGE;
 
 	client_obd_list_lock(&cli->cl_loi_list_lock);
-	cli->cl_dirty_max = (u32)(pages_number << PAGE_CACHE_SHIFT);
+	cli->cl_dirty_max = (u32)(pages_number << PAGE_SHIFT);
 	osc_wake_cache_waiters(cli);
 	client_obd_list_unlock(&cli->cl_loi_list_lock);
 
@@ -182,7 +182,7 @@ static int osc_cached_mb_seq_show(struct seq_file *m, void *v)
 {
 	struct obd_device *dev = m->private;
 	struct client_obd *cli = &dev->u.cli;
-	int shift = 20 - PAGE_CACHE_SHIFT;
+	int shift = 20 - PAGE_SHIFT;
 
 	seq_printf(m,
 		   "used_mb: %d\n"
@@ -211,7 +211,7 @@ static ssize_t osc_cached_mb_seq_write(struct file *file,
 		return -EFAULT;
 	kernbuf[count] = 0;
 
-	mult = 1 << (20 - PAGE_CACHE_SHIFT);
+	mult = 1 << (20 - PAGE_SHIFT);
 	buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) -
 		  kernbuf;
 	rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
@@ -569,12 +569,12 @@ static ssize_t max_pages_per_rpc_store(struct kobject *kobj,
 
 	/* if the max_pages is specified in bytes, convert to pages */
 	if (val >= ONE_MB_BRW_SIZE)
-		val >>= PAGE_CACHE_SHIFT;
+		val >>= PAGE_SHIFT;
 
-	chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_CACHE_SHIFT)) - 1);
+	chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1);
 	/* max_pages_per_rpc must be chunk aligned */
 	val = (val + ~chunk_mask) & chunk_mask;
-	if (val == 0 || val > ocd->ocd_brw_size >> PAGE_CACHE_SHIFT) {
+	if (val == 0 || val > ocd->ocd_brw_size >> PAGE_SHIFT) {
 		return -ERANGE;
 	}
 	client_obd_list_lock(&cli->cl_loi_list_lock);
diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c
index 63363111380c..4e0a357ed43d 100644
--- a/drivers/staging/lustre/lustre/osc/osc_cache.c
+++ b/drivers/staging/lustre/lustre/osc/osc_cache.c
@@ -544,7 +544,7 @@ static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur,
 		return -ERANGE;
 
 	LASSERT(cur->oe_osclock == victim->oe_osclock);
-	ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_CACHE_SHIFT;
+	ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_SHIFT;
 	chunk_start = cur->oe_start >> ppc_bits;
 	chunk_end = cur->oe_end >> ppc_bits;
 	if (chunk_start != (victim->oe_end >> ppc_bits) + 1 &&
@@ -647,8 +647,8 @@ static struct osc_extent *osc_extent_find(const struct lu_env *env,
 	lock = cl_lock_at_pgoff(env, osc2cl(obj), index, NULL, 1, 0);
 	LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE);
 
-	LASSERT(cli->cl_chunkbits >= PAGE_CACHE_SHIFT);
-	ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+	LASSERT(cli->cl_chunkbits >= PAGE_SHIFT);
+	ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
 	chunk_mask = ~((1 << ppc_bits) - 1);
 	chunksize = 1 << cli->cl_chunkbits;
 	chunk = index >> ppc_bits;
@@ -871,8 +871,8 @@ int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
 
 	if (!sent) {
 		lost_grant = ext->oe_grants;
-	} else if (blocksize < PAGE_CACHE_SIZE &&
-		   last_count != PAGE_CACHE_SIZE) {
+	} else if (blocksize < PAGE_SIZE &&
+		   last_count != PAGE_SIZE) {
 		/* For short writes we shouldn't count parts of pages that
 		 * span a whole chunk on the OST side, or our accounting goes
 		 * wrong.  Should match the code in filter_grant_check.
@@ -884,7 +884,7 @@ int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
 		if (end)
 			count += blocksize - end;
 
-		lost_grant = PAGE_CACHE_SIZE - count;
+		lost_grant = PAGE_SIZE - count;
 	}
 	if (ext->oe_grants > 0)
 		osc_free_grant(cli, nr_pages, lost_grant);
@@ -967,7 +967,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
 	struct osc_async_page *oap;
 	struct osc_async_page *tmp;
 	int pages_in_chunk = 0;
-	int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+	int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
 	__u64 trunc_chunk = trunc_index >> ppc_bits;
 	int grants = 0;
 	int nr_pages = 0;
@@ -1125,7 +1125,7 @@ static int osc_extent_make_ready(const struct lu_env *env,
 	if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) {
 		last->oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE);
 		LASSERT(last->oap_count > 0);
-		LASSERT(last->oap_page_off + last->oap_count <= PAGE_CACHE_SIZE);
+		LASSERT(last->oap_page_off + last->oap_count <= PAGE_SIZE);
 		last->oap_async_flags |= ASYNC_COUNT_STABLE;
 	}
 
@@ -1134,7 +1134,7 @@ static int osc_extent_make_ready(const struct lu_env *env,
 	 */
 	list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
 		if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
-			oap->oap_count = PAGE_CACHE_SIZE - oap->oap_page_off;
+			oap->oap_count = PAGE_SIZE - oap->oap_page_off;
 			oap->oap_async_flags |= ASYNC_COUNT_STABLE;
 		}
 	}
@@ -1158,7 +1158,7 @@ static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, int *grants)
 	struct osc_object *obj = ext->oe_obj;
 	struct client_obd *cli = osc_cli(obj);
 	struct osc_extent *next;
-	int ppc_bits = cli->cl_chunkbits - PAGE_CACHE_SHIFT;
+	int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
 	pgoff_t chunk = index >> ppc_bits;
 	pgoff_t end_chunk;
 	pgoff_t end_index;
@@ -1293,9 +1293,9 @@ static int osc_refresh_count(const struct lu_env *env,
 		return 0;
 	else if (cl_offset(obj, page->cp_index + 1) > kms)
 		/* catch sub-page write at end of file */
-		return kms % PAGE_CACHE_SIZE;
+		return kms % PAGE_SIZE;
 	else
-		return PAGE_CACHE_SIZE;
+		return PAGE_SIZE;
 }
 
 static int osc_completion(const struct lu_env *env, struct osc_async_page *oap,
@@ -1376,10 +1376,10 @@ static void osc_consume_write_grant(struct client_obd *cli,
 	assert_spin_locked(&cli->cl_loi_list_lock.lock);
 	LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT));
 	atomic_inc(&obd_dirty_pages);
-	cli->cl_dirty += PAGE_CACHE_SIZE;
+	cli->cl_dirty += PAGE_SIZE;
 	pga->flag |= OBD_BRW_FROM_GRANT;
 	CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n",
-	       PAGE_CACHE_SIZE, pga, pga->pg);
+	       PAGE_SIZE, pga, pga->pg);
 	osc_update_next_shrink(cli);
 }
 
@@ -1396,11 +1396,11 @@ static void osc_release_write_grant(struct client_obd *cli,
 
 	pga->flag &= ~OBD_BRW_FROM_GRANT;
 	atomic_dec(&obd_dirty_pages);
-	cli->cl_dirty -= PAGE_CACHE_SIZE;
+	cli->cl_dirty -= PAGE_SIZE;
 	if (pga->flag & OBD_BRW_NOCACHE) {
 		pga->flag &= ~OBD_BRW_NOCACHE;
 		atomic_dec(&obd_dirty_transit_pages);
-		cli->cl_dirty_transit -= PAGE_CACHE_SIZE;
+		cli->cl_dirty_transit -= PAGE_SIZE;
 	}
 }
 
@@ -1469,7 +1469,7 @@ static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
 
 	client_obd_list_lock(&cli->cl_loi_list_lock);
 	atomic_sub(nr_pages, &obd_dirty_pages);
-	cli->cl_dirty -= nr_pages << PAGE_CACHE_SHIFT;
+	cli->cl_dirty -= nr_pages << PAGE_SHIFT;
 	cli->cl_lost_grant += lost_grant;
 	if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) {
 		/* borrow some grant from truncate to avoid the case that
@@ -1512,11 +1512,11 @@ static int osc_enter_cache_try(struct client_obd *cli,
 	if (rc < 0)
 		return 0;
 
-	if (cli->cl_dirty + PAGE_CACHE_SIZE <= cli->cl_dirty_max &&
+	if (cli->cl_dirty + PAGE_SIZE <= cli->cl_dirty_max &&
 	    atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) {
 		osc_consume_write_grant(cli, &oap->oap_brw_page);
 		if (transient) {
-			cli->cl_dirty_transit += PAGE_CACHE_SIZE;
+			cli->cl_dirty_transit += PAGE_SIZE;
 			atomic_inc(&obd_dirty_transit_pages);
 			oap->oap_brw_flags |= OBD_BRW_NOCACHE;
 		}
@@ -1562,7 +1562,7 @@ static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli,
 	 * of queued writes and create a discontiguous rpc stream
 	 */
 	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) ||
-	    cli->cl_dirty_max < PAGE_CACHE_SIZE     ||
+	    cli->cl_dirty_max < PAGE_SIZE     ||
 	    cli->cl_ar.ar_force_sync || loi->loi_ar.ar_force_sync) {
 		rc = -EDQUOT;
 		goto out;
@@ -1632,7 +1632,7 @@ void osc_wake_cache_waiters(struct client_obd *cli)
 
 		ocw->ocw_rc = -EDQUOT;
 		/* we can't dirty more */
-		if ((cli->cl_dirty + PAGE_CACHE_SIZE > cli->cl_dirty_max) ||
+		if ((cli->cl_dirty + PAGE_SIZE > cli->cl_dirty_max) ||
 		    (atomic_read(&obd_dirty_pages) + 1 >
 		     obd_max_dirty_pages)) {
 			CDEBUG(D_CACHE, "no dirty room: dirty: %ld osc max %ld, sys max %d\n",
diff --git a/drivers/staging/lustre/lustre/osc/osc_page.c b/drivers/staging/lustre/lustre/osc/osc_page.c
index d720b1a1c18c..ce9ddd515f64 100644
--- a/drivers/staging/lustre/lustre/osc/osc_page.c
+++ b/drivers/staging/lustre/lustre/osc/osc_page.c
@@ -410,7 +410,7 @@ int osc_page_init(const struct lu_env *env, struct cl_object *obj,
 	int result;
 
 	opg->ops_from = 0;
-	opg->ops_to = PAGE_CACHE_SIZE;
+	opg->ops_to = PAGE_SIZE;
 
 	result = osc_prep_async_page(osc, opg, vmpage,
 				     cl_offset(obj, page->cp_index));
@@ -487,9 +487,9 @@ static atomic_t osc_lru_waiters = ATOMIC_INIT(0);
 /* LRU pages are freed in batch mode. OSC should at least free this
  * number of pages to avoid running out of LRU budget, and..
  */
-static const int lru_shrink_min = 2 << (20 - PAGE_CACHE_SHIFT);  /* 2M */
+static const int lru_shrink_min = 2 << (20 - PAGE_SHIFT);  /* 2M */
 /* free this number at most otherwise it will take too long time to finish. */
-static const int lru_shrink_max = 32 << (20 - PAGE_CACHE_SHIFT); /* 32M */
+static const int lru_shrink_max = 32 << (20 - PAGE_SHIFT); /* 32M */
 
 /* Check if we can free LRU slots from this OSC. If there exists LRU waiters,
  * we should free slots aggressively. In this way, slots are freed in a steady
diff --git a/drivers/staging/lustre/lustre/osc/osc_request.c b/drivers/staging/lustre/lustre/osc/osc_request.c
index 74805f1ae888..30526ebcad04 100644
--- a/drivers/staging/lustre/lustre/osc/osc_request.c
+++ b/drivers/staging/lustre/lustre/osc/osc_request.c
@@ -826,7 +826,7 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
 		oa->o_undirty = 0;
 	} else {
 		long max_in_flight = (cli->cl_max_pages_per_rpc <<
-				      PAGE_CACHE_SHIFT)*
+				      PAGE_SHIFT)*
 				     (cli->cl_max_rpcs_in_flight + 1);
 		oa->o_undirty = max(cli->cl_dirty_max, max_in_flight);
 	}
@@ -909,11 +909,11 @@ static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa)
 static int osc_shrink_grant(struct client_obd *cli)
 {
 	__u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) *
-			     (cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT);
+			     (cli->cl_max_pages_per_rpc << PAGE_SHIFT);
 
 	client_obd_list_lock(&cli->cl_loi_list_lock);
 	if (cli->cl_avail_grant <= target_bytes)
-		target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+		target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT;
 	client_obd_list_unlock(&cli->cl_loi_list_lock);
 
 	return osc_shrink_grant_to_target(cli, target_bytes);
@@ -929,8 +929,8 @@ int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes)
 	 * We don't want to shrink below a single RPC, as that will negatively
 	 * impact block allocation and long-term performance.
 	 */
-	if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT)
-		target_bytes = cli->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+	if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_SHIFT)
+		target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT;
 
 	if (target_bytes >= cli->cl_avail_grant) {
 		client_obd_list_unlock(&cli->cl_loi_list_lock);
@@ -978,7 +978,7 @@ static int osc_should_shrink_grant(struct client_obd *client)
 		 * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export)
 		 * Keep comment here so that it can be found by searching.
 		 */
-		int brw_size = client->cl_max_pages_per_rpc << PAGE_CACHE_SHIFT;
+		int brw_size = client->cl_max_pages_per_rpc << PAGE_SHIFT;
 
 		if (client->cl_import->imp_state == LUSTRE_IMP_FULL &&
 		    client->cl_avail_grant > brw_size)
@@ -1052,7 +1052,7 @@ static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd)
 	}
 
 	/* determine the appropriate chunk size used by osc_extent. */
-	cli->cl_chunkbits = max_t(int, PAGE_CACHE_SHIFT, ocd->ocd_blocksize);
+	cli->cl_chunkbits = max_t(int, PAGE_SHIFT, ocd->ocd_blocksize);
 	client_obd_list_unlock(&cli->cl_loi_list_lock);
 
 	CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld chunk bits: %d\n",
@@ -1317,9 +1317,9 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,
 		LASSERT(pg->count > 0);
 		/* make sure there is no gap in the middle of page array */
 		LASSERTF(page_count == 1 ||
-			 (ergo(i == 0, poff + pg->count == PAGE_CACHE_SIZE) &&
+			 (ergo(i == 0, poff + pg->count == PAGE_SIZE) &&
 			  ergo(i > 0 && i < page_count - 1,
-			       poff == 0 && pg->count == PAGE_CACHE_SIZE)   &&
+			       poff == 0 && pg->count == PAGE_SIZE)   &&
 			  ergo(i == page_count - 1, poff == 0)),
 			 "i: %d/%d pg: %p off: %llu, count: %u\n",
 			 i, page_count, pg, pg->off, pg->count);
@@ -1877,7 +1877,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
 						oap->oap_count;
 			else
 				LASSERT(oap->oap_page_off + oap->oap_count ==
-					PAGE_CACHE_SIZE);
+					PAGE_SIZE);
 		}
 	}
 
@@ -1993,7 +1993,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli,
 		tmp->oap_request = ptlrpc_request_addref(req);
 
 	client_obd_list_lock(&cli->cl_loi_list_lock);
-	starting_offset >>= PAGE_CACHE_SHIFT;
+	starting_offset >>= PAGE_SHIFT;
 	if (cmd == OBD_BRW_READ) {
 		cli->cl_r_in_flight++;
 		lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
@@ -2790,12 +2790,12 @@ out:
 						CFS_PAGE_MASK;
 
 		if (OBD_OBJECT_EOF - fm_key->fiemap.fm_length <=
-		    fm_key->fiemap.fm_start + PAGE_CACHE_SIZE - 1)
+		    fm_key->fiemap.fm_start + PAGE_SIZE - 1)
 			policy.l_extent.end = OBD_OBJECT_EOF;
 		else
 			policy.l_extent.end = (fm_key->fiemap.fm_start +
 				fm_key->fiemap.fm_length +
-				PAGE_CACHE_SIZE - 1) & CFS_PAGE_MASK;
+				PAGE_SIZE - 1) & CFS_PAGE_MASK;
 
 		ostid_build_res_name(&fm_key->oa.o_oi, &res_id);
 		mode = ldlm_lock_match(exp->exp_obd->obd_namespace,
diff --git a/drivers/staging/lustre/lustre/ptlrpc/client.c b/drivers/staging/lustre/lustre/ptlrpc/client.c
index 1b7673eec4d7..cf3ac8eee9ee 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/client.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/client.c
@@ -174,12 +174,12 @@ void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc,
 	LASSERT(page);
 	LASSERT(pageoffset >= 0);
 	LASSERT(len > 0);
-	LASSERT(pageoffset + len <= PAGE_CACHE_SIZE);
+	LASSERT(pageoffset + len <= PAGE_SIZE);
 
 	desc->bd_nob += len;
 
 	if (pin)
-		page_cache_get(page);
+		get_page(page);
 
 	ptlrpc_add_bulk_page(desc, page, pageoffset, len);
 }
@@ -206,7 +206,7 @@ void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc, int unpin)
 
 	if (unpin) {
 		for (i = 0; i < desc->bd_iov_count; i++)
-			page_cache_release(desc->bd_iov[i].kiov_page);
+			put_page(desc->bd_iov[i].kiov_page);
 	}
 
 	kfree(desc);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/import.c b/drivers/staging/lustre/lustre/ptlrpc/import.c
index b4eddf291269..cd94fed0ffdf 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/import.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/import.c
@@ -1092,7 +1092,7 @@ finish:
 
 		if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE)
 			cli->cl_max_pages_per_rpc =
-				min(ocd->ocd_brw_size >> PAGE_CACHE_SHIFT,
+				min(ocd->ocd_brw_size >> PAGE_SHIFT,
 				    cli->cl_max_pages_per_rpc);
 		else if (imp->imp_connect_op == MDS_CONNECT ||
 			 imp->imp_connect_op == MGS_CONNECT)
diff --git a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c
index cee04efb6fb5..c95a91ce26c9 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c
@@ -308,7 +308,7 @@ ptlrpc_lprocfs_req_history_max_seq_write(struct file *file,
 	 * hose a kernel by allowing the request history to grow too
 	 * far.
 	 */
-	bufpages = (svc->srv_buf_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	bufpages = (svc->srv_buf_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (val > totalram_pages / (2 * bufpages))
 		return -ERANGE;
 
@@ -1226,7 +1226,7 @@ int lprocfs_wr_import(struct file *file, const char __user *buffer,
 	const char prefix[] = "connection=";
 	const int prefix_len = sizeof(prefix) - 1;
 
-	if (count > PAGE_CACHE_SIZE - 1 || count <= prefix_len)
+	if (count > PAGE_SIZE - 1 || count <= prefix_len)
 		return -EINVAL;
 
 	kbuf = kzalloc(count + 1, GFP_NOFS);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/recover.c b/drivers/staging/lustre/lustre/ptlrpc/recover.c
index 5f27d9c2e4ef..30d9a164e52d 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/recover.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/recover.c
@@ -195,7 +195,7 @@ int ptlrpc_resend(struct obd_import *imp)
 	}
 
 	list_for_each_entry_safe(req, next, &imp->imp_sending_list, rq_list) {
-		LASSERTF((long)req > PAGE_CACHE_SIZE && req != LP_POISON,
+		LASSERTF((long)req > PAGE_SIZE && req != LP_POISON,
 			 "req %p bad\n", req);
 		LASSERTF(req->rq_type != LI_POISON, "req %p freed\n", req);
 		if (!ptlrpc_no_resend(req))
diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c
index 72d5b9bf5b29..d3872b8c9a6e 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c
@@ -58,7 +58,7 @@
  * bulk encryption page pools	   *
  ****************************************/
 
-#define POINTERS_PER_PAGE	(PAGE_CACHE_SIZE / sizeof(void *))
+#define POINTERS_PER_PAGE	(PAGE_SIZE / sizeof(void *))
 #define PAGES_PER_POOL		(POINTERS_PER_PAGE)
 
 #define IDLE_IDX_MAX	 (100)
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 8cfce105c7ee..e21ca2bd6839 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -1147,8 +1147,8 @@ static int ffs_sb_fill(struct super_block *sb, void *_data, int silent)
 	ffs->sb              = sb;
 	data->ffs_data       = NULL;
 	sb->s_fs_info        = ffs;
-	sb->s_blocksize      = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_blocksize      = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic          = FUNCTIONFS_MAGIC;
 	sb->s_op             = &ffs_sb_operations;
 	sb->s_time_gran      = 1;
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index 5cdaf0150a4e..e64479f882a5 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -1954,8 +1954,8 @@ gadgetfs_fill_super (struct super_block *sb, void *opts, int silent)
 		return -ENODEV;
 
 	/* superblock */
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = GADGETFS_MAGIC;
 	sb->s_op = &gadget_fs_operations;
 	sb->s_time_gran = 1;
diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c
index dba51362d2e2..90901861bfc0 100644
--- a/drivers/usb/storage/scsiglue.c
+++ b/drivers/usb/storage/scsiglue.c
@@ -123,7 +123,7 @@ static int slave_configure(struct scsi_device *sdev)
 		unsigned int max_sectors = 64;
 
 		if (us->fflags & US_FL_MAX_SECTORS_MIN)
-			max_sectors = PAGE_CACHE_SIZE >> 9;
+			max_sectors = PAGE_SIZE >> 9;
 		if (queue_max_hw_sectors(sdev->request_queue) > max_sectors)
 			blk_queue_max_hw_sectors(sdev->request_queue,
 					      max_sectors);
diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c
index 71a923e53f93..3b1ca4411073 100644
--- a/drivers/video/fbdev/pvr2fb.c
+++ b/drivers/video/fbdev/pvr2fb.c
@@ -735,7 +735,7 @@ out:
 
 out_unmap:
 	for (i = 0; i < nr_pages; i++)
-		page_cache_release(pages[i]);
+		put_page(pages[i]);
 
 	kfree(pages);
 
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index e9e04376c52c..ac9225e86bf3 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -153,7 +153,7 @@ static void v9fs_invalidate_page(struct page *page, unsigned int offset,
 	 * If called with zero offset, we should release
 	 * the private state assocated with the page
 	 */
-	if (offset == 0 && length == PAGE_CACHE_SIZE)
+	if (offset == 0 && length == PAGE_SIZE)
 		v9fs_fscache_invalidate_page(page);
 }
 
@@ -166,10 +166,10 @@ static int v9fs_vfs_writepage_locked(struct page *page)
 	struct bio_vec bvec;
 	int err, len;
 
-	if (page->index == size >> PAGE_CACHE_SHIFT)
-		len = size & ~PAGE_CACHE_MASK;
+	if (page->index == size >> PAGE_SHIFT)
+		len = size & ~PAGE_MASK;
 	else
-		len = PAGE_CACHE_SIZE;
+		len = PAGE_SIZE;
 
 	bvec.bv_page = page;
 	bvec.bv_offset = 0;
@@ -271,7 +271,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
 	int retval = 0;
 	struct page *page;
 	struct v9fs_inode *v9inode;
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t index = pos >> PAGE_SHIFT;
 	struct inode *inode = mapping->host;
 
 
@@ -288,11 +288,11 @@ start:
 	if (PageUptodate(page))
 		goto out;
 
-	if (len == PAGE_CACHE_SIZE)
+	if (len == PAGE_SIZE)
 		goto out;
 
 	retval = v9fs_fid_readpage(v9inode->writeback_fid, page);
-	page_cache_release(page);
+	put_page(page);
 	if (!retval)
 		goto start;
 out:
@@ -313,7 +313,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
 		/*
 		 * zero out the rest of the area
 		 */
-		unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+		unsigned from = pos & (PAGE_SIZE - 1);
 
 		zero_user(page, from + copied, len - copied);
 		flush_dcache_page(page);
@@ -331,7 +331,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
 	}
 	set_page_dirty(page);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	return copied;
 }
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index eadc894faea2..b84c291ba1eb 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -421,8 +421,8 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		struct inode *inode = file_inode(file);
 		loff_t i_size;
 		unsigned long pg_start, pg_end;
-		pg_start = origin >> PAGE_CACHE_SHIFT;
-		pg_end = (origin + retval - 1) >> PAGE_CACHE_SHIFT;
+		pg_start = origin >> PAGE_SHIFT;
+		pg_end = (origin + retval - 1) >> PAGE_SHIFT;
 		if (inode->i_mapping && inode->i_mapping->nrpages)
 			invalidate_inode_pages2_range(inode->i_mapping,
 						      pg_start, pg_end);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index bf495cedec26..de3ed8629196 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -87,7 +87,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 		sb->s_op = &v9fs_super_ops;
 	sb->s_bdi = &v9ses->bdi;
 	if (v9ses->cache)
-		sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE;
+		sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
 
 	sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
 	if (!v9ses->cache)
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 22fc7c802d69..0cde550050e8 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -510,9 +510,9 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
 
 	pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
 		 page->index, to);
-	BUG_ON(to > PAGE_CACHE_SIZE);
+	BUG_ON(to > PAGE_SIZE);
 	bsize = AFFS_SB(sb)->s_data_blksize;
-	tmp = page->index << PAGE_CACHE_SHIFT;
+	tmp = page->index << PAGE_SHIFT;
 	bidx = tmp / bsize;
 	boff = tmp % bsize;
 
@@ -613,10 +613,10 @@ affs_readpage_ofs(struct file *file, struct page *page)
 	int err;
 
 	pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index);
-	to = PAGE_CACHE_SIZE;
-	if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) {
-		to = inode->i_size & ~PAGE_CACHE_MASK;
-		memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to);
+	to = PAGE_SIZE;
+	if (((page->index + 1) << PAGE_SHIFT) > inode->i_size) {
+		to = inode->i_size & ~PAGE_MASK;
+		memset(page_address(page) + to, 0, PAGE_SIZE - to);
 	}
 
 	err = affs_do_readpage_ofs(page, to);
@@ -646,7 +646,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
 			return err;
 	}
 
-	index = pos >> PAGE_CACHE_SHIFT;
+	index = pos >> PAGE_SHIFT;
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
@@ -656,10 +656,10 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
 		return 0;
 
 	/* XXX: inefficient but safe in the face of short writes */
-	err = affs_do_readpage_ofs(page, PAGE_CACHE_SIZE);
+	err = affs_do_readpage_ofs(page, PAGE_SIZE);
 	if (err) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	return err;
 }
@@ -677,7 +677,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 	u32 tmp;
 	int written;
 
-	from = pos & (PAGE_CACHE_SIZE - 1);
+	from = pos & (PAGE_SIZE - 1);
 	to = pos + len;
 	/*
 	 * XXX: not sure if this can handle short copies (len < copied), but
@@ -692,7 +692,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 
 	bh = NULL;
 	written = 0;
-	tmp = (page->index << PAGE_CACHE_SHIFT) + from;
+	tmp = (page->index << PAGE_SHIFT) + from;
 	bidx = tmp / bsize;
 	boff = tmp % bsize;
 	if (boff) {
@@ -788,13 +788,13 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 
 done:
 	affs_brelse(bh);
-	tmp = (page->index << PAGE_CACHE_SHIFT) + from;
+	tmp = (page->index << PAGE_SHIFT) + from;
 	if (tmp > inode->i_size)
 		inode->i_size = AFFS_I(inode)->mmu_private = tmp;
 
 err_first_bh:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	return written;
 
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index e10e17788f06..5fda2bc53cd7 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -181,7 +181,7 @@ error:
 static inline void afs_dir_put_page(struct page *page)
 {
 	kunmap(page);
-	page_cache_release(page);
+	put_page(page);
 }
 
 /*
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 999bc3caec92..6344aee4ac4b 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -164,7 +164,7 @@ int afs_page_filler(void *data, struct page *page)
 		_debug("cache said ENOBUFS");
 	default:
 	go_on:
-		offset = page->index << PAGE_CACHE_SHIFT;
+		offset = page->index << PAGE_SHIFT;
 		len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
 
 		/* read the contents of the file from the server into the
@@ -319,7 +319,7 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
 	BUG_ON(!PageLocked(page));
 
 	/* we clean up only if the entire page is being invalidated */
-	if (offset == 0 && length == PAGE_CACHE_SIZE) {
+	if (offset == 0 && length == PAGE_SIZE) {
 #ifdef CONFIG_AFS_FSCACHE
 		if (PageFsCache(page)) {
 			struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index ccd0b212e82a..81dd075356b9 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -93,7 +93,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 
 	kunmap(page);
 out_free:
-	page_cache_release(page);
+	put_page(page);
 out:
 	_leave(" = %d", ret);
 	return ret;
@@ -189,7 +189,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 		buf = kmap_atomic(page);
 		memcpy(devname, buf, size);
 		kunmap_atomic(buf);
-		page_cache_release(page);
+		put_page(page);
 		page = NULL;
 	}
 
@@ -211,7 +211,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	return mnt;
 
 error:
-	page_cache_release(page);
+	put_page(page);
 error_no_page:
 	free_page((unsigned long) options);
 error_no_options:
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 81afefe7d8a6..fbdb022b75a2 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -315,8 +315,8 @@ static int afs_fill_super(struct super_block *sb,
 	_enter("");
 
 	/* fill in the superblock */
-	sb->s_blocksize		= PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits	= PAGE_CACHE_SHIFT;
+	sb->s_blocksize		= PAGE_SIZE;
+	sb->s_blocksize_bits	= PAGE_SHIFT;
 	sb->s_magic		= AFS_FS_MAGIC;
 	sb->s_op		= &afs_super_ops;
 	sb->s_bdi		= &as->volume->bdi;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index dfef94f70667..65de439bdc4f 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -93,10 +93,10 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
 	_enter(",,%llu", (unsigned long long)pos);
 
 	i_size = i_size_read(&vnode->vfs_inode);
-	if (pos + PAGE_CACHE_SIZE > i_size)
+	if (pos + PAGE_SIZE > i_size)
 		len = i_size - pos;
 	else
-		len = PAGE_CACHE_SIZE;
+		len = PAGE_SIZE;
 
 	ret = afs_vnode_fetch_data(vnode, key, pos, len, page);
 	if (ret < 0) {
@@ -123,9 +123,9 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
 	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
 	struct page *page;
 	struct key *key = file->private_data;
-	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned from = pos & (PAGE_SIZE - 1);
 	unsigned to = from + len;
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t index = pos >> PAGE_SHIFT;
 	int ret;
 
 	_enter("{%x:%u},{%lx},%u,%u",
@@ -151,8 +151,8 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
 	*pagep = page;
 	/* page won't leak in error case: it eventually gets cleaned off LRU */
 
-	if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) {
-		ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page);
+	if (!PageUptodate(page) && len != PAGE_SIZE) {
+		ret = afs_fill_page(vnode, key, index << PAGE_SHIFT, page);
 		if (ret < 0) {
 			kfree(candidate);
 			_leave(" = %d [prep]", ret);
@@ -266,7 +266,7 @@ int afs_write_end(struct file *file, struct address_space *mapping,
 	if (PageDirty(page))
 		_debug("dirtied");
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	return copied;
 }
@@ -480,7 +480,7 @@ static int afs_writepages_region(struct address_space *mapping,
 
 		if (page->index > end) {
 			*_next = index;
-			page_cache_release(page);
+			put_page(page);
 			_leave(" = 0 [%lx]", *_next);
 			return 0;
 		}
@@ -494,7 +494,7 @@ static int afs_writepages_region(struct address_space *mapping,
 
 		if (page->mapping != mapping) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			continue;
 		}
 
@@ -515,7 +515,7 @@ static int afs_writepages_region(struct address_space *mapping,
 
 		ret = afs_write_back_from_locked_page(wb, page);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		if (ret < 0) {
 			_leave(" = %d", ret);
 			return ret;
@@ -551,13 +551,13 @@ int afs_writepages(struct address_space *mapping,
 						    &next);
 		mapping->writeback_index = next;
 	} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
-		end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT);
+		end = (pgoff_t)(LLONG_MAX >> PAGE_SHIFT);
 		ret = afs_writepages_region(mapping, wbc, 0, end, &next);
 		if (wbc->nr_to_write > 0)
 			mapping->writeback_index = next;
 	} else {
-		start = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		start = wbc->range_start >> PAGE_SHIFT;
+		end = wbc->range_end >> PAGE_SHIFT;
 		ret = afs_writepages_region(mapping, wbc, start, end, &next);
 	}
 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7d914c67a9d0..81381cc0dd17 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2292,7 +2292,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 				void *kaddr = kmap(page);
 				stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
 				kunmap(page);
-				page_cache_release(page);
+				put_page(page);
 			} else
 				stop = !dump_skip(cprm, PAGE_SIZE);
 			if (stop)
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index b1adb92e69de..083ea2bc60ab 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1533,7 +1533,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm)
 				void *kaddr = kmap(page);
 				res = dump_emit(cprm, kaddr, PAGE_SIZE);
 				kunmap(page);
-				page_cache_release(page);
+				put_page(page);
 			} else {
 				res = dump_skip(cprm, PAGE_SIZE);
 			}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3172c4e2f502..20a2c02b77c4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -331,7 +331,7 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
 	ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	return ret;
 }
@@ -1149,7 +1149,7 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 	inode_lock(bdev->bd_inode);
 	i_size_write(bdev->bd_inode, size);
 	inode_unlock(bdev->bd_inode);
-	while (bsize < PAGE_CACHE_SIZE) {
+	while (bsize < PAGE_SIZE) {
 		if (size & bsize)
 			break;
 		bsize <<= 1;
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index e34a71b3e225..866789ede4cb 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -757,7 +757,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 			BUG_ON(NULL == l);
 
 			ret = btrfsic_read_block(state, &tmp_next_block_ctx);
-			if (ret < (int)PAGE_CACHE_SIZE) {
+			if (ret < (int)PAGE_SIZE) {
 				printk(KERN_INFO
 				       "btrfsic: read @logical %llu failed!\n",
 				       tmp_next_block_ctx.start);
@@ -1231,15 +1231,15 @@ static void btrfsic_read_from_block_data(
 	size_t offset_in_page;
 	char *kaddr;
 	char *dst = (char *)dstv;
-	size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT;
+	size_t start_offset = block_ctx->start & ((u64)PAGE_SIZE - 1);
+	unsigned long i = (start_offset + offset) >> PAGE_SHIFT;
 
 	WARN_ON(offset + len > block_ctx->len);
-	offset_in_page = (start_offset + offset) & (PAGE_CACHE_SIZE - 1);
+	offset_in_page = (start_offset + offset) & (PAGE_SIZE - 1);
 
 	while (len > 0) {
-		cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
-		BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE));
+		cur = min(len, ((size_t)PAGE_SIZE - offset_in_page));
+		BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE));
 		kaddr = block_ctx->datav[i];
 		memcpy(dst, kaddr + offset_in_page, cur);
 
@@ -1605,8 +1605,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
 
 		BUG_ON(!block_ctx->datav);
 		BUG_ON(!block_ctx->pagev);
-		num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
-			    PAGE_CACHE_SHIFT;
+		num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
+			    PAGE_SHIFT;
 		while (num_pages > 0) {
 			num_pages--;
 			if (block_ctx->datav[num_pages]) {
@@ -1637,15 +1637,15 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 	BUG_ON(block_ctx->datav);
 	BUG_ON(block_ctx->pagev);
 	BUG_ON(block_ctx->mem_to_free);
-	if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
+	if (block_ctx->dev_bytenr & ((u64)PAGE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: read_block() with unaligned bytenr %llu\n",
 		       block_ctx->dev_bytenr);
 		return -1;
 	}
 
-	num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
-		    PAGE_CACHE_SHIFT;
+	num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
+		    PAGE_SHIFT;
 	block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
 					  sizeof(*block_ctx->pagev)) *
 					 num_pages, GFP_NOFS);
@@ -1676,8 +1676,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 
 		for (j = i; j < num_pages; j++) {
 			ret = bio_add_page(bio, block_ctx->pagev[j],
-					   PAGE_CACHE_SIZE, 0);
-			if (PAGE_CACHE_SIZE != ret)
+					   PAGE_SIZE, 0);
+			if (PAGE_SIZE != ret)
 				break;
 		}
 		if (j == i) {
@@ -1693,7 +1693,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 			return -1;
 		}
 		bio_put(bio);
-		dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
+		dev_bytenr += (j - i) * PAGE_SIZE;
 		i = j;
 	}
 	for (i = 0; i < num_pages; i++) {
@@ -1769,9 +1769,9 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
 	u32 crc = ~(u32)0;
 	unsigned int i;
 
-	if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
+	if (num_pages * PAGE_SIZE < state->metablock_size)
 		return 1; /* not metadata */
-	num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
+	num_pages = state->metablock_size >> PAGE_SHIFT;
 	h = (struct btrfs_header *)datav[0];
 
 	if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1779,8 +1779,8 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
 
 	for (i = 0; i < num_pages; i++) {
 		u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
-		size_t sublen = i ? PAGE_CACHE_SIZE :
-				    (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
+		size_t sublen = i ? PAGE_SIZE :
+				    (PAGE_SIZE - BTRFS_CSUM_SIZE);
 
 		crc = btrfs_crc32c(crc, data, sublen);
 	}
@@ -1826,14 +1826,14 @@ again:
 		if (block->is_superblock) {
 			bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
 						    mapped_datav[0]);
-			if (num_pages * PAGE_CACHE_SIZE <
+			if (num_pages * PAGE_SIZE <
 			    BTRFS_SUPER_INFO_SIZE) {
 				printk(KERN_INFO
 				       "btrfsic: cannot work with too short bios!\n");
 				return;
 			}
 			is_metadata = 1;
-			BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
+			BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_SIZE - 1));
 			processed_len = BTRFS_SUPER_INFO_SIZE;
 			if (state->print_mask &
 			    BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
@@ -1844,7 +1844,7 @@ again:
 		}
 		if (is_metadata) {
 			if (!block->is_superblock) {
-				if (num_pages * PAGE_CACHE_SIZE <
+				if (num_pages * PAGE_SIZE <
 				    state->metablock_size) {
 					printk(KERN_INFO
 					       "btrfsic: cannot work with too short bios!\n");
@@ -1880,7 +1880,7 @@ again:
 			}
 			block->logical_bytenr = bytenr;
 		} else {
-			if (num_pages * PAGE_CACHE_SIZE <
+			if (num_pages * PAGE_SIZE <
 			    state->datablock_size) {
 				printk(KERN_INFO
 				       "btrfsic: cannot work with too short bios!\n");
@@ -2013,7 +2013,7 @@ again:
 			block->logical_bytenr = bytenr;
 			block->is_metadata = 1;
 			if (block->is_superblock) {
-				BUG_ON(PAGE_CACHE_SIZE !=
+				BUG_ON(PAGE_SIZE !=
 				       BTRFS_SUPER_INFO_SIZE);
 				ret = btrfsic_process_written_superblock(
 						state,
@@ -2172,8 +2172,8 @@ again:
 continue_loop:
 	BUG_ON(!processed_len);
 	dev_bytenr += processed_len;
-	mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
-	num_pages -= processed_len >> PAGE_CACHE_SHIFT;
+	mapped_datav += processed_len >> PAGE_SHIFT;
+	num_pages -= processed_len >> PAGE_SHIFT;
 	goto again;
 }
 
@@ -2954,7 +2954,7 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
 			goto leave;
 		cur_bytenr = dev_bytenr;
 		for (i = 0; i < bio->bi_vcnt; i++) {
-			BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
+			BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_SIZE);
 			mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
 			if (!mapped_datav[i]) {
 				while (i > 0) {
@@ -3037,16 +3037,16 @@ int btrfsic_mount(struct btrfs_root *root,
 	struct list_head *dev_head = &fs_devices->devices;
 	struct btrfs_device *device;
 
-	if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
+	if (root->nodesize & ((u64)PAGE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
-		       root->nodesize, PAGE_CACHE_SIZE);
+		       root->nodesize, PAGE_SIZE);
 		return -1;
 	}
-	if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+	if (root->sectorsize & ((u64)PAGE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
-		       root->sectorsize, PAGE_CACHE_SIZE);
+		       root->sectorsize, PAGE_SIZE);
 		return -1;
 	}
 	state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 3346cd8f9910..ff61a41ac90b 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -119,7 +119,7 @@ static int check_compressed_csum(struct inode *inode,
 		csum = ~(u32)0;
 
 		kaddr = kmap_atomic(page);
-		csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE);
+		csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE);
 		btrfs_csum_final(csum, (char *)&csum);
 		kunmap_atomic(kaddr);
 
@@ -190,7 +190,7 @@ csum_failed:
 	for (index = 0; index < cb->nr_pages; index++) {
 		page = cb->compressed_pages[index];
 		page->mapping = NULL;
-		page_cache_release(page);
+		put_page(page);
 	}
 
 	/* do io completion on the original bio */
@@ -224,8 +224,8 @@ out:
 static noinline void end_compressed_writeback(struct inode *inode,
 					      const struct compressed_bio *cb)
 {
-	unsigned long index = cb->start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT;
+	unsigned long index = cb->start >> PAGE_SHIFT;
+	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
 	struct page *pages[16];
 	unsigned long nr_pages = end_index - index + 1;
 	int i;
@@ -247,7 +247,7 @@ static noinline void end_compressed_writeback(struct inode *inode,
 			if (cb->errors)
 				SetPageError(pages[i]);
 			end_page_writeback(pages[i]);
-			page_cache_release(pages[i]);
+			put_page(pages[i]);
 		}
 		nr_pages -= ret;
 		index += ret;
@@ -304,7 +304,7 @@ static void end_compressed_bio_write(struct bio *bio)
 	for (index = 0; index < cb->nr_pages; index++) {
 		page = cb->compressed_pages[index];
 		page->mapping = NULL;
-		page_cache_release(page);
+		put_page(page);
 	}
 
 	/* finally free the cb struct */
@@ -341,7 +341,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	int ret;
 	int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
-	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
+	WARN_ON(start & ((u64)PAGE_SIZE - 1));
 	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
 	if (!cb)
 		return -ENOMEM;
@@ -374,14 +374,14 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		page->mapping = inode->i_mapping;
 		if (bio->bi_iter.bi_size)
 			ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
-							   PAGE_CACHE_SIZE,
+							   PAGE_SIZE,
 							   bio, 0);
 		else
 			ret = 0;
 
 		page->mapping = NULL;
-		if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) <
-		    PAGE_CACHE_SIZE) {
+		if (ret || bio_add_page(bio, page, PAGE_SIZE, 0) <
+		    PAGE_SIZE) {
 			bio_get(bio);
 
 			/*
@@ -410,15 +410,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 			BUG_ON(!bio);
 			bio->bi_private = cb;
 			bio->bi_end_io = end_compressed_bio_write;
-			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+			bio_add_page(bio, page, PAGE_SIZE, 0);
 		}
-		if (bytes_left < PAGE_CACHE_SIZE) {
+		if (bytes_left < PAGE_SIZE) {
 			btrfs_info(BTRFS_I(inode)->root->fs_info,
 					"bytes left %lu compress len %lu nr %lu",
 			       bytes_left, cb->compressed_len, cb->nr_pages);
 		}
-		bytes_left -= PAGE_CACHE_SIZE;
-		first_byte += PAGE_CACHE_SIZE;
+		bytes_left -= PAGE_SIZE;
+		first_byte += PAGE_SIZE;
 		cond_resched();
 	}
 	bio_get(bio);
@@ -457,17 +457,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 	int misses = 0;
 
 	page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page;
-	last_offset = (page_offset(page) + PAGE_CACHE_SIZE);
+	last_offset = (page_offset(page) + PAGE_SIZE);
 	em_tree = &BTRFS_I(inode)->extent_tree;
 	tree = &BTRFS_I(inode)->io_tree;
 
 	if (isize == 0)
 		return 0;
 
-	end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+	end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
 	while (last_offset < compressed_end) {
-		pg_index = last_offset >> PAGE_CACHE_SHIFT;
+		pg_index = last_offset >> PAGE_SHIFT;
 
 		if (pg_index > end_index)
 			break;
@@ -488,11 +488,11 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			break;
 
 		if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
-			page_cache_release(page);
+			put_page(page);
 			goto next;
 		}
 
-		end = last_offset + PAGE_CACHE_SIZE - 1;
+		end = last_offset + PAGE_SIZE - 1;
 		/*
 		 * at this point, we have a locked page in the page cache
 		 * for these bytes in the file.  But, we have to make
@@ -502,27 +502,27 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		lock_extent(tree, last_offset, end);
 		read_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, last_offset,
-					   PAGE_CACHE_SIZE);
+					   PAGE_SIZE);
 		read_unlock(&em_tree->lock);
 
 		if (!em || last_offset < em->start ||
-		    (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
+		    (last_offset + PAGE_SIZE > extent_map_end(em)) ||
 		    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
 			free_extent_map(em);
 			unlock_extent(tree, last_offset, end);
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			break;
 		}
 		free_extent_map(em);
 
 		if (page->index == end_index) {
 			char *userpage;
-			size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1);
+			size_t zero_offset = isize & (PAGE_SIZE - 1);
 
 			if (zero_offset) {
 				int zeros;
-				zeros = PAGE_CACHE_SIZE - zero_offset;
+				zeros = PAGE_SIZE - zero_offset;
 				userpage = kmap_atomic(page);
 				memset(userpage + zero_offset, 0, zeros);
 				flush_dcache_page(page);
@@ -531,19 +531,19 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		}
 
 		ret = bio_add_page(cb->orig_bio, page,
-				   PAGE_CACHE_SIZE, 0);
+				   PAGE_SIZE, 0);
 
-		if (ret == PAGE_CACHE_SIZE) {
+		if (ret == PAGE_SIZE) {
 			nr_pages++;
-			page_cache_release(page);
+			put_page(page);
 		} else {
 			unlock_extent(tree, last_offset, end);
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			break;
 		}
 next:
-		last_offset += PAGE_CACHE_SIZE;
+		last_offset += PAGE_SIZE;
 	}
 	return 0;
 }
@@ -567,7 +567,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	struct extent_map_tree *em_tree;
 	struct compressed_bio *cb;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+	unsigned long uncompressed_len = bio->bi_vcnt * PAGE_SIZE;
 	unsigned long compressed_len;
 	unsigned long nr_pages;
 	unsigned long pg_index;
@@ -589,7 +589,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree,
 				   page_offset(bio->bi_io_vec->bv_page),
-				   PAGE_CACHE_SIZE);
+				   PAGE_SIZE);
 	read_unlock(&em_tree->lock);
 	if (!em)
 		return -EIO;
@@ -617,7 +617,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	cb->compress_type = extent_compress_type(bio_flags);
 	cb->orig_bio = bio;
 
-	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
+	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
 	cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
 				       GFP_NOFS);
 	if (!cb->compressed_pages)
@@ -640,7 +640,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	add_ra_bio_pages(inode, em_start + em_len, cb);
 
 	/* include any pages we added in add_ra-bio_pages */
-	uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
+	uncompressed_len = bio->bi_vcnt * PAGE_SIZE;
 	cb->len = uncompressed_len;
 
 	comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
@@ -653,18 +653,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
 		page = cb->compressed_pages[pg_index];
 		page->mapping = inode->i_mapping;
-		page->index = em_start >> PAGE_CACHE_SHIFT;
+		page->index = em_start >> PAGE_SHIFT;
 
 		if (comp_bio->bi_iter.bi_size)
 			ret = tree->ops->merge_bio_hook(READ, page, 0,
-							PAGE_CACHE_SIZE,
+							PAGE_SIZE,
 							comp_bio, 0);
 		else
 			ret = 0;
 
 		page->mapping = NULL;
-		if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) <
-		    PAGE_CACHE_SIZE) {
+		if (ret || bio_add_page(comp_bio, page, PAGE_SIZE, 0) <
+		    PAGE_SIZE) {
 			bio_get(comp_bio);
 
 			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
@@ -702,9 +702,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			comp_bio->bi_private = cb;
 			comp_bio->bi_end_io = end_compressed_bio_read;
 
-			bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0);
+			bio_add_page(comp_bio, page, PAGE_SIZE, 0);
 		}
-		cur_disk_byte += PAGE_CACHE_SIZE;
+		cur_disk_byte += PAGE_SIZE;
 	}
 	bio_get(comp_bio);
 
@@ -1013,8 +1013,8 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
 
 	/* copy bytes from the working buffer into the pages */
 	while (working_bytes > 0) {
-		bytes = min(PAGE_CACHE_SIZE - *pg_offset,
-			    PAGE_CACHE_SIZE - buf_offset);
+		bytes = min(PAGE_SIZE - *pg_offset,
+			    PAGE_SIZE - buf_offset);
 		bytes = min(bytes, working_bytes);
 		kaddr = kmap_atomic(page_out);
 		memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
@@ -1027,7 +1027,7 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
 		current_buf_start += bytes;
 
 		/* check if we need to pick another page */
-		if (*pg_offset == PAGE_CACHE_SIZE) {
+		if (*pg_offset == PAGE_SIZE) {
 			(*pg_index)++;
 			if (*pg_index >= vcnt)
 				return 0;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4b02591b0301..942af3d2885f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1055,7 +1055,7 @@ static void btree_invalidatepage(struct page *page, unsigned int offset,
 			   (unsigned long long)page_offset(page));
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
-		page_cache_release(page);
+		put_page(page);
 	}
 }
 
@@ -1757,7 +1757,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 	if (err)
 		return err;
 
-	bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
+	bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
 	bdi->congested_fn	= btrfs_congested_fn;
 	bdi->congested_data	= info;
 	bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
@@ -2537,7 +2537,7 @@ int open_ctree(struct super_block *sb,
 		err = ret;
 		goto fail_bdi;
 	}
-	fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
+	fs_info->dirty_metadata_batch = PAGE_SIZE *
 					(1 + ilog2(nr_cpu_ids));
 
 	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
@@ -2782,7 +2782,7 @@ int open_ctree(struct super_block *sb,
 	 * flag our filesystem as having big metadata blocks if
 	 * they are bigger than the page size
 	 */
-	if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
+	if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
 		if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
 			printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
 		features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
@@ -2832,7 +2832,7 @@ int open_ctree(struct super_block *sb,
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
-				    SZ_4M / PAGE_CACHE_SIZE);
+				    SZ_4M / PAGE_SIZE);
 
 	tree_root->nodesize = nodesize;
 	tree_root->sectorsize = sectorsize;
@@ -4071,9 +4071,9 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 		ret = -EINVAL;
 	}
 	/* Only PAGE SIZE is supported yet */
-	if (sectorsize != PAGE_CACHE_SIZE) {
+	if (sectorsize != PAGE_SIZE) {
 		printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n",
-				sectorsize, PAGE_CACHE_SIZE);
+				sectorsize, PAGE_SIZE);
 		ret = -EINVAL;
 	}
 	if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 53e12977bfd0..ce114ba9780a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3452,7 +3452,7 @@ again:
 		num_pages = 1;
 
 	num_pages *= 16;
-	num_pages *= PAGE_CACHE_SIZE;
+	num_pages *= PAGE_SIZE;
 
 	ret = btrfs_check_data_free_space(inode, 0, num_pages);
 	if (ret)
@@ -4639,7 +4639,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	loops = 0;
 	while (delalloc_bytes && loops < 3) {
 		max_reclaim = min(delalloc_bytes, to_reclaim);
-		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
+		nr_pages = max_reclaim >> PAGE_SHIFT;
 		btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
 		/*
 		 * We need to wait for the async pages to actually start before
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 76a0c8597d98..93d696d248d9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1363,23 +1363,23 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 
 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
 {
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long index = start >> PAGE_SHIFT;
+	unsigned long end_index = end >> PAGE_SHIFT;
 	struct page *page;
 
 	while (index <= end_index) {
 		page = find_get_page(inode->i_mapping, index);
 		BUG_ON(!page); /* Pages should be in the extent_io_tree */
 		clear_page_dirty_for_io(page);
-		page_cache_release(page);
+		put_page(page);
 		index++;
 	}
 }
 
 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
 {
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long index = start >> PAGE_SHIFT;
+	unsigned long end_index = end >> PAGE_SHIFT;
 	struct page *page;
 
 	while (index <= end_index) {
@@ -1387,7 +1387,7 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
 		BUG_ON(!page); /* Pages should be in the extent_io_tree */
 		__set_page_dirty_nobuffers(page);
 		account_page_redirty(page);
-		page_cache_release(page);
+		put_page(page);
 		index++;
 	}
 }
@@ -1397,15 +1397,15 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
  */
 static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 {
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long index = start >> PAGE_SHIFT;
+	unsigned long end_index = end >> PAGE_SHIFT;
 	struct page *page;
 
 	while (index <= end_index) {
 		page = find_get_page(tree->mapping, index);
 		BUG_ON(!page); /* Pages should be in the extent_io_tree */
 		set_page_writeback(page);
-		page_cache_release(page);
+		put_page(page);
 		index++;
 	}
 }
@@ -1556,8 +1556,8 @@ static noinline void __unlock_for_delalloc(struct inode *inode,
 {
 	int ret;
 	struct page *pages[16];
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long index = start >> PAGE_SHIFT;
+	unsigned long end_index = end >> PAGE_SHIFT;
 	unsigned long nr_pages = end_index - index + 1;
 	int i;
 
@@ -1571,7 +1571,7 @@ static noinline void __unlock_for_delalloc(struct inode *inode,
 		for (i = 0; i < ret; i++) {
 			if (pages[i] != locked_page)
 				unlock_page(pages[i]);
-			page_cache_release(pages[i]);
+			put_page(pages[i]);
 		}
 		nr_pages -= ret;
 		index += ret;
@@ -1584,9 +1584,9 @@ static noinline int lock_delalloc_pages(struct inode *inode,
 					u64 delalloc_start,
 					u64 delalloc_end)
 {
-	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+	unsigned long index = delalloc_start >> PAGE_SHIFT;
 	unsigned long start_index = index;
-	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = delalloc_end >> PAGE_SHIFT;
 	unsigned long pages_locked = 0;
 	struct page *pages[16];
 	unsigned long nrpages;
@@ -1619,11 +1619,11 @@ static noinline int lock_delalloc_pages(struct inode *inode,
 				    pages[i]->mapping != inode->i_mapping) {
 					ret = -EAGAIN;
 					unlock_page(pages[i]);
-					page_cache_release(pages[i]);
+					put_page(pages[i]);
 					goto done;
 				}
 			}
-			page_cache_release(pages[i]);
+			put_page(pages[i]);
 			pages_locked++;
 		}
 		nrpages -= ret;
@@ -1636,7 +1636,7 @@ done:
 		__unlock_for_delalloc(inode, locked_page,
 			      delalloc_start,
 			      ((u64)(start_index + pages_locked - 1)) <<
-			      PAGE_CACHE_SHIFT);
+			      PAGE_SHIFT);
 	}
 	return ret;
 }
@@ -1696,7 +1696,7 @@ again:
 		free_extent_state(cached_state);
 		cached_state = NULL;
 		if (!loops) {
-			max_bytes = PAGE_CACHE_SIZE;
+			max_bytes = PAGE_SIZE;
 			loops = 1;
 			goto again;
 		} else {
@@ -1735,8 +1735,8 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 	int ret;
 	struct page *pages[16];
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long index = start >> PAGE_SHIFT;
+	unsigned long end_index = end >> PAGE_SHIFT;
 	unsigned long nr_pages = end_index - index + 1;
 	int i;
 
@@ -1757,7 +1757,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
 				SetPagePrivate2(pages[i]);
 
 			if (pages[i] == locked_page) {
-				page_cache_release(pages[i]);
+				put_page(pages[i]);
 				continue;
 			}
 			if (page_ops & PAGE_CLEAR_DIRTY)
@@ -1770,7 +1770,7 @@ void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
 				end_page_writeback(pages[i]);
 			if (page_ops & PAGE_UNLOCK)
 				unlock_page(pages[i]);
-			page_cache_release(pages[i]);
+			put_page(pages[i]);
 		}
 		nr_pages -= ret;
 		index += ret;
@@ -1961,7 +1961,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
 {
 	u64 start = page_offset(page);
-	u64 end = start + PAGE_CACHE_SIZE - 1;
+	u64 end = start + PAGE_SIZE - 1;
 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
 		SetPageUptodate(page);
 }
@@ -2071,11 +2071,11 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 		struct page *p = eb->pages[i];
 
 		ret = repair_io_failure(root->fs_info->btree_inode, start,
-					PAGE_CACHE_SIZE, start, p,
+					PAGE_SIZE, start, p,
 					start - page_offset(p), mirror_num);
 		if (ret)
 			break;
-		start += PAGE_CACHE_SIZE;
+		start += PAGE_SIZE;
 	}
 
 	return ret;
@@ -2466,8 +2466,8 @@ static void end_bio_extent_writepage(struct bio *bio)
 		 * advance bv_offset and adjust bv_len to compensate.
 		 * Print a warning for nonzero offsets, and an error
 		 * if they don't add up to a full page.  */
-		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
-			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
+		if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
+			if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
 				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
 				   "partial page write in btrfs with offset %u and length %u",
 					bvec->bv_offset, bvec->bv_len);
@@ -2541,8 +2541,8 @@ static void end_bio_extent_readpage(struct bio *bio)
 		 * advance bv_offset and adjust bv_len to compensate.
 		 * Print a warning for nonzero offsets, and an error
 		 * if they don't add up to a full page.  */
-		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
-			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
+		if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) {
+			if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
 				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
 				   "partial page read in btrfs with offset %u and length %u",
 					bvec->bv_offset, bvec->bv_len);
@@ -2598,13 +2598,13 @@ static void end_bio_extent_readpage(struct bio *bio)
 readpage_ok:
 		if (likely(uptodate)) {
 			loff_t i_size = i_size_read(inode);
-			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+			pgoff_t end_index = i_size >> PAGE_SHIFT;
 			unsigned off;
 
 			/* Zero out the end if this page straddles i_size */
-			off = i_size & (PAGE_CACHE_SIZE-1);
+			off = i_size & (PAGE_SIZE-1);
 			if (page->index == end_index && off)
-				zero_user_segment(page, off, PAGE_CACHE_SIZE);
+				zero_user_segment(page, off, PAGE_SIZE);
 			SetPageUptodate(page);
 		} else {
 			ClearPageUptodate(page);
@@ -2768,7 +2768,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 	struct bio *bio;
 	int contig = 0;
 	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
-	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
+	size_t page_size = min_t(size_t, size, PAGE_SIZE);
 
 	if (bio_ret && *bio_ret) {
 		bio = *bio_ret;
@@ -2821,7 +2821,7 @@ static void attach_extent_buffer_page(struct extent_buffer *eb,
 {
 	if (!PagePrivate(page)) {
 		SetPagePrivate(page);
-		page_cache_get(page);
+		get_page(page);
 		set_page_private(page, (unsigned long)eb);
 	} else {
 		WARN_ON(page->private != (unsigned long)eb);
@@ -2832,7 +2832,7 @@ void set_page_extent_mapped(struct page *page)
 {
 	if (!PagePrivate(page)) {
 		SetPagePrivate(page);
-		page_cache_get(page);
+		get_page(page);
 		set_page_private(page, EXTENT_PAGE_PRIVATE);
 	}
 }
@@ -2880,7 +2880,7 @@ static int __do_readpage(struct extent_io_tree *tree,
 {
 	struct inode *inode = page->mapping->host;
 	u64 start = page_offset(page);
-	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 page_end = start + PAGE_SIZE - 1;
 	u64 end;
 	u64 cur = start;
 	u64 extent_offset;
@@ -2909,12 +2909,12 @@ static int __do_readpage(struct extent_io_tree *tree,
 		}
 	}
 
-	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+	if (page->index == last_byte >> PAGE_SHIFT) {
 		char *userpage;
-		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+		size_t zero_offset = last_byte & (PAGE_SIZE - 1);
 
 		if (zero_offset) {
-			iosize = PAGE_CACHE_SIZE - zero_offset;
+			iosize = PAGE_SIZE - zero_offset;
 			userpage = kmap_atomic(page);
 			memset(userpage + zero_offset, 0, iosize);
 			flush_dcache_page(page);
@@ -2922,14 +2922,14 @@ static int __do_readpage(struct extent_io_tree *tree,
 		}
 	}
 	while (cur <= end) {
-		unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+		unsigned long pnr = (last_byte >> PAGE_SHIFT) + 1;
 		bool force_bio_submit = false;
 
 		if (cur >= last_byte) {
 			char *userpage;
 			struct extent_state *cached = NULL;
 
-			iosize = PAGE_CACHE_SIZE - pg_offset;
+			iosize = PAGE_SIZE - pg_offset;
 			userpage = kmap_atomic(page);
 			memset(userpage + pg_offset, 0, iosize);
 			flush_dcache_page(page);
@@ -3112,7 +3112,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
 	for (index = 0; index < nr_pages; index++) {
 		__do_readpage(tree, pages[index], get_extent, em_cached, bio,
 			      mirror_num, bio_flags, rw, prev_em_start);
-		page_cache_release(pages[index]);
+		put_page(pages[index]);
 	}
 }
 
@@ -3134,10 +3134,10 @@ static void __extent_readpages(struct extent_io_tree *tree,
 		page_start = page_offset(pages[index]);
 		if (!end) {
 			start = page_start;
-			end = start + PAGE_CACHE_SIZE - 1;
+			end = start + PAGE_SIZE - 1;
 			first_index = index;
 		} else if (end + 1 == page_start) {
-			end += PAGE_CACHE_SIZE;
+			end += PAGE_SIZE;
 		} else {
 			__do_contiguous_readpages(tree, &pages[first_index],
 						  index - first_index, start,
@@ -3145,7 +3145,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
 						  bio, mirror_num, bio_flags,
 						  rw, prev_em_start);
 			start = page_start;
-			end = start + PAGE_CACHE_SIZE - 1;
+			end = start + PAGE_SIZE - 1;
 			first_index = index;
 		}
 	}
@@ -3167,13 +3167,13 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 	struct inode *inode = page->mapping->host;
 	struct btrfs_ordered_extent *ordered;
 	u64 start = page_offset(page);
-	u64 end = start + PAGE_CACHE_SIZE - 1;
+	u64 end = start + PAGE_SIZE - 1;
 	int ret;
 
 	while (1) {
 		lock_extent(tree, start, end);
 		ordered = btrfs_lookup_ordered_range(inode, start,
-						PAGE_CACHE_SIZE);
+						PAGE_SIZE);
 		if (!ordered)
 			break;
 		unlock_extent(tree, start, end);
@@ -3227,7 +3227,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
 			      unsigned long *nr_written)
 {
 	struct extent_io_tree *tree = epd->tree;
-	u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+	u64 page_end = delalloc_start + PAGE_SIZE - 1;
 	u64 nr_delalloc;
 	u64 delalloc_to_write = 0;
 	u64 delalloc_end = 0;
@@ -3269,8 +3269,8 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
 		 * PAGE_CACHE_SIZE
 		 */
 		delalloc_to_write += (delalloc_end - delalloc_start +
-				      PAGE_CACHE_SIZE) >>
-				      PAGE_CACHE_SHIFT;
+				      PAGE_SIZE) >>
+				      PAGE_SHIFT;
 		delalloc_start = delalloc_end + 1;
 	}
 	if (wbc->nr_to_write < delalloc_to_write) {
@@ -3319,7 +3319,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
 {
 	struct extent_io_tree *tree = epd->tree;
 	u64 start = page_offset(page);
-	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 page_end = start + PAGE_SIZE - 1;
 	u64 end;
 	u64 cur = start;
 	u64 extent_offset;
@@ -3434,7 +3434,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
 		if (ret) {
 			SetPageError(page);
 		} else {
-			unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
+			unsigned long max_nr = (i_size >> PAGE_SHIFT) + 1;
 
 			set_range_writeback(tree, cur, cur + iosize - 1);
 			if (!PageWriteback(page)) {
@@ -3477,12 +3477,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	struct inode *inode = page->mapping->host;
 	struct extent_page_data *epd = data;
 	u64 start = page_offset(page);
-	u64 page_end = start + PAGE_CACHE_SIZE - 1;
+	u64 page_end = start + PAGE_SIZE - 1;
 	int ret;
 	int nr = 0;
 	size_t pg_offset = 0;
 	loff_t i_size = i_size_read(inode);
-	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = i_size >> PAGE_SHIFT;
 	int write_flags;
 	unsigned long nr_written = 0;
 
@@ -3497,10 +3497,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 	ClearPageError(page);
 
-	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
+	pg_offset = i_size & (PAGE_SIZE - 1);
 	if (page->index > end_index ||
 	   (page->index == end_index && !pg_offset)) {
-		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+		page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
 		unlock_page(page);
 		return 0;
 	}
@@ -3510,7 +3510,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 		userpage = kmap_atomic(page);
 		memset(userpage + pg_offset, 0,
-		       PAGE_CACHE_SIZE - pg_offset);
+		       PAGE_SIZE - pg_offset);
 		kunmap_atomic(userpage);
 		flush_dcache_page(page);
 	}
@@ -3748,7 +3748,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 		clear_page_dirty_for_io(p);
 		set_page_writeback(p);
 		ret = submit_extent_page(rw, tree, wbc, p, offset >> 9,
-					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
+					 PAGE_SIZE, 0, bdev, &epd->bio,
 					 -1, end_bio_extent_buffer_writepage,
 					 0, epd->bio_flags, bio_flags, false);
 		epd->bio_flags = bio_flags;
@@ -3760,7 +3760,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 			ret = -EIO;
 			break;
 		}
-		offset += PAGE_CACHE_SIZE;
+		offset += PAGE_SIZE;
 		update_nr_written(p, wbc, 1);
 		unlock_page(p);
 	}
@@ -3804,8 +3804,8 @@ int btree_write_cache_pages(struct address_space *mapping,
 		index = mapping->writeback_index; /* Start from prev offset */
 		end = -1;
 	} else {
-		index = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		index = wbc->range_start >> PAGE_SHIFT;
+		end = wbc->range_end >> PAGE_SHIFT;
 		scanned = 1;
 	}
 	if (wbc->sync_mode == WB_SYNC_ALL)
@@ -3948,8 +3948,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 		index = mapping->writeback_index; /* Start from prev offset */
 		end = -1;
 	} else {
-		index = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		index = wbc->range_start >> PAGE_SHIFT;
+		end = wbc->range_end >> PAGE_SHIFT;
 		scanned = 1;
 	}
 	if (wbc->sync_mode == WB_SYNC_ALL)
@@ -4083,8 +4083,8 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 	int ret = 0;
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
-	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
-		PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = (end - start + PAGE_SIZE) >>
+		PAGE_SHIFT;
 
 	struct extent_page_data epd = {
 		.bio = NULL,
@@ -4102,18 +4102,18 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 	};
 
 	while (start <= end) {
-		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+		page = find_get_page(mapping, start >> PAGE_SHIFT);
 		if (clear_page_dirty_for_io(page))
 			ret = __extent_writepage(page, &wbc_writepages, &epd);
 		else {
 			if (tree->ops && tree->ops->writepage_end_io_hook)
 				tree->ops->writepage_end_io_hook(page, start,
-						 start + PAGE_CACHE_SIZE - 1,
+						 start + PAGE_SIZE - 1,
 						 NULL, 1);
 			unlock_page(page);
 		}
-		page_cache_release(page);
-		start += PAGE_CACHE_SIZE;
+		put_page(page);
+		start += PAGE_SIZE;
 	}
 
 	flush_epd_write_bio(&epd);
@@ -4163,7 +4163,7 @@ int extent_readpages(struct extent_io_tree *tree,
 		list_del(&page->lru);
 		if (add_to_page_cache_lru(page, mapping,
 					page->index, GFP_NOFS)) {
-			page_cache_release(page);
+			put_page(page);
 			continue;
 		}
 
@@ -4197,7 +4197,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 {
 	struct extent_state *cached_state = NULL;
 	u64 start = page_offset(page);
-	u64 end = start + PAGE_CACHE_SIZE - 1;
+	u64 end = start + PAGE_SIZE - 1;
 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
 
 	start += ALIGN(offset, blocksize);
@@ -4223,7 +4223,7 @@ static int try_release_extent_state(struct extent_map_tree *map,
 				    struct page *page, gfp_t mask)
 {
 	u64 start = page_offset(page);
-	u64 end = start + PAGE_CACHE_SIZE - 1;
+	u64 end = start + PAGE_SIZE - 1;
 	int ret = 1;
 
 	if (test_range_bit(tree, start, end,
@@ -4262,7 +4262,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 {
 	struct extent_map *em;
 	u64 start = page_offset(page);
-	u64 end = start + PAGE_CACHE_SIZE - 1;
+	u64 end = start + PAGE_SIZE - 1;
 
 	if (gfpflags_allow_blocking(mask) &&
 	    page->mapping->host->i_size > SZ_16M) {
@@ -4587,14 +4587,14 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
 			ClearPagePrivate(page);
 			set_page_private(page, 0);
 			/* One for the page private */
-			page_cache_release(page);
+			put_page(page);
 		}
 
 		if (mapped)
 			spin_unlock(&page->mapping->private_lock);
 
 		/* One for when we alloced the page */
-		page_cache_release(page);
+		put_page(page);
 	} while (index != 0);
 }
 
@@ -4779,7 +4779,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
 
 	rcu_read_lock();
 	eb = radix_tree_lookup(&fs_info->buffer_radix,
-			       start >> PAGE_CACHE_SHIFT);
+			       start >> PAGE_SHIFT);
 	if (eb && atomic_inc_not_zero(&eb->refs)) {
 		rcu_read_unlock();
 		/*
@@ -4829,7 +4829,7 @@ again:
 		goto free_eb;
 	spin_lock(&fs_info->buffer_lock);
 	ret = radix_tree_insert(&fs_info->buffer_radix,
-				start >> PAGE_CACHE_SHIFT, eb);
+				start >> PAGE_SHIFT, eb);
 	spin_unlock(&fs_info->buffer_lock);
 	radix_tree_preload_end();
 	if (ret == -EEXIST) {
@@ -4862,7 +4862,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	unsigned long len = fs_info->tree_root->nodesize;
 	unsigned long num_pages = num_extent_pages(start, len);
 	unsigned long i;
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long index = start >> PAGE_SHIFT;
 	struct extent_buffer *eb;
 	struct extent_buffer *exists = NULL;
 	struct page *p;
@@ -4896,7 +4896,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 			if (atomic_inc_not_zero(&exists->refs)) {
 				spin_unlock(&mapping->private_lock);
 				unlock_page(p);
-				page_cache_release(p);
+				put_page(p);
 				mark_extent_buffer_accessed(exists, p);
 				goto free_eb;
 			}
@@ -4908,7 +4908,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 			 */
 			ClearPagePrivate(p);
 			WARN_ON(PageDirty(p));
-			page_cache_release(p);
+			put_page(p);
 		}
 		attach_extent_buffer_page(eb, p);
 		spin_unlock(&mapping->private_lock);
@@ -4931,7 +4931,7 @@ again:
 
 	spin_lock(&fs_info->buffer_lock);
 	ret = radix_tree_insert(&fs_info->buffer_radix,
-				start >> PAGE_CACHE_SHIFT, eb);
+				start >> PAGE_SHIFT, eb);
 	spin_unlock(&fs_info->buffer_lock);
 	radix_tree_preload_end();
 	if (ret == -EEXIST) {
@@ -4994,7 +4994,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
 
 			spin_lock(&fs_info->buffer_lock);
 			radix_tree_delete(&fs_info->buffer_radix,
-					  eb->start >> PAGE_CACHE_SHIFT);
+					  eb->start >> PAGE_SHIFT);
 			spin_unlock(&fs_info->buffer_lock);
 		} else {
 			spin_unlock(&eb->refs_lock);
@@ -5168,8 +5168,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 
 	if (start) {
 		WARN_ON(start < eb->start);
-		start_i = (start >> PAGE_CACHE_SHIFT) -
-			(eb->start >> PAGE_CACHE_SHIFT);
+		start_i = (start >> PAGE_SHIFT) -
+			(eb->start >> PAGE_SHIFT);
 	} else {
 		start_i = 0;
 	}
@@ -5252,18 +5252,18 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 	struct page *page;
 	char *kaddr;
 	char *dst = (char *)dstv;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
 
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+	offset = (start_offset + start) & (PAGE_SIZE - 1);
 
 	while (len > 0) {
 		page = eb->pages[i];
 
-		cur = min(len, (PAGE_CACHE_SIZE - offset));
+		cur = min(len, (PAGE_SIZE - offset));
 		kaddr = page_address(page);
 		memcpy(dst, kaddr + offset, cur);
 
@@ -5283,19 +5283,19 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
 	struct page *page;
 	char *kaddr;
 	char __user *dst = (char __user *)dstv;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
 	int ret = 0;
 
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+	offset = (start_offset + start) & (PAGE_SIZE - 1);
 
 	while (len > 0) {
 		page = eb->pages[i];
 
-		cur = min(len, (PAGE_CACHE_SIZE - offset));
+		cur = min(len, (PAGE_SIZE - offset));
 		kaddr = page_address(page);
 		if (copy_to_user(dst, kaddr + offset, cur)) {
 			ret = -EFAULT;
@@ -5316,13 +5316,13 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 			       unsigned long *map_start,
 			       unsigned long *map_len)
 {
-	size_t offset = start & (PAGE_CACHE_SIZE - 1);
+	size_t offset = start & (PAGE_SIZE - 1);
 	char *kaddr;
 	struct page *p;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
 	unsigned long end_i = (start_offset + start + min_len - 1) >>
-		PAGE_CACHE_SHIFT;
+		PAGE_SHIFT;
 
 	if (i != end_i)
 		return -EINVAL;
@@ -5332,7 +5332,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		*map_start = 0;
 	} else {
 		offset = 0;
-		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
+		*map_start = ((u64)i << PAGE_SHIFT) - start_offset;
 	}
 
 	if (start + min_len > eb->len) {
@@ -5345,7 +5345,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 	p = eb->pages[i];
 	kaddr = page_address(p);
 	*map = kaddr + offset;
-	*map_len = PAGE_CACHE_SIZE - offset;
+	*map_len = PAGE_SIZE - offset;
 	return 0;
 }
 
@@ -5358,19 +5358,19 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 	struct page *page;
 	char *kaddr;
 	char *ptr = (char *)ptrv;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
 	int ret = 0;
 
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+	offset = (start_offset + start) & (PAGE_SIZE - 1);
 
 	while (len > 0) {
 		page = eb->pages[i];
 
-		cur = min(len, (PAGE_CACHE_SIZE - offset));
+		cur = min(len, (PAGE_SIZE - offset));
 
 		kaddr = page_address(page);
 		ret = memcmp(ptr, kaddr + offset, cur);
@@ -5393,19 +5393,19 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 	struct page *page;
 	char *kaddr;
 	char *src = (char *)srcv;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
 
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+	offset = (start_offset + start) & (PAGE_SIZE - 1);
 
 	while (len > 0) {
 		page = eb->pages[i];
 		WARN_ON(!PageUptodate(page));
 
-		cur = min(len, PAGE_CACHE_SIZE - offset);
+		cur = min(len, PAGE_SIZE - offset);
 		kaddr = page_address(page);
 		memcpy(kaddr + offset, src, cur);
 
@@ -5423,19 +5423,19 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 	size_t offset;
 	struct page *page;
 	char *kaddr;
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
 
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+	offset = (start_offset + start) & (PAGE_SIZE - 1);
 
 	while (len > 0) {
 		page = eb->pages[i];
 		WARN_ON(!PageUptodate(page));
 
-		cur = min(len, PAGE_CACHE_SIZE - offset);
+		cur = min(len, PAGE_SIZE - offset);
 		kaddr = page_address(page);
 		memset(kaddr + offset, c, cur);
 
@@ -5454,19 +5454,19 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 	size_t offset;
 	struct page *page;
 	char *kaddr;
-	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
-	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
+	size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
+	unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
 
 	WARN_ON(src->len != dst_len);
 
 	offset = (start_offset + dst_offset) &
-		(PAGE_CACHE_SIZE - 1);
+		(PAGE_SIZE - 1);
 
 	while (len > 0) {
 		page = dst->pages[i];
 		WARN_ON(!PageUptodate(page));
 
-		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
+		cur = min(len, (unsigned long)(PAGE_SIZE - offset));
 
 		kaddr = page_address(page);
 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
@@ -5508,7 +5508,7 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb,
 				    unsigned long *page_index,
 				    size_t *page_offset)
 {
-	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
 	size_t byte_offset = BIT_BYTE(nr);
 	size_t offset;
 
@@ -5519,8 +5519,8 @@ static inline void eb_bitmap_offset(struct extent_buffer *eb,
 	 */
 	offset = start_offset + start + byte_offset;
 
-	*page_index = offset >> PAGE_CACHE_SHIFT;
-	*page_offset = offset & (PAGE_CACHE_SIZE - 1);
+	*page_index = offset >> PAGE_SHIFT;
+	*page_offset = offset & (PAGE_SIZE - 1);
 }
 
 /**
@@ -5572,7 +5572,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
 		len -= bits_to_set;
 		bits_to_set = BITS_PER_BYTE;
 		mask_to_set = ~0U;
-		if (++offset >= PAGE_CACHE_SIZE && len > 0) {
+		if (++offset >= PAGE_SIZE && len > 0) {
 			offset = 0;
 			page = eb->pages[++i];
 			WARN_ON(!PageUptodate(page));
@@ -5614,7 +5614,7 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
 		len -= bits_to_clear;
 		bits_to_clear = BITS_PER_BYTE;
 		mask_to_clear = ~0U;
-		if (++offset >= PAGE_CACHE_SIZE && len > 0) {
+		if (++offset >= PAGE_SIZE && len > 0) {
 			offset = 0;
 			page = eb->pages[++i];
 			WARN_ON(!PageUptodate(page));
@@ -5661,7 +5661,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	size_t cur;
 	size_t dst_off_in_page;
 	size_t src_off_in_page;
-	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+	size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
 	unsigned long dst_i;
 	unsigned long src_i;
 
@@ -5680,17 +5680,17 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 
 	while (len > 0) {
 		dst_off_in_page = (start_offset + dst_offset) &
-			(PAGE_CACHE_SIZE - 1);
+			(PAGE_SIZE - 1);
 		src_off_in_page = (start_offset + src_offset) &
-			(PAGE_CACHE_SIZE - 1);
+			(PAGE_SIZE - 1);
 
-		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
-		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
+		dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
+		src_i = (start_offset + src_offset) >> PAGE_SHIFT;
 
-		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
+		cur = min(len, (unsigned long)(PAGE_SIZE -
 					       src_off_in_page));
 		cur = min_t(unsigned long, cur,
-			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
+			(unsigned long)(PAGE_SIZE - dst_off_in_page));
 
 		copy_pages(dst->pages[dst_i], dst->pages[src_i],
 			   dst_off_in_page, src_off_in_page, cur);
@@ -5709,7 +5709,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 	size_t src_off_in_page;
 	unsigned long dst_end = dst_offset + len - 1;
 	unsigned long src_end = src_offset + len - 1;
-	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
+	size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
 	unsigned long dst_i;
 	unsigned long src_i;
 
@@ -5728,13 +5728,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 		return;
 	}
 	while (len > 0) {
-		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
-		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
+		dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
+		src_i = (start_offset + src_end) >> PAGE_SHIFT;
 
 		dst_off_in_page = (start_offset + dst_end) &
-			(PAGE_CACHE_SIZE - 1);
+			(PAGE_SIZE - 1);
 		src_off_in_page = (start_offset + src_end) &
-			(PAGE_CACHE_SIZE - 1);
+			(PAGE_SIZE - 1);
 
 		cur = min_t(unsigned long, len, src_off_in_page + 1);
 		cur = min(cur, dst_off_in_page + 1);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5dbf92e68fbd..b5e0ade90e88 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -120,7 +120,7 @@ struct extent_state {
 };
 
 #define INLINE_EXTENT_BUFFER_PAGES 16
-#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE)
+#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
 struct extent_buffer {
 	u64 start;
 	unsigned long len;
@@ -365,8 +365,8 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
 
 static inline unsigned long num_extent_pages(u64 start, u64 len)
 {
-	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
-		(start >> PAGE_CACHE_SHIFT);
+	return ((start + len + PAGE_SIZE - 1) >> PAGE_SHIFT) -
+		(start >> PAGE_SHIFT);
 }
 
 static inline void extent_buffer_get(struct extent_buffer *eb)
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b5baf5bdc8e1..7a7d6e253cfc 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -32,7 +32,7 @@
 				  size) - 1))
 
 #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
-				       PAGE_CACHE_SIZE))
+				       PAGE_SIZE))
 
 #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
 				   sizeof(struct btrfs_ordered_sum)) / \
@@ -203,7 +203,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 		csum = (u8 *)dst;
 	}
 
-	if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8)
+	if (bio->bi_iter.bi_size > PAGE_SIZE * 8)
 		path->reada = READA_FORWARD;
 
 	WARN_ON(bio->bi_vcnt <= 0);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 15a09cb156ce..cf31a60c6284 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -414,11 +414,11 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
 	size_t copied = 0;
 	size_t total_copied = 0;
 	int pg = 0;
-	int offset = pos & (PAGE_CACHE_SIZE - 1);
+	int offset = pos & (PAGE_SIZE - 1);
 
 	while (write_bytes > 0) {
 		size_t count = min_t(size_t,
-				     PAGE_CACHE_SIZE - offset, write_bytes);
+				     PAGE_SIZE - offset, write_bytes);
 		struct page *page = prepared_pages[pg];
 		/*
 		 * Copy data from userspace to the current page
@@ -448,7 +448,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
 		if (unlikely(copied == 0))
 			break;
 
-		if (copied < PAGE_CACHE_SIZE - offset) {
+		if (copied < PAGE_SIZE - offset) {
 			offset += copied;
 		} else {
 			pg++;
@@ -473,7 +473,7 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 		 */
 		ClearPageChecked(pages[i]);
 		unlock_page(pages[i]);
-		page_cache_release(pages[i]);
+		put_page(pages[i]);
 	}
 }
 
@@ -1297,7 +1297,7 @@ static int prepare_uptodate_page(struct inode *inode,
 {
 	int ret = 0;
 
-	if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
+	if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
 	    !PageUptodate(page)) {
 		ret = btrfs_readpage(NULL, page);
 		if (ret)
@@ -1323,7 +1323,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages,
 				  size_t write_bytes, bool force_uptodate)
 {
 	int i;
-	unsigned long index = pos >> PAGE_CACHE_SHIFT;
+	unsigned long index = pos >> PAGE_SHIFT;
 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 	int err = 0;
 	int faili;
@@ -1345,7 +1345,7 @@ again:
 			err = prepare_uptodate_page(inode, pages[i],
 						    pos + write_bytes, false);
 		if (err) {
-			page_cache_release(pages[i]);
+			put_page(pages[i]);
 			if (err == -EAGAIN) {
 				err = 0;
 				goto again;
@@ -1360,7 +1360,7 @@ again:
 fail:
 	while (faili >= 0) {
 		unlock_page(pages[faili]);
-		page_cache_release(pages[faili]);
+		put_page(pages[faili]);
 		faili--;
 	}
 	return err;
@@ -1408,7 +1408,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
 					     cached_state, GFP_NOFS);
 			for (i = 0; i < num_pages; i++) {
 				unlock_page(pages[i]);
-				page_cache_release(pages[i]);
+				put_page(pages[i]);
 			}
 			btrfs_start_ordered_extent(inode, ordered, 1);
 			btrfs_put_ordered_extent(ordered);
@@ -1497,8 +1497,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	bool force_page_uptodate = false;
 	bool need_unlock;
 
-	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
-			PAGE_CACHE_SIZE / (sizeof(struct page *)));
+	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
+			PAGE_SIZE / (sizeof(struct page *)));
 	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
 	nrptrs = max(nrptrs, 8);
 	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
@@ -1506,13 +1506,13 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		return -ENOMEM;
 
 	while (iov_iter_count(i) > 0) {
-		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+		size_t offset = pos & (PAGE_SIZE - 1);
 		size_t sector_offset;
 		size_t write_bytes = min(iov_iter_count(i),
-					 nrptrs * (size_t)PAGE_CACHE_SIZE -
+					 nrptrs * (size_t)PAGE_SIZE -
 					 offset);
 		size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
-						PAGE_CACHE_SIZE);
+						PAGE_SIZE);
 		size_t reserve_bytes;
 		size_t dirty_pages;
 		size_t copied;
@@ -1547,7 +1547,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 			 * write_bytes, so scale down.
 			 */
 			num_pages = DIV_ROUND_UP(write_bytes + offset,
-						 PAGE_CACHE_SIZE);
+						 PAGE_SIZE);
 			reserve_bytes = round_up(write_bytes + sector_offset,
 					root->sectorsize);
 			goto reserve_metadata;
@@ -1609,7 +1609,7 @@ again:
 		} else {
 			force_page_uptodate = false;
 			dirty_pages = DIV_ROUND_UP(copied + offset,
-						   PAGE_CACHE_SIZE);
+						   PAGE_SIZE);
 		}
 
 		/*
@@ -1641,7 +1641,7 @@ again:
 				u64 __pos;
 
 				__pos = round_down(pos, root->sectorsize) +
-					(dirty_pages << PAGE_CACHE_SHIFT);
+					(dirty_pages << PAGE_SHIFT);
 				btrfs_delalloc_release_space(inode, __pos,
 							     release_bytes);
 			}
@@ -1682,7 +1682,7 @@ again:
 		cond_resched();
 
 		balance_dirty_pages_ratelimited(inode->i_mapping);
-		if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
+		if (dirty_pages < (root->nodesize >> PAGE_SHIFT) + 1)
 			btrfs_btree_balance_dirty(root);
 
 		pos += copied;
@@ -1738,8 +1738,8 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
 		goto out;
 	written += written_buffered;
 	iocb->ki_pos = pos + written_buffered;
-	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
-				 endbyte >> PAGE_CACHE_SHIFT);
+	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
+				 endbyte >> PAGE_SHIFT);
 out:
 	return written ? written : err;
 }
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 8f835bfa1bdd..5e6062c26129 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -29,7 +29,7 @@
 #include "inode-map.h"
 #include "volumes.h"
 
-#define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
+#define BITS_PER_BITMAP		(PAGE_SIZE * 8)
 #define MAX_CACHE_BYTES_PER_GIG	SZ_32K
 
 struct btrfs_trim_range {
@@ -295,7 +295,7 @@ static int readahead_cache(struct inode *inode)
 		return -ENOMEM;
 
 	file_ra_state_init(ra, inode->i_mapping);
-	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+	last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 
 	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
 
@@ -310,14 +310,14 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
 	int num_pages;
 	int check_crcs = 0;
 
-	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
+	num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 
 	if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
 		check_crcs = 1;
 
 	/* Make sure we can fit our crcs into the first page */
 	if (write && check_crcs &&
-	    (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
+	    (num_pages * sizeof(u32)) >= PAGE_SIZE)
 		return -ENOSPC;
 
 	memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
@@ -354,9 +354,9 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
 	io_ctl->page = io_ctl->pages[io_ctl->index++];
 	io_ctl->cur = page_address(io_ctl->page);
 	io_ctl->orig = io_ctl->cur;
-	io_ctl->size = PAGE_CACHE_SIZE;
+	io_ctl->size = PAGE_SIZE;
 	if (clear)
-		memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
+		memset(io_ctl->cur, 0, PAGE_SIZE);
 }
 
 static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
@@ -369,7 +369,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
 		if (io_ctl->pages[i]) {
 			ClearPageChecked(io_ctl->pages[i]);
 			unlock_page(io_ctl->pages[i]);
-			page_cache_release(io_ctl->pages[i]);
+			put_page(io_ctl->pages[i]);
 		}
 	}
 }
@@ -475,7 +475,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
 		offset = sizeof(u32) * io_ctl->num_pages;
 
 	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
-			      PAGE_CACHE_SIZE - offset);
+			      PAGE_SIZE - offset);
 	btrfs_csum_final(crc, (char *)&crc);
 	io_ctl_unmap_page(io_ctl);
 	tmp = page_address(io_ctl->pages[0]);
@@ -503,7 +503,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
 
 	io_ctl_map_page(io_ctl, 0);
 	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
-			      PAGE_CACHE_SIZE - offset);
+			      PAGE_SIZE - offset);
 	btrfs_csum_final(crc, (char *)&crc);
 	if (val != crc) {
 		btrfs_err_rl(io_ctl->root->fs_info,
@@ -561,7 +561,7 @@ static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
 		io_ctl_map_page(io_ctl, 0);
 	}
 
-	memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
+	memcpy(io_ctl->cur, bitmap, PAGE_SIZE);
 	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 	if (io_ctl->index < io_ctl->num_pages)
 		io_ctl_map_page(io_ctl, 0);
@@ -621,7 +621,7 @@ static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
 	if (ret)
 		return ret;
 
-	memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
+	memcpy(entry->bitmap, io_ctl->cur, PAGE_SIZE);
 	io_ctl_unmap_page(io_ctl);
 
 	return 0;
@@ -775,7 +775,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 		} else {
 			ASSERT(num_bitmaps);
 			num_bitmaps--;
-			e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+			e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
 			if (!e->bitmap) {
 				kmem_cache_free(
 					btrfs_free_space_cachep, e);
@@ -1660,7 +1660,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 	 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
 	 * we add more bitmaps.
 	 */
-	bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;
+	bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_SIZE;
 
 	if (bitmap_bytes >= max_bytes) {
 		ctl->extents_thresh = 0;
@@ -2111,7 +2111,7 @@ new_bitmap:
 		}
 
 		/* allocate the bitmap */
-		info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+		info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
 		spin_lock(&ctl->tree_lock);
 		if (!info->bitmap) {
 			ret = -ENOMEM;
@@ -3580,7 +3580,7 @@ again:
 	}
 
 	if (!map) {
-		map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+		map = kzalloc(PAGE_SIZE, GFP_NOFS);
 		if (!map) {
 			kmem_cache_free(btrfs_free_space_cachep, info);
 			return -ENOMEM;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 1f0ec19b23f6..70107f7c9307 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -283,7 +283,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
 }
 
 #define INIT_THRESHOLD	((SZ_32K / 2) / sizeof(struct btrfs_free_space))
-#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+#define INODES_PER_BITMAP (PAGE_SIZE * 8)
 
 /*
  * The goal is to keep the memory used by the free_ino tree won't
@@ -317,7 +317,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 	}
 
 	ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
-				PAGE_CACHE_SIZE / sizeof(*info);
+				PAGE_SIZE / sizeof(*info);
 }
 
 /*
@@ -481,12 +481,12 @@ again:
 
 	spin_lock(&ctl->tree_lock);
 	prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
-	prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE);
-	prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE;
+	prealloc = ALIGN(prealloc, PAGE_SIZE);
+	prealloc += ctl->total_bitmaps * PAGE_SIZE;
 	spin_unlock(&ctl->tree_lock);
 
 	/* Just to make sure we have enough space */
-	prealloc += 8 * PAGE_CACHE_SIZE;
+	prealloc += 8 * PAGE_SIZE;
 
 	ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
 	if (ret)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 41a5688ffdfe..2aaba58b4856 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -194,7 +194,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 		while (compressed_size > 0) {
 			cpage = compressed_pages[i];
 			cur_size = min_t(unsigned long, compressed_size,
-				       PAGE_CACHE_SIZE);
+				       PAGE_SIZE);
 
 			kaddr = kmap_atomic(cpage);
 			write_extent_buffer(leaf, kaddr, ptr, cur_size);
@@ -208,13 +208,13 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 						  compress_type);
 	} else {
 		page = find_get_page(inode->i_mapping,
-				     start >> PAGE_CACHE_SHIFT);
+				     start >> PAGE_SHIFT);
 		btrfs_set_file_extent_compression(leaf, ei, 0);
 		kaddr = kmap_atomic(page);
-		offset = start & (PAGE_CACHE_SIZE - 1);
+		offset = start & (PAGE_SIZE - 1);
 		write_extent_buffer(leaf, kaddr + offset, ptr, size);
 		kunmap_atomic(kaddr);
-		page_cache_release(page);
+		put_page(page);
 	}
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(path);
@@ -322,7 +322,7 @@ out:
 	 * And at reserve time, it's always aligned to page size, so
 	 * just free one page here.
 	 */
-	btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
+	btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
 	btrfs_free_path(path);
 	btrfs_end_transaction(trans, root);
 	return ret;
@@ -435,8 +435,8 @@ static noinline void compress_file_range(struct inode *inode,
 	actual_end = min_t(u64, isize, end + 1);
 again:
 	will_compress = 0;
-	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
-	nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE);
+	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
+	nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_SIZE);
 
 	/*
 	 * we don't want to send crud past the end of i_size through
@@ -514,7 +514,7 @@ again:
 
 		if (!ret) {
 			unsigned long offset = total_compressed &
-				(PAGE_CACHE_SIZE - 1);
+				(PAGE_SIZE - 1);
 			struct page *page = pages[nr_pages_ret - 1];
 			char *kaddr;
 
@@ -524,7 +524,7 @@ again:
 			if (offset) {
 				kaddr = kmap_atomic(page);
 				memset(kaddr + offset, 0,
-				       PAGE_CACHE_SIZE - offset);
+				       PAGE_SIZE - offset);
 				kunmap_atomic(kaddr);
 			}
 			will_compress = 1;
@@ -580,7 +580,7 @@ cont:
 		 * one last check to make sure the compression is really a
 		 * win, compare the page count read with the blocks on disk
 		 */
-		total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
+		total_in = ALIGN(total_in, PAGE_SIZE);
 		if (total_compressed >= total_in) {
 			will_compress = 0;
 		} else {
@@ -594,7 +594,7 @@ cont:
 		 */
 		for (i = 0; i < nr_pages_ret; i++) {
 			WARN_ON(pages[i]->mapping);
-			page_cache_release(pages[i]);
+			put_page(pages[i]);
 		}
 		kfree(pages);
 		pages = NULL;
@@ -650,7 +650,7 @@ cleanup_and_bail_uncompressed:
 free_pages_out:
 	for (i = 0; i < nr_pages_ret; i++) {
 		WARN_ON(pages[i]->mapping);
-		page_cache_release(pages[i]);
+		put_page(pages[i]);
 	}
 	kfree(pages);
 }
@@ -664,7 +664,7 @@ static void free_async_extent_pages(struct async_extent *async_extent)
 
 	for (i = 0; i < async_extent->nr_pages; i++) {
 		WARN_ON(async_extent->pages[i]->mapping);
-		page_cache_release(async_extent->pages[i]);
+		put_page(async_extent->pages[i]);
 	}
 	kfree(async_extent->pages);
 	async_extent->nr_pages = 0;
@@ -966,7 +966,7 @@ static noinline int cow_file_range(struct inode *inode,
 				     PAGE_END_WRITEBACK);
 
 			*nr_written = *nr_written +
-			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
+			     (end - start + PAGE_SIZE) / PAGE_SIZE;
 			*page_started = 1;
 			goto out;
 		} else if (ret < 0) {
@@ -1106,8 +1106,8 @@ static noinline void async_cow_submit(struct btrfs_work *work)
 	async_cow = container_of(work, struct async_cow, work);
 
 	root = async_cow->root;
-	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
-		PAGE_CACHE_SHIFT;
+	nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
+		PAGE_SHIFT;
 
 	/*
 	 * atomic_sub_return implies a barrier for waitqueue_active
@@ -1164,8 +1164,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 				async_cow_start, async_cow_submit,
 				async_cow_free);
 
-		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
-			PAGE_CACHE_SHIFT;
+		nr_pages = (cur_end - start + PAGE_SIZE) >>
+			PAGE_SHIFT;
 		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
 
 		btrfs_queue_work(root->fs_info->delalloc_workers,
@@ -1960,7 +1960,7 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state)
 {
-	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
+	WARN_ON((end & (PAGE_SIZE - 1)) == 0);
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
 				   cached_state, GFP_NOFS);
 }
@@ -1993,7 +1993,7 @@ again:
 
 	inode = page->mapping->host;
 	page_start = page_offset(page);
-	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
+	page_end = page_offset(page) + PAGE_SIZE - 1;
 
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
 			 &cached_state);
@@ -2003,7 +2003,7 @@ again:
 		goto out;
 
 	ordered = btrfs_lookup_ordered_range(inode, page_start,
-					PAGE_CACHE_SIZE);
+					PAGE_SIZE);
 	if (ordered) {
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
 				     page_end, &cached_state, GFP_NOFS);
@@ -2014,7 +2014,7 @@ again:
 	}
 
 	ret = btrfs_delalloc_reserve_space(inode, page_start,
-					   PAGE_CACHE_SIZE);
+					   PAGE_SIZE);
 	if (ret) {
 		mapping_set_error(page->mapping, ret);
 		end_extent_writepage(page, ret, page_start, page_end);
@@ -2030,7 +2030,7 @@ out:
 			     &cached_state, GFP_NOFS);
 out_page:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	kfree(fixup);
 }
 
@@ -2063,7 +2063,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 		return -EAGAIN;
 
 	SetPageChecked(page);
-	page_cache_get(page);
+	get_page(page);
 	btrfs_init_work(&fixup->work, btrfs_fixup_helper,
 			btrfs_writepage_fixup_worker, NULL, NULL);
 	fixup->page = page;
@@ -4247,7 +4247,7 @@ static int truncate_inline_extent(struct inode *inode,
 
 	if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
 		loff_t offset = new_size;
-		loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
+		loff_t page_end = ALIGN(offset, PAGE_SIZE);
 
 		/*
 		 * Zero out the remaining of the last page of our inline extent,
@@ -4633,7 +4633,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
 	struct extent_state *cached_state = NULL;
 	char *kaddr;
 	u32 blocksize = root->sectorsize;
-	pgoff_t index = from >> PAGE_CACHE_SHIFT;
+	pgoff_t index = from >> PAGE_SHIFT;
 	unsigned offset = from & (blocksize - 1);
 	struct page *page;
 	gfp_t mask = btrfs_alloc_write_mask(mapping);
@@ -4668,7 +4668,7 @@ again:
 		lock_page(page);
 		if (page->mapping != mapping) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			goto again;
 		}
 		if (!PageUptodate(page)) {
@@ -4686,7 +4686,7 @@ again:
 		unlock_extent_cached(io_tree, block_start, block_end,
 				     &cached_state, GFP_NOFS);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		btrfs_put_ordered_extent(ordered);
 		goto again;
@@ -4728,7 +4728,7 @@ out_unlock:
 		btrfs_delalloc_release_space(inode, block_start,
 					     blocksize);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 out:
 	return ret;
 }
@@ -6717,7 +6717,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
 
 	read_extent_buffer(leaf, tmp, ptr, inline_size);
 
-	max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
+	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
 	ret = btrfs_decompress(compress_type, tmp, page,
 			       extent_offset, inline_size, max_size);
 	kfree(tmp);
@@ -6879,8 +6879,8 @@ next:
 
 		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
 		extent_offset = page_offset(page) + pg_offset - extent_start;
-		copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
-				size - extent_offset);
+		copy_size = min_t(u64, PAGE_SIZE - pg_offset,
+				  size - extent_offset);
 		em->start = extent_start + extent_offset;
 		em->len = ALIGN(copy_size, root->sectorsize);
 		em->orig_block_len = em->len;
@@ -6899,9 +6899,9 @@ next:
 				map = kmap(page);
 				read_extent_buffer(leaf, map + pg_offset, ptr,
 						   copy_size);
-				if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+				if (pg_offset + copy_size < PAGE_SIZE) {
 					memset(map + pg_offset + copy_size, 0,
-					       PAGE_CACHE_SIZE - pg_offset -
+					       PAGE_SIZE - pg_offset -
 					       copy_size);
 				}
 				kunmap(page);
@@ -7336,12 +7336,12 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
 	int start_idx;
 	int end_idx;
 
-	start_idx = start >> PAGE_CACHE_SHIFT;
+	start_idx = start >> PAGE_SHIFT;
 
 	/*
 	 * end is the last byte in the last page.  end == start is legal
 	 */
-	end_idx = end >> PAGE_CACHE_SHIFT;
+	end_idx = end >> PAGE_SHIFT;
 
 	rcu_read_lock();
 
@@ -7382,7 +7382,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
 		 * include/linux/pagemap.h for details.
 		 */
 		if (unlikely(page != *pagep)) {
-			page_cache_release(page);
+			put_page(page);
 			page = NULL;
 		}
 	}
@@ -7390,7 +7390,7 @@ bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
 	if (page) {
 		if (page->index <= end_idx)
 			found = true;
-		page_cache_release(page);
+		put_page(page);
 	}
 
 	rcu_read_unlock();
@@ -8719,7 +8719,7 @@ static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 	if (ret == 1) {
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
-		page_cache_release(page);
+		put_page(page);
 	}
 	return ret;
 }
@@ -8739,7 +8739,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 	struct btrfs_ordered_extent *ordered;
 	struct extent_state *cached_state = NULL;
 	u64 page_start = page_offset(page);
-	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+	u64 page_end = page_start + PAGE_SIZE - 1;
 	u64 start;
 	u64 end;
 	int inode_evicting = inode->i_state & I_FREEING;
@@ -8822,7 +8822,7 @@ again:
 	 * 2) Not written to disk
 	 *    This means the reserved space should be freed here.
 	 */
-	btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
+	btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
 	if (!inode_evicting) {
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8837,7 +8837,7 @@ again:
 	if (PagePrivate(page)) {
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
-		page_cache_release(page);
+		put_page(page);
 	}
 }
 
@@ -8874,11 +8874,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	u64 page_end;
 	u64 end;
 
-	reserved_space = PAGE_CACHE_SIZE;
+	reserved_space = PAGE_SIZE;
 
 	sb_start_pagefault(inode->i_sb);
 	page_start = page_offset(page);
-	page_end = page_start + PAGE_CACHE_SIZE - 1;
+	page_end = page_start + PAGE_SIZE - 1;
 	end = page_end;
 
 	/*
@@ -8934,15 +8934,15 @@ again:
 		goto again;
 	}
 
-	if (page->index == ((size - 1) >> PAGE_CACHE_SHIFT)) {
+	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
 		reserved_space = round_up(size - page_start, root->sectorsize);
-		if (reserved_space < PAGE_CACHE_SIZE) {
+		if (reserved_space < PAGE_SIZE) {
 			end = page_start + reserved_space - 1;
 			spin_lock(&BTRFS_I(inode)->lock);
 			BTRFS_I(inode)->outstanding_extents++;
 			spin_unlock(&BTRFS_I(inode)->lock);
 			btrfs_delalloc_release_space(inode, page_start,
-						PAGE_CACHE_SIZE - reserved_space);
+						PAGE_SIZE - reserved_space);
 		}
 	}
 
@@ -8969,14 +8969,14 @@ again:
 	ret = 0;
 
 	/* page is wholly or partially inside EOF */
-	if (page_start + PAGE_CACHE_SIZE > size)
-		zero_start = size & ~PAGE_CACHE_MASK;
+	if (page_start + PAGE_SIZE > size)
+		zero_start = size & ~PAGE_MASK;
 	else
-		zero_start = PAGE_CACHE_SIZE;
+		zero_start = PAGE_SIZE;
 
-	if (zero_start != PAGE_CACHE_SIZE) {
+	if (zero_start != PAGE_SIZE) {
 		kaddr = kmap(page);
-		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+		memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
 		flush_dcache_page(page);
 		kunmap(page);
 	}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 053e677839fe..94a0c8a3e871 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -898,7 +898,7 @@ static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
 	u64 end;
 
 	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
+	em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
 	read_unlock(&em_tree->lock);
 
 	if (em) {
@@ -988,7 +988,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em;
-	u64 len = PAGE_CACHE_SIZE;
+	u64 len = PAGE_SIZE;
 
 	/*
 	 * hopefully we have this extent in the tree already, try without
@@ -1124,15 +1124,15 @@ static int cluster_pages_for_defrag(struct inode *inode,
 	struct extent_io_tree *tree;
 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 
-	file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+	file_end = (isize - 1) >> PAGE_SHIFT;
 	if (!isize || start_index > file_end)
 		return 0;
 
 	page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
 
 	ret = btrfs_delalloc_reserve_space(inode,
-			start_index << PAGE_CACHE_SHIFT,
-			page_cnt << PAGE_CACHE_SHIFT);
+			start_index << PAGE_SHIFT,
+			page_cnt << PAGE_SHIFT);
 	if (ret)
 		return ret;
 	i_done = 0;
@@ -1148,7 +1148,7 @@ again:
 			break;
 
 		page_start = page_offset(page);
-		page_end = page_start + PAGE_CACHE_SIZE - 1;
+		page_end = page_start + PAGE_SIZE - 1;
 		while (1) {
 			lock_extent_bits(tree, page_start, page_end,
 					 &cached_state);
@@ -1169,7 +1169,7 @@ again:
 			 */
 			if (page->mapping != inode->i_mapping) {
 				unlock_page(page);
-				page_cache_release(page);
+				put_page(page);
 				goto again;
 			}
 		}
@@ -1179,7 +1179,7 @@ again:
 			lock_page(page);
 			if (!PageUptodate(page)) {
 				unlock_page(page);
-				page_cache_release(page);
+				put_page(page);
 				ret = -EIO;
 				break;
 			}
@@ -1187,7 +1187,7 @@ again:
 
 		if (page->mapping != inode->i_mapping) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			goto again;
 		}
 
@@ -1208,7 +1208,7 @@ again:
 		wait_on_page_writeback(pages[i]);
 
 	page_start = page_offset(pages[0]);
-	page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
+	page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
 
 	lock_extent_bits(&BTRFS_I(inode)->io_tree,
 			 page_start, page_end - 1, &cached_state);
@@ -1222,8 +1222,8 @@ again:
 		BTRFS_I(inode)->outstanding_extents++;
 		spin_unlock(&BTRFS_I(inode)->lock);
 		btrfs_delalloc_release_space(inode,
-				start_index << PAGE_CACHE_SHIFT,
-				(page_cnt - i_done) << PAGE_CACHE_SHIFT);
+				start_index << PAGE_SHIFT,
+				(page_cnt - i_done) << PAGE_SHIFT);
 	}
 
 
@@ -1240,17 +1240,17 @@ again:
 		set_page_extent_mapped(pages[i]);
 		set_page_dirty(pages[i]);
 		unlock_page(pages[i]);
-		page_cache_release(pages[i]);
+		put_page(pages[i]);
 	}
 	return i_done;
 out:
 	for (i = 0; i < i_done; i++) {
 		unlock_page(pages[i]);
-		page_cache_release(pages[i]);
+		put_page(pages[i]);
 	}
 	btrfs_delalloc_release_space(inode,
-			start_index << PAGE_CACHE_SHIFT,
-			page_cnt << PAGE_CACHE_SHIFT);
+			start_index << PAGE_SHIFT,
+			page_cnt << PAGE_SHIFT);
 	return ret;
 
 }
@@ -1273,7 +1273,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	int defrag_count = 0;
 	int compress_type = BTRFS_COMPRESS_ZLIB;
 	u32 extent_thresh = range->extent_thresh;
-	unsigned long max_cluster = SZ_256K >> PAGE_CACHE_SHIFT;
+	unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
 	unsigned long cluster = max_cluster;
 	u64 new_align = ~((u64)SZ_128K - 1);
 	struct page **pages = NULL;
@@ -1317,9 +1317,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	/* find the last page to defrag */
 	if (range->start + range->len > range->start) {
 		last_index = min_t(u64, isize - 1,
-			 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
+			 range->start + range->len - 1) >> PAGE_SHIFT;
 	} else {
-		last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+		last_index = (isize - 1) >> PAGE_SHIFT;
 	}
 
 	if (newer_than) {
@@ -1331,11 +1331,11 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			 * we always align our defrag to help keep
 			 * the extents in the file evenly spaced
 			 */
-			i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+			i = (newer_off & new_align) >> PAGE_SHIFT;
 		} else
 			goto out_ra;
 	} else {
-		i = range->start >> PAGE_CACHE_SHIFT;
+		i = range->start >> PAGE_SHIFT;
 	}
 	if (!max_to_defrag)
 		max_to_defrag = last_index - i + 1;
@@ -1348,7 +1348,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		inode->i_mapping->writeback_index = i;
 
 	while (i <= last_index && defrag_count < max_to_defrag &&
-	       (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) {
+	       (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
 		/*
 		 * make sure we stop running if someone unmounts
 		 * the FS
@@ -1362,7 +1362,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			break;
 		}
 
-		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+		if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
 					 extent_thresh, &last_len, &skip,
 					 &defrag_end, range->flags &
 					 BTRFS_DEFRAG_RANGE_COMPRESS)) {
@@ -1371,14 +1371,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			 * the should_defrag function tells us how much to skip
 			 * bump our counter by the suggested amount
 			 */
-			next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE);
+			next = DIV_ROUND_UP(skip, PAGE_SIZE);
 			i = max(i + 1, next);
 			continue;
 		}
 
 		if (!newer_than) {
-			cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
-				   PAGE_CACHE_SHIFT) - i;
+			cluster = (PAGE_ALIGN(defrag_end) >>
+				   PAGE_SHIFT) - i;
 			cluster = min(cluster, max_cluster);
 		} else {
 			cluster = max_cluster;
@@ -1412,20 +1412,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 				i += ret;
 
 			newer_off = max(newer_off + 1,
-					(u64)i << PAGE_CACHE_SHIFT);
+					(u64)i << PAGE_SHIFT);
 
 			ret = find_new_extents(root, inode, newer_than,
 					       &newer_off, SZ_64K);
 			if (!ret) {
 				range->start = newer_off;
-				i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
+				i = (newer_off & new_align) >> PAGE_SHIFT;
 			} else {
 				break;
 			}
 		} else {
 			if (ret > 0) {
 				i += ret;
-				last_len += ret << PAGE_CACHE_SHIFT;
+				last_len += ret << PAGE_SHIFT;
 			} else {
 				i++;
 				last_len = 0;
@@ -1722,7 +1722,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
 	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
 		readonly = true;
 	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
-		if (vol_args->size > PAGE_CACHE_SIZE) {
+		if (vol_args->size > PAGE_SIZE) {
 			ret = -EINVAL;
 			goto free_args;
 		}
@@ -2806,12 +2806,12 @@ static struct page *extent_same_get_page(struct inode *inode, pgoff_t index)
 		lock_page(page);
 		if (!PageUptodate(page)) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			return ERR_PTR(-EIO);
 		}
 		if (page->mapping != inode->i_mapping) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			return ERR_PTR(-EAGAIN);
 		}
 	}
@@ -2823,7 +2823,7 @@ static int gather_extent_pages(struct inode *inode, struct page **pages,
 			       int num_pages, u64 off)
 {
 	int i;
-	pgoff_t index = off >> PAGE_CACHE_SHIFT;
+	pgoff_t index = off >> PAGE_SHIFT;
 
 	for (i = 0; i < num_pages; i++) {
 again:
@@ -2932,12 +2932,12 @@ static void btrfs_cmp_data_free(struct cmp_pages *cmp)
 		pg = cmp->src_pages[i];
 		if (pg) {
 			unlock_page(pg);
-			page_cache_release(pg);
+			put_page(pg);
 		}
 		pg = cmp->dst_pages[i];
 		if (pg) {
 			unlock_page(pg);
-			page_cache_release(pg);
+			put_page(pg);
 		}
 	}
 	kfree(cmp->src_pages);
@@ -2949,7 +2949,7 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
 				  u64 len, struct cmp_pages *cmp)
 {
 	int ret;
-	int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
+	int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
 	struct page **src_pgarr, **dst_pgarr;
 
 	/*
@@ -2987,12 +2987,12 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
 	int ret = 0;
 	int i;
 	struct page *src_page, *dst_page;
-	unsigned int cmp_len = PAGE_CACHE_SIZE;
+	unsigned int cmp_len = PAGE_SIZE;
 	void *addr, *dst_addr;
 
 	i = 0;
 	while (len) {
-		if (len < PAGE_CACHE_SIZE)
+		if (len < PAGE_SIZE)
 			cmp_len = len;
 
 		BUG_ON(i >= cmp->num_pages);
@@ -3191,7 +3191,7 @@ ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
 	if (olen > BTRFS_MAX_DEDUPE_LEN)
 		olen = BTRFS_MAX_DEDUPE_LEN;
 
-	if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
+	if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
 		/*
 		 * Btrfs does not support blocksize < page_size. As a
 		 * result, btrfs_cmp_data() won't correctly handle
@@ -3891,8 +3891,8 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
 	 * data immediately and not the previous data.
 	 */
 	truncate_inode_pages_range(&inode->i_data,
-				round_down(destoff, PAGE_CACHE_SIZE),
-				round_up(destoff + len, PAGE_CACHE_SIZE) - 1);
+				round_down(destoff, PAGE_SIZE),
+				round_up(destoff + len, PAGE_SIZE) - 1);
 out_unlock:
 	if (!same_inode)
 		btrfs_double_inode_unlock(src, inode);
@@ -4124,7 +4124,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 	/* we generally have at most 6 or so space infos, one for each raid
 	 * level.  So, a whole page should be more than enough for everyone
 	 */
-	if (alloc_size > PAGE_CACHE_SIZE)
+	if (alloc_size > PAGE_SIZE)
 		return -ENOMEM;
 
 	space_args.total_spaces = 0;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index a2f051347731..1adfbe7be6b8 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -55,8 +55,8 @@ static struct list_head *lzo_alloc_workspace(void)
 		return ERR_PTR(-ENOMEM);
 
 	workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
-	workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
-	workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
+	workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_SIZE));
+	workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_SIZE));
 	if (!workspace->mem || !workspace->buf || !workspace->cbuf)
 		goto fail;
 
@@ -116,7 +116,7 @@ static int lzo_compress_pages(struct list_head *ws,
 	*total_out = 0;
 	*total_in = 0;
 
-	in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+	in_page = find_get_page(mapping, start >> PAGE_SHIFT);
 	data_in = kmap(in_page);
 
 	/*
@@ -133,10 +133,10 @@ static int lzo_compress_pages(struct list_head *ws,
 	tot_out = LZO_LEN;
 	pages[0] = out_page;
 	nr_pages = 1;
-	pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+	pg_bytes_left = PAGE_SIZE - LZO_LEN;
 
 	/* compress at most one page of data each time */
-	in_len = min(len, PAGE_CACHE_SIZE);
+	in_len = min(len, PAGE_SIZE);
 	while (tot_in < len) {
 		ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
 				       &out_len, workspace->mem);
@@ -201,7 +201,7 @@ static int lzo_compress_pages(struct list_head *ws,
 				cpage_out = kmap(out_page);
 				pages[nr_pages++] = out_page;
 
-				pg_bytes_left = PAGE_CACHE_SIZE;
+				pg_bytes_left = PAGE_SIZE;
 				out_offset = 0;
 			}
 		}
@@ -221,12 +221,12 @@ static int lzo_compress_pages(struct list_head *ws,
 
 		bytes_left = len - tot_in;
 		kunmap(in_page);
-		page_cache_release(in_page);
+		put_page(in_page);
 
-		start += PAGE_CACHE_SIZE;
-		in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+		start += PAGE_SIZE;
+		in_page = find_get_page(mapping, start >> PAGE_SHIFT);
 		data_in = kmap(in_page);
-		in_len = min(bytes_left, PAGE_CACHE_SIZE);
+		in_len = min(bytes_left, PAGE_SIZE);
 	}
 
 	if (tot_out > tot_in)
@@ -248,7 +248,7 @@ out:
 
 	if (in_page) {
 		kunmap(in_page);
-		page_cache_release(in_page);
+		put_page(in_page);
 	}
 
 	return ret;
@@ -266,7 +266,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
 	char *data_in;
 	unsigned long page_in_index = 0;
 	unsigned long page_out_index = 0;
-	unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
+	unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
 	unsigned long buf_start;
 	unsigned long buf_offset = 0;
 	unsigned long bytes;
@@ -289,7 +289,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
 	tot_in = LZO_LEN;
 	in_offset = LZO_LEN;
 	tot_len = min_t(size_t, srclen, tot_len);
-	in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
+	in_page_bytes_left = PAGE_SIZE - LZO_LEN;
 
 	tot_out = 0;
 	pg_offset = 0;
@@ -345,12 +345,12 @@ cont:
 
 				data_in = kmap(pages_in[++page_in_index]);
 
-				in_page_bytes_left = PAGE_CACHE_SIZE;
+				in_page_bytes_left = PAGE_SIZE;
 				in_offset = 0;
 			}
 		}
 
-		out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
+		out_len = lzo1x_worst_compress(PAGE_SIZE);
 		ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
 					    &out_len);
 		if (need_unmap)
@@ -399,7 +399,7 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
 	in_len = read_compress_length(data_in);
 	data_in += LZO_LEN;
 
-	out_len = PAGE_CACHE_SIZE;
+	out_len = PAGE_SIZE;
 	ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
 	if (ret != LZO_E_OK) {
 		printk(KERN_WARNING "BTRFS: decompress failed!\n");
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 55161369fab1..0b7792e02dd5 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -270,7 +270,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
 		s = kmap(rbio->bio_pages[i]);
 		d = kmap(rbio->stripe_pages[i]);
 
-		memcpy(d, s, PAGE_CACHE_SIZE);
+		memcpy(d, s, PAGE_SIZE);
 
 		kunmap(rbio->bio_pages[i]);
 		kunmap(rbio->stripe_pages[i]);
@@ -962,7 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
  */
 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
 {
-	return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes;
+	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
 }
 
 /*
@@ -1078,7 +1078,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
 	u64 disk_start;
 
 	stripe = &rbio->bbio->stripes[stripe_nr];
-	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
+	disk_start = stripe->physical + (page_index << PAGE_SHIFT);
 
 	/* if the device is missing, just fail this stripe */
 	if (!stripe->dev->bdev)
@@ -1096,8 +1096,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
 		if (last_end == disk_start && stripe->dev->bdev &&
 		    !last->bi_error &&
 		    last->bi_bdev == stripe->dev->bdev) {
-			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
-			if (ret == PAGE_CACHE_SIZE)
+			ret = bio_add_page(last, page, PAGE_SIZE, 0);
+			if (ret == PAGE_SIZE)
 				return 0;
 		}
 	}
@@ -1111,7 +1111,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
 	bio->bi_bdev = stripe->dev->bdev;
 	bio->bi_iter.bi_sector = disk_start >> 9;
 
-	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+	bio_add_page(bio, page, PAGE_SIZE, 0);
 	bio_list_add(bio_list, bio);
 	return 0;
 }
@@ -1154,7 +1154,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
 	bio_list_for_each(bio, &rbio->bio_list) {
 		start = (u64)bio->bi_iter.bi_sector << 9;
 		stripe_offset = start - rbio->bbio->raid_map[0];
-		page_index = stripe_offset >> PAGE_CACHE_SHIFT;
+		page_index = stripe_offset >> PAGE_SHIFT;
 
 		for (i = 0; i < bio->bi_vcnt; i++) {
 			p = bio->bi_io_vec[i].bv_page;
@@ -1253,7 +1253,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 		} else {
 			/* raid5 */
 			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
-			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
 		}
 
 
@@ -1914,7 +1914,7 @@ pstripe:
 			/* Copy parity block into failed block to start with */
 			memcpy(pointers[faila],
 			       pointers[rbio->nr_data],
-			       PAGE_CACHE_SIZE);
+			       PAGE_SIZE);
 
 			/* rearrange the pointer array */
 			p = pointers[faila];
@@ -1923,7 +1923,7 @@ pstripe:
 			pointers[rbio->nr_data - 1] = p;
 
 			/* xor in the rest */
-			run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
+			run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
 		}
 		/* if we're doing this rebuild as part of an rmw, go through
 		 * and set all of our private rbio pages in the
@@ -2250,7 +2250,7 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
 	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
 				rbio->stripe_len * rbio->nr_data);
 	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
-	index = stripe_offset >> PAGE_CACHE_SHIFT;
+	index = stripe_offset >> PAGE_SHIFT;
 	rbio->bio_pages[index] = page;
 }
 
@@ -2365,14 +2365,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 		} else {
 			/* raid5 */
 			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
-			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
+			run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
 		}
 
 		/* Check scrubbing pairty and repair it */
 		p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
 		parity = kmap(p);
-		if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
-			memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
+		if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
+			memcpy(parity, pointers[rbio->scrubp], PAGE_SIZE);
 		else
 			/* Parity is right, needn't writeback */
 			bitmap_clear(rbio->dbitmap, pagenr, 1);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index b892914968c1..298631eaee78 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -226,7 +226,7 @@ int btree_readahead_hook(struct btrfs_fs_info *fs_info,
 	/* find extent */
 	spin_lock(&fs_info->reada_lock);
 	re = radix_tree_lookup(&fs_info->reada_tree,
-			       start >> PAGE_CACHE_SHIFT);
+			       start >> PAGE_SHIFT);
 	if (re)
 		re->refcnt++;
 	spin_unlock(&fs_info->reada_lock);
@@ -257,7 +257,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
 	zone = NULL;
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
-				     logical >> PAGE_CACHE_SHIFT, 1);
+				     logical >> PAGE_SHIFT, 1);
 	if (ret == 1 && logical >= zone->start && logical <= zone->end) {
 		kref_get(&zone->refcnt);
 		spin_unlock(&fs_info->reada_lock);
@@ -294,13 +294,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
 
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_insert(&dev->reada_zones,
-				(unsigned long)(zone->end >> PAGE_CACHE_SHIFT),
+				(unsigned long)(zone->end >> PAGE_SHIFT),
 				zone);
 
 	if (ret == -EEXIST) {
 		kfree(zone);
 		ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
-					     logical >> PAGE_CACHE_SHIFT, 1);
+					     logical >> PAGE_SHIFT, 1);
 		if (ret == 1 && logical >= zone->start && logical <= zone->end)
 			kref_get(&zone->refcnt);
 		else
@@ -326,7 +326,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 	u64 length;
 	int real_stripes;
 	int nzones = 0;
-	unsigned long index = logical >> PAGE_CACHE_SHIFT;
+	unsigned long index = logical >> PAGE_SHIFT;
 	int dev_replace_is_ongoing;
 	int have_zone = 0;
 
@@ -495,7 +495,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
 			     struct reada_extent *re)
 {
 	int i;
-	unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
+	unsigned long index = re->logical >> PAGE_SHIFT;
 
 	spin_lock(&fs_info->reada_lock);
 	if (--re->refcnt) {
@@ -538,7 +538,7 @@ static void reada_zone_release(struct kref *kref)
 	struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
 
 	radix_tree_delete(&zone->device->reada_zones,
-			  zone->end >> PAGE_CACHE_SHIFT);
+			  zone->end >> PAGE_SHIFT);
 
 	kfree(zone);
 }
@@ -587,7 +587,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
 static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
 {
 	int i;
-	unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
+	unsigned long index = zone->end >> PAGE_SHIFT;
 
 	for (i = 0; i < zone->ndevs; ++i) {
 		struct reada_zone *peer;
@@ -622,7 +622,7 @@ static int reada_pick_zone(struct btrfs_device *dev)
 					     (void **)&zone, index, 1);
 		if (ret == 0)
 			break;
-		index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+		index = (zone->end >> PAGE_SHIFT) + 1;
 		if (zone->locked) {
 			if (zone->elems > top_locked_elems) {
 				top_locked_elems = zone->elems;
@@ -673,7 +673,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
 	 * plugging to speed things up
 	 */
 	ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
-				     dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+				     dev->reada_next >> PAGE_SHIFT, 1);
 	if (ret == 0 || re->logical > dev->reada_curr_zone->end) {
 		ret = reada_pick_zone(dev);
 		if (!ret) {
@@ -682,7 +682,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
 		}
 		re = NULL;
 		ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
-					dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+					dev->reada_next >> PAGE_SHIFT, 1);
 	}
 	if (ret == 0) {
 		spin_unlock(&fs_info->reada_lock);
@@ -838,7 +838,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 				printk(KERN_CONT " curr off %llu",
 					device->reada_next - zone->start);
 			printk(KERN_CONT "\n");
-			index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+			index = (zone->end >> PAGE_SHIFT) + 1;
 		}
 		cnt = 0;
 		index = 0;
@@ -864,7 +864,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 				}
 			}
 			printk(KERN_CONT "\n");
-			index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+			index = (re->logical >> PAGE_SHIFT) + 1;
 			if (++cnt > 15)
 				break;
 		}
@@ -880,7 +880,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 		if (ret == 0)
 			break;
 		if (!re->scheduled) {
-			index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+			index = (re->logical >> PAGE_SHIFT) + 1;
 			continue;
 		}
 		printk(KERN_DEBUG
@@ -897,7 +897,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
 			}
 		}
 		printk(KERN_CONT "\n");
-		index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+		index = (re->logical >> PAGE_SHIFT) + 1;
 	}
 	spin_unlock(&fs_info->reada_lock);
 }
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 2bd0011450df..3c93968b539d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3129,10 +3129,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
 	if (ret)
 		goto out;
 
-	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
-	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+	index = (cluster->start - offset) >> PAGE_SHIFT;
+	last_index = (cluster->end - offset) >> PAGE_SHIFT;
 	while (index <= last_index) {
-		ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
+		ret = btrfs_delalloc_reserve_metadata(inode, PAGE_SIZE);
 		if (ret)
 			goto out;
 
@@ -3145,7 +3145,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
 						   mask);
 			if (!page) {
 				btrfs_delalloc_release_metadata(inode,
-							PAGE_CACHE_SIZE);
+							PAGE_SIZE);
 				ret = -ENOMEM;
 				goto out;
 			}
@@ -3162,16 +3162,16 @@ static int relocate_file_extent_cluster(struct inode *inode,
 			lock_page(page);
 			if (!PageUptodate(page)) {
 				unlock_page(page);
-				page_cache_release(page);
+				put_page(page);
 				btrfs_delalloc_release_metadata(inode,
-							PAGE_CACHE_SIZE);
+							PAGE_SIZE);
 				ret = -EIO;
 				goto out;
 			}
 		}
 
 		page_start = page_offset(page);
-		page_end = page_start + PAGE_CACHE_SIZE - 1;
+		page_end = page_start + PAGE_SIZE - 1;
 
 		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
 
@@ -3191,7 +3191,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
 		unlock_extent(&BTRFS_I(inode)->io_tree,
 			      page_start, page_end);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 
 		index++;
 		balance_dirty_pages_ratelimited(inode->i_mapping);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 39dbdcbf4d13..4678f03e878e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -703,7 +703,7 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
-	index = offset >> PAGE_CACHE_SHIFT;
+	index = offset >> PAGE_SHIFT;
 
 	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 	if (!page) {
@@ -1636,7 +1636,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
 	if (spage->io_error) {
 		void *mapped_buffer = kmap_atomic(spage->page);
 
-		memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
+		memset(mapped_buffer, 0, PAGE_SIZE);
 		flush_dcache_page(spage->page);
 		kunmap_atomic(mapped_buffer);
 	}
@@ -4294,8 +4294,8 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 		goto out;
 	}
 
-	while (len >= PAGE_CACHE_SIZE) {
-		index = offset >> PAGE_CACHE_SHIFT;
+	while (len >= PAGE_SIZE) {
+		index = offset >> PAGE_SHIFT;
 again:
 		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 		if (!page) {
@@ -4326,7 +4326,7 @@ again:
 			 */
 			if (page->mapping != inode->i_mapping) {
 				unlock_page(page);
-				page_cache_release(page);
+				put_page(page);
 				goto again;
 			}
 			if (!PageUptodate(page)) {
@@ -4348,15 +4348,15 @@ again:
 			ret = err;
 next_page:
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 
 		if (ret)
 			break;
 
-		offset += PAGE_CACHE_SIZE;
-		physical_for_dev_replace += PAGE_CACHE_SIZE;
-		nocow_ctx_logical += PAGE_CACHE_SIZE;
-		len -= PAGE_CACHE_SIZE;
+		offset += PAGE_SIZE;
+		physical_for_dev_replace += PAGE_SIZE;
+		nocow_ctx_logical += PAGE_SIZE;
+		len -= PAGE_SIZE;
 	}
 	ret = COPY_COMPLETE;
 out:
@@ -4390,8 +4390,8 @@ static int write_page_nocow(struct scrub_ctx *sctx,
 	bio->bi_iter.bi_size = 0;
 	bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
 	bio->bi_bdev = dev->bdev;
-	ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
-	if (ret != PAGE_CACHE_SIZE) {
+	ret = bio_add_page(bio, page, PAGE_SIZE, 0);
+	if (ret != PAGE_SIZE) {
 leave_with_eio:
 		bio_put(bio);
 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 19b7bf4284ee..8d358c547c59 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4449,9 +4449,9 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
 	struct page *page;
 	char *addr;
 	struct btrfs_key key;
-	pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+	pgoff_t index = offset >> PAGE_SHIFT;
 	pgoff_t last_index;
-	unsigned pg_offset = offset & ~PAGE_CACHE_MASK;
+	unsigned pg_offset = offset & ~PAGE_MASK;
 	ssize_t ret = 0;
 
 	key.objectid = sctx->cur_ino;
@@ -4471,7 +4471,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
 	if (len == 0)
 		goto out;
 
-	last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+	last_index = (offset + len - 1) >> PAGE_SHIFT;
 
 	/* initial readahead */
 	memset(&sctx->ra, 0, sizeof(struct file_ra_state));
@@ -4481,7 +4481,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
 
 	while (index <= last_index) {
 		unsigned cur_len = min_t(unsigned, len,
-					 PAGE_CACHE_SIZE - pg_offset);
+					 PAGE_SIZE - pg_offset);
 		page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
 		if (!page) {
 			ret = -ENOMEM;
@@ -4493,7 +4493,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
 			lock_page(page);
 			if (!PageUptodate(page)) {
 				unlock_page(page);
-				page_cache_release(page);
+				put_page(page);
 				ret = -EIO;
 				break;
 			}
@@ -4503,7 +4503,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
 		memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
 		kunmap(page);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		index++;
 		pg_offset = 0;
 		len -= cur_len;
@@ -4804,7 +4804,7 @@ static int clone_range(struct send_ctx *sctx,
 		type = btrfs_file_extent_type(leaf, ei);
 		if (type == BTRFS_FILE_EXTENT_INLINE) {
 			ext_len = btrfs_file_extent_inline_len(leaf, slot, ei);
-			ext_len = PAGE_CACHE_ALIGN(ext_len);
+			ext_len = PAGE_ALIGN(ext_len);
 		} else {
 			ext_len = btrfs_file_extent_num_bytes(leaf, ei);
 		}
@@ -4886,7 +4886,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
 		 * but there may be items after this page.  Make
 		 * sure to send the whole thing
 		 */
-		len = PAGE_CACHE_ALIGN(len);
+		len = PAGE_ALIGN(len);
 	} else {
 		len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
 	}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 669b58201e36..ac3a06d28531 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -32,8 +32,8 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
 {
 	int ret;
 	struct page *pages[16];
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long index = start >> PAGE_SHIFT;
+	unsigned long end_index = end >> PAGE_SHIFT;
 	unsigned long nr_pages = end_index - index + 1;
 	int i;
 	int count = 0;
@@ -49,9 +49,9 @@ static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
 				count++;
 			if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
 				unlock_page(pages[i]);
-			page_cache_release(pages[i]);
+			put_page(pages[i]);
 			if (flags & PROCESS_RELEASE)
-				page_cache_release(pages[i]);
+				put_page(pages[i]);
 		}
 		nr_pages -= ret;
 		index += ret;
@@ -93,7 +93,7 @@ static int test_find_delalloc(void)
 	 * everything to make sure our pages don't get evicted and screw up our
 	 * test.
 	 */
-	for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
+	for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
 		page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
 		if (!page) {
 			test_msg("Failed to allocate test page\n");
@@ -104,7 +104,7 @@ static int test_find_delalloc(void)
 		if (index) {
 			unlock_page(page);
 		} else {
-			page_cache_get(page);
+			get_page(page);
 			locked_page = page;
 		}
 	}
@@ -129,7 +129,7 @@ static int test_find_delalloc(void)
 	}
 	unlock_extent(&tmp, start, end);
 	unlock_page(locked_page);
-	page_cache_release(locked_page);
+	put_page(locked_page);
 
 	/*
 	 * Test this scenario
@@ -139,7 +139,7 @@ static int test_find_delalloc(void)
 	 */
 	test_start = SZ_64M;
 	locked_page = find_lock_page(inode->i_mapping,
-				     test_start >> PAGE_CACHE_SHIFT);
+				     test_start >> PAGE_SHIFT);
 	if (!locked_page) {
 		test_msg("Couldn't find the locked page\n");
 		goto out_bits;
@@ -165,7 +165,7 @@ static int test_find_delalloc(void)
 	}
 	unlock_extent(&tmp, start, end);
 	/* locked_page was unlocked above */
-	page_cache_release(locked_page);
+	put_page(locked_page);
 
 	/*
 	 * Test this scenario
@@ -174,7 +174,7 @@ static int test_find_delalloc(void)
 	 */
 	test_start = max_bytes + 4096;
 	locked_page = find_lock_page(inode->i_mapping, test_start >>
-				     PAGE_CACHE_SHIFT);
+				     PAGE_SHIFT);
 	if (!locked_page) {
 		test_msg("Could'nt find the locked page\n");
 		goto out_bits;
@@ -225,13 +225,13 @@ static int test_find_delalloc(void)
 	 * range we want to find.
 	 */
 	page = find_get_page(inode->i_mapping,
-			     (max_bytes + SZ_1M) >> PAGE_CACHE_SHIFT);
+			     (max_bytes + SZ_1M) >> PAGE_SHIFT);
 	if (!page) {
 		test_msg("Couldn't find our page\n");
 		goto out_bits;
 	}
 	ClearPageDirty(page);
-	page_cache_release(page);
+	put_page(page);
 
 	/* We unlocked it in the previous test */
 	lock_page(locked_page);
@@ -249,9 +249,9 @@ static int test_find_delalloc(void)
 		test_msg("Didn't find our range\n");
 		goto out_bits;
 	}
-	if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) {
+	if (start != test_start && end != test_start + PAGE_SIZE - 1) {
 		test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
-			 test_start, test_start + PAGE_CACHE_SIZE - 1, start,
+			 test_start, test_start + PAGE_SIZE - 1, start,
 			 end);
 		goto out_bits;
 	}
@@ -265,7 +265,7 @@ out_bits:
 	clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL);
 out:
 	if (locked_page)
-		page_cache_release(locked_page);
+		put_page(locked_page);
 	process_page_range(inode, 0, total_dirty - 1,
 			   PROCESS_UNLOCK | PROCESS_RELEASE);
 	iput(inode);
@@ -298,9 +298,9 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
 		return -EINVAL;
 	}
 
-	bitmap_set(bitmap, (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+	bitmap_set(bitmap, (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
 		   sizeof(long) * BITS_PER_BYTE);
-	extent_buffer_bitmap_set(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0,
+	extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0,
 				 sizeof(long) * BITS_PER_BYTE);
 	if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
 		test_msg("Setting straddling pages failed\n");
@@ -309,10 +309,10 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
 
 	bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
 	bitmap_clear(bitmap,
-		     (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+		     (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
 		     sizeof(long) * BITS_PER_BYTE);
 	extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
-	extent_buffer_bitmap_clear(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0,
+	extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0,
 				   sizeof(long) * BITS_PER_BYTE);
 	if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
 		test_msg("Clearing straddling pages failed\n");
@@ -353,7 +353,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
 
 static int test_eb_bitmaps(void)
 {
-	unsigned long len = PAGE_CACHE_SIZE * 4;
+	unsigned long len = PAGE_SIZE * 4;
 	unsigned long *bitmap;
 	struct extent_buffer *eb;
 	int ret;
@@ -379,7 +379,7 @@ static int test_eb_bitmaps(void)
 
 	/* Do it over again with an extent buffer which isn't page-aligned. */
 	free_extent_buffer(eb);
-	eb = __alloc_dummy_extent_buffer(NULL, PAGE_CACHE_SIZE / 2, len);
+	eb = __alloc_dummy_extent_buffer(NULL, PAGE_SIZE / 2, len);
 	if (!eb) {
 		test_msg("Couldn't allocate test extent buffer\n");
 		kfree(bitmap);
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index c9ad97b1e690..514247515312 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -22,7 +22,7 @@
 #include "../disk-io.h"
 #include "../free-space-cache.h"
 
-#define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
+#define BITS_PER_BITMAP		(PAGE_SIZE * 8)
 
 /*
  * This test just does basic sanity checking, making sure we can add an exten
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e2b54d546b7c..bd0f45fb38c4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1025,16 +1025,16 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	}
 
 	/* make sure our super fits in the device */
-	if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
+	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
 		goto error_bdev_put;
 
 	/* make sure our super fits in the page */
-	if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
+	if (sizeof(*disk_super) > PAGE_SIZE)
 		goto error_bdev_put;
 
 	/* make sure our super doesn't straddle pages on disk */
-	index = bytenr >> PAGE_CACHE_SHIFT;
-	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
+	index = bytenr >> PAGE_SHIFT;
+	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
 		goto error_bdev_put;
 
 	/* pull in the page with our super */
@@ -1047,7 +1047,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	p = kmap(page);
 
 	/* align our pointer to the offset of the super block */
-	disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
+	disk_super = p + (bytenr & ~PAGE_MASK);
 
 	if (btrfs_super_bytenr(disk_super) != bytenr ||
 	    btrfs_super_magic(disk_super) != BTRFS_MAGIC)
@@ -1075,7 +1075,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 
 error_unmap:
 	kunmap(page);
-	page_cache_release(page);
+	put_page(page);
 
 error_bdev_put:
 	blkdev_put(bdev, flags);
@@ -6527,7 +6527,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	 * but sb spans only this function. Add an explicit SetPageUptodate call
 	 * to silence the warning eg. on PowerPC 64.
 	 */
-	if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
+	if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
 		SetPageUptodate(sb->pages[0]);
 
 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 82990b8f872b..88d274e8ecf2 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -59,7 +59,7 @@ static struct list_head *zlib_alloc_workspace(void)
 	workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
 			zlib_inflate_workspacesize());
 	workspace->strm.workspace = vmalloc(workspacesize);
-	workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+	workspace->buf = kmalloc(PAGE_SIZE, GFP_NOFS);
 	if (!workspace->strm.workspace || !workspace->buf)
 		goto fail;
 
@@ -103,7 +103,7 @@ static int zlib_compress_pages(struct list_head *ws,
 	workspace->strm.total_in = 0;
 	workspace->strm.total_out = 0;
 
-	in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
+	in_page = find_get_page(mapping, start >> PAGE_SHIFT);
 	data_in = kmap(in_page);
 
 	out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
@@ -117,8 +117,8 @@ static int zlib_compress_pages(struct list_head *ws,
 
 	workspace->strm.next_in = data_in;
 	workspace->strm.next_out = cpage_out;
-	workspace->strm.avail_out = PAGE_CACHE_SIZE;
-	workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE);
+	workspace->strm.avail_out = PAGE_SIZE;
+	workspace->strm.avail_in = min(len, PAGE_SIZE);
 
 	while (workspace->strm.total_in < len) {
 		ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
@@ -156,7 +156,7 @@ static int zlib_compress_pages(struct list_head *ws,
 			cpage_out = kmap(out_page);
 			pages[nr_pages] = out_page;
 			nr_pages++;
-			workspace->strm.avail_out = PAGE_CACHE_SIZE;
+			workspace->strm.avail_out = PAGE_SIZE;
 			workspace->strm.next_out = cpage_out;
 		}
 		/* we're all done */
@@ -170,14 +170,14 @@ static int zlib_compress_pages(struct list_head *ws,
 
 			bytes_left = len - workspace->strm.total_in;
 			kunmap(in_page);
-			page_cache_release(in_page);
+			put_page(in_page);
 
-			start += PAGE_CACHE_SIZE;
+			start += PAGE_SIZE;
 			in_page = find_get_page(mapping,
-						start >> PAGE_CACHE_SHIFT);
+						start >> PAGE_SHIFT);
 			data_in = kmap(in_page);
 			workspace->strm.avail_in = min(bytes_left,
-							   PAGE_CACHE_SIZE);
+							   PAGE_SIZE);
 			workspace->strm.next_in = data_in;
 		}
 	}
@@ -205,7 +205,7 @@ out:
 
 	if (in_page) {
 		kunmap(in_page);
-		page_cache_release(in_page);
+		put_page(in_page);
 	}
 	return ret;
 }
@@ -223,18 +223,18 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
 	size_t total_out = 0;
 	unsigned long page_in_index = 0;
 	unsigned long page_out_index = 0;
-	unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
+	unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE);
 	unsigned long buf_start;
 	unsigned long pg_offset;
 
 	data_in = kmap(pages_in[page_in_index]);
 	workspace->strm.next_in = data_in;
-	workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
+	workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE);
 	workspace->strm.total_in = 0;
 
 	workspace->strm.total_out = 0;
 	workspace->strm.next_out = workspace->buf;
-	workspace->strm.avail_out = PAGE_CACHE_SIZE;
+	workspace->strm.avail_out = PAGE_SIZE;
 	pg_offset = 0;
 
 	/* If it's deflate, and it's got no preset dictionary, then
@@ -274,7 +274,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
 		}
 
 		workspace->strm.next_out = workspace->buf;
-		workspace->strm.avail_out = PAGE_CACHE_SIZE;
+		workspace->strm.avail_out = PAGE_SIZE;
 
 		if (workspace->strm.avail_in == 0) {
 			unsigned long tmp;
@@ -288,7 +288,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
 			workspace->strm.next_in = data_in;
 			tmp = srclen - workspace->strm.total_in;
 			workspace->strm.avail_in = min(tmp,
-							   PAGE_CACHE_SIZE);
+							   PAGE_SIZE);
 		}
 	}
 	if (ret != Z_STREAM_END)
@@ -325,7 +325,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
 	workspace->strm.total_in = 0;
 
 	workspace->strm.next_out = workspace->buf;
-	workspace->strm.avail_out = PAGE_CACHE_SIZE;
+	workspace->strm.avail_out = PAGE_SIZE;
 	workspace->strm.total_out = 0;
 	/* If it's deflate, and it's got no preset dictionary, then
 	   we can tell zlib to skip the adler32 check. */
@@ -368,8 +368,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
 		else
 			buf_offset = 0;
 
-		bytes = min(PAGE_CACHE_SIZE - pg_offset,
-			    PAGE_CACHE_SIZE - buf_offset);
+		bytes = min(PAGE_SIZE - pg_offset,
+			    PAGE_SIZE - buf_offset);
 		bytes = min(bytes, bytes_left);
 
 		kaddr = kmap_atomic(dest_page);
@@ -380,7 +380,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
 		bytes_left -= bytes;
 next:
 		workspace->strm.next_out = workspace->buf;
-		workspace->strm.avail_out = PAGE_CACHE_SIZE;
+		workspace->strm.avail_out = PAGE_SIZE;
 	}
 
 	if (ret != Z_STREAM_END && bytes_left != 0)
diff --git a/fs/buffer.c b/fs/buffer.c
index 33be29675358..af0d9a82a8ed 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -129,7 +129,7 @@ __clear_page_buffers(struct page *page)
 {
 	ClearPagePrivate(page);
 	set_page_private(page, 0);
-	page_cache_release(page);
+	put_page(page);
 }
 
 static void buffer_io_error(struct buffer_head *bh, char *msg)
@@ -207,7 +207,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	struct page *page;
 	int all_mapped = 1;
 
-	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
+	index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
 	page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
 	if (!page)
 		goto out;
@@ -245,7 +245,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	}
 out_unlock:
 	spin_unlock(&bd_mapping->private_lock);
-	page_cache_release(page);
+	put_page(page);
 out:
 	return ret;
 }
@@ -1040,7 +1040,7 @@ done:
 	ret = (block < end_block) ? 1 : -ENXIO;
 failed:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	return ret;
 }
 
@@ -1533,7 +1533,7 @@ void block_invalidatepage(struct page *page, unsigned int offset,
 	/*
 	 * Check for overflow
 	 */
-	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+	BUG_ON(stop > PAGE_SIZE || stop < length);
 
 	head = page_buffers(page);
 	bh = head;
@@ -1716,7 +1716,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	blocksize = bh->b_size;
 	bbits = block_size_bits(blocksize);
 
-	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	block = (sector_t)page->index << (PAGE_SHIFT - bbits);
 	last_block = (i_size_read(inode) - 1) >> bbits;
 
 	/*
@@ -1894,7 +1894,7 @@ EXPORT_SYMBOL(page_zero_new_buffers);
 int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 		get_block_t *get_block)
 {
-	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned from = pos & (PAGE_SIZE - 1);
 	unsigned to = from + len;
 	struct inode *inode = page->mapping->host;
 	unsigned block_start, block_end;
@@ -1904,15 +1904,15 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
 
 	BUG_ON(!PageLocked(page));
-	BUG_ON(from > PAGE_CACHE_SIZE);
-	BUG_ON(to > PAGE_CACHE_SIZE);
+	BUG_ON(from > PAGE_SIZE);
+	BUG_ON(to > PAGE_SIZE);
 	BUG_ON(from > to);
 
 	head = create_page_buffers(page, inode, 0);
 	blocksize = head->b_size;
 	bbits = block_size_bits(blocksize);
 
-	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	block = (sector_t)page->index << (PAGE_SHIFT - bbits);
 
 	for(bh = head, block_start = 0; bh != head || !block_start;
 	    block++, block_start=block_end, bh = bh->b_this_page) {
@@ -2020,7 +2020,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
 		unsigned flags, struct page **pagep, get_block_t *get_block)
 {
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t index = pos >> PAGE_SHIFT;
 	struct page *page;
 	int status;
 
@@ -2031,7 +2031,7 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
 	status = __block_write_begin(page, pos, len, get_block);
 	if (unlikely(status)) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		page = NULL;
 	}
 
@@ -2047,7 +2047,7 @@ int block_write_end(struct file *file, struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	unsigned start;
 
-	start = pos & (PAGE_CACHE_SIZE - 1);
+	start = pos & (PAGE_SIZE - 1);
 
 	if (unlikely(copied < len)) {
 		/*
@@ -2099,7 +2099,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 	}
 
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	if (old_size < pos)
 		pagecache_isize_extended(inode, old_size, pos);
@@ -2136,9 +2136,9 @@ int block_is_partially_uptodate(struct page *page, unsigned long from,
 
 	head = page_buffers(page);
 	blocksize = head->b_size;
-	to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
+	to = min_t(unsigned, PAGE_SIZE - from, count);
 	to = from + to;
-	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
+	if (from < blocksize && to > PAGE_SIZE - blocksize)
 		return 0;
 
 	bh = head;
@@ -2181,7 +2181,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 	blocksize = head->b_size;
 	bbits = block_size_bits(blocksize);
 
-	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	iblock = (sector_t)page->index << (PAGE_SHIFT - bbits);
 	lblock = (i_size_read(inode)+blocksize-1) >> bbits;
 	bh = head;
 	nr = 0;
@@ -2295,16 +2295,16 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
 	unsigned zerofrom, offset, len;
 	int err = 0;
 
-	index = pos >> PAGE_CACHE_SHIFT;
-	offset = pos & ~PAGE_CACHE_MASK;
+	index = pos >> PAGE_SHIFT;
+	offset = pos & ~PAGE_MASK;
 
-	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
-		zerofrom = curpos & ~PAGE_CACHE_MASK;
+	while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
+		zerofrom = curpos & ~PAGE_MASK;
 		if (zerofrom & (blocksize-1)) {
 			*bytes |= (blocksize-1);
 			(*bytes)++;
 		}
-		len = PAGE_CACHE_SIZE - zerofrom;
+		len = PAGE_SIZE - zerofrom;
 
 		err = pagecache_write_begin(file, mapping, curpos, len,
 						AOP_FLAG_UNINTERRUPTIBLE,
@@ -2329,7 +2329,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
 
 	/* page covers the boundary, find the boundary offset */
 	if (index == curidx) {
-		zerofrom = curpos & ~PAGE_CACHE_MASK;
+		zerofrom = curpos & ~PAGE_MASK;
 		/* if we will expand the thing last block will be filled */
 		if (offset <= zerofrom) {
 			goto out;
@@ -2375,7 +2375,7 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
 	if (err)
 		return err;
 
-	zerofrom = *bytes & ~PAGE_CACHE_MASK;
+	zerofrom = *bytes & ~PAGE_MASK;
 	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
 		*bytes |= (blocksize-1);
 		(*bytes)++;
@@ -2430,10 +2430,10 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 	}
 
 	/* page is wholly or partially inside EOF */
-	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
-		end = size & ~PAGE_CACHE_MASK;
+	if (((page->index + 1) << PAGE_SHIFT) > size)
+		end = size & ~PAGE_MASK;
 	else
-		end = PAGE_CACHE_SIZE;
+		end = PAGE_SIZE;
 
 	ret = __block_write_begin(page, 0, end, get_block);
 	if (!ret)
@@ -2508,8 +2508,8 @@ int nobh_write_begin(struct address_space *mapping,
 	int ret = 0;
 	int is_mapped_to_disk = 1;
 
-	index = pos >> PAGE_CACHE_SHIFT;
-	from = pos & (PAGE_CACHE_SIZE - 1);
+	index = pos >> PAGE_SHIFT;
+	from = pos & (PAGE_SIZE - 1);
 	to = from + len;
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
@@ -2543,7 +2543,7 @@ int nobh_write_begin(struct address_space *mapping,
 		goto out_release;
 	}
 
-	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+	block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
 
 	/*
 	 * We loop across all blocks in the page, whether or not they are
@@ -2551,7 +2551,7 @@ int nobh_write_begin(struct address_space *mapping,
 	 * page is fully mapped-to-disk.
 	 */
 	for (block_start = 0, block_in_page = 0, bh = head;
-		  block_start < PAGE_CACHE_SIZE;
+		  block_start < PAGE_SIZE;
 		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
 		int create;
 
@@ -2623,7 +2623,7 @@ failed:
 
 out_release:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	*pagep = NULL;
 
 	return ret;
@@ -2653,7 +2653,7 @@ int nobh_write_end(struct file *file, struct address_space *mapping,
 	}
 
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	while (head) {
 		bh = head;
@@ -2675,7 +2675,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
 {
 	struct inode * const inode = page->mapping->host;
 	loff_t i_size = i_size_read(inode);
-	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	const pgoff_t end_index = i_size >> PAGE_SHIFT;
 	unsigned offset;
 	int ret;
 
@@ -2684,7 +2684,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
 		goto out;
 
 	/* Is the page fully outside i_size? (truncate in progress) */
-	offset = i_size & (PAGE_CACHE_SIZE-1);
+	offset = i_size & (PAGE_SIZE-1);
 	if (page->index >= end_index+1 || !offset) {
 		/*
 		 * The page may have dirty, unmapped buffers.  For example,
@@ -2707,7 +2707,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block,
 	 * the  page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	zero_user_segment(page, offset, PAGE_SIZE);
 out:
 	ret = mpage_writepage(page, get_block, wbc);
 	if (ret == -EAGAIN)
@@ -2720,8 +2720,8 @@ EXPORT_SYMBOL(nobh_writepage);
 int nobh_truncate_page(struct address_space *mapping,
 			loff_t from, get_block_t *get_block)
 {
-	pgoff_t index = from >> PAGE_CACHE_SHIFT;
-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	pgoff_t index = from >> PAGE_SHIFT;
+	unsigned offset = from & (PAGE_SIZE-1);
 	unsigned blocksize;
 	sector_t iblock;
 	unsigned length, pos;
@@ -2738,7 +2738,7 @@ int nobh_truncate_page(struct address_space *mapping,
 		return 0;
 
 	length = blocksize - length;
-	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
 
 	page = grab_cache_page(mapping, index);
 	err = -ENOMEM;
@@ -2748,7 +2748,7 @@ int nobh_truncate_page(struct address_space *mapping,
 	if (page_has_buffers(page)) {
 has_buffers:
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		return block_truncate_page(mapping, from, get_block);
 	}
 
@@ -2772,7 +2772,7 @@ has_buffers:
 	if (!PageUptodate(page)) {
 		err = mapping->a_ops->readpage(NULL, page);
 		if (err) {
-			page_cache_release(page);
+			put_page(page);
 			goto out;
 		}
 		lock_page(page);
@@ -2789,7 +2789,7 @@ has_buffers:
 
 unlock:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 out:
 	return err;
 }
@@ -2798,8 +2798,8 @@ EXPORT_SYMBOL(nobh_truncate_page);
 int block_truncate_page(struct address_space *mapping,
 			loff_t from, get_block_t *get_block)
 {
-	pgoff_t index = from >> PAGE_CACHE_SHIFT;
-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	pgoff_t index = from >> PAGE_SHIFT;
+	unsigned offset = from & (PAGE_SIZE-1);
 	unsigned blocksize;
 	sector_t iblock;
 	unsigned length, pos;
@@ -2816,7 +2816,7 @@ int block_truncate_page(struct address_space *mapping,
 		return 0;
 
 	length = blocksize - length;
-	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
 	
 	page = grab_cache_page(mapping, index);
 	err = -ENOMEM;
@@ -2865,7 +2865,7 @@ int block_truncate_page(struct address_space *mapping,
 
 unlock:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 out:
 	return err;
 }
@@ -2879,7 +2879,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 {
 	struct inode * const inode = page->mapping->host;
 	loff_t i_size = i_size_read(inode);
-	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	const pgoff_t end_index = i_size >> PAGE_SHIFT;
 	unsigned offset;
 
 	/* Is the page fully inside i_size? */
@@ -2888,14 +2888,14 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 					       end_buffer_async_write);
 
 	/* Is the page fully outside i_size? (truncate in progress) */
-	offset = i_size & (PAGE_CACHE_SIZE-1);
+	offset = i_size & (PAGE_SIZE-1);
 	if (page->index >= end_index+1 || !offset) {
 		/*
 		 * The page may have dirty, unmapped buffers.  For example,
 		 * they may have been added in ext3_writepage().  Make them
 		 * freeable here, so the page does not leak.
 		 */
-		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+		do_invalidatepage(page, 0, PAGE_SIZE);
 		unlock_page(page);
 		return 0; /* don't care */
 	}
@@ -2907,7 +2907,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
 	 * the  page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	zero_user_segment(page, offset, PAGE_SIZE);
 	return __block_write_full_page(inode, page, get_block, wbc,
 							end_buffer_async_write);
 }
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index c0f3da3926a0..afbdc418966d 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -194,10 +194,10 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
 			error = -EIO;
 		}
 
-		page_cache_release(monitor->back_page);
+		put_page(monitor->back_page);
 
 		fscache_end_io(op, monitor->netfs_page, error);
-		page_cache_release(monitor->netfs_page);
+		put_page(monitor->netfs_page);
 		fscache_retrieval_complete(op, 1);
 		fscache_put_retrieval(op);
 		kfree(monitor);
@@ -288,8 +288,8 @@ monitor_backing_page:
 	_debug("- monitor add");
 
 	/* install the monitor */
-	page_cache_get(monitor->netfs_page);
-	page_cache_get(backpage);
+	get_page(monitor->netfs_page);
+	get_page(backpage);
 	monitor->back_page = backpage;
 	monitor->monitor.private = backpage;
 	add_page_wait_queue(backpage, &monitor->monitor);
@@ -310,7 +310,7 @@ backing_page_already_present:
 	_debug("- present");
 
 	if (newpage) {
-		page_cache_release(newpage);
+		put_page(newpage);
 		newpage = NULL;
 	}
 
@@ -342,7 +342,7 @@ success:
 
 out:
 	if (backpage)
-		page_cache_release(backpage);
+		put_page(backpage);
 	if (monitor) {
 		fscache_put_retrieval(monitor->op);
 		kfree(monitor);
@@ -363,7 +363,7 @@ io_error:
 	goto out;
 
 nomem_page:
-	page_cache_release(newpage);
+	put_page(newpage);
 nomem_monitor:
 	fscache_put_retrieval(monitor->op);
 	kfree(monitor);
@@ -530,7 +530,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 					    netpage->index, cachefiles_gfp);
 		if (ret < 0) {
 			if (ret == -EEXIST) {
-				page_cache_release(netpage);
+				put_page(netpage);
 				fscache_retrieval_complete(op, 1);
 				continue;
 			}
@@ -538,10 +538,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 		}
 
 		/* install a monitor */
-		page_cache_get(netpage);
+		get_page(netpage);
 		monitor->netfs_page = netpage;
 
-		page_cache_get(backpage);
+		get_page(backpage);
 		monitor->back_page = backpage;
 		monitor->monitor.private = backpage;
 		add_page_wait_queue(backpage, &monitor->monitor);
@@ -555,10 +555,10 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 			unlock_page(backpage);
 		}
 
-		page_cache_release(backpage);
+		put_page(backpage);
 		backpage = NULL;
 
-		page_cache_release(netpage);
+		put_page(netpage);
 		netpage = NULL;
 		continue;
 
@@ -603,7 +603,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 					    netpage->index, cachefiles_gfp);
 		if (ret < 0) {
 			if (ret == -EEXIST) {
-				page_cache_release(netpage);
+				put_page(netpage);
 				fscache_retrieval_complete(op, 1);
 				continue;
 			}
@@ -612,14 +612,14 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 
 		copy_highpage(netpage, backpage);
 
-		page_cache_release(backpage);
+		put_page(backpage);
 		backpage = NULL;
 
 		fscache_mark_page_cached(op, netpage);
 
 		/* the netpage is unlocked and marked up to date here */
 		fscache_end_io(op, netpage, 0);
-		page_cache_release(netpage);
+		put_page(netpage);
 		netpage = NULL;
 		fscache_retrieval_complete(op, 1);
 		continue;
@@ -632,11 +632,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 out:
 	/* tidy up */
 	if (newpage)
-		page_cache_release(newpage);
+		put_page(newpage);
 	if (netpage)
-		page_cache_release(netpage);
+		put_page(netpage);
 	if (backpage)
-		page_cache_release(backpage);
+		put_page(backpage);
 	if (monitor) {
 		fscache_put_retrieval(op);
 		kfree(monitor);
@@ -644,7 +644,7 @@ out:
 
 	list_for_each_entry_safe(netpage, _n, list, lru) {
 		list_del(&netpage->lru);
-		page_cache_release(netpage);
+		put_page(netpage);
 		fscache_retrieval_complete(op, 1);
 	}
 
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index fc5cae2a0db2..4801571f51cb 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -143,7 +143,7 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
 	inode = page->mapping->host;
 	ci = ceph_inode(inode);
 
-	if (offset != 0 || length != PAGE_CACHE_SIZE) {
+	if (offset != 0 || length != PAGE_SIZE) {
 		dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
 		     inode, page, page->index, offset, length);
 		return;
@@ -197,10 +197,10 @@ static int readpage_nounlock(struct file *filp, struct page *page)
 		&ceph_inode_to_client(inode)->client->osdc;
 	int err = 0;
 	u64 off = page_offset(page);
-	u64 len = PAGE_CACHE_SIZE;
+	u64 len = PAGE_SIZE;
 
 	if (off >= i_size_read(inode)) {
-		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		zero_user_segment(page, 0, PAGE_SIZE);
 		SetPageUptodate(page);
 		return 0;
 	}
@@ -212,7 +212,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
 		 */
 		if (off == 0)
 			return -EINVAL;
-		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		zero_user_segment(page, 0, PAGE_SIZE);
 		SetPageUptodate(page);
 		return 0;
 	}
@@ -234,9 +234,9 @@ static int readpage_nounlock(struct file *filp, struct page *page)
 		ceph_fscache_readpage_cancel(inode, page);
 		goto out;
 	}
-	if (err < PAGE_CACHE_SIZE)
+	if (err < PAGE_SIZE)
 		/* zero fill remainder of page */
-		zero_user_segment(page, err, PAGE_CACHE_SIZE);
+		zero_user_segment(page, err, PAGE_SIZE);
 	else
 		flush_dcache_page(page);
 
@@ -278,10 +278,10 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
 
 		if (rc < 0 && rc != -ENOENT)
 			goto unlock;
-		if (bytes < (int)PAGE_CACHE_SIZE) {
+		if (bytes < (int)PAGE_SIZE) {
 			/* zero (remainder of) page */
 			int s = bytes < 0 ? 0 : bytes;
-			zero_user_segment(page, s, PAGE_CACHE_SIZE);
+			zero_user_segment(page, s, PAGE_SIZE);
 		}
  		dout("finish_read %p uptodate %p idx %lu\n", inode, page,
 		     page->index);
@@ -290,8 +290,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
 		ceph_readpage_to_fscache(inode, page);
 unlock:
 		unlock_page(page);
-		page_cache_release(page);
-		bytes -= PAGE_CACHE_SIZE;
+		put_page(page);
+		bytes -= PAGE_SIZE;
 	}
 	kfree(osd_data->pages);
 }
@@ -336,7 +336,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 		if (max && nr_pages == max)
 			break;
 	}
-	len = nr_pages << PAGE_CACHE_SHIFT;
+	len = nr_pages << PAGE_SHIFT;
 	dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
 	     off, len);
 	vino = ceph_vino(inode);
@@ -364,7 +364,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 		if (add_to_page_cache_lru(page, &inode->i_data, page->index,
 					  GFP_KERNEL)) {
 			ceph_fscache_uncache_page(inode, page);
-			page_cache_release(page);
+			put_page(page);
 			dout("start_read %p add_to_page_cache failed %p\n",
 			     inode, page);
 			nr_pages = i;
@@ -415,8 +415,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 	if (rc == 0)
 		goto out;
 
-	if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
-		max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
+	if (fsc->mount_options->rsize >= PAGE_SIZE)
+		max = (fsc->mount_options->rsize + PAGE_SIZE - 1)
 			>> PAGE_SHIFT;
 
 	dout("readpages %p file %p nr_pages %d max %d\n", inode,
@@ -484,7 +484,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	long writeback_stat;
 	u64 truncate_size;
 	u32 truncate_seq;
-	int err = 0, len = PAGE_CACHE_SIZE;
+	int err = 0, len = PAGE_SIZE;
 
 	dout("writepage %p idx %lu\n", page, page->index);
 
@@ -725,9 +725,9 @@ static int ceph_writepages_start(struct address_space *mapping,
 	}
 	if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
 		wsize = fsc->mount_options->wsize;
-	if (wsize < PAGE_CACHE_SIZE)
-		wsize = PAGE_CACHE_SIZE;
-	max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
+	if (wsize < PAGE_SIZE)
+		wsize = PAGE_SIZE;
+	max_pages_ever = wsize >> PAGE_SHIFT;
 
 	pagevec_init(&pvec, 0);
 
@@ -737,8 +737,8 @@ static int ceph_writepages_start(struct address_space *mapping,
 		end = -1;
 		dout(" cyclic, start at %lu\n", start);
 	} else {
-		start = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		start = wbc->range_start >> PAGE_SHIFT;
+		end = wbc->range_end >> PAGE_SHIFT;
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = 1;
 		should_loop = 0;
@@ -887,7 +887,7 @@ get_more_pages:
 
 				num_ops = 1 + do_sync;
 				strip_unit_end = page->index +
-					((len - 1) >> PAGE_CACHE_SHIFT);
+					((len - 1) >> PAGE_SHIFT);
 
 				BUG_ON(pages);
 				max_pages = calc_pages_for(0, (u64)len);
@@ -901,7 +901,7 @@ get_more_pages:
 
 				len = 0;
 			} else if (page->index !=
-				   (offset + len) >> PAGE_CACHE_SHIFT) {
+				   (offset + len) >> PAGE_SHIFT) {
 				if (num_ops >= (pool ?  CEPH_OSD_SLAB_OPS :
 							CEPH_OSD_MAX_OPS)) {
 					redirty_page_for_writepage(wbc, page);
@@ -929,7 +929,7 @@ get_more_pages:
 
 			pages[locked_pages] = page;
 			locked_pages++;
-			len += PAGE_CACHE_SIZE;
+			len += PAGE_SIZE;
 		}
 
 		/* did we get anything? */
@@ -981,7 +981,7 @@ new_request:
 			BUG_ON(IS_ERR(req));
 		}
 		BUG_ON(len < page_offset(pages[locked_pages - 1]) +
-			     PAGE_CACHE_SIZE - offset);
+			     PAGE_SIZE - offset);
 
 		req->r_callback = writepages_finish;
 		req->r_inode = inode;
@@ -1011,7 +1011,7 @@ new_request:
 			}
 
 			set_page_writeback(pages[i]);
-			len += PAGE_CACHE_SIZE;
+			len += PAGE_SIZE;
 		}
 
 		if (snap_size != -1) {
@@ -1020,7 +1020,7 @@ new_request:
 			/* writepages_finish() clears writeback pages
 			 * according to the data length, so make sure
 			 * data length covers all locked pages */
-			u64 min_len = len + 1 - PAGE_CACHE_SIZE;
+			u64 min_len = len + 1 - PAGE_SIZE;
 			len = min(len, (u64)i_size_read(inode) - offset);
 			len = max(len, min_len);
 		}
@@ -1135,8 +1135,8 @@ static int ceph_update_writeable_page(struct file *file,
 {
 	struct inode *inode = file_inode(file);
 	struct ceph_inode_info *ci = ceph_inode(inode);
-	loff_t page_off = pos & PAGE_CACHE_MASK;
-	int pos_in_page = pos & ~PAGE_CACHE_MASK;
+	loff_t page_off = pos & PAGE_MASK;
+	int pos_in_page = pos & ~PAGE_MASK;
 	int end_in_page = pos_in_page + len;
 	loff_t i_size;
 	int r;
@@ -1191,7 +1191,7 @@ retry_locked:
 	}
 
 	/* full page? */
-	if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
+	if (pos_in_page == 0 && len == PAGE_SIZE)
 		return 0;
 
 	/* past end of file? */
@@ -1199,12 +1199,12 @@ retry_locked:
 
 	if (page_off >= i_size ||
 	    (pos_in_page == 0 && (pos+len) >= i_size &&
-	     end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
+	     end_in_page - pos_in_page != PAGE_SIZE)) {
 		dout(" zeroing %p 0 - %d and %d - %d\n",
-		     page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
+		     page, pos_in_page, end_in_page, (int)PAGE_SIZE);
 		zero_user_segments(page,
 				   0, pos_in_page,
-				   end_in_page, PAGE_CACHE_SIZE);
+				   end_in_page, PAGE_SIZE);
 		return 0;
 	}
 
@@ -1228,7 +1228,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 {
 	struct inode *inode = file_inode(file);
 	struct page *page;
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t index = pos >> PAGE_SHIFT;
 	int r;
 
 	do {
@@ -1242,7 +1242,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 
 		r = ceph_update_writeable_page(file, pos, len, page);
 		if (r < 0)
-			page_cache_release(page);
+			put_page(page);
 		else
 			*pagep = page;
 	} while (r == -EAGAIN);
@@ -1259,7 +1259,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 			  struct page *page, void *fsdata)
 {
 	struct inode *inode = file_inode(file);
-	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned from = pos & (PAGE_SIZE - 1);
 	int check_cap = 0;
 
 	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
@@ -1279,7 +1279,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 	set_page_dirty(page);
 
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	if (check_cap)
 		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
@@ -1322,11 +1322,11 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_file_info *fi = vma->vm_file->private_data;
 	struct page *pinned_page = NULL;
-	loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT;
+	loff_t off = vmf->pgoff << PAGE_SHIFT;
 	int want, got, ret;
 
 	dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
-	     inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE);
+	     inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE);
 	if (fi->fmode & CEPH_FILE_MODE_LAZY)
 		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
 	else
@@ -1343,7 +1343,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		}
 	}
 	dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
-	     inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));
+	     inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got));
 
 	if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
 	    ci->i_inline_version == CEPH_INLINE_NONE)
@@ -1352,16 +1352,16 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		ret = -EAGAIN;
 
 	dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
-	     inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
+	     inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got), ret);
 	if (pinned_page)
-		page_cache_release(pinned_page);
+		put_page(pinned_page);
 	ceph_put_cap_refs(ci, got);
 
 	if (ret != -EAGAIN)
 		return ret;
 
 	/* read inline data */
-	if (off >= PAGE_CACHE_SIZE) {
+	if (off >= PAGE_SIZE) {
 		/* does not support inline data > PAGE_SIZE */
 		ret = VM_FAULT_SIGBUS;
 	} else {
@@ -1378,12 +1378,12 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 					 CEPH_STAT_CAP_INLINE_DATA, true);
 		if (ret1 < 0 || off >= i_size_read(inode)) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			ret = VM_FAULT_SIGBUS;
 			goto out;
 		}
-		if (ret1 < PAGE_CACHE_SIZE)
-			zero_user_segment(page, ret1, PAGE_CACHE_SIZE);
+		if (ret1 < PAGE_SIZE)
+			zero_user_segment(page, ret1, PAGE_SIZE);
 		else
 			flush_dcache_page(page);
 		SetPageUptodate(page);
@@ -1392,7 +1392,7 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 out:
 	dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
-	     inode, off, (size_t)PAGE_CACHE_SIZE, ret);
+	     inode, off, (size_t)PAGE_SIZE, ret);
 	return ret;
 }
 
@@ -1430,10 +1430,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		}
 	}
 
-	if (off + PAGE_CACHE_SIZE <= size)
-		len = PAGE_CACHE_SIZE;
+	if (off + PAGE_SIZE <= size)
+		len = PAGE_SIZE;
 	else
-		len = size & ~PAGE_CACHE_MASK;
+		len = size & ~PAGE_MASK;
 
 	dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
 	     inode, ceph_vinop(inode), off, len, size);
@@ -1519,7 +1519,7 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
 			return;
 		if (PageUptodate(page)) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			return;
 		}
 	}
@@ -1534,14 +1534,14 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
 	}
 
 	if (page != locked_page) {
-		if (len < PAGE_CACHE_SIZE)
-			zero_user_segment(page, len, PAGE_CACHE_SIZE);
+		if (len < PAGE_SIZE)
+			zero_user_segment(page, len, PAGE_SIZE);
 		else
 			flush_dcache_page(page);
 
 		SetPageUptodate(page);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 }
 
@@ -1578,7 +1578,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 				from_pagecache = true;
 				lock_page(page);
 			} else {
-				page_cache_release(page);
+				put_page(page);
 				page = NULL;
 			}
 		}
@@ -1586,8 +1586,8 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
 
 	if (page) {
 		len = i_size_read(inode);
-		if (len > PAGE_CACHE_SIZE)
-			len = PAGE_CACHE_SIZE;
+		if (len > PAGE_SIZE)
+			len = PAGE_SIZE;
 	} else {
 		page = __page_cache_alloc(GFP_NOFS);
 		if (!page) {
@@ -1670,7 +1670,7 @@ out:
 	if (page && page != locked_page) {
 		if (from_pagecache) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 		} else
 			__free_pages(page, 0);
 	}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index de17bb232ff8..cfaeef18cbca 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2510,7 +2510,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 					*pinned_page = page;
 					break;
 				}
-				page_cache_release(page);
+				put_page(page);
 			}
 			/*
 			 * drop cap refs first because getattr while
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index fadc243dfb28..4fb2bbc2a272 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -129,7 +129,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 	struct inode *dir = d_inode(parent);
 	struct dentry *dentry, *last = NULL;
 	struct ceph_dentry_info *di;
-	unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *);
+	unsigned nsize = PAGE_SIZE / sizeof(struct dentry *);
 	int err = 0;
 	loff_t ptr_pos = 0;
 	struct ceph_readdir_cache_control cache_ctl = {};
@@ -154,7 +154,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 		}
 
 		err = -EAGAIN;
-		pgoff = ptr_pos >> PAGE_CACHE_SHIFT;
+		pgoff = ptr_pos >> PAGE_SHIFT;
 		if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
 			ceph_readdir_cache_release(&cache_ctl);
 			cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ef38f01c1795..a79f9269831e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -466,7 +466,7 @@ more:
 			ret += zlen;
 		}
 
-		didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
+		didpages = (page_align + ret) >> PAGE_SHIFT;
 		pos += ret;
 		read = pos - off;
 		left -= ret;
@@ -806,8 +806,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 
 	if (write) {
 		ret = invalidate_inode_pages2_range(inode->i_mapping,
-					pos >> PAGE_CACHE_SHIFT,
-					(pos + count) >> PAGE_CACHE_SHIFT);
+					pos >> PAGE_SHIFT,
+					(pos + count) >> PAGE_SHIFT);
 		if (ret < 0)
 			dout("invalidate_inode_pages2_range returned %d\n", ret);
 
@@ -872,7 +872,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
 			 * may block.
 			 */
 			truncate_inode_pages_range(inode->i_mapping, pos,
-					(pos+len) | (PAGE_CACHE_SIZE - 1));
+					(pos+len) | (PAGE_SIZE - 1));
 
 			osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
 		}
@@ -1006,8 +1006,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 		return ret;
 
 	ret = invalidate_inode_pages2_range(inode->i_mapping,
-					    pos >> PAGE_CACHE_SHIFT,
-					    (pos + count) >> PAGE_CACHE_SHIFT);
+					    pos >> PAGE_SHIFT,
+					    (pos + count) >> PAGE_SHIFT);
 	if (ret < 0)
 		dout("invalidate_inode_pages2_range returned %d\n", ret);
 
@@ -1036,7 +1036,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 		 * write from beginning of first page,
 		 * regardless of io alignment
 		 */
-		num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		num_pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
 		if (IS_ERR(pages)) {
@@ -1159,7 +1159,7 @@ again:
 	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
 	     inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
 	if (pinned_page) {
-		page_cache_release(pinned_page);
+		put_page(pinned_page);
 		pinned_page = NULL;
 	}
 	ceph_put_cap_refs(ci, got);
@@ -1188,10 +1188,10 @@ again:
 		if (retry_op == READ_INLINE) {
 			BUG_ON(ret > 0 || read > 0);
 			if (iocb->ki_pos < i_size &&
-			    iocb->ki_pos < PAGE_CACHE_SIZE) {
+			    iocb->ki_pos < PAGE_SIZE) {
 				loff_t end = min_t(loff_t, i_size,
 						   iocb->ki_pos + len);
-				end = min_t(loff_t, end, PAGE_CACHE_SIZE);
+				end = min_t(loff_t, end, PAGE_SIZE);
 				if (statret < end)
 					zero_user_segment(page, statret, end);
 				ret = copy_page_to_iter(page,
@@ -1463,21 +1463,21 @@ static inline void ceph_zero_partial_page(
 	struct inode *inode, loff_t offset, unsigned size)
 {
 	struct page *page;
-	pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+	pgoff_t index = offset >> PAGE_SHIFT;
 
 	page = find_lock_page(inode->i_mapping, index);
 	if (page) {
 		wait_on_page_writeback(page);
-		zero_user(page, offset & (PAGE_CACHE_SIZE - 1), size);
+		zero_user(page, offset & (PAGE_SIZE - 1), size);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 }
 
 static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
 				      loff_t length)
 {
-	loff_t nearly = round_up(offset, PAGE_CACHE_SIZE);
+	loff_t nearly = round_up(offset, PAGE_SIZE);
 	if (offset < nearly) {
 		loff_t size = nearly - offset;
 		if (length < size)
@@ -1486,8 +1486,8 @@ static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
 		offset += size;
 		length -= size;
 	}
-	if (length >= PAGE_CACHE_SIZE) {
-		loff_t size = round_down(length, PAGE_CACHE_SIZE);
+	if (length >= PAGE_SIZE) {
+		loff_t size = round_down(length, PAGE_SIZE);
 		truncate_pagecache_range(inode, offset, offset + size - 1);
 		offset += size;
 		length -= size;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ed58b168904a..edfade037738 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1338,7 +1338,7 @@ void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
 {
 	if (ctl->page) {
 		kunmap(ctl->page);
-		page_cache_release(ctl->page);
+		put_page(ctl->page);
 		ctl->page = NULL;
 	}
 }
@@ -1348,7 +1348,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
 			      struct ceph_mds_request *req)
 {
 	struct ceph_inode_info *ci = ceph_inode(dir);
-	unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*);
+	unsigned nsize = PAGE_SIZE / sizeof(struct dentry*);
 	unsigned idx = ctl->index % nsize;
 	pgoff_t pgoff = ctl->index / nsize;
 
@@ -1367,7 +1367,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
 		unlock_page(ctl->page);
 		ctl->dentries = kmap(ctl->page);
 		if (idx == 0)
-			memset(ctl->dentries, 0, PAGE_CACHE_SIZE);
+			memset(ctl->dentries, 0, PAGE_SIZE);
 	}
 
 	if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 44852c3ae531..541ead4d8965 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1610,7 +1610,7 @@ again:
 	while (!list_empty(&tmp_list)) {
 		if (!msg) {
 			msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE,
-					PAGE_CACHE_SIZE, GFP_NOFS, false);
+					PAGE_SIZE, GFP_NOFS, false);
 			if (!msg)
 				goto out_err;
 			head = msg->front.iov_base;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 37712ccffcc6..ee69a537dba5 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -97,7 +97,7 @@ struct ceph_mds_reply_info_parsed {
 /*
  * cap releases are batched and sent to the MDS en masse.
  */
-#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE -			\
+#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE -			\
 				sizeof(struct ceph_mds_cap_release)) /	\
 			       sizeof(struct ceph_mds_cap_item))
 
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c973043deb0e..f12d5e2955c2 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -560,7 +560,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
 
 	/* set up mempools */
 	err = -ENOMEM;
-	page_count = fsc->mount_options->wsize >> PAGE_CACHE_SHIFT;
+	page_count = fsc->mount_options->wsize >> PAGE_SHIFT;
 	size = sizeof (struct page *) * (page_count ? page_count : 1);
 	fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10, size);
 	if (!fsc->wb_pagevec_pool)
@@ -912,13 +912,13 @@ static int ceph_register_bdi(struct super_block *sb,
 	int err;
 
 	/* set ra_pages based on rasize mount option? */
-	if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE)
+	if (fsc->mount_options->rasize >= PAGE_SIZE)
 		fsc->backing_dev_info.ra_pages =
-			(fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1)
+			(fsc->mount_options->rasize + PAGE_SIZE - 1)
 			>> PAGE_SHIFT;
 	else
 		fsc->backing_dev_info.ra_pages =
-			VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
+			VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
 
 	err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
 			   atomic_long_inc_return(&bdi_seq));
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 1d86fc620e5c..89201564c346 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -962,7 +962,7 @@ static int cifs_clone_file_range(struct file *src_file, loff_t off,
 	cifs_dbg(FYI, "about to flush pages\n");
 	/* should we flush first and last page first */
 	truncate_inode_pages_range(&target_inode->i_data, destoff,
-				   PAGE_CACHE_ALIGN(destoff + len)-1);
+				   PAGE_ALIGN(destoff + len)-1);
 
 	if (target_tcon->ses->server->ops->duplicate_extents)
 		rc = target_tcon->ses->server->ops->duplicate_extents(xid,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 76fcb50295a3..a894bf809ff7 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1929,17 +1929,17 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
 
 		wsize = server->ops->wp_retry_size(inode);
 		if (wsize < rest_len) {
-			nr_pages = wsize / PAGE_CACHE_SIZE;
+			nr_pages = wsize / PAGE_SIZE;
 			if (!nr_pages) {
 				rc = -ENOTSUPP;
 				break;
 			}
-			cur_len = nr_pages * PAGE_CACHE_SIZE;
-			tailsz = PAGE_CACHE_SIZE;
+			cur_len = nr_pages * PAGE_SIZE;
+			tailsz = PAGE_SIZE;
 		} else {
-			nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE);
+			nr_pages = DIV_ROUND_UP(rest_len, PAGE_SIZE);
 			cur_len = rest_len;
-			tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE;
+			tailsz = rest_len - (nr_pages - 1) * PAGE_SIZE;
 		}
 
 		wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
@@ -1957,7 +1957,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
 		wdata2->sync_mode = wdata->sync_mode;
 		wdata2->nr_pages = nr_pages;
 		wdata2->offset = page_offset(wdata2->pages[0]);
-		wdata2->pagesz = PAGE_CACHE_SIZE;
+		wdata2->pagesz = PAGE_SIZE;
 		wdata2->tailsz = tailsz;
 		wdata2->bytes = cur_len;
 
@@ -1975,7 +1975,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
 			if (rc != 0 && rc != -EAGAIN) {
 				SetPageError(wdata2->pages[j]);
 				end_page_writeback(wdata2->pages[j]);
-				page_cache_release(wdata2->pages[j]);
+				put_page(wdata2->pages[j]);
 			}
 		}
 
@@ -2018,7 +2018,7 @@ cifs_writev_complete(struct work_struct *work)
 		else if (wdata->result < 0)
 			SetPageError(page);
 		end_page_writeback(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	if (wdata->result != -EAGAIN)
 		mapping_set_error(inode->i_mapping, wdata->result);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a763cd3d9e7c..6f62ac821a84 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3630,7 +3630,7 @@ try_mount_again:
 	cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
 
 	/* tune readahead according to rsize */
-	cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_CACHE_SIZE;
+	cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_SIZE;
 
 remote_path_check:
 #ifdef CONFIG_CIFS_DFS_UPCALL
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index ff882aeaccc6..5ce540dc6996 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1833,7 +1833,7 @@ refind_writable:
 static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 {
 	struct address_space *mapping = page->mapping;
-	loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	loff_t offset = (loff_t)page->index << PAGE_SHIFT;
 	char *write_data;
 	int rc = -EFAULT;
 	int bytes_written = 0;
@@ -1849,7 +1849,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
 	write_data = kmap(page);
 	write_data += from;
 
-	if ((to > PAGE_CACHE_SIZE) || (from > to)) {
+	if ((to > PAGE_SIZE) || (from > to)) {
 		kunmap(page);
 		return -EIO;
 	}
@@ -1991,7 +1991,7 @@ wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages,
 
 	/* put any pages we aren't going to use */
 	for (i = nr_pages; i < found_pages; i++) {
-		page_cache_release(wdata->pages[i]);
+		put_page(wdata->pages[i]);
 		wdata->pages[i] = NULL;
 	}
 
@@ -2009,11 +2009,11 @@ wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
 	wdata->sync_mode = wbc->sync_mode;
 	wdata->nr_pages = nr_pages;
 	wdata->offset = page_offset(wdata->pages[0]);
-	wdata->pagesz = PAGE_CACHE_SIZE;
+	wdata->pagesz = PAGE_SIZE;
 	wdata->tailsz = min(i_size_read(mapping->host) -
 			page_offset(wdata->pages[nr_pages - 1]),
-			(loff_t)PAGE_CACHE_SIZE);
-	wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz;
+			(loff_t)PAGE_SIZE);
+	wdata->bytes = ((nr_pages - 1) * PAGE_SIZE) + wdata->tailsz;
 
 	if (wdata->cfile != NULL)
 		cifsFileInfo_put(wdata->cfile);
@@ -2047,15 +2047,15 @@ static int cifs_writepages(struct address_space *mapping,
 	 * If wsize is smaller than the page cache size, default to writing
 	 * one page at a time via cifs_writepage
 	 */
-	if (cifs_sb->wsize < PAGE_CACHE_SIZE)
+	if (cifs_sb->wsize < PAGE_SIZE)
 		return generic_writepages(mapping, wbc);
 
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
 		end = -1;
 	} else {
-		index = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		index = wbc->range_start >> PAGE_SHIFT;
+		end = wbc->range_end >> PAGE_SHIFT;
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = true;
 		scanned = true;
@@ -2071,7 +2071,7 @@ retry:
 		if (rc)
 			break;
 
-		tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1;
+		tofind = min((wsize / PAGE_SIZE) - 1, end - index) + 1;
 
 		wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index,
 						  &found_pages);
@@ -2111,7 +2111,7 @@ retry:
 				else
 					SetPageError(wdata->pages[i]);
 				end_page_writeback(wdata->pages[i]);
-				page_cache_release(wdata->pages[i]);
+				put_page(wdata->pages[i]);
 			}
 			if (rc != -EAGAIN)
 				mapping_set_error(mapping, rc);
@@ -2154,7 +2154,7 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
 
 	xid = get_xid();
 /* BB add check for wbc flags */
-	page_cache_get(page);
+	get_page(page);
 	if (!PageUptodate(page))
 		cifs_dbg(FYI, "ppw - page not up to date\n");
 
@@ -2170,7 +2170,7 @@ cifs_writepage_locked(struct page *page, struct writeback_control *wbc)
 	 */
 	set_page_writeback(page);
 retry_write:
-	rc = cifs_partialpagewrite(page, 0, PAGE_CACHE_SIZE);
+	rc = cifs_partialpagewrite(page, 0, PAGE_SIZE);
 	if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL)
 		goto retry_write;
 	else if (rc == -EAGAIN)
@@ -2180,7 +2180,7 @@ retry_write:
 	else
 		SetPageUptodate(page);
 	end_page_writeback(page);
-	page_cache_release(page);
+	put_page(page);
 	free_xid(xid);
 	return rc;
 }
@@ -2214,12 +2214,12 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 		if (copied == len)
 			SetPageUptodate(page);
 		ClearPageChecked(page);
-	} else if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
+	} else if (!PageUptodate(page) && copied == PAGE_SIZE)
 		SetPageUptodate(page);
 
 	if (!PageUptodate(page)) {
 		char *page_data;
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+		unsigned offset = pos & (PAGE_SIZE - 1);
 		unsigned int xid;
 
 		xid = get_xid();
@@ -2248,7 +2248,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	}
 
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	return rc;
 }
@@ -3286,9 +3286,9 @@ cifs_readv_complete(struct work_struct *work)
 		    (rdata->result == -EAGAIN && got_bytes))
 			cifs_readpage_to_fscache(rdata->mapping->host, page);
 
-		got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes);
+		got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes);
 
-		page_cache_release(page);
+		put_page(page);
 		rdata->pages[i] = NULL;
 	}
 	kref_put(&rdata->refcount, cifs_readdata_release);
@@ -3307,21 +3307,21 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
 
 	/* determine the eof that the server (probably) has */
 	eof = CIFS_I(rdata->mapping->host)->server_eof;
-	eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
+	eof_index = eof ? (eof - 1) >> PAGE_SHIFT : 0;
 	cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
 
 	rdata->got_bytes = 0;
-	rdata->tailsz = PAGE_CACHE_SIZE;
+	rdata->tailsz = PAGE_SIZE;
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page = rdata->pages[i];
 
-		if (len >= PAGE_CACHE_SIZE) {
+		if (len >= PAGE_SIZE) {
 			/* enough data to fill the page */
 			iov.iov_base = kmap(page);
-			iov.iov_len = PAGE_CACHE_SIZE;
+			iov.iov_len = PAGE_SIZE;
 			cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n",
 				 i, page->index, iov.iov_base, iov.iov_len);
-			len -= PAGE_CACHE_SIZE;
+			len -= PAGE_SIZE;
 		} else if (len > 0) {
 			/* enough for partial page, fill and zero the rest */
 			iov.iov_base = kmap(page);
@@ -3329,7 +3329,7 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
 			cifs_dbg(FYI, "%u: idx=%lu iov_base=%p iov_len=%zu\n",
 				 i, page->index, iov.iov_base, iov.iov_len);
 			memset(iov.iov_base + len,
-				'\0', PAGE_CACHE_SIZE - len);
+				'\0', PAGE_SIZE - len);
 			rdata->tailsz = len;
 			len = 0;
 		} else if (page->index > eof_index) {
@@ -3341,12 +3341,12 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
 			 * to prevent the VFS from repeatedly attempting to
 			 * fill them until the writes are flushed.
 			 */
-			zero_user(page, 0, PAGE_CACHE_SIZE);
+			zero_user(page, 0, PAGE_SIZE);
 			lru_cache_add_file(page);
 			flush_dcache_page(page);
 			SetPageUptodate(page);
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			rdata->pages[i] = NULL;
 			rdata->nr_pages--;
 			continue;
@@ -3354,7 +3354,7 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
 			/* no need to hold page hostage */
 			lru_cache_add_file(page);
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			rdata->pages[i] = NULL;
 			rdata->nr_pages--;
 			continue;
@@ -3402,8 +3402,8 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
 	}
 
 	/* move first page to the tmplist */
-	*offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
-	*bytes = PAGE_CACHE_SIZE;
+	*offset = (loff_t)page->index << PAGE_SHIFT;
+	*bytes = PAGE_SIZE;
 	*nr_pages = 1;
 	list_move_tail(&page->lru, tmplist);
 
@@ -3415,7 +3415,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
 			break;
 
 		/* would this page push the read over the rsize? */
-		if (*bytes + PAGE_CACHE_SIZE > rsize)
+		if (*bytes + PAGE_SIZE > rsize)
 			break;
 
 		__SetPageLocked(page);
@@ -3424,7 +3424,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
 			break;
 		}
 		list_move_tail(&page->lru, tmplist);
-		(*bytes) += PAGE_CACHE_SIZE;
+		(*bytes) += PAGE_SIZE;
 		expected_index++;
 		(*nr_pages)++;
 	}
@@ -3493,7 +3493,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		 * reach this point however since we set ra_pages to 0 when the
 		 * rsize is smaller than a cache page.
 		 */
-		if (unlikely(rsize < PAGE_CACHE_SIZE)) {
+		if (unlikely(rsize < PAGE_SIZE)) {
 			add_credits_and_wake_if(server, credits, 0);
 			return 0;
 		}
@@ -3512,7 +3512,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 				list_del(&page->lru);
 				lru_cache_add_file(page);
 				unlock_page(page);
-				page_cache_release(page);
+				put_page(page);
 			}
 			rc = -ENOMEM;
 			add_credits_and_wake_if(server, credits, 0);
@@ -3524,7 +3524,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		rdata->offset = offset;
 		rdata->bytes = bytes;
 		rdata->pid = pid;
-		rdata->pagesz = PAGE_CACHE_SIZE;
+		rdata->pagesz = PAGE_SIZE;
 		rdata->read_into_pages = cifs_readpages_read_into_pages;
 		rdata->credits = credits;
 
@@ -3542,7 +3542,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 				page = rdata->pages[i];
 				lru_cache_add_file(page);
 				unlock_page(page);
-				page_cache_release(page);
+				put_page(page);
 			}
 			/* Fallback to the readpage in error/reconnect cases */
 			kref_put(&rdata->refcount, cifs_readdata_release);
@@ -3577,7 +3577,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 	read_data = kmap(page);
 	/* for reads over a certain size could initiate async read ahead */
 
-	rc = cifs_read(file, read_data, PAGE_CACHE_SIZE, poffset);
+	rc = cifs_read(file, read_data, PAGE_SIZE, poffset);
 
 	if (rc < 0)
 		goto io_error;
@@ -3587,8 +3587,8 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 	file_inode(file)->i_atime =
 		current_fs_time(file_inode(file)->i_sb);
 
-	if (PAGE_CACHE_SIZE > rc)
-		memset(read_data + rc, 0, PAGE_CACHE_SIZE - rc);
+	if (PAGE_SIZE > rc)
+		memset(read_data + rc, 0, PAGE_SIZE - rc);
 
 	flush_dcache_page(page);
 	SetPageUptodate(page);
@@ -3608,7 +3608,7 @@ read_complete:
 
 static int cifs_readpage(struct file *file, struct page *page)
 {
-	loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	loff_t offset = (loff_t)page->index << PAGE_SHIFT;
 	int rc = -EACCES;
 	unsigned int xid;
 
@@ -3679,8 +3679,8 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	int oncethru = 0;
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
+	pgoff_t index = pos >> PAGE_SHIFT;
+	loff_t offset = pos & (PAGE_SIZE - 1);
 	loff_t page_start = pos & PAGE_MASK;
 	loff_t i_size;
 	struct page *page;
@@ -3703,7 +3703,7 @@ start:
 	 * the server. If the write is short, we'll end up doing a sync write
 	 * instead.
 	 */
-	if (len == PAGE_CACHE_SIZE)
+	if (len == PAGE_SIZE)
 		goto out;
 
 	/*
@@ -3718,7 +3718,7 @@ start:
 		    (offset == 0 && (pos + len) >= i_size)) {
 			zero_user_segments(page, 0, offset,
 					   offset + len,
-					   PAGE_CACHE_SIZE);
+					   PAGE_SIZE);
 			/*
 			 * PageChecked means that the parts of the page
 			 * to which we're not writing are considered up
@@ -3737,7 +3737,7 @@ start:
 		 * do a sync write instead since PG_uptodate isn't set.
 		 */
 		cifs_readpage_worker(file, page, &page_start);
-		page_cache_release(page);
+		put_page(page);
 		oncethru = 1;
 		goto start;
 	} else {
@@ -3764,7 +3764,7 @@ static void cifs_invalidate_page(struct page *page, unsigned int offset,
 {
 	struct cifsInodeInfo *cifsi = CIFS_I(page->mapping->host);
 
-	if (offset == 0 && length == PAGE_CACHE_SIZE)
+	if (offset == 0 && length == PAGE_SIZE)
 		cifs_fscache_invalidate_page(page, &cifsi->vfs_inode);
 }
 
@@ -3772,7 +3772,7 @@ static int cifs_launder_page(struct page *page)
 {
 	int rc = 0;
 	loff_t range_start = page_offset(page);
-	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
+	loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1);
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = 0,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index aeb26dbfa1bf..5f9ad5c42180 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -59,7 +59,7 @@ static void cifs_set_ops(struct inode *inode)
 
 		/* check if server can support readpages */
 		if (cifs_sb_master_tcon(cifs_sb)->ses->server->maxBuf <
-				PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)
+				PAGE_SIZE + MAX_CIFS_HDR_SIZE)
 			inode->i_data.a_ops = &cifs_addr_ops_smallbuf;
 		else
 			inode->i_data.a_ops = &cifs_addr_ops;
@@ -2019,8 +2019,8 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 
 static int cifs_truncate_page(struct address_space *mapping, loff_t from)
 {
-	pgoff_t index = from >> PAGE_CACHE_SHIFT;
-	unsigned offset = from & (PAGE_CACHE_SIZE - 1);
+	pgoff_t index = from >> PAGE_SHIFT;
+	unsigned offset = from & (PAGE_SIZE - 1);
 	struct page *page;
 	int rc = 0;
 
@@ -2028,9 +2028,9 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
 	if (!page)
 		return -ENOMEM;
 
-	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	zero_user_segment(page, offset, PAGE_SIZE);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	return rc;
 }
 
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index a8f3b589a2df..cfd91320e869 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -71,8 +71,8 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *inode;
 	struct dentry *root;
 
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = CONFIGFS_MAGIC;
 	sb->s_op = &configfs_ops;
 	sb->s_time_gran = 1;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index b862bc219cd7..2096654dd26d 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -152,7 +152,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
  */
 #define BLKS_PER_BUF_SHIFT	(2)
 #define BLKS_PER_BUF		(1 << BLKS_PER_BUF_SHIFT)
-#define BUFFER_SIZE		(BLKS_PER_BUF*PAGE_CACHE_SIZE)
+#define BUFFER_SIZE		(BLKS_PER_BUF*PAGE_SIZE)
 
 static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE];
 static unsigned buffer_blocknr[READ_BUFFERS];
@@ -173,8 +173,8 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 
 	if (!len)
 		return NULL;
-	blocknr = offset >> PAGE_CACHE_SHIFT;
-	offset &= PAGE_CACHE_SIZE - 1;
+	blocknr = offset >> PAGE_SHIFT;
+	offset &= PAGE_SIZE - 1;
 
 	/* Check if an existing buffer already has the data.. */
 	for (i = 0; i < READ_BUFFERS; i++) {
@@ -184,14 +184,14 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 			continue;
 		if (blocknr < buffer_blocknr[i])
 			continue;
-		blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_CACHE_SHIFT;
+		blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_SHIFT;
 		blk_offset += offset;
 		if (blk_offset + len > BUFFER_SIZE)
 			continue;
 		return read_buffers[i] + blk_offset;
 	}
 
-	devsize = mapping->host->i_size >> PAGE_CACHE_SHIFT;
+	devsize = mapping->host->i_size >> PAGE_SHIFT;
 
 	/* Ok, read in BLKS_PER_BUF pages completely first. */
 	for (i = 0; i < BLKS_PER_BUF; i++) {
@@ -213,7 +213,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 			wait_on_page_locked(page);
 			if (!PageUptodate(page)) {
 				/* asynchronous error */
-				page_cache_release(page);
+				put_page(page);
 				pages[i] = NULL;
 			}
 		}
@@ -229,12 +229,12 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
 		struct page *page = pages[i];
 
 		if (page) {
-			memcpy(data, kmap(page), PAGE_CACHE_SIZE);
+			memcpy(data, kmap(page), PAGE_SIZE);
 			kunmap(page);
-			page_cache_release(page);
+			put_page(page);
 		} else
-			memset(data, 0, PAGE_CACHE_SIZE);
-		data += PAGE_CACHE_SIZE;
+			memset(data, 0, PAGE_SIZE);
+		data += PAGE_SIZE;
 	}
 	return read_buffers[buffer] + offset;
 }
@@ -353,7 +353,7 @@ static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
 
 	buf->f_type = CRAMFS_MAGIC;
-	buf->f_bsize = PAGE_CACHE_SIZE;
+	buf->f_bsize = PAGE_SIZE;
 	buf->f_blocks = CRAMFS_SB(sb)->blocks;
 	buf->f_bfree = 0;
 	buf->f_bavail = 0;
@@ -496,7 +496,7 @@ static int cramfs_readpage(struct file *file, struct page *page)
 	int bytes_filled;
 	void *pgdata;
 
-	maxblock = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	maxblock = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	bytes_filled = 0;
 	pgdata = kmap(page);
 
@@ -516,14 +516,14 @@ static int cramfs_readpage(struct file *file, struct page *page)
 
 		if (compr_len == 0)
 			; /* hole */
-		else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
+		else if (unlikely(compr_len > (PAGE_SIZE << 1))) {
 			pr_err("bad compressed blocksize %u\n",
 				compr_len);
 			goto err;
 		} else {
 			mutex_lock(&read_mutex);
 			bytes_filled = cramfs_uncompress_block(pgdata,
-				 PAGE_CACHE_SIZE,
+				 PAGE_SIZE,
 				 cramfs_read(sb, start_offset, compr_len),
 				 compr_len);
 			mutex_unlock(&read_mutex);
@@ -532,7 +532,7 @@ static int cramfs_readpage(struct file *file, struct page *page)
 		}
 	}
 
-	memset(pgdata + bytes_filled, 0, PAGE_CACHE_SIZE - bytes_filled);
+	memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled);
 	flush_dcache_page(page);
 	kunmap(page);
 	SetPageUptodate(page);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 06cd1a22240b..7f5804537d30 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -175,10 +175,10 @@ static int do_page_crypto(struct inode *inode,
 			FS_XTS_TWEAK_SIZE - sizeof(index));
 
 	sg_init_table(&dst, 1);
-	sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
+	sg_set_page(&dst, dest_page, PAGE_SIZE, 0);
 	sg_init_table(&src, 1);
-	sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
-	skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
+	sg_set_page(&src, src_page, PAGE_SIZE, 0);
+	skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE,
 					xts_tweak);
 	if (rw == FS_DECRYPT)
 		res = crypto_skcipher_decrypt(req);
@@ -287,7 +287,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
 	struct bio *bio;
 	int ret, err = 0;
 
-	BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
+	BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
 
 	ctx = fscrypt_get_ctx(inode);
 	if (IS_ERR(ctx))
diff --git a/fs/dax.c b/fs/dax.c
index 90322eb7498c..1f954166704a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -323,7 +323,7 @@ static int dax_load_hole(struct address_space *mapping, struct page *page,
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		return VM_FAULT_SIGBUS;
 	}
 
@@ -351,7 +351,7 @@ static int copy_user_bh(struct page *to, struct inode *inode,
 }
 
 #define NO_SECTOR -1
-#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))
+#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
 
 static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
 		sector_t sector, bool pmd_entry, bool dirty)
@@ -506,8 +506,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
 		return 0;
 
-	start_index = wbc->range_start >> PAGE_CACHE_SHIFT;
-	end_index = wbc->range_end >> PAGE_CACHE_SHIFT;
+	start_index = wbc->range_start >> PAGE_SHIFT;
+	end_index = wbc->range_end >> PAGE_SHIFT;
 	pmd_index = DAX_PMD_INDEX(start_index);
 
 	rcu_read_lock();
@@ -642,12 +642,12 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	page = find_get_page(mapping, vmf->pgoff);
 	if (page) {
 		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
-			page_cache_release(page);
+			put_page(page);
 			return VM_FAULT_RETRY;
 		}
 		if (unlikely(page->mapping != mapping)) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			goto repeat;
 		}
 		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -711,10 +711,10 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	if (page) {
 		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
-							PAGE_CACHE_SIZE, 0);
+							PAGE_SIZE, 0);
 		delete_from_page_cache(page);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		page = NULL;
 	}
 
@@ -747,7 +747,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
  unlock_page:
 	if (page) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	goto out;
 }
@@ -1104,18 +1104,18 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
 							get_block_t get_block)
 {
 	struct buffer_head bh;
-	pgoff_t index = from >> PAGE_CACHE_SHIFT;
-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	pgoff_t index = from >> PAGE_SHIFT;
+	unsigned offset = from & (PAGE_SIZE-1);
 	int err;
 
 	/* Block boundary? Nothing to do */
 	if (!length)
 		return 0;
-	BUG_ON((offset + length) > PAGE_CACHE_SIZE);
+	BUG_ON((offset + length) > PAGE_SIZE);
 
 	memset(&bh, 0, sizeof(bh));
 	bh.b_bdev = inode->i_sb->s_bdev;
-	bh.b_size = PAGE_CACHE_SIZE;
+	bh.b_size = PAGE_SIZE;
 	err = get_block(inode, index, &bh, 0);
 	if (err < 0)
 		return err;
@@ -1123,7 +1123,7 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
 		struct block_device *bdev = bh.b_bdev;
 		struct blk_dax_ctl dax = {
 			.sector = to_sector(&bh, inode),
-			.size = PAGE_CACHE_SIZE,
+			.size = PAGE_SIZE,
 		};
 
 		if (dax_map_atomic(bdev, &dax) < 0)
@@ -1154,7 +1154,7 @@ EXPORT_SYMBOL_GPL(dax_zero_page_range);
  */
 int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 {
-	unsigned length = PAGE_CACHE_ALIGN(from) - from;
+	unsigned length = PAGE_ALIGN(from) - from;
 	return dax_zero_page_range(inode, from, length, get_block);
 }
 EXPORT_SYMBOL_GPL(dax_truncate_page);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 476f1ecbd1f0..472037732daf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -172,7 +172,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
 		 */
 		if (dio->page_errors == 0)
 			dio->page_errors = ret;
-		page_cache_get(page);
+		get_page(page);
 		dio->pages[0] = page;
 		sdio->head = 0;
 		sdio->tail = 1;
@@ -424,7 +424,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
 static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
 {
 	while (sdio->head < sdio->tail)
-		page_cache_release(dio->pages[sdio->head++]);
+		put_page(dio->pages[sdio->head++]);
 }
 
 /*
@@ -487,7 +487,7 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 			if (dio->rw == READ && !PageCompound(page) &&
 					dio->should_dirty)
 				set_page_dirty_lock(page);
-			page_cache_release(page);
+			put_page(page);
 		}
 		err = bio->bi_error;
 		bio_put(bio);
@@ -696,7 +696,7 @@ static inline int dio_bio_add_page(struct dio_submit *sdio)
 		 */
 		if ((sdio->cur_page_len + sdio->cur_page_offset) == PAGE_SIZE)
 			sdio->pages_in_io--;
-		page_cache_get(sdio->cur_page);
+		get_page(sdio->cur_page);
 		sdio->final_block_in_bio = sdio->cur_page_block +
 			(sdio->cur_page_len >> sdio->blkbits);
 		ret = 0;
@@ -810,13 +810,13 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 	 */
 	if (sdio->cur_page) {
 		ret = dio_send_cur_page(dio, sdio, map_bh);
-		page_cache_release(sdio->cur_page);
+		put_page(sdio->cur_page);
 		sdio->cur_page = NULL;
 		if (ret)
 			return ret;
 	}
 
-	page_cache_get(page);		/* It is in dio */
+	get_page(page);		/* It is in dio */
 	sdio->cur_page = page;
 	sdio->cur_page_offset = offset;
 	sdio->cur_page_len = len;
@@ -830,7 +830,7 @@ out:
 	if (sdio->boundary) {
 		ret = dio_send_cur_page(dio, sdio, map_bh);
 		dio_bio_submit(dio, sdio);
-		page_cache_release(sdio->cur_page);
+		put_page(sdio->cur_page);
 		sdio->cur_page = NULL;
 	}
 	return ret;
@@ -947,7 +947,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 
 				ret = get_more_blocks(dio, sdio, map_bh);
 				if (ret) {
-					page_cache_release(page);
+					put_page(page);
 					goto out;
 				}
 				if (!buffer_mapped(map_bh))
@@ -988,7 +988,7 @@ do_holes:
 
 				/* AKPM: eargh, -ENOTBLK is a hack */
 				if (dio->rw & WRITE) {
-					page_cache_release(page);
+					put_page(page);
 					return -ENOTBLK;
 				}
 
@@ -1001,7 +1001,7 @@ do_holes:
 				if (sdio->block_in_file >=
 						i_size_aligned >> blkbits) {
 					/* We hit eof */
-					page_cache_release(page);
+					put_page(page);
 					goto out;
 				}
 				zero_user(page, from, 1 << blkbits);
@@ -1041,7 +1041,7 @@ do_holes:
 						  sdio->next_block_for_io,
 						  map_bh);
 			if (ret) {
-				page_cache_release(page);
+				put_page(page);
 				goto out;
 			}
 			sdio->next_block_for_io += this_chunk_blocks;
@@ -1057,7 +1057,7 @@ next_block:
 		}
 
 		/* Drop the ref which was taken in get_user_pages() */
-		page_cache_release(page);
+		put_page(page);
 	}
 out:
 	return ret;
@@ -1281,7 +1281,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
 		if (retval == 0)
 			retval = ret2;
-		page_cache_release(sdio.cur_page);
+		put_page(sdio.cur_page);
 		sdio.cur_page = NULL;
 	}
 	if (sdio.bio)
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 00640e70ed7a..1ab012a27d9f 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -640,7 +640,7 @@ static int receive_from_sock(struct connection *con)
 		con->rx_page = alloc_page(GFP_ATOMIC);
 		if (con->rx_page == NULL)
 			goto out_resched;
-		cbuf_init(&con->cb, PAGE_CACHE_SIZE);
+		cbuf_init(&con->cb, PAGE_SIZE);
 	}
 
 	/*
@@ -657,7 +657,7 @@ static int receive_from_sock(struct connection *con)
 	 * buffer and the start of the currently used section (cb.base)
 	 */
 	if (cbuf_data(&con->cb) >= con->cb.base) {
-		iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb);
+		iov[0].iov_len = PAGE_SIZE - cbuf_data(&con->cb);
 		iov[1].iov_len = con->cb.base;
 		iov[1].iov_base = page_address(con->rx_page);
 		nvec = 2;
@@ -675,7 +675,7 @@ static int receive_from_sock(struct connection *con)
 	ret = dlm_process_incoming_buffer(con->nodeid,
 					  page_address(con->rx_page),
 					  con->cb.base, con->cb.len,
-					  PAGE_CACHE_SIZE);
+					  PAGE_SIZE);
 	if (ret == -EBADMSG) {
 		log_print("lowcomms: addr=%p, base=%u, len=%u, read=%d",
 			  page_address(con->rx_page), con->cb.base,
@@ -1416,7 +1416,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
 	spin_lock(&con->writequeue_lock);
 	e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
 	if ((&e->list == &con->writequeue) ||
-	    (PAGE_CACHE_SIZE - e->end < len)) {
+	    (PAGE_SIZE - e->end < len)) {
 		e = NULL;
 	} else {
 		offset = e->end;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 64026e53722a..d09cb4cdd09f 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -286,7 +286,7 @@ int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg,
 		pg = virt_to_page(addr);
 		offset = offset_in_page(addr);
 		sg_set_page(&sg[i], pg, 0, offset);
-		remainder_of_page = PAGE_CACHE_SIZE - offset;
+		remainder_of_page = PAGE_SIZE - offset;
 		if (size >= remainder_of_page) {
 			sg[i].length = remainder_of_page;
 			addr += remainder_of_page;
@@ -400,7 +400,7 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
 				    struct page *page)
 {
 	return ecryptfs_lower_header_size(crypt_stat) +
-	       ((loff_t)page->index << PAGE_CACHE_SHIFT);
+	       ((loff_t)page->index << PAGE_SHIFT);
 }
 
 /**
@@ -428,7 +428,7 @@ static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
 	size_t extent_size = crypt_stat->extent_size;
 	int rc;
 
-	extent_base = (((loff_t)page_index) * (PAGE_CACHE_SIZE / extent_size));
+	extent_base = (((loff_t)page_index) * (PAGE_SIZE / extent_size));
 	rc = ecryptfs_derive_iv(extent_iv, crypt_stat,
 				(extent_base + extent_offset));
 	if (rc) {
@@ -498,7 +498,7 @@ int ecryptfs_encrypt_page(struct page *page)
 	}
 
 	for (extent_offset = 0;
-	     extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
+	     extent_offset < (PAGE_SIZE / crypt_stat->extent_size);
 	     extent_offset++) {
 		rc = crypt_extent(crypt_stat, enc_extent_page, page,
 				  extent_offset, ENCRYPT);
@@ -512,7 +512,7 @@ int ecryptfs_encrypt_page(struct page *page)
 	lower_offset = lower_offset_for_page(crypt_stat, page);
 	enc_extent_virt = kmap(enc_extent_page);
 	rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
-				  PAGE_CACHE_SIZE);
+				  PAGE_SIZE);
 	kunmap(enc_extent_page);
 	if (rc < 0) {
 		ecryptfs_printk(KERN_ERR,
@@ -560,7 +560,7 @@ int ecryptfs_decrypt_page(struct page *page)
 
 	lower_offset = lower_offset_for_page(crypt_stat, page);
 	page_virt = kmap(page);
-	rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_CACHE_SIZE,
+	rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE,
 				 ecryptfs_inode);
 	kunmap(page);
 	if (rc < 0) {
@@ -571,7 +571,7 @@ int ecryptfs_decrypt_page(struct page *page)
 	}
 
 	for (extent_offset = 0;
-	     extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
+	     extent_offset < (PAGE_SIZE / crypt_stat->extent_size);
 	     extent_offset++) {
 		rc = crypt_extent(crypt_stat, page, page,
 				  extent_offset, DECRYPT);
@@ -659,11 +659,11 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
 	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
 		crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 	else {
-		if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
+		if (PAGE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
 			crypt_stat->metadata_size =
 				ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
 		else
-			crypt_stat->metadata_size = PAGE_CACHE_SIZE;
+			crypt_stat->metadata_size = PAGE_SIZE;
 	}
 }
 
@@ -1442,7 +1442,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
 						ECRYPTFS_VALIDATE_HEADER_SIZE);
 	if (rc) {
 		/* metadata is not in the file header, so try xattrs */
-		memset(page_virt, 0, PAGE_CACHE_SIZE);
+		memset(page_virt, 0, PAGE_SIZE);
 		rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
 		if (rc) {
 			printk(KERN_DEBUG "Valid eCryptfs headers not found in "
@@ -1475,7 +1475,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
 	}
 out:
 	if (page_virt) {
-		memset(page_virt, 0, PAGE_CACHE_SIZE);
+		memset(page_virt, 0, PAGE_SIZE);
 		kmem_cache_free(ecryptfs_header_cache, page_virt);
 	}
 	return rc;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 121114e9a464..2a988f3a0cdb 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -765,8 +765,8 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
 		 * in which ia->ia_size is located. Fill in the end of
 		 * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to
 		 * PAGE_CACHE_SIZE with zeros. */
-		size_t num_zeros = (PAGE_CACHE_SIZE
-				    - (ia->ia_size & ~PAGE_CACHE_MASK));
+		size_t num_zeros = (PAGE_SIZE
+				    - (ia->ia_size & ~PAGE_MASK));
 
 		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
 			truncate_setsize(inode, ia->ia_size);
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 9893d1538122..3cf1546dca82 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1798,7 +1798,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
 	 * added the our &auth_tok_list */
 	next_packet_is_auth_tok_packet = 1;
 	while (next_packet_is_auth_tok_packet) {
-		size_t max_packet_size = ((PAGE_CACHE_SIZE - 8) - i);
+		size_t max_packet_size = ((PAGE_SIZE - 8) - i);
 
 		switch (src[i]) {
 		case ECRYPTFS_TAG_3_PACKET_TYPE:
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 8b0b4a73116d..1698132d0e57 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -695,12 +695,12 @@ static struct ecryptfs_cache_info {
 	{
 		.cache = &ecryptfs_header_cache,
 		.name = "ecryptfs_headers",
-		.size = PAGE_CACHE_SIZE,
+		.size = PAGE_SIZE,
 	},
 	{
 		.cache = &ecryptfs_xattr_cache,
 		.name = "ecryptfs_xattr_cache",
-		.size = PAGE_CACHE_SIZE,
+		.size = PAGE_SIZE,
 	},
 	{
 		.cache = &ecryptfs_key_record_cache,
@@ -818,7 +818,7 @@ static int __init ecryptfs_init(void)
 {
 	int rc;
 
-	if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_CACHE_SIZE) {
+	if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_SIZE) {
 		rc = -EINVAL;
 		ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is "
 				"larger than the host's page size, and so "
@@ -826,7 +826,7 @@ static int __init ecryptfs_init(void)
 				"default eCryptfs extent size is [%u] bytes; "
 				"the page size is [%lu] bytes.\n",
 				ECRYPTFS_DEFAULT_EXTENT_SIZE,
-				(unsigned long)PAGE_CACHE_SIZE);
+				(unsigned long)PAGE_SIZE);
 		goto out;
 	}
 	rc = ecryptfs_init_kmem_caches();
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 1f5865263b3e..e6b1d80952b9 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -122,7 +122,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 				       struct ecryptfs_crypt_stat *crypt_stat)
 {
 	loff_t extent_num_in_page = 0;
-	loff_t num_extents_per_page = (PAGE_CACHE_SIZE
+	loff_t num_extents_per_page = (PAGE_SIZE
 				       / crypt_stat->extent_size);
 	int rc = 0;
 
@@ -138,7 +138,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 			char *page_virt;
 
 			page_virt = kmap_atomic(page);
-			memset(page_virt, 0, PAGE_CACHE_SIZE);
+			memset(page_virt, 0, PAGE_SIZE);
 			/* TODO: Support more than one header extent */
 			if (view_extent_num == 0) {
 				size_t written;
@@ -164,8 +164,8 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
 				 - crypt_stat->metadata_size);
 
 			rc = ecryptfs_read_lower_page_segment(
-				page, (lower_offset >> PAGE_CACHE_SHIFT),
-				(lower_offset & ~PAGE_CACHE_MASK),
+				page, (lower_offset >> PAGE_SHIFT),
+				(lower_offset & ~PAGE_MASK),
 				crypt_stat->extent_size, page->mapping->host);
 			if (rc) {
 				printk(KERN_ERR "%s: Error attempting to read "
@@ -198,7 +198,7 @@ static int ecryptfs_readpage(struct file *file, struct page *page)
 
 	if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
 		rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
-						      PAGE_CACHE_SIZE,
+						      PAGE_SIZE,
 						      page->mapping->host);
 	} else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) {
 		if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
@@ -215,7 +215,7 @@ static int ecryptfs_readpage(struct file *file, struct page *page)
 
 		} else {
 			rc = ecryptfs_read_lower_page_segment(
-				page, page->index, 0, PAGE_CACHE_SIZE,
+				page, page->index, 0, PAGE_SIZE,
 				page->mapping->host);
 			if (rc) {
 				printk(KERN_ERR "Error reading page; rc = "
@@ -250,12 +250,12 @@ static int fill_zeros_to_end_of_page(struct page *page, unsigned int to)
 	struct inode *inode = page->mapping->host;
 	int end_byte_in_page;
 
-	if ((i_size_read(inode) / PAGE_CACHE_SIZE) != page->index)
+	if ((i_size_read(inode) / PAGE_SIZE) != page->index)
 		goto out;
-	end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE;
+	end_byte_in_page = i_size_read(inode) % PAGE_SIZE;
 	if (to > end_byte_in_page)
 		end_byte_in_page = to;
-	zero_user_segment(page, end_byte_in_page, PAGE_CACHE_SIZE);
+	zero_user_segment(page, end_byte_in_page, PAGE_SIZE);
 out:
 	return 0;
 }
@@ -279,7 +279,7 @@ static int ecryptfs_write_begin(struct file *file,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t index = pos >> PAGE_SHIFT;
 	struct page *page;
 	loff_t prev_page_end_size;
 	int rc = 0;
@@ -289,14 +289,14 @@ static int ecryptfs_write_begin(struct file *file,
 		return -ENOMEM;
 	*pagep = page;
 
-	prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT);
+	prev_page_end_size = ((loff_t)index << PAGE_SHIFT);
 	if (!PageUptodate(page)) {
 		struct ecryptfs_crypt_stat *crypt_stat =
 			&ecryptfs_inode_to_private(mapping->host)->crypt_stat;
 
 		if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
 			rc = ecryptfs_read_lower_page_segment(
-				page, index, 0, PAGE_CACHE_SIZE, mapping->host);
+				page, index, 0, PAGE_SIZE, mapping->host);
 			if (rc) {
 				printk(KERN_ERR "%s: Error attempting to read "
 				       "lower page segment; rc = [%d]\n",
@@ -322,7 +322,7 @@ static int ecryptfs_write_begin(struct file *file,
 				SetPageUptodate(page);
 			} else {
 				rc = ecryptfs_read_lower_page_segment(
-					page, index, 0, PAGE_CACHE_SIZE,
+					page, index, 0, PAGE_SIZE,
 					mapping->host);
 				if (rc) {
 					printk(KERN_ERR "%s: Error reading "
@@ -336,9 +336,9 @@ static int ecryptfs_write_begin(struct file *file,
 		} else {
 			if (prev_page_end_size
 			    >= i_size_read(page->mapping->host)) {
-				zero_user(page, 0, PAGE_CACHE_SIZE);
+				zero_user(page, 0, PAGE_SIZE);
 				SetPageUptodate(page);
-			} else if (len < PAGE_CACHE_SIZE) {
+			} else if (len < PAGE_SIZE) {
 				rc = ecryptfs_decrypt_page(page);
 				if (rc) {
 					printk(KERN_ERR "%s: Error decrypting "
@@ -371,11 +371,11 @@ static int ecryptfs_write_begin(struct file *file,
 	 * of page?  Zero it out. */
 	if ((i_size_read(mapping->host) == prev_page_end_size)
 	    && (pos != 0))
-		zero_user(page, 0, PAGE_CACHE_SIZE);
+		zero_user(page, 0, PAGE_SIZE);
 out:
 	if (unlikely(rc)) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		*pagep = NULL;
 	}
 	return rc;
@@ -437,7 +437,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
 	}
 	inode_lock(lower_inode);
 	size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
-					   xattr_virt, PAGE_CACHE_SIZE);
+					   xattr_virt, PAGE_SIZE);
 	if (size < 0)
 		size = 8;
 	put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
@@ -479,8 +479,8 @@ static int ecryptfs_write_end(struct file *file,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
 {
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	pgoff_t index = pos >> PAGE_SHIFT;
+	unsigned from = pos & (PAGE_SIZE - 1);
 	unsigned to = from + copied;
 	struct inode *ecryptfs_inode = mapping->host;
 	struct ecryptfs_crypt_stat *crypt_stat =
@@ -500,7 +500,7 @@ static int ecryptfs_write_end(struct file *file,
 		goto out;
 	}
 	if (!PageUptodate(page)) {
-		if (copied < PAGE_CACHE_SIZE) {
+		if (copied < PAGE_SIZE) {
 			rc = 0;
 			goto out;
 		}
@@ -533,7 +533,7 @@ static int ecryptfs_write_end(struct file *file,
 		rc = copied;
 out:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	return rc;
 }
 
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 09fe622274e4..158a3a39f82d 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -74,7 +74,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
 	loff_t offset;
 	int rc;
 
-	offset = ((((loff_t)page_for_lower->index) << PAGE_CACHE_SHIFT)
+	offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT)
 		  + offset_in_page);
 	virt = kmap(page_for_lower);
 	rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
@@ -123,9 +123,9 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
 	else
 		pos = offset;
 	while (pos < (offset + size)) {
-		pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
-		size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
-		size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
+		pgoff_t ecryptfs_page_idx = (pos >> PAGE_SHIFT);
+		size_t start_offset_in_page = (pos & ~PAGE_MASK);
+		size_t num_bytes = (PAGE_SIZE - start_offset_in_page);
 		loff_t total_remaining_bytes = ((offset + size) - pos);
 
 		if (fatal_signal_pending(current)) {
@@ -165,7 +165,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
 			 * Fill in zero values to the end of the page */
 			memset(((char *)ecryptfs_page_virt
 				+ start_offset_in_page), 0,
-				PAGE_CACHE_SIZE - start_offset_in_page);
+				PAGE_SIZE - start_offset_in_page);
 		}
 
 		/* pos >= offset, we are now writing the data request */
@@ -186,7 +186,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
 						ecryptfs_page,
 						start_offset_in_page,
 						data_offset);
-		page_cache_release(ecryptfs_page);
+		put_page(ecryptfs_page);
 		if (rc) {
 			printk(KERN_ERR "%s: Error encrypting "
 			       "page; rc = [%d]\n", __func__, rc);
@@ -262,7 +262,7 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
 	loff_t offset;
 	int rc;
 
-	offset = ((((loff_t)page_index) << PAGE_CACHE_SHIFT) + offset_in_page);
+	offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page);
 	virt = kmap(page_for_ecryptfs);
 	rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
 	if (rc > 0)
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index dd029d13ea61..553c5d2db4a4 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -197,8 +197,8 @@ static int efivarfs_fill_super(struct super_block *sb, void *data, int silent)
 	efivarfs_sb = sb;
 
 	sb->s_maxbytes          = MAX_LFS_FILESIZE;
-	sb->s_blocksize         = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits    = PAGE_CACHE_SHIFT;
+	sb->s_blocksize         = PAGE_SIZE;
+	sb->s_blocksize_bits    = PAGE_SHIFT;
 	sb->s_magic             = EFIVARFS_MAGIC;
 	sb->s_op                = &efivarfs_ops;
 	sb->s_d_op		= &efivarfs_d_ops;
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index e5bb2abf77f9..547b93cbea63 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -41,16 +41,16 @@ static inline unsigned exofs_chunk_size(struct inode *inode)
 static inline void exofs_put_page(struct page *page)
 {
 	kunmap(page);
-	page_cache_release(page);
+	put_page(page);
 }
 
 static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
 {
 	loff_t last_byte = inode->i_size;
 
-	last_byte -= page_nr << PAGE_CACHE_SHIFT;
-	if (last_byte > PAGE_CACHE_SIZE)
-		last_byte = PAGE_CACHE_SIZE;
+	last_byte -= page_nr << PAGE_SHIFT;
+	if (last_byte > PAGE_SIZE)
+		last_byte = PAGE_SIZE;
 	return last_byte;
 }
 
@@ -85,13 +85,13 @@ static void exofs_check_page(struct page *page)
 	unsigned chunk_size = exofs_chunk_size(dir);
 	char *kaddr = page_address(page);
 	unsigned offs, rec_len;
-	unsigned limit = PAGE_CACHE_SIZE;
+	unsigned limit = PAGE_SIZE;
 	struct exofs_dir_entry *p;
 	char *error;
 
 	/* if the page is the last one in the directory */
-	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
-		limit = dir->i_size & ~PAGE_CACHE_MASK;
+	if ((dir->i_size >> PAGE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_MASK;
 		if (limit & (chunk_size - 1))
 			goto Ebadsize;
 		if (!limit)
@@ -138,7 +138,7 @@ bad_entry:
 	EXOFS_ERR(
 		"ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
 		"offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n",
-		dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
 		_LLU(le64_to_cpu(p->inode_no)),
 		rec_len, p->name_len);
 	goto fail;
@@ -147,7 +147,7 @@ Eend:
 	EXOFS_ERR("ERROR [exofs_check_page]: "
 		"entry in directory(0x%lx) spans the page boundary"
 		"offset=%lu, inode=0x%llx\n",
-		dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
 		_LLU(le64_to_cpu(p->inode_no)));
 fail:
 	SetPageChecked(page);
@@ -237,8 +237,8 @@ exofs_readdir(struct file *file, struct dir_context *ctx)
 {
 	loff_t pos = ctx->pos;
 	struct inode *inode = file_inode(file);
-	unsigned int offset = pos & ~PAGE_CACHE_MASK;
-	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned int offset = pos & ~PAGE_MASK;
+	unsigned long n = pos >> PAGE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
 	int need_revalidate = (file->f_version != inode->i_version);
@@ -254,7 +254,7 @@ exofs_readdir(struct file *file, struct dir_context *ctx)
 		if (IS_ERR(page)) {
 			EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
 				  inode->i_ino);
-			ctx->pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_SIZE - offset;
 			return PTR_ERR(page);
 		}
 		kaddr = page_address(page);
@@ -262,7 +262,7 @@ exofs_readdir(struct file *file, struct dir_context *ctx)
 			if (offset) {
 				offset = exofs_validate_entry(kaddr, offset,
 								chunk_mask);
-				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
+				ctx->pos = (n<<PAGE_SHIFT) + offset;
 			}
 			file->f_version = inode->i_version;
 			need_revalidate = 0;
@@ -449,7 +449,7 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
 		kaddr = page_address(page);
 		dir_end = kaddr + exofs_last_byte(dir, n);
 		de = (struct exofs_dir_entry *)kaddr;
-		kaddr += PAGE_CACHE_SIZE - reclen;
+		kaddr += PAGE_SIZE - reclen;
 		while ((char *)de <= kaddr) {
 			if ((char *)de == dir_end) {
 				name_len = 0;
@@ -602,7 +602,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
 	kunmap_atomic(kaddr);
 	err = exofs_commit_chunk(page, 0, chunk_size);
 fail:
-	page_cache_release(page);
+	put_page(page);
 	return err;
 }
 
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 9eaf595aeaf8..49e1bd00b4ec 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -317,7 +317,7 @@ static int read_exec(struct page_collect *pcol)
 
 	if (!pcol->ios) {
 		int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true,
-					     pcol->pg_first << PAGE_CACHE_SHIFT,
+					     pcol->pg_first << PAGE_SHIFT,
 					     pcol->length, &pcol->ios);
 
 		if (ret)
@@ -383,7 +383,7 @@ static int readpage_strip(void *data, struct page *page)
 	struct inode *inode = pcol->inode;
 	struct exofs_i_info *oi = exofs_i(inode);
 	loff_t i_size = i_size_read(inode);
-	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	pgoff_t end_index = i_size >> PAGE_SHIFT;
 	size_t len;
 	int ret;
 
@@ -397,9 +397,9 @@ static int readpage_strip(void *data, struct page *page)
 	pcol->that_locked_page = page;
 
 	if (page->index < end_index)
-		len = PAGE_CACHE_SIZE;
+		len = PAGE_SIZE;
 	else if (page->index == end_index)
-		len = i_size & ~PAGE_CACHE_MASK;
+		len = i_size & ~PAGE_MASK;
 	else
 		len = 0;
 
@@ -442,8 +442,8 @@ try_again:
 			goto fail;
 	}
 
-	if (len != PAGE_CACHE_SIZE)
-		zero_user(page, len, PAGE_CACHE_SIZE - len);
+	if (len != PAGE_SIZE)
+		zero_user(page, len, PAGE_SIZE - len);
 
 	EXOFS_DBGMSG2("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
 		     inode->i_ino, page->index, len);
@@ -609,7 +609,7 @@ static void __r4w_put_page(void *priv, struct page *page)
 
 	if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
 		EXOFS_DBGMSG2("index=0x%lx\n", page->index);
-		page_cache_release(page);
+		put_page(page);
 		return;
 	}
 	EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
@@ -633,7 +633,7 @@ static int write_exec(struct page_collect *pcol)
 
 	BUG_ON(pcol->ios);
 	ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false,
-				 pcol->pg_first << PAGE_CACHE_SHIFT,
+				 pcol->pg_first << PAGE_SHIFT,
 				 pcol->length, &pcol->ios);
 	if (unlikely(ret))
 		goto err;
@@ -696,7 +696,7 @@ static int writepage_strip(struct page *page,
 	struct inode *inode = pcol->inode;
 	struct exofs_i_info *oi = exofs_i(inode);
 	loff_t i_size = i_size_read(inode);
-	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	pgoff_t end_index = i_size >> PAGE_SHIFT;
 	size_t len;
 	int ret;
 
@@ -708,9 +708,9 @@ static int writepage_strip(struct page *page,
 
 	if (page->index < end_index)
 		/* in this case, the page is within the limits of the file */
-		len = PAGE_CACHE_SIZE;
+		len = PAGE_SIZE;
 	else {
-		len = i_size & ~PAGE_CACHE_MASK;
+		len = i_size & ~PAGE_MASK;
 
 		if (page->index > end_index || !len) {
 			/* in this case, the page is outside the limits
@@ -790,10 +790,10 @@ static int exofs_writepages(struct address_space *mapping,
 	long start, end, expected_pages;
 	int ret;
 
-	start = wbc->range_start >> PAGE_CACHE_SHIFT;
+	start = wbc->range_start >> PAGE_SHIFT;
 	end = (wbc->range_end == LLONG_MAX) ?
 			start + mapping->nrpages :
-			wbc->range_end >> PAGE_CACHE_SHIFT;
+			wbc->range_end >> PAGE_SHIFT;
 
 	if (start || end)
 		expected_pages = end - start + 1;
@@ -881,15 +881,15 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
 	}
 
 	 /* read modify write */
-	if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
+	if (!PageUptodate(page) && (len != PAGE_SIZE)) {
 		loff_t i_size = i_size_read(mapping->host);
-		pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+		pgoff_t end_index = i_size >> PAGE_SHIFT;
 		size_t rlen;
 
 		if (page->index < end_index)
-			rlen = PAGE_CACHE_SIZE;
+			rlen = PAGE_SIZE;
 		else if (page->index == end_index)
-			rlen = i_size & ~PAGE_CACHE_MASK;
+			rlen = i_size & ~PAGE_MASK;
 		else
 			rlen = 0;
 
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index c20d77df2679..622a686bb08b 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -292,11 +292,11 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
 out_dir:
 	if (dir_de) {
 		kunmap(dir_page);
-		page_cache_release(dir_page);
+		put_page(dir_page);
 	}
 out_old:
 	kunmap(old_page);
-	page_cache_release(old_page);
+	put_page(old_page);
 out:
 	return err;
 }
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 0c6638b40f21..1b45694de316 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -67,7 +67,7 @@ static inline unsigned ext2_chunk_size(struct inode *inode)
 static inline void ext2_put_page(struct page *page)
 {
 	kunmap(page);
-	page_cache_release(page);
+	put_page(page);
 }
 
 /*
@@ -79,9 +79,9 @@ ext2_last_byte(struct inode *inode, unsigned long page_nr)
 {
 	unsigned last_byte = inode->i_size;
 
-	last_byte -= page_nr << PAGE_CACHE_SHIFT;
-	if (last_byte > PAGE_CACHE_SIZE)
-		last_byte = PAGE_CACHE_SIZE;
+	last_byte -= page_nr << PAGE_SHIFT;
+	if (last_byte > PAGE_SIZE)
+		last_byte = PAGE_SIZE;
 	return last_byte;
 }
 
@@ -118,12 +118,12 @@ static void ext2_check_page(struct page *page, int quiet)
 	char *kaddr = page_address(page);
 	u32 max_inumber = le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count);
 	unsigned offs, rec_len;
-	unsigned limit = PAGE_CACHE_SIZE;
+	unsigned limit = PAGE_SIZE;
 	ext2_dirent *p;
 	char *error;
 
-	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
-		limit = dir->i_size & ~PAGE_CACHE_MASK;
+	if ((dir->i_size >> PAGE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_MASK;
 		if (limit & (chunk_size - 1))
 			goto Ebadsize;
 		if (!limit)
@@ -176,7 +176,7 @@ bad_entry:
 	if (!quiet)
 		ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - "
 			"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
-			dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+			dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
 			(unsigned long) le32_to_cpu(p->inode),
 			rec_len, p->name_len);
 	goto fail;
@@ -186,7 +186,7 @@ Eend:
 		ext2_error(sb, "ext2_check_page",
 			"entry in directory #%lu spans the page boundary"
 			"offset=%lu, inode=%lu",
-			dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+			dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
 			(unsigned long) le32_to_cpu(p->inode));
 	}
 fail:
@@ -287,8 +287,8 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
 	loff_t pos = ctx->pos;
 	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	unsigned int offset = pos & ~PAGE_CACHE_MASK;
-	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned int offset = pos & ~PAGE_MASK;
+	unsigned long n = pos >> PAGE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(ext2_chunk_size(inode)-1);
 	unsigned char *types = NULL;
@@ -309,14 +309,14 @@ ext2_readdir(struct file *file, struct dir_context *ctx)
 			ext2_error(sb, __func__,
 				   "bad page in #%lu",
 				   inode->i_ino);
-			ctx->pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_SIZE - offset;
 			return PTR_ERR(page);
 		}
 		kaddr = page_address(page);
 		if (unlikely(need_revalidate)) {
 			if (offset) {
 				offset = ext2_validate_entry(kaddr, offset, chunk_mask);
-				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
+				ctx->pos = (n<<PAGE_SHIFT) + offset;
 			}
 			file->f_version = inode->i_version;
 			need_revalidate = 0;
@@ -406,7 +406,7 @@ struct ext2_dir_entry_2 *ext2_find_entry (struct inode * dir,
 		if (++n >= npages)
 			n = 0;
 		/* next page is past the blocks we've got */
-		if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
+		if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) {
 			ext2_error(dir->i_sb, __func__,
 				"dir %lu size %lld exceeds block count %llu",
 				dir->i_ino, dir->i_size,
@@ -511,7 +511,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode)
 		kaddr = page_address(page);
 		dir_end = kaddr + ext2_last_byte(dir, n);
 		de = (ext2_dirent *)kaddr;
-		kaddr += PAGE_CACHE_SIZE - reclen;
+		kaddr += PAGE_SIZE - reclen;
 		while ((char *)de <= kaddr) {
 			if ((char *)de == dir_end) {
 				/* We hit i_size */
@@ -655,7 +655,7 @@ int ext2_make_empty(struct inode *inode, struct inode *parent)
 	kunmap_atomic(kaddr);
 	err = ext2_commit_chunk(page, 0, chunk_size);
 fail:
-	page_cache_release(page);
+	put_page(page);
 	return err;
 }
 
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 7a2be8f7f3c3..d34843925b23 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -398,7 +398,7 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 			ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
 		else {
 			kunmap(dir_page);
-			page_cache_release(dir_page);
+			put_page(dir_page);
 		}
 		inode_dec_link_count(old_dir);
 	}
@@ -408,11 +408,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 out_dir:
 	if (dir_de) {
 		kunmap(dir_page);
-		page_cache_release(dir_page);
+		put_page(dir_page);
 	}
 out_old:
 	kunmap(old_page);
-	page_cache_release(old_page);
+	put_page(old_page);
 out:
 	return err;
 }
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index edc053a81914..2580ef3346ca 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -283,10 +283,10 @@ static int ext4_page_crypto(struct inode *inode,
 	       EXT4_XTS_TWEAK_SIZE - sizeof(index));
 
 	sg_init_table(&dst, 1);
-	sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
+	sg_set_page(&dst, dest_page, PAGE_SIZE, 0);
 	sg_init_table(&src, 1);
-	sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
-	skcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
+	sg_set_page(&src, src_page, PAGE_SIZE, 0);
+	skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE,
 				   xts_tweak);
 	if (rw == EXT4_DECRYPT)
 		res = crypto_skcipher_decrypt(req);
@@ -396,7 +396,7 @@ int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
 		 (unsigned long) inode->i_ino, lblk, len);
 #endif
 
-	BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
+	BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
 
 	ctx = ext4_get_crypto_ctx(inode);
 	if (IS_ERR(ctx))
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 50ba27cbed03..561d7308b393 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -155,13 +155,13 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 		err = ext4_map_blocks(NULL, inode, &map, 0);
 		if (err > 0) {
 			pgoff_t index = map.m_pblk >>
-					(PAGE_CACHE_SHIFT - inode->i_blkbits);
+					(PAGE_SHIFT - inode->i_blkbits);
 			if (!ra_has_index(&file->f_ra, index))
 				page_cache_sync_readahead(
 					sb->s_bdev->bd_inode->i_mapping,
 					&file->f_ra, file,
 					index, 1);
-			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+			file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
 			bh = ext4_bread(NULL, inode, map.m_lblk, 0);
 			if (IS_ERR(bh)) {
 				err = PTR_ERR(bh);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6659e216385e..0caece398eb8 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -428,8 +428,8 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 	lastoff = startoff;
 	endoff = (loff_t)end_blk << blkbits;
 
-	index = startoff >> PAGE_CACHE_SHIFT;
-	end = endoff >> PAGE_CACHE_SHIFT;
+	index = startoff >> PAGE_SHIFT;
+	end = endoff >> PAGE_SHIFT;
 
 	pagevec_init(&pvec, 0);
 	do {
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 7cbdd3752ba5..7bc6c855cc18 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -482,7 +482,7 @@ static int ext4_read_inline_page(struct inode *inode, struct page *page)
 	ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr);
-	zero_user_segment(page, len, PAGE_CACHE_SIZE);
+	zero_user_segment(page, len, PAGE_SIZE);
 	SetPageUptodate(page);
 	brelse(iloc.bh);
 
@@ -507,7 +507,7 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
 	if (!page->index)
 		ret = ext4_read_inline_page(inode, page);
 	else if (!PageUptodate(page)) {
-		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		zero_user_segment(page, 0, PAGE_SIZE);
 		SetPageUptodate(page);
 	}
 
@@ -595,7 +595,7 @@ retry:
 
 	if (ret) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		page = NULL;
 		ext4_orphan_add(handle, inode);
 		up_write(&EXT4_I(inode)->xattr_sem);
@@ -621,7 +621,7 @@ retry:
 out:
 	if (page) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	if (sem_held)
 		up_write(&EXT4_I(inode)->xattr_sem);
@@ -690,7 +690,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
 	if (!ext4_has_inline_data(inode)) {
 		ret = 0;
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		goto out_up_read;
 	}
 
@@ -815,7 +815,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
 	if (ret) {
 		up_read(&EXT4_I(inode)->xattr_sem);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		ext4_truncate_failed_write(inode);
 		return ret;
 	}
@@ -829,7 +829,7 @@ out:
 	up_read(&EXT4_I(inode)->xattr_sem);
 	if (page) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	return ret;
 }
@@ -919,7 +919,7 @@ retry_journal:
 out_release_page:
 	up_read(&EXT4_I(inode)->xattr_sem);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 out_journal:
 	ext4_journal_stop(handle);
 out:
@@ -947,7 +947,7 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
 		i_size_changed = 1;
 	}
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	/*
 	 * Don't mark the inode dirty under page lock. First, it unnecessarily
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index dab84a2530ff..8a43c683eef9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1057,7 +1057,7 @@ int do_journal_get_write_access(handle_t *handle,
 static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
 				  get_block_t *get_block)
 {
-	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned from = pos & (PAGE_SIZE - 1);
 	unsigned to = from + len;
 	struct inode *inode = page->mapping->host;
 	unsigned block_start, block_end;
@@ -1069,15 +1069,15 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
 	bool decrypt = false;
 
 	BUG_ON(!PageLocked(page));
-	BUG_ON(from > PAGE_CACHE_SIZE);
-	BUG_ON(to > PAGE_CACHE_SIZE);
+	BUG_ON(from > PAGE_SIZE);
+	BUG_ON(to > PAGE_SIZE);
 	BUG_ON(from > to);
 
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, blocksize, 0);
 	head = page_buffers(page);
 	bbits = ilog2(blocksize);
-	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	block = (sector_t)page->index << (PAGE_SHIFT - bbits);
 
 	for (bh = head, block_start = 0; bh != head || !block_start;
 	    block++, block_start = block_end, bh = bh->b_this_page) {
@@ -1159,8 +1159,8 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 	 * we allocate blocks but write fails for some reason
 	 */
 	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
-	index = pos >> PAGE_CACHE_SHIFT;
-	from = pos & (PAGE_CACHE_SIZE - 1);
+	index = pos >> PAGE_SHIFT;
+	from = pos & (PAGE_SIZE - 1);
 	to = from + len;
 
 	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
@@ -1188,7 +1188,7 @@ retry_grab:
 retry_journal:
 	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
 	if (IS_ERR(handle)) {
-		page_cache_release(page);
+		put_page(page);
 		return PTR_ERR(handle);
 	}
 
@@ -1196,7 +1196,7 @@ retry_journal:
 	if (page->mapping != mapping) {
 		/* The page got truncated from under us */
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		ext4_journal_stop(handle);
 		goto retry_grab;
 	}
@@ -1252,7 +1252,7 @@ retry_journal:
 		if (ret == -ENOSPC &&
 		    ext4_should_retry_alloc(inode->i_sb, &retries))
 			goto retry_journal;
-		page_cache_release(page);
+		put_page(page);
 		return ret;
 	}
 	*pagep = page;
@@ -1295,7 +1295,7 @@ static int ext4_write_end(struct file *file,
 		ret = ext4_jbd2_file_inode(handle, inode);
 		if (ret) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			goto errout;
 		}
 	}
@@ -1315,7 +1315,7 @@ static int ext4_write_end(struct file *file,
 	 */
 	i_size_changed = ext4_update_inode_size(inode, pos + copied);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	if (old_size < pos)
 		pagecache_isize_extended(inode, old_size, pos);
@@ -1399,7 +1399,7 @@ static int ext4_journalled_write_end(struct file *file,
 	int size_changed = 0;
 
 	trace_ext4_journalled_write_end(inode, pos, len, copied);
-	from = pos & (PAGE_CACHE_SIZE - 1);
+	from = pos & (PAGE_SIZE - 1);
 	to = from + len;
 
 	BUG_ON(!ext4_handle_valid(handle));
@@ -1423,7 +1423,7 @@ static int ext4_journalled_write_end(struct file *file,
 	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	if (old_size < pos)
 		pagecache_isize_extended(inode, old_size, pos);
@@ -1537,7 +1537,7 @@ static void ext4_da_page_release_reservation(struct page *page,
 	int num_clusters;
 	ext4_fsblk_t lblk;
 
-	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+	BUG_ON(stop > PAGE_SIZE || stop < length);
 
 	head = page_buffers(page);
 	bh = head;
@@ -1553,7 +1553,7 @@ static void ext4_da_page_release_reservation(struct page *page,
 			clear_buffer_delay(bh);
 		} else if (contiguous_blks) {
 			lblk = page->index <<
-			       (PAGE_CACHE_SHIFT - inode->i_blkbits);
+			       (PAGE_SHIFT - inode->i_blkbits);
 			lblk += (curr_off >> inode->i_blkbits) -
 				contiguous_blks;
 			ext4_es_remove_extent(inode, lblk, contiguous_blks);
@@ -1563,7 +1563,7 @@ static void ext4_da_page_release_reservation(struct page *page,
 	} while ((bh = bh->b_this_page) != head);
 
 	if (contiguous_blks) {
-		lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		lblk = page->index << (PAGE_SHIFT - inode->i_blkbits);
 		lblk += (curr_off >> inode->i_blkbits) - contiguous_blks;
 		ext4_es_remove_extent(inode, lblk, contiguous_blks);
 	}
@@ -1572,7 +1572,7 @@ static void ext4_da_page_release_reservation(struct page *page,
 	 * need to release the reserved space for that cluster. */
 	num_clusters = EXT4_NUM_B2C(sbi, to_release);
 	while (num_clusters > 0) {
-		lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
+		lblk = (page->index << (PAGE_SHIFT - inode->i_blkbits)) +
 			((num_clusters - 1) << sbi->s_cluster_bits);
 		if (sbi->s_cluster_ratio == 1 ||
 		    !ext4_find_delalloc_cluster(inode, lblk))
@@ -1619,8 +1619,8 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
 	end   = mpd->next_page - 1;
 	if (invalidate) {
 		ext4_lblk_t start, last;
-		start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-		last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		start = index << (PAGE_SHIFT - inode->i_blkbits);
+		last = end << (PAGE_SHIFT - inode->i_blkbits);
 		ext4_es_remove_extent(inode, start, last - start + 1);
 	}
 
@@ -1636,7 +1636,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
 			BUG_ON(!PageLocked(page));
 			BUG_ON(PageWriteback(page));
 			if (invalidate) {
-				block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+				block_invalidatepage(page, 0, PAGE_SIZE);
 				ClearPageUptodate(page);
 			}
 			unlock_page(page);
@@ -2007,10 +2007,10 @@ static int ext4_writepage(struct page *page,
 
 	trace_ext4_writepage(page);
 	size = i_size_read(inode);
-	if (page->index == size >> PAGE_CACHE_SHIFT)
-		len = size & ~PAGE_CACHE_MASK;
+	if (page->index == size >> PAGE_SHIFT)
+		len = size & ~PAGE_MASK;
 	else
-		len = PAGE_CACHE_SIZE;
+		len = PAGE_SIZE;
 
 	page_bufs = page_buffers(page);
 	/*
@@ -2034,7 +2034,7 @@ static int ext4_writepage(struct page *page,
 				   ext4_bh_delay_or_unwritten)) {
 		redirty_page_for_writepage(wbc, page);
 		if ((current->flags & PF_MEMALLOC) ||
-		    (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) {
+		    (inode->i_sb->s_blocksize == PAGE_SIZE)) {
 			/*
 			 * For memory cleaning there's no point in writing only
 			 * some buffers. So just bail out. Warn if we came here
@@ -2076,10 +2076,10 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
 	int err;
 
 	BUG_ON(page->index != mpd->first_page);
-	if (page->index == size >> PAGE_CACHE_SHIFT)
-		len = size & ~PAGE_CACHE_MASK;
+	if (page->index == size >> PAGE_SHIFT)
+		len = size & ~PAGE_MASK;
 	else
-		len = PAGE_CACHE_SIZE;
+		len = PAGE_SIZE;
 	clear_page_dirty_for_io(page);
 	err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
 	if (!err)
@@ -2213,7 +2213,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 	int nr_pages, i;
 	struct inode *inode = mpd->inode;
 	struct buffer_head *head, *bh;
-	int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
+	int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
 	pgoff_t start, end;
 	ext4_lblk_t lblk;
 	sector_t pblock;
@@ -2274,7 +2274,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
 			 * supports blocksize < pagesize as we will try to
 			 * convert potentially unmapped parts of inode.
 			 */
-			mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
+			mpd->io_submit.io_end->size += PAGE_SIZE;
 			/* Page fully mapped - let IO run! */
 			err = mpage_submit_page(mpd, page);
 			if (err < 0) {
@@ -2426,7 +2426,7 @@ update_disksize:
 	 * Update on-disk size after IO is submitted.  Races with
 	 * truncate are avoided by checking i_size under i_data_sem.
 	 */
-	disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
+	disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
 	if (disksize > EXT4_I(inode)->i_disksize) {
 		int err2;
 		loff_t i_size;
@@ -2562,7 +2562,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			mpd->next_page = page->index + 1;
 			/* Add all dirty buffers to mpd */
 			lblk = ((ext4_lblk_t)page->index) <<
-				(PAGE_CACHE_SHIFT - blkbits);
+				(PAGE_SHIFT - blkbits);
 			head = page_buffers(page);
 			err = mpage_process_page_bufs(mpd, head, head, lblk);
 			if (err <= 0)
@@ -2647,7 +2647,7 @@ static int ext4_writepages(struct address_space *mapping,
 		 * We may need to convert up to one extent per block in
 		 * the page and we may dirty the inode.
 		 */
-		rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
+		rsv_blocks = 1 + (PAGE_SIZE >> inode->i_blkbits);
 	}
 
 	/*
@@ -2678,8 +2678,8 @@ static int ext4_writepages(struct address_space *mapping,
 		mpd.first_page = writeback_index;
 		mpd.last_page = -1;
 	} else {
-		mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
-		mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
+		mpd.first_page = wbc->range_start >> PAGE_SHIFT;
+		mpd.last_page = wbc->range_end >> PAGE_SHIFT;
 	}
 
 	mpd.inode = inode;
@@ -2838,7 +2838,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	handle_t *handle;
 
-	index = pos >> PAGE_CACHE_SHIFT;
+	index = pos >> PAGE_SHIFT;
 
 	if (ext4_nonda_switch(inode->i_sb)) {
 		*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
@@ -2881,7 +2881,7 @@ retry_journal:
 	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
 				ext4_da_write_credits(inode, pos, len));
 	if (IS_ERR(handle)) {
-		page_cache_release(page);
+		put_page(page);
 		return PTR_ERR(handle);
 	}
 
@@ -2889,7 +2889,7 @@ retry_journal:
 	if (page->mapping != mapping) {
 		/* The page got truncated from under us */
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		ext4_journal_stop(handle);
 		goto retry_grab;
 	}
@@ -2917,7 +2917,7 @@ retry_journal:
 		    ext4_should_retry_alloc(inode->i_sb, &retries))
 			goto retry_journal;
 
-		page_cache_release(page);
+		put_page(page);
 		return ret;
 	}
 
@@ -2965,7 +2965,7 @@ static int ext4_da_write_end(struct file *file,
 				      len, copied, page, fsdata);
 
 	trace_ext4_da_write_end(inode, pos, len, copied);
-	start = pos & (PAGE_CACHE_SIZE - 1);
+	start = pos & (PAGE_SIZE - 1);
 	end = start + copied - 1;
 
 	/*
@@ -3187,7 +3187,7 @@ static int __ext4_journalled_invalidatepage(struct page *page,
 	/*
 	 * If it's a full truncate we just forget about the pending dirtying
 	 */
-	if (offset == 0 && length == PAGE_CACHE_SIZE)
+	if (offset == 0 && length == PAGE_SIZE)
 		ClearPageChecked(page);
 
 	return jbd2_journal_invalidatepage(journal, page, offset, length);
@@ -3556,8 +3556,8 @@ void ext4_set_aops(struct inode *inode)
 static int __ext4_block_zero_page_range(handle_t *handle,
 		struct address_space *mapping, loff_t from, loff_t length)
 {
-	ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	ext4_fsblk_t index = from >> PAGE_SHIFT;
+	unsigned offset = from & (PAGE_SIZE-1);
 	unsigned blocksize, pos;
 	ext4_lblk_t iblock;
 	struct inode *inode = mapping->host;
@@ -3565,14 +3565,14 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 	struct page *page;
 	int err = 0;
 
-	page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
+	page = find_or_create_page(mapping, from >> PAGE_SHIFT,
 				   mapping_gfp_constraint(mapping, ~__GFP_FS));
 	if (!page)
 		return -ENOMEM;
 
 	blocksize = inode->i_sb->s_blocksize;
 
-	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
 
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, blocksize, 0);
@@ -3614,7 +3614,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 		    ext4_encrypted_inode(inode)) {
 			/* We expect the key to be set. */
 			BUG_ON(!ext4_has_encryption_key(inode));
-			BUG_ON(blocksize != PAGE_CACHE_SIZE);
+			BUG_ON(blocksize != PAGE_SIZE);
 			WARN_ON_ONCE(ext4_decrypt(page));
 		}
 	}
@@ -3638,7 +3638,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
 
 unlock:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	return err;
 }
 
@@ -3653,7 +3653,7 @@ static int ext4_block_zero_page_range(handle_t *handle,
 		struct address_space *mapping, loff_t from, loff_t length)
 {
 	struct inode *inode = mapping->host;
-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned offset = from & (PAGE_SIZE-1);
 	unsigned blocksize = inode->i_sb->s_blocksize;
 	unsigned max = blocksize - (offset & (blocksize - 1));
 
@@ -3678,7 +3678,7 @@ static int ext4_block_zero_page_range(handle_t *handle,
 static int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from)
 {
-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned offset = from & (PAGE_SIZE-1);
 	unsigned length;
 	unsigned blocksize;
 	struct inode *inode = mapping->host;
@@ -3816,7 +3816,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	 */
 	if (offset + length > inode->i_size) {
 		length = inode->i_size +
-		   PAGE_CACHE_SIZE - (inode->i_size & (PAGE_CACHE_SIZE - 1)) -
+		   PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
 		   offset;
 	}
 
@@ -4891,23 +4891,23 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
 	tid_t commit_tid = 0;
 	int ret;
 
-	offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+	offset = inode->i_size & (PAGE_SIZE - 1);
 	/*
 	 * All buffers in the last page remain valid? Then there's nothing to
 	 * do. We do the check mainly to optimize the common PAGE_CACHE_SIZE ==
 	 * blocksize case
 	 */
-	if (offset > PAGE_CACHE_SIZE - (1 << inode->i_blkbits))
+	if (offset > PAGE_SIZE - (1 << inode->i_blkbits))
 		return;
 	while (1) {
 		page = find_lock_page(inode->i_mapping,
-				      inode->i_size >> PAGE_CACHE_SHIFT);
+				      inode->i_size >> PAGE_SHIFT);
 		if (!page)
 			return;
 		ret = __ext4_journalled_invalidatepage(page, offset,
-						PAGE_CACHE_SIZE - offset);
+						PAGE_SIZE - offset);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		if (ret != -EBUSY)
 			return;
 		commit_tid = 0;
@@ -5546,10 +5546,10 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		goto out;
 	}
 
-	if (page->index == size >> PAGE_CACHE_SHIFT)
-		len = size & ~PAGE_CACHE_MASK;
+	if (page->index == size >> PAGE_SHIFT)
+		len = size & ~PAGE_MASK;
 	else
-		len = PAGE_CACHE_SIZE;
+		len = PAGE_SIZE;
 	/*
 	 * Return if we have all the buffers mapped. This avoids the need to do
 	 * journal_start/journal_stop which can block and take a long time
@@ -5580,7 +5580,7 @@ retry_alloc:
 	ret = block_page_mkwrite(vma, vmf, get_block);
 	if (!ret && ext4_should_journal_data(inode)) {
 		if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
-			  PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
+			  PAGE_SIZE, NULL, do_journal_get_write_access)) {
 			unlock_page(page);
 			ret = VM_FAULT_SIGBUS;
 			ext4_journal_stop(handle);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 50e05df28f66..c12174711ce2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -839,7 +839,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 	sb = inode->i_sb;
 	ngroups = ext4_get_groups_count(sb);
 	blocksize = 1 << inode->i_blkbits;
-	blocks_per_page = PAGE_CACHE_SIZE / blocksize;
+	blocks_per_page = PAGE_SIZE / blocksize;
 
 	groups_per_page = blocks_per_page >> 1;
 	if (groups_per_page == 0)
@@ -993,7 +993,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
 	e4b->bd_buddy_page = NULL;
 	e4b->bd_bitmap_page = NULL;
 
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	blocks_per_page = PAGE_SIZE / sb->s_blocksize;
 	/*
 	 * the buddy cache inode stores the block bitmap
 	 * and buddy information in consecutive blocks.
@@ -1028,11 +1028,11 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
 {
 	if (e4b->bd_bitmap_page) {
 		unlock_page(e4b->bd_bitmap_page);
-		page_cache_release(e4b->bd_bitmap_page);
+		put_page(e4b->bd_bitmap_page);
 	}
 	if (e4b->bd_buddy_page) {
 		unlock_page(e4b->bd_buddy_page);
-		page_cache_release(e4b->bd_buddy_page);
+		put_page(e4b->bd_buddy_page);
 	}
 }
 
@@ -1125,7 +1125,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
 	might_sleep();
 	mb_debug(1, "load group %u\n", group);
 
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	blocks_per_page = PAGE_SIZE / sb->s_blocksize;
 	grp = ext4_get_group_info(sb, group);
 
 	e4b->bd_blkbits = sb->s_blocksize_bits;
@@ -1167,7 +1167,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
 			 * is yet to initialize the same. So
 			 * wait for it to initialize.
 			 */
-			page_cache_release(page);
+			put_page(page);
 		page = find_or_create_page(inode->i_mapping, pnum, gfp);
 		if (page) {
 			BUG_ON(page->mapping != inode->i_mapping);
@@ -1203,7 +1203,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
 	page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
 	if (page == NULL || !PageUptodate(page)) {
 		if (page)
-			page_cache_release(page);
+			put_page(page);
 		page = find_or_create_page(inode->i_mapping, pnum, gfp);
 		if (page) {
 			BUG_ON(page->mapping != inode->i_mapping);
@@ -1238,11 +1238,11 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
 
 err:
 	if (page)
-		page_cache_release(page);
+		put_page(page);
 	if (e4b->bd_bitmap_page)
-		page_cache_release(e4b->bd_bitmap_page);
+		put_page(e4b->bd_bitmap_page);
 	if (e4b->bd_buddy_page)
-		page_cache_release(e4b->bd_buddy_page);
+		put_page(e4b->bd_buddy_page);
 	e4b->bd_buddy = NULL;
 	e4b->bd_bitmap = NULL;
 	return ret;
@@ -1257,9 +1257,9 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
 {
 	if (e4b->bd_bitmap_page)
-		page_cache_release(e4b->bd_bitmap_page);
+		put_page(e4b->bd_bitmap_page);
 	if (e4b->bd_buddy_page)
-		page_cache_release(e4b->bd_buddy_page);
+		put_page(e4b->bd_buddy_page);
 }
 
 
@@ -2833,8 +2833,8 @@ static void ext4_free_data_callback(struct super_block *sb,
 		/* No more items in the per group rb tree
 		 * balance refcounts from ext4_mb_free_metadata()
 		 */
-		page_cache_release(e4b.bd_buddy_page);
-		page_cache_release(e4b.bd_bitmap_page);
+		put_page(e4b.bd_buddy_page);
+		put_page(e4b.bd_bitmap_page);
 	}
 	ext4_unlock_group(sb, entry->efd_group);
 	kmem_cache_free(ext4_free_data_cachep, entry);
@@ -4385,9 +4385,9 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 		ext4_mb_put_pa(ac, ac->ac_sb, pa);
 	}
 	if (ac->ac_bitmap_page)
-		page_cache_release(ac->ac_bitmap_page);
+		put_page(ac->ac_bitmap_page);
 	if (ac->ac_buddy_page)
-		page_cache_release(ac->ac_buddy_page);
+		put_page(ac->ac_buddy_page);
 	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
 		mutex_unlock(&ac->ac_lg->lg_mutex);
 	ext4_mb_collect_stats(ac);
@@ -4599,8 +4599,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 		 * otherwise we'll refresh it from
 		 * on-disk bitmap and lose not-yet-available
 		 * blocks */
-		page_cache_get(e4b->bd_buddy_page);
-		page_cache_get(e4b->bd_bitmap_page);
+		get_page(e4b->bd_buddy_page);
+		get_page(e4b->bd_bitmap_page);
 	}
 	while (*n) {
 		parent = *n;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 4098acc701c3..675b67e5d5c2 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -156,7 +156,7 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
 	page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
 	if (!page[1]) {
 		unlock_page(page[0]);
-		page_cache_release(page[0]);
+		put_page(page[0]);
 		return -ENOMEM;
 	}
 	/*
@@ -192,7 +192,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
 		create_empty_buffers(page, blocksize, 0);
 
 	head = page_buffers(page);
-	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	block = (sector_t)page->index << (PAGE_SHIFT - inode->i_blkbits);
 	for (bh = head, block_start = 0; bh != head || !block_start;
 	     block++, block_start = block_end, bh = bh->b_this_page) {
 		block_end = block_start + blocksize;
@@ -268,7 +268,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
 	int i, err2, jblocks, retries = 0;
 	int replaced_count = 0;
 	int from = data_offset_in_page << orig_inode->i_blkbits;
-	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+	int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
 	struct super_block *sb = orig_inode->i_sb;
 	struct buffer_head *bh = NULL;
 
@@ -404,9 +404,9 @@ data_copy:
 
 unlock_pages:
 	unlock_page(pagep[0]);
-	page_cache_release(pagep[0]);
+	put_page(pagep[0]);
 	unlock_page(pagep[1]);
-	page_cache_release(pagep[1]);
+	put_page(pagep[1]);
 stop_journal:
 	ext4_journal_stop(handle);
 	if (*err == -ENOSPC &&
@@ -554,7 +554,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
 	struct inode *orig_inode = file_inode(o_filp);
 	struct inode *donor_inode = file_inode(d_filp);
 	struct ext4_ext_path *path = NULL;
-	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+	int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
 	ext4_lblk_t o_end, o_start = orig_blk;
 	ext4_lblk_t d_start = donor_blk;
 	int ret;
@@ -648,9 +648,9 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
 		if (o_end - o_start < cur_len)
 			cur_len = o_end - o_start;
 
-		orig_page_index = o_start >> (PAGE_CACHE_SHIFT -
+		orig_page_index = o_start >> (PAGE_SHIFT -
 					       orig_inode->i_blkbits);
-		donor_page_index = d_start >> (PAGE_CACHE_SHIFT -
+		donor_page_index = d_start >> (PAGE_SHIFT -
 					       donor_inode->i_blkbits);
 		offset_in_page = o_start % blocks_per_page;
 		if (cur_len > blocks_per_page- offset_in_page)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index d77d15f4b674..93ad0acf704c 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -432,8 +432,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 	 * the page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	if (len < PAGE_CACHE_SIZE)
-		zero_user_segment(page, len, PAGE_CACHE_SIZE);
+	if (len < PAGE_SIZE)
+		zero_user_segment(page, len, PAGE_SIZE);
 	/*
 	 * In the first loop we prepare and mark buffers to submit. We have to
 	 * mark all buffers in the page before submitting so that
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 5dc5e95063de..ea27aa1a778c 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -140,7 +140,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
 
 	struct inode *inode = mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
-	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
 	const unsigned blocksize = 1 << blkbits;
 	sector_t block_in_file;
 	sector_t last_block;
@@ -173,7 +173,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
 		if (page_has_buffers(page))
 			goto confused;
 
-		block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+		block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
 		last_block = block_in_file + nr_pages * blocks_per_page;
 		last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
 		if (last_block > last_block_in_file)
@@ -217,7 +217,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
 				set_error_page:
 					SetPageError(page);
 					zero_user_segment(page, 0,
-							  PAGE_CACHE_SIZE);
+							  PAGE_SIZE);
 					unlock_page(page);
 					goto next_page;
 				}
@@ -250,7 +250,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
 		}
 		if (first_hole != blocks_per_page) {
 			zero_user_segment(page, first_hole << blkbits,
-					  PAGE_CACHE_SIZE);
+					  PAGE_SIZE);
 			if (first_hole == 0) {
 				SetPageUptodate(page);
 				unlock_page(page);
@@ -319,7 +319,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
 			unlock_page(page);
 	next_page:
 		if (pages)
-			page_cache_release(page);
+			put_page(page);
 	}
 	BUG_ON(pages && !list_empty(pages));
 	if (bio)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 539297515896..0bb74aacb8c0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1784,7 +1784,7 @@ static int parse_options(char *options, struct super_block *sb,
 		int blocksize =
 			BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
 
-		if (blocksize < PAGE_CACHE_SIZE) {
+		if (blocksize < PAGE_SIZE) {
 			ext4_msg(sb, KERN_ERR, "can't mount with "
 				 "dioread_nolock if block size != PAGE_SIZE");
 			return 0;
@@ -3808,7 +3808,7 @@ no_journal:
 	}
 
 	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
-	    (blocksize != PAGE_CACHE_SIZE)) {
+	    (blocksize != PAGE_SIZE)) {
 		ext4_msg(sb, KERN_ERR,
 			 "Unsupported blocksize for fs encryption");
 		goto failed_mount_wq;
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 6f7ee30a89ce..75ed5c2f0c16 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -80,12 +80,12 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry,
 	if (res <= plen)
 		paddr[res] = '\0';
 	if (cpage)
-		page_cache_release(cpage);
+		put_page(cpage);
 	set_delayed_call(done, kfree_link, paddr);
 	return paddr;
 errout:
 	if (cpage)
-		page_cache_release(cpage);
+		put_page(cpage);
 	kfree(paddr);
 	return ERR_PTR(res);
 }
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index e5c762b37239..53fec0872e60 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -223,7 +223,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 	/* Allocate a new bio */
 	bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw));
 
-	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
 		bio_put(bio);
 		return -EFAULT;
 	}
@@ -265,8 +265,8 @@ alloc_new:
 
 	bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
 
-	if (bio_add_page(io->bio, bio_page, PAGE_CACHE_SIZE, 0) <
-							PAGE_CACHE_SIZE) {
+	if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) <
+							PAGE_SIZE) {
 		__submit_merged_bio(io);
 		goto alloc_new;
 	}
@@ -406,7 +406,7 @@ got_it:
 	 * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata.
 	 */
 	if (dn.data_blkaddr == NEW_ADDR) {
-		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		zero_user_segment(page, 0, PAGE_SIZE);
 		SetPageUptodate(page);
 		unlock_page(page);
 		return page;
@@ -517,7 +517,7 @@ struct page *get_new_data_page(struct inode *inode,
 		goto got_it;
 
 	if (dn.data_blkaddr == NEW_ADDR) {
-		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		zero_user_segment(page, 0, PAGE_SIZE);
 		SetPageUptodate(page);
 	} else {
 		f2fs_put_page(page, 1);
@@ -530,8 +530,8 @@ struct page *get_new_data_page(struct inode *inode,
 	}
 got_it:
 	if (new_i_size && i_size_read(inode) <
-				((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) {
-		i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT));
+				((loff_t)(index + 1) << PAGE_SHIFT)) {
+		i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT));
 		/* Only the directory inode sets new_i_size */
 		set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
 	}
@@ -570,9 +570,9 @@ alloc:
 	/* update i_size */
 	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
 							dn->ofs_in_node;
-	if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT))
+	if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT))
 		i_size_write(dn->inode,
-				((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT));
+				((loff_t)(fofs + 1) << PAGE_SHIFT));
 	return 0;
 }
 
@@ -971,7 +971,7 @@ got_it:
 				goto confused;
 			}
 		} else {
-			zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+			zero_user_segment(page, 0, PAGE_SIZE);
 			SetPageUptodate(page);
 			unlock_page(page);
 			goto next_page;
@@ -1021,7 +1021,7 @@ submit_and_realloc:
 		goto next_page;
 set_error_page:
 		SetPageError(page);
-		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		zero_user_segment(page, 0, PAGE_SIZE);
 		unlock_page(page);
 		goto next_page;
 confused:
@@ -1032,7 +1032,7 @@ confused:
 		unlock_page(page);
 next_page:
 		if (pages)
-			page_cache_release(page);
+			put_page(page);
 	}
 	BUG_ON(pages && !list_empty(pages));
 	if (bio)
@@ -1136,7 +1136,7 @@ static int f2fs_write_data_page(struct page *page,
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	loff_t i_size = i_size_read(inode);
 	const pgoff_t end_index = ((unsigned long long) i_size)
-							>> PAGE_CACHE_SHIFT;
+							>> PAGE_SHIFT;
 	unsigned offset = 0;
 	bool need_balance_fs = false;
 	int err = 0;
@@ -1157,11 +1157,11 @@ static int f2fs_write_data_page(struct page *page,
 	 * If the offset is out-of-range of file size,
 	 * this page does not have to be written to disk.
 	 */
-	offset = i_size & (PAGE_CACHE_SIZE - 1);
+	offset = i_size & (PAGE_SIZE - 1);
 	if ((page->index >= end_index + 1) || !offset)
 		goto out;
 
-	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	zero_user_segment(page, offset, PAGE_SIZE);
 write:
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto redirty_out;
@@ -1267,8 +1267,8 @@ next:
 			cycled = 0;
 		end = -1;
 	} else {
-		index = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		index = wbc->range_start >> PAGE_SHIFT;
+		end = wbc->range_end >> PAGE_SHIFT;
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = 1;
 		cycled = 1; /* ignore range_cyclic tests */
@@ -1448,11 +1448,11 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
 	 * the block addresses when there is no need to fill the page.
 	 */
 	if (!f2fs_has_inline_data(inode) && !f2fs_encrypted_inode(inode) &&
-					len == PAGE_CACHE_SIZE)
+					len == PAGE_SIZE)
 		return 0;
 
 	if (f2fs_has_inline_data(inode) ||
-			(pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
+			(pos & PAGE_MASK) >= i_size_read(inode)) {
 		f2fs_lock_op(sbi);
 		locked = true;
 	}
@@ -1513,7 +1513,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct page *page = NULL;
-	pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
+	pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT;
 	bool need_balance = false;
 	block_t blkaddr = NULL_ADDR;
 	int err = 0;
@@ -1561,22 +1561,22 @@ repeat:
 	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
 		f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
 
-	if (len == PAGE_CACHE_SIZE)
+	if (len == PAGE_SIZE)
 		goto out_update;
 	if (PageUptodate(page))
 		goto out_clear;
 
-	if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
-		unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+	if ((pos & PAGE_MASK) >= i_size_read(inode)) {
+		unsigned start = pos & (PAGE_SIZE - 1);
 		unsigned end = start + len;
 
 		/* Reading beyond i_size is simple: memset to zero */
-		zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
+		zero_user_segments(page, 0, start, end, PAGE_SIZE);
 		goto out_update;
 	}
 
 	if (blkaddr == NEW_ADDR) {
-		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		zero_user_segment(page, 0, PAGE_SIZE);
 	} else {
 		struct f2fs_io_info fio = {
 			.sbi = sbi,
@@ -1688,7 +1688,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
 	if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
-		(offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE))
+		(offset % PAGE_SIZE || length != PAGE_SIZE))
 		return;
 
 	if (PageDirty(page)) {
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 4fb6ef88a34f..f4a61a5ff79f 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -164,7 +164,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 
 	/* build curseg */
 	si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
-	si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE;
+	si->base_mem += PAGE_SIZE * NR_CURSEG_TYPE;
 
 	/* build dirty segmap */
 	si->base_mem += sizeof(struct dirty_seglist_info);
@@ -201,9 +201,9 @@ get_cache:
 
 	si->page_mem = 0;
 	npages = NODE_MAPPING(sbi)->nrpages;
-	si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT;
+	si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
 	npages = META_MAPPING(sbi)->nrpages;
-	si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT;
+	si->page_mem += (unsigned long long)npages << PAGE_SHIFT;
 }
 
 static int stat_show(struct seq_file *s, void *v)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 80641ad82745..af819571bce7 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -17,8 +17,8 @@
 
 static unsigned long dir_blocks(struct inode *inode)
 {
-	return ((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1))
-							>> PAGE_CACHE_SHIFT;
+	return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1))
+							>> PAGE_SHIFT;
 }
 
 static unsigned int dir_buckets(unsigned int level, int dir_level)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index bbe2cd1265d0..7a4558d17f36 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -1294,7 +1294,7 @@ static inline void f2fs_put_page(struct page *page, int unlock)
 		f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
 		unlock_page(page);
 	}
-	page_cache_release(page);
+	put_page(page);
 }
 
 static inline void f2fs_put_dnode(struct dnode_of_data *dn)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b41c3579ea9e..443e07705c2a 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -74,11 +74,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 		goto mapped;
 
 	/* page is wholly or partially inside EOF */
-	if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) >
+	if (((loff_t)(page->index + 1) << PAGE_SHIFT) >
 						i_size_read(inode)) {
 		unsigned offset;
-		offset = i_size_read(inode) & ~PAGE_CACHE_MASK;
-		zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+		offset = i_size_read(inode) & ~PAGE_MASK;
+		zero_user_segment(page, offset, PAGE_SIZE);
 	}
 	set_page_dirty(page);
 	SetPageUptodate(page);
@@ -346,11 +346,11 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
 		goto found;
 	}
 
-	pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT);
+	pgofs = (pgoff_t)(offset >> PAGE_SHIFT);
 
 	dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence);
 
-	for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
+	for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
 		set_new_dnode(&dn, inode, NULL, NULL, 0);
 		err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
 		if (err && err != -ENOENT) {
@@ -370,7 +370,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
 		/* find data/hole in dnode block */
 		for (; dn.ofs_in_node < end_offset;
 				dn.ofs_in_node++, pgofs++,
-				data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
+				data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
 			block_t blkaddr;
 			blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
 
@@ -508,8 +508,8 @@ void truncate_data_blocks(struct dnode_of_data *dn)
 static int truncate_partial_data_page(struct inode *inode, u64 from,
 								bool cache_only)
 {
-	unsigned offset = from & (PAGE_CACHE_SIZE - 1);
-	pgoff_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_SIZE - 1);
+	pgoff_t index = from >> PAGE_SHIFT;
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 
@@ -529,7 +529,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
 		return 0;
 truncate_out:
 	f2fs_wait_on_page_writeback(page, DATA, true);
-	zero_user(page, offset, PAGE_CACHE_SIZE - offset);
+	zero_user(page, offset, PAGE_SIZE - offset);
 	if (!cache_only || !f2fs_encrypted_inode(inode) ||
 					!S_ISREG(inode->i_mode))
 		set_page_dirty(page);
@@ -799,11 +799,11 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	if (ret)
 		return ret;
 
-	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
-	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+	pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
+	pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
 
-	off_start = offset & (PAGE_CACHE_SIZE - 1);
-	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+	off_start = offset & (PAGE_SIZE - 1);
+	off_end = (offset + len) & (PAGE_SIZE - 1);
 
 	if (pg_start == pg_end) {
 		ret = fill_zero(inode, pg_start, off_start,
@@ -813,7 +813,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	} else {
 		if (off_start) {
 			ret = fill_zero(inode, pg_start++, off_start,
-						PAGE_CACHE_SIZE - off_start);
+						PAGE_SIZE - off_start);
 			if (ret)
 				return ret;
 		}
@@ -830,8 +830,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
 			f2fs_balance_fs(sbi, true);
 
-			blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT;
-			blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT;
+			blk_start = (loff_t)pg_start << PAGE_SHIFT;
+			blk_end = (loff_t)pg_end << PAGE_SHIFT;
 			truncate_inode_pages_range(mapping, blk_start,
 					blk_end - 1);
 
@@ -954,8 +954,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	if (ret)
 		return ret;
 
-	pg_start = offset >> PAGE_CACHE_SHIFT;
-	pg_end = (offset + len) >> PAGE_CACHE_SHIFT;
+	pg_start = offset >> PAGE_SHIFT;
+	pg_end = (offset + len) >> PAGE_SHIFT;
 
 	/* write out all dirty pages from offset */
 	ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
@@ -1006,11 +1006,11 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 
 	truncate_pagecache_range(inode, offset, offset + len - 1);
 
-	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
-	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+	pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
+	pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
 
-	off_start = offset & (PAGE_CACHE_SIZE - 1);
-	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+	off_start = offset & (PAGE_SIZE - 1);
+	off_end = (offset + len) & (PAGE_SIZE - 1);
 
 	if (pg_start == pg_end) {
 		ret = fill_zero(inode, pg_start, off_start,
@@ -1024,12 +1024,12 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 	} else {
 		if (off_start) {
 			ret = fill_zero(inode, pg_start++, off_start,
-						PAGE_CACHE_SIZE - off_start);
+						PAGE_SIZE - off_start);
 			if (ret)
 				return ret;
 
 			new_size = max_t(loff_t, new_size,
-					(loff_t)pg_start << PAGE_CACHE_SHIFT);
+					(loff_t)pg_start << PAGE_SHIFT);
 		}
 
 		for (index = pg_start; index < pg_end; index++) {
@@ -1060,7 +1060,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 			f2fs_unlock_op(sbi);
 
 			new_size = max_t(loff_t, new_size,
-				(loff_t)(index + 1) << PAGE_CACHE_SHIFT);
+				(loff_t)(index + 1) << PAGE_SHIFT);
 		}
 
 		if (off_end) {
@@ -1117,8 +1117,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 
 	truncate_pagecache(inode, offset);
 
-	pg_start = offset >> PAGE_CACHE_SHIFT;
-	pg_end = (offset + len) >> PAGE_CACHE_SHIFT;
+	pg_start = offset >> PAGE_SHIFT;
+	pg_end = (offset + len) >> PAGE_SHIFT;
 	delta = pg_end - pg_start;
 	nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
 
@@ -1158,11 +1158,11 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
 
 	f2fs_balance_fs(sbi, true);
 
-	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
-	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
+	pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
+	pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
 
-	off_start = offset & (PAGE_CACHE_SIZE - 1);
-	off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+	off_start = offset & (PAGE_SIZE - 1);
+	off_end = (offset + len) & (PAGE_SIZE - 1);
 
 	f2fs_lock_op(sbi);
 
@@ -1180,12 +1180,12 @@ noalloc:
 		if (pg_start == pg_end)
 			new_size = offset + len;
 		else if (index == pg_start && off_start)
-			new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT;
+			new_size = (loff_t)(index + 1) << PAGE_SHIFT;
 		else if (index == pg_end)
-			new_size = ((loff_t)index << PAGE_CACHE_SHIFT) +
+			new_size = ((loff_t)index << PAGE_SHIFT) +
 								off_end;
 		else
-			new_size += PAGE_CACHE_SIZE;
+			new_size += PAGE_SIZE;
 	}
 
 	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
@@ -1652,8 +1652,8 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
 	if (need_inplace_update(inode))
 		return -EINVAL;
 
-	pg_start = range->start >> PAGE_CACHE_SHIFT;
-	pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT;
+	pg_start = range->start >> PAGE_SHIFT;
+	pg_end = (range->start + range->len) >> PAGE_SHIFT;
 
 	f2fs_balance_fs(sbi, true);
 
@@ -1770,7 +1770,7 @@ clear_out:
 out:
 	inode_unlock(inode);
 	if (!err)
-		range->len = (u64)total << PAGE_CACHE_SHIFT;
+		range->len = (u64)total << PAGE_SHIFT;
 	return err;
 }
 
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 358214e9f707..a2fbe6f427d3 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -51,7 +51,7 @@ void read_inline_data(struct page *page, struct page *ipage)
 
 	f2fs_bug_on(F2FS_P_SB(page), page->index);
 
-	zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+	zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE);
 
 	/* Copy the whole inline data block */
 	src_addr = inline_data_addr(ipage);
@@ -93,7 +93,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
 	}
 
 	if (page->index)
-		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+		zero_user_segment(page, 0, PAGE_SIZE);
 	else
 		read_inline_data(page, ipage);
 
@@ -375,7 +375,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
 		goto out;
 
 	f2fs_wait_on_page_writeback(page, DATA, true);
-	zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+	zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE);
 
 	dentry_blk = kmap_atomic(page);
 
@@ -405,8 +405,8 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
 	stat_dec_inline_dir(dir);
 	clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
 
-	if (i_size_read(dir) < PAGE_CACHE_SIZE) {
-		i_size_write(dir, PAGE_CACHE_SIZE);
+	if (i_size_read(dir) < PAGE_SIZE) {
+		i_size_write(dir, PAGE_SIZE);
 		set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 	}
 
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 7876f1052101..c1d9e9d2cb37 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -1051,12 +1051,12 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
 	/* Null-terminate the name */
 	paddr[res] = '\0';
 
-	page_cache_release(cpage);
+	put_page(cpage);
 	set_delayed_call(done, kfree_link, paddr);
 	return paddr;
 errout:
 	fscrypt_fname_free_buffer(&pstr);
-	page_cache_release(cpage);
+	put_page(cpage);
 	return ERR_PTR(res);
 }
 
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 118321bd1a7f..1a33de9d84b1 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -46,11 +46,11 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 	 */
 	if (type == FREE_NIDS) {
 		mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >>
-							PAGE_CACHE_SHIFT;
+							PAGE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
 	} else if (type == NAT_ENTRIES) {
 		mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
-							PAGE_CACHE_SHIFT;
+							PAGE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
 	} else if (type == DIRTY_DENTS) {
 		if (sbi->sb->s_bdi->wb.dirty_exceeded)
@@ -62,13 +62,13 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 
 		for (i = 0; i <= UPDATE_INO; i++)
 			mem_size += (sbi->im[i].ino_num *
-				sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
+				sizeof(struct ino_entry)) >> PAGE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
 	} else if (type == EXTENT_CACHE) {
 		mem_size = (atomic_read(&sbi->total_ext_tree) *
 				sizeof(struct extent_tree) +
 				atomic_read(&sbi->total_ext_node) *
-				sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
+				sizeof(struct extent_node)) >> PAGE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
 	} else {
 		if (!sbi->sb->s_bdi->wb.dirty_exceeded)
@@ -121,7 +121,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
 
 	src_addr = page_address(src_page);
 	dst_addr = page_address(dst_page);
-	memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+	memcpy(dst_addr, src_addr, PAGE_SIZE);
 	set_page_dirty(dst_page);
 	f2fs_put_page(src_page, 1);
 
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 0b30cd2aeebd..011942f94d64 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -591,7 +591,7 @@ out:
 
 	/* truncate meta pages to be used by the recovery */
 	truncate_inode_pages_range(META_MAPPING(sbi),
-			(loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
+			(loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1);
 
 	if (err) {
 		truncate_inode_pages_final(NODE_MAPPING(sbi));
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 6f16b39f0b52..540669d6978e 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -885,12 +885,12 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
 		}
 	}
 
-	sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE -
+	sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE -
 			SUM_FOOTER_SIZE) / SUMMARY_SIZE;
 	if (valid_sum_count <= sum_in_page)
 		return 1;
 	else if ((valid_sum_count - sum_in_page) <=
-		(PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
+		(PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
 		return 2;
 	return 3;
 }
@@ -909,9 +909,9 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr)
 	void *dst = page_address(page);
 
 	if (src)
-		memcpy(dst, src, PAGE_CACHE_SIZE);
+		memcpy(dst, src, PAGE_SIZE);
 	else
-		memset(dst, 0, PAGE_CACHE_SIZE);
+		memset(dst, 0, PAGE_SIZE);
 	set_page_dirty(page);
 	f2fs_put_page(page, 1);
 }
@@ -1596,7 +1596,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
 			s = (struct f2fs_summary *)(kaddr + offset);
 			seg_i->sum_blk->entries[j] = *s;
 			offset += SUMMARY_SIZE;
-			if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+			if (offset + SUMMARY_SIZE <= PAGE_SIZE -
 						SUM_FOOTER_SIZE)
 				continue;
 
@@ -1757,7 +1757,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
 			*summary = seg_i->sum_blk->entries[j];
 			written_size += SUMMARY_SIZE;
 
-			if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
+			if (written_size + SUMMARY_SIZE <= PAGE_SIZE -
 							SUM_FOOTER_SIZE)
 				continue;
 
@@ -1844,7 +1844,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
 
 	src_addr = page_address(src_page);
 	dst_addr = page_address(dst_page);
-	memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
+	memcpy(dst_addr, src_addr, PAGE_SIZE);
 
 	set_page_dirty(dst_page);
 	f2fs_put_page(src_page, 1);
@@ -2171,7 +2171,7 @@ static int build_curseg(struct f2fs_sb_info *sbi)
 
 	for (i = 0; i < NR_CURSEG_TYPE; i++) {
 		mutex_init(&array[i].curseg_mutex);
-		array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+		array[i].sum_blk = kzalloc(PAGE_SIZE, GFP_KERNEL);
 		if (!array[i].sum_blk)
 			return -ENOMEM;
 		init_rwsem(&array[i].journal_rwsem);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 15bb81f8dac2..53b766cdabf2 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1070,10 +1070,10 @@ static int sanity_check_raw_super(struct super_block *sb,
 	}
 
 	/* Currently, support only 4KB page cache size */
-	if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) {
+	if (F2FS_BLKSIZE != PAGE_SIZE) {
 		f2fs_msg(sb, KERN_INFO,
 			"Invalid page_cache_size (%lu), supports only 4KB\n",
-			PAGE_CACHE_SIZE);
+			PAGE_SIZE);
 		return 1;
 	}
 
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index cb84f0fcc72a..bfc780c682fb 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -66,11 +66,11 @@ static int
 vxfs_immed_readpage(struct file *fp, struct page *pp)
 {
 	struct vxfs_inode_info	*vip = VXFS_INO(pp->mapping->host);
-	u_int64_t	offset = (u_int64_t)pp->index << PAGE_CACHE_SHIFT;
+	u_int64_t	offset = (u_int64_t)pp->index << PAGE_SHIFT;
 	caddr_t		kaddr;
 
 	kaddr = kmap(pp);
-	memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_CACHE_SIZE);
+	memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_SIZE);
 	kunmap(pp);
 	
 	flush_dcache_page(pp);
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 1cff72df0389..a49e0cfbb686 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -45,7 +45,7 @@
 /*
  * Number of VxFS blocks per page.
  */
-#define VXFS_BLOCK_PER_PAGE(sbp)  ((PAGE_CACHE_SIZE / (sbp)->s_blocksize))
+#define VXFS_BLOCK_PER_PAGE(sbp)  ((PAGE_SIZE / (sbp)->s_blocksize))
 
 
 static struct dentry *	vxfs_lookup(struct inode *, struct dentry *, unsigned int);
@@ -175,7 +175,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp)
 	if (de) {
 		ino = de->d_ino;
 		kunmap(pp);
-		page_cache_release(pp);
+		put_page(pp);
 	}
 	
 	return (ino);
@@ -255,8 +255,8 @@ vxfs_readdir(struct file *fp, struct dir_context *ctx)
 	nblocks = dir_blocks(ip);
 	pblocks = VXFS_BLOCK_PER_PAGE(sbp);
 
-	page = pos >> PAGE_CACHE_SHIFT;
-	offset = pos & ~PAGE_CACHE_MASK;
+	page = pos >> PAGE_SHIFT;
+	offset = pos & ~PAGE_MASK;
 	block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks;
 
 	for (; page < npages; page++, block = 0) {
@@ -289,7 +289,7 @@ vxfs_readdir(struct file *fp, struct dir_context *ctx)
 					continue;
 
 				offset = (char *)de - kaddr;
-				ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
+				ctx->pos = ((page << PAGE_SHIFT) | offset) + 2;
 				if (!dir_emit(ctx, de->d_name, de->d_namelen,
 					de->d_ino, DT_UNKNOWN)) {
 					vxfs_put_page(pp);
@@ -301,6 +301,6 @@ vxfs_readdir(struct file *fp, struct dir_context *ctx)
 		vxfs_put_page(pp);
 		offset = 0;
 	}
-	ctx->pos = ((page << PAGE_CACHE_SHIFT) | offset) + 2;
+	ctx->pos = ((page << PAGE_SHIFT) | offset) + 2;
 	return 0;
 }
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 5d318c44f855..e806694d4145 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -50,7 +50,7 @@ inline void
 vxfs_put_page(struct page *pp)
 {
 	kunmap(pp);
-	page_cache_release(pp);
+	put_page(pp);
 }
 
 /**
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index fee81e8768c9..592cea54cea0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -33,7 +33,7 @@
 /*
  * 4MB minimal write chunk size
  */
-#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
+#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_SHIFT - 10))
 
 struct wb_completion {
 	atomic_t		cnt;
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 6b35fc4860a0..3078b679fcd1 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -113,7 +113,7 @@ try_again:
 
 	wake_up_bit(&cookie->flags, 0);
 	if (xpage)
-		page_cache_release(xpage);
+		put_page(xpage);
 	__fscache_uncache_page(cookie, page);
 	return true;
 
@@ -164,7 +164,7 @@ static void fscache_end_page_write(struct fscache_object *object,
 	}
 	spin_unlock(&object->lock);
 	if (xpage)
-		page_cache_release(xpage);
+		put_page(xpage);
 }
 
 /*
@@ -884,7 +884,7 @@ void fscache_invalidate_writes(struct fscache_cookie *cookie)
 		spin_unlock(&cookie->stores_lock);
 
 		for (i = n - 1; i >= 0; i--)
-			page_cache_release(results[i]);
+			put_page(results[i]);
 	}
 
 	_leave("");
@@ -982,7 +982,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 
 	radix_tree_tag_set(&cookie->stores, page->index,
 			   FSCACHE_COOKIE_PENDING_TAG);
-	page_cache_get(page);
+	get_page(page);
 
 	/* we only want one writer at a time, but we do need to queue new
 	 * writers after exclusive ops */
@@ -1026,7 +1026,7 @@ submit_failed:
 	radix_tree_delete(&cookie->stores, page->index);
 	spin_unlock(&cookie->stores_lock);
 	wake_cookie = __fscache_unuse_cookie(cookie);
-	page_cache_release(page);
+	put_page(page);
 	ret = -ENOBUFS;
 	goto nobufs;
 
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ebb5e37455a0..cbece1221417 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -897,7 +897,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
 		return err;
 	}
 
-	page_cache_get(newpage);
+	get_page(newpage);
 
 	if (!(buf->flags & PIPE_BUF_FLAG_LRU))
 		lru_cache_add_file(newpage);
@@ -912,12 +912,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
 
 	if (err) {
 		unlock_page(newpage);
-		page_cache_release(newpage);
+		put_page(newpage);
 		return err;
 	}
 
 	unlock_page(oldpage);
-	page_cache_release(oldpage);
+	put_page(oldpage);
 	cs->len = 0;
 
 	return 0;
@@ -951,7 +951,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
 	fuse_copy_finish(cs);
 
 	buf = cs->pipebufs;
-	page_cache_get(page);
+	get_page(page);
 	buf->page = page;
 	buf->offset = offset;
 	buf->len = count;
@@ -1435,7 +1435,7 @@ out_unlock:
 
 out:
 	for (; page_nr < cs.nr_segs; page_nr++)
-		page_cache_release(bufs[page_nr].page);
+		put_page(bufs[page_nr].page);
 
 	kfree(bufs);
 	return ret;
@@ -1632,8 +1632,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
 		goto out_up_killsb;
 
 	mapping = inode->i_mapping;
-	index = outarg.offset >> PAGE_CACHE_SHIFT;
-	offset = outarg.offset & ~PAGE_CACHE_MASK;
+	index = outarg.offset >> PAGE_SHIFT;
+	offset = outarg.offset & ~PAGE_MASK;
 	file_size = i_size_read(inode);
 	end = outarg.offset + outarg.size;
 	if (end > file_size) {
@@ -1652,13 +1652,13 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
 		if (!page)
 			goto out_iput;
 
-		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+		this_num = min_t(unsigned, num, PAGE_SIZE - offset);
 		err = fuse_copy_page(cs, &page, offset, this_num, 0);
 		if (!err && offset == 0 &&
-		    (this_num == PAGE_CACHE_SIZE || file_size == end))
+		    (this_num == PAGE_SIZE || file_size == end))
 			SetPageUptodate(page);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 
 		if (err)
 			goto out_iput;
@@ -1697,7 +1697,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
 	size_t total_len = 0;
 	int num_pages;
 
-	offset = outarg->offset & ~PAGE_CACHE_MASK;
+	offset = outarg->offset & ~PAGE_MASK;
 	file_size = i_size_read(inode);
 
 	num = outarg->size;
@@ -1720,7 +1720,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
 	req->page_descs[0].offset = offset;
 	req->end = fuse_retrieve_end;
 
-	index = outarg->offset >> PAGE_CACHE_SHIFT;
+	index = outarg->offset >> PAGE_SHIFT;
 
 	while (num && req->num_pages < num_pages) {
 		struct page *page;
@@ -1730,7 +1730,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
 		if (!page)
 			break;
 
-		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+		this_num = min_t(unsigned, num, PAGE_SIZE - offset);
 		req->pages[req->num_pages] = page;
 		req->page_descs[req->num_pages].length = this_num;
 		req->num_pages++;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9dde38f12c07..719924d6c706 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -348,7 +348,7 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
 		pgoff_t curr_index;
 
 		BUG_ON(req->inode != inode);
-		curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+		curr_index = req->misc.write.in.offset >> PAGE_SHIFT;
 		if (idx_from < curr_index + req->num_pages &&
 		    curr_index <= idx_to) {
 			found = true;
@@ -683,11 +683,11 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode,
 		 * present there.
 		 */
 		int i;
-		int start_idx = num_read >> PAGE_CACHE_SHIFT;
-		size_t off = num_read & (PAGE_CACHE_SIZE - 1);
+		int start_idx = num_read >> PAGE_SHIFT;
+		size_t off = num_read & (PAGE_SIZE - 1);
 
 		for (i = start_idx; i < req->num_pages; i++) {
-			zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE);
+			zero_user_segment(req->pages[i], off, PAGE_SIZE);
 			off = 0;
 		}
 	} else {
@@ -704,7 +704,7 @@ static int fuse_do_readpage(struct file *file, struct page *page)
 	struct fuse_req *req;
 	size_t num_read;
 	loff_t pos = page_offset(page);
-	size_t count = PAGE_CACHE_SIZE;
+	size_t count = PAGE_SIZE;
 	u64 attr_ver;
 	int err;
 
@@ -789,7 +789,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
 		else
 			SetPageError(page);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	if (req->ff)
 		fuse_file_put(req->ff, false);
@@ -800,7 +800,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
 	loff_t pos = page_offset(req->pages[0]);
-	size_t count = req->num_pages << PAGE_CACHE_SHIFT;
+	size_t count = req->num_pages << PAGE_SHIFT;
 
 	req->out.argpages = 1;
 	req->out.page_zeroing = 1;
@@ -836,7 +836,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
 
 	if (req->num_pages &&
 	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
-	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
+	     (req->num_pages + 1) * PAGE_SIZE > fc->max_read ||
 	     req->pages[req->num_pages - 1]->index + 1 != page->index)) {
 		int nr_alloc = min_t(unsigned, data->nr_pages,
 				     FUSE_MAX_PAGES_PER_REQ);
@@ -858,7 +858,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
 		return -EIO;
 	}
 
-	page_cache_get(page);
+	get_page(page);
 	req->pages[req->num_pages] = page;
 	req->page_descs[req->num_pages].length = PAGE_SIZE;
 	req->num_pages++;
@@ -1003,17 +1003,17 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
 	for (i = 0; i < req->num_pages; i++) {
 		struct page *page = req->pages[i];
 
-		if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE)
+		if (!req->out.h.error && !offset && count >= PAGE_SIZE)
 			SetPageUptodate(page);
 
-		if (count > PAGE_CACHE_SIZE - offset)
-			count -= PAGE_CACHE_SIZE - offset;
+		if (count > PAGE_SIZE - offset)
+			count -= PAGE_SIZE - offset;
 		else
 			count = 0;
 		offset = 0;
 
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 
 	return res;
@@ -1024,7 +1024,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 			       struct iov_iter *ii, loff_t pos)
 {
 	struct fuse_conn *fc = get_fuse_conn(mapping->host);
-	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned offset = pos & (PAGE_SIZE - 1);
 	size_t count = 0;
 	int err;
 
@@ -1034,8 +1034,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 	do {
 		size_t tmp;
 		struct page *page;
-		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-		size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset,
+		pgoff_t index = pos >> PAGE_SHIFT;
+		size_t bytes = min_t(size_t, PAGE_SIZE - offset,
 				     iov_iter_count(ii));
 
 		bytes = min_t(size_t, bytes, fc->max_write - count);
@@ -1059,7 +1059,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 		iov_iter_advance(ii, tmp);
 		if (!tmp) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			bytes = min(bytes, iov_iter_single_seg_count(ii));
 			goto again;
 		}
@@ -1072,7 +1072,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 		count += tmp;
 		pos += tmp;
 		offset += tmp;
-		if (offset == PAGE_CACHE_SIZE)
+		if (offset == PAGE_SIZE)
 			offset = 0;
 
 		if (!fc->big_writes)
@@ -1086,8 +1086,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 static inline unsigned fuse_wr_pages(loff_t pos, size_t len)
 {
 	return min_t(unsigned,
-		     ((pos + len - 1) >> PAGE_CACHE_SHIFT) -
-		     (pos >> PAGE_CACHE_SHIFT) + 1,
+		     ((pos + len - 1) >> PAGE_SHIFT) -
+		     (pos >> PAGE_SHIFT) + 1,
 		     FUSE_MAX_PAGES_PER_REQ);
 }
 
@@ -1205,8 +1205,8 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			goto out;
 
 		invalidate_mapping_pages(file->f_mapping,
-					 pos >> PAGE_CACHE_SHIFT,
-					 endbyte >> PAGE_CACHE_SHIFT);
+					 pos >> PAGE_SHIFT,
+					 endbyte >> PAGE_SHIFT);
 
 		written += written_buffered;
 		iocb->ki_pos = pos + written_buffered;
@@ -1315,8 +1315,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
 	size_t nmax = write ? fc->max_write : fc->max_read;
 	loff_t pos = *ppos;
 	size_t count = iov_iter_count(iter);
-	pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
-	pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+	pgoff_t idx_from = pos >> PAGE_SHIFT;
+	pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
 	ssize_t res = 0;
 	struct fuse_req *req;
 	int err = 0;
@@ -1466,7 +1466,7 @@ __acquires(fc->lock)
 {
 	struct fuse_inode *fi = get_fuse_inode(req->inode);
 	struct fuse_write_in *inarg = &req->misc.write.in;
-	__u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
+	__u64 data_size = req->num_pages * PAGE_SIZE;
 
 	if (!fc->connected)
 		goto out_free;
@@ -1727,7 +1727,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
 	list_del(&new_req->writepages_entry);
 	list_for_each_entry(old_req, &fi->writepages, writepages_entry) {
 		BUG_ON(old_req->inode != new_req->inode);
-		curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+		curr_index = old_req->misc.write.in.offset >> PAGE_SHIFT;
 		if (curr_index <= page->index &&
 		    page->index < curr_index + old_req->num_pages) {
 			found = true;
@@ -1742,7 +1742,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
 	new_req->num_pages = 1;
 	for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
 		BUG_ON(tmp->inode != new_req->inode);
-		curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+		curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT;
 		if (tmp->num_pages == 1 &&
 		    curr_index == page->index) {
 			old_req = tmp;
@@ -1799,7 +1799,7 @@ static int fuse_writepages_fill(struct page *page,
 
 	if (req && req->num_pages &&
 	    (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
-	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
+	     (req->num_pages + 1) * PAGE_SIZE > fc->max_write ||
 	     data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) {
 		fuse_writepages_send(data);
 		data->req = NULL;
@@ -1924,7 +1924,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata)
 {
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t index = pos >> PAGE_SHIFT;
 	struct fuse_conn *fc = get_fuse_conn(file_inode(file));
 	struct page *page;
 	loff_t fsize;
@@ -1938,15 +1938,15 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
 
 	fuse_wait_on_page_writeback(mapping->host, page->index);
 
-	if (PageUptodate(page) || len == PAGE_CACHE_SIZE)
+	if (PageUptodate(page) || len == PAGE_SIZE)
 		goto success;
 	/*
 	 * Check if the start this page comes after the end of file, in which
 	 * case the readpage can be optimized away.
 	 */
 	fsize = i_size_read(mapping->host);
-	if (fsize <= (pos & PAGE_CACHE_MASK)) {
-		size_t off = pos & ~PAGE_CACHE_MASK;
+	if (fsize <= (pos & PAGE_MASK)) {
+		size_t off = pos & ~PAGE_MASK;
 		if (off)
 			zero_user_segment(page, 0, off);
 		goto success;
@@ -1960,7 +1960,7 @@ success:
 
 cleanup:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 error:
 	return err;
 }
@@ -1973,16 +1973,16 @@ static int fuse_write_end(struct file *file, struct address_space *mapping,
 
 	if (!PageUptodate(page)) {
 		/* Zero any unwritten bytes at the end of the page */
-		size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK;
+		size_t endoff = (pos + copied) & ~PAGE_MASK;
 		if (endoff)
-			zero_user_segment(page, endoff, PAGE_CACHE_SIZE);
+			zero_user_segment(page, endoff, PAGE_SIZE);
 		SetPageUptodate(page);
 	}
 
 	fuse_write_update_size(inode, pos + copied);
 	set_page_dirty(page);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	return copied;
 }
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4d69d5c0bedc..1ce67668a8e1 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -339,11 +339,11 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
 
 	fuse_invalidate_attr(inode);
 	if (offset >= 0) {
-		pg_start = offset >> PAGE_CACHE_SHIFT;
+		pg_start = offset >> PAGE_SHIFT;
 		if (len <= 0)
 			pg_end = -1;
 		else
-			pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+			pg_end = (offset + len - 1) >> PAGE_SHIFT;
 		invalidate_inode_pages2_range(inode->i_mapping,
 					      pg_start, pg_end);
 	}
@@ -864,7 +864,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 		process_init_limits(fc, arg);
 
 		if (arg->minor >= 6) {
-			ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
+			ra_pages = arg->max_readahead / PAGE_SIZE;
 			if (arg->flags & FUSE_ASYNC_READ)
 				fc->async_read = 1;
 			if (!(arg->flags & FUSE_POSIX_LOCKS))
@@ -901,7 +901,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 			if (arg->time_gran && arg->time_gran <= 1000000000)
 				fc->sb->s_time_gran = arg->time_gran;
 		} else {
-			ra_pages = fc->max_read / PAGE_CACHE_SIZE;
+			ra_pages = fc->max_read / PAGE_SIZE;
 			fc->no_lock = 1;
 			fc->no_flock = 1;
 		}
@@ -922,7 +922,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 
 	arg->major = FUSE_KERNEL_VERSION;
 	arg->minor = FUSE_KERNEL_MINOR_VERSION;
-	arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
+	arg->max_readahead = fc->bdi.ra_pages * PAGE_SIZE;
 	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
 		FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
 		FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
@@ -955,7 +955,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 	int err;
 
 	fc->bdi.name = "fuse";
-	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
 	/* fuse does it's own writeback accounting */
 	fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
 
@@ -1053,8 +1053,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 			goto err;
 #endif
 	} else {
-		sb->s_blocksize = PAGE_CACHE_SIZE;
-		sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+		sb->s_blocksize = PAGE_SIZE;
+		sb->s_blocksize_bits = PAGE_SHIFT;
 	}
 	sb->s_magic = FUSE_SUPER_MAGIC;
 	sb->s_op = &fuse_super_operations;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index aa016e4b8bec..1bbbee945f46 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -101,7 +101,7 @@ static int gfs2_writepage_common(struct page *page,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	loff_t i_size = i_size_read(inode);
-	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	pgoff_t end_index = i_size >> PAGE_SHIFT;
 	unsigned offset;
 
 	if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
@@ -109,9 +109,9 @@ static int gfs2_writepage_common(struct page *page,
 	if (current->journal_info)
 		goto redirty;
 	/* Is the page fully outside i_size? (truncate in progress) */
-	offset = i_size & (PAGE_CACHE_SIZE-1);
+	offset = i_size & (PAGE_SIZE-1);
 	if (page->index > end_index || (page->index == end_index && !offset)) {
-		page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+		page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
 		goto out;
 	}
 	return 1;
@@ -238,7 +238,7 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
+	unsigned nrblocks = nr_pages * (PAGE_SIZE/inode->i_sb->s_blocksize);
 	int i;
 	int ret;
 
@@ -366,8 +366,8 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
 			cycled = 0;
 		end = -1;
 	} else {
-		index = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		index = wbc->range_start >> PAGE_SHIFT;
+		end = wbc->range_end >> PAGE_SHIFT;
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = 1;
 		cycled = 1; /* ignore range_cyclic tests */
@@ -458,7 +458,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 	 * so we need to supply one here. It doesn't happen often.
 	 */
 	if (unlikely(page->index)) {
-		zero_user(page, 0, PAGE_CACHE_SIZE);
+		zero_user(page, 0, PAGE_SIZE);
 		SetPageUptodate(page);
 		return 0;
 	}
@@ -471,7 +471,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 	if (dsize > (dibh->b_size - sizeof(struct gfs2_dinode)))
 		dsize = (dibh->b_size - sizeof(struct gfs2_dinode));
 	memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
-	memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
+	memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
 	kunmap_atomic(kaddr);
 	flush_dcache_page(page);
 	brelse(dibh);
@@ -560,8 +560,8 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
                        unsigned size)
 {
 	struct address_space *mapping = ip->i_inode.i_mapping;
-	unsigned long index = *pos / PAGE_CACHE_SIZE;
-	unsigned offset = *pos & (PAGE_CACHE_SIZE - 1);
+	unsigned long index = *pos / PAGE_SIZE;
+	unsigned offset = *pos & (PAGE_SIZE - 1);
 	unsigned copied = 0;
 	unsigned amt;
 	struct page *page;
@@ -569,15 +569,15 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
 
 	do {
 		amt = size - copied;
-		if (offset + size > PAGE_CACHE_SIZE)
-			amt = PAGE_CACHE_SIZE - offset;
+		if (offset + size > PAGE_SIZE)
+			amt = PAGE_SIZE - offset;
 		page = read_cache_page(mapping, index, __gfs2_readpage, NULL);
 		if (IS_ERR(page))
 			return PTR_ERR(page);
 		p = kmap_atomic(page);
 		memcpy(buf + copied, p + offset, amt);
 		kunmap_atomic(p);
-		page_cache_release(page);
+		put_page(page);
 		copied += amt;
 		index++;
 		offset = 0;
@@ -651,8 +651,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 	unsigned requested = 0;
 	int alloc_required;
 	int error = 0;
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	pgoff_t index = pos >> PAGE_SHIFT;
+	unsigned from = pos & (PAGE_SIZE - 1);
 	struct page *page;
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
@@ -697,7 +697,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
 		rblocks += gfs2_rg_blocks(ip, requested);
 
 	error = gfs2_trans_begin(sdp, rblocks,
-				 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+				 PAGE_SIZE/sdp->sd_sb.sb_bsize);
 	if (error)
 		goto out_trans_fail;
 
@@ -727,7 +727,7 @@ out:
 		return 0;
 
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	gfs2_trans_end(sdp);
 	if (pos + len > ip->i_inode.i_size)
@@ -827,7 +827,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 	if (!PageUptodate(page))
 		SetPageUptodate(page);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	if (copied) {
 		if (inode->i_size < to)
@@ -877,7 +877,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
 	struct buffer_head *dibh;
-	unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned int from = pos & (PAGE_SIZE - 1);
 	unsigned int to = from + len;
 	int ret;
 	struct gfs2_trans *tr = current->journal_info;
@@ -888,7 +888,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 	ret = gfs2_meta_inode_buffer(ip, &dibh);
 	if (unlikely(ret)) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		goto failed;
 	}
 
@@ -992,7 +992,7 @@ static void gfs2_invalidatepage(struct page *page, unsigned int offset,
 {
 	struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host);
 	unsigned int stop = offset + length;
-	int partial_page = (offset || length < PAGE_CACHE_SIZE);
+	int partial_page = (offset || length < PAGE_SIZE);
 	struct buffer_head *bh, *head;
 	unsigned long pos = 0;
 
@@ -1082,7 +1082,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 * the first place, mapping->nr_pages will always be zero.
 	 */
 	if (mapping->nrpages) {
-		loff_t lstart = offset & ~(PAGE_CACHE_SIZE - 1);
+		loff_t lstart = offset & ~(PAGE_SIZE - 1);
 		loff_t len = iov_iter_count(iter);
 		loff_t end = PAGE_ALIGN(offset + len) - 1;
 
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 0860f0b5b3f1..24ce1cdd434a 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -75,7 +75,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 			dsize = dibh->b_size - sizeof(struct gfs2_dinode);
 
 		memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
-		memset(kaddr + dsize, 0, PAGE_CACHE_SIZE - dsize);
+		memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
 		kunmap(page);
 
 		SetPageUptodate(page);
@@ -98,7 +98,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
 
 	if (release) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 
 	return 0;
@@ -932,8 +932,8 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
 {
 	struct inode *inode = mapping->host;
 	struct gfs2_inode *ip = GFS2_I(inode);
-	unsigned long index = from >> PAGE_CACHE_SHIFT;
-	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned long index = from >> PAGE_SHIFT;
+	unsigned offset = from & (PAGE_SIZE-1);
 	unsigned blocksize, iblock, length, pos;
 	struct buffer_head *bh;
 	struct page *page;
@@ -945,7 +945,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
 
 	blocksize = inode->i_sb->s_blocksize;
 	length = blocksize - (offset & (blocksize - 1));
-	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+	iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
 
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, blocksize, 0);
@@ -989,7 +989,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
 	mark_buffer_dirty(bh);
 unlock:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	return err;
 }
 
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index c9384f932975..208efc70ad49 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -354,8 +354,8 @@ static int gfs2_allocate_page_backing(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct buffer_head bh;
-	unsigned long size = PAGE_CACHE_SIZE;
-	u64 lblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	unsigned long size = PAGE_SIZE;
+	u64 lblock = page->index << (PAGE_SHIFT - inode->i_blkbits);
 
 	do {
 		bh.b_state = 0;
@@ -386,7 +386,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_alloc_parms ap = { .aflags = 0, };
 	unsigned long last_index;
-	u64 pos = page->index << PAGE_CACHE_SHIFT;
+	u64 pos = page->index << PAGE_SHIFT;
 	unsigned int data_blocks, ind_blocks, rblocks;
 	struct gfs2_holder gh;
 	loff_t size;
@@ -401,7 +401,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret)
 		goto out;
 
-	gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
+	gfs2_size_hint(vma->vm_file, pos, PAGE_SIZE);
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
@@ -411,7 +411,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
 	set_bit(GIF_SW_PAGED, &ip->i_flags);
 
-	if (!gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE)) {
+	if (!gfs2_write_alloc_required(ip, pos, PAGE_SIZE)) {
 		lock_page(page);
 		if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
 			ret = -EAGAIN;
@@ -424,7 +424,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (ret)
 		goto out_unlock;
 
-	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
+	gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks);
 	ap.target = data_blocks + ind_blocks;
 	ret = gfs2_quota_lock_check(ip, &ap);
 	if (ret)
@@ -447,7 +447,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	lock_page(page);
 	ret = -EINVAL;
 	size = i_size_read(inode);
-	last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+	last_index = (size - 1) >> PAGE_SHIFT;
 	/* Check page index against inode size */
 	if (size == 0 || (page->index > last_index))
 		goto out_trans_end;
@@ -873,7 +873,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
 			rblocks += data_blocks ? data_blocks : 1;
 
 		error = gfs2_trans_begin(sdp, rblocks,
-					 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
+					 PAGE_SIZE/sdp->sd_sb.sb_bsize);
 		if (error)
 			goto out_trans_fail;
 
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index e137d96f1b17..0448524c11bc 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -124,7 +124,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 	if (mapping == NULL)
 		mapping = &sdp->sd_aspace;
 
-	shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
+	shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift;
 	index = blkno >> shift;             /* convert block to page */
 	bufnum = blkno - (index << shift);  /* block buf index within page */
 
@@ -154,7 +154,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 		map_bh(bh, sdp->sd_vfs, blkno);
 
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	return bh;
 }
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a39891344259..ce7d69a2fdc0 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -701,7 +701,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
 	unsigned to_write = bytes, pg_off = off;
 	int done = 0;
 
-	blk = index << (PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift);
+	blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift);
 	boff = off % bsize;
 
 	page = find_or_create_page(mapping, index, GFP_NOFS);
@@ -753,13 +753,13 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index,
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	return 0;
 
 unlock_out:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	return -EIO;
 }
 
@@ -773,13 +773,13 @@ static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp,
 
 	nbytes = sizeof(struct gfs2_quota);
 
-	pg_beg = loc >> PAGE_CACHE_SHIFT;
-	pg_off = loc % PAGE_CACHE_SIZE;
+	pg_beg = loc >> PAGE_SHIFT;
+	pg_off = loc % PAGE_SIZE;
 
 	/* If the quota straddles a page boundary, split the write in two */
-	if ((pg_off + nbytes) > PAGE_CACHE_SIZE) {
+	if ((pg_off + nbytes) > PAGE_SIZE) {
 		pg_oflow = 1;
-		overflow = (pg_off + nbytes) - PAGE_CACHE_SIZE;
+		overflow = (pg_off + nbytes) - PAGE_SIZE;
 	}
 
 	ptr = qp;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 07c0265aa195..99a0bdac8796 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -918,9 +918,8 @@ static int read_rindex_entry(struct gfs2_inode *ip)
 		goto fail;
 
 	rgd->rd_gl->gl_object = rgd;
-	rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_CACHE_MASK;
-	rgd->rd_gl->gl_vm.end = PAGE_CACHE_ALIGN((rgd->rd_addr +
-						  rgd->rd_length) * bsize) - 1;
+	rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_MASK;
+	rgd->rd_gl->gl_vm.end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1;
 	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
 	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
 	if (rgd->rd_data > sdp->sd_max_rg_data)
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 221719eac5de..d77d844b668b 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -278,14 +278,14 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 
 	mapping = tree->inode->i_mapping;
 	off = (loff_t)cnid * tree->node_size;
-	block = off >> PAGE_CACHE_SHIFT;
-	node->page_offset = off & ~PAGE_CACHE_MASK;
+	block = off >> PAGE_SHIFT;
+	node->page_offset = off & ~PAGE_MASK;
 	for (i = 0; i < tree->pages_per_bnode; i++) {
 		page = read_mapping_page(mapping, block++, NULL);
 		if (IS_ERR(page))
 			goto fail;
 		if (PageError(page)) {
-			page_cache_release(page);
+			put_page(page);
 			goto fail;
 		}
 		node->page[i] = page;
@@ -401,7 +401,7 @@ void hfs_bnode_free(struct hfs_bnode *node)
 
 	for (i = 0; i < node->tree->pages_per_bnode; i++)
 		if (node->page[i])
-			page_cache_release(node->page[i]);
+			put_page(node->page[i]);
 	kfree(node);
 }
 
@@ -429,11 +429,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
 
 	pagep = node->page;
 	memset(kmap(*pagep) + node->page_offset, 0,
-	       min((int)PAGE_CACHE_SIZE, (int)tree->node_size));
+	       min((int)PAGE_SIZE, (int)tree->node_size));
 	set_page_dirty(*pagep);
 	kunmap(*pagep);
 	for (i = 1; i < tree->pages_per_bnode; i++) {
-		memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE);
+		memset(kmap(*++pagep), 0, PAGE_SIZE);
 		set_page_dirty(*pagep);
 		kunmap(*pagep);
 	}
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 1ab19e660e69..37cdd955eceb 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -116,14 +116,14 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
 	}
 
 	tree->node_size_shift = ffs(size) - 1;
-	tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	kunmap(page);
-	page_cache_release(page);
+	put_page(page);
 	return tree;
 
 fail_page:
-	page_cache_release(page);
+	put_page(page);
 free_inode:
 	tree->inode->i_mapping->a_ops = &hfs_aops;
 	iput(tree->inode);
@@ -257,9 +257,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 	off = off16;
 
 	off += node->page_offset;
-	pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+	pagep = node->page + (off >> PAGE_SHIFT);
 	data = kmap(*pagep);
-	off &= ~PAGE_CACHE_MASK;
+	off &= ~PAGE_MASK;
 	idx = 0;
 
 	for (;;) {
@@ -279,7 +279,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 					}
 				}
 			}
-			if (++off >= PAGE_CACHE_SIZE) {
+			if (++off >= PAGE_SIZE) {
 				kunmap(*pagep);
 				data = kmap(*++pagep);
 				off = 0;
@@ -302,9 +302,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 		len = hfs_brec_lenoff(node, 0, &off16);
 		off = off16;
 		off += node->page_offset;
-		pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+		pagep = node->page + (off >> PAGE_SHIFT);
 		data = kmap(*pagep);
-		off &= ~PAGE_CACHE_MASK;
+		off &= ~PAGE_MASK;
 	}
 }
 
@@ -348,9 +348,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
 		len = hfs_brec_lenoff(node, 0, &off);
 	}
 	off += node->page_offset + nidx / 8;
-	page = node->page[off >> PAGE_CACHE_SHIFT];
+	page = node->page[off >> PAGE_SHIFT];
 	data = kmap(page);
-	off &= ~PAGE_CACHE_MASK;
+	off &= ~PAGE_MASK;
 	m = 1 << (~nidx & 7);
 	byte = data[off];
 	if (!(byte & m)) {
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 6686bf39a5b5..cb1e5faa2fb7 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -91,8 +91,8 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
 	if (!tree)
 		return 0;
 
-	if (tree->node_size >= PAGE_CACHE_SIZE) {
-		nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
+	if (tree->node_size >= PAGE_SIZE) {
+		nidx = page->index >> (tree->node_size_shift - PAGE_SHIFT);
 		spin_lock(&tree->hash_lock);
 		node = hfs_bnode_findhash(tree, nidx);
 		if (!node)
@@ -105,8 +105,8 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
 		}
 		spin_unlock(&tree->hash_lock);
 	} else {
-		nidx = page->index << (PAGE_CACHE_SHIFT - tree->node_size_shift);
-		i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
+		nidx = page->index << (PAGE_SHIFT - tree->node_size_shift);
+		i = 1 << (PAGE_SHIFT - tree->node_size_shift);
 		spin_lock(&tree->hash_lock);
 		do {
 			node = hfs_bnode_findhash(tree, nidx++);
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index d2954451519e..c0ae274c0a22 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -13,7 +13,7 @@
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
 
-#define PAGE_CACHE_BITS	(PAGE_CACHE_SIZE * 8)
+#define PAGE_CACHE_BITS	(PAGE_SIZE * 8)
 
 int hfsplus_block_allocate(struct super_block *sb, u32 size,
 		u32 offset, u32 *max)
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 63924662aaf3..ce014ceb89ef 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -24,16 +24,16 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
 	int l;
 
 	off += node->page_offset;
-	pagep = node->page + (off >> PAGE_CACHE_SHIFT);
-	off &= ~PAGE_CACHE_MASK;
+	pagep = node->page + (off >> PAGE_SHIFT);
+	off &= ~PAGE_MASK;
 
-	l = min_t(int, len, PAGE_CACHE_SIZE - off);
+	l = min_t(int, len, PAGE_SIZE - off);
 	memcpy(buf, kmap(*pagep) + off, l);
 	kunmap(*pagep);
 
 	while ((len -= l) != 0) {
 		buf += l;
-		l = min_t(int, len, PAGE_CACHE_SIZE);
+		l = min_t(int, len, PAGE_SIZE);
 		memcpy(buf, kmap(*++pagep), l);
 		kunmap(*pagep);
 	}
@@ -77,17 +77,17 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
 	int l;
 
 	off += node->page_offset;
-	pagep = node->page + (off >> PAGE_CACHE_SHIFT);
-	off &= ~PAGE_CACHE_MASK;
+	pagep = node->page + (off >> PAGE_SHIFT);
+	off &= ~PAGE_MASK;
 
-	l = min_t(int, len, PAGE_CACHE_SIZE - off);
+	l = min_t(int, len, PAGE_SIZE - off);
 	memcpy(kmap(*pagep) + off, buf, l);
 	set_page_dirty(*pagep);
 	kunmap(*pagep);
 
 	while ((len -= l) != 0) {
 		buf += l;
-		l = min_t(int, len, PAGE_CACHE_SIZE);
+		l = min_t(int, len, PAGE_SIZE);
 		memcpy(kmap(*++pagep), buf, l);
 		set_page_dirty(*pagep);
 		kunmap(*pagep);
@@ -107,16 +107,16 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
 	int l;
 
 	off += node->page_offset;
-	pagep = node->page + (off >> PAGE_CACHE_SHIFT);
-	off &= ~PAGE_CACHE_MASK;
+	pagep = node->page + (off >> PAGE_SHIFT);
+	off &= ~PAGE_MASK;
 
-	l = min_t(int, len, PAGE_CACHE_SIZE - off);
+	l = min_t(int, len, PAGE_SIZE - off);
 	memset(kmap(*pagep) + off, 0, l);
 	set_page_dirty(*pagep);
 	kunmap(*pagep);
 
 	while ((len -= l) != 0) {
-		l = min_t(int, len, PAGE_CACHE_SIZE);
+		l = min_t(int, len, PAGE_SIZE);
 		memset(kmap(*++pagep), 0, l);
 		set_page_dirty(*pagep);
 		kunmap(*pagep);
@@ -136,20 +136,20 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
 	tree = src_node->tree;
 	src += src_node->page_offset;
 	dst += dst_node->page_offset;
-	src_page = src_node->page + (src >> PAGE_CACHE_SHIFT);
-	src &= ~PAGE_CACHE_MASK;
-	dst_page = dst_node->page + (dst >> PAGE_CACHE_SHIFT);
-	dst &= ~PAGE_CACHE_MASK;
+	src_page = src_node->page + (src >> PAGE_SHIFT);
+	src &= ~PAGE_MASK;
+	dst_page = dst_node->page + (dst >> PAGE_SHIFT);
+	dst &= ~PAGE_MASK;
 
 	if (src == dst) {
-		l = min_t(int, len, PAGE_CACHE_SIZE - src);
+		l = min_t(int, len, PAGE_SIZE - src);
 		memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l);
 		kunmap(*src_page);
 		set_page_dirty(*dst_page);
 		kunmap(*dst_page);
 
 		while ((len -= l) != 0) {
-			l = min_t(int, len, PAGE_CACHE_SIZE);
+			l = min_t(int, len, PAGE_SIZE);
 			memcpy(kmap(*++dst_page), kmap(*++src_page), l);
 			kunmap(*src_page);
 			set_page_dirty(*dst_page);
@@ -161,12 +161,12 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
 		do {
 			src_ptr = kmap(*src_page) + src;
 			dst_ptr = kmap(*dst_page) + dst;
-			if (PAGE_CACHE_SIZE - src < PAGE_CACHE_SIZE - dst) {
-				l = PAGE_CACHE_SIZE - src;
+			if (PAGE_SIZE - src < PAGE_SIZE - dst) {
+				l = PAGE_SIZE - src;
 				src = 0;
 				dst += l;
 			} else {
-				l = PAGE_CACHE_SIZE - dst;
+				l = PAGE_SIZE - dst;
 				src += l;
 				dst = 0;
 			}
@@ -195,11 +195,11 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 	dst += node->page_offset;
 	if (dst > src) {
 		src += len - 1;
-		src_page = node->page + (src >> PAGE_CACHE_SHIFT);
-		src = (src & ~PAGE_CACHE_MASK) + 1;
+		src_page = node->page + (src >> PAGE_SHIFT);
+		src = (src & ~PAGE_MASK) + 1;
 		dst += len - 1;
-		dst_page = node->page + (dst >> PAGE_CACHE_SHIFT);
-		dst = (dst & ~PAGE_CACHE_MASK) + 1;
+		dst_page = node->page + (dst >> PAGE_SHIFT);
+		dst = (dst & ~PAGE_MASK) + 1;
 
 		if (src == dst) {
 			while (src < len) {
@@ -208,7 +208,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 				set_page_dirty(*dst_page);
 				kunmap(*dst_page);
 				len -= src;
-				src = PAGE_CACHE_SIZE;
+				src = PAGE_SIZE;
 				src_page--;
 				dst_page--;
 			}
@@ -226,32 +226,32 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 				dst_ptr = kmap(*dst_page) + dst;
 				if (src < dst) {
 					l = src;
-					src = PAGE_CACHE_SIZE;
+					src = PAGE_SIZE;
 					dst -= l;
 				} else {
 					l = dst;
 					src -= l;
-					dst = PAGE_CACHE_SIZE;
+					dst = PAGE_SIZE;
 				}
 				l = min(len, l);
 				memmove(dst_ptr - l, src_ptr - l, l);
 				kunmap(*src_page);
 				set_page_dirty(*dst_page);
 				kunmap(*dst_page);
-				if (dst == PAGE_CACHE_SIZE)
+				if (dst == PAGE_SIZE)
 					dst_page--;
 				else
 					src_page--;
 			} while ((len -= l));
 		}
 	} else {
-		src_page = node->page + (src >> PAGE_CACHE_SHIFT);
-		src &= ~PAGE_CACHE_MASK;
-		dst_page = node->page + (dst >> PAGE_CACHE_SHIFT);
-		dst &= ~PAGE_CACHE_MASK;
+		src_page = node->page + (src >> PAGE_SHIFT);
+		src &= ~PAGE_MASK;
+		dst_page = node->page + (dst >> PAGE_SHIFT);
+		dst &= ~PAGE_MASK;
 
 		if (src == dst) {
-			l = min_t(int, len, PAGE_CACHE_SIZE - src);
+			l = min_t(int, len, PAGE_SIZE - src);
 			memmove(kmap(*dst_page) + src,
 				kmap(*src_page) + src, l);
 			kunmap(*src_page);
@@ -259,7 +259,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 			kunmap(*dst_page);
 
 			while ((len -= l) != 0) {
-				l = min_t(int, len, PAGE_CACHE_SIZE);
+				l = min_t(int, len, PAGE_SIZE);
 				memmove(kmap(*++dst_page),
 					kmap(*++src_page), l);
 				kunmap(*src_page);
@@ -272,13 +272,13 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
 			do {
 				src_ptr = kmap(*src_page) + src;
 				dst_ptr = kmap(*dst_page) + dst;
-				if (PAGE_CACHE_SIZE - src <
-						PAGE_CACHE_SIZE - dst) {
-					l = PAGE_CACHE_SIZE - src;
+				if (PAGE_SIZE - src <
+						PAGE_SIZE - dst) {
+					l = PAGE_SIZE - src;
 					src = 0;
 					dst += l;
 				} else {
-					l = PAGE_CACHE_SIZE - dst;
+					l = PAGE_SIZE - dst;
 					src += l;
 					dst = 0;
 				}
@@ -444,14 +444,14 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
 
 	mapping = tree->inode->i_mapping;
 	off = (loff_t)cnid << tree->node_size_shift;
-	block = off >> PAGE_CACHE_SHIFT;
-	node->page_offset = off & ~PAGE_CACHE_MASK;
+	block = off >> PAGE_SHIFT;
+	node->page_offset = off & ~PAGE_MASK;
 	for (i = 0; i < tree->pages_per_bnode; block++, i++) {
 		page = read_mapping_page(mapping, block, NULL);
 		if (IS_ERR(page))
 			goto fail;
 		if (PageError(page)) {
-			page_cache_release(page);
+			put_page(page);
 			goto fail;
 		}
 		node->page[i] = page;
@@ -569,7 +569,7 @@ void hfs_bnode_free(struct hfs_bnode *node)
 
 	for (i = 0; i < node->tree->pages_per_bnode; i++)
 		if (node->page[i])
-			page_cache_release(node->page[i]);
+			put_page(node->page[i]);
 	kfree(node);
 }
 
@@ -597,11 +597,11 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
 
 	pagep = node->page;
 	memset(kmap(*pagep) + node->page_offset, 0,
-	       min_t(int, PAGE_CACHE_SIZE, tree->node_size));
+	       min_t(int, PAGE_SIZE, tree->node_size));
 	set_page_dirty(*pagep);
 	kunmap(*pagep);
 	for (i = 1; i < tree->pages_per_bnode; i++) {
-		memset(kmap(*++pagep), 0, PAGE_CACHE_SIZE);
+		memset(kmap(*++pagep), 0, PAGE_SIZE);
 		set_page_dirty(*pagep);
 		kunmap(*pagep);
 	}
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 3345c7553edc..d9d1a36ba826 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -236,15 +236,15 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
 	tree->node_size_shift = ffs(size) - 1;
 
 	tree->pages_per_bnode =
-		(tree->node_size + PAGE_CACHE_SIZE - 1) >>
-		PAGE_CACHE_SHIFT;
+		(tree->node_size + PAGE_SIZE - 1) >>
+		PAGE_SHIFT;
 
 	kunmap(page);
-	page_cache_release(page);
+	put_page(page);
 	return tree;
 
  fail_page:
-	page_cache_release(page);
+	put_page(page);
  free_inode:
 	tree->inode->i_mapping->a_ops = &hfsplus_aops;
 	iput(tree->inode);
@@ -380,9 +380,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 	off = off16;
 
 	off += node->page_offset;
-	pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+	pagep = node->page + (off >> PAGE_SHIFT);
 	data = kmap(*pagep);
-	off &= ~PAGE_CACHE_MASK;
+	off &= ~PAGE_MASK;
 	idx = 0;
 
 	for (;;) {
@@ -403,7 +403,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 					}
 				}
 			}
-			if (++off >= PAGE_CACHE_SIZE) {
+			if (++off >= PAGE_SIZE) {
 				kunmap(*pagep);
 				data = kmap(*++pagep);
 				off = 0;
@@ -426,9 +426,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 		len = hfs_brec_lenoff(node, 0, &off16);
 		off = off16;
 		off += node->page_offset;
-		pagep = node->page + (off >> PAGE_CACHE_SHIFT);
+		pagep = node->page + (off >> PAGE_SHIFT);
 		data = kmap(*pagep);
-		off &= ~PAGE_CACHE_MASK;
+		off &= ~PAGE_MASK;
 	}
 }
 
@@ -475,9 +475,9 @@ void hfs_bmap_free(struct hfs_bnode *node)
 		len = hfs_brec_lenoff(node, 0, &off);
 	}
 	off += node->page_offset + nidx / 8;
-	page = node->page[off >> PAGE_CACHE_SHIFT];
+	page = node->page[off >> PAGE_SHIFT];
 	data = kmap(page);
-	off &= ~PAGE_CACHE_MASK;
+	off &= ~PAGE_MASK;
 	m = 1 << (~nidx & 7);
 	byte = data[off];
 	if (!(byte & m)) {
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1a6394cdb54e..b28f39865c3a 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -87,9 +87,9 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
 	}
 	if (!tree)
 		return 0;
-	if (tree->node_size >= PAGE_CACHE_SIZE) {
+	if (tree->node_size >= PAGE_SIZE) {
 		nidx = page->index >>
-			(tree->node_size_shift - PAGE_CACHE_SHIFT);
+			(tree->node_size_shift - PAGE_SHIFT);
 		spin_lock(&tree->hash_lock);
 		node = hfs_bnode_findhash(tree, nidx);
 		if (!node)
@@ -103,8 +103,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
 		spin_unlock(&tree->hash_lock);
 	} else {
 		nidx = page->index <<
-			(PAGE_CACHE_SHIFT - tree->node_size_shift);
-		i = 1 << (PAGE_CACHE_SHIFT - tree->node_size_shift);
+			(PAGE_SHIFT - tree->node_size_shift);
+		i = 1 << (PAGE_SHIFT - tree->node_size_shift);
 		spin_lock(&tree->hash_lock);
 		do {
 			node = hfs_bnode_findhash(tree, nidx++);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 5d54490a136d..c35911362ff9 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -438,7 +438,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	err = -EFBIG;
 	last_fs_block = sbi->total_blocks - 1;
 	last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >>
-			PAGE_CACHE_SHIFT;
+			PAGE_SHIFT;
 
 	if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) ||
 	    (last_fs_page > (pgoff_t)(~0ULL))) {
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index ab01530b4930..70e445ff0cff 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -220,7 +220,7 @@ check_attr_tree_state_again:
 
 	index = 0;
 	written = 0;
-	for (; written < node_size; index++, written += PAGE_CACHE_SIZE) {
+	for (; written < node_size; index++, written += PAGE_SIZE) {
 		void *kaddr;
 
 		page = read_mapping_page(mapping, index, NULL);
@@ -231,11 +231,11 @@ check_attr_tree_state_again:
 
 		kaddr = kmap_atomic(page);
 		memcpy(kaddr, buf + written,
-			min_t(size_t, PAGE_CACHE_SIZE, node_size - written));
+			min_t(size_t, PAGE_SIZE, node_size - written));
 		kunmap_atomic(kaddr);
 
 		set_page_dirty(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 
 	hfsplus_mark_inode_dirty(attr_file, HFSPLUS_I_ATTR_DIRTY);
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index d1abbee281d1..7016653f3e41 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -410,12 +410,12 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
 	struct inode *inode = mapping->host;
 	char *buffer;
 	loff_t base = page_offset(page);
-	int count = PAGE_CACHE_SIZE;
-	int end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	int count = PAGE_SIZE;
+	int end_index = inode->i_size >> PAGE_SHIFT;
 	int err;
 
 	if (page->index >= end_index)
-		count = inode->i_size & (PAGE_CACHE_SIZE-1);
+		count = inode->i_size & (PAGE_SIZE-1);
 
 	buffer = kmap(page);
 
@@ -447,7 +447,7 @@ static int hostfs_readpage(struct file *file, struct page *page)
 
 	buffer = kmap(page);
 	bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer,
-			PAGE_CACHE_SIZE);
+			PAGE_SIZE);
 	if (bytes_read < 0) {
 		ClearPageUptodate(page);
 		SetPageError(page);
@@ -455,7 +455,7 @@ static int hostfs_readpage(struct file *file, struct page *page)
 		goto out;
 	}
 
-	memset(buffer + bytes_read, 0, PAGE_CACHE_SIZE - bytes_read);
+	memset(buffer + bytes_read, 0, PAGE_SIZE - bytes_read);
 
 	ClearPageError(page);
 	SetPageUptodate(page);
@@ -471,7 +471,7 @@ static int hostfs_write_begin(struct file *file, struct address_space *mapping,
 			      loff_t pos, unsigned len, unsigned flags,
 			      struct page **pagep, void **fsdata)
 {
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t index = pos >> PAGE_SHIFT;
 
 	*pagep = grab_cache_page_write_begin(mapping, index, flags);
 	if (!*pagep)
@@ -485,14 +485,14 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	void *buffer;
-	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned from = pos & (PAGE_SIZE - 1);
 	int err;
 
 	buffer = kmap(page);
 	err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied);
 	kunmap(page);
 
-	if (!PageUptodate(page) && err == PAGE_CACHE_SIZE)
+	if (!PageUptodate(page) && err == PAGE_SIZE)
 		SetPageUptodate(page);
 
 	/*
@@ -502,7 +502,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
 	if (err > 0 && (pos > inode->i_size))
 		inode->i_size = pos;
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	return err;
 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e1f465a389d5..afb7c7f05de5 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -213,12 +213,12 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset,
 	int i, chunksize;
 
 	/* Find which 4k chunk and offset with in that chunk */
-	i = offset >> PAGE_CACHE_SHIFT;
-	offset = offset & ~PAGE_CACHE_MASK;
+	i = offset >> PAGE_SHIFT;
+	offset = offset & ~PAGE_MASK;
 
 	while (size) {
 		size_t n;
-		chunksize = PAGE_CACHE_SIZE;
+		chunksize = PAGE_SIZE;
 		if (offset)
 			chunksize -= offset;
 		if (chunksize > size)
@@ -285,7 +285,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			 * We have the page, copy it to user space buffer.
 			 */
 			copied = hugetlbfs_read_actor(page, offset, to, nr);
-			page_cache_release(page);
+			put_page(page);
 		}
 		offset += copied;
 		retval += copied;
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index f311bf084015..2e4e834d1a98 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -26,7 +26,7 @@
 #include "zisofs.h"
 
 /* This should probably be global. */
-static char zisofs_sink_page[PAGE_CACHE_SIZE];
+static char zisofs_sink_page[PAGE_SIZE];
 
 /*
  * This contains the zlib memory allocation and the mutex for the
@@ -70,11 +70,11 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
 		for ( i = 0 ; i < pcount ; i++ ) {
 			if (!pages[i])
 				continue;
-			memset(page_address(pages[i]), 0, PAGE_CACHE_SIZE);
+			memset(page_address(pages[i]), 0, PAGE_SIZE);
 			flush_dcache_page(pages[i]);
 			SetPageUptodate(pages[i]);
 		}
-		return ((loff_t)pcount) << PAGE_CACHE_SHIFT;
+		return ((loff_t)pcount) << PAGE_SHIFT;
 	}
 
 	/* Because zlib is not thread-safe, do all the I/O at the top. */
@@ -121,11 +121,11 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
 			if (pages[curpage]) {
 				stream.next_out = page_address(pages[curpage])
 						+ poffset;
-				stream.avail_out = PAGE_CACHE_SIZE - poffset;
+				stream.avail_out = PAGE_SIZE - poffset;
 				poffset = 0;
 			} else {
 				stream.next_out = (void *)&zisofs_sink_page;
-				stream.avail_out = PAGE_CACHE_SIZE;
+				stream.avail_out = PAGE_SIZE;
 			}
 		}
 		if (!stream.avail_in) {
@@ -220,14 +220,14 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
 	 * pages with the data we have anyway...
 	 */
 	start_off = page_offset(pages[full_page]);
-	end_off = min_t(loff_t, start_off + PAGE_CACHE_SIZE, inode->i_size);
+	end_off = min_t(loff_t, start_off + PAGE_SIZE, inode->i_size);
 
 	cstart_block = start_off >> zisofs_block_shift;
 	cend_block = (end_off + (1 << zisofs_block_shift) - 1)
 			>> zisofs_block_shift;
 
-	WARN_ON(start_off - (full_page << PAGE_CACHE_SHIFT) !=
-		((cstart_block << zisofs_block_shift) & PAGE_CACHE_MASK));
+	WARN_ON(start_off - (full_page << PAGE_SHIFT) !=
+		((cstart_block << zisofs_block_shift) & PAGE_MASK));
 
 	/* Find the pointer to this specific chunk */
 	/* Note: we're not using isonum_731() here because the data is known aligned */
@@ -260,10 +260,10 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
 		ret = zisofs_uncompress_block(inode, block_start, block_end,
 					      pcount, pages, poffset, &err);
 		poffset += ret;
-		pages += poffset >> PAGE_CACHE_SHIFT;
-		pcount -= poffset >> PAGE_CACHE_SHIFT;
-		full_page -= poffset >> PAGE_CACHE_SHIFT;
-		poffset &= ~PAGE_CACHE_MASK;
+		pages += poffset >> PAGE_SHIFT;
+		pcount -= poffset >> PAGE_SHIFT;
+		full_page -= poffset >> PAGE_SHIFT;
+		poffset &= ~PAGE_MASK;
 
 		if (err) {
 			brelse(bh);
@@ -282,7 +282,7 @@ static int zisofs_fill_pages(struct inode *inode, int full_page, int pcount,
 
 	if (poffset && *pages) {
 		memset(page_address(*pages) + poffset, 0,
-		       PAGE_CACHE_SIZE - poffset);
+		       PAGE_SIZE - poffset);
 		flush_dcache_page(*pages);
 		SetPageUptodate(*pages);
 	}
@@ -302,12 +302,12 @@ static int zisofs_readpage(struct file *file, struct page *page)
 	int i, pcount, full_page;
 	unsigned int zisofs_block_shift = ISOFS_I(inode)->i_format_parm[1];
 	unsigned int zisofs_pages_per_cblock =
-		PAGE_CACHE_SHIFT <= zisofs_block_shift ?
-		(1 << (zisofs_block_shift - PAGE_CACHE_SHIFT)) : 0;
+		PAGE_SHIFT <= zisofs_block_shift ?
+		(1 << (zisofs_block_shift - PAGE_SHIFT)) : 0;
 	struct page *pages[max_t(unsigned, zisofs_pages_per_cblock, 1)];
 	pgoff_t index = page->index, end_index;
 
-	end_index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	end_index = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	/*
 	 * If this page is wholly outside i_size we just return zero;
 	 * do_generic_file_read() will handle this for us
@@ -318,7 +318,7 @@ static int zisofs_readpage(struct file *file, struct page *page)
 		return 0;
 	}
 
-	if (PAGE_CACHE_SHIFT <= zisofs_block_shift) {
+	if (PAGE_SHIFT <= zisofs_block_shift) {
 		/* We have already been given one page, this is the one
 		   we must do. */
 		full_page = index & (zisofs_pages_per_cblock - 1);
@@ -351,7 +351,7 @@ static int zisofs_readpage(struct file *file, struct page *page)
 			kunmap(pages[i]);
 			unlock_page(pages[i]);
 			if (i != full_page)
-				page_cache_release(pages[i]);
+				put_page(pages[i]);
 		}
 	}			
 
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index bcd2d41b318a..131dedc920d8 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1021,7 +1021,7 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock,
 		 * the page with useless information without generating any
 		 * I/O errors.
 		 */
-		if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
+		if (b_off > ((inode->i_size + PAGE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) {
 			printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n",
 				__func__, b_off,
 				(unsigned long long)inode->i_size);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 517f2de784cf..2ad98d6e19f4 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -81,11 +81,11 @@ static void release_buffer_page(struct buffer_head *bh)
 	if (!trylock_page(page))
 		goto nope;
 
-	page_cache_get(page);
+	get_page(page);
 	__brelse(bh);
 	try_to_free_buffers(page);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	return;
 
 nope:
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index de73a9516a54..435f0b26ac20 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2221,7 +2221,7 @@ void jbd2_journal_ack_err(journal_t *journal)
 
 int jbd2_journal_blocks_per_page(struct inode *inode)
 {
-	return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+	return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
 }
 
 /*
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 01e4652d88f6..67c103867bf8 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2263,7 +2263,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
 	struct buffer_head *head, *bh, *next;
 	unsigned int stop = offset + length;
 	unsigned int curr_off = 0;
-	int partial_page = (offset || length < PAGE_CACHE_SIZE);
+	int partial_page = (offset || length < PAGE_SIZE);
 	int may_free = 1;
 	int ret = 0;
 
@@ -2272,7 +2272,7 @@ int jbd2_journal_invalidatepage(journal_t *journal,
 	if (!page_has_buffers(page))
 		return 0;
 
-	BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
+	BUG_ON(stop > PAGE_SIZE || stop < length);
 
 	/* We will potentially be playing with lists other than just the
 	 * data lists (especially for journaled data mode), so be
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 1090eb64b90d..9d26b1b9fc01 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -95,15 +95,15 @@ __jffs2_dbg_fragtree_paranoia_check_nolock(struct jffs2_inode_info *f)
 			   rather than mucking around with actually reading the node
 			   and checking the compression type, which is the real way
 			   to tell a hole node. */
-			if (frag->ofs & (PAGE_CACHE_SIZE-1) && frag_prev(frag)
-					&& frag_prev(frag)->size < PAGE_CACHE_SIZE && frag_prev(frag)->node) {
+			if (frag->ofs & (PAGE_SIZE-1) && frag_prev(frag)
+					&& frag_prev(frag)->size < PAGE_SIZE && frag_prev(frag)->node) {
 				JFFS2_ERROR("REF_PRISTINE node at 0x%08x had a previous non-hole frag in the same page. Tell dwmw2.\n",
 					ref_offset(fn->raw));
 				bitched = 1;
 			}
 
-			if ((frag->ofs+frag->size) & (PAGE_CACHE_SIZE-1) && frag_next(frag)
-					&& frag_next(frag)->size < PAGE_CACHE_SIZE && frag_next(frag)->node) {
+			if ((frag->ofs+frag->size) & (PAGE_SIZE-1) && frag_next(frag)
+					&& frag_next(frag)->size < PAGE_SIZE && frag_next(frag)->node) {
 				JFFS2_ERROR("REF_PRISTINE node at 0x%08x (%08x-%08x) had a following non-hole frag in the same page. Tell dwmw2.\n",
 				       ref_offset(fn->raw), frag->ofs, frag->ofs+frag->size);
 				bitched = 1;
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index cad86bac3453..0e62dec3effc 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -87,14 +87,15 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
 	int ret;
 
 	jffs2_dbg(2, "%s(): ino #%lu, page at offset 0x%lx\n",
-		  __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT);
+		  __func__, inode->i_ino, pg->index << PAGE_SHIFT);
 
 	BUG_ON(!PageLocked(pg));
 
 	pg_buf = kmap(pg);
 	/* FIXME: Can kmap fail? */
 
-	ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE);
+	ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_SHIFT,
+				     PAGE_SIZE);
 
 	if (ret) {
 		ClearPageUptodate(pg);
@@ -137,8 +138,8 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 	struct page *pg;
 	struct inode *inode = mapping->host;
 	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	uint32_t pageofs = index << PAGE_CACHE_SHIFT;
+	pgoff_t index = pos >> PAGE_SHIFT;
+	uint32_t pageofs = index << PAGE_SHIFT;
 	int ret = 0;
 
 	pg = grab_cache_page_write_begin(mapping, index, flags);
@@ -230,7 +231,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
 
 out_page:
 	unlock_page(pg);
-	page_cache_release(pg);
+	put_page(pg);
 	return ret;
 }
 
@@ -245,14 +246,14 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
 	struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
 	struct jffs2_raw_inode *ri;
-	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned start = pos & (PAGE_SIZE - 1);
 	unsigned end = start + copied;
 	unsigned aligned_start = start & ~3;
 	int ret = 0;
 	uint32_t writtenlen = 0;
 
 	jffs2_dbg(1, "%s(): ino #%lu, page at 0x%lx, range %d-%d, flags %lx\n",
-		  __func__, inode->i_ino, pg->index << PAGE_CACHE_SHIFT,
+		  __func__, inode->i_ino, pg->index << PAGE_SHIFT,
 		  start, end, pg->flags);
 
 	/* We need to avoid deadlock with page_cache_read() in
@@ -261,7 +262,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
 	   to re-lock it. */
 	BUG_ON(!PageUptodate(pg));
 
-	if (end == PAGE_CACHE_SIZE) {
+	if (end == PAGE_SIZE) {
 		/* When writing out the end of a page, write out the
 		   _whole_ page. This helps to reduce the number of
 		   nodes in files which have many short writes, like
@@ -275,7 +276,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
 		jffs2_dbg(1, "%s(): Allocation of raw inode failed\n",
 			  __func__);
 		unlock_page(pg);
-		page_cache_release(pg);
+		put_page(pg);
 		return -ENOMEM;
 	}
 
@@ -292,7 +293,7 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
 	kmap(pg);
 
 	ret = jffs2_write_inode_range(c, f, ri, page_address(pg) + aligned_start,
-				      (pg->index << PAGE_CACHE_SHIFT) + aligned_start,
+				      (pg->index << PAGE_SHIFT) + aligned_start,
 				      end - aligned_start, &writtenlen);
 
 	kunmap(pg);
@@ -329,6 +330,6 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping,
 	jffs2_dbg(1, "%s() returning %d\n",
 		  __func__, writtenlen > 0 ? writtenlen : ret);
 	unlock_page(pg);
-	page_cache_release(pg);
+	put_page(pg);
 	return writtenlen > 0 ? writtenlen : ret;
 }
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index bead25ae8fe4..ae2ebb26b446 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -586,8 +586,8 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 		goto out_root;
 
 	sb->s_maxbytes = 0xFFFFFFFF;
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = JFFS2_SUPER_MAGIC;
 	if (!(sb->s_flags & MS_RDONLY))
 		jffs2_start_garbage_collect_thread(c);
@@ -685,7 +685,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
 	struct inode *inode = OFNI_EDONI_2SFFJ(f);
 	struct page *pg;
 
-	pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
+	pg = read_cache_page(inode->i_mapping, offset >> PAGE_SHIFT,
 			     (void *)jffs2_do_readpage_unlock, inode);
 	if (IS_ERR(pg))
 		return (void *)pg;
@@ -701,7 +701,7 @@ void jffs2_gc_release_page(struct jffs2_sb_info *c,
 	struct page *pg = (void *)*priv;
 
 	kunmap(pg);
-	page_cache_release(pg);
+	put_page(pg);
 }
 
 static int jffs2_flash_setup(struct jffs2_sb_info *c) {
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 7e553f286775..9ed0f26cf023 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -552,7 +552,7 @@ static int jffs2_garbage_collect_live(struct jffs2_sb_info *c,  struct jffs2_era
 				goto upnout;
 		}
 		/* We found a datanode. Do the GC */
-		if((start >> PAGE_CACHE_SHIFT) < ((end-1) >> PAGE_CACHE_SHIFT)) {
+		if((start >> PAGE_SHIFT) < ((end-1) >> PAGE_SHIFT)) {
 			/* It crosses a page boundary. Therefore, it must be a hole. */
 			ret = jffs2_garbage_collect_hole(c, jeb, f, fn, start, end);
 		} else {
@@ -1192,8 +1192,8 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
 		struct jffs2_node_frag *frag;
 		uint32_t min, max;
 
-		min = start & ~(PAGE_CACHE_SIZE-1);
-		max = min + PAGE_CACHE_SIZE;
+		min = start & ~(PAGE_SIZE-1);
+		max = min + PAGE_SIZE;
 
 		frag = jffs2_lookup_node_frag(&f->fragtree, start);
 
@@ -1351,7 +1351,7 @@ static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_era
 		cdatalen = min_t(uint32_t, alloclen - sizeof(ri), end - offset);
 		datalen = end - offset;
 
-		writebuf = pg_ptr + (offset & (PAGE_CACHE_SIZE -1));
+		writebuf = pg_ptr + (offset & (PAGE_SIZE -1));
 
 		comprtype = jffs2_compress(c, f, writebuf, &comprbuf, &datalen, &cdatalen);
 
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 9a5449bc3afb..b86c78d178c6 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -90,7 +90,7 @@ uint32_t jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list,
 
 	/* If the last fragment starts at the RAM page boundary, it is
 	 * REF_PRISTINE irrespective of its size. */
-	if (frag->node && (frag->ofs & (PAGE_CACHE_SIZE - 1)) == 0) {
+	if (frag->node && (frag->ofs & (PAGE_SIZE - 1)) == 0) {
 		dbg_fragtree2("marking the last fragment 0x%08x-0x%08x REF_PRISTINE.\n",
 			frag->ofs, frag->ofs + frag->size);
 		frag->node->raw->flash_offset = ref_offset(frag->node->raw) | REF_PRISTINE;
@@ -237,7 +237,7 @@ static int jffs2_add_frag_to_fragtree(struct jffs2_sb_info *c, struct rb_root *r
 		   If so, both 'this' and the new node get marked REF_NORMAL so
 		   the GC can take a look.
 		*/
-		if (lastend && (lastend-1) >> PAGE_CACHE_SHIFT == newfrag->ofs >> PAGE_CACHE_SHIFT) {
+		if (lastend && (lastend-1) >> PAGE_SHIFT == newfrag->ofs >> PAGE_SHIFT) {
 			if (this->node)
 				mark_ref_normal(this->node->raw);
 			mark_ref_normal(newfrag->node->raw);
@@ -382,7 +382,7 @@ int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_in
 
 	/* If we now share a page with other nodes, mark either previous
 	   or next node REF_NORMAL, as appropriate.  */
-	if (newfrag->ofs & (PAGE_CACHE_SIZE-1)) {
+	if (newfrag->ofs & (PAGE_SIZE-1)) {
 		struct jffs2_node_frag *prev = frag_prev(newfrag);
 
 		mark_ref_normal(fn->raw);
@@ -391,7 +391,7 @@ int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_in
 			mark_ref_normal(prev->node->raw);
 	}
 
-	if ((newfrag->ofs+newfrag->size) & (PAGE_CACHE_SIZE-1)) {
+	if ((newfrag->ofs+newfrag->size) & (PAGE_SIZE-1)) {
 		struct jffs2_node_frag *next = frag_next(newfrag);
 
 		if (next) {
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index b634de4c8101..7fb187ab2682 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -172,8 +172,8 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2
 	   beginning of a page and runs to the end of the file, or if
 	   it's a hole node, mark it REF_PRISTINE, else REF_NORMAL.
 	*/
-	if ((je32_to_cpu(ri->dsize) >= PAGE_CACHE_SIZE) ||
-	    ( ((je32_to_cpu(ri->offset)&(PAGE_CACHE_SIZE-1))==0) &&
+	if ((je32_to_cpu(ri->dsize) >= PAGE_SIZE) ||
+	    ( ((je32_to_cpu(ri->offset)&(PAGE_SIZE-1))==0) &&
 	      (je32_to_cpu(ri->dsize)+je32_to_cpu(ri->offset) ==  je32_to_cpu(ri->isize)))) {
 		flash_ofs |= REF_PRISTINE;
 	} else {
@@ -366,7 +366,8 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
 			break;
 		}
 		mutex_lock(&f->sem);
-		datalen = min_t(uint32_t, writelen, PAGE_CACHE_SIZE - (offset & (PAGE_CACHE_SIZE-1)));
+		datalen = min_t(uint32_t, writelen,
+				PAGE_SIZE - (offset & (PAGE_SIZE-1)));
 		cdatalen = min_t(uint32_t, alloclen - sizeof(*ri), datalen);
 
 		comprtype = jffs2_compress(c, f, buf, &comprbuf, &datalen, &cdatalen);
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index a3eb316b1ac3..b60e015cc757 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -80,7 +80,7 @@ static inline void lock_metapage(struct metapage *mp)
 static struct kmem_cache *metapage_cache;
 static mempool_t *metapage_mempool;
 
-#define MPS_PER_PAGE (PAGE_CACHE_SIZE >> L2PSIZE)
+#define MPS_PER_PAGE (PAGE_SIZE >> L2PSIZE)
 
 #if MPS_PER_PAGE > 1
 
@@ -316,7 +316,7 @@ static void last_write_complete(struct page *page)
 	struct metapage *mp;
 	unsigned int offset;
 
-	for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+	for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) {
 		mp = page_to_mp(page, offset);
 		if (mp && test_bit(META_io, &mp->flag)) {
 			if (mp->lsn)
@@ -366,12 +366,12 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 	int bad_blocks = 0;
 
 	page_start = (sector_t)page->index <<
-		     (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		     (PAGE_SHIFT - inode->i_blkbits);
 	BUG_ON(!PageLocked(page));
 	BUG_ON(PageWriteback(page));
 	set_page_writeback(page);
 
-	for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+	for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) {
 		mp = page_to_mp(page, offset);
 
 		if (!mp || !test_bit(META_dirty, &mp->flag))
@@ -416,7 +416,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
 			bio = NULL;
 		} else
 			inc_io(page);
-		xlen = (PAGE_CACHE_SIZE - offset) >> inode->i_blkbits;
+		xlen = (PAGE_SIZE - offset) >> inode->i_blkbits;
 		pblock = metapage_get_blocks(inode, lblock, &xlen);
 		if (!pblock) {
 			printk(KERN_ERR "JFS: metapage_get_blocks failed\n");
@@ -485,7 +485,7 @@ static int metapage_readpage(struct file *fp, struct page *page)
 	struct inode *inode = page->mapping->host;
 	struct bio *bio = NULL;
 	int block_offset;
-	int blocks_per_page = PAGE_CACHE_SIZE >> inode->i_blkbits;
+	int blocks_per_page = PAGE_SIZE >> inode->i_blkbits;
 	sector_t page_start;	/* address of page in fs blocks */
 	sector_t pblock;
 	int xlen;
@@ -494,7 +494,7 @@ static int metapage_readpage(struct file *fp, struct page *page)
 
 	BUG_ON(!PageLocked(page));
 	page_start = (sector_t)page->index <<
-		     (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		     (PAGE_SHIFT - inode->i_blkbits);
 
 	block_offset = 0;
 	while (block_offset < blocks_per_page) {
@@ -542,7 +542,7 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
 	int ret = 1;
 	int offset;
 
-	for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+	for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) {
 		mp = page_to_mp(page, offset);
 
 		if (!mp)
@@ -568,7 +568,7 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask)
 static void metapage_invalidatepage(struct page *page, unsigned int offset,
 				    unsigned int length)
 {
-	BUG_ON(offset || length < PAGE_CACHE_SIZE);
+	BUG_ON(offset || length < PAGE_SIZE);
 
 	BUG_ON(PageWriteback(page));
 
@@ -599,10 +599,10 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
 		 inode->i_ino, lblock, absolute);
 
 	l2bsize = inode->i_blkbits;
-	l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize;
+	l2BlocksPerPage = PAGE_SHIFT - l2bsize;
 	page_index = lblock >> l2BlocksPerPage;
 	page_offset = (lblock - (page_index << l2BlocksPerPage)) << l2bsize;
-	if ((page_offset + size) > PAGE_CACHE_SIZE) {
+	if ((page_offset + size) > PAGE_SIZE) {
 		jfs_err("MetaData crosses page boundary!!");
 		jfs_err("lblock = %lx, size  = %d", lblock, size);
 		dump_stack();
@@ -621,7 +621,7 @@ struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
 		mapping = inode->i_mapping;
 	}
 
-	if (new && (PSIZE == PAGE_CACHE_SIZE)) {
+	if (new && (PSIZE == PAGE_SIZE)) {
 		page = grab_cache_page(mapping, page_index);
 		if (!page) {
 			jfs_err("grab_cache_page failed!");
@@ -693,7 +693,7 @@ unlock:
 void grab_metapage(struct metapage * mp)
 {
 	jfs_info("grab_metapage: mp = 0x%p", mp);
-	page_cache_get(mp->page);
+	get_page(mp->page);
 	lock_page(mp->page);
 	mp->count++;
 	lock_metapage(mp);
@@ -706,12 +706,12 @@ void force_metapage(struct metapage *mp)
 	jfs_info("force_metapage: mp = 0x%p", mp);
 	set_bit(META_forcewrite, &mp->flag);
 	clear_bit(META_sync, &mp->flag);
-	page_cache_get(page);
+	get_page(page);
 	lock_page(page);
 	set_page_dirty(page);
 	write_one_page(page, 1);
 	clear_bit(META_forcewrite, &mp->flag);
-	page_cache_release(page);
+	put_page(page);
 }
 
 void hold_metapage(struct metapage *mp)
@@ -726,7 +726,7 @@ void put_metapage(struct metapage *mp)
 		unlock_page(mp->page);
 		return;
 	}
-	page_cache_get(mp->page);
+	get_page(mp->page);
 	mp->count++;
 	lock_metapage(mp);
 	unlock_page(mp->page);
@@ -746,7 +746,7 @@ void release_metapage(struct metapage * mp)
 	assert(mp->count);
 	if (--mp->count || mp->nohomeok) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		return;
 	}
 
@@ -764,13 +764,13 @@ void release_metapage(struct metapage * mp)
 	drop_metapage(page, mp);
 
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 }
 
 void __invalidate_metapages(struct inode *ip, s64 addr, int len)
 {
 	sector_t lblock;
-	int l2BlocksPerPage = PAGE_CACHE_SHIFT - ip->i_blkbits;
+	int l2BlocksPerPage = PAGE_SHIFT - ip->i_blkbits;
 	int BlocksPerPage = 1 << l2BlocksPerPage;
 	/* All callers are interested in block device's mapping */
 	struct address_space *mapping =
@@ -788,7 +788,7 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
 		page = find_lock_page(mapping, lblock >> l2BlocksPerPage);
 		if (!page)
 			continue;
-		for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) {
+		for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) {
 			mp = page_to_mp(page, offset);
 			if (!mp)
 				continue;
@@ -803,7 +803,7 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
 				remove_from_logsync(mp);
 		}
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 }
 
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
index 337e9e51ac06..a869fb4a20d6 100644
--- a/fs/jfs/jfs_metapage.h
+++ b/fs/jfs/jfs_metapage.h
@@ -106,7 +106,7 @@ static inline void metapage_nohomeok(struct metapage *mp)
 	lock_page(page);
 	if (!mp->nohomeok++) {
 		mark_metapage_dirty(mp);
-		page_cache_get(page);
+		get_page(page);
 		wait_on_page_writeback(page);
 	}
 	unlock_page(page);
@@ -128,7 +128,7 @@ static inline void metapage_wait_for_io(struct metapage *mp)
 static inline void _metapage_homeok(struct metapage *mp)
 {
 	if (!--mp->nohomeok)
-		page_cache_release(mp->page);
+		put_page(mp->page);
 }
 
 static inline void metapage_homeok(struct metapage *mp)
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 4f5d85ba8e23..78d599198bf5 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -596,7 +596,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	 * Page cache is indexed by long.
 	 * I would use MAX_LFS_FILESIZE, but it's only half as big
 	 */
-	sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1,
+	sb->s_maxbytes = min(((u64) PAGE_SIZE << 32) - 1,
 			     (u64)sb->s_maxbytes);
 #endif
 	sb->s_time_gran = 1;
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index b67dbccdaf88..f73541fbe7af 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -138,8 +138,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
 	struct dentry *root;
 
 	info->sb = sb;
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = magic;
 	sb->s_op = &kernfs_sops;
 	sb->s_time_gran = 1;
diff --git a/fs/libfs.c b/fs/libfs.c
index 0ca80b2af420..f3fa82ce9b70 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -25,7 +25,7 @@ int simple_getattr(struct vfsmount *mnt, struct dentry *dentry,
 {
 	struct inode *inode = d_inode(dentry);
 	generic_fillattr(inode, stat);
-	stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9);
+	stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
 	return 0;
 }
 EXPORT_SYMBOL(simple_getattr);
@@ -33,7 +33,7 @@ EXPORT_SYMBOL(simple_getattr);
 int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	buf->f_type = dentry->d_sb->s_magic;
-	buf->f_bsize = PAGE_CACHE_SIZE;
+	buf->f_bsize = PAGE_SIZE;
 	buf->f_namelen = NAME_MAX;
 	return 0;
 }
@@ -395,7 +395,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
 	struct page *page;
 	pgoff_t index;
 
-	index = pos >> PAGE_CACHE_SHIFT;
+	index = pos >> PAGE_SHIFT;
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
@@ -403,10 +403,10 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
 
 	*pagep = page;
 
-	if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
-		unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+	if (!PageUptodate(page) && (len != PAGE_SIZE)) {
+		unsigned from = pos & (PAGE_SIZE - 1);
 
-		zero_user_segments(page, 0, from, from + len, PAGE_CACHE_SIZE);
+		zero_user_segments(page, 0, from, from + len, PAGE_SIZE);
 	}
 	return 0;
 }
@@ -442,7 +442,7 @@ int simple_write_end(struct file *file, struct address_space *mapping,
 
 	/* zero the stale part of the page if we did a short copy */
 	if (copied < len) {
-		unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+		unsigned from = pos & (PAGE_SIZE - 1);
 
 		zero_user(page, from + copied, len - copied);
 	}
@@ -458,7 +458,7 @@ int simple_write_end(struct file *file, struct address_space *mapping,
 
 	set_page_dirty(page);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	return copied;
 }
@@ -477,8 +477,8 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
 	struct dentry *dentry;
 	int i;
 
-	s->s_blocksize = PAGE_CACHE_SIZE;
-	s->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	s->s_blocksize = PAGE_SIZE;
+	s->s_blocksize_bits = PAGE_SHIFT;
 	s->s_magic = magic;
 	s->s_op = &simple_super_operations;
 	s->s_time_gran = 1;
@@ -994,12 +994,12 @@ int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
 {
 	u64 last_fs_block = num_blocks - 1;
 	u64 last_fs_page =
-		last_fs_block >> (PAGE_CACHE_SHIFT - blocksize_bits);
+		last_fs_block >> (PAGE_SHIFT - blocksize_bits);
 
 	if (unlikely(num_blocks == 0))
 		return 0;
 
-	if ((blocksize_bits < 9) || (blocksize_bits > PAGE_CACHE_SHIFT))
+	if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT))
 		return -EINVAL;
 
 	if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index a709d80c8ebc..cc26f8f215f5 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -64,7 +64,7 @@ static void writeseg_end_io(struct bio *bio)
 
 	bio_for_each_segment_all(bvec, bio, i) {
 		end_page_writeback(bvec->bv_page);
-		page_cache_release(bvec->bv_page);
+		put_page(bvec->bv_page);
 	}
 	bio_put(bio);
 	if (atomic_dec_and_test(&super->s_pending_writes))
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index 9c501449450d..b76a62b1978f 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -46,9 +46,9 @@ static int loffs_mtd_write(struct super_block *sb, loff_t ofs, size_t len,
 
 	BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
 	BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
-	BUG_ON(len > PAGE_CACHE_SIZE);
-	page_start = ofs & PAGE_CACHE_MASK;
-	page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
+	BUG_ON(len > PAGE_SIZE);
+	page_start = ofs & PAGE_MASK;
+	page_end = PAGE_ALIGN(ofs + len) - 1;
 	ret = mtd_write(mtd, ofs, len, &retlen, buf);
 	if (ret || (retlen != len))
 		return -EIO;
@@ -82,7 +82,7 @@ static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs,
 		if (!page)
 			continue;
 		memset(page_address(page), 0xFF, PAGE_SIZE);
-		page_cache_release(page);
+		put_page(page);
 	}
 	return 0;
 }
@@ -195,7 +195,7 @@ static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
 		err = loffs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
 					page_address(page));
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		if (err)
 			return err;
 	}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 542468e9bfb4..ddbed2be5366 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -183,7 +183,7 @@ static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
 		if (name->len != be16_to_cpu(dd->namelen) ||
 				memcmp(name->name, dd->name, name->len)) {
 			kunmap_atomic(dd);
-			page_cache_release(page);
+			put_page(page);
 			continue;
 		}
 
@@ -238,7 +238,7 @@ static int logfs_unlink(struct inode *dir, struct dentry *dentry)
 		return PTR_ERR(page);
 	}
 	index = page->index;
-	page_cache_release(page);
+	put_page(page);
 
 	mutex_lock(&super->s_dirop_mutex);
 	logfs_add_transaction(dir, ta);
@@ -316,7 +316,7 @@ static int logfs_readdir(struct file *file, struct dir_context *ctx)
 				be16_to_cpu(dd->namelen),
 				be64_to_cpu(dd->ino), dd->type);
 		kunmap(page);
-		page_cache_release(page);
+		put_page(page);
 		if (full)
 			break;
 	}
@@ -349,7 +349,7 @@ static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
 	dd = kmap_atomic(page);
 	ino = be64_to_cpu(dd->ino);
 	kunmap_atomic(dd);
-	page_cache_release(page);
+	put_page(page);
 
 	inode = logfs_iget(dir->i_sb, ino);
 	if (IS_ERR(inode))
@@ -392,7 +392,7 @@ static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
 
 		err = logfs_write_buf(dir, page, WF_LOCK);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		if (!err)
 			grow_dir(dir, index);
 		return err;
@@ -561,7 +561,7 @@ static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
 	map = kmap_atomic(page);
 	memcpy(dd, map, sizeof(*dd));
 	kunmap_atomic(map);
-	page_cache_release(page);
+	put_page(page);
 	return 0;
 }
 
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 61eaeb1b6cac..f01ddfb1a03b 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -15,21 +15,21 @@ static int logfs_write_begin(struct file *file, struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	struct page *page;
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t index = pos >> PAGE_SHIFT;
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
 
-	if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
+	if ((len == PAGE_SIZE) || PageUptodate(page))
 		return 0;
-	if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
-		unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+	if ((pos & PAGE_MASK) >= i_size_read(inode)) {
+		unsigned start = pos & (PAGE_SIZE - 1);
 		unsigned end = start + len;
 
 		/* Reading beyond i_size is simple: memset to zero */
-		zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
+		zero_user_segments(page, 0, start, end, PAGE_SIZE);
 		return 0;
 	}
 	return logfs_readpage_nolock(page);
@@ -41,11 +41,11 @@ static int logfs_write_end(struct file *file, struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	pgoff_t index = page->index;
-	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned start = pos & (PAGE_SIZE - 1);
 	unsigned end = start + copied;
 	int ret = 0;
 
-	BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
+	BUG_ON(PAGE_SIZE != inode->i_sb->s_blocksize);
 	BUG_ON(page->index > I3_BLOCKS);
 
 	if (copied < len) {
@@ -61,8 +61,8 @@ static int logfs_write_end(struct file *file, struct address_space *mapping,
 	if (copied == 0)
 		goto out; /* FIXME: do we need to update inode? */
 
-	if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
-		i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
+	if (i_size_read(inode) < (index << PAGE_SHIFT) + end) {
+		i_size_write(inode, (index << PAGE_SHIFT) + end);
 		mark_inode_dirty_sync(inode);
 	}
 
@@ -75,7 +75,7 @@ static int logfs_write_end(struct file *file, struct address_space *mapping,
 	}
 out:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	return ret ? ret : copied;
 }
 
@@ -118,7 +118,7 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
 	loff_t i_size = i_size_read(inode);
-	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	pgoff_t end_index = i_size >> PAGE_SHIFT;
 	unsigned offset;
 	u64 bix;
 	level_t level;
@@ -142,7 +142,7 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
 		return __logfs_writepage(page);
 
 	 /* Is the page fully outside i_size? (truncate in progress) */
-	offset = i_size & (PAGE_CACHE_SIZE-1);
+	offset = i_size & (PAGE_SIZE-1);
 	if (bix > end_index || offset == 0) {
 		unlock_page(page);
 		return 0; /* don't care */
@@ -155,7 +155,7 @@ static int logfs_writepage(struct page *page, struct writeback_control *wbc)
 	 * the  page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+	zero_user_segment(page, offset, PAGE_SIZE);
 	return __logfs_writepage(page);
 }
 
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 20973c9e52f8..3fb8c6d67303 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -281,7 +281,7 @@ static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
 static void logfs_put_read_page(struct page *page)
 {
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 }
 
 static void logfs_lock_write_page(struct page *page)
@@ -323,7 +323,7 @@ repeat:
 			return NULL;
 		err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
 		if (unlikely(err)) {
-			page_cache_release(page);
+			put_page(page);
 			if (err == -EEXIST)
 				goto repeat;
 			return NULL;
@@ -342,7 +342,7 @@ static void logfs_unlock_write_page(struct page *page)
 static void logfs_put_write_page(struct page *page)
 {
 	logfs_unlock_write_page(page);
-	page_cache_release(page);
+	put_page(page);
 }
 
 static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
@@ -562,7 +562,7 @@ static void indirect_free_block(struct super_block *sb,
 
 	if (PagePrivate(page)) {
 		ClearPagePrivate(page);
-		page_cache_release(page);
+		put_page(page);
 		set_page_private(page, 0);
 	}
 	__free_block(sb, block);
@@ -655,7 +655,7 @@ static void alloc_data_block(struct inode *inode, struct page *page)
 	block->page = page;
 
 	SetPagePrivate(page);
-	page_cache_get(page);
+	get_page(page);
 	set_page_private(page, (unsigned long) block);
 
 	block->ops = &indirect_block_ops;
@@ -709,7 +709,7 @@ static u64 block_get_pointer(struct page *page, int index)
 
 static int logfs_read_empty(struct page *page)
 {
-	zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+	zero_user_segment(page, 0, PAGE_SIZE);
 	return 0;
 }
 
@@ -1660,7 +1660,7 @@ static int truncate_data_block(struct inode *inode, struct page *page,
 	if (err)
 		return err;
 
-	zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
+	zero_user_segment(page, size - pageofs, PAGE_SIZE);
 	return logfs_segment_write(inode, page, shadow);
 }
 
@@ -1919,7 +1919,7 @@ static void move_page_to_inode(struct inode *inode, struct page *page)
 	block->page = NULL;
 	if (PagePrivate(page)) {
 		ClearPagePrivate(page);
-		page_cache_release(page);
+		put_page(page);
 		set_page_private(page, 0);
 	}
 }
@@ -1940,7 +1940,7 @@ static void move_inode_to_page(struct page *page, struct inode *inode)
 
 	if (!PagePrivate(page)) {
 		SetPagePrivate(page);
-		page_cache_get(page);
+		get_page(page);
 		set_page_private(page, (unsigned long) block);
 	}
 
@@ -1971,7 +1971,7 @@ int logfs_read_inode(struct inode *inode)
 	logfs_disk_to_inode(di, inode);
 	kunmap_atomic(di);
 	move_page_to_inode(inode, page);
-	page_cache_release(page);
+	put_page(page);
 	return 0;
 }
 
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index d270e4b2ab6b..1efd6055f4b0 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -90,9 +90,9 @@ int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
 
 		if (!PagePrivate(page)) {
 			SetPagePrivate(page);
-			page_cache_get(page);
+			get_page(page);
 		}
-		page_cache_release(page);
+		put_page(page);
 
 		buf += copylen;
 		len -= copylen;
@@ -117,9 +117,9 @@ static void pad_partial_page(struct logfs_area *area)
 		memset(page_address(page) + offset, 0xff, len);
 		if (!PagePrivate(page)) {
 			SetPagePrivate(page);
-			page_cache_get(page);
+			get_page(page);
 		}
-		page_cache_release(page);
+		put_page(page);
 	}
 }
 
@@ -129,20 +129,20 @@ static void pad_full_pages(struct logfs_area *area)
 	struct logfs_super *super = logfs_super(sb);
 	u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
 	u32 len = super->s_segsize - area->a_used_bytes;
-	pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
-	pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
+	pgoff_t index = PAGE_ALIGN(ofs) >> PAGE_SHIFT;
+	pgoff_t no_indizes = len >> PAGE_SHIFT;
 	struct page *page;
 
 	while (no_indizes) {
 		page = get_mapping_page(sb, index, 0);
 		BUG_ON(!page); /* FIXME: reserve a pool */
 		SetPageUptodate(page);
-		memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
+		memset(page_address(page), 0xff, PAGE_SIZE);
 		if (!PagePrivate(page)) {
 			SetPagePrivate(page);
-			page_cache_get(page);
+			get_page(page);
 		}
-		page_cache_release(page);
+		put_page(page);
 		index++;
 		no_indizes--;
 	}
@@ -411,7 +411,7 @@ int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
 		if (IS_ERR(page))
 			return PTR_ERR(page);
 		memcpy(buf, page_address(page) + offset, copylen);
-		page_cache_release(page);
+		put_page(page);
 
 		buf += copylen;
 		len -= copylen;
@@ -499,7 +499,7 @@ static void move_btree_to_page(struct inode *inode, struct page *page,
 
 	if (!PagePrivate(page)) {
 		SetPagePrivate(page);
-		page_cache_get(page);
+		get_page(page);
 		set_page_private(page, (unsigned long) block);
 	}
 	block->ops = &indirect_block_ops;
@@ -554,7 +554,7 @@ void move_page_to_btree(struct page *page)
 
 	if (PagePrivate(page)) {
 		ClearPagePrivate(page);
-		page_cache_release(page);
+		put_page(page);
 		set_page_private(page, 0);
 	}
 	block->ops = &btree_block_ops;
@@ -723,9 +723,9 @@ void freeseg(struct super_block *sb, u32 segno)
 			continue;
 		if (PagePrivate(page)) {
 			ClearPagePrivate(page);
-			page_cache_release(page);
+			put_page(page);
 		}
-		page_cache_release(page);
+		put_page(page);
 	}
 }
 
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index 54360293bcb5..5751082dba52 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -48,7 +48,7 @@ void emergency_read_end(struct page *page)
 	if (page == emergency_page)
 		mutex_unlock(&emergency_mutex);
 	else
-		page_cache_release(page);
+		put_page(page);
 }
 
 static void dump_segfile(struct super_block *sb)
@@ -206,7 +206,7 @@ static int write_one_sb(struct super_block *sb,
 	logfs_set_segment_erased(sb, segno, ec, 0);
 	logfs_write_ds(sb, ds, segno, ec);
 	err = super->s_devops->write_sb(sb, page);
-	page_cache_release(page);
+	put_page(page);
 	return err;
 }
 
@@ -366,24 +366,24 @@ static struct page *find_super_block(struct super_block *sb)
 		return NULL;
 	last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
 	if (!last || IS_ERR(last)) {
-		page_cache_release(first);
+		put_page(first);
 		return NULL;
 	}
 
 	if (!logfs_check_ds(page_address(first))) {
-		page_cache_release(last);
+		put_page(last);
 		return first;
 	}
 
 	/* First one didn't work, try the second superblock */
 	if (!logfs_check_ds(page_address(last))) {
-		page_cache_release(first);
+		put_page(first);
 		return last;
 	}
 
 	/* Neither worked, sorry folks */
-	page_cache_release(first);
-	page_cache_release(last);
+	put_page(first);
+	put_page(last);
 	return NULL;
 }
 
@@ -425,7 +425,7 @@ static int __logfs_read_sb(struct super_block *sb)
 	super->s_data_levels = ds->ds_data_levels;
 	super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
 		+ super->s_data_levels;
-	page_cache_release(page);
+	put_page(page);
 	return 0;
 }
 
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index d19ac258105a..33957c07cd11 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -28,7 +28,7 @@ const struct file_operations minix_dir_operations = {
 static inline void dir_put_page(struct page *page)
 {
 	kunmap(page);
-	page_cache_release(page);
+	put_page(page);
 }
 
 /*
@@ -38,10 +38,10 @@ static inline void dir_put_page(struct page *page)
 static unsigned
 minix_last_byte(struct inode *inode, unsigned long page_nr)
 {
-	unsigned last_byte = PAGE_CACHE_SIZE;
+	unsigned last_byte = PAGE_SIZE;
 
-	if (page_nr == (inode->i_size >> PAGE_CACHE_SHIFT))
-		last_byte = inode->i_size & (PAGE_CACHE_SIZE - 1);
+	if (page_nr == (inode->i_size >> PAGE_SHIFT))
+		last_byte = inode->i_size & (PAGE_SIZE - 1);
 	return last_byte;
 }
 
@@ -92,8 +92,8 @@ static int minix_readdir(struct file *file, struct dir_context *ctx)
 	if (pos >= inode->i_size)
 		return 0;
 
-	offset = pos & ~PAGE_CACHE_MASK;
-	n = pos >> PAGE_CACHE_SHIFT;
+	offset = pos & ~PAGE_MASK;
+	n = pos >> PAGE_SHIFT;
 
 	for ( ; n < npages; n++, offset = 0) {
 		char *p, *kaddr, *limit;
@@ -229,7 +229,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
 		lock_page(page);
 		kaddr = (char*)page_address(page);
 		dir_end = kaddr + minix_last_byte(dir, n);
-		limit = kaddr + PAGE_CACHE_SIZE - sbi->s_dirsize;
+		limit = kaddr + PAGE_SIZE - sbi->s_dirsize;
 		for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
 			de = (minix_dirent *)p;
 			de3 = (minix3_dirent *)p;
@@ -327,7 +327,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir)
 	}
 
 	kaddr = kmap_atomic(page);
-	memset(kaddr, 0, PAGE_CACHE_SIZE);
+	memset(kaddr, 0, PAGE_SIZE);
 
 	if (sbi->s_version == MINIX_V3) {
 		minix3_dirent *de3 = (minix3_dirent *)kaddr;
@@ -350,7 +350,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir)
 
 	err = dir_commit_chunk(page, 0, 2 * sbi->s_dirsize);
 fail:
-	page_cache_release(page);
+	put_page(page);
 	return err;
 }
 
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index a795a11e50c7..2887d1d95ce2 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -243,11 +243,11 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
 out_dir:
 	if (dir_de) {
 		kunmap(dir_page);
-		page_cache_release(dir_page);
+		put_page(dir_page);
 	}
 out_old:
 	kunmap(old_page);
-	page_cache_release(old_page);
+	put_page(old_page);
 out:
 	return err;
 }
diff --git a/fs/mpage.c b/fs/mpage.c
index 6bd9fd90964e..e7083bfbabb3 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -107,7 +107,7 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
 		 * don't make any buffers if there is only one buffer on
 		 * the page and the page just needs to be set up to date
 		 */
-		if (inode->i_blkbits == PAGE_CACHE_SHIFT && 
+		if (inode->i_blkbits == PAGE_SHIFT &&
 		    buffer_uptodate(bh)) {
 			SetPageUptodate(page);    
 			return;
@@ -145,7 +145,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 {
 	struct inode *inode = page->mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
-	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
 	const unsigned blocksize = 1 << blkbits;
 	sector_t block_in_file;
 	sector_t last_block;
@@ -162,7 +162,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 	if (page_has_buffers(page))
 		goto confused;
 
-	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+	block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
 	last_block = block_in_file + nr_pages * blocks_per_page;
 	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
 	if (last_block > last_block_in_file)
@@ -249,7 +249,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
 	}
 
 	if (first_hole != blocks_per_page) {
-		zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
+		zero_user_segment(page, first_hole << blkbits, PAGE_SIZE);
 		if (first_hole == 0) {
 			SetPageUptodate(page);
 			unlock_page(page);
@@ -380,7 +380,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 					&first_logical_block,
 					get_block, gfp);
 		}
-		page_cache_release(page);
+		put_page(page);
 	}
 	BUG_ON(!list_empty(pages));
 	if (bio)
@@ -472,7 +472,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
 	struct inode *inode = page->mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
 	unsigned long end_index;
-	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
+	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
 	sector_t last_block;
 	sector_t block_in_file;
 	sector_t blocks[MAX_BUF_PER_PAGE];
@@ -542,7 +542,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
 	 * The page has no buffers: map it to disk
 	 */
 	BUG_ON(!PageUptodate(page));
-	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
+	block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);
 	last_block = (i_size - 1) >> blkbits;
 	map_bh.b_page = page;
 	for (page_block = 0; page_block < blocks_per_page; ) {
@@ -574,7 +574,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
 	first_unmapped = page_block;
 
 page_is_mapped:
-	end_index = i_size >> PAGE_CACHE_SHIFT;
+	end_index = i_size >> PAGE_SHIFT;
 	if (page->index >= end_index) {
 		/*
 		 * The page straddles i_size.  It must be zeroed out on each
@@ -584,11 +584,11 @@ page_is_mapped:
 		 * is zeroed when mapped, and writes to that region are not
 		 * written out to the file."
 		 */
-		unsigned offset = i_size & (PAGE_CACHE_SIZE - 1);
+		unsigned offset = i_size & (PAGE_SIZE - 1);
 
 		if (page->index > end_index || !offset)
 			goto confused;
-		zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+		zero_user_segment(page, offset, PAGE_SIZE);
 	}
 
 	/*
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index b7f8eaeea5d8..bfdad003ee56 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -510,7 +510,7 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx)
 			kunmap(ctl.page);
 			SetPageUptodate(ctl.page);
 			unlock_page(ctl.page);
-			page_cache_release(ctl.page);
+			put_page(ctl.page);
 			ctl.page = NULL;
 		}
 		ctl.idx  = 0;
@@ -520,7 +520,7 @@ invalid_cache:
 	if (ctl.page) {
 		kunmap(ctl.page);
 		unlock_page(ctl.page);
-		page_cache_release(ctl.page);
+		put_page(ctl.page);
 		ctl.page = NULL;
 	}
 	ctl.cache = cache;
@@ -554,14 +554,14 @@ finished:
 		kunmap(ctl.page);
 		SetPageUptodate(ctl.page);
 		unlock_page(ctl.page);
-		page_cache_release(ctl.page);
+		put_page(ctl.page);
 	}
 	if (page) {
 		cache->head = ctl.head;
 		kunmap(page);
 		SetPageUptodate(page);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 out:
 	return result;
@@ -649,7 +649,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
 			kunmap(ctl.page);
 			SetPageUptodate(ctl.page);
 			unlock_page(ctl.page);
-			page_cache_release(ctl.page);
+			put_page(ctl.page);
 		}
 		ctl.cache = NULL;
 		ctl.idx  -= NCP_DIRCACHE_SIZE;
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 5233fbc1747a..17cfb743b5bf 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -191,7 +191,7 @@ struct ncp_cache_head {
 	int		eof;
 };
 
-#define NCP_DIRCACHE_SIZE	((int)(PAGE_CACHE_SIZE/sizeof(struct dentry *)))
+#define NCP_DIRCACHE_SIZE	((int)(PAGE_SIZE/sizeof(struct dentry *)))
 union ncp_dir_cache {
 	struct ncp_cache_head	head;
 	struct dentry		*dentry[NCP_DIRCACHE_SIZE];
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 02e4d87d2ed3..17a42e4eb872 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -231,7 +231,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
 	size_t bytes_left = header->args.count;
 	unsigned int pg_offset = header->args.pgbase, pg_len;
 	struct page **pages = header->args.pages;
-	int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+	int pg_index = header->args.pgbase >> PAGE_SHIFT;
 	const bool is_dio = (header->dreq != NULL);
 	struct blk_plug plug;
 	int i;
@@ -263,13 +263,13 @@ bl_read_pagelist(struct nfs_pgio_header *header)
 		}
 
 		if (is_dio) {
-			if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
-				pg_len = PAGE_CACHE_SIZE - pg_offset;
+			if (pg_offset + bytes_left > PAGE_SIZE)
+				pg_len = PAGE_SIZE - pg_offset;
 			else
 				pg_len = bytes_left;
 		} else {
 			BUG_ON(pg_offset != 0);
-			pg_len = PAGE_CACHE_SIZE;
+			pg_len = PAGE_SIZE;
 		}
 
 		if (is_hole(&be)) {
@@ -339,9 +339,9 @@ static void bl_write_cleanup(struct work_struct *work)
 
 	if (likely(!hdr->pnfs_error)) {
 		struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
-		u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
+		u64 start = hdr->args.offset & (loff_t)PAGE_MASK;
 		u64 end = (hdr->args.offset + hdr->args.count +
-			PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
+			PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
 
 		ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
 					(end - start) >> SECTOR_SHIFT);
@@ -373,7 +373,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	loff_t offset = header->args.offset;
 	size_t count = header->args.count;
 	struct page **pages = header->args.pages;
-	int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
+	int pg_index = header->args.pgbase >> PAGE_SHIFT;
 	unsigned int pg_len;
 	struct blk_plug plug;
 	int i;
@@ -392,7 +392,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 	blk_start_plug(&plug);
 
 	/* we always write out the whole page */
-	offset = offset & (loff_t)PAGE_CACHE_MASK;
+	offset = offset & (loff_t)PAGE_MASK;
 	isect = offset >> SECTOR_SHIFT;
 
 	for (i = pg_index; i < header->page_array.npages; i++) {
@@ -408,7 +408,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
 			extent_length = be.be_length - (isect - be.be_f_offset);
 		}
 
-		pg_len = PAGE_CACHE_SIZE;
+		pg_len = PAGE_SIZE;
 		bio = do_add_page_to_bio(bio, header->page_array.npages - i,
 					 WRITE, isect, pages[i], &map, &be,
 					 bl_end_io_write, par,
@@ -820,7 +820,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
 	pgoff_t end;
 
 	/* Optimize common case that writes from 0 to end of file */
-	end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
+	end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 	if (end != inode->i_mapping->nrpages) {
 		rcu_read_lock();
 		end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
@@ -828,9 +828,9 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
 	}
 
 	if (!end)
-		return i_size_read(inode) - (idx << PAGE_CACHE_SHIFT);
+		return i_size_read(inode) - (idx << PAGE_SHIFT);
 	else
-		return (end - idx) << PAGE_CACHE_SHIFT;
+		return (end - idx) << PAGE_SHIFT;
 }
 
 static void
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index bc21205309e0..18e6fd0b9506 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -40,8 +40,8 @@
 #include "../pnfs.h"
 #include "../netns.h"
 
-#define PAGE_CACHE_SECTORS (PAGE_CACHE_SIZE >> SECTOR_SHIFT)
-#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
+#define PAGE_CACHE_SECTORS (PAGE_SIZE >> SECTOR_SHIFT)
+#define PAGE_CACHE_SECTOR_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
 #define SECTOR_SIZE (1 << SECTOR_SHIFT)
 
 struct pnfs_block_dev;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index d6d5d2a48e83..0c96528db94a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -736,7 +736,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
 		server->rsize = max_rpc_payload;
 	if (server->rsize > NFS_MAX_FILE_IO_SIZE)
 		server->rsize = NFS_MAX_FILE_IO_SIZE;
-	server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	server->backing_dev_info.name = "nfs";
 	server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
@@ -745,13 +745,13 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
 		server->wsize = max_rpc_payload;
 	if (server->wsize > NFS_MAX_FILE_IO_SIZE)
 		server->wsize = NFS_MAX_FILE_IO_SIZE;
-	server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
 
 	server->dtsize = nfs_block_size(fsinfo->dtpref, NULL);
-	if (server->dtsize > PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES)
-		server->dtsize = PAGE_CACHE_SIZE * NFS_MAX_READDIR_PAGES;
+	if (server->dtsize > PAGE_SIZE * NFS_MAX_READDIR_PAGES)
+		server->dtsize = PAGE_SIZE * NFS_MAX_READDIR_PAGES;
 	if (server->dtsize > server->rsize)
 		server->dtsize = server->rsize;
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4bfa7d8bcade..adef506c5786 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -707,7 +707,7 @@ void cache_page_release(nfs_readdir_descriptor_t *desc)
 {
 	if (!desc->page->mapping)
 		nfs_readdir_clear_array(desc->page);
-	page_cache_release(desc->page);
+	put_page(desc->page);
 	desc->page = NULL;
 }
 
@@ -1923,7 +1923,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 		 * add_to_page_cache_lru() grabs an extra page refcount.
 		 * Drop it here to avoid leaking this page later.
 		 */
-		page_cache_release(page);
+		put_page(page);
 	} else
 		__free_page(page);
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 7a0cfd3266e5..c93826e4a8c6 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -269,7 +269,7 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
 {
 	unsigned int i;
 	for (i = 0; i < npages; i++)
-		page_cache_release(pages[i]);
+		put_page(pages[i]);
 }
 
 void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
@@ -1003,7 +1003,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 		      iov_iter_count(iter));
 
 	pos = iocb->ki_pos;
-	end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT;
+	end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
 
 	inode_lock(inode);
 
@@ -1013,7 +1013,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 
 	if (mapping->nrpages) {
 		result = invalidate_inode_pages2_range(mapping,
-					pos >> PAGE_CACHE_SHIFT, end);
+					pos >> PAGE_SHIFT, end);
 		if (result)
 			goto out_unlock;
 	}
@@ -1042,7 +1042,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 
 	if (mapping->nrpages) {
 		invalidate_inode_pages2_range(mapping,
-					      pos >> PAGE_CACHE_SHIFT, end);
+					      pos >> PAGE_SHIFT, end);
 	}
 
 	inode_unlock(inode);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 89bf093d342a..be01095b97ae 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -320,7 +320,7 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page,
 			loff_t pos, unsigned len)
 {
 	unsigned int pglen = nfs_page_length(page);
-	unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned int offset = pos & (PAGE_SIZE - 1);
 	unsigned int end = offset + len;
 
 	if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
@@ -351,7 +351,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	int ret;
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t index = pos >> PAGE_SHIFT;
 	struct page *page;
 	int once_thru = 0;
 
@@ -380,12 +380,12 @@ start:
 	ret = nfs_flush_incompatible(file, page);
 	if (ret) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	} else if (!once_thru &&
 		   nfs_want_read_modify_write(file, page, pos, len)) {
 		once_thru = 1;
 		ret = nfs_readpage(file, page);
-		page_cache_release(page);
+		put_page(page);
 		if (!ret)
 			goto start;
 	}
@@ -396,7 +396,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned copied,
 			struct page *page, void *fsdata)
 {
-	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned offset = pos & (PAGE_SIZE - 1);
 	struct nfs_open_context *ctx = nfs_file_open_context(file);
 	int status;
 
@@ -413,20 +413,20 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
 
 		if (pglen == 0) {
 			zero_user_segments(page, 0, offset,
-					end, PAGE_CACHE_SIZE);
+					end, PAGE_SIZE);
 			SetPageUptodate(page);
 		} else if (end >= pglen) {
-			zero_user_segment(page, end, PAGE_CACHE_SIZE);
+			zero_user_segment(page, end, PAGE_SIZE);
 			if (offset == 0)
 				SetPageUptodate(page);
 		} else
-			zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
+			zero_user_segment(page, pglen, PAGE_SIZE);
 	}
 
 	status = nfs_updatepage(file, page, offset, copied);
 
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	if (status < 0)
 		return status;
@@ -454,7 +454,7 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset,
 	dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %u, %u)\n",
 		 page, offset, length);
 
-	if (offset != 0 || length < PAGE_CACHE_SIZE)
+	if (offset != 0 || length < PAGE_SIZE)
 		return;
 	/* Cancel any unstarted writes on this page */
 	nfs_wb_page_cancel(page_file_mapping(page)->host, page);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 565f8135ae1f..f1d1d2c472e9 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -638,11 +638,11 @@ unsigned int nfs_page_length(struct page *page)
 
 	if (i_size > 0) {
 		pgoff_t page_index = page_file_index(page);
-		pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+		pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT;
 		if (page_index < end_index)
-			return PAGE_CACHE_SIZE;
+			return PAGE_SIZE;
 		if (page_index == end_index)
-			return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
+			return ((i_size - 1) & ~PAGE_MASK) + 1;
 	}
 	return 0;
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4e4441216804..88474a4fc669 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5001,7 +5001,7 @@ static int decode_space_limit(struct xdr_stream *xdr,
 		blocksize = be32_to_cpup(p);
 		maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
 	}
-	maxsize >>= PAGE_CACHE_SHIFT;
+	maxsize >>= PAGE_SHIFT;
 	*pagemod_limit = min_t(u64, maxsize, ULONG_MAX);
 	return 0;
 out_overflow:
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 9aebffb40505..049c1b1f2932 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -486,7 +486,7 @@ static void __r4w_put_page(void *priv, struct page *page)
 	dprintk("%s: index=0x%lx\n", __func__,
 		(page == ZERO_PAGE(0)) ? -1UL : page->index);
 	if (ZERO_PAGE(0) != page)
-		page_cache_release(page);
+		put_page(page);
 	return;
 }
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 8ce4f61cbaa5..1f6db4231057 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -342,7 +342,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
 	 * update_nfs_request below if the region is not locked. */
 	req->wb_page    = page;
 	req->wb_index	= page_file_index(page);
-	page_cache_get(page);
+	get_page(page);
 	req->wb_offset  = offset;
 	req->wb_pgbase	= offset;
 	req->wb_bytes   = count;
@@ -392,7 +392,7 @@ static void nfs_clear_request(struct nfs_page *req)
 	struct nfs_lock_context *l_ctx = req->wb_lock_context;
 
 	if (page != NULL) {
-		page_cache_release(page);
+		put_page(page);
 		req->wb_page = NULL;
 	}
 	if (l_ctx != NULL) {
@@ -904,7 +904,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
 				return false;
 		} else {
 			if (req->wb_pgbase != 0 ||
-			    prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
+			    prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE)
 				return false;
 		}
 	}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2fa483e6dbe2..89a5ef4df08a 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -841,7 +841,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 
 		i_size = i_size_read(ino);
 
-		lgp->args.minlength = PAGE_CACHE_SIZE;
+		lgp->args.minlength = PAGE_SIZE;
 		if (lgp->args.minlength > range->length)
 			lgp->args.minlength = range->length;
 		if (range->iomode == IOMODE_READ) {
@@ -1618,13 +1618,13 @@ lookup_again:
 		spin_unlock(&clp->cl_lock);
 	}
 
-	pg_offset = arg.offset & ~PAGE_CACHE_MASK;
+	pg_offset = arg.offset & ~PAGE_MASK;
 	if (pg_offset) {
 		arg.offset -= pg_offset;
 		arg.length += pg_offset;
 	}
 	if (arg.length != NFS4_MAX_UINT64)
-		arg.length = PAGE_CACHE_ALIGN(arg.length);
+		arg.length = PAGE_ALIGN(arg.length);
 
 	lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
 	atomic_dec(&lo->plh_outstanding);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index eb31e23e7def..6776d7a7839e 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -46,7 +46,7 @@ static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
 static
 int nfs_return_empty_page(struct page *page)
 {
-	zero_user(page, 0, PAGE_CACHE_SIZE);
+	zero_user(page, 0, PAGE_SIZE);
 	SetPageUptodate(page);
 	unlock_page(page);
 	return 0;
@@ -118,8 +118,8 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 		unlock_page(page);
 		return PTR_ERR(new);
 	}
-	if (len < PAGE_CACHE_SIZE)
-		zero_user_segment(page, len, PAGE_CACHE_SIZE);
+	if (len < PAGE_SIZE)
+		zero_user_segment(page, len, PAGE_SIZE);
 
 	nfs_pageio_init_read(&pgio, inode, false,
 			     &nfs_async_read_completion_ops);
@@ -295,7 +295,7 @@ int nfs_readpage(struct file *file, struct page *page)
 	int		error;
 
 	dprintk("NFS: nfs_readpage (%p %ld@%lu)\n",
-		page, PAGE_CACHE_SIZE, page_file_index(page));
+		page, PAGE_SIZE, page_file_index(page));
 	nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
 	nfs_add_stats(inode, NFSIOS_READPAGES, 1);
 
@@ -361,8 +361,8 @@ readpage_async_filler(void *data, struct page *page)
 	if (IS_ERR(new))
 		goto out_error;
 
-	if (len < PAGE_CACHE_SIZE)
-		zero_user_segment(page, len, PAGE_CACHE_SIZE);
+	if (len < PAGE_SIZE)
+		zero_user_segment(page, len, PAGE_SIZE);
 	if (!nfs_pageio_add_request(desc->pgio, new)) {
 		nfs_list_remove_request(new);
 		nfs_readpage_release(new);
@@ -424,8 +424,8 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
 
 	pgm = &pgio.pg_mirrors[0];
 	NFS_I(inode)->read_io += pgm->pg_bytes_written;
-	npages = (pgm->pg_bytes_written + PAGE_CACHE_SIZE - 1) >>
-		 PAGE_CACHE_SHIFT;
+	npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >>
+		 PAGE_SHIFT;
 	nfs_add_stats(inode, NFSIOS_READPAGES, npages);
 read_complete:
 	put_nfs_open_context(desc.ctx);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5754835a2886..5f4fd53e5764 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -150,7 +150,7 @@ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int c
 
 	spin_lock(&inode->i_lock);
 	i_size = i_size_read(inode);
-	end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+	end_index = (i_size - 1) >> PAGE_SHIFT;
 	if (i_size > 0 && page_file_index(page) < end_index)
 		goto out;
 	end = page_file_offset(page) + ((loff_t)offset+count);
@@ -1942,7 +1942,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder)
 {
 	loff_t range_start = page_file_offset(page);
-	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
+	loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1);
 	struct writeback_control wbc = {
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = 0,
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 27f75bcbeb30..a9fb3636c142 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -458,7 +458,7 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
 	struct buffer_head *pbh;
 	__u64 key;
 
-	key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
+	key = page_index(bh->b_page) << (PAGE_SHIFT -
 					 bmap->b_inode->i_blkbits);
 	for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page)
 		key++;
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index a35ae35e6932..e0c9daf9aa22 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -62,7 +62,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
 	set_buffer_uptodate(bh);
 
 	unlock_page(bh->b_page);
-	page_cache_release(bh->b_page);
+	put_page(bh->b_page);
 	return bh;
 }
 
@@ -128,7 +128,7 @@ found:
 
 out_locked:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	return err;
 }
 
@@ -146,7 +146,7 @@ void nilfs_btnode_delete(struct buffer_head *bh)
 	pgoff_t index = page_index(page);
 	int still_dirty;
 
-	page_cache_get(page);
+	get_page(page);
 	lock_page(page);
 	wait_on_page_writeback(page);
 
@@ -154,7 +154,7 @@ void nilfs_btnode_delete(struct buffer_head *bh)
 	still_dirty = PageDirty(page);
 	mapping = page->mapping;
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	if (!still_dirty && mapping)
 		invalidate_inode_pages2_range(mapping, index, index);
@@ -181,7 +181,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
 	obh = ctxt->bh;
 	ctxt->newbh = NULL;
 
-	if (inode->i_blkbits == PAGE_CACHE_SHIFT) {
+	if (inode->i_blkbits == PAGE_SHIFT) {
 		lock_page(obh->b_page);
 		/*
 		 * We cannot call radix_tree_preload for the kernels older
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 6b8b92b19cec..e08f064e4bd7 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -58,7 +58,7 @@ static inline unsigned nilfs_chunk_size(struct inode *inode)
 static inline void nilfs_put_page(struct page *page)
 {
 	kunmap(page);
-	page_cache_release(page);
+	put_page(page);
 }
 
 /*
@@ -69,9 +69,9 @@ static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
 {
 	unsigned last_byte = inode->i_size;
 
-	last_byte -= page_nr << PAGE_CACHE_SHIFT;
-	if (last_byte > PAGE_CACHE_SIZE)
-		last_byte = PAGE_CACHE_SIZE;
+	last_byte -= page_nr << PAGE_SHIFT;
+	if (last_byte > PAGE_SIZE)
+		last_byte = PAGE_SIZE;
 	return last_byte;
 }
 
@@ -109,12 +109,12 @@ static void nilfs_check_page(struct page *page)
 	unsigned chunk_size = nilfs_chunk_size(dir);
 	char *kaddr = page_address(page);
 	unsigned offs, rec_len;
-	unsigned limit = PAGE_CACHE_SIZE;
+	unsigned limit = PAGE_SIZE;
 	struct nilfs_dir_entry *p;
 	char *error;
 
-	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
-		limit = dir->i_size & ~PAGE_CACHE_MASK;
+	if ((dir->i_size >> PAGE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_MASK;
 		if (limit & (chunk_size - 1))
 			goto Ebadsize;
 		if (!limit)
@@ -161,7 +161,7 @@ Espan:
 bad_entry:
 	nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - "
 		    "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
-		    dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		    dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
 		    (unsigned long) le64_to_cpu(p->inode),
 		    rec_len, p->name_len);
 	goto fail;
@@ -170,7 +170,7 @@ Eend:
 	nilfs_error(sb, "nilfs_check_page",
 		    "entry in directory #%lu spans the page boundary"
 		    "offset=%lu, inode=%lu",
-		    dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		    dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
 		    (unsigned long) le64_to_cpu(p->inode));
 fail:
 	SetPageChecked(page);
@@ -256,8 +256,8 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
 	loff_t pos = ctx->pos;
 	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	unsigned int offset = pos & ~PAGE_CACHE_MASK;
-	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned int offset = pos & ~PAGE_MASK;
+	unsigned long n = pos >> PAGE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 /*	unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
 
@@ -272,7 +272,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
 		if (IS_ERR(page)) {
 			nilfs_error(sb, __func__, "bad page in #%lu",
 				    inode->i_ino);
-			ctx->pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_SIZE - offset;
 			return -EIO;
 		}
 		kaddr = page_address(page);
@@ -361,7 +361,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
 		if (++n >= npages)
 			n = 0;
 		/* next page is past the blocks we've got */
-		if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
+		if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) {
 			nilfs_error(dir->i_sb, __func__,
 			       "dir %lu size %lld exceeds block count %llu",
 			       dir->i_ino, dir->i_size,
@@ -401,7 +401,7 @@ ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
 	if (de) {
 		res = le64_to_cpu(de->inode);
 		kunmap(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	return res;
 }
@@ -460,7 +460,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
 		kaddr = page_address(page);
 		dir_end = kaddr + nilfs_last_byte(dir, n);
 		de = (struct nilfs_dir_entry *)kaddr;
-		kaddr += PAGE_CACHE_SIZE - reclen;
+		kaddr += PAGE_SIZE - reclen;
 		while ((char *)de <= kaddr) {
 			if ((char *)de == dir_end) {
 				/* We hit i_size */
@@ -603,7 +603,7 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
 	kunmap_atomic(kaddr);
 	nilfs_commit_chunk(page, mapping, 0, chunk_size);
 fail:
-	page_cache_release(page);
+	put_page(page);
 	return err;
 }
 
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 748ca238915a..0224b7826ace 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -115,7 +115,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
 
  failed:
 	unlock_page(bh->b_page);
-	page_cache_release(bh->b_page);
+	put_page(bh->b_page);
 	return err;
 }
 
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 21a1e2e0d92f..534631358b13 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -249,7 +249,7 @@ static int nilfs_set_page_dirty(struct page *page)
 		if (nr_dirty)
 			nilfs_set_file_dirty(inode, nr_dirty);
 	} else if (ret) {
-		unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
 
 		nilfs_set_file_dirty(inode, nr_dirty);
 	}
@@ -291,7 +291,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
 			   struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;
-	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned start = pos & (PAGE_SIZE - 1);
 	unsigned nr_dirty;
 	int err;
 
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 1125f40233ff..f6982b9153d5 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -110,7 +110,7 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
 
  failed_bh:
 	unlock_page(bh->b_page);
-	page_cache_release(bh->b_page);
+	put_page(bh->b_page);
 	brelse(bh);
 
  failed_unlock:
@@ -170,7 +170,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
 
  failed_bh:
 	unlock_page(bh->b_page);
-	page_cache_release(bh->b_page);
+	put_page(bh->b_page);
 	brelse(bh);
  failed:
 	return ret;
@@ -363,7 +363,7 @@ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
 int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
 {
 	pgoff_t index = (pgoff_t)block >>
-		(PAGE_CACHE_SHIFT - inode->i_blkbits);
+		(PAGE_SHIFT - inode->i_blkbits);
 	struct page *page;
 	unsigned long first_block;
 	int ret = 0;
@@ -376,7 +376,7 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
 	wait_on_page_writeback(page);
 
 	first_block = (unsigned long)index <<
-		(PAGE_CACHE_SHIFT - inode->i_blkbits);
+		(PAGE_SHIFT - inode->i_blkbits);
 	if (page_has_buffers(page)) {
 		struct buffer_head *bh;
 
@@ -385,7 +385,7 @@ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
 	}
 	still_dirty = PageDirty(page);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	if (still_dirty ||
 	    invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
@@ -578,7 +578,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh)
 	}
 
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	return 0;
 }
 
@@ -597,7 +597,7 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh)
 			bh_frozen = nilfs_page_get_nth_block(page, n);
 		}
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	return bh_frozen;
 }
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 7ccdb961eea9..151bc19d47c0 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -431,11 +431,11 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 out_dir:
 	if (dir_de) {
 		kunmap(dir_page);
-		page_cache_release(dir_page);
+		put_page(dir_page);
 	}
 out_old:
 	kunmap(old_page);
-	page_cache_release(old_page);
+	put_page(old_page);
 out:
 	nilfs_transaction_abort(old_dir->i_sb);
 	return err;
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index c20df77eff99..489391561cda 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -50,7 +50,7 @@ __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
 	if (!page_has_buffers(page))
 		create_empty_buffers(page, 1 << blkbits, b_state);
 
-	first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits);
+	first_block = (unsigned long)index << (PAGE_SHIFT - blkbits);
 	bh = nilfs_page_get_nth_block(page, block - first_block);
 
 	touch_buffer(bh);
@@ -64,7 +64,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
 				      unsigned long b_state)
 {
 	int blkbits = inode->i_blkbits;
-	pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
+	pgoff_t index = blkoff >> (PAGE_SHIFT - blkbits);
 	struct page *page;
 	struct buffer_head *bh;
 
@@ -75,7 +75,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
 	bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
 	if (unlikely(!bh)) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		return NULL;
 	}
 	return bh;
@@ -288,7 +288,7 @@ repeat:
 		__set_page_dirty_nobuffers(dpage);
 
 		unlock_page(dpage);
-		page_cache_release(dpage);
+		put_page(dpage);
 		unlock_page(page);
 	}
 	pagevec_release(&pvec);
@@ -333,7 +333,7 @@ repeat:
 			WARN_ON(PageDirty(dpage));
 			nilfs_copy_page(dpage, page, 0);
 			unlock_page(dpage);
-			page_cache_release(dpage);
+			put_page(dpage);
 		} else {
 			struct page *page2;
 
@@ -350,7 +350,7 @@ repeat:
 			if (unlikely(err < 0)) {
 				WARN_ON(err == -EEXIST);
 				page->mapping = NULL;
-				page_cache_release(page); /* for cache */
+				put_page(page); /* for cache */
 			} else {
 				page->mapping = dmap;
 				dmap->nrpages++;
@@ -523,8 +523,8 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
 	if (inode->i_mapping->nrpages == 0)
 		return 0;
 
-	index = start_blk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	nblocks_in_page = 1U << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	index = start_blk >> (PAGE_SHIFT - inode->i_blkbits);
+	nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits);
 
 	pagevec_init(&pvec, 0);
 
@@ -537,7 +537,7 @@ repeat:
 	if (length > 0 && pvec.pages[0]->index > index)
 		goto out;
 
-	b = pvec.pages[0]->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	b = pvec.pages[0]->index << (PAGE_SHIFT - inode->i_blkbits);
 	i = 0;
 	do {
 		page = pvec.pages[i];
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 9b4f205d1173..5afa77fadc11 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -544,14 +544,14 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
 				blocksize, page, NULL);
 
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 
 		(*nr_salvaged_blocks)++;
 		goto next;
 
  failed_page:
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 
  failed_inode:
 		printk(KERN_WARNING
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 3b65adaae7e4..4317f72568e6 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2070,7 +2070,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 			goto failed_to_write;
 
 		if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE ||
-		    nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
+		    nilfs->ns_blocksize_bits != PAGE_SHIFT) {
 			/*
 			 * At this point, we avoid double buffering
 			 * for blocksize < pagesize because page dirty
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 7521e11db728..a474e7ef92ea 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -74,7 +74,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 
 		set_buffer_uptodate(bh);
 
-		file_ofs = ((s64)page->index << PAGE_CACHE_SHIFT) +
+		file_ofs = ((s64)page->index << PAGE_SHIFT) +
 				bh_offset(bh);
 		read_lock_irqsave(&ni->size_lock, flags);
 		init_size = ni->initialized_size;
@@ -142,7 +142,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		u32 rec_size;
 
 		rec_size = ni->itype.index.block_size;
-		recs = PAGE_CACHE_SIZE / rec_size;
+		recs = PAGE_SIZE / rec_size;
 		/* Should have been verified before we got here... */
 		BUG_ON(!recs);
 		local_irq_save(flags);
@@ -229,7 +229,7 @@ static int ntfs_read_block(struct page *page)
 	 * fully truncated, truncate will throw it away as soon as we unlock
 	 * it so no need to worry what we do with it.
 	 */
-	iblock = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
+	iblock = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
 	read_lock_irqsave(&ni->size_lock, flags);
 	lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
 	init_size = ni->initialized_size;
@@ -412,9 +412,9 @@ retry_readpage:
 	vi = page->mapping->host;
 	i_size = i_size_read(vi);
 	/* Is the page fully outside i_size? (truncate in progress) */
-	if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
-			PAGE_CACHE_SHIFT)) {
-		zero_user(page, 0, PAGE_CACHE_SIZE);
+	if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
+			PAGE_SHIFT)) {
+		zero_user(page, 0, PAGE_SIZE);
 		ntfs_debug("Read outside i_size - truncated?");
 		goto done;
 	}
@@ -463,7 +463,7 @@ retry_readpage:
 	 * ok to ignore the compressed flag here.
 	 */
 	if (unlikely(page->index > 0)) {
-		zero_user(page, 0, PAGE_CACHE_SIZE);
+		zero_user(page, 0, PAGE_SIZE);
 		goto done;
 	}
 	if (!NInoAttr(ni))
@@ -509,7 +509,7 @@ retry_readpage:
 			le16_to_cpu(ctx->attr->data.resident.value_offset),
 			attr_len);
 	/* Zero the remainder of the page. */
-	memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+	memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
 	flush_dcache_page(page);
 	kunmap_atomic(addr);
 put_unm_err_out:
@@ -599,7 +599,7 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
 	/* NOTE: Different naming scheme to ntfs_read_block()! */
 
 	/* The first block in the page. */
-	block = (s64)page->index << (PAGE_CACHE_SHIFT - blocksize_bits);
+	block = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
 
 	read_lock_irqsave(&ni->size_lock, flags);
 	i_size = i_size_read(vi);
@@ -925,7 +925,7 @@ static int ntfs_write_mst_block(struct page *page,
 	ntfs_volume *vol = ni->vol;
 	u8 *kaddr;
 	unsigned int rec_size = ni->itype.index.block_size;
-	ntfs_inode *locked_nis[PAGE_CACHE_SIZE / rec_size];
+	ntfs_inode *locked_nis[PAGE_SIZE / rec_size];
 	struct buffer_head *bh, *head, *tbh, *rec_start_bh;
 	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
 	runlist_element *rl;
@@ -949,7 +949,7 @@ static int ntfs_write_mst_block(struct page *page,
 			(NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
 	bh_size = vol->sb->s_blocksize;
 	bh_size_bits = vol->sb->s_blocksize_bits;
-	max_bhs = PAGE_CACHE_SIZE / bh_size;
+	max_bhs = PAGE_SIZE / bh_size;
 	BUG_ON(!max_bhs);
 	BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
 
@@ -961,13 +961,13 @@ static int ntfs_write_mst_block(struct page *page,
 	BUG_ON(!bh);
 
 	rec_size_bits = ni->itype.index.block_size_bits;
-	BUG_ON(!(PAGE_CACHE_SIZE >> rec_size_bits));
+	BUG_ON(!(PAGE_SIZE >> rec_size_bits));
 	bhs_per_rec = rec_size >> bh_size_bits;
 	BUG_ON(!bhs_per_rec);
 
 	/* The first block in the page. */
 	rec_block = block = (sector_t)page->index <<
-			(PAGE_CACHE_SHIFT - bh_size_bits);
+			(PAGE_SHIFT - bh_size_bits);
 
 	/* The first out of bounds block for the data size. */
 	dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;
@@ -1133,7 +1133,7 @@ lock_retry_remap:
 			unsigned long mft_no;
 
 			/* Get the mft record number. */
-			mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
+			mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
 					>> rec_size_bits;
 			/* Check whether to write this mft record. */
 			tni = NULL;
@@ -1249,7 +1249,7 @@ do_mirror:
 				continue;
 			ofs = bh_offset(tbh);
 			/* Get the mft record number. */
-			mft_no = (((s64)page->index << PAGE_CACHE_SHIFT) + ofs)
+			mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
 					>> rec_size_bits;
 			if (mft_no < vol->mftmirr_size)
 				ntfs_sync_mft_mirror(vol, mft_no,
@@ -1300,7 +1300,7 @@ done:
 		 * Set page error if there is only one ntfs record in the page.
 		 * Otherwise we would loose per-record granularity.
 		 */
-		if (ni->itype.index.block_size == PAGE_CACHE_SIZE)
+		if (ni->itype.index.block_size == PAGE_SIZE)
 			SetPageError(page);
 		NVolSetErrors(vol);
 	}
@@ -1308,7 +1308,7 @@ done:
 		ntfs_debug("Page still contains one or more dirty ntfs "
 				"records.  Redirtying the page starting at "
 				"record 0x%lx.", page->index <<
-				(PAGE_CACHE_SHIFT - rec_size_bits));
+				(PAGE_SHIFT - rec_size_bits));
 		redirty_page_for_writepage(wbc, page);
 		unlock_page(page);
 	} else {
@@ -1365,13 +1365,13 @@ retry_writepage:
 	BUG_ON(!PageLocked(page));
 	i_size = i_size_read(vi);
 	/* Is the page fully outside i_size? (truncate in progress) */
-	if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >>
-			PAGE_CACHE_SHIFT)) {
+	if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
+			PAGE_SHIFT)) {
 		/*
 		 * The page may have dirty, unmapped buffers.  Make them
 		 * freeable here, so the page does not leak.
 		 */
-		block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+		block_invalidatepage(page, 0, PAGE_SIZE);
 		unlock_page(page);
 		ntfs_debug("Write outside i_size - truncated?");
 		return 0;
@@ -1414,10 +1414,10 @@ retry_writepage:
 	/* NInoNonResident() == NInoIndexAllocPresent() */
 	if (NInoNonResident(ni)) {
 		/* We have to zero every time due to mmap-at-end-of-file. */
-		if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) {
+		if (page->index >= (i_size >> PAGE_SHIFT)) {
 			/* The page straddles i_size. */
-			unsigned int ofs = i_size & ~PAGE_CACHE_MASK;
-			zero_user_segment(page, ofs, PAGE_CACHE_SIZE);
+			unsigned int ofs = i_size & ~PAGE_MASK;
+			zero_user_segment(page, ofs, PAGE_SIZE);
 		}
 		/* Handle mst protected attributes. */
 		if (NInoMstProtected(ni))
@@ -1500,7 +1500,7 @@ retry_writepage:
 			le16_to_cpu(ctx->attr->data.resident.value_offset),
 			addr, attr_len);
 	/* Zero out of bounds area in the page cache page. */
-	memset(addr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+	memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
 	kunmap_atomic(addr);
 	flush_dcache_page(page);
 	flush_dcache_mft_record_page(ctx->ntfs_ino);
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
index caecc58f529c..37cd7e45dcbc 100644
--- a/fs/ntfs/aops.h
+++ b/fs/ntfs/aops.h
@@ -40,7 +40,7 @@
 static inline void ntfs_unmap_page(struct page *page)
 {
 	kunmap(page);
-	page_cache_release(page);
+	put_page(page);
 }
 
 /**
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 250ed5b20c8f..44a39a099b54 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -152,7 +152,7 @@ int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
 			if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino !=
 					old_ctx.base_ntfs_ino) {
 				put_this_page = old_ctx.ntfs_ino->page;
-				page_cache_get(put_this_page);
+				get_page(put_this_page);
 			}
 			/*
 			 * Reinitialize the search context so we can lookup the
@@ -275,7 +275,7 @@ retry_map:
 		 * the pieces anyway.
 		 */
 		if (put_this_page)
-			page_cache_release(put_this_page);
+			put_page(put_this_page);
 	}
 	return err;
 }
@@ -1660,7 +1660,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
 		memcpy(kaddr, (u8*)a +
 				le16_to_cpu(a->data.resident.value_offset),
 				attr_size);
-		memset(kaddr + attr_size, 0, PAGE_CACHE_SIZE - attr_size);
+		memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size);
 		kunmap_atomic(kaddr);
 		flush_dcache_page(page);
 		SetPageUptodate(page);
@@ -1748,7 +1748,7 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
 	if (page) {
 		set_page_dirty(page);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	ntfs_debug("Done.");
 	return 0;
@@ -1835,7 +1835,7 @@ rl_err_out:
 		ntfs_free(rl);
 page_err_out:
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	if (err == -EINVAL)
 		err = -EIO;
@@ -2513,17 +2513,17 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 	BUG_ON(NInoEncrypted(ni));
 	mapping = VFS_I(ni)->i_mapping;
 	/* Work out the starting index and page offset. */
-	idx = ofs >> PAGE_CACHE_SHIFT;
-	start_ofs = ofs & ~PAGE_CACHE_MASK;
+	idx = ofs >> PAGE_SHIFT;
+	start_ofs = ofs & ~PAGE_MASK;
 	/* Work out the ending index and page offset. */
 	end = ofs + cnt;
-	end_ofs = end & ~PAGE_CACHE_MASK;
+	end_ofs = end & ~PAGE_MASK;
 	/* If the end is outside the inode size return -ESPIPE. */
 	if (unlikely(end > i_size_read(VFS_I(ni)))) {
 		ntfs_error(vol->sb, "Request exceeds end of attribute.");
 		return -ESPIPE;
 	}
-	end >>= PAGE_CACHE_SHIFT;
+	end >>= PAGE_SHIFT;
 	/* If there is a first partial page, need to do it the slow way. */
 	if (start_ofs) {
 		page = read_mapping_page(mapping, idx, NULL);
@@ -2536,7 +2536,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 		 * If the last page is the same as the first page, need to
 		 * limit the write to the end offset.
 		 */
-		size = PAGE_CACHE_SIZE;
+		size = PAGE_SIZE;
 		if (idx == end)
 			size = end_ofs;
 		kaddr = kmap_atomic(page);
@@ -2544,7 +2544,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 		flush_dcache_page(page);
 		kunmap_atomic(kaddr);
 		set_page_dirty(page);
-		page_cache_release(page);
+		put_page(page);
 		balance_dirty_pages_ratelimited(mapping);
 		cond_resched();
 		if (idx == end)
@@ -2561,7 +2561,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 			return -ENOMEM;
 		}
 		kaddr = kmap_atomic(page);
-		memset(kaddr, val, PAGE_CACHE_SIZE);
+		memset(kaddr, val, PAGE_SIZE);
 		flush_dcache_page(page);
 		kunmap_atomic(kaddr);
 		/*
@@ -2585,7 +2585,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 		set_page_dirty(page);
 		/* Finally unlock and release the page. */
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		balance_dirty_pages_ratelimited(mapping);
 		cond_resched();
 	}
@@ -2602,7 +2602,7 @@ int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
 		flush_dcache_page(page);
 		kunmap_atomic(kaddr);
 		set_page_dirty(page);
-		page_cache_release(page);
+		put_page(page);
 		balance_dirty_pages_ratelimited(mapping);
 		cond_resched();
 	}
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
index 0809cf876098..ec130c588d2b 100644
--- a/fs/ntfs/bitmap.c
+++ b/fs/ntfs/bitmap.c
@@ -67,8 +67,8 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
 	 * Calculate the indices for the pages containing the first and last
 	 * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively.
 	 */
-	index = start_bit >> (3 + PAGE_CACHE_SHIFT);
-	end_index = (start_bit + cnt - 1) >> (3 + PAGE_CACHE_SHIFT);
+	index = start_bit >> (3 + PAGE_SHIFT);
+	end_index = (start_bit + cnt - 1) >> (3 + PAGE_SHIFT);
 
 	/* Get the page containing the first bit (@start_bit). */
 	mapping = vi->i_mapping;
@@ -82,7 +82,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
 	kaddr = page_address(page);
 
 	/* Set @pos to the position of the byte containing @start_bit. */
-	pos = (start_bit >> 3) & ~PAGE_CACHE_MASK;
+	pos = (start_bit >> 3) & ~PAGE_MASK;
 
 	/* Calculate the position of @start_bit in the first byte. */
 	bit = start_bit & 7;
@@ -108,7 +108,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
 	 * Depending on @value, modify all remaining whole bytes in the page up
 	 * to @cnt.
 	 */
-	len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE - pos);
+	len = min_t(s64, cnt >> 3, PAGE_SIZE - pos);
 	memset(kaddr + pos, value ? 0xff : 0, len);
 	cnt -= len << 3;
 
@@ -132,7 +132,7 @@ int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
 		 * Depending on @value, modify all remaining whole bytes in the
 		 * page up to @cnt.
 		 */
-		len = min_t(s64, cnt >> 3, PAGE_CACHE_SIZE);
+		len = min_t(s64, cnt >> 3, PAGE_SIZE);
 		memset(kaddr, value ? 0xff : 0, len);
 		cnt -= len << 3;
 	}
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index f82498c35e78..b6074a56661b 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -104,7 +104,7 @@ static void zero_partial_compressed_page(struct page *page,
 	unsigned int kp_ofs;
 
 	ntfs_debug("Zeroing page region outside initialized size.");
-	if (((s64)page->index << PAGE_CACHE_SHIFT) >= initialized_size) {
+	if (((s64)page->index << PAGE_SHIFT) >= initialized_size) {
 		/*
 		 * FIXME: Using clear_page() will become wrong when we get
 		 * PAGE_CACHE_SIZE != PAGE_SIZE but for now there is no problem.
@@ -112,8 +112,8 @@ static void zero_partial_compressed_page(struct page *page,
 		clear_page(kp);
 		return;
 	}
-	kp_ofs = initialized_size & ~PAGE_CACHE_MASK;
-	memset(kp + kp_ofs, 0, PAGE_CACHE_SIZE - kp_ofs);
+	kp_ofs = initialized_size & ~PAGE_MASK;
+	memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs);
 	return;
 }
 
@@ -123,7 +123,7 @@ static void zero_partial_compressed_page(struct page *page,
 static inline void handle_bounds_compressed_page(struct page *page,
 		const loff_t i_size, const s64 initialized_size)
 {
-	if ((page->index >= (initialized_size >> PAGE_CACHE_SHIFT)) &&
+	if ((page->index >= (initialized_size >> PAGE_SHIFT)) &&
 			(initialized_size < i_size))
 		zero_partial_compressed_page(page, initialized_size);
 	return;
@@ -241,7 +241,7 @@ return_error:
 				if (di == xpage)
 					*xpage_done = 1;
 				else
-					page_cache_release(dp);
+					put_page(dp);
 				dest_pages[di] = NULL;
 			}
 		}
@@ -274,7 +274,7 @@ return_error:
 		cb = cb_sb_end;
 
 		/* Advance destination position to next sub-block. */
-		*dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_CACHE_MASK;
+		*dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK;
 		if (!*dest_ofs && (++*dest_index > dest_max_index))
 			goto return_overflow;
 		goto do_next_sb;
@@ -301,7 +301,7 @@ return_error:
 
 		/* Advance destination position to next sub-block. */
 		*dest_ofs += NTFS_SB_SIZE;
-		if (!(*dest_ofs &= ~PAGE_CACHE_MASK)) {
+		if (!(*dest_ofs &= ~PAGE_MASK)) {
 finalize_page:
 			/*
 			 * First stage: add current page index to array of
@@ -335,7 +335,7 @@ do_next_tag:
 			*dest_ofs += nr_bytes;
 		}
 		/* We have finished the current sub-block. */
-		if (!(*dest_ofs &= ~PAGE_CACHE_MASK))
+		if (!(*dest_ofs &= ~PAGE_MASK))
 			goto finalize_page;
 		goto do_next_sb;
 	}
@@ -498,13 +498,13 @@ int ntfs_read_compressed_block(struct page *page)
 	VCN vcn;
 	LCN lcn;
 	/* The first wanted vcn (minimum alignment is PAGE_CACHE_SIZE). */
-	VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >>
+	VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >>
 			vol->cluster_size_bits;
 	/*
 	 * The first vcn after the last wanted vcn (minimum alignment is again
 	 * PAGE_CACHE_SIZE.
 	 */
-	VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1)
+	VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1)
 			& ~cb_size_mask) >> vol->cluster_size_bits;
 	/* Number of compression blocks (cbs) in the wanted vcn range. */
 	unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits
@@ -515,7 +515,7 @@ int ntfs_read_compressed_block(struct page *page)
 	 * guarantees of start_vcn and end_vcn, no need to round up here.
 	 */
 	unsigned int nr_pages = (end_vcn - start_vcn) <<
-			vol->cluster_size_bits >> PAGE_CACHE_SHIFT;
+			vol->cluster_size_bits >> PAGE_SHIFT;
 	unsigned int xpage, max_page, cur_page, cur_ofs, i;
 	unsigned int cb_clusters, cb_max_ofs;
 	int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0;
@@ -549,7 +549,7 @@ int ntfs_read_compressed_block(struct page *page)
 	 * We have already been given one page, this is the one we must do.
 	 * Once again, the alignment guarantees keep it simple.
 	 */
-	offset = start_vcn << vol->cluster_size_bits >> PAGE_CACHE_SHIFT;
+	offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT;
 	xpage = index - offset;
 	pages[xpage] = page;
 	/*
@@ -560,13 +560,13 @@ int ntfs_read_compressed_block(struct page *page)
 	i_size = i_size_read(VFS_I(ni));
 	initialized_size = ni->initialized_size;
 	read_unlock_irqrestore(&ni->size_lock, flags);
-	max_page = ((i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+	max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) -
 			offset;
 	/* Is the page fully outside i_size? (truncate in progress) */
 	if (xpage >= max_page) {
 		kfree(bhs);
 		kfree(pages);
-		zero_user(page, 0, PAGE_CACHE_SIZE);
+		zero_user(page, 0, PAGE_SIZE);
 		ntfs_debug("Compressed read outside i_size - truncated?");
 		SetPageUptodate(page);
 		unlock_page(page);
@@ -591,7 +591,7 @@ int ntfs_read_compressed_block(struct page *page)
 				continue;
 			}
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			pages[i] = NULL;
 		}
 	}
@@ -735,9 +735,9 @@ lock_retry_remap:
 	ntfs_debug("Successfully read the compression block.");
 
 	/* The last page and maximum offset within it for the current cb. */
-	cb_max_page = (cur_page << PAGE_CACHE_SHIFT) + cur_ofs + cb_size;
-	cb_max_ofs = cb_max_page & ~PAGE_CACHE_MASK;
-	cb_max_page >>= PAGE_CACHE_SHIFT;
+	cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size;
+	cb_max_ofs = cb_max_page & ~PAGE_MASK;
+	cb_max_page >>= PAGE_SHIFT;
 
 	/* Catch end of file inside a compression block. */
 	if (cb_max_page > max_page)
@@ -762,7 +762,7 @@ lock_retry_remap:
 					clear_page(page_address(page));
 				else
 					memset(page_address(page) + cur_ofs, 0,
-							PAGE_CACHE_SIZE -
+							PAGE_SIZE -
 							cur_ofs);
 				flush_dcache_page(page);
 				kunmap(page);
@@ -771,10 +771,10 @@ lock_retry_remap:
 				if (cur_page == xpage)
 					xpage_done = 1;
 				else
-					page_cache_release(page);
+					put_page(page);
 				pages[cur_page] = NULL;
 			}
-			cb_pos += PAGE_CACHE_SIZE - cur_ofs;
+			cb_pos += PAGE_SIZE - cur_ofs;
 			cur_ofs = 0;
 			if (cb_pos >= cb_end)
 				break;
@@ -816,8 +816,8 @@ lock_retry_remap:
 			page = pages[cur_page];
 			if (page)
 				memcpy(page_address(page) + cur_ofs, cb_pos,
-						PAGE_CACHE_SIZE - cur_ofs);
-			cb_pos += PAGE_CACHE_SIZE - cur_ofs;
+						PAGE_SIZE - cur_ofs);
+			cb_pos += PAGE_SIZE - cur_ofs;
 			cur_ofs = 0;
 			if (cb_pos >= cb_end)
 				break;
@@ -850,10 +850,10 @@ lock_retry_remap:
 				if (cur2_page == xpage)
 					xpage_done = 1;
 				else
-					page_cache_release(page);
+					put_page(page);
 				pages[cur2_page] = NULL;
 			}
-			cb_pos2 += PAGE_CACHE_SIZE - cur_ofs2;
+			cb_pos2 += PAGE_SIZE - cur_ofs2;
 			cur_ofs2 = 0;
 			if (cb_pos2 >= cb_end)
 				break;
@@ -884,7 +884,7 @@ lock_retry_remap:
 					kunmap(page);
 					unlock_page(page);
 					if (prev_cur_page != xpage)
-						page_cache_release(page);
+						put_page(page);
 					pages[prev_cur_page] = NULL;
 				}
 			}
@@ -914,7 +914,7 @@ lock_retry_remap:
 			kunmap(page);
 			unlock_page(page);
 			if (cur_page != xpage)
-				page_cache_release(page);
+				put_page(page);
 			pages[cur_page] = NULL;
 		}
 	}
@@ -961,7 +961,7 @@ err_out:
 			kunmap(page);
 			unlock_page(page);
 			if (i != xpage)
-				page_cache_release(page);
+				put_page(page);
 		}
 	}
 	kfree(pages);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index b2eff5816adc..3cdce162592d 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -319,7 +319,7 @@ descend_into_child_node:
 	 * disk if necessary.
 	 */
 	page = ntfs_map_page(ia_mapping, vcn <<
-			dir_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
+			dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
 	if (IS_ERR(page)) {
 		ntfs_error(sb, "Failed to map directory index page, error %ld.",
 				-PTR_ERR(page));
@@ -331,9 +331,9 @@ descend_into_child_node:
 fast_descend_into_child_node:
 	/* Get to the index allocation block. */
 	ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
-			dir_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
+			dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
 	/* Bounds checks. */
-	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
+	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
 		ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
 				"inode 0x%lx or driver bug.", dir_ni->mft_no);
 		goto unm_err_out;
@@ -366,7 +366,7 @@ fast_descend_into_child_node:
 		goto unm_err_out;
 	}
 	index_end = (u8*)ia + dir_ni->itype.index.block_size;
-	if (index_end > kaddr + PAGE_CACHE_SIZE) {
+	if (index_end > kaddr + PAGE_SIZE) {
 		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
 				"0x%lx crosses page boundary. Impossible! "
 				"Cannot access! This is probably a bug in the "
@@ -559,9 +559,9 @@ found_it2:
 			/* If vcn is in the same page cache page as old_vcn we
 			 * recycle the mapped page. */
 			if (old_vcn << vol->cluster_size_bits >>
-					PAGE_CACHE_SHIFT == vcn <<
+					PAGE_SHIFT == vcn <<
 					vol->cluster_size_bits >>
-					PAGE_CACHE_SHIFT)
+					PAGE_SHIFT)
 				goto fast_descend_into_child_node;
 			unlock_page(page);
 			ntfs_unmap_page(page);
@@ -1246,15 +1246,15 @@ skip_index_root:
 		goto iput_err_out;
 	}
 	/* Get the starting bit position in the current bitmap page. */
-	cur_bmp_pos = bmp_pos & ((PAGE_CACHE_SIZE * 8) - 1);
-	bmp_pos &= ~(u64)((PAGE_CACHE_SIZE * 8) - 1);
+	cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1);
+	bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1);
 get_next_bmp_page:
 	ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx",
-			(unsigned long long)bmp_pos >> (3 + PAGE_CACHE_SHIFT),
+			(unsigned long long)bmp_pos >> (3 + PAGE_SHIFT),
 			(unsigned long long)bmp_pos &
-			(unsigned long long)((PAGE_CACHE_SIZE * 8) - 1));
+			(unsigned long long)((PAGE_SIZE * 8) - 1));
 	bmp_page = ntfs_map_page(bmp_mapping,
-			bmp_pos >> (3 + PAGE_CACHE_SHIFT));
+			bmp_pos >> (3 + PAGE_SHIFT));
 	if (IS_ERR(bmp_page)) {
 		ntfs_error(sb, "Reading index bitmap failed.");
 		err = PTR_ERR(bmp_page);
@@ -1270,9 +1270,9 @@ find_next_index_buffer:
 		 * If we have reached the end of the bitmap page, get the next
 		 * page, and put away the old one.
 		 */
-		if (unlikely((cur_bmp_pos >> 3) >= PAGE_CACHE_SIZE)) {
+		if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) {
 			ntfs_unmap_page(bmp_page);
-			bmp_pos += PAGE_CACHE_SIZE * 8;
+			bmp_pos += PAGE_SIZE * 8;
 			cur_bmp_pos = 0;
 			goto get_next_bmp_page;
 		}
@@ -1285,8 +1285,8 @@ find_next_index_buffer:
 	ntfs_debug("Handling index buffer 0x%llx.",
 			(unsigned long long)bmp_pos + cur_bmp_pos);
 	/* If the current index buffer is in the same page we reuse the page. */
-	if ((prev_ia_pos & (s64)PAGE_CACHE_MASK) !=
-			(ia_pos & (s64)PAGE_CACHE_MASK)) {
+	if ((prev_ia_pos & (s64)PAGE_MASK) !=
+			(ia_pos & (s64)PAGE_MASK)) {
 		prev_ia_pos = ia_pos;
 		if (likely(ia_page != NULL)) {
 			unlock_page(ia_page);
@@ -1296,7 +1296,7 @@ find_next_index_buffer:
 		 * Map the page cache page containing the current ia_pos,
 		 * reading it from disk if necessary.
 		 */
-		ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_CACHE_SHIFT);
+		ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT);
 		if (IS_ERR(ia_page)) {
 			ntfs_error(sb, "Reading index allocation data failed.");
 			err = PTR_ERR(ia_page);
@@ -1307,10 +1307,10 @@ find_next_index_buffer:
 		kaddr = (u8*)page_address(ia_page);
 	}
 	/* Get the current index buffer. */
-	ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_CACHE_MASK &
-			~(s64)(ndir->itype.index.block_size - 1)));
+	ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK &
+					  ~(s64)(ndir->itype.index.block_size - 1)));
 	/* Bounds checks. */
-	if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE)) {
+	if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) {
 		ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
 				"inode 0x%lx or driver bug.", vdir->i_ino);
 		goto err_out;
@@ -1348,7 +1348,7 @@ find_next_index_buffer:
 		goto err_out;
 	}
 	index_end = (u8*)ia + ndir->itype.index.block_size;
-	if (unlikely(index_end > kaddr + PAGE_CACHE_SIZE)) {
+	if (unlikely(index_end > kaddr + PAGE_SIZE)) {
 		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
 				"0x%lx crosses page boundary. Impossible! "
 				"Cannot access! This is probably a bug in the "
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index bed4d427dfae..2dae60857544 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -220,8 +220,8 @@ do_non_resident_extend:
 		m = NULL;
 	}
 	mapping = vi->i_mapping;
-	index = old_init_size >> PAGE_CACHE_SHIFT;
-	end_index = (new_init_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	index = old_init_size >> PAGE_SHIFT;
+	end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	do {
 		/*
 		 * Read the page.  If the page is not present, this will zero
@@ -233,7 +233,7 @@ do_non_resident_extend:
 			goto init_err_out;
 		}
 		if (unlikely(PageError(page))) {
-			page_cache_release(page);
+			put_page(page);
 			err = -EIO;
 			goto init_err_out;
 		}
@@ -242,13 +242,13 @@ do_non_resident_extend:
 		 * enough to make ntfs_writepage() work.
 		 */
 		write_lock_irqsave(&ni->size_lock, flags);
-		ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT;
+		ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT;
 		if (ni->initialized_size > new_init_size)
 			ni->initialized_size = new_init_size;
 		write_unlock_irqrestore(&ni->size_lock, flags);
 		/* Set the page dirty so it gets written out. */
 		set_page_dirty(page);
-		page_cache_release(page);
+		put_page(page);
 		/*
 		 * Play nice with the vm and the rest of the system.  This is
 		 * very much needed as we can potentially be modifying the
@@ -543,7 +543,7 @@ out:
 err_out:
 	while (nr > 0) {
 		unlock_page(pages[--nr]);
-		page_cache_release(pages[nr]);
+		put_page(pages[nr]);
 	}
 	goto out;
 }
@@ -653,7 +653,7 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
 	u = 0;
 do_next_page:
 	page = pages[u];
-	bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
+	bh_pos = (s64)page->index << PAGE_SHIFT;
 	bh = head = page_buffers(page);
 	do {
 		VCN cdelta;
@@ -810,11 +810,11 @@ map_buffer_cached:
 					
 				kaddr = kmap_atomic(page);
 				if (bh_pos < pos) {
-					pofs = bh_pos & ~PAGE_CACHE_MASK;
+					pofs = bh_pos & ~PAGE_MASK;
 					memset(kaddr + pofs, 0, pos - bh_pos);
 				}
 				if (bh_end > end) {
-					pofs = end & ~PAGE_CACHE_MASK;
+					pofs = end & ~PAGE_MASK;
 					memset(kaddr + pofs, 0, bh_end - end);
 				}
 				kunmap_atomic(kaddr);
@@ -942,7 +942,7 @@ rl_not_mapped_enoent:
 		 * unmapped.  This can only happen when the cluster size is
 		 * less than the page cache size.
 		 */
-		if (unlikely(vol->cluster_size < PAGE_CACHE_SIZE)) {
+		if (unlikely(vol->cluster_size < PAGE_SIZE)) {
 			bh_cend = (bh_end + vol->cluster_size - 1) >>
 					vol->cluster_size_bits;
 			if ((bh_cend <= cpos || bh_cpos >= cend)) {
@@ -1208,7 +1208,7 @@ rl_not_mapped_enoent:
 		wait_on_buffer(bh);
 		if (likely(buffer_uptodate(bh))) {
 			page = bh->b_page;
-			bh_pos = ((s64)page->index << PAGE_CACHE_SHIFT) +
+			bh_pos = ((s64)page->index << PAGE_SHIFT) +
 					bh_offset(bh);
 			/*
 			 * If the buffer overflows the initialized size, need
@@ -1350,7 +1350,7 @@ rl_not_mapped_enoent:
 		bh = head = page_buffers(page);
 		do {
 			if (u == nr_pages &&
-					((s64)page->index << PAGE_CACHE_SHIFT) +
+					((s64)page->index << PAGE_SHIFT) +
 					bh_offset(bh) >= end)
 				break;
 			if (!buffer_new(bh))
@@ -1422,7 +1422,7 @@ static inline int ntfs_commit_pages_after_non_resident_write(
 		bool partial;
 
 		page = pages[u];
-		bh_pos = (s64)page->index << PAGE_CACHE_SHIFT;
+		bh_pos = (s64)page->index << PAGE_SHIFT;
 		bh = head = page_buffers(page);
 		partial = false;
 		do {
@@ -1639,7 +1639,7 @@ static int ntfs_commit_pages_after_write(struct page **pages,
 		if (end < attr_len)
 			memcpy(kaddr + end, kattr + end, attr_len - end);
 		/* Zero the region outside the end of the attribute value. */
-		memset(kaddr + attr_len, 0, PAGE_CACHE_SIZE - attr_len);
+		memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len);
 		flush_dcache_page(page);
 		SetPageUptodate(page);
 	}
@@ -1706,7 +1706,7 @@ static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
 	unsigned len, copied;
 
 	do {
-		len = PAGE_CACHE_SIZE - ofs;
+		len = PAGE_SIZE - ofs;
 		if (len > bytes)
 			len = bytes;
 		copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs,
@@ -1724,14 +1724,14 @@ out:
 	return total;
 err:
 	/* Zero the rest of the target like __copy_from_user(). */
-	len = PAGE_CACHE_SIZE - copied;
+	len = PAGE_SIZE - copied;
 	do {
 		if (len > bytes)
 			len = bytes;
 		zero_user(*pages, copied, len);
 		bytes -= len;
 		copied = 0;
-		len = PAGE_CACHE_SIZE;
+		len = PAGE_SIZE;
 	} while (++pages < last_page);
 	goto out;
 }
@@ -1787,8 +1787,8 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
 	 * attributes.
 	 */
 	nr_pages = 1;
-	if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
-		nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
+	if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni))
+		nr_pages = vol->cluster_size >> PAGE_SHIFT;
 	last_vcn = -1;
 	do {
 		VCN vcn;
@@ -1796,9 +1796,9 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
 		unsigned ofs, do_pages, u;
 		size_t copied;
 
-		start_idx = idx = pos >> PAGE_CACHE_SHIFT;
-		ofs = pos & ~PAGE_CACHE_MASK;
-		bytes = PAGE_CACHE_SIZE - ofs;
+		start_idx = idx = pos >> PAGE_SHIFT;
+		ofs = pos & ~PAGE_MASK;
+		bytes = PAGE_SIZE - ofs;
 		do_pages = 1;
 		if (nr_pages > 1) {
 			vcn = pos >> vol->cluster_size_bits;
@@ -1832,7 +1832,7 @@ static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
 				if (lcn == LCN_HOLE) {
 					start_idx = (pos & ~(s64)
 							vol->cluster_size_mask)
-							>> PAGE_CACHE_SHIFT;
+							>> PAGE_SHIFT;
 					bytes = vol->cluster_size - (pos &
 							vol->cluster_size_mask);
 					do_pages = nr_pages;
@@ -1871,12 +1871,12 @@ again:
 			if (unlikely(status)) {
 				do {
 					unlock_page(pages[--do_pages]);
-					page_cache_release(pages[do_pages]);
+					put_page(pages[do_pages]);
 				} while (do_pages);
 				break;
 			}
 		}
-		u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
+		u = (pos >> PAGE_SHIFT) - pages[0]->index;
 		copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
 					i, bytes);
 		ntfs_flush_dcache_pages(pages + u, do_pages - u);
@@ -1889,7 +1889,7 @@ again:
 		}
 		do {
 			unlock_page(pages[--do_pages]);
-			page_cache_release(pages[do_pages]);
+			put_page(pages[do_pages]);
 		} while (do_pages);
 		if (unlikely(status < 0))
 			break;
@@ -1921,7 +1921,7 @@ again:
 		}
 	} while (iov_iter_count(i));
 	if (cached_page)
-		page_cache_release(cached_page);
+		put_page(cached_page);
 	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
 			written ? "written" : "status", (unsigned long)written,
 			(long)status);
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
index 096c135691ae..02a83a46ead2 100644
--- a/fs/ntfs/index.c
+++ b/fs/ntfs/index.c
@@ -276,7 +276,7 @@ descend_into_child_node:
 	 * disk if necessary.
 	 */
 	page = ntfs_map_page(ia_mapping, vcn <<
-			idx_ni->itype.index.vcn_size_bits >> PAGE_CACHE_SHIFT);
+			idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
 	if (IS_ERR(page)) {
 		ntfs_error(sb, "Failed to map index page, error %ld.",
 				-PTR_ERR(page));
@@ -288,9 +288,9 @@ descend_into_child_node:
 fast_descend_into_child_node:
 	/* Get to the index allocation block. */
 	ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
-			idx_ni->itype.index.vcn_size_bits) & ~PAGE_CACHE_MASK));
+			idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
 	/* Bounds checks. */
-	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_CACHE_SIZE) {
+	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
 		ntfs_error(sb, "Out of bounds check failed.  Corrupt inode "
 				"0x%lx or driver bug.", idx_ni->mft_no);
 		goto unm_err_out;
@@ -323,7 +323,7 @@ fast_descend_into_child_node:
 		goto unm_err_out;
 	}
 	index_end = (u8*)ia + idx_ni->itype.index.block_size;
-	if (index_end > kaddr + PAGE_CACHE_SIZE) {
+	if (index_end > kaddr + PAGE_SIZE) {
 		ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx "
 				"crosses page boundary.  Impossible!  Cannot "
 				"access!  This is probably a bug in the "
@@ -427,9 +427,9 @@ ia_done:
 		 * the mapped page.
 		 */
 		if (old_vcn << vol->cluster_size_bits >>
-				PAGE_CACHE_SHIFT == vcn <<
+				PAGE_SHIFT == vcn <<
 				vol->cluster_size_bits >>
-				PAGE_CACHE_SHIFT)
+				PAGE_SHIFT)
 			goto fast_descend_into_child_node;
 		unlock_page(page);
 		ntfs_unmap_page(page);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index d284f07eda77..3eda6d4bcc65 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -868,12 +868,12 @@ skip_attr_list_load:
 					ni->itype.index.block_size);
 			goto unm_err_out;
 		}
-		if (ni->itype.index.block_size > PAGE_CACHE_SIZE) {
+		if (ni->itype.index.block_size > PAGE_SIZE) {
 			ntfs_error(vi->i_sb, "Index block size (%u) > "
 					"PAGE_CACHE_SIZE (%ld) is not "
 					"supported.  Sorry.",
 					ni->itype.index.block_size,
-					PAGE_CACHE_SIZE);
+					PAGE_SIZE);
 			err = -EOPNOTSUPP;
 			goto unm_err_out;
 		}
@@ -1585,10 +1585,10 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
 				"two.", ni->itype.index.block_size);
 		goto unm_err_out;
 	}
-	if (ni->itype.index.block_size > PAGE_CACHE_SIZE) {
+	if (ni->itype.index.block_size > PAGE_SIZE) {
 		ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_CACHE_SIZE "
 				"(%ld) is not supported.  Sorry.",
-				ni->itype.index.block_size, PAGE_CACHE_SIZE);
+				ni->itype.index.block_size, PAGE_SIZE);
 		err = -EOPNOTSUPP;
 		goto unm_err_out;
 	}
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
index 1711b710b641..27a24a42f712 100644
--- a/fs/ntfs/lcnalloc.c
+++ b/fs/ntfs/lcnalloc.c
@@ -283,15 +283,15 @@ runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
 			ntfs_unmap_page(page);
 		}
 		page = ntfs_map_page(mapping, last_read_pos >>
-				PAGE_CACHE_SHIFT);
+				PAGE_SHIFT);
 		if (IS_ERR(page)) {
 			err = PTR_ERR(page);
 			ntfs_error(vol->sb, "Failed to map page.");
 			goto out;
 		}
-		buf_size = last_read_pos & ~PAGE_CACHE_MASK;
+		buf_size = last_read_pos & ~PAGE_MASK;
 		buf = page_address(page) + buf_size;
-		buf_size = PAGE_CACHE_SIZE - buf_size;
+		buf_size = PAGE_SIZE - buf_size;
 		if (unlikely(last_read_pos + buf_size > i_size))
 			buf_size = i_size - last_read_pos;
 		buf_size <<= 3;
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index c71de292c5ad..9d71213ca81e 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -381,7 +381,7 @@ static int ntfs_check_and_load_restart_page(struct inode *vi,
 	 * completely inside @rp, just copy it from there.  Otherwise map all
 	 * the required pages and copy the data from them.
 	 */
-	size = PAGE_CACHE_SIZE - (pos & ~PAGE_CACHE_MASK);
+	size = PAGE_SIZE - (pos & ~PAGE_MASK);
 	if (size >= le32_to_cpu(rp->system_page_size)) {
 		memcpy(trp, rp, le32_to_cpu(rp->system_page_size));
 	} else {
@@ -394,8 +394,8 @@ static int ntfs_check_and_load_restart_page(struct inode *vi,
 		/* Copy the remaining data one page at a time. */
 		have_read = size;
 		to_read = le32_to_cpu(rp->system_page_size) - size;
-		idx = (pos + size) >> PAGE_CACHE_SHIFT;
-		BUG_ON((pos + size) & ~PAGE_CACHE_MASK);
+		idx = (pos + size) >> PAGE_SHIFT;
+		BUG_ON((pos + size) & ~PAGE_MASK);
 		do {
 			page = ntfs_map_page(vi->i_mapping, idx);
 			if (IS_ERR(page)) {
@@ -406,7 +406,7 @@ static int ntfs_check_and_load_restart_page(struct inode *vi,
 					err = -EIO;
 				goto err_out;
 			}
-			size = min_t(int, to_read, PAGE_CACHE_SIZE);
+			size = min_t(int, to_read, PAGE_SIZE);
 			memcpy((u8*)trp + have_read, page_address(page), size);
 			ntfs_unmap_page(page);
 			have_read += size;
@@ -509,11 +509,11 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
 	 * log page size if the page cache size is between the default log page
 	 * size and twice that.
 	 */
-	if (PAGE_CACHE_SIZE >= DefaultLogPageSize && PAGE_CACHE_SIZE <=
+	if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <=
 			DefaultLogPageSize * 2)
 		log_page_size = DefaultLogPageSize;
 	else
-		log_page_size = PAGE_CACHE_SIZE;
+		log_page_size = PAGE_SIZE;
 	log_page_mask = log_page_size - 1;
 	/*
 	 * Use ntfs_ffs() instead of ffs() to enable the compiler to
@@ -539,7 +539,7 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
 	 * to be empty.
 	 */
 	for (pos = 0; pos < size; pos <<= 1) {
-		pgoff_t idx = pos >> PAGE_CACHE_SHIFT;
+		pgoff_t idx = pos >> PAGE_SHIFT;
 		if (!page || page->index != idx) {
 			if (page)
 				ntfs_unmap_page(page);
@@ -550,7 +550,7 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
 				goto err_out;
 			}
 		}
-		kaddr = (u8*)page_address(page) + (pos & ~PAGE_CACHE_MASK);
+		kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK);
 		/*
 		 * A non-empty block means the logfile is not empty while an
 		 * empty block after a non-empty block has been encountered
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 3014a36a255b..37b2501caaa4 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -61,16 +61,16 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
 	 * here if the volume was that big...
 	 */
 	index = (u64)ni->mft_no << vol->mft_record_size_bits >>
-			PAGE_CACHE_SHIFT;
-	ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+			PAGE_SHIFT;
+	ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
 
 	i_size = i_size_read(mft_vi);
 	/* The maximum valid index into the page cache for $MFT's data. */
-	end_index = i_size >> PAGE_CACHE_SHIFT;
+	end_index = i_size >> PAGE_SHIFT;
 
 	/* If the wanted index is out of bounds the mft record doesn't exist. */
 	if (unlikely(index >= end_index)) {
-		if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs +
+		if (index > end_index || (i_size & ~PAGE_MASK) < ofs +
 				vol->mft_record_size) {
 			page = ERR_PTR(-ENOENT);
 			ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
@@ -487,7 +487,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
 	}
 	/* Get the page containing the mirror copy of the mft record @m. */
 	page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
-			(PAGE_CACHE_SHIFT - vol->mft_record_size_bits));
+			(PAGE_SHIFT - vol->mft_record_size_bits));
 	if (IS_ERR(page)) {
 		ntfs_error(vol->sb, "Failed to map mft mirror page.");
 		err = PTR_ERR(page);
@@ -497,7 +497,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
 	BUG_ON(!PageUptodate(page));
 	ClearPageUptodate(page);
 	/* Offset of the mft mirror record inside the page. */
-	page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+	page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
 	/* The address in the page of the mirror copy of the mft record @m. */
 	kmirr = page_address(page) + page_ofs;
 	/* Copy the mst protected mft record to the mirror. */
@@ -1178,8 +1178,8 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
 	for (; pass <= 2;) {
 		/* Cap size to pass_end. */
 		ofs = data_pos >> 3;
-		page_ofs = ofs & ~PAGE_CACHE_MASK;
-		size = PAGE_CACHE_SIZE - page_ofs;
+		page_ofs = ofs & ~PAGE_MASK;
+		size = PAGE_SIZE - page_ofs;
 		ll = ((pass_end + 7) >> 3) - ofs;
 		if (size > ll)
 			size = ll;
@@ -1190,7 +1190,7 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
 		 */
 		if (size) {
 			page = ntfs_map_page(mftbmp_mapping,
-					ofs >> PAGE_CACHE_SHIFT);
+					ofs >> PAGE_SHIFT);
 			if (IS_ERR(page)) {
 				ntfs_error(vol->sb, "Failed to read mft "
 						"bitmap, aborting.");
@@ -1328,13 +1328,13 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
 	 */
 	ll = lcn >> 3;
 	page = ntfs_map_page(vol->lcnbmp_ino->i_mapping,
-			ll >> PAGE_CACHE_SHIFT);
+			ll >> PAGE_SHIFT);
 	if (IS_ERR(page)) {
 		up_write(&mftbmp_ni->runlist.lock);
 		ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
 		return PTR_ERR(page);
 	}
-	b = (u8*)page_address(page) + (ll & ~PAGE_CACHE_MASK);
+	b = (u8*)page_address(page) + (ll & ~PAGE_MASK);
 	tb = 1 << (lcn & 7ull);
 	down_write(&vol->lcnbmp_lock);
 	if (*b != 0xff && !(*b & tb)) {
@@ -2103,14 +2103,14 @@ static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
 	 * The index into the page cache and the offset within the page cache
 	 * page of the wanted mft record.
 	 */
-	index = mft_no << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
-	ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+	index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT;
+	ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
 	/* The maximum valid index into the page cache for $MFT's data. */
 	i_size = i_size_read(mft_vi);
-	end_index = i_size >> PAGE_CACHE_SHIFT;
+	end_index = i_size >> PAGE_SHIFT;
 	if (unlikely(index >= end_index)) {
 		if (unlikely(index > end_index || ofs + vol->mft_record_size >=
-				(i_size & ~PAGE_CACHE_MASK))) {
+				(i_size & ~PAGE_MASK))) {
 			ntfs_error(vol->sb, "Tried to format non-existing mft "
 					"record 0x%llx.", (long long)mft_no);
 			return -ENOENT;
@@ -2515,8 +2515,8 @@ mft_rec_already_initialized:
 	 * We now have allocated and initialized the mft record.  Calculate the
 	 * index of and the offset within the page cache page the record is in.
 	 */
-	index = bit << vol->mft_record_size_bits >> PAGE_CACHE_SHIFT;
-	ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK;
+	index = bit << vol->mft_record_size_bits >> PAGE_SHIFT;
+	ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK;
 	/* Read, map, and pin the page containing the mft record. */
 	page = ntfs_map_page(vol->mft_ino->i_mapping, index);
 	if (IS_ERR(page)) {
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index c581e26a350d..12de47b96ca9 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -43,7 +43,7 @@ typedef enum {
 	NTFS_MAX_NAME_LEN	= 255,
 	NTFS_MAX_ATTR_NAME_LEN	= 255,
 	NTFS_MAX_CLUSTER_SIZE	= 64 * 1024,	/* 64kiB */
-	NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_CACHE_SIZE,
+	NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE,
 } NTFS_CONSTANTS;
 
 /* Global variables. */
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 1b38abdaa3ed..ab2b0930054e 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -826,11 +826,11 @@ static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
 	 * We cannot support mft record sizes above the PAGE_CACHE_SIZE since
 	 * we store $MFT/$DATA, the table of mft records in the page cache.
 	 */
-	if (vol->mft_record_size > PAGE_CACHE_SIZE) {
+	if (vol->mft_record_size > PAGE_SIZE) {
 		ntfs_error(vol->sb, "Mft record size (%i) exceeds the "
 				"PAGE_CACHE_SIZE on your system (%lu).  "
 				"This is not supported.  Sorry.",
-				vol->mft_record_size, PAGE_CACHE_SIZE);
+				vol->mft_record_size, PAGE_SIZE);
 		return false;
 	}
 	/* We cannot support mft record sizes below the sector size. */
@@ -1096,7 +1096,7 @@ static bool check_mft_mirror(ntfs_volume *vol)
 
 	ntfs_debug("Entering.");
 	/* Compare contents of $MFT and $MFTMirr. */
-	mrecs_per_page = PAGE_CACHE_SIZE / vol->mft_record_size;
+	mrecs_per_page = PAGE_SIZE / vol->mft_record_size;
 	BUG_ON(!mrecs_per_page);
 	BUG_ON(!vol->mftmirr_size);
 	mft_page = mirr_page = NULL;
@@ -1615,20 +1615,20 @@ static bool load_and_init_attrdef(ntfs_volume *vol)
 	if (!vol->attrdef)
 		goto iput_failed;
 	index = 0;
-	max_index = i_size >> PAGE_CACHE_SHIFT;
-	size = PAGE_CACHE_SIZE;
+	max_index = i_size >> PAGE_SHIFT;
+	size = PAGE_SIZE;
 	while (index < max_index) {
 		/* Read the attrdef table and copy it into the linear buffer. */
 read_partial_attrdef_page:
 		page = ntfs_map_page(ino->i_mapping, index);
 		if (IS_ERR(page))
 			goto free_iput_failed;
-		memcpy((u8*)vol->attrdef + (index++ << PAGE_CACHE_SHIFT),
+		memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT),
 				page_address(page), size);
 		ntfs_unmap_page(page);
 	};
-	if (size == PAGE_CACHE_SIZE) {
-		size = i_size & ~PAGE_CACHE_MASK;
+	if (size == PAGE_SIZE) {
+		size = i_size & ~PAGE_MASK;
 		if (size)
 			goto read_partial_attrdef_page;
 	}
@@ -1684,20 +1684,20 @@ static bool load_and_init_upcase(ntfs_volume *vol)
 	if (!vol->upcase)
 		goto iput_upcase_failed;
 	index = 0;
-	max_index = i_size >> PAGE_CACHE_SHIFT;
-	size = PAGE_CACHE_SIZE;
+	max_index = i_size >> PAGE_SHIFT;
+	size = PAGE_SIZE;
 	while (index < max_index) {
 		/* Read the upcase table and copy it into the linear buffer. */
 read_partial_upcase_page:
 		page = ntfs_map_page(ino->i_mapping, index);
 		if (IS_ERR(page))
 			goto iput_upcase_failed;
-		memcpy((char*)vol->upcase + (index++ << PAGE_CACHE_SHIFT),
+		memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT),
 				page_address(page), size);
 		ntfs_unmap_page(page);
 	};
-	if (size == PAGE_CACHE_SIZE) {
-		size = i_size & ~PAGE_CACHE_MASK;
+	if (size == PAGE_SIZE) {
+		size = i_size & ~PAGE_MASK;
 		if (size)
 			goto read_partial_upcase_page;
 	}
@@ -2474,11 +2474,11 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 	 * multiples of PAGE_CACHE_SIZE, rounding up so that if we have one
 	 * full and one partial page max_index = 2.
 	 */
-	max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_CACHE_SIZE - 1) >>
-			PAGE_CACHE_SHIFT;
+	max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >>
+			PAGE_SHIFT;
 	/* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */
 	ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
-			max_index, PAGE_CACHE_SIZE / 4);
+			max_index, PAGE_SIZE / 4);
 	for (index = 0; index < max_index; index++) {
 		unsigned long *kaddr;
 
@@ -2491,7 +2491,7 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 		if (IS_ERR(page)) {
 			ntfs_debug("read_mapping_page() error. Skipping "
 					"page (index 0x%lx).", index);
-			nr_free -= PAGE_CACHE_SIZE * 8;
+			nr_free -= PAGE_SIZE * 8;
 			continue;
 		}
 		kaddr = kmap_atomic(page);
@@ -2503,9 +2503,9 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
 		 * ntfs_readpage().
 		 */
 		nr_free -= bitmap_weight(kaddr,
-					PAGE_CACHE_SIZE * BITS_PER_BYTE);
+					PAGE_SIZE * BITS_PER_BYTE);
 		kunmap_atomic(kaddr);
-		page_cache_release(page);
+		put_page(page);
 	}
 	ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1);
 	/*
@@ -2549,7 +2549,7 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 	ntfs_debug("Entering.");
 	/* Use multiples of 4 bytes, thus max_size is PAGE_CACHE_SIZE / 4. */
 	ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
-			"0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
+			"0x%lx.", max_index, PAGE_SIZE / 4);
 	for (index = 0; index < max_index; index++) {
 		unsigned long *kaddr;
 
@@ -2562,7 +2562,7 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 		if (IS_ERR(page)) {
 			ntfs_debug("read_mapping_page() error. Skipping "
 					"page (index 0x%lx).", index);
-			nr_free -= PAGE_CACHE_SIZE * 8;
+			nr_free -= PAGE_SIZE * 8;
 			continue;
 		}
 		kaddr = kmap_atomic(page);
@@ -2574,9 +2574,9 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
 		 * ntfs_readpage().
 		 */
 		nr_free -= bitmap_weight(kaddr,
-					PAGE_CACHE_SIZE * BITS_PER_BYTE);
+					PAGE_SIZE * BITS_PER_BYTE);
 		kunmap_atomic(kaddr);
-		page_cache_release(page);
+		put_page(page);
 	}
 	ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
 			index - 1);
@@ -2618,17 +2618,17 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
 	/* Type of filesystem. */
 	sfs->f_type   = NTFS_SB_MAGIC;
 	/* Optimal transfer block size. */
-	sfs->f_bsize  = PAGE_CACHE_SIZE;
+	sfs->f_bsize  = PAGE_SIZE;
 	/*
 	 * Total data blocks in filesystem in units of f_bsize and since
 	 * inodes are also stored in data blocs ($MFT is a file) this is just
 	 * the total clusters.
 	 */
 	sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >>
-				PAGE_CACHE_SHIFT;
+				PAGE_SHIFT;
 	/* Free data blocks in filesystem in units of f_bsize. */
 	size	      = get_nr_free_clusters(vol) << vol->cluster_size_bits >>
-				PAGE_CACHE_SHIFT;
+				PAGE_SHIFT;
 	if (size < 0LL)
 		size = 0LL;
 	/* Free blocks avail to non-superuser, same as above on NTFS. */
@@ -2643,7 +2643,7 @@ static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
 	 * have one full and one partial page max_index = 2.
 	 */
 	max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits)
-			+ 7) >> 3) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+			+ 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	read_unlock_irqrestore(&mft_ni->size_lock, flags);
 	/* Number of inodes in filesystem (at this point in time). */
 	sfs->f_files = size;
@@ -2766,14 +2766,14 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		goto err_out_now;
 
 	/* We support sector sizes up to the PAGE_CACHE_SIZE. */
-	if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
+	if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) {
 		if (!silent)
 			ntfs_error(sb, "Device has unsupported sector size "
 					"(%i).  The maximum supported sector "
 					"size on this architecture is %lu "
 					"bytes.",
 					bdev_logical_block_size(sb->s_bdev),
-					PAGE_CACHE_SIZE);
+					PAGE_SIZE);
 		goto err_out_now;
 	}
 	/*
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 70907d638b60..e361d1a0ca09 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6671,7 +6671,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
 {
 	int i;
 	struct page *page;
-	unsigned int from, to = PAGE_CACHE_SIZE;
+	unsigned int from, to = PAGE_SIZE;
 	struct super_block *sb = inode->i_sb;
 
 	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
@@ -6679,21 +6679,21 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
 	if (numpages == 0)
 		goto out;
 
-	to = PAGE_CACHE_SIZE;
+	to = PAGE_SIZE;
 	for(i = 0; i < numpages; i++) {
 		page = pages[i];
 
-		from = start & (PAGE_CACHE_SIZE - 1);
-		if ((end >> PAGE_CACHE_SHIFT) == page->index)
-			to = end & (PAGE_CACHE_SIZE - 1);
+		from = start & (PAGE_SIZE - 1);
+		if ((end >> PAGE_SHIFT) == page->index)
+			to = end & (PAGE_SIZE - 1);
 
-		BUG_ON(from > PAGE_CACHE_SIZE);
-		BUG_ON(to > PAGE_CACHE_SIZE);
+		BUG_ON(from > PAGE_SIZE);
+		BUG_ON(to > PAGE_SIZE);
 
 		ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
 					 &phys);
 
-		start = (page->index + 1) << PAGE_CACHE_SHIFT;
+		start = (page->index + 1) << PAGE_SHIFT;
 	}
 out:
 	if (pages)
@@ -6712,7 +6712,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
 
 	numpages = 0;
 	last_page_bytes = PAGE_ALIGN(end);
-	index = start >> PAGE_CACHE_SHIFT;
+	index = start >> PAGE_SHIFT;
 	do {
 		pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS);
 		if (!pages[numpages]) {
@@ -6723,7 +6723,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
 
 		numpages++;
 		index++;
-	} while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
+	} while (index < (last_page_bytes >> PAGE_SHIFT));
 
 out:
 	if (ret != 0) {
@@ -6950,8 +6950,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		 * to do that now.
 		 */
 		if (!ocfs2_sparse_alloc(osb) &&
-		    PAGE_CACHE_SIZE < osb->s_clustersize)
-			end = PAGE_CACHE_SIZE;
+		    PAGE_SIZE < osb->s_clustersize)
+			end = PAGE_SIZE;
 
 		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
 		if (ret) {
@@ -6971,8 +6971,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 			goto out_unlock;
 		}
 
-		page_end = PAGE_CACHE_SIZE;
-		if (PAGE_CACHE_SIZE > osb->s_clustersize)
+		page_end = PAGE_SIZE;
+		if (PAGE_SIZE > osb->s_clustersize)
 			page_end = osb->s_clustersize;
 
 		for (i = 0; i < num_pages; i++)
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1581240a7ca0..ce5dc4f92935 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -234,7 +234,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
 
 	size = i_size_read(inode);
 
-	if (size > PAGE_CACHE_SIZE ||
+	if (size > PAGE_SIZE ||
 	    size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
 		ocfs2_error(inode->i_sb,
 			    "Inode %llu has with inline data has bad size: %Lu\n",
@@ -247,7 +247,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
 	if (size)
 		memcpy(kaddr, di->id2.i_data.id_data, size);
 	/* Clear the remaining part of the page */
-	memset(kaddr + size, 0, PAGE_CACHE_SIZE - size);
+	memset(kaddr + size, 0, PAGE_SIZE - size);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr);
 
@@ -282,7 +282,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	loff_t start = (loff_t)page->index << PAGE_SHIFT;
 	int ret, unlock = 1;
 
 	trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
@@ -385,7 +385,7 @@ static int ocfs2_readpages(struct file *filp, struct address_space *mapping,
 	 * drop out in that case as it's not worth handling here.
 	 */
 	last = list_entry(pages->prev, struct page, lru);
-	start = (loff_t)last->index << PAGE_CACHE_SHIFT;
+	start = (loff_t)last->index << PAGE_SHIFT;
 	if (start >= i_size_read(inode))
 		goto out_unlock;
 
@@ -511,12 +511,12 @@ static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
 					    unsigned int *start,
 					    unsigned int *end)
 {
-	unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
+	unsigned int cluster_start = 0, cluster_end = PAGE_SIZE;
 
-	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
+	if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits)) {
 		unsigned int cpp;
 
-		cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
+		cpp = 1 << (PAGE_SHIFT - osb->s_clustersize_bits);
 
 		cluster_start = cpos % cpp;
 		cluster_start = cluster_start << osb->s_clustersize_bits;
@@ -687,10 +687,10 @@ next_bh:
 #if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
 #define OCFS2_MAX_CTXT_PAGES	1
 #else
-#define OCFS2_MAX_CTXT_PAGES	(OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
+#define OCFS2_MAX_CTXT_PAGES	(OCFS2_MAX_CLUSTERSIZE / PAGE_SIZE)
 #endif
 
-#define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+#define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_SIZE / OCFS2_MIN_CLUSTERSIZE)
 
 struct ocfs2_unwritten_extent {
 	struct list_head	ue_node;
@@ -785,7 +785,7 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
 		if (pages[i]) {
 			unlock_page(pages[i]);
 			mark_page_accessed(pages[i]);
-			page_cache_release(pages[i]);
+			put_page(pages[i]);
 		}
 	}
 }
@@ -808,7 +808,7 @@ static void ocfs2_unlock_pages(struct ocfs2_write_ctxt *wc)
 			}
 		}
 		mark_page_accessed(wc->w_target_page);
-		page_cache_release(wc->w_target_page);
+		put_page(wc->w_target_page);
 	}
 	ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
 }
@@ -857,7 +857,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
 	wc->w_di_bh = di_bh;
 	wc->w_type = type;
 
-	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+	if (unlikely(PAGE_SHIFT > osb->s_clustersize_bits))
 		wc->w_large_pages = 1;
 	else
 		wc->w_large_pages = 0;
@@ -920,7 +920,7 @@ static void ocfs2_write_failure(struct inode *inode,
 				loff_t user_pos, unsigned user_len)
 {
 	int i;
-	unsigned from = user_pos & (PAGE_CACHE_SIZE - 1),
+	unsigned from = user_pos & (PAGE_SIZE - 1),
 		to = user_pos + user_len;
 	struct page *tmppage;
 
@@ -960,7 +960,7 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
 			(page_offset(page) <= user_pos));
 
 	if (page == wc->w_target_page) {
-		map_from = user_pos & (PAGE_CACHE_SIZE - 1);
+		map_from = user_pos & (PAGE_SIZE - 1);
 		map_to = map_from + user_len;
 
 		if (new)
@@ -1034,7 +1034,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	loff_t last_byte;
 
-	target_index = user_pos >> PAGE_CACHE_SHIFT;
+	target_index = user_pos >> PAGE_SHIFT;
 
 	/*
 	 * Figure out how many pages we'll be manipulating here. For
@@ -1053,14 +1053,14 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
 		 */
 		last_byte = max(user_pos + user_len, i_size_read(inode));
 		BUG_ON(last_byte < 1);
-		end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
+		end_index = ((last_byte - 1) >> PAGE_SHIFT) + 1;
 		if ((start + wc->w_num_pages) > end_index)
 			wc->w_num_pages = end_index - start;
 	} else {
 		wc->w_num_pages = 1;
 		start = target_index;
 	}
-	end_index = (user_pos + user_len - 1) >> PAGE_CACHE_SHIFT;
+	end_index = (user_pos + user_len - 1) >> PAGE_SHIFT;
 
 	for(i = 0; i < wc->w_num_pages; i++) {
 		index = start + i;
@@ -1082,7 +1082,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
 				goto out;
 			}
 
-			page_cache_get(mmap_page);
+			get_page(mmap_page);
 			wc->w_pages[i] = mmap_page;
 			wc->w_target_locked = true;
 		} else if (index >= target_index && index <= end_index &&
@@ -1272,7 +1272,7 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
 {
 	struct ocfs2_write_cluster_desc *desc;
 
-	wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
+	wc->w_target_from = pos & (PAGE_SIZE - 1);
 	wc->w_target_to = wc->w_target_from + len;
 
 	if (alloc == 0)
@@ -1309,7 +1309,7 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
 							&wc->w_target_to);
 	} else {
 		wc->w_target_from = 0;
-		wc->w_target_to = PAGE_CACHE_SIZE;
+		wc->w_target_to = PAGE_SIZE;
 	}
 }
 
@@ -1981,7 +1981,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 			   struct page *page, void *fsdata)
 {
 	int i, ret;
-	unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
+	unsigned from, to, start = pos & (PAGE_SIZE - 1);
 	struct inode *inode = mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_write_ctxt *wc = fsdata;
@@ -2027,8 +2027,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 			from = wc->w_target_from;
 			to = wc->w_target_to;
 
-			BUG_ON(from > PAGE_CACHE_SIZE ||
-			       to > PAGE_CACHE_SIZE ||
+			BUG_ON(from > PAGE_SIZE ||
+			       to > PAGE_SIZE ||
 			       to < from);
 		} else {
 			/*
@@ -2037,7 +2037,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
 			 * to flush their entire range.
 			 */
 			from = 0;
-			to = PAGE_CACHE_SIZE;
+			to = PAGE_SIZE;
 		}
 
 		if (page_has_buffers(tmppage)) {
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index bd15929b5f92..1934abb6b680 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -417,13 +417,13 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
 	bio->bi_private = wc;
 	bio->bi_end_io = o2hb_bio_end_io;
 
-	vec_start = (cs << bits) % PAGE_CACHE_SIZE;
+	vec_start = (cs << bits) % PAGE_SIZE;
 	while(cs < max_slots) {
 		current_page = cs / spp;
 		page = reg->hr_slot_data[current_page];
 
-		vec_len = min(PAGE_CACHE_SIZE - vec_start,
-			      (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
+		vec_len = min(PAGE_SIZE - vec_start,
+			      (max_slots-cs) * (PAGE_SIZE/spp) );
 
 		mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
 		     current_page, vec_len, vec_start);
@@ -431,7 +431,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
 		len = bio_add_page(bio, page, vec_len, vec_start);
 		if (len != vec_len) break;
 
-		cs += vec_len / (PAGE_CACHE_SIZE/spp);
+		cs += vec_len / (PAGE_SIZE/spp);
 		vec_start = 0;
 	}
 
@@ -1576,7 +1576,7 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
 
 static void o2hb_init_region_params(struct o2hb_region *reg)
 {
-	reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits;
+	reg->hr_slots_per_page = PAGE_SIZE >> reg->hr_block_bits;
 	reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS;
 
 	mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n",
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 03768bb3aab1..47b3b2d4e775 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -571,8 +571,8 @@ static int dlmfs_fill_super(struct super_block * sb,
 			    int silent)
 {
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = DLMFS_MAGIC;
 	sb->s_op = &dlmfs_ops;
 	sb->s_root = d_make_root(dlmfs_get_root_inode(sb));
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c18ab45f8d21..5308841756be 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -770,14 +770,14 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
-	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
+	unsigned long index = abs_from >> PAGE_SHIFT;
 	handle_t *handle;
 	int ret = 0;
 	unsigned zero_from, zero_to, block_start, block_end;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
 
 	BUG_ON(abs_from >= abs_to);
-	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
+	BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
 	BUG_ON(abs_from & (inode->i_blkbits - 1));
 
 	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
@@ -794,10 +794,10 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	}
 
 	/* Get the offsets within the page that we want to zero */
-	zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
-	zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
+	zero_from = abs_from & (PAGE_SIZE - 1);
+	zero_to = abs_to & (PAGE_SIZE - 1);
 	if (!zero_to)
-		zero_to = PAGE_CACHE_SIZE;
+		zero_to = PAGE_SIZE;
 
 	trace_ocfs2_write_zero_page(
 			(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -851,7 +851,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 
 out_unlock:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 out_commit_trans:
 	if (handle)
 		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
@@ -959,7 +959,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
 	BUG_ON(range_start >= range_end);
 
 	while (zero_pos < range_end) {
-		next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+		next_pos = (zero_pos & PAGE_MASK) + PAGE_SIZE;
 		if (next_pos > range_end)
 			next_pos = range_end;
 		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 9ea081f4e6e4..71545ad4628c 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -65,13 +65,13 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
 	struct inode *inode = file_inode(file);
 	struct address_space *mapping = inode->i_mapping;
 	loff_t pos = page_offset(page);
-	unsigned int len = PAGE_CACHE_SIZE;
+	unsigned int len = PAGE_SIZE;
 	pgoff_t last_index;
 	struct page *locked_page = NULL;
 	void *fsdata;
 	loff_t size = i_size_read(inode);
 
-	last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+	last_index = (size - 1) >> PAGE_SHIFT;
 
 	/*
 	 * There are cases that lead to the page no longer bebongs to the
@@ -102,7 +102,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
 	 * because the "write" would invalidate their data.
 	 */
 	if (page->index == last_index)
-		len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
+		len = ((size - 1) & ~PAGE_MASK) + 1;
 
 	ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP,
 				       &locked_page, &fsdata, di_bh, page);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6cf6538a0651..e63af7ddfe68 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -822,10 +822,10 @@ static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
 	u32 clusters = pg_index;
 	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
 
-	if (unlikely(PAGE_CACHE_SHIFT > cbits))
-		clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
-	else if (PAGE_CACHE_SHIFT < cbits)
-		clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
+	if (unlikely(PAGE_SHIFT > cbits))
+		clusters = pg_index << (PAGE_SHIFT - cbits);
+	else if (PAGE_SHIFT < cbits)
+		clusters = pg_index >> (cbits - PAGE_SHIFT);
 
 	return clusters;
 }
@@ -839,10 +839,10 @@ static inline pgoff_t ocfs2_align_clusters_to_page_index(struct super_block *sb,
 	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
         pgoff_t index = clusters;
 
-	if (PAGE_CACHE_SHIFT > cbits) {
-		index = (pgoff_t)clusters >> (PAGE_CACHE_SHIFT - cbits);
-	} else if (PAGE_CACHE_SHIFT < cbits) {
-		index = (pgoff_t)clusters << (cbits - PAGE_CACHE_SHIFT);
+	if (PAGE_SHIFT > cbits) {
+		index = (pgoff_t)clusters >> (PAGE_SHIFT - cbits);
+	} else if (PAGE_SHIFT < cbits) {
+		index = (pgoff_t)clusters << (cbits - PAGE_SHIFT);
 	}
 
 	return index;
@@ -853,8 +853,8 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
 	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
 	unsigned int pages_per_cluster = 1;
 
-	if (PAGE_CACHE_SHIFT < cbits)
-		pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
+	if (PAGE_SHIFT < cbits)
+		pages_per_cluster = 1 << (cbits - PAGE_SHIFT);
 
 	return pages_per_cluster;
 }
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3eff031aaf26..881242c3a8d2 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2937,16 +2937,16 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 		end = i_size_read(inode);
 
 	while (offset < end) {
-		page_index = offset >> PAGE_CACHE_SHIFT;
-		map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
+		page_index = offset >> PAGE_SHIFT;
+		map_end = ((loff_t)page_index + 1) << PAGE_SHIFT;
 		if (map_end > end)
 			map_end = end;
 
 		/* from, to is the offset within the page. */
-		from = offset & (PAGE_CACHE_SIZE - 1);
-		to = PAGE_CACHE_SIZE;
-		if (map_end & (PAGE_CACHE_SIZE - 1))
-			to = map_end & (PAGE_CACHE_SIZE - 1);
+		from = offset & (PAGE_SIZE - 1);
+		to = PAGE_SIZE;
+		if (map_end & (PAGE_SIZE - 1))
+			to = map_end & (PAGE_SIZE - 1);
 
 		page = find_or_create_page(mapping, page_index, GFP_NOFS);
 		if (!page) {
@@ -2959,7 +2959,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 		 * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page
 		 * can't be dirtied before we CoW it out.
 		 */
-		if (PAGE_CACHE_SIZE <= OCFS2_SB(sb)->s_clustersize)
+		if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize)
 			BUG_ON(PageDirty(page));
 
 		if (!PageUptodate(page)) {
@@ -2987,7 +2987,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 		mark_page_accessed(page);
 unlock:
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		page = NULL;
 		offset = map_end;
 		if (ret)
@@ -3165,8 +3165,8 @@ int ocfs2_cow_sync_writeback(struct super_block *sb,
 	}
 
 	while (offset < end) {
-		page_index = offset >> PAGE_CACHE_SHIFT;
-		map_end = ((loff_t)page_index + 1) << PAGE_CACHE_SHIFT;
+		page_index = offset >> PAGE_SHIFT;
+		map_end = ((loff_t)page_index + 1) << PAGE_SHIFT;
 		if (map_end > end)
 			map_end = end;
 
@@ -3182,7 +3182,7 @@ int ocfs2_cow_sync_writeback(struct super_block *sb,
 			mark_page_accessed(page);
 
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		page = NULL;
 		offset = map_end;
 		if (ret)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7db631e1c8b0..d7cae3327de5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -605,8 +605,8 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
 	/*
 	 * We might be limited by page cache size.
 	 */
-	if (bytes > PAGE_CACHE_SIZE) {
-		bytes = PAGE_CACHE_SIZE;
+	if (bytes > PAGE_SIZE) {
+		bytes = PAGE_SIZE;
 		trim = 1;
 		/*
 		 * Shift by 31 here so that we don't get larger than
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 2382e267b49e..0166faabf8f2 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -18,8 +18,8 @@ static int read_one_page(struct page *page)
 	int max_block;
 	ssize_t bytes_read = 0;
 	struct inode *inode = page->mapping->host;
-	const __u32 blocksize = PAGE_CACHE_SIZE;	/* inode->i_blksize */
-	const __u32 blockbits = PAGE_CACHE_SHIFT;	/* inode->i_blkbits */
+	const __u32 blocksize = PAGE_SIZE;	/* inode->i_blksize */
+	const __u32 blockbits = PAGE_SHIFT;	/* inode->i_blkbits */
 	struct iov_iter to;
 	struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE};
 
@@ -86,7 +86,7 @@ static int orangefs_readpages(struct file *file,
 				"failure adding page to cache, read_one_page returned: %d\n",
 				ret);
 	      } else {
-			page_cache_release(page);
+			put_page(page);
 	      }
 	}
 	BUG_ON(!list_empty(pages));
@@ -328,7 +328,7 @@ static int orangefs_init_iops(struct inode *inode)
 	case S_IFREG:
 		inode->i_op = &orangefs_file_inode_operations;
 		inode->i_fop = &orangefs_file_operations;
-		inode->i_blkbits = PAGE_CACHE_SHIFT;
+		inode->i_blkbits = PAGE_SHIFT;
 		break;
 	case S_IFLNK:
 		inode->i_op = &orangefs_symlink_inode_operations;
@@ -456,7 +456,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	inode->i_size = PAGE_CACHE_SIZE;
+	inode->i_size = PAGE_SIZE;
 	inode->i_rdev = dev;
 
 	error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref);
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index 1f8acc9f9a88..75375e90a63f 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -170,7 +170,7 @@ orangefs_bufmap_unmap(struct orangefs_bufmap *bufmap)
 	int i;
 
 	for (i = 0; i < bufmap->page_count; i++)
-		page_cache_release(bufmap->page_array[i]);
+		put_page(bufmap->page_array[i]);
 }
 
 static void
@@ -299,7 +299,7 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap,
 
 		for (i = 0; i < ret; i++) {
 			SetPageError(bufmap->page_array[i]);
-			page_cache_release(bufmap->page_array[i]);
+			put_page(bufmap->page_array[i]);
 		}
 		return -ENOMEM;
 	}
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 40f5163b56aa..8277aba65e87 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -303,7 +303,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
 		}
 		break;
 	case S_IFDIR:
-		inode->i_size = PAGE_CACHE_SIZE;
+		inode->i_size = PAGE_SIZE;
 		orangefs_inode->blksize = (1 << inode->i_blkbits);
 		spin_lock(&inode->i_lock);
 		inode_set_bytes(inode, inode->i_size);
diff --git a/fs/pipe.c b/fs/pipe.c
index ab8dad3ccb6a..0d3f5165cb0b 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -134,7 +134,7 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 	if (page_count(page) == 1 && !pipe->tmp_page)
 		pipe->tmp_page = page;
 	else
-		page_cache_release(page);
+		put_page(page);
 }
 
 /**
@@ -180,7 +180,7 @@ EXPORT_SYMBOL(generic_pipe_buf_steal);
  */
 void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
-	page_cache_get(buf->page);
+	get_page(buf->page);
 }
 EXPORT_SYMBOL(generic_pipe_buf_get);
 
@@ -211,7 +211,7 @@ EXPORT_SYMBOL(generic_pipe_buf_confirm);
 void generic_pipe_buf_release(struct pipe_inode_info *pipe,
 			      struct pipe_buffer *buf)
 {
-	page_cache_release(buf->page);
+	put_page(buf->page);
 }
 EXPORT_SYMBOL(generic_pipe_buf_release);
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9df431642042..229cb546bee0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -553,7 +553,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 		if (radix_tree_exceptional_entry(page))
 			mss->swap += PAGE_SIZE;
 		else
-			page_cache_release(page);
+			put_page(page);
 
 		return;
 	}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 55bb57e6a30d..8afe10cf7df8 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -279,12 +279,12 @@ static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!page)
 		return VM_FAULT_OOM;
 	if (!PageUptodate(page)) {
-		offset = (loff_t) index << PAGE_CACHE_SHIFT;
+		offset = (loff_t) index << PAGE_SHIFT;
 		buf = __va((page_to_pfn(page) << PAGE_SHIFT));
 		rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0);
 		if (rc < 0) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			return (rc == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
 		}
 		SetPageUptodate(page);
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index dc645b66cd79..45d6110744cb 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -420,8 +420,8 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent)
 	pstore_sb = sb;
 
 	sb->s_maxbytes		= MAX_LFS_FILESIZE;
-	sb->s_blocksize		= PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits	= PAGE_CACHE_SHIFT;
+	sb->s_blocksize		= PAGE_SIZE;
+	sb->s_blocksize_bits	= PAGE_SHIFT;
 	sb->s_magic		= PSTOREFS_MAGIC;
 	sb->s_op		= &pstore_ops;
 	sb->s_time_gran		= 1;
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index e1f37278cf97..144ceda4948e 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -35,9 +35,9 @@ static struct page *qnx6_get_page(struct inode *dir, unsigned long n)
 static unsigned last_entry(struct inode *inode, unsigned long page_nr)
 {
 	unsigned long last_byte = inode->i_size;
-	last_byte -= page_nr << PAGE_CACHE_SHIFT;
-	if (last_byte > PAGE_CACHE_SIZE)
-		last_byte = PAGE_CACHE_SIZE;
+	last_byte -= page_nr << PAGE_SHIFT;
+	if (last_byte > PAGE_SIZE)
+		last_byte = PAGE_SIZE;
 	return last_byte / QNX6_DIR_ENTRY_SIZE;
 }
 
@@ -47,9 +47,9 @@ static struct qnx6_long_filename *qnx6_longname(struct super_block *sb,
 {
 	struct qnx6_sb_info *sbi = QNX6_SB(sb);
 	u32 s = fs32_to_cpu(sbi, de->de_long_inode); /* in block units */
-	u32 n = s >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits); /* in pages */
+	u32 n = s >> (PAGE_SHIFT - sb->s_blocksize_bits); /* in pages */
 	/* within page */
-	u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_CACHE_MASK;
+	u32 offs = (s << sb->s_blocksize_bits) & ~PAGE_MASK;
 	struct address_space *mapping = sbi->longfile->i_mapping;
 	struct page *page = read_mapping_page(mapping, n, NULL);
 	if (IS_ERR(page))
@@ -115,8 +115,8 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx)
 	struct qnx6_sb_info *sbi = QNX6_SB(s);
 	loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1);
 	unsigned long npages = dir_pages(inode);
-	unsigned long n = pos >> PAGE_CACHE_SHIFT;
-	unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE;
+	unsigned long n = pos >> PAGE_SHIFT;
+	unsigned start = (pos & ~PAGE_MASK) / QNX6_DIR_ENTRY_SIZE;
 	bool done = false;
 
 	ctx->pos = pos;
@@ -131,7 +131,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx)
 
 		if (IS_ERR(page)) {
 			pr_err("%s(): read failed\n", __func__);
-			ctx->pos = (n + 1) << PAGE_CACHE_SHIFT;
+			ctx->pos = (n + 1) << PAGE_SHIFT;
 			return PTR_ERR(page);
 		}
 		de = ((struct qnx6_dir_entry *)page_address(page)) + start;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 47bb1de07155..1192422a1c56 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -542,8 +542,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
 		iget_failed(inode);
 		return ERR_PTR(-EIO);
 	}
-	n = (ino - 1) >> (PAGE_CACHE_SHIFT - QNX6_INODE_SIZE_BITS);
-	offs = (ino - 1) & (~PAGE_CACHE_MASK >> QNX6_INODE_SIZE_BITS);
+	n = (ino - 1) >> (PAGE_SHIFT - QNX6_INODE_SIZE_BITS);
+	offs = (ino - 1) & (~PAGE_MASK >> QNX6_INODE_SIZE_BITS);
 	mapping = sbi->inodes->i_mapping;
 	page = read_mapping_page(mapping, n, NULL);
 	if (IS_ERR(page)) {
diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h
index d3fb2b698800..f23b5c4a66ad 100644
--- a/fs/qnx6/qnx6.h
+++ b/fs/qnx6/qnx6.h
@@ -128,7 +128,7 @@ extern struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s,
 static inline void qnx6_put_page(struct page *page)
 {
 	kunmap(page);
-	page_cache_release(page);
+	put_page(page);
 }
 
 extern unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 38981b037524..1ab6e6c2e60e 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -223,8 +223,8 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent)
 		return err;
 
 	sb->s_maxbytes		= MAX_LFS_FILESIZE;
-	sb->s_blocksize		= PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits	= PAGE_CACHE_SHIFT;
+	sb->s_blocksize		= PAGE_SIZE;
+	sb->s_blocksize_bits	= PAGE_SHIFT;
 	sb->s_magic		= RAMFS_MAGIC;
 	sb->s_op		= &ramfs_ops;
 	sb->s_time_gran		= 1;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9424a4ba93a9..389773711de4 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -180,11 +180,11 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
 	int partial = 0;
 	unsigned blocksize;
 	struct buffer_head *bh, *head;
-	unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	unsigned long i_size_index = inode->i_size >> PAGE_SHIFT;
 	int new;
 	int logit = reiserfs_file_data_log(inode);
 	struct super_block *s = inode->i_sb;
-	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
+	int bh_per_page = PAGE_SIZE / s->s_blocksize;
 	struct reiserfs_transaction_handle th;
 	int ret = 0;
 
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ae9e5b308cf9..d5c2e9c865de 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -386,7 +386,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
 		goto finished;
 	}
 	/* read file tail into part of page */
-	offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
+	offset = (cpu_key_k_offset(&key) - 1) & (PAGE_SIZE - 1);
 	copy_item_head(&tmp_ih, ih);
 
 	/*
@@ -587,10 +587,10 @@ static int convert_tail_for_hole(struct inode *inode,
 		return -EIO;
 
 	/* always try to read until the end of the block */
-	tail_start = tail_offset & (PAGE_CACHE_SIZE - 1);
+	tail_start = tail_offset & (PAGE_SIZE - 1);
 	tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
 
-	index = tail_offset >> PAGE_CACHE_SHIFT;
+	index = tail_offset >> PAGE_SHIFT;
 	/*
 	 * hole_page can be zero in case of direct_io, we are sure
 	 * that we cannot get here if we write with O_DIRECT into tail page
@@ -629,7 +629,7 @@ static int convert_tail_for_hole(struct inode *inode,
 unlock:
 	if (tail_page != hole_page) {
 		unlock_page(tail_page);
-		page_cache_release(tail_page);
+		put_page(tail_page);
 	}
 out:
 	return retval;
@@ -2189,11 +2189,11 @@ static int grab_tail_page(struct inode *inode,
 	 * we want the page with the last byte in the file,
 	 * not the page that will hold the next byte for appending
 	 */
-	unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+	unsigned long index = (inode->i_size - 1) >> PAGE_SHIFT;
 	unsigned long pos = 0;
 	unsigned long start = 0;
 	unsigned long blocksize = inode->i_sb->s_blocksize;
-	unsigned long offset = (inode->i_size) & (PAGE_CACHE_SIZE - 1);
+	unsigned long offset = (inode->i_size) & (PAGE_SIZE - 1);
 	struct buffer_head *bh;
 	struct buffer_head *head;
 	struct page *page;
@@ -2251,7 +2251,7 @@ out:
 
 unlock:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	return error;
 }
 
@@ -2265,7 +2265,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
 {
 	struct reiserfs_transaction_handle th;
 	/* we want the offset for the first byte after the end of the file */
-	unsigned long offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+	unsigned long offset = inode->i_size & (PAGE_SIZE - 1);
 	unsigned blocksize = inode->i_sb->s_blocksize;
 	unsigned length;
 	struct page *page = NULL;
@@ -2345,7 +2345,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
 			}
 		}
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 
 	reiserfs_write_unlock(inode->i_sb);
@@ -2354,7 +2354,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
 out:
 	if (page) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 
 	reiserfs_write_unlock(inode->i_sb);
@@ -2426,7 +2426,7 @@ research:
 	} else if (is_direct_le_ih(ih)) {
 		char *p;
 		p = page_address(bh_result->b_page);
-		p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1);
+		p += (byte_offset - 1) & (PAGE_SIZE - 1);
 		copy_size = ih_item_len(ih) - pos_in_item;
 
 		fs_gen = get_generation(inode->i_sb);
@@ -2525,7 +2525,7 @@ static int reiserfs_write_full_page(struct page *page,
 				    struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
-	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = inode->i_size >> PAGE_SHIFT;
 	int error = 0;
 	unsigned long block;
 	sector_t last_block;
@@ -2535,7 +2535,7 @@ static int reiserfs_write_full_page(struct page *page,
 	int checked = PageChecked(page);
 	struct reiserfs_transaction_handle th;
 	struct super_block *s = inode->i_sb;
-	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
+	int bh_per_page = PAGE_SIZE / s->s_blocksize;
 	th.t_trans_id = 0;
 
 	/* no logging allowed when nonblocking or from PF_MEMALLOC */
@@ -2564,16 +2564,16 @@ static int reiserfs_write_full_page(struct page *page,
 	if (page->index >= end_index) {
 		unsigned last_offset;
 
-		last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+		last_offset = inode->i_size & (PAGE_SIZE - 1);
 		/* no file contents in this page */
 		if (page->index >= end_index + 1 || !last_offset) {
 			unlock_page(page);
 			return 0;
 		}
-		zero_user_segment(page, last_offset, PAGE_CACHE_SIZE);
+		zero_user_segment(page, last_offset, PAGE_SIZE);
 	}
 	bh = head;
-	block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
+	block = page->index << (PAGE_SHIFT - s->s_blocksize_bits);
 	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
 	/* first map all the buffers, logging any direct items we find */
 	do {
@@ -2774,7 +2774,7 @@ static int reiserfs_write_begin(struct file *file,
 		*fsdata = (void *)(unsigned long)flags;
 	}
 
-	index = pos >> PAGE_CACHE_SHIFT;
+	index = pos >> PAGE_SHIFT;
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
@@ -2822,7 +2822,7 @@ static int reiserfs_write_begin(struct file *file,
 	}
 	if (ret) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		/* Truncate allocated blocks */
 		reiserfs_truncate_failed_write(inode);
 	}
@@ -2909,7 +2909,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
 	else
 		th = NULL;
 
-	start = pos & (PAGE_CACHE_SIZE - 1);
+	start = pos & (PAGE_SIZE - 1);
 	if (unlikely(copied < len)) {
 		if (!PageUptodate(page))
 			copied = 0;
@@ -2974,7 +2974,7 @@ out:
 	if (locked)
 		reiserfs_write_unlock(inode->i_sb);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	if (pos + len > inode->i_size)
 		reiserfs_truncate_failed_write(inode);
@@ -2996,7 +2996,7 @@ int reiserfs_commit_write(struct file *f, struct page *page,
 			  unsigned from, unsigned to)
 {
 	struct inode *inode = page->mapping->host;
-	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to;
+	loff_t pos = ((loff_t) page->index << PAGE_SHIFT) + to;
 	int ret = 0;
 	int update_sd = 0;
 	struct reiserfs_transaction_handle *th = NULL;
@@ -3181,7 +3181,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
 	struct inode *inode = page->mapping->host;
 	unsigned int curr_off = 0;
 	unsigned int stop = offset + length;
-	int partial_page = (offset || length < PAGE_CACHE_SIZE);
+	int partial_page = (offset || length < PAGE_SIZE);
 	int ret = 1;
 
 	BUG_ON(!PageLocked(page));
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 036a1fc0a8c3..57045f423893 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -203,7 +203,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 	 * __reiserfs_write_begin on that page.  This will force a
 	 * reiserfs_get_block to unpack the tail for us.
 	 */
-	index = inode->i_size >> PAGE_CACHE_SHIFT;
+	index = inode->i_size >> PAGE_SHIFT;
 	mapping = inode->i_mapping;
 	page = grab_cache_page(mapping, index);
 	retval = -ENOMEM;
@@ -221,7 +221,7 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
 
 out_unlock:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 out:
 	inode_unlock(inode);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 44c2bdced1c8..8c6487238bb3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -605,12 +605,12 @@ static void release_buffer_page(struct buffer_head *bh)
 {
 	struct page *page = bh->b_page;
 	if (!page->mapping && trylock_page(page)) {
-		page_cache_get(page);
+		get_page(page);
 		put_bh(bh);
 		if (!page->mapping)
 			try_to_free_buffers(page);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	} else {
 		put_bh(bh);
 	}
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 24cbe013240f..5feacd689241 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1342,7 +1342,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
 		 */
 
 		data = kmap_atomic(un_bh->b_page);
-		off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
+		off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_SIZE - 1));
 		memcpy(data + off,
 		       ih_item_body(PATH_PLAST_BUFFER(path), &s_ih),
 		       ret_value);
@@ -1511,7 +1511,7 @@ static void unmap_buffers(struct page *page, loff_t pos)
 
 	if (page) {
 		if (page_has_buffers(page)) {
-			tail_index = pos & (PAGE_CACHE_SIZE - 1);
+			tail_index = pos & (PAGE_SIZE - 1);
 			cur_index = 0;
 			head = page_buffers(page);
 			bh = head;
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index f41e19b4bb42..2d5489b0a269 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -151,7 +151,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
 	 */
 	if (up_to_date_bh) {
 		unsigned pgoff =
-		    (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1);
+		    (tail_offset + total_tail - 1) & (PAGE_SIZE - 1);
 		char *kaddr = kmap_atomic(up_to_date_bh->b_page);
 		memset(kaddr + pgoff, 0, blk_size - total_tail);
 		kunmap_atomic(kaddr);
@@ -271,7 +271,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
 	 * the page was locked and this part of the page was up to date when
 	 * indirect2direct was called, so we know the bytes are still valid
 	 */
-	tail = tail + (pos & (PAGE_CACHE_SIZE - 1));
+	tail = tail + (pos & (PAGE_SIZE - 1));
 
 	PATH_LAST_POSITION(path)++;
 
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 57e0b2310532..28f5f8b11370 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -415,7 +415,7 @@ out:
 static inline void reiserfs_put_page(struct page *page)
 {
 	kunmap(page);
-	page_cache_release(page);
+	put_page(page);
 }
 
 static struct page *reiserfs_get_page(struct inode *dir, size_t n)
@@ -427,7 +427,7 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n)
 	 * and an unlink/rmdir has just occurred - GFP_NOFS avoids this
 	 */
 	mapping_set_gfp_mask(mapping, GFP_NOFS);
-	page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL);
+	page = read_mapping_page(mapping, n >> PAGE_SHIFT, NULL);
 	if (!IS_ERR(page)) {
 		kmap(page);
 		if (PageError(page))
@@ -526,10 +526,10 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 	while (buffer_pos < buffer_size || buffer_pos == 0) {
 		size_t chunk;
 		size_t skip = 0;
-		size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1));
+		size_t page_offset = (file_pos & (PAGE_SIZE - 1));
 
-		if (buffer_size - buffer_pos > PAGE_CACHE_SIZE)
-			chunk = PAGE_CACHE_SIZE;
+		if (buffer_size - buffer_pos > PAGE_SIZE)
+			chunk = PAGE_SIZE;
 		else
 			chunk = buffer_size - buffer_pos;
 
@@ -546,8 +546,8 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 			struct reiserfs_xattr_header *rxh;
 
 			skip = file_pos = sizeof(struct reiserfs_xattr_header);
-			if (chunk + skip > PAGE_CACHE_SIZE)
-				chunk = PAGE_CACHE_SIZE - skip;
+			if (chunk + skip > PAGE_SIZE)
+				chunk = PAGE_SIZE - skip;
 			rxh = (struct reiserfs_xattr_header *)data;
 			rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC);
 			rxh->h_hash = cpu_to_le32(xahash);
@@ -675,8 +675,8 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
 		char *data;
 		size_t skip = 0;
 
-		if (isize - file_pos > PAGE_CACHE_SIZE)
-			chunk = PAGE_CACHE_SIZE;
+		if (isize - file_pos > PAGE_SIZE)
+			chunk = PAGE_SIZE;
 		else
 			chunk = isize - file_pos;
 
diff --git a/fs/splice.c b/fs/splice.c
index 9947b5c69664..b018eb485019 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -88,7 +88,7 @@ out_unlock:
 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
 					struct pipe_buffer *buf)
 {
-	page_cache_release(buf->page);
+	put_page(buf->page);
 	buf->flags &= ~PIPE_BUF_FLAG_LRU;
 }
 
@@ -268,7 +268,7 @@ EXPORT_SYMBOL_GPL(splice_to_pipe);
 
 void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
 {
-	page_cache_release(spd->pages[i]);
+	put_page(spd->pages[i]);
 }
 
 /*
@@ -328,9 +328,9 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	if (splice_grow_spd(pipe, &spd))
 		return -ENOMEM;
 
-	index = *ppos >> PAGE_CACHE_SHIFT;
-	loff = *ppos & ~PAGE_CACHE_MASK;
-	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	index = *ppos >> PAGE_SHIFT;
+	loff = *ppos & ~PAGE_MASK;
+	req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	nr_pages = min(req_pages, spd.nr_pages_max);
 
 	/*
@@ -365,7 +365,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			error = add_to_page_cache_lru(page, mapping, index,
 				   mapping_gfp_constraint(mapping, GFP_KERNEL));
 			if (unlikely(error)) {
-				page_cache_release(page);
+				put_page(page);
 				if (error == -EEXIST)
 					continue;
 				break;
@@ -385,7 +385,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	 * Now loop over the map and see if we need to start IO on any
 	 * pages, fill in the partial map, etc.
 	 */
-	index = *ppos >> PAGE_CACHE_SHIFT;
+	index = *ppos >> PAGE_SHIFT;
 	nr_pages = spd.nr_pages;
 	spd.nr_pages = 0;
 	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
@@ -397,7 +397,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		/*
 		 * this_len is the max we'll use from this page
 		 */
-		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
+		this_len = min_t(unsigned long, len, PAGE_SIZE - loff);
 		page = spd.pages[page_nr];
 
 		if (PageReadahead(page))
@@ -426,7 +426,7 @@ retry_lookup:
 					error = -ENOMEM;
 					break;
 				}
-				page_cache_release(spd.pages[page_nr]);
+				put_page(spd.pages[page_nr]);
 				spd.pages[page_nr] = page;
 			}
 			/*
@@ -456,7 +456,7 @@ fill_it:
 		 * i_size must be checked after PageUptodate.
 		 */
 		isize = i_size_read(mapping->host);
-		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+		end_index = (isize - 1) >> PAGE_SHIFT;
 		if (unlikely(!isize || index > end_index))
 			break;
 
@@ -470,7 +470,7 @@ fill_it:
 			/*
 			 * max good bytes in this page
 			 */
-			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+			plen = ((isize - 1) & ~PAGE_MASK) + 1;
 			if (plen <= loff)
 				break;
 
@@ -494,8 +494,8 @@ fill_it:
 	 * we got, 'nr_pages' is how many pages are in the map.
 	 */
 	while (page_nr < nr_pages)
-		page_cache_release(spd.pages[page_nr++]);
-	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+		put_page(spd.pages[page_nr++]);
+	in->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
 
 	if (spd.nr_pages)
 		error = splice_to_pipe(pipe, &spd);
@@ -636,8 +636,8 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 			goto shrink_ret;
 	}
 
-	offset = *ppos & ~PAGE_CACHE_MASK;
-	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	offset = *ppos & ~PAGE_MASK;
+	nr_pages = (len + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) {
 		struct page *page;
@@ -647,7 +647,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 		if (!page)
 			goto err;
 
-		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
+		this_len = min_t(size_t, len, PAGE_SIZE - offset);
 		vec[i].iov_base = (void __user *) page_address(page);
 		vec[i].iov_len = this_len;
 		spd.pages[i] = page;
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 0cea9b9236d0..2c2618410d51 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -181,11 +181,11 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length,
 			in = min(bytes, msblk->devblksize - offset);
 			bytes -= in;
 			while (in) {
-				if (pg_offset == PAGE_CACHE_SIZE) {
+				if (pg_offset == PAGE_SIZE) {
 					data = squashfs_next_page(output);
 					pg_offset = 0;
 				}
-				avail = min_t(int, in, PAGE_CACHE_SIZE -
+				avail = min_t(int, in, PAGE_SIZE -
 						pg_offset);
 				memcpy(data + pg_offset, bh[k]->b_data + offset,
 						avail);
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 1cb70a0b2168..27e501af0e8b 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -255,7 +255,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
 	cache->unused = entries;
 	cache->entries = entries;
 	cache->block_size = block_size;
-	cache->pages = block_size >> PAGE_CACHE_SHIFT;
+	cache->pages = block_size >> PAGE_SHIFT;
 	cache->pages = cache->pages ? cache->pages : 1;
 	cache->name = name;
 	cache->num_waiters = 0;
@@ -275,7 +275,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
 		}
 
 		for (j = 0; j < cache->pages; j++) {
-			entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+			entry->data[j] = kmalloc(PAGE_SIZE, GFP_KERNEL);
 			if (entry->data[j] == NULL) {
 				ERROR("Failed to allocate %s buffer\n", name);
 				goto cleanup;
@@ -314,10 +314,10 @@ int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry,
 		return min(length, entry->length - offset);
 
 	while (offset < entry->length) {
-		void *buff = entry->data[offset / PAGE_CACHE_SIZE]
-				+ (offset % PAGE_CACHE_SIZE);
+		void *buff = entry->data[offset / PAGE_SIZE]
+				+ (offset % PAGE_SIZE);
 		int bytes = min_t(int, entry->length - offset,
-				PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE));
+				PAGE_SIZE - (offset % PAGE_SIZE));
 
 		if (bytes >= remaining) {
 			memcpy(buffer, buff, remaining);
@@ -415,7 +415,7 @@ struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
  */
 void *squashfs_read_table(struct super_block *sb, u64 block, int length)
 {
-	int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	int pages = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	int i, res;
 	void *table, *buffer, **data;
 	struct squashfs_page_actor *actor;
@@ -436,7 +436,7 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length)
 		goto failed2;
 	}
 
-	for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
+	for (i = 0; i < pages; i++, buffer += PAGE_SIZE)
 		data[i] = buffer;
 
 	res = squashfs_read_data(sb, block, length |
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index e9034bf6e5ae..d2bc13636f79 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -102,7 +102,7 @@ static void *get_comp_opts(struct super_block *sb, unsigned short flags)
 	 * Read decompressor specific options from file system if present
 	 */
 	if (SQUASHFS_COMP_OPTS(flags)) {
-		buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+		buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
 		if (buffer == NULL) {
 			comp_opts = ERR_PTR(-ENOMEM);
 			goto out;
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index e5c9689062ba..437de9e89221 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -175,7 +175,7 @@ static long long read_indexes(struct super_block *sb, int n,
 {
 	int err, i;
 	long long block = 0;
-	__le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
+	__le32 *blist = kmalloc(PAGE_SIZE, GFP_KERNEL);
 
 	if (blist == NULL) {
 		ERROR("read_indexes: Failed to allocate block_list\n");
@@ -183,7 +183,7 @@ static long long read_indexes(struct super_block *sb, int n,
 	}
 
 	while (n) {
-		int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2);
+		int blocks = min_t(int, n, PAGE_SIZE >> 2);
 
 		err = squashfs_read_metadata(sb, blist, start_block,
 				offset, blocks << 2);
@@ -377,7 +377,7 @@ void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer,
 	struct inode *inode = page->mapping->host;
 	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
 	void *pageaddr;
-	int i, mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+	int i, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
 	int start_index = page->index & ~mask, end_index = start_index | mask;
 
 	/*
@@ -387,9 +387,9 @@ void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer,
 	 * been called to fill.
 	 */
 	for (i = start_index; i <= end_index && bytes > 0; i++,
-			bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
+			bytes -= PAGE_SIZE, offset += PAGE_SIZE) {
 		struct page *push_page;
-		int avail = buffer ? min_t(int, bytes, PAGE_CACHE_SIZE) : 0;
+		int avail = buffer ? min_t(int, bytes, PAGE_SIZE) : 0;
 
 		TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
 
@@ -404,14 +404,14 @@ void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer,
 
 		pageaddr = kmap_atomic(push_page);
 		squashfs_copy_data(pageaddr, buffer, offset, avail);
-		memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
+		memset(pageaddr + avail, 0, PAGE_SIZE - avail);
 		kunmap_atomic(pageaddr);
 		flush_dcache_page(push_page);
 		SetPageUptodate(push_page);
 skip_page:
 		unlock_page(push_page);
 		if (i != page->index)
-			page_cache_release(push_page);
+			put_page(push_page);
 	}
 }
 
@@ -454,7 +454,7 @@ static int squashfs_readpage(struct file *file, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
-	int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
+	int index = page->index >> (msblk->block_log - PAGE_SHIFT);
 	int file_end = i_size_read(inode) >> msblk->block_log;
 	int res;
 	void *pageaddr;
@@ -462,8 +462,8 @@ static int squashfs_readpage(struct file *file, struct page *page)
 	TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
 				page->index, squashfs_i(inode)->start);
 
-	if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
-					PAGE_CACHE_SHIFT))
+	if (page->index >= ((i_size_read(inode) + PAGE_SIZE - 1) >>
+					PAGE_SHIFT))
 		goto out;
 
 	if (index < file_end || squashfs_i(inode)->fragment_block ==
@@ -487,7 +487,7 @@ error_out:
 	SetPageError(page);
 out:
 	pageaddr = kmap_atomic(page);
-	memset(pageaddr, 0, PAGE_CACHE_SIZE);
+	memset(pageaddr, 0, PAGE_SIZE);
 	kunmap_atomic(pageaddr);
 	flush_dcache_page(page);
 	if (!PageError(page))
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 43e7a7eddac0..cb485d8e0e91 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -30,8 +30,8 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
 	struct inode *inode = target_page->mapping->host;
 	struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
 
-	int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
-	int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+	int file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+	int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
 	int start_index = target_page->index & ~mask;
 	int end_index = start_index | mask;
 	int i, n, pages, missing_pages, bytes, res = -ENOMEM;
@@ -68,7 +68,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
 
 		if (PageUptodate(page[i])) {
 			unlock_page(page[i]);
-			page_cache_release(page[i]);
+			put_page(page[i]);
 			page[i] = NULL;
 			missing_pages++;
 		}
@@ -96,10 +96,10 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
 		goto mark_errored;
 
 	/* Last page may have trailing bytes not filled */
-	bytes = res % PAGE_CACHE_SIZE;
+	bytes = res % PAGE_SIZE;
 	if (bytes) {
 		pageaddr = kmap_atomic(page[pages - 1]);
-		memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
+		memset(pageaddr + bytes, 0, PAGE_SIZE - bytes);
 		kunmap_atomic(pageaddr);
 	}
 
@@ -109,7 +109,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
 		SetPageUptodate(page[i]);
 		unlock_page(page[i]);
 		if (page[i] != target_page)
-			page_cache_release(page[i]);
+			put_page(page[i]);
 	}
 
 	kfree(actor);
@@ -127,7 +127,7 @@ mark_errored:
 		flush_dcache_page(page[i]);
 		SetPageError(page[i]);
 		unlock_page(page[i]);
-		page_cache_release(page[i]);
+		put_page(page[i]);
 	}
 
 out:
@@ -153,21 +153,21 @@ static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
 	}
 
 	for (n = 0; n < pages && bytes > 0; n++,
-			bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
-		int avail = min_t(int, bytes, PAGE_CACHE_SIZE);
+			bytes -= PAGE_SIZE, offset += PAGE_SIZE) {
+		int avail = min_t(int, bytes, PAGE_SIZE);
 
 		if (page[n] == NULL)
 			continue;
 
 		pageaddr = kmap_atomic(page[n]);
 		squashfs_copy_data(pageaddr, buffer, offset, avail);
-		memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
+		memset(pageaddr + avail, 0, PAGE_SIZE - avail);
 		kunmap_atomic(pageaddr);
 		flush_dcache_page(page[n]);
 		SetPageUptodate(page[n]);
 		unlock_page(page[n]);
 		if (page[n] != target_page)
-			page_cache_release(page[n]);
+			put_page(page[n]);
 	}
 
 out:
diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c
index c31e2bc9c081..ff4468bd18b0 100644
--- a/fs/squashfs/lz4_wrapper.c
+++ b/fs/squashfs/lz4_wrapper.c
@@ -117,13 +117,13 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
 	data = squashfs_first_page(output);
 	buff = stream->output;
 	while (data) {
-		if (bytes <= PAGE_CACHE_SIZE) {
+		if (bytes <= PAGE_SIZE) {
 			memcpy(data, buff, bytes);
 			break;
 		}
-		memcpy(data, buff, PAGE_CACHE_SIZE);
-		buff += PAGE_CACHE_SIZE;
-		bytes -= PAGE_CACHE_SIZE;
+		memcpy(data, buff, PAGE_SIZE);
+		buff += PAGE_SIZE;
+		bytes -= PAGE_SIZE;
 		data = squashfs_next_page(output);
 	}
 	squashfs_finish_page(output);
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 244b9fbfff7b..934c17e96590 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -102,13 +102,13 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
 	data = squashfs_first_page(output);
 	buff = stream->output;
 	while (data) {
-		if (bytes <= PAGE_CACHE_SIZE) {
+		if (bytes <= PAGE_SIZE) {
 			memcpy(data, buff, bytes);
 			break;
 		} else {
-			memcpy(data, buff, PAGE_CACHE_SIZE);
-			buff += PAGE_CACHE_SIZE;
-			bytes -= PAGE_CACHE_SIZE;
+			memcpy(data, buff, PAGE_SIZE);
+			buff += PAGE_SIZE;
+			bytes -= PAGE_SIZE;
 			data = squashfs_next_page(output);
 		}
 	}
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
index 5a1c11f56441..9b7b1b6a7892 100644
--- a/fs/squashfs/page_actor.c
+++ b/fs/squashfs/page_actor.c
@@ -48,7 +48,7 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
 	if (actor == NULL)
 		return NULL;
 
-	actor->length = length ? : pages * PAGE_CACHE_SIZE;
+	actor->length = length ? : pages * PAGE_SIZE;
 	actor->buffer = buffer;
 	actor->pages = pages;
 	actor->next_page = 0;
@@ -88,7 +88,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
 	if (actor == NULL)
 		return NULL;
 
-	actor->length = length ? : pages * PAGE_CACHE_SIZE;
+	actor->length = length ? : pages * PAGE_SIZE;
 	actor->page = page;
 	actor->pages = pages;
 	actor->next_page = 0;
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
index 26dd82008b82..98537eab27e2 100644
--- a/fs/squashfs/page_actor.h
+++ b/fs/squashfs/page_actor.h
@@ -24,7 +24,7 @@ static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page,
 	if (actor == NULL)
 		return NULL;
 
-	actor->length = length ? : pages * PAGE_CACHE_SIZE;
+	actor->length = length ? : pages * PAGE_SIZE;
 	actor->page = page;
 	actor->pages = pages;
 	actor->next_page = 0;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 5e79bfa4f260..cf01e15a7b16 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -152,7 +152,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	 * Check the system page size is not larger than the filesystem
 	 * block size (by default 128K).  This is currently not supported.
 	 */
-	if (PAGE_CACHE_SIZE > msblk->block_size) {
+	if (PAGE_SIZE > msblk->block_size) {
 		ERROR("Page size > filesystem block size (%d).  This is "
 			"currently not supported!\n", msblk->block_size);
 		goto failed_mount;
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index dbcc2f54bad4..d688ef42a6a1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -48,10 +48,10 @@ static int squashfs_symlink_readpage(struct file *file, struct page *page)
 	struct inode *inode = page->mapping->host;
 	struct super_block *sb = inode->i_sb;
 	struct squashfs_sb_info *msblk = sb->s_fs_info;
-	int index = page->index << PAGE_CACHE_SHIFT;
+	int index = page->index << PAGE_SHIFT;
 	u64 block = squashfs_i(inode)->start;
 	int offset = squashfs_i(inode)->offset;
-	int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE);
+	int length = min_t(int, i_size_read(inode) - index, PAGE_SIZE);
 	int bytes, copied;
 	void *pageaddr;
 	struct squashfs_cache_entry *entry;
@@ -94,7 +94,7 @@ static int squashfs_symlink_readpage(struct file *file, struct page *page)
 		copied = squashfs_copy_data(pageaddr + bytes, entry, offset,
 								length - bytes);
 		if (copied == length - bytes)
-			memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length);
+			memset(pageaddr + length, 0, PAGE_SIZE - length);
 		else
 			block = entry->next_index;
 		kunmap_atomic(pageaddr);
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index c609624e4b8a..6bfaef73d065 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -141,7 +141,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
 	stream->buf.in_pos = 0;
 	stream->buf.in_size = 0;
 	stream->buf.out_pos = 0;
-	stream->buf.out_size = PAGE_CACHE_SIZE;
+	stream->buf.out_size = PAGE_SIZE;
 	stream->buf.out = squashfs_first_page(output);
 
 	do {
@@ -158,7 +158,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
 			stream->buf.out = squashfs_next_page(output);
 			if (stream->buf.out != NULL) {
 				stream->buf.out_pos = 0;
-				total += PAGE_CACHE_SIZE;
+				total += PAGE_SIZE;
 			}
 		}
 
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 8727caba6882..2ec24d128bce 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -69,7 +69,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
 	int zlib_err, zlib_init = 0, k = 0;
 	z_stream *stream = strm;
 
-	stream->avail_out = PAGE_CACHE_SIZE;
+	stream->avail_out = PAGE_SIZE;
 	stream->next_out = squashfs_first_page(output);
 	stream->avail_in = 0;
 
@@ -85,7 +85,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
 		if (stream->avail_out == 0) {
 			stream->next_out = squashfs_next_page(output);
 			if (stream->next_out != NULL)
-				stream->avail_out = PAGE_CACHE_SIZE;
+				stream->avail_out = PAGE_SIZE;
 		}
 
 		if (!zlib_init) {
diff --git a/fs/sync.c b/fs/sync.c
index dd5d1711c7ac..2a54c1f22035 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -302,7 +302,7 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
 		goto out;
 
 	if (sizeof(pgoff_t) == 4) {
-		if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
+		if (offset >= (0x100000000ULL << PAGE_SHIFT)) {
 			/*
 			 * The range starts outside a 32 bit machine's
 			 * pagecache addressing capabilities.  Let it "succeed"
@@ -310,7 +310,7 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
 			ret = 0;
 			goto out;
 		}
-		if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) {
+		if (endbyte >= (0x100000000ULL << PAGE_SHIFT)) {
 			/*
 			 * Out to EOF
 			 */
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 63c1bcb224ee..c0f0a3e643eb 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -30,7 +30,7 @@ const struct file_operations sysv_dir_operations = {
 static inline void dir_put_page(struct page *page)
 {
 	kunmap(page);
-	page_cache_release(page);
+	put_page(page);
 }
 
 static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len)
@@ -73,8 +73,8 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx)
 	if (pos >= inode->i_size)
 		return 0;
 
-	offset = pos & ~PAGE_CACHE_MASK;
-	n = pos >> PAGE_CACHE_SHIFT;
+	offset = pos & ~PAGE_MASK;
+	n = pos >> PAGE_SHIFT;
 
 	for ( ; n < npages; n++, offset = 0) {
 		char *kaddr, *limit;
@@ -85,7 +85,7 @@ static int sysv_readdir(struct file *file, struct dir_context *ctx)
 			continue;
 		kaddr = (char *)page_address(page);
 		de = (struct sysv_dir_entry *)(kaddr+offset);
-		limit = kaddr + PAGE_CACHE_SIZE - SYSV_DIRSIZE;
+		limit = kaddr + PAGE_SIZE - SYSV_DIRSIZE;
 		for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) {
 			char *name = de->name;
 
@@ -146,7 +146,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
 		if (!IS_ERR(page)) {
 			kaddr = (char*)page_address(page);
 			de = (struct sysv_dir_entry *) kaddr;
-			kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE;
+			kaddr += PAGE_SIZE - SYSV_DIRSIZE;
 			for ( ; (char *) de <= kaddr ; de++) {
 				if (!de->inode)
 					continue;
@@ -190,7 +190,7 @@ int sysv_add_link(struct dentry *dentry, struct inode *inode)
 			goto out;
 		kaddr = (char*)page_address(page);
 		de = (struct sysv_dir_entry *)kaddr;
-		kaddr += PAGE_CACHE_SIZE - SYSV_DIRSIZE;
+		kaddr += PAGE_SIZE - SYSV_DIRSIZE;
 		while ((char *)de <= kaddr) {
 			if (!de->inode)
 				goto got_it;
@@ -261,7 +261,7 @@ int sysv_make_empty(struct inode *inode, struct inode *dir)
 	kmap(page);
 
 	base = (char*)page_address(page);
-	memset(base, 0, PAGE_CACHE_SIZE);
+	memset(base, 0, PAGE_SIZE);
 
 	de = (struct sysv_dir_entry *) base;
 	de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
@@ -273,7 +273,7 @@ int sysv_make_empty(struct inode *inode, struct inode *dir)
 	kunmap(page);
 	err = dir_commit_chunk(page, 0, 2 * SYSV_DIRSIZE);
 fail:
-	page_cache_release(page);
+	put_page(page);
 	return err;
 }
 
@@ -296,7 +296,7 @@ int sysv_empty_dir(struct inode * inode)
 
 		kaddr = (char *)page_address(page);
 		de = (struct sysv_dir_entry *)kaddr;
-		kaddr += PAGE_CACHE_SIZE-SYSV_DIRSIZE;
+		kaddr += PAGE_SIZE-SYSV_DIRSIZE;
 
 		for ( ;(char *)de <= kaddr; de++) {
 			if (!de->inode)
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 11e83ed0b4bf..90b60c03b588 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -264,11 +264,11 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
 out_dir:
 	if (dir_de) {
 		kunmap(dir_page);
-		page_cache_release(dir_page);
+		put_page(dir_page);
 	}
 out_old:
 	kunmap(old_page);
-	page_cache_release(old_page);
+	put_page(old_page);
 out:
 	return err;
 }
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 065c88f8e4b8..1a9c6640e604 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -121,7 +121,7 @@ static int do_readpage(struct page *page)
 	if (block >= beyond) {
 		/* Reading beyond inode */
 		SetPageChecked(page);
-		memset(addr, 0, PAGE_CACHE_SIZE);
+		memset(addr, 0, PAGE_SIZE);
 		goto out;
 	}
 
@@ -223,7 +223,7 @@ static int write_begin_slow(struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t index = pos >> PAGE_SHIFT;
 	struct ubifs_budget_req req = { .new_page = 1 };
 	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
 	struct page *page;
@@ -254,13 +254,13 @@ static int write_begin_slow(struct address_space *mapping,
 	}
 
 	if (!PageUptodate(page)) {
-		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+		if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE)
 			SetPageChecked(page);
 		else {
 			err = do_readpage(page);
 			if (err) {
 				unlock_page(page);
-				page_cache_release(page);
+				put_page(page);
 				ubifs_release_budget(c, &req);
 				return err;
 			}
@@ -428,7 +428,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	struct ubifs_inode *ui = ubifs_inode(inode);
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t index = pos >> PAGE_SHIFT;
 	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
 	int skipped_read = 0;
 	struct page *page;
@@ -446,7 +446,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 
 	if (!PageUptodate(page)) {
 		/* The page is not loaded from the flash */
-		if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
+		if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) {
 			/*
 			 * We change whole page so no need to load it. But we
 			 * do not know whether this page exists on the media or
@@ -462,7 +462,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 			err = do_readpage(page);
 			if (err) {
 				unlock_page(page);
-				page_cache_release(page);
+				put_page(page);
 				return err;
 			}
 		}
@@ -494,7 +494,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 			mutex_unlock(&ui->ui_mutex);
 		}
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 
 		return write_begin_slow(mapping, pos, len, pagep, flags);
 	}
@@ -549,7 +549,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
 	dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld",
 		inode->i_ino, pos, page->index, len, copied, inode->i_size);
 
-	if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) {
+	if (unlikely(copied < len && len == PAGE_SIZE)) {
 		/*
 		 * VFS copied less data to the page that it intended and
 		 * declared in its '->write_begin()' call via the @len
@@ -593,7 +593,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
 
 out:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	return copied;
 }
 
@@ -621,10 +621,10 @@ static int populate_page(struct ubifs_info *c, struct page *page,
 
 	addr = zaddr = kmap(page);
 
-	end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+	end_index = (i_size - 1) >> PAGE_SHIFT;
 	if (!i_size || page->index > end_index) {
 		hole = 1;
-		memset(addr, 0, PAGE_CACHE_SIZE);
+		memset(addr, 0, PAGE_SIZE);
 		goto out_hole;
 	}
 
@@ -673,7 +673,7 @@ static int populate_page(struct ubifs_info *c, struct page *page,
 	}
 
 	if (end_index == page->index) {
-		int len = i_size & (PAGE_CACHE_SIZE - 1);
+		int len = i_size & (PAGE_SIZE - 1);
 
 		if (len && len < read)
 			memset(zaddr + len, 0, read - len);
@@ -773,7 +773,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
 	isize = i_size_read(inode);
 	if (isize == 0)
 		goto out_free;
-	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+	end_index = ((isize - 1) >> PAGE_SHIFT);
 
 	for (page_idx = 1; page_idx < page_cnt; page_idx++) {
 		pgoff_t page_offset = offset + page_idx;
@@ -788,7 +788,7 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
 		if (!PageUptodate(page))
 			err = populate_page(c, page, bu, &n);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		if (err)
 			break;
 	}
@@ -905,7 +905,7 @@ static int do_writepage(struct page *page, int len)
 #ifdef UBIFS_DEBUG
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	spin_lock(&ui->ui_lock);
-	ubifs_assert(page->index <= ui->synced_i_size >> PAGE_CACHE_SHIFT);
+	ubifs_assert(page->index <= ui->synced_i_size >> PAGE_SHIFT);
 	spin_unlock(&ui->ui_lock);
 #endif
 
@@ -1001,8 +1001,8 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
 	struct inode *inode = page->mapping->host;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	loff_t i_size =  i_size_read(inode), synced_i_size;
-	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-	int err, len = i_size & (PAGE_CACHE_SIZE - 1);
+	pgoff_t end_index = i_size >> PAGE_SHIFT;
+	int err, len = i_size & (PAGE_SIZE - 1);
 	void *kaddr;
 
 	dbg_gen("ino %lu, pg %lu, pg flags %#lx",
@@ -1021,7 +1021,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
 
 	/* Is the page fully inside @i_size? */
 	if (page->index < end_index) {
-		if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
+		if (page->index >= synced_i_size >> PAGE_SHIFT) {
 			err = inode->i_sb->s_op->write_inode(inode, NULL);
 			if (err)
 				goto out_unlock;
@@ -1034,7 +1034,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
 			 * with this.
 			 */
 		}
-		return do_writepage(page, PAGE_CACHE_SIZE);
+		return do_writepage(page, PAGE_SIZE);
 	}
 
 	/*
@@ -1045,7 +1045,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
 	 * writes to that region are not written out to the file."
 	 */
 	kaddr = kmap_atomic(page);
-	memset(kaddr + len, 0, PAGE_CACHE_SIZE - len);
+	memset(kaddr + len, 0, PAGE_SIZE - len);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr);
 
@@ -1138,7 +1138,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 	truncate_setsize(inode, new_size);
 
 	if (offset) {
-		pgoff_t index = new_size >> PAGE_CACHE_SHIFT;
+		pgoff_t index = new_size >> PAGE_SHIFT;
 		struct page *page;
 
 		page = find_lock_page(inode->i_mapping, index);
@@ -1157,9 +1157,9 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 				clear_page_dirty_for_io(page);
 				if (UBIFS_BLOCKS_PER_PAGE_SHIFT)
 					offset = new_size &
-						 (PAGE_CACHE_SIZE - 1);
+						 (PAGE_SIZE - 1);
 				err = do_writepage(page, offset);
-				page_cache_release(page);
+				put_page(page);
 				if (err)
 					goto out_budg;
 				/*
@@ -1173,7 +1173,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 				 * having to read it.
 				 */
 				unlock_page(page);
-				page_cache_release(page);
+				put_page(page);
 			}
 		}
 	}
@@ -1285,7 +1285,7 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset,
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 
 	ubifs_assert(PagePrivate(page));
-	if (offset || length < PAGE_CACHE_SIZE)
+	if (offset || length < PAGE_SIZE)
 		/* Partial page remains dirty */
 		return;
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index a233ba913be4..20daea9aa657 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2240,9 +2240,9 @@ static int __init ubifs_init(void)
 	 * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
 	 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
 	 */
-	if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) {
+	if (PAGE_SIZE < UBIFS_BLOCK_SIZE) {
 		pr_err("UBIFS error (pid %d): VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes",
-		       current->pid, (unsigned int)PAGE_CACHE_SIZE);
+		       current->pid, (unsigned int)PAGE_SIZE);
 		return -EINVAL;
 	}
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c2a57e193a81..4cd7e569cd00 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -46,8 +46,8 @@
 #define UBIFS_SUPER_MAGIC 0x24051905
 
 /* Number of UBIFS blocks per VFS page */
-#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE)
-#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT)
+#define UBIFS_BLOCKS_PER_PAGE (PAGE_SIZE / UBIFS_BLOCK_SIZE)
+#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_SHIFT - UBIFS_BLOCK_SHIFT)
 
 /* "File system end of life" sequence number watermark */
 #define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 1af98963d860..877ba1c9b461 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -46,7 +46,7 @@ static void __udf_adinicb_readpage(struct page *page)
 
 	kaddr = kmap(page);
 	memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size);
-	memset(kaddr + inode->i_size, 0, PAGE_CACHE_SIZE - inode->i_size);
+	memset(kaddr + inode->i_size, 0, PAGE_SIZE - inode->i_size);
 	flush_dcache_page(page);
 	SetPageUptodate(page);
 	kunmap(page);
@@ -87,14 +87,14 @@ static int udf_adinicb_write_begin(struct file *file,
 {
 	struct page *page;
 
-	if (WARN_ON_ONCE(pos >= PAGE_CACHE_SIZE))
+	if (WARN_ON_ONCE(pos >= PAGE_SIZE))
 		return -EIO;
 	page = grab_cache_page_write_begin(mapping, 0, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
 
-	if (!PageUptodate(page) && len != PAGE_CACHE_SIZE)
+	if (!PageUptodate(page) && len != PAGE_SIZE)
 		__udf_adinicb_readpage(page);
 	return 0;
 }
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 166d3ed32c39..2dc461eeb415 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -287,7 +287,7 @@ int udf_expand_file_adinicb(struct inode *inode)
 	if (!PageUptodate(page)) {
 		kaddr = kmap(page);
 		memset(kaddr + iinfo->i_lenAlloc, 0x00,
-		       PAGE_CACHE_SIZE - iinfo->i_lenAlloc);
+		       PAGE_SIZE - iinfo->i_lenAlloc);
 		memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr,
 			iinfo->i_lenAlloc);
 		flush_dcache_page(page);
@@ -319,7 +319,7 @@ int udf_expand_file_adinicb(struct inode *inode)
 		inode->i_data.a_ops = &udf_adinicb_aops;
 		up_write(&iinfo->i_data_sem);
 	}
-	page_cache_release(page);
+	put_page(page);
 	mark_inode_dirty(inode);
 
 	return err;
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index dc5fae601c24..0447b949c7f5 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -237,7 +237,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
 			       sector_t newb, struct page *locked_page)
 {
 	const unsigned blks_per_page =
-		1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		1 << (PAGE_SHIFT - inode->i_blkbits);
 	const unsigned mask = blks_per_page - 1;
 	struct address_space * const mapping = inode->i_mapping;
 	pgoff_t index, cur_index, last_index;
@@ -255,9 +255,9 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg,
 
 	cur_index = locked_page->index;
 	end = count + beg;
-	last_index = end >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	last_index = end >> (PAGE_SHIFT - inode->i_blkbits);
 	for (i = beg; i < end; i = (i | mask) + 1) {
-		index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+		index = i >> (PAGE_SHIFT - inode->i_blkbits);
 
 		if (likely(cur_index != index)) {
 			page = ufs_get_locked_page(mapping, index);
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 74f2e80288bf..0b1457292734 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -62,7 +62,7 @@ static int ufs_commit_chunk(struct page *page, loff_t pos, unsigned len)
 static inline void ufs_put_page(struct page *page)
 {
 	kunmap(page);
-	page_cache_release(page);
+	put_page(page);
 }
 
 ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
@@ -111,13 +111,13 @@ static void ufs_check_page(struct page *page)
 	struct super_block *sb = dir->i_sb;
 	char *kaddr = page_address(page);
 	unsigned offs, rec_len;
-	unsigned limit = PAGE_CACHE_SIZE;
+	unsigned limit = PAGE_SIZE;
 	const unsigned chunk_mask = UFS_SB(sb)->s_uspi->s_dirblksize - 1;
 	struct ufs_dir_entry *p;
 	char *error;
 
-	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
-		limit = dir->i_size & ~PAGE_CACHE_MASK;
+	if ((dir->i_size >> PAGE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_MASK;
 		if (limit & chunk_mask)
 			goto Ebadsize;
 		if (!limit)
@@ -170,7 +170,7 @@ Einumber:
 bad_entry:
 	ufs_error (sb, "ufs_check_page", "bad entry in directory #%lu: %s - "
 		   "offset=%lu, rec_len=%d, name_len=%d",
-		   dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		   dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
 		   rec_len, ufs_get_de_namlen(sb, p));
 	goto fail;
 Eend:
@@ -178,7 +178,7 @@ Eend:
 	ufs_error(sb, __func__,
 		   "entry in directory #%lu spans the page boundary"
 		   "offset=%lu",
-		   dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs);
+		   dir->i_ino, (page->index<<PAGE_SHIFT)+offs);
 fail:
 	SetPageChecked(page);
 	SetPageError(page);
@@ -211,9 +211,9 @@ ufs_last_byte(struct inode *inode, unsigned long page_nr)
 {
 	unsigned last_byte = inode->i_size;
 
-	last_byte -= page_nr << PAGE_CACHE_SHIFT;
-	if (last_byte > PAGE_CACHE_SIZE)
-		last_byte = PAGE_CACHE_SIZE;
+	last_byte -= page_nr << PAGE_SHIFT;
+	if (last_byte > PAGE_SIZE)
+		last_byte = PAGE_SIZE;
 	return last_byte;
 }
 
@@ -341,7 +341,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode)
 		kaddr = page_address(page);
 		dir_end = kaddr + ufs_last_byte(dir, n);
 		de = (struct ufs_dir_entry *)kaddr;
-		kaddr += PAGE_CACHE_SIZE - reclen;
+		kaddr += PAGE_SIZE - reclen;
 		while ((char *)de <= kaddr) {
 			if ((char *)de == dir_end) {
 				/* We hit i_size */
@@ -432,8 +432,8 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
 	loff_t pos = ctx->pos;
 	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	unsigned int offset = pos & ~PAGE_CACHE_MASK;
-	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned int offset = pos & ~PAGE_MASK;
+	unsigned long n = pos >> PAGE_SHIFT;
 	unsigned long npages = dir_pages(inode);
 	unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1);
 	int need_revalidate = file->f_version != inode->i_version;
@@ -454,14 +454,14 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
 			ufs_error(sb, __func__,
 				  "bad page in #%lu",
 				  inode->i_ino);
-			ctx->pos += PAGE_CACHE_SIZE - offset;
+			ctx->pos += PAGE_SIZE - offset;
 			return -EIO;
 		}
 		kaddr = page_address(page);
 		if (unlikely(need_revalidate)) {
 			if (offset) {
 				offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask);
-				ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset;
+				ctx->pos = (n<<PAGE_SHIFT) + offset;
 			}
 			file->f_version = inode->i_version;
 			need_revalidate = 0;
@@ -574,7 +574,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
 
 	kmap(page);
 	base = (char*)page_address(page);
-	memset(base, 0, PAGE_CACHE_SIZE);
+	memset(base, 0, PAGE_SIZE);
 
 	de = (struct ufs_dir_entry *) base;
 
@@ -594,7 +594,7 @@ int ufs_make_empty(struct inode * inode, struct inode *dir)
 
 	err = ufs_commit_chunk(page, 0, chunk_size);
 fail:
-	page_cache_release(page);
+	put_page(page);
 	return err;
 }
 
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index d897e169ab9c..9f49431e798d 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -1051,13 +1051,13 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size)
 	lastfrag--;
 
 	lastpage = ufs_get_locked_page(mapping, lastfrag >>
-				       (PAGE_CACHE_SHIFT - inode->i_blkbits));
+				       (PAGE_SHIFT - inode->i_blkbits));
        if (IS_ERR(lastpage)) {
                err = -EIO;
                goto out;
        }
 
-       end = lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1);
+       end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1);
        bh = page_buffers(lastpage);
        for (i = 0; i < end; ++i)
                bh = bh->b_this_page;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index acf4a3b61b81..a1559f762805 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -305,7 +305,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0);
 		else {
 			kunmap(dir_page);
-			page_cache_release(dir_page);
+			put_page(dir_page);
 		}
 		inode_dec_link_count(old_dir);
 	}
@@ -315,11 +315,11 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 out_dir:
 	if (dir_de) {
 		kunmap(dir_page);
-		page_cache_release(dir_page);
+		put_page(dir_page);
 	}
 out_old:
 	kunmap(old_page);
-	page_cache_release(old_page);
+	put_page(old_page);
 out:
 	return err;
 }
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index b6c2f94e041e..a409e3e7827a 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -261,14 +261,14 @@ struct page *ufs_get_locked_page(struct address_space *mapping,
 		if (unlikely(page->mapping == NULL)) {
 			/* Truncate got there first */
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			page = NULL;
 			goto out;
 		}
 
 		if (!PageUptodate(page) || PageError(page)) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 
 			printk(KERN_ERR "ufs_change_blocknr: "
 			       "can not read page: ino %lu, index: %lu\n",
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 954175928240..b7fbf53dbc81 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -283,7 +283,7 @@ extern struct page *ufs_get_locked_page(struct address_space *mapping,
 static inline void ufs_put_locked_page(struct page *page)
 {
        unlock_page(page);
-       page_cache_release(page);
+       put_page(page);
 }
 
 
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 041b6948aecc..ce41d7fe753c 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3742,11 +3742,11 @@ xfs_bmap_btalloc(
 		args.prod = align;
 		if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
 			args.mod = (xfs_extlen_t)(args.prod - args.mod);
-	} else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) {
+	} else if (mp->m_sb.sb_blocksize >= PAGE_SIZE) {
 		args.prod = 1;
 		args.mod = 0;
 	} else {
-		args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog;
+		args.prod = PAGE_SIZE >> mp->m_sb.sb_blocklog;
 		if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod))))
 			args.mod = (xfs_extlen_t)(args.prod - args.mod);
 	}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index d445a64b979e..5b8ae03ba855 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -704,7 +704,7 @@ next_buffer:
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 out_invalidate:
-	xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
 	return;
 }
 
@@ -925,9 +925,9 @@ xfs_do_writepage(
 	 * ---------------------------------^------------------|
 	 */
 	offset = i_size_read(inode);
-	end_index = offset >> PAGE_CACHE_SHIFT;
+	end_index = offset >> PAGE_SHIFT;
 	if (page->index < end_index)
-		end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
+		end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
 	else {
 		/*
 		 * Check whether the page to write out is beyond or straddles
@@ -940,7 +940,7 @@ xfs_do_writepage(
 		 * |				    |      Straddles     |
 		 * ---------------------------------^-----------|--------|
 		 */
-		unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
+		unsigned offset_into_page = offset & (PAGE_SIZE - 1);
 
 		/*
 		 * Skip the page if it is fully outside i_size, e.g. due to a
@@ -971,7 +971,7 @@ xfs_do_writepage(
 		 * memory is zeroed when mapped, and writes to that region are
 		 * not written out to the file."
 		 */
-		zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
+		zero_user_segment(page, offset_into_page, PAGE_SIZE);
 
 		/* Adjust the end_offset to the end of file */
 		end_offset = offset;
@@ -1475,7 +1475,7 @@ xfs_vm_write_failed(
 	loff_t			block_offset;
 	loff_t			block_start;
 	loff_t			block_end;
-	loff_t			from = pos & (PAGE_CACHE_SIZE - 1);
+	loff_t			from = pos & (PAGE_SIZE - 1);
 	loff_t			to = from + len;
 	struct buffer_head	*bh, *head;
 	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
@@ -1491,7 +1491,7 @@ xfs_vm_write_failed(
 	 * start of the page by using shifts rather than masks the mismatch
 	 * problem.
 	 */
-	block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT;
+	block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;
 
 	ASSERT(block_offset + from == pos);
 
@@ -1558,7 +1558,7 @@ xfs_vm_write_begin(
 	struct page		**pagep,
 	void			**fsdata)
 {
-	pgoff_t			index = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t			index = pos >> PAGE_SHIFT;
 	struct page		*page;
 	int			status;
 	struct xfs_mount	*mp = XFS_I(mapping->host)->i_mount;
@@ -1592,7 +1592,7 @@ xfs_vm_write_begin(
 			truncate_pagecache_range(inode, start, pos + len);
 		}
 
-		page_cache_release(page);
+		put_page(page);
 		page = NULL;
 	}
 
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index a32c1dcae2ff..3b6309865c65 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1237,7 +1237,7 @@ xfs_free_file_space(
 	/* wait for the completion of any pending DIOs */
 	inode_dio_wait(VFS_I(ip));
 
-	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
 	ioffset = round_down(offset, rounding);
 	iendoffset = round_up(offset + len, rounding) - 1;
 	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
@@ -1466,7 +1466,7 @@ xfs_shift_file_space(
 	if (error)
 		return error;
 	error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
-					offset >> PAGE_CACHE_SHIFT, -1);
+					offset >> PAGE_SHIFT, -1);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ac0fd32de31e..569938a4a357 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -106,8 +106,8 @@ xfs_iozero(
 		unsigned offset, bytes;
 		void *fsdata;
 
-		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
-		bytes = PAGE_CACHE_SIZE - offset;
+		offset = (pos & (PAGE_SIZE -1)); /* Within page */
+		bytes = PAGE_SIZE - offset;
 		if (bytes > count)
 			bytes = count;
 
@@ -799,8 +799,8 @@ xfs_file_dio_aio_write(
 	/* see generic_file_direct_write() for why this is necessary */
 	if (mapping->nrpages) {
 		invalidate_inode_pages2_range(mapping,
-					      pos >> PAGE_CACHE_SHIFT,
-					      end >> PAGE_CACHE_SHIFT);
+					      pos >> PAGE_SHIFT,
+					      end >> PAGE_SHIFT);
 	}
 
 	if (ret > 0) {
@@ -1207,9 +1207,9 @@ xfs_find_get_desired_pgoff(
 
 	pagevec_init(&pvec, 0);
 
-	index = startoff >> PAGE_CACHE_SHIFT;
+	index = startoff >> PAGE_SHIFT;
 	endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount);
-	end = endoff >> PAGE_CACHE_SHIFT;
+	end = endoff >> PAGE_SHIFT;
 	do {
 		int		want;
 		unsigned	nr_pages;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index ec0e239a0fa9..a8192dc797dc 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -135,7 +135,7 @@ typedef __u32			xfs_nlink_t;
  * Size of block device i/o is parameterized here.
  * Currently the system supports page-sized i/o.
  */
-#define	BLKDEV_IOSHIFT		PAGE_CACHE_SHIFT
+#define	BLKDEV_IOSHIFT		PAGE_SHIFT
 #define	BLKDEV_IOSIZE		(1<<BLKDEV_IOSHIFT)
 /* number of BB's per block device block */
 #define	BLKDEV_BB		BTOBB(BLKDEV_IOSIZE)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 536a0ee9cd5a..cfd4210dd015 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -171,7 +171,7 @@ xfs_sb_validate_fsb_count(
 	ASSERT(sbp->sb_blocklog >= BBSHIFT);
 
 	/* Limited by ULONG_MAX of page cache index */
-	if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
+	if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
 		return -EFBIG;
 	return 0;
 }
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index bac6b3435591..eafe257b357a 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -231,12 +231,12 @@ static inline unsigned long
 xfs_preferred_iosize(xfs_mount_t *mp)
 {
 	if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)
-		return PAGE_CACHE_SIZE;
+		return PAGE_SIZE;
 	return (mp->m_swidth ?
 		(mp->m_swidth << mp->m_sb.sb_blocklog) :
 		((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ?
 			(1 << (int)MAX(mp->m_readio_log, mp->m_writeio_log)) :
-			PAGE_CACHE_SIZE));
+			PAGE_SIZE));
 }
 
 #define XFS_LAST_UNMOUNT_WAS_CLEAN(mp)	\
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index ade236e90bb3..51ddaf2c2b8c 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -293,8 +293,8 @@ xfs_fs_commit_blocks(
 		 * Make sure reads through the pagecache see the new data.
 		 */
 		error = invalidate_inode_pages2_range(inode->i_mapping,
-					start >> PAGE_CACHE_SHIFT,
-					(end - 1) >> PAGE_CACHE_SHIFT);
+					start >> PAGE_SHIFT,
+					(end - 1) >> PAGE_SHIFT);
 		WARN_ON_ONCE(error);
 
 		error = xfs_iomap_write_unwritten(ip, start, length);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d760934109b5..6082f54e0557 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -570,10 +570,10 @@ xfs_max_file_offset(
 #if BITS_PER_LONG == 32
 # if defined(CONFIG_LBDAF)
 	ASSERT(sizeof(sector_t) == 8);
-	pagefactor = PAGE_CACHE_SIZE;
+	pagefactor = PAGE_SIZE;
 	bitshift = BITS_PER_LONG;
 # else
-	pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
+	pagefactor = PAGE_SIZE >> (PAGE_SHIFT - blockshift);
 # endif
 #endif
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 88bc64f00bb5..6b7481f62218 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -41,7 +41,7 @@
 #endif
 
 #define BIO_MAX_PAGES		256
-#define BIO_MAX_SIZE		(BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
+#define BIO_MAX_SIZE		(BIO_MAX_PAGES << PAGE_SHIFT)
 #define BIO_MAX_SECTORS		(BIO_MAX_SIZE >> 9)
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7e5d7e018bea..669e419d6234 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1372,7 +1372,7 @@ unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *);
 
 static inline void put_dev_sector(Sector p)
 {
-	page_cache_release(p.v);
+	put_page(p.v);
 }
 
 static inline bool __bvec_gap_to_prev(struct request_queue *q,
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index c67f052cc5e5..d48daa3f6f20 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -43,7 +43,7 @@ enum bh_state_bits {
 			 */
 };
 
-#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512)
+#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
 
 struct page;
 struct buffer_head;
@@ -263,7 +263,7 @@ void buffer_init(void);
 static inline void attach_page_buffers(struct page *page,
 		struct buffer_head *head)
 {
-	page_cache_get(page);
+	get_page(page);
 	SetPagePrivate(page);
 	set_page_private(page, (unsigned long)head);
 }
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index e7975e4681e1..db92a8d4926e 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -176,8 +176,8 @@ extern void ceph_put_snap_context(struct ceph_snap_context *sc);
  */
 static inline int calc_pages_for(u64 off, u64 len)
 {
-	return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
-		(off >> PAGE_CACHE_SHIFT);
+	return ((off+len+PAGE_SIZE-1) >> PAGE_SHIFT) -
+		(off >> PAGE_SHIFT);
 }
 
 extern struct kmem_cache *ceph_inode_cachep;
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index 9eb215a155e0..b90e9bdbd1dd 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -262,7 +262,7 @@ struct f2fs_node {
 /*
  * For NAT entries
  */
-#define NAT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_nat_entry))
+#define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry))
 
 struct f2fs_nat_entry {
 	__u8 version;		/* latest version of cached nat entry */
@@ -282,7 +282,7 @@ struct f2fs_nat_block {
  * Not allow to change this.
  */
 #define SIT_VBLOCK_MAP_SIZE 64
-#define SIT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_sit_entry))
+#define SIT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_sit_entry))
 
 /*
  * Note that f2fs_sit_entry->vblocks has the following bit-field information.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 14a97194b34b..304991a80e23 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -929,7 +929,7 @@ static inline struct file *get_file(struct file *f)
 /* Page cache limit. The filesystems should put that into their s_maxbytes 
    limits, otherwise bad things can happen in VM. */ 
 #if BITS_PER_LONG==32
-#define MAX_LFS_FILESIZE	(((loff_t)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) 
+#define MAX_LFS_FILESIZE	(((loff_t)PAGE_SIZE << (BITS_PER_LONG-1))-1)
 #elif BITS_PER_LONG==64
 #define MAX_LFS_FILESIZE 	((loff_t)0x7fffffffffffffffLL)
 #endif
@@ -2067,7 +2067,7 @@ extern int generic_update_time(struct inode *, struct timespec *, int);
 /* /sys/fs */
 extern struct kobject *fs_kobj;
 
-#define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
+#define MAX_RW_COUNT (INT_MAX & PAGE_MASK)
 
 #ifdef CONFIG_MANDATORY_FILE_LOCKING
 extern int locks_mandatory_locked(struct file *);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index f2f650f136ee..efada239205e 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -184,7 +184,7 @@ nfs_list_entry(struct list_head *head)
 static inline
 loff_t req_offset(struct nfs_page *req)
 {
-	return (((loff_t)req->wb_index) << PAGE_CACHE_SHIFT) + req->wb_offset;
+	return (((loff_t)req->wb_index) << PAGE_SHIFT) + req->wb_offset;
 }
 
 #endif /* _LINUX_NFS_PAGE_H */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 1ebd65c91422..f396ccb900cc 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -390,13 +390,13 @@ static inline pgoff_t page_to_pgoff(struct page *page)
 		return page->index << compound_order(page);
 
 	if (likely(!PageTransTail(page)))
-		return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+		return page->index;
 
 	/*
 	 *  We don't initialize ->index for tail pages: calculate based on
 	 *  head page
 	 */
-	pgoff = compound_head(page)->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	pgoff = compound_head(page)->index;
 	pgoff += page - compound_head(page);
 	return pgoff;
 }
@@ -406,12 +406,12 @@ static inline pgoff_t page_to_pgoff(struct page *page)
  */
 static inline loff_t page_offset(struct page *page)
 {
-	return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
+	return ((loff_t)page->index) << PAGE_SHIFT;
 }
 
 static inline loff_t page_file_offset(struct page *page)
 {
-	return ((loff_t)page_file_index(page)) << PAGE_CACHE_SHIFT;
+	return ((loff_t)page_file_index(page)) << PAGE_SHIFT;
 }
 
 extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
@@ -425,7 +425,7 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 		return linear_hugepage_index(vma, address);
 	pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
 	pgoff += vma->vm_pgoff;
-	return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	return pgoff;
 }
 
 extern void __lock_page(struct page *page);
@@ -671,8 +671,8 @@ static inline int add_to_page_cache(struct page *page,
 
 static inline unsigned long dir_pages(struct inode *inode)
 {
-	return (unsigned long)(inode->i_size + PAGE_CACHE_SIZE - 1) >>
-			       PAGE_CACHE_SHIFT;
+	return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >>
+			       PAGE_SHIFT;
 }
 
 #endif /* _LINUX_PAGEMAP_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d18b65c53dbb..3d980ea1c946 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -435,7 +435,7 @@ struct backing_dev_info;
 /* only sparc can not include linux/pagemap.h in this file
  * so leave page_cache_release and release_pages undeclared... */
 #define free_page_and_swap_cache(page) \
-	page_cache_release(page)
+	put_page(page)
 #define free_pages_and_swap_cache(pages, nr) \
 	release_pages((pages), (nr), false);
 
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 781c1399c6a3..ade739f67f1d 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -307,8 +307,8 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *inode;
 	struct ipc_namespace *ns = data;
 
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = MQUEUE_MAGIC;
 	sb->s_op = &mqueue_super_ops;
 
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 220fc17b9718..7edc95edfaee 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -321,7 +321,7 @@ retry:
 	copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
 
 	ret = __replace_page(vma, vaddr, old_page, new_page);
-	page_cache_release(new_page);
+	put_page(new_page);
 put_old:
 	put_page(old_page);
 
@@ -539,14 +539,14 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
 	 * see uprobe_register().
 	 */
 	if (mapping->a_ops->readpage)
-		page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+		page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp);
 	else
-		page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
+		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
 	if (IS_ERR(page))
 		return PTR_ERR(page);
 
 	copy_from_page(page, offset, insn, nbytes);
-	page_cache_release(page);
+	put_page(page);
 
 	return 0;
 }
diff --git a/mm/fadvise.c b/mm/fadvise.c
index b8a5bc66b0c0..b8024fa7101d 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -97,8 +97,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		break;
 	case POSIX_FADV_WILLNEED:
 		/* First and last PARTIAL page! */
-		start_index = offset >> PAGE_CACHE_SHIFT;
-		end_index = endbyte >> PAGE_CACHE_SHIFT;
+		start_index = offset >> PAGE_SHIFT;
+		end_index = endbyte >> PAGE_SHIFT;
 
 		/* Careful about overflow on the "+1" */
 		nrpages = end_index - start_index + 1;
@@ -124,8 +124,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		 * preserved on the expectation that it is better to preserve
 		 * needed memory than to discard unneeded memory.
 		 */
-		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
-		end_index = (endbyte >> PAGE_CACHE_SHIFT);
+		start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT;
+		end_index = (endbyte >> PAGE_SHIFT);
 
 		if (end_index >= start_index) {
 			unsigned long count = invalidate_mapping_pages(mapping,
diff --git a/mm/filemap.c b/mm/filemap.c
index a8c69c8c0a90..f2479af09da9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -265,7 +265,7 @@ void delete_from_page_cache(struct page *page)
 
 	if (freepage)
 		freepage(page);
-	page_cache_release(page);
+	put_page(page);
 }
 EXPORT_SYMBOL(delete_from_page_cache);
 
@@ -352,8 +352,8 @@ EXPORT_SYMBOL(filemap_flush);
 static int __filemap_fdatawait_range(struct address_space *mapping,
 				     loff_t start_byte, loff_t end_byte)
 {
-	pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
-	pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
+	pgoff_t index = start_byte >> PAGE_SHIFT;
+	pgoff_t end = end_byte >> PAGE_SHIFT;
 	struct pagevec pvec;
 	int nr_pages;
 	int ret = 0;
@@ -550,7 +550,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 		pgoff_t offset = old->index;
 		freepage = mapping->a_ops->freepage;
 
-		page_cache_get(new);
+		get_page(new);
 		new->mapping = mapping;
 		new->index = offset;
 
@@ -572,7 +572,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 		radix_tree_preload_end();
 		if (freepage)
 			freepage(old);
-		page_cache_release(old);
+		put_page(old);
 	}
 
 	return error;
@@ -651,7 +651,7 @@ static int __add_to_page_cache_locked(struct page *page,
 		return error;
 	}
 
-	page_cache_get(page);
+	get_page(page);
 	page->mapping = mapping;
 	page->index = offset;
 
@@ -675,7 +675,7 @@ err_insert:
 	spin_unlock_irq(&mapping->tree_lock);
 	if (!huge)
 		mem_cgroup_cancel_charge(page, memcg, false);
-	page_cache_release(page);
+	put_page(page);
 	return error;
 }
 
@@ -1083,7 +1083,7 @@ repeat:
 		 * include/linux/pagemap.h for details.
 		 */
 		if (unlikely(page != *pagep)) {
-			page_cache_release(page);
+			put_page(page);
 			goto repeat;
 		}
 	}
@@ -1121,7 +1121,7 @@ repeat:
 		/* Has the page been truncated? */
 		if (unlikely(page->mapping != mapping)) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			goto repeat;
 		}
 		VM_BUG_ON_PAGE(page->index != offset, page);
@@ -1168,7 +1168,7 @@ repeat:
 	if (fgp_flags & FGP_LOCK) {
 		if (fgp_flags & FGP_NOWAIT) {
 			if (!trylock_page(page)) {
-				page_cache_release(page);
+				put_page(page);
 				return NULL;
 			}
 		} else {
@@ -1178,7 +1178,7 @@ repeat:
 		/* Has the page been truncated? */
 		if (unlikely(page->mapping != mapping)) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			goto repeat;
 		}
 		VM_BUG_ON_PAGE(page->index != offset, page);
@@ -1209,7 +1209,7 @@ no_page:
 		err = add_to_page_cache_lru(page, mapping, offset,
 				gfp_mask & GFP_RECLAIM_MASK);
 		if (unlikely(err)) {
-			page_cache_release(page);
+			put_page(page);
 			page = NULL;
 			if (err == -EEXIST)
 				goto repeat;
@@ -1278,7 +1278,7 @@ repeat:
 
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
-			page_cache_release(page);
+			put_page(page);
 			goto repeat;
 		}
 export:
@@ -1343,7 +1343,7 @@ repeat:
 
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
-			page_cache_release(page);
+			put_page(page);
 			goto repeat;
 		}
 
@@ -1405,7 +1405,7 @@ repeat:
 
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
-			page_cache_release(page);
+			put_page(page);
 			goto repeat;
 		}
 
@@ -1415,7 +1415,7 @@ repeat:
 		 * negatives, which is just confusing to the caller.
 		 */
 		if (page->mapping == NULL || page->index != iter.index) {
-			page_cache_release(page);
+			put_page(page);
 			break;
 		}
 
@@ -1482,7 +1482,7 @@ repeat:
 
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
-			page_cache_release(page);
+			put_page(page);
 			goto repeat;
 		}
 
@@ -1549,7 +1549,7 @@ repeat:
 
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
-			page_cache_release(page);
+			put_page(page);
 			goto repeat;
 		}
 export:
@@ -1610,11 +1610,11 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
 	unsigned int prev_offset;
 	int error = 0;
 
-	index = *ppos >> PAGE_CACHE_SHIFT;
-	prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
-	prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
-	last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
-	offset = *ppos & ~PAGE_CACHE_MASK;
+	index = *ppos >> PAGE_SHIFT;
+	prev_index = ra->prev_pos >> PAGE_SHIFT;
+	prev_offset = ra->prev_pos & (PAGE_SIZE-1);
+	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
+	offset = *ppos & ~PAGE_MASK;
 
 	for (;;) {
 		struct page *page;
@@ -1648,7 +1648,7 @@ find_page:
 			if (PageUptodate(page))
 				goto page_ok;
 
-			if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
+			if (inode->i_blkbits == PAGE_SHIFT ||
 					!mapping->a_ops->is_partially_uptodate)
 				goto page_not_up_to_date;
 			if (!trylock_page(page))
@@ -1672,18 +1672,18 @@ page_ok:
 		 */
 
 		isize = i_size_read(inode);
-		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+		end_index = (isize - 1) >> PAGE_SHIFT;
 		if (unlikely(!isize || index > end_index)) {
-			page_cache_release(page);
+			put_page(page);
 			goto out;
 		}
 
 		/* nr is the maximum number of bytes to copy from this page */
-		nr = PAGE_CACHE_SIZE;
+		nr = PAGE_SIZE;
 		if (index == end_index) {
-			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+			nr = ((isize - 1) & ~PAGE_MASK) + 1;
 			if (nr <= offset) {
-				page_cache_release(page);
+				put_page(page);
 				goto out;
 			}
 		}
@@ -1711,11 +1711,11 @@ page_ok:
 
 		ret = copy_page_to_iter(page, offset, nr, iter);
 		offset += ret;
-		index += offset >> PAGE_CACHE_SHIFT;
-		offset &= ~PAGE_CACHE_MASK;
+		index += offset >> PAGE_SHIFT;
+		offset &= ~PAGE_MASK;
 		prev_offset = offset;
 
-		page_cache_release(page);
+		put_page(page);
 		written += ret;
 		if (!iov_iter_count(iter))
 			goto out;
@@ -1735,7 +1735,7 @@ page_not_up_to_date_locked:
 		/* Did it get truncated before we got the lock? */
 		if (!page->mapping) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			continue;
 		}
 
@@ -1757,7 +1757,7 @@ readpage:
 
 		if (unlikely(error)) {
 			if (error == AOP_TRUNCATED_PAGE) {
-				page_cache_release(page);
+				put_page(page);
 				error = 0;
 				goto find_page;
 			}
@@ -1774,7 +1774,7 @@ readpage:
 					 * invalidate_mapping_pages got it
 					 */
 					unlock_page(page);
-					page_cache_release(page);
+					put_page(page);
 					goto find_page;
 				}
 				unlock_page(page);
@@ -1789,7 +1789,7 @@ readpage:
 
 readpage_error:
 		/* UHHUH! A synchronous read error occurred. Report it */
-		page_cache_release(page);
+		put_page(page);
 		goto out;
 
 no_cached_page:
@@ -1805,7 +1805,7 @@ no_cached_page:
 		error = add_to_page_cache_lru(page, mapping, index,
 				mapping_gfp_constraint(mapping, GFP_KERNEL));
 		if (error) {
-			page_cache_release(page);
+			put_page(page);
 			if (error == -EEXIST) {
 				error = 0;
 				goto find_page;
@@ -1817,10 +1817,10 @@ no_cached_page:
 
 out:
 	ra->prev_pos = prev_index;
-	ra->prev_pos <<= PAGE_CACHE_SHIFT;
+	ra->prev_pos <<= PAGE_SHIFT;
 	ra->prev_pos |= prev_offset;
 
-	*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
+	*ppos = ((loff_t)index << PAGE_SHIFT) + offset;
 	file_accessed(filp);
 	return written ? written : error;
 }
@@ -1912,7 +1912,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
 		else if (ret == -EEXIST)
 			ret = 0; /* losing race to add is OK */
 
-		page_cache_release(page);
+		put_page(page);
 
 	} while (ret == AOP_TRUNCATED_PAGE);
 
@@ -2022,8 +2022,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	loff_t size;
 	int ret = 0;
 
-	size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
-	if (offset >= size >> PAGE_CACHE_SHIFT)
+	size = round_up(i_size_read(inode), PAGE_SIZE);
+	if (offset >= size >> PAGE_SHIFT)
 		return VM_FAULT_SIGBUS;
 
 	/*
@@ -2049,7 +2049,7 @@ retry_find:
 	}
 
 	if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
-		page_cache_release(page);
+		put_page(page);
 		return ret | VM_FAULT_RETRY;
 	}
 
@@ -2072,10 +2072,10 @@ retry_find:
 	 * Found the page and have a reference on it.
 	 * We must recheck i_size under page lock.
 	 */
-	size = round_up(i_size_read(inode), PAGE_CACHE_SIZE);
-	if (unlikely(offset >= size >> PAGE_CACHE_SHIFT)) {
+	size = round_up(i_size_read(inode), PAGE_SIZE);
+	if (unlikely(offset >= size >> PAGE_SHIFT)) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		return VM_FAULT_SIGBUS;
 	}
 
@@ -2120,7 +2120,7 @@ page_not_uptodate:
 		if (!PageUptodate(page))
 			error = -EIO;
 	}
-	page_cache_release(page);
+	put_page(page);
 
 	if (!error || error == AOP_TRUNCATED_PAGE)
 		goto retry_find;
@@ -2164,7 +2164,7 @@ repeat:
 
 		/* Has the page moved? */
 		if (unlikely(page != *slot)) {
-			page_cache_release(page);
+			put_page(page);
 			goto repeat;
 		}
 
@@ -2178,8 +2178,8 @@ repeat:
 		if (page->mapping != mapping || !PageUptodate(page))
 			goto unlock;
 
-		size = round_up(i_size_read(mapping->host), PAGE_CACHE_SIZE);
-		if (page->index >= size >> PAGE_CACHE_SHIFT)
+		size = round_up(i_size_read(mapping->host), PAGE_SIZE);
+		if (page->index >= size >> PAGE_SHIFT)
 			goto unlock;
 
 		pte = vmf->pte + page->index - vmf->pgoff;
@@ -2195,7 +2195,7 @@ repeat:
 unlock:
 		unlock_page(page);
 skip:
-		page_cache_release(page);
+		put_page(page);
 next:
 		if (iter.index == vmf->max_pgoff)
 			break;
@@ -2278,7 +2278,7 @@ static struct page *wait_on_page_read(struct page *page)
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		if (!PageUptodate(page)) {
-			page_cache_release(page);
+			put_page(page);
 			page = ERR_PTR(-EIO);
 		}
 	}
@@ -2301,7 +2301,7 @@ repeat:
 			return ERR_PTR(-ENOMEM);
 		err = add_to_page_cache_lru(page, mapping, index, gfp);
 		if (unlikely(err)) {
-			page_cache_release(page);
+			put_page(page);
 			if (err == -EEXIST)
 				goto repeat;
 			/* Presumably ENOMEM for radix tree node */
@@ -2311,7 +2311,7 @@ repeat:
 filler:
 		err = filler(data, page);
 		if (err < 0) {
-			page_cache_release(page);
+			put_page(page);
 			return ERR_PTR(err);
 		}
 
@@ -2364,7 +2364,7 @@ filler:
 	/* Case c or d, restart the operation */
 	if (!page->mapping) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		goto repeat;
 	}
 
@@ -2511,7 +2511,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 	struct iov_iter data;
 
 	write_len = iov_iter_count(from);
-	end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
+	end = (pos + write_len - 1) >> PAGE_SHIFT;
 
 	written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
 	if (written)
@@ -2525,7 +2525,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 	 */
 	if (mapping->nrpages) {
 		written = invalidate_inode_pages2_range(mapping,
-					pos >> PAGE_CACHE_SHIFT, end);
+					pos >> PAGE_SHIFT, end);
 		/*
 		 * If a page can not be invalidated, return 0 to fall back
 		 * to buffered write.
@@ -2550,7 +2550,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
 	 */
 	if (mapping->nrpages) {
 		invalidate_inode_pages2_range(mapping,
-					      pos >> PAGE_CACHE_SHIFT, end);
+					      pos >> PAGE_SHIFT, end);
 	}
 
 	if (written > 0) {
@@ -2611,8 +2611,8 @@ ssize_t generic_perform_write(struct file *file,
 		size_t copied;		/* Bytes copied from user */
 		void *fsdata;
 
-		offset = (pos & (PAGE_CACHE_SIZE - 1));
-		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+		offset = (pos & (PAGE_SIZE - 1));
+		bytes = min_t(unsigned long, PAGE_SIZE - offset,
 						iov_iter_count(i));
 
 again:
@@ -2665,7 +2665,7 @@ again:
 			 * because not all segments in the iov can be copied at
 			 * once without a pagefault.
 			 */
-			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+			bytes = min_t(unsigned long, PAGE_SIZE - offset,
 						iov_iter_single_seg_count(i));
 			goto again;
 		}
@@ -2752,8 +2752,8 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			iocb->ki_pos = endbyte + 1;
 			written += status;
 			invalidate_mapping_pages(mapping,
-						 pos >> PAGE_CACHE_SHIFT,
-						 endbyte >> PAGE_CACHE_SHIFT);
+						 pos >> PAGE_SHIFT,
+						 endbyte >> PAGE_SHIFT);
 		} else {
 			/*
 			 * We don't know how much we wrote, so just return
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 06058eaa173b..19d0d08b396f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3346,7 +3346,7 @@ retry_avoidcopy:
 			old_page != pagecache_page)
 		outside_reserve = 1;
 
-	page_cache_get(old_page);
+	get_page(old_page);
 
 	/*
 	 * Drop page table lock as buddy allocator may be called. It will
@@ -3364,7 +3364,7 @@ retry_avoidcopy:
 		 * may get SIGKILLed if it later faults.
 		 */
 		if (outside_reserve) {
-			page_cache_release(old_page);
+			put_page(old_page);
 			BUG_ON(huge_pte_none(pte));
 			unmap_ref_private(mm, vma, old_page, address);
 			BUG_ON(huge_pte_none(pte));
@@ -3425,9 +3425,9 @@ retry_avoidcopy:
 	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 out_release_all:
-	page_cache_release(new_page);
+	put_page(new_page);
 out_release_old:
-	page_cache_release(old_page);
+	put_page(old_page);
 
 	spin_lock(ptl); /* Caller expects lock to be held */
 	return ret;
diff --git a/mm/madvise.c b/mm/madvise.c
index a01147359f3b..07427d3fcead 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -170,7 +170,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
 								vma, index);
 		if (page)
-			page_cache_release(page);
+			put_page(page);
 	}
 
 	return 0;
@@ -204,14 +204,14 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
 		page = find_get_entry(mapping, index);
 		if (!radix_tree_exceptional_entry(page)) {
 			if (page)
-				page_cache_release(page);
+				put_page(page);
 			continue;
 		}
 		swap = radix_to_swp_entry(page);
 		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
 								NULL, 0);
 		if (page)
-			page_cache_release(page);
+			put_page(page);
 	}
 
 	lru_add_drain();	/* Push any new pages onto the LRU now */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5a544c6c0717..78f5f2641b91 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -538,7 +538,7 @@ static int delete_from_lru_cache(struct page *p)
 		/*
 		 * drop the page count elevated by isolate_lru_page()
 		 */
-		page_cache_release(p);
+		put_page(p);
 		return 0;
 	}
 	return -EIO;
diff --git a/mm/memory.c b/mm/memory.c
index 098f00d05461..07a420488cda 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2054,7 +2054,7 @@ static inline int wp_page_reuse(struct mm_struct *mm,
 		VM_BUG_ON_PAGE(PageAnon(page), page);
 		mapping = page->mapping;
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 
 		if ((dirtied || page_mkwrite) && mapping) {
 			/*
@@ -2188,7 +2188,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	if (new_page)
-		page_cache_release(new_page);
+		put_page(new_page);
 
 	pte_unmap_unlock(page_table, ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -2203,14 +2203,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
 				munlock_vma_page(old_page);
 			unlock_page(old_page);
 		}
-		page_cache_release(old_page);
+		put_page(old_page);
 	}
 	return page_copied ? VM_FAULT_WRITE : 0;
 oom_free_new:
-	page_cache_release(new_page);
+	put_page(new_page);
 oom:
 	if (old_page)
-		page_cache_release(old_page);
+		put_page(old_page);
 	return VM_FAULT_OOM;
 }
 
@@ -2258,7 +2258,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	int page_mkwrite = 0;
 
-	page_cache_get(old_page);
+	get_page(old_page);
 
 	if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
 		int tmp;
@@ -2267,7 +2267,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
 		tmp = do_page_mkwrite(vma, old_page, address);
 		if (unlikely(!tmp || (tmp &
 				      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
-			page_cache_release(old_page);
+			put_page(old_page);
 			return tmp;
 		}
 		/*
@@ -2281,7 +2281,7 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (!pte_same(*page_table, orig_pte)) {
 			unlock_page(old_page);
 			pte_unmap_unlock(page_table, ptl);
-			page_cache_release(old_page);
+			put_page(old_page);
 			return 0;
 		}
 		page_mkwrite = 1;
@@ -2341,7 +2341,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	if (PageAnon(old_page) && !PageKsm(old_page)) {
 		if (!trylock_page(old_page)) {
-			page_cache_get(old_page);
+			get_page(old_page);
 			pte_unmap_unlock(page_table, ptl);
 			lock_page(old_page);
 			page_table = pte_offset_map_lock(mm, pmd, address,
@@ -2349,10 +2349,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			if (!pte_same(*page_table, orig_pte)) {
 				unlock_page(old_page);
 				pte_unmap_unlock(page_table, ptl);
-				page_cache_release(old_page);
+				put_page(old_page);
 				return 0;
 			}
-			page_cache_release(old_page);
+			put_page(old_page);
 		}
 		if (reuse_swap_page(old_page)) {
 			/*
@@ -2375,7 +2375,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/*
 	 * Ok, we need to copy. Oh, well..
 	 */
-	page_cache_get(old_page);
+	get_page(old_page);
 
 	pte_unmap_unlock(page_table, ptl);
 	return wp_page_copy(mm, vma, address, page_table, pmd,
@@ -2619,7 +2619,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * parallel locked swapcache.
 		 */
 		unlock_page(swapcache);
-		page_cache_release(swapcache);
+		put_page(swapcache);
 	}
 
 	if (flags & FAULT_FLAG_WRITE) {
@@ -2641,10 +2641,10 @@ out_nomap:
 out_page:
 	unlock_page(page);
 out_release:
-	page_cache_release(page);
+	put_page(page);
 	if (page != swapcache) {
 		unlock_page(swapcache);
-		page_cache_release(swapcache);
+		put_page(swapcache);
 	}
 	return ret;
 }
@@ -2752,7 +2752,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (userfaultfd_missing(vma)) {
 		pte_unmap_unlock(page_table, ptl);
 		mem_cgroup_cancel_charge(page, memcg, false);
-		page_cache_release(page);
+		put_page(page);
 		return handle_userfault(vma, address, flags,
 					VM_UFFD_MISSING);
 	}
@@ -2771,10 +2771,10 @@ unlock:
 	return 0;
 release:
 	mem_cgroup_cancel_charge(page, memcg, false);
-	page_cache_release(page);
+	put_page(page);
 	goto unlock;
 oom_free_page:
-	page_cache_release(page);
+	put_page(page);
 oom:
 	return VM_FAULT_OOM;
 }
@@ -2807,7 +2807,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 	if (unlikely(PageHWPoison(vmf.page))) {
 		if (ret & VM_FAULT_LOCKED)
 			unlock_page(vmf.page);
-		page_cache_release(vmf.page);
+		put_page(vmf.page);
 		return VM_FAULT_HWPOISON;
 	}
 
@@ -2996,7 +2996,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(!pte_same(*pte, orig_pte))) {
 		pte_unmap_unlock(pte, ptl);
 		unlock_page(fault_page);
-		page_cache_release(fault_page);
+		put_page(fault_page);
 		return ret;
 	}
 	do_set_pte(vma, address, fault_page, pte, false, false);
@@ -3024,7 +3024,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 
 	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
-		page_cache_release(new_page);
+		put_page(new_page);
 		return VM_FAULT_OOM;
 	}
 
@@ -3041,7 +3041,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		pte_unmap_unlock(pte, ptl);
 		if (fault_page) {
 			unlock_page(fault_page);
-			page_cache_release(fault_page);
+			put_page(fault_page);
 		} else {
 			/*
 			 * The fault handler has no page to lock, so it holds
@@ -3057,7 +3057,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_unmap_unlock(pte, ptl);
 	if (fault_page) {
 		unlock_page(fault_page);
-		page_cache_release(fault_page);
+		put_page(fault_page);
 	} else {
 		/*
 		 * The fault handler has no page to lock, so it holds
@@ -3068,7 +3068,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return ret;
 uncharge_out:
 	mem_cgroup_cancel_charge(new_page, memcg, false);
-	page_cache_release(new_page);
+	put_page(new_page);
 	return ret;
 }
 
@@ -3096,7 +3096,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		tmp = do_page_mkwrite(vma, fault_page, address);
 		if (unlikely(!tmp ||
 				(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
-			page_cache_release(fault_page);
+			put_page(fault_page);
 			return tmp;
 		}
 	}
@@ -3105,7 +3105,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(!pte_same(*pte, orig_pte))) {
 		pte_unmap_unlock(pte, ptl);
 		unlock_page(fault_page);
-		page_cache_release(fault_page);
+		put_page(fault_page);
 		return ret;
 	}
 	do_set_pte(vma, address, fault_page, pte, true, false);
@@ -3736,7 +3736,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
 						    buf, maddr + offset, bytes);
 			}
 			kunmap(page);
-			page_cache_release(page);
+			put_page(page);
 		}
 		len -= bytes;
 		buf += bytes;
diff --git a/mm/mincore.c b/mm/mincore.c
index 563f32045490..0eed23d50adb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -75,7 +75,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
 #endif
 	if (page) {
 		present = PageUptodate(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 
 	return present;
@@ -226,7 +226,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
 	unsigned char *tmp;
 
 	/* Check the start address: needs to be page-aligned.. */
- 	if (start & ~PAGE_CACHE_MASK)
+	if (start & ~PAGE_MASK)
 		return -EINVAL;
 
 	/* ..and we need to be passed a valid user-space range */
diff --git a/mm/nommu.c b/mm/nommu.c
index de8b6b6580c1..102e257cc6c3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -141,7 +141,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		if (pages) {
 			pages[i] = virt_to_page(start);
 			if (pages[i])
-				page_cache_get(pages[i]);
+				get_page(pages[i]);
 		}
 		if (vmas)
 			vmas[i] = vma;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 11ff8f758631..999792d35ccc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2176,8 +2176,8 @@ int write_cache_pages(struct address_space *mapping,
 			cycled = 0;
 		end = -1;
 	} else {
-		index = wbc->range_start >> PAGE_CACHE_SHIFT;
-		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		index = wbc->range_start >> PAGE_SHIFT;
+		end = wbc->range_end >> PAGE_SHIFT;
 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 			range_whole = 1;
 		cycled = 1; /* ignore range_cyclic tests */
@@ -2382,14 +2382,14 @@ int write_one_page(struct page *page, int wait)
 		wait_on_page_writeback(page);
 
 	if (clear_page_dirty_for_io(page)) {
-		page_cache_get(page);
+		get_page(page);
 		ret = mapping->a_ops->writepage(page, &wbc);
 		if (ret == 0 && wait) {
 			wait_on_page_writeback(page);
 			if (PageError(page))
 				ret = -EIO;
 		}
-		page_cache_release(page);
+		put_page(page);
 	} else {
 		unlock_page(page);
 	}
@@ -2431,7 +2431,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 		__inc_zone_page_state(page, NR_DIRTIED);
 		__inc_wb_stat(wb, WB_RECLAIMABLE);
 		__inc_wb_stat(wb, WB_DIRTIED);
-		task_io_account_write(PAGE_CACHE_SIZE);
+		task_io_account_write(PAGE_SIZE);
 		current->nr_dirtied++;
 		this_cpu_inc(bdp_ratelimits);
 	}
@@ -2450,7 +2450,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
 		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
 		dec_zone_page_state(page, NR_FILE_DIRTY);
 		dec_wb_stat(wb, WB_RECLAIMABLE);
-		task_io_account_cancelled_write(PAGE_CACHE_SIZE);
+		task_io_account_cancelled_write(PAGE_SIZE);
 	}
 }
 
diff --git a/mm/page_io.c b/mm/page_io.c
index 18aac7819cc9..cd92e3d67a32 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -252,7 +252,7 @@ out:
 
 static sector_t swap_page_sector(struct page *page)
 {
-	return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9);
+	return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
 }
 
 int __swap_writepage(struct page *page, struct writeback_control *wbc,
diff --git a/mm/readahead.c b/mm/readahead.c
index 20e58e820e44..40be3ae0afe3 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -47,11 +47,11 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping,
 		if (!trylock_page(page))
 			BUG();
 		page->mapping = mapping;
-		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+		do_invalidatepage(page, 0, PAGE_SIZE);
 		page->mapping = NULL;
 		unlock_page(page);
 	}
-	page_cache_release(page);
+	put_page(page);
 }
 
 /*
@@ -93,14 +93,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
 			read_cache_pages_invalidate_page(mapping, page);
 			continue;
 		}
-		page_cache_release(page);
+		put_page(page);
 
 		ret = filler(data, page);
 		if (unlikely(ret)) {
 			read_cache_pages_invalidate_pages(mapping, pages);
 			break;
 		}
-		task_io_account_read(PAGE_CACHE_SIZE);
+		task_io_account_read(PAGE_SIZE);
 	}
 	return ret;
 }
@@ -130,7 +130,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 				mapping_gfp_constraint(mapping, GFP_KERNEL))) {
 			mapping->a_ops->readpage(filp, page);
 		}
-		page_cache_release(page);
+		put_page(page);
 	}
 	ret = 0;
 
@@ -163,7 +163,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	if (isize == 0)
 		goto out;
 
-	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+	end_index = ((isize - 1) >> PAGE_SHIFT);
 
 	/*
 	 * Preallocate as many pages as we will need.
@@ -216,7 +216,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	while (nr_to_read) {
 		int err;
 
-		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;
+		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;
 
 		if (this_chunk > nr_to_read)
 			this_chunk = nr_to_read;
@@ -425,7 +425,7 @@ ondemand_readahead(struct address_space *mapping,
 	 * trivial case: (offset - prev_offset) == 1
 	 * unaligned reads: (offset - prev_offset) == 0
 	 */
-	prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
+	prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
 	if (offset - prev_offset <= 1UL)
 		goto initial_readahead;
 
@@ -558,8 +558,8 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
 	if (f.file) {
 		if (f.file->f_mode & FMODE_READ) {
 			struct address_space *mapping = f.file->f_mapping;
-			pgoff_t start = offset >> PAGE_CACHE_SHIFT;
-			pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
+			pgoff_t start = offset >> PAGE_SHIFT;
+			pgoff_t end = (offset + count - 1) >> PAGE_SHIFT;
 			unsigned long len = end - start + 1;
 			ret = do_readahead(mapping, f.file, start, len);
 		}
diff --git a/mm/rmap.c b/mm/rmap.c
index c399a0d41b31..525b92f866a7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1555,7 +1555,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 discard:
 	page_remove_rmap(page, PageHuge(page));
-	page_cache_release(page);
+	put_page(page);
 
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
diff --git a/mm/shmem.c b/mm/shmem.c
index 9428c51ab2d6..719bd6b88d98 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -75,8 +75,8 @@ static struct vfsmount *shm_mnt;
 
 #include "internal.h"
 
-#define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
-#define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
+#define BLOCKS_PER_PAGE  (PAGE_SIZE/512)
+#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)
 
 /* Pretend that each entry is of this size in directory's i_size */
 #define BOGO_DIRENT_SIZE 20
@@ -176,13 +176,13 @@ static inline int shmem_reacct_size(unsigned long flags,
 static inline int shmem_acct_block(unsigned long flags)
 {
 	return (flags & VM_NORESERVE) ?
-		security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
+		security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_SIZE)) : 0;
 }
 
 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 {
 	if (flags & VM_NORESERVE)
-		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
+		vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
 }
 
 static const struct super_operations shmem_ops;
@@ -300,7 +300,7 @@ static int shmem_add_to_page_cache(struct page *page,
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 
-	page_cache_get(page);
+	get_page(page);
 	page->mapping = mapping;
 	page->index = index;
 
@@ -318,7 +318,7 @@ static int shmem_add_to_page_cache(struct page *page,
 	} else {
 		page->mapping = NULL;
 		spin_unlock_irq(&mapping->tree_lock);
-		page_cache_release(page);
+		put_page(page);
 	}
 	return error;
 }
@@ -338,7 +338,7 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	__dec_zone_page_state(page, NR_SHMEM);
 	spin_unlock_irq(&mapping->tree_lock);
-	page_cache_release(page);
+	put_page(page);
 	BUG_ON(error);
 }
 
@@ -474,10 +474,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
-	pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
-	unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
-	unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
+	pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	pgoff_t end = (lend + 1) >> PAGE_SHIFT;
+	unsigned int partial_start = lstart & (PAGE_SIZE - 1);
+	unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
 	struct pagevec pvec;
 	pgoff_t indices[PAGEVEC_SIZE];
 	long nr_swaps_freed = 0;
@@ -530,7 +530,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 		struct page *page = NULL;
 		shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
 		if (page) {
-			unsigned int top = PAGE_CACHE_SIZE;
+			unsigned int top = PAGE_SIZE;
 			if (start > end) {
 				top = partial_end;
 				partial_end = 0;
@@ -538,7 +538,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			zero_user_segment(page, partial_start, top);
 			set_page_dirty(page);
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 		}
 	}
 	if (partial_end) {
@@ -548,7 +548,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			zero_user_segment(page, 0, partial_end);
 			set_page_dirty(page);
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 		}
 	}
 	if (start >= end)
@@ -833,7 +833,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 		mem_cgroup_commit_charge(page, memcg, true, false);
 out:
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	return error;
 }
 
@@ -1080,7 +1080,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	if (!newpage)
 		return -ENOMEM;
 
-	page_cache_get(newpage);
+	get_page(newpage);
 	copy_highpage(newpage, oldpage);
 	flush_dcache_page(newpage);
 
@@ -1120,8 +1120,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	set_page_private(oldpage, 0);
 
 	unlock_page(oldpage);
-	page_cache_release(oldpage);
-	page_cache_release(oldpage);
+	put_page(oldpage);
+	put_page(oldpage);
 	return error;
 }
 
@@ -1145,7 +1145,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	int once = 0;
 	int alloced = 0;
 
-	if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
+	if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
 		return -EFBIG;
 repeat:
 	swap.val = 0;
@@ -1156,7 +1156,7 @@ repeat:
 	}
 
 	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
-	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
 		error = -EINVAL;
 		goto unlock;
 	}
@@ -1169,7 +1169,7 @@ repeat:
 		if (sgp != SGP_READ)
 			goto clear;
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		page = NULL;
 	}
 	if (page || (sgp == SGP_READ && !swap.val)) {
@@ -1327,7 +1327,7 @@ clear:
 
 	/* Perhaps the file has been truncated since we checked */
 	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
-	    ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
 		if (alloced) {
 			ClearPageDirty(page);
 			delete_from_page_cache(page);
@@ -1355,7 +1355,7 @@ failed:
 unlock:
 	if (page) {
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	if (error == -ENOSPC && !once++) {
 		info = SHMEM_I(inode);
@@ -1577,7 +1577,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	struct shmem_inode_info *info = SHMEM_I(inode);
-	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	pgoff_t index = pos >> PAGE_SHIFT;
 
 	/* i_mutex is held by caller */
 	if (unlikely(info->seals)) {
@@ -1601,16 +1601,16 @@ shmem_write_end(struct file *file, struct address_space *mapping,
 		i_size_write(inode, pos + copied);
 
 	if (!PageUptodate(page)) {
-		if (copied < PAGE_CACHE_SIZE) {
-			unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+		if (copied < PAGE_SIZE) {
+			unsigned from = pos & (PAGE_SIZE - 1);
 			zero_user_segments(page, 0, from,
-					from + copied, PAGE_CACHE_SIZE);
+					from + copied, PAGE_SIZE);
 		}
 		SetPageUptodate(page);
 	}
 	set_page_dirty(page);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 
 	return copied;
 }
@@ -1635,8 +1635,8 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	if (!iter_is_iovec(to))
 		sgp = SGP_DIRTY;
 
-	index = *ppos >> PAGE_CACHE_SHIFT;
-	offset = *ppos & ~PAGE_CACHE_MASK;
+	index = *ppos >> PAGE_SHIFT;
+	offset = *ppos & ~PAGE_MASK;
 
 	for (;;) {
 		struct page *page = NULL;
@@ -1644,11 +1644,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		unsigned long nr, ret;
 		loff_t i_size = i_size_read(inode);
 
-		end_index = i_size >> PAGE_CACHE_SHIFT;
+		end_index = i_size >> PAGE_SHIFT;
 		if (index > end_index)
 			break;
 		if (index == end_index) {
-			nr = i_size & ~PAGE_CACHE_MASK;
+			nr = i_size & ~PAGE_MASK;
 			if (nr <= offset)
 				break;
 		}
@@ -1666,14 +1666,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		 * We must evaluate after, since reads (unlike writes)
 		 * are called without i_mutex protection against truncate
 		 */
-		nr = PAGE_CACHE_SIZE;
+		nr = PAGE_SIZE;
 		i_size = i_size_read(inode);
-		end_index = i_size >> PAGE_CACHE_SHIFT;
+		end_index = i_size >> PAGE_SHIFT;
 		if (index == end_index) {
-			nr = i_size & ~PAGE_CACHE_MASK;
+			nr = i_size & ~PAGE_MASK;
 			if (nr <= offset) {
 				if (page)
-					page_cache_release(page);
+					put_page(page);
 				break;
 			}
 		}
@@ -1694,7 +1694,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 				mark_page_accessed(page);
 		} else {
 			page = ZERO_PAGE(0);
-			page_cache_get(page);
+			get_page(page);
 		}
 
 		/*
@@ -1704,10 +1704,10 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		ret = copy_page_to_iter(page, offset, nr, to);
 		retval += ret;
 		offset += ret;
-		index += offset >> PAGE_CACHE_SHIFT;
-		offset &= ~PAGE_CACHE_MASK;
+		index += offset >> PAGE_SHIFT;
+		offset &= ~PAGE_MASK;
 
-		page_cache_release(page);
+		put_page(page);
 		if (!iov_iter_count(to))
 			break;
 		if (ret < nr) {
@@ -1717,7 +1717,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 		cond_resched();
 	}
 
-	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+	*ppos = ((loff_t) index << PAGE_SHIFT) + offset;
 	file_accessed(file);
 	return retval ? retval : error;
 }
@@ -1755,9 +1755,9 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	if (splice_grow_spd(pipe, &spd))
 		return -ENOMEM;
 
-	index = *ppos >> PAGE_CACHE_SHIFT;
-	loff = *ppos & ~PAGE_CACHE_MASK;
-	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	index = *ppos >> PAGE_SHIFT;
+	loff = *ppos & ~PAGE_MASK;
+	req_pages = (len + loff + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	nr_pages = min(req_pages, spd.nr_pages_max);
 
 	spd.nr_pages = find_get_pages_contig(mapping, index,
@@ -1774,7 +1774,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 		index++;
 	}
 
-	index = *ppos >> PAGE_CACHE_SHIFT;
+	index = *ppos >> PAGE_SHIFT;
 	nr_pages = spd.nr_pages;
 	spd.nr_pages = 0;
 
@@ -1784,7 +1784,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 		if (!len)
 			break;
 
-		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
+		this_len = min_t(unsigned long, len, PAGE_SIZE - loff);
 		page = spd.pages[page_nr];
 
 		if (!PageUptodate(page) || page->mapping != mapping) {
@@ -1793,19 +1793,19 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 			if (error)
 				break;
 			unlock_page(page);
-			page_cache_release(spd.pages[page_nr]);
+			put_page(spd.pages[page_nr]);
 			spd.pages[page_nr] = page;
 		}
 
 		isize = i_size_read(inode);
-		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+		end_index = (isize - 1) >> PAGE_SHIFT;
 		if (unlikely(!isize || index > end_index))
 			break;
 
 		if (end_index == index) {
 			unsigned int plen;
 
-			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+			plen = ((isize - 1) & ~PAGE_MASK) + 1;
 			if (plen <= loff)
 				break;
 
@@ -1822,7 +1822,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	}
 
 	while (page_nr < nr_pages)
-		page_cache_release(spd.pages[page_nr++]);
+		put_page(spd.pages[page_nr++]);
 
 	if (spd.nr_pages)
 		error = splice_to_pipe(pipe, &spd);
@@ -1904,10 +1904,10 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
 	else if (offset >= inode->i_size)
 		offset = -ENXIO;
 	else {
-		start = offset >> PAGE_CACHE_SHIFT;
-		end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		start = offset >> PAGE_SHIFT;
+		end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		new_offset = shmem_seek_hole_data(mapping, start, end, whence);
-		new_offset <<= PAGE_CACHE_SHIFT;
+		new_offset <<= PAGE_SHIFT;
 		if (new_offset > offset) {
 			if (new_offset < inode->i_size)
 				offset = new_offset;
@@ -2203,8 +2203,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		goto out;
 	}
 
-	start = offset >> PAGE_CACHE_SHIFT;
-	end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	start = offset >> PAGE_SHIFT;
+	end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	/* Try to avoid a swapstorm if len is impossible to satisfy */
 	if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
 		error = -ENOSPC;
@@ -2237,8 +2237,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		if (error) {
 			/* Remove the !PageUptodate pages we added */
 			shmem_undo_range(inode,
-				(loff_t)start << PAGE_CACHE_SHIFT,
-				(loff_t)index << PAGE_CACHE_SHIFT, true);
+				(loff_t)start << PAGE_SHIFT,
+				(loff_t)index << PAGE_SHIFT, true);
 			goto undone;
 		}
 
@@ -2259,7 +2259,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		 */
 		set_page_dirty(page);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 		cond_resched();
 	}
 
@@ -2280,7 +2280,7 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
 
 	buf->f_type = TMPFS_MAGIC;
-	buf->f_bsize = PAGE_CACHE_SIZE;
+	buf->f_bsize = PAGE_SIZE;
 	buf->f_namelen = NAME_MAX;
 	if (sbinfo->max_blocks) {
 		buf->f_blocks = sbinfo->max_blocks;
@@ -2523,7 +2523,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 	struct shmem_inode_info *info;
 
 	len = strlen(symname) + 1;
-	if (len > PAGE_CACHE_SIZE)
+	if (len > PAGE_SIZE)
 		return -ENAMETOOLONG;
 
 	inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
@@ -2562,7 +2562,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 		SetPageUptodate(page);
 		set_page_dirty(page);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	dir->i_size += BOGO_DIRENT_SIZE;
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -2835,7 +2835,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 			if (*rest)
 				goto bad_val;
 			sbinfo->max_blocks =
-				DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
+				DIV_ROUND_UP(size, PAGE_SIZE);
 		} else if (!strcmp(this_char,"nr_blocks")) {
 			sbinfo->max_blocks = memparse(value, &rest);
 			if (*rest)
@@ -2940,7 +2940,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
 
 	if (sbinfo->max_blocks != shmem_default_max_blocks())
 		seq_printf(seq, ",size=%luk",
-			sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
+			sbinfo->max_blocks << (PAGE_SHIFT - 10));
 	if (sbinfo->max_inodes != shmem_default_max_inodes())
 		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
 	if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
@@ -3082,8 +3082,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 	sbinfo->free_inodes = sbinfo->max_inodes;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = TMPFS_MAGIC;
 	sb->s_op = &shmem_ops;
 	sb->s_time_gran = 1;
diff --git a/mm/swap.c b/mm/swap.c
index 09fe5e97714a..ea641e247033 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -114,7 +114,7 @@ void put_pages_list(struct list_head *pages)
 
 		victim = list_entry(pages->prev, struct page, lru);
 		list_del(&victim->lru);
-		page_cache_release(victim);
+		put_page(victim);
 	}
 }
 EXPORT_SYMBOL(put_pages_list);
@@ -142,7 +142,7 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
 			return seg;
 
 		pages[seg] = kmap_to_page(kiov[seg].iov_base);
-		page_cache_get(pages[seg]);
+		get_page(pages[seg]);
 	}
 
 	return seg;
@@ -236,7 +236,7 @@ void rotate_reclaimable_page(struct page *page)
 		struct pagevec *pvec;
 		unsigned long flags;
 
-		page_cache_get(page);
+		get_page(page);
 		local_irq_save(flags);
 		pvec = this_cpu_ptr(&lru_rotate_pvecs);
 		if (!pagevec_add(pvec, page))
@@ -294,7 +294,7 @@ void activate_page(struct page *page)
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 		struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
 
-		page_cache_get(page);
+		get_page(page);
 		if (!pagevec_add(pvec, page))
 			pagevec_lru_move_fn(pvec, __activate_page, NULL);
 		put_cpu_var(activate_page_pvecs);
@@ -389,7 +389,7 @@ static void __lru_cache_add(struct page *page)
 {
 	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
 
-	page_cache_get(page);
+	get_page(page);
 	if (!pagevec_space(pvec))
 		__pagevec_lru_add(pvec);
 	pagevec_add(pvec, page);
@@ -646,7 +646,7 @@ void deactivate_page(struct page *page)
 	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
 		struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
 
-		page_cache_get(page);
+		get_page(page);
 		if (!pagevec_add(pvec, page))
 			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
 		put_cpu_var(lru_deactivate_pvecs);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 69cb2464e7dc..366ce3518703 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -85,7 +85,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
 	VM_BUG_ON_PAGE(PageSwapCache(page), page);
 	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 
-	page_cache_get(page);
+	get_page(page);
 	SetPageSwapCache(page);
 	set_page_private(page, entry.val);
 
@@ -109,7 +109,7 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
 		VM_BUG_ON(error == -EEXIST);
 		set_page_private(page, 0UL);
 		ClearPageSwapCache(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 
 	return error;
@@ -226,7 +226,7 @@ void delete_from_swap_cache(struct page *page)
 	spin_unlock_irq(&address_space->tree_lock);
 
 	swapcache_free(entry);
-	page_cache_release(page);
+	put_page(page);
 }
 
 /* 
@@ -252,7 +252,7 @@ static inline void free_swap_cache(struct page *page)
 void free_page_and_swap_cache(struct page *page)
 {
 	free_swap_cache(page);
-	page_cache_release(page);
+	put_page(page);
 }
 
 /*
@@ -380,7 +380,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	} while (err != -ENOMEM);
 
 	if (new_page)
-		page_cache_release(new_page);
+		put_page(new_page);
 	return found_page;
 }
 
@@ -495,7 +495,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 			continue;
 		if (offset != entry_offset)
 			SetPageReadahead(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	blk_finish_plug(&plug);
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 560ad380634c..83874eced5bf 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -119,7 +119,7 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
 		ret = try_to_free_swap(page);
 		unlock_page(page);
 	}
-	page_cache_release(page);
+	put_page(page);
 	return ret;
 }
 
@@ -1000,7 +1000,7 @@ int free_swap_and_cache(swp_entry_t entry)
 			page = find_get_page(swap_address_space(entry),
 						entry.val);
 			if (page && !trylock_page(page)) {
-				page_cache_release(page);
+				put_page(page);
 				page = NULL;
 			}
 		}
@@ -1017,7 +1017,7 @@ int free_swap_and_cache(swp_entry_t entry)
 			SetPageDirty(page);
 		}
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	return p != NULL;
 }
@@ -1518,7 +1518,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
 		}
 		if (retval) {
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 			break;
 		}
 
@@ -1570,7 +1570,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
 		 */
 		SetPageDirty(page);
 		unlock_page(page);
-		page_cache_release(page);
+		put_page(page);
 
 		/*
 		 * Make sure that we aren't completely killing
@@ -2574,7 +2574,7 @@ bad_swap:
 out:
 	if (page && !IS_ERR(page)) {
 		kunmap(page);
-		page_cache_release(page);
+		put_page(page);
 	}
 	if (name)
 		putname(name);
diff --git a/mm/truncate.c b/mm/truncate.c
index 7598b552ae03..b00272810871 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -118,7 +118,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 		return -EIO;
 
 	if (page_has_private(page))
-		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
+		do_invalidatepage(page, 0, PAGE_SIZE);
 
 	/*
 	 * Some filesystems seem to re-dirty the page even after
@@ -159,8 +159,8 @@ int truncate_inode_page(struct address_space *mapping, struct page *page)
 {
 	if (page_mapped(page)) {
 		unmap_mapping_range(mapping,
-				   (loff_t)page->index << PAGE_CACHE_SHIFT,
-				   PAGE_CACHE_SIZE, 0);
+				   (loff_t)page->index << PAGE_SHIFT,
+				   PAGE_SIZE, 0);
 	}
 	return truncate_complete_page(mapping, page);
 }
@@ -241,8 +241,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
 		return;
 
 	/* Offsets within partial pages */
-	partial_start = lstart & (PAGE_CACHE_SIZE - 1);
-	partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
+	partial_start = lstart & (PAGE_SIZE - 1);
+	partial_end = (lend + 1) & (PAGE_SIZE - 1);
 
 	/*
 	 * 'start' and 'end' always covers the range of pages to be fully
@@ -250,7 +250,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	 * start of the range and 'partial_end' at the end of the range.
 	 * Note that 'end' is exclusive while 'lend' is inclusive.
 	 */
-	start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (lend == -1)
 		/*
 		 * lend == -1 indicates end-of-file so we have to set 'end'
@@ -259,7 +259,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 		 */
 		end = -1;
 	else
-		end = (lend + 1) >> PAGE_CACHE_SHIFT;
+		end = (lend + 1) >> PAGE_SHIFT;
 
 	pagevec_init(&pvec, 0);
 	index = start;
@@ -298,7 +298,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	if (partial_start) {
 		struct page *page = find_lock_page(mapping, start - 1);
 		if (page) {
-			unsigned int top = PAGE_CACHE_SIZE;
+			unsigned int top = PAGE_SIZE;
 			if (start > end) {
 				/* Truncation within a single page */
 				top = partial_end;
@@ -311,7 +311,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 				do_invalidatepage(page, partial_start,
 						  top - partial_start);
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 		}
 	}
 	if (partial_end) {
@@ -324,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 				do_invalidatepage(page, 0,
 						  partial_end);
 			unlock_page(page);
-			page_cache_release(page);
+			put_page(page);
 		}
 	}
 	/*
@@ -538,7 +538,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	if (mapping->a_ops->freepage)
 		mapping->a_ops->freepage(page);
 
-	page_cache_release(page);	/* pagecache ref */
+	put_page(page);	/* pagecache ref */
 	return 1;
 failed:
 	spin_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -608,18 +608,18 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 					 * Zap the rest of the file in one hit.
 					 */
 					unmap_mapping_range(mapping,
-					   (loff_t)index << PAGE_CACHE_SHIFT,
+					   (loff_t)index << PAGE_SHIFT,
 					   (loff_t)(1 + end - index)
-							 << PAGE_CACHE_SHIFT,
-					    0);
+							 << PAGE_SHIFT,
+							 0);
 					did_range_unmap = 1;
 				} else {
 					/*
 					 * Just zap this page
 					 */
 					unmap_mapping_range(mapping,
-					   (loff_t)index << PAGE_CACHE_SHIFT,
-					   PAGE_CACHE_SIZE, 0);
+					   (loff_t)index << PAGE_SHIFT,
+					   PAGE_SIZE, 0);
 				}
 			}
 			BUG_ON(page_mapped(page));
@@ -744,14 +744,14 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
 
 	WARN_ON(to > inode->i_size);
 
-	if (from >= to || bsize == PAGE_CACHE_SIZE)
+	if (from >= to || bsize == PAGE_SIZE)
 		return;
 	/* Page straddling @from will not have any hole block created? */
 	rounded_from = round_up(from, bsize);
-	if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1)))
+	if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1)))
 		return;
 
-	index = from >> PAGE_CACHE_SHIFT;
+	index = from >> PAGE_SHIFT;
 	page = find_lock_page(inode->i_mapping, index);
 	/* Page not cached? Nothing to do */
 	if (!page)
@@ -763,7 +763,7 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
 	if (page_mkclean(page))
 		set_page_dirty(page);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 }
 EXPORT_SYMBOL(pagecache_isize_extended);
 
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 9f3a0290b273..af817e5060fb 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -93,7 +93,7 @@ out_release_uncharge_unlock:
 	pte_unmap_unlock(dst_pte, ptl);
 	mem_cgroup_cancel_charge(page, memcg, false);
 out_release:
-	page_cache_release(page);
+	put_page(page);
 	goto out;
 }
 
@@ -287,7 +287,7 @@ out_unlock:
 	up_read(&dst_mm->mmap_sem);
 out:
 	if (page)
-		page_cache_release(page);
+		put_page(page);
 	BUG_ON(copied < 0);
 	BUG_ON(err > 0);
 	BUG_ON(!copied && !err);
diff --git a/mm/zswap.c b/mm/zswap.c
index bf14508afd64..91dad80d068b 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -869,7 +869,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 
 	case ZSWAP_SWAPCACHE_EXIST:
 		/* page is already in the swap cache, ignore for now */
-		page_cache_release(page);
+		put_page(page);
 		ret = -EEXIST;
 		goto fail;
 
@@ -897,7 +897,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 
 	/* start writeback */
 	__swap_writepage(page, &wbc, end_swap_bio_write);
-	page_cache_release(page);
+	put_page(page);
 	zswap_written_back_pages++;
 
 	spin_lock(&tree->lock);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 1831f6353622..a5502898ea33 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -269,7 +269,7 @@ static void _ceph_msgr_exit(void)
 	}
 
 	BUG_ON(zero_page == NULL);
-	page_cache_release(zero_page);
+	put_page(zero_page);
 	zero_page = NULL;
 
 	ceph_msgr_slab_exit();
@@ -282,7 +282,7 @@ int ceph_msgr_init(void)
 
 	BUG_ON(zero_page != NULL);
 	zero_page = ZERO_PAGE(0);
-	page_cache_get(zero_page);
+	get_page(zero_page);
 
 	/*
 	 * The number of active work items is limited by the number of
@@ -1602,7 +1602,7 @@ static int write_partial_skip(struct ceph_connection *con)
 
 	dout("%s %p %d left\n", __func__, con, con->out_skip);
 	while (con->out_skip > 0) {
-		size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
+		size_t size = min(con->out_skip, (int) PAGE_SIZE);
 
 		ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true);
 		if (ret <= 0)
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
index c7c220a736e5..6864007e64fc 100644
--- a/net/ceph/pagelist.c
+++ b/net/ceph/pagelist.c
@@ -56,7 +56,7 @@ int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
 		size_t bit = pl->room;
 		int ret;
 
-		memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
+		memcpy(pl->mapped_tail + (pl->length & ~PAGE_MASK),
 		       buf, bit);
 		pl->length += bit;
 		pl->room -= bit;
@@ -67,7 +67,7 @@ int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
 			return ret;
 	}
 
-	memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
+	memcpy(pl->mapped_tail + (pl->length & ~PAGE_MASK), buf, len);
 	pl->length += len;
 	pl->room -= len;
 	return 0;
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index 10297f7a89ba..00d2601407c5 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -95,19 +95,19 @@ int ceph_copy_user_to_page_vector(struct page **pages,
 					 loff_t off, size_t len)
 {
 	int i = 0;
-	int po = off & ~PAGE_CACHE_MASK;
+	int po = off & ~PAGE_MASK;
 	int left = len;
 	int l, bad;
 
 	while (left > 0) {
-		l = min_t(int, PAGE_CACHE_SIZE-po, left);
+		l = min_t(int, PAGE_SIZE-po, left);
 		bad = copy_from_user(page_address(pages[i]) + po, data, l);
 		if (bad == l)
 			return -EFAULT;
 		data += l - bad;
 		left -= l - bad;
 		po += l - bad;
-		if (po == PAGE_CACHE_SIZE) {
+		if (po == PAGE_SIZE) {
 			po = 0;
 			i++;
 		}
@@ -121,17 +121,17 @@ void ceph_copy_to_page_vector(struct page **pages,
 				    loff_t off, size_t len)
 {
 	int i = 0;
-	size_t po = off & ~PAGE_CACHE_MASK;
+	size_t po = off & ~PAGE_MASK;
 	size_t left = len;
 
 	while (left > 0) {
-		size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+		size_t l = min_t(size_t, PAGE_SIZE-po, left);
 
 		memcpy(page_address(pages[i]) + po, data, l);
 		data += l;
 		left -= l;
 		po += l;
-		if (po == PAGE_CACHE_SIZE) {
+		if (po == PAGE_SIZE) {
 			po = 0;
 			i++;
 		}
@@ -144,17 +144,17 @@ void ceph_copy_from_page_vector(struct page **pages,
 				    loff_t off, size_t len)
 {
 	int i = 0;
-	size_t po = off & ~PAGE_CACHE_MASK;
+	size_t po = off & ~PAGE_MASK;
 	size_t left = len;
 
 	while (left > 0) {
-		size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+		size_t l = min_t(size_t, PAGE_SIZE-po, left);
 
 		memcpy(data, page_address(pages[i]) + po, l);
 		data += l;
 		left -= l;
 		po += l;
-		if (po == PAGE_CACHE_SIZE) {
+		if (po == PAGE_SIZE) {
 			po = 0;
 			i++;
 		}
@@ -168,25 +168,25 @@ EXPORT_SYMBOL(ceph_copy_from_page_vector);
  */
 void ceph_zero_page_vector_range(int off, int len, struct page **pages)
 {
-	int i = off >> PAGE_CACHE_SHIFT;
+	int i = off >> PAGE_SHIFT;
 
-	off &= ~PAGE_CACHE_MASK;
+	off &= ~PAGE_MASK;
 
 	dout("zero_page_vector_page %u~%u\n", off, len);
 
 	/* leading partial page? */
 	if (off) {
-		int end = min((int)PAGE_CACHE_SIZE, off + len);
+		int end = min((int)PAGE_SIZE, off + len);
 		dout("zeroing %d %p head from %d\n", i, pages[i],
 		     (int)off);
 		zero_user_segment(pages[i], off, end);
 		len -= (end - off);
 		i++;
 	}
-	while (len >= PAGE_CACHE_SIZE) {
+	while (len >= PAGE_SIZE) {
 		dout("zeroing %d %p len=%d\n", i, pages[i], len);
-		zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
-		len -= PAGE_CACHE_SIZE;
+		zero_user_segment(pages[i], 0, PAGE_SIZE);
+		len -= PAGE_SIZE;
 		i++;
 	}
 	/* trailing partial page? */
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 8c6bc795f060..15612ffa8d57 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1728,8 +1728,8 @@ alloc_enc_pages(struct rpc_rqst *rqstp)
 		return 0;
 	}
 
-	first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
-	last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT;
+	first = snd_buf->page_base >> PAGE_SHIFT;
+	last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_SHIFT;
 	rqstp->rq_enc_pages_num = last - first + 1 + 1;
 	rqstp->rq_enc_pages
 		= kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *),
@@ -1775,10 +1775,10 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
 	status = alloc_enc_pages(rqstp);
 	if (status)
 		return status;
-	first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
+	first = snd_buf->page_base >> PAGE_SHIFT;
 	inpages = snd_buf->pages + first;
 	snd_buf->pages = rqstp->rq_enc_pages;
-	snd_buf->page_base -= first << PAGE_CACHE_SHIFT;
+	snd_buf->page_base -= first << PAGE_SHIFT;
 	/*
 	 * Give the tail its own page, in case we need extra space in the
 	 * head when wrapping:
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index d94a8e1e9f05..045e11ecd332 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -465,7 +465,7 @@ encryptor(struct scatterlist *sg, void *data)
 	page_pos = desc->pos - outbuf->head[0].iov_len;
 	if (page_pos >= 0 && page_pos < outbuf->page_len) {
 		/* pages are not in place: */
-		int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT;
+		int i = (page_pos + outbuf->page_base) >> PAGE_SHIFT;
 		in_page = desc->pages[i];
 	} else {
 		in_page = sg_page(sg);
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index 765088e4ad84..a737c2da0837 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -79,9 +79,9 @@ gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize)
 		len -= buf->head[0].iov_len;
 	if (len <= buf->page_len) {
 		unsigned int last = (buf->page_base + len - 1)
-					>>PAGE_CACHE_SHIFT;
+					>>PAGE_SHIFT;
 		unsigned int offset = (buf->page_base + len - 1)
-					& (PAGE_CACHE_SIZE - 1);
+					& (PAGE_SIZE - 1);
 		ptr = kmap_atomic(buf->pages[last]);
 		pad = *(ptr + offset);
 		kunmap_atomic(ptr);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 008c25d1b9f9..553bf95f7003 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -881,7 +881,7 @@ static ssize_t cache_downcall(struct address_space *mapping,
 	char *kaddr;
 	ssize_t ret = -ENOMEM;
 
-	if (count >= PAGE_CACHE_SIZE)
+	if (count >= PAGE_SIZE)
 		goto out_slow;
 
 	page = find_or_create_page(mapping, 0, GFP_KERNEL);
@@ -892,7 +892,7 @@ static ssize_t cache_downcall(struct address_space *mapping,
 	ret = cache_do_downcall(kaddr, buf, count, cd);
 	kunmap(page);
 	unlock_page(page);
-	page_cache_release(page);
+	put_page(page);
 	return ret;
 out_slow:
 	return cache_slow_downcall(buf, count, cd);
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 31789ef3e614..fc48eca21fd2 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1390,8 +1390,8 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
 	struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
 	int err;
 
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_blocksize = PAGE_SIZE;
+	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = RPCAUTH_GSSMAGIC;
 	sb->s_op = &s_ops;
 	sb->s_d_op = &simple_dentry_operations;
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 2df87f78e518..de70c78025d7 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -96,8 +96,8 @@ ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct
 	if (base || xdr->page_base) {
 		pglen -= base;
 		base += xdr->page_base;
-		ppage += base >> PAGE_CACHE_SHIFT;
-		base &= ~PAGE_CACHE_MASK;
+		ppage += base >> PAGE_SHIFT;
+		base &= ~PAGE_MASK;
 	}
 	do {
 		char *kaddr;
@@ -113,7 +113,7 @@ ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct
 			}
 		}
 
-		len = PAGE_CACHE_SIZE;
+		len = PAGE_SIZE;
 		kaddr = kmap_atomic(*ppage);
 		if (base) {
 			len -= base;
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 4439ac4c1b53..1e6aba0501a1 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -181,20 +181,20 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base,
 	pgto_base += len;
 	pgfrom_base += len;
 
-	pgto = pages + (pgto_base >> PAGE_CACHE_SHIFT);
-	pgfrom = pages + (pgfrom_base >> PAGE_CACHE_SHIFT);
+	pgto = pages + (pgto_base >> PAGE_SHIFT);
+	pgfrom = pages + (pgfrom_base >> PAGE_SHIFT);
 
-	pgto_base &= ~PAGE_CACHE_MASK;
-	pgfrom_base &= ~PAGE_CACHE_MASK;
+	pgto_base &= ~PAGE_MASK;
+	pgfrom_base &= ~PAGE_MASK;
 
 	do {
 		/* Are any pointers crossing a page boundary? */
 		if (pgto_base == 0) {
-			pgto_base = PAGE_CACHE_SIZE;
+			pgto_base = PAGE_SIZE;
 			pgto--;
 		}
 		if (pgfrom_base == 0) {
-			pgfrom_base = PAGE_CACHE_SIZE;
+			pgfrom_base = PAGE_SIZE;
 			pgfrom--;
 		}
 
@@ -236,11 +236,11 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len)
 	char *vto;
 	size_t copy;
 
-	pgto = pages + (pgbase >> PAGE_CACHE_SHIFT);
-	pgbase &= ~PAGE_CACHE_MASK;
+	pgto = pages + (pgbase >> PAGE_SHIFT);
+	pgbase &= ~PAGE_MASK;
 
 	for (;;) {
-		copy = PAGE_CACHE_SIZE - pgbase;
+		copy = PAGE_SIZE - pgbase;
 		if (copy > len)
 			copy = len;
 
@@ -253,7 +253,7 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len)
 			break;
 
 		pgbase += copy;
-		if (pgbase == PAGE_CACHE_SIZE) {
+		if (pgbase == PAGE_SIZE) {
 			flush_dcache_page(*pgto);
 			pgbase = 0;
 			pgto++;
@@ -280,11 +280,11 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
 	char *vfrom;
 	size_t copy;
 
-	pgfrom = pages + (pgbase >> PAGE_CACHE_SHIFT);
-	pgbase &= ~PAGE_CACHE_MASK;
+	pgfrom = pages + (pgbase >> PAGE_SHIFT);
+	pgbase &= ~PAGE_MASK;
 
 	do {
-		copy = PAGE_CACHE_SIZE - pgbase;
+		copy = PAGE_SIZE - pgbase;
 		if (copy > len)
 			copy = len;
 
@@ -293,7 +293,7 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
 		kunmap_atomic(vfrom);
 
 		pgbase += copy;
-		if (pgbase == PAGE_CACHE_SIZE) {
+		if (pgbase == PAGE_SIZE) {
 			pgbase = 0;
 			pgfrom++;
 		}
@@ -1038,8 +1038,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
 	if (base < buf->page_len) {
 		subbuf->page_len = min(buf->page_len - base, len);
 		base += buf->page_base;
-		subbuf->page_base = base & ~PAGE_CACHE_MASK;
-		subbuf->pages = &buf->pages[base >> PAGE_CACHE_SHIFT];
+		subbuf->page_base = base & ~PAGE_MASK;
+		subbuf->pages = &buf->pages[base >> PAGE_SHIFT];
 		len -= subbuf->page_len;
 		base = 0;
 	} else {
@@ -1297,9 +1297,9 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
 		todo -= avail_here;
 
 		base += buf->page_base;
-		ppages = buf->pages + (base >> PAGE_CACHE_SHIFT);
-		base &= ~PAGE_CACHE_MASK;
-		avail_page = min_t(unsigned int, PAGE_CACHE_SIZE - base,
+		ppages = buf->pages + (base >> PAGE_SHIFT);
+		base &= ~PAGE_MASK;
+		avail_page = min_t(unsigned int, PAGE_SIZE - base,
 					avail_here);
 		c = kmap(*ppages) + base;
 
@@ -1383,7 +1383,7 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
 			}
 
 			avail_page = min(avail_here,
-				 (unsigned int) PAGE_CACHE_SIZE);
+				 (unsigned int) PAGE_SIZE);
 		}
 		base = buf->page_len;  /* align to start of tail */
 	}
@@ -1479,9 +1479,9 @@ xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len,
 		if (page_len > len)
 			page_len = len;
 		len -= page_len;
-		page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1);
-		i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT;
-		thislen = PAGE_CACHE_SIZE - page_offset;
+		page_offset = (offset + buf->page_base) & (PAGE_SIZE - 1);
+		i = (offset + buf->page_base) >> PAGE_SHIFT;
+		thislen = PAGE_SIZE - page_offset;
 		do {
 			if (thislen > page_len)
 				thislen = page_len;
@@ -1492,7 +1492,7 @@ xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len,
 			page_len -= thislen;
 			i++;
 			page_offset = 0;
-			thislen = PAGE_CACHE_SIZE;
+			thislen = PAGE_SIZE;
 		} while (page_len != 0);
 		offset = 0;
 	}
-- 
cgit v1.2.3


From 7ffb8e317bae03b8ee5bdcec93dc3723be945e9b Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Mon, 4 Apr 2016 16:44:02 -0400
Subject: audit: we don't need to __set_current_state(TASK_RUNNING)

Remove the calls to __set_current_state() to mark the task as running
and do some related cleanup in wait_for_auditd() to limit the amount
of work we do when we aren't going to reschedule the current task.

Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 3a3e5deeda8d..f52fbefede09 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -430,7 +430,6 @@ restart:
 					attempts, audit_pid);
 				set_current_state(TASK_INTERRUPTIBLE);
 				schedule();
-				__set_current_state(TASK_RUNNING);
 				goto restart;
 			}
 		}
@@ -1324,15 +1323,14 @@ static inline void audit_get_stamp(struct audit_context *ctx,
 static long wait_for_auditd(long sleep_time)
 {
 	DECLARE_WAITQUEUE(wait, current);
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	add_wait_queue_exclusive(&audit_backlog_wait, &wait);
 
 	if (audit_backlog_limit &&
-	    skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+	    skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
+		add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+		set_current_state(TASK_UNINTERRUPTIBLE);
 		sleep_time = schedule_timeout(sleep_time);
-
-	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&audit_backlog_wait, &wait);
+		remove_wait_queue(&audit_backlog_wait, &wait);
+	}
 
 	return sleep_time;
 }
-- 
cgit v1.2.3


From e68503bd6836ba765dc8e0ee77ea675fedc07e41 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Apr 2016 16:14:24 +0100
Subject: KEYS: Generalise system_verify_data() to provide access to internal
 content

Generalise system_verify_data() to provide access to internal content
through a callback.  This allows all the PKCS#7 stuff to be hidden inside
this function and removed from the PE file parser and the PKCS#7 test key.

If external content is not required, NULL should be passed as data to the
function.  If the callback is not required, that can be set to NULL.

The function is now called verify_pkcs7_signature() to contrast with
verify_pefile_signature() and the definitions of both have been moved into
linux/verification.h along with the key_being_used_for enum.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 arch/x86/kernel/kexec-bzimage64.c       | 18 ++-------
 certs/system_keyring.c                  | 45 ++++++++++++++++-----
 crypto/asymmetric_keys/Kconfig          |  4 +-
 crypto/asymmetric_keys/mscode_parser.c  | 21 ++++------
 crypto/asymmetric_keys/pkcs7_key_type.c | 72 +++++++++++++--------------------
 crypto/asymmetric_keys/pkcs7_parser.c   | 21 +++++-----
 crypto/asymmetric_keys/verify_pefile.c  | 40 +++++-------------
 crypto/asymmetric_keys/verify_pefile.h  |  5 +--
 include/crypto/pkcs7.h                  |  3 +-
 include/crypto/public_key.h             | 14 -------
 include/keys/asymmetric-type.h          |  1 +
 include/keys/system_keyring.h           |  7 +---
 include/linux/verification.h            | 50 +++++++++++++++++++++++
 include/linux/verify_pefile.h           | 22 ----------
 kernel/module_signing.c                 |  5 ++-
 15 files changed, 155 insertions(+), 173 deletions(-)
 create mode 100644 include/linux/verification.h
 delete mode 100644 include/linux/verify_pefile.h

(limited to 'kernel')

diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 2af478e3fd4e..f2356bda2b05 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -19,8 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/efi.h>
-#include <linux/verify_pefile.h>
-#include <keys/system_keyring.h>
+#include <linux/verification.h>
 
 #include <asm/bootparam.h>
 #include <asm/setup.h>
@@ -529,18 +528,9 @@ static int bzImage64_cleanup(void *loader_data)
 #ifdef CONFIG_KEXEC_BZIMAGE_VERIFY_SIG
 static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len)
 {
-	bool trusted;
-	int ret;
-
-	ret = verify_pefile_signature(kernel, kernel_len,
-				      system_trusted_keyring,
-				      VERIFYING_KEXEC_PE_SIGNATURE,
-				      &trusted);
-	if (ret < 0)
-		return ret;
-	if (!trusted)
-		return -EKEYREJECTED;
-	return 0;
+	return verify_pefile_signature(kernel, kernel_len,
+				       NULL,
+				       VERIFYING_KEXEC_PE_SIGNATURE);
 }
 #endif
 
diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index f4180326c2e1..a83bffedc0aa 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -108,16 +108,25 @@ late_initcall(load_system_certificate_list);
 #ifdef CONFIG_SYSTEM_DATA_VERIFICATION
 
 /**
- * Verify a PKCS#7-based signature on system data.
- * @data: The data to be verified.
+ * verify_pkcs7_signature - Verify a PKCS#7-based signature on system data.
+ * @data: The data to be verified (NULL if expecting internal data).
  * @len: Size of @data.
  * @raw_pkcs7: The PKCS#7 message that is the signature.
  * @pkcs7_len: The size of @raw_pkcs7.
+ * @trusted_keys: Trusted keys to use (NULL for system_trusted_keyring).
  * @usage: The use to which the key is being put.
+ * @view_content: Callback to gain access to content.
+ * @ctx: Context for callback.
  */
-int system_verify_data(const void *data, unsigned long len,
-		       const void *raw_pkcs7, size_t pkcs7_len,
-		       enum key_being_used_for usage)
+int verify_pkcs7_signature(const void *data, size_t len,
+			   const void *raw_pkcs7, size_t pkcs7_len,
+			   struct key *trusted_keys,
+			   int untrusted_error,
+			   enum key_being_used_for usage,
+			   int (*view_content)(void *ctx,
+					       const void *data, size_t len,
+					       size_t asn1hdrlen),
+			   void *ctx)
 {
 	struct pkcs7_message *pkcs7;
 	bool trusted;
@@ -128,7 +137,7 @@ int system_verify_data(const void *data, unsigned long len,
 		return PTR_ERR(pkcs7);
 
 	/* The data should be detached - so we need to supply it. */
-	if (pkcs7_supply_detached_data(pkcs7, data, len) < 0) {
+	if (data && pkcs7_supply_detached_data(pkcs7, data, len) < 0) {
 		pr_err("PKCS#7 signature with non-detached data\n");
 		ret = -EBADMSG;
 		goto error;
@@ -138,13 +147,29 @@ int system_verify_data(const void *data, unsigned long len,
 	if (ret < 0)
 		goto error;
 
-	ret = pkcs7_validate_trust(pkcs7, system_trusted_keyring, &trusted);
+	if (!trusted_keys)
+		trusted_keys = system_trusted_keyring;
+	ret = pkcs7_validate_trust(pkcs7, trusted_keys, &trusted);
 	if (ret < 0)
 		goto error;
 
-	if (!trusted) {
+	if (!trusted && untrusted_error) {
 		pr_err("PKCS#7 signature not signed with a trusted key\n");
-		ret = -ENOKEY;
+		ret = untrusted_error;
+		goto error;
+	}
+
+	if (view_content) {
+		size_t asn1hdrlen;
+
+		ret = pkcs7_get_content_data(pkcs7, &data, &len, &asn1hdrlen);
+		if (ret < 0) {
+			if (ret == -ENODATA)
+				pr_devel("PKCS#7 message does not contain data\n");
+			goto error;
+		}
+
+		ret = view_content(ctx, data, len, asn1hdrlen);
 	}
 
 error:
@@ -152,6 +177,6 @@ error:
 	pr_devel("<==%s() = %d\n", __func__, ret);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(system_verify_data);
+EXPORT_SYMBOL_GPL(verify_pkcs7_signature);
 
 #endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
diff --git a/crypto/asymmetric_keys/Kconfig b/crypto/asymmetric_keys/Kconfig
index 91a7e047a765..f7d2ef9789d8 100644
--- a/crypto/asymmetric_keys/Kconfig
+++ b/crypto/asymmetric_keys/Kconfig
@@ -40,8 +40,7 @@ config PKCS7_MESSAGE_PARSER
 
 config PKCS7_TEST_KEY
 	tristate "PKCS#7 testing key type"
-	depends on PKCS7_MESSAGE_PARSER
-	select SYSTEM_TRUSTED_KEYRING
+	depends on SYSTEM_DATA_VERIFICATION
 	help
 	  This option provides a type of key that can be loaded up from a
 	  PKCS#7 message - provided the message is signed by a trusted key.  If
@@ -54,6 +53,7 @@ config PKCS7_TEST_KEY
 config SIGNED_PE_FILE_VERIFICATION
 	bool "Support for PE file signature verification"
 	depends on PKCS7_MESSAGE_PARSER=y
+	depends on SYSTEM_DATA_VERIFICATION
 	select ASN1
 	select OID_REGISTRY
 	help
diff --git a/crypto/asymmetric_keys/mscode_parser.c b/crypto/asymmetric_keys/mscode_parser.c
index 3242cbfaeaa2..6a76d5c70ef6 100644
--- a/crypto/asymmetric_keys/mscode_parser.c
+++ b/crypto/asymmetric_keys/mscode_parser.c
@@ -21,19 +21,13 @@
 /*
  * Parse a Microsoft Individual Code Signing blob
  */
-int mscode_parse(struct pefile_context *ctx)
+int mscode_parse(void *_ctx, const void *content_data, size_t data_len,
+		 size_t asn1hdrlen)
 {
-	const void *content_data;
-	size_t data_len;
-	int ret;
-
-	ret = pkcs7_get_content_data(ctx->pkcs7, &content_data, &data_len, 1);
-
-	if (ret) {
-		pr_debug("PKCS#7 message does not contain data\n");
-		return ret;
-	}
+	struct pefile_context *ctx = _ctx;
 
+	content_data -= asn1hdrlen;
+	data_len += asn1hdrlen;
 	pr_devel("Data: %zu [%*ph]\n", data_len, (unsigned)(data_len),
 		 content_data);
 
@@ -129,7 +123,6 @@ int mscode_note_digest(void *context, size_t hdrlen,
 {
 	struct pefile_context *ctx = context;
 
-	ctx->digest = value;
-	ctx->digest_len = vlen;
-	return 0;
+	ctx->digest = kmemdup(value, vlen, GFP_KERNEL);
+	return ctx->digest ? 0 : -ENOMEM;
 }
diff --git a/crypto/asymmetric_keys/pkcs7_key_type.c b/crypto/asymmetric_keys/pkcs7_key_type.c
index e2d0edbbc71a..ab9bf5363ecd 100644
--- a/crypto/asymmetric_keys/pkcs7_key_type.c
+++ b/crypto/asymmetric_keys/pkcs7_key_type.c
@@ -13,12 +13,9 @@
 #include <linux/key.h>
 #include <linux/err.h>
 #include <linux/module.h>
+#include <linux/verification.h>
 #include <linux/key-type.h>
-#include <keys/asymmetric-type.h>
-#include <crypto/pkcs7.h>
 #include <keys/user-type.h>
-#include <keys/system_keyring.h>
-#include "pkcs7_parser.h"
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("PKCS#7 testing key type");
@@ -29,59 +26,46 @@ MODULE_PARM_DESC(pkcs7_usage,
 		 "Usage to specify when verifying the PKCS#7 message");
 
 /*
- * Preparse a PKCS#7 wrapped and validated data blob.
+ * Retrieve the PKCS#7 message content.
  */
-static int pkcs7_preparse(struct key_preparsed_payload *prep)
+static int pkcs7_view_content(void *ctx, const void *data, size_t len,
+			      size_t asn1hdrlen)
 {
-	enum key_being_used_for usage = pkcs7_usage;
-	struct pkcs7_message *pkcs7;
-	const void *data, *saved_prep_data;
-	size_t datalen, saved_prep_datalen;
-	bool trusted;
+	struct key_preparsed_payload *prep = ctx;
+	const void *saved_prep_data;
+	size_t saved_prep_datalen;
 	int ret;
 
-	kenter("");
-
-	if (usage >= NR__KEY_BEING_USED_FOR) {
-		pr_err("Invalid usage type %d\n", usage);
-		return -EINVAL;
-	}
-
 	saved_prep_data = prep->data;
 	saved_prep_datalen = prep->datalen;
-	pkcs7 = pkcs7_parse_message(saved_prep_data, saved_prep_datalen);
-	if (IS_ERR(pkcs7)) {
-		ret = PTR_ERR(pkcs7);
-		goto error;
-	}
-
-	ret = pkcs7_verify(pkcs7, usage);
-	if (ret < 0)
-		goto error_free;
-
-	ret = pkcs7_validate_trust(pkcs7, system_trusted_keyring, &trusted);
-	if (ret < 0)
-		goto error_free;
-	if (!trusted)
-		pr_warn("PKCS#7 message doesn't chain back to a trusted key\n");
-
-	ret = pkcs7_get_content_data(pkcs7, &data, &datalen, false);
-	if (ret < 0)
-		goto error_free;
-
 	prep->data = data;
-	prep->datalen = datalen;
+	prep->datalen = len;
+
 	ret = user_preparse(prep);
+
 	prep->data = saved_prep_data;
 	prep->datalen = saved_prep_datalen;
-
-error_free:
-	pkcs7_free_message(pkcs7);
-error:
-	kleave(" = %d", ret);
 	return ret;
 }
 
+/*
+ * Preparse a PKCS#7 wrapped and validated data blob.
+ */
+static int pkcs7_preparse(struct key_preparsed_payload *prep)
+{
+	enum key_being_used_for usage = pkcs7_usage;
+
+	if (usage >= NR__KEY_BEING_USED_FOR) {
+		pr_err("Invalid usage type %d\n", usage);
+		return -EINVAL;
+	}
+
+	return verify_pkcs7_signature(NULL, 0,
+				      prep->data, prep->datalen,
+				      NULL, -ENOKEY, usage,
+				      pkcs7_view_content, prep);
+}
+
 /*
  * user defined keys take an arbitrary string as the description and an
  * arbitrary blob of data as the payload
diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c
index 835701613125..af4cd8649117 100644
--- a/crypto/asymmetric_keys/pkcs7_parser.c
+++ b/crypto/asymmetric_keys/pkcs7_parser.c
@@ -168,24 +168,25 @@ EXPORT_SYMBOL_GPL(pkcs7_parse_message);
  * @pkcs7: The preparsed PKCS#7 message to access
  * @_data: Place to return a pointer to the data
  * @_data_len: Place to return the data length
- * @want_wrapper: True if the ASN.1 object header should be included in the data
+ * @_headerlen: Size of ASN.1 header not included in _data
  *
- * Get access to the data content of the PKCS#7 message, including, optionally,
- * the header of the ASN.1 object that contains it.  Returns -ENODATA if the
- * data object was missing from the message.
+ * Get access to the data content of the PKCS#7 message.  The size of the
+ * header of the ASN.1 object that contains it is also provided and can be used
+ * to adjust *_data and *_data_len to get the entire object.
+ *
+ * Returns -ENODATA if the data object was missing from the message.
  */
 int pkcs7_get_content_data(const struct pkcs7_message *pkcs7,
 			   const void **_data, size_t *_data_len,
-			   bool want_wrapper)
+			   size_t *_headerlen)
 {
-	size_t wrapper;
-
 	if (!pkcs7->data)
 		return -ENODATA;
 
-	wrapper = want_wrapper ? pkcs7->data_hdrlen : 0;
-	*_data = pkcs7->data - wrapper;
-	*_data_len = pkcs7->data_len + wrapper;
+	*_data = pkcs7->data;
+	*_data_len = pkcs7->data_len;
+	if (_headerlen)
+		*_headerlen = pkcs7->data_hdrlen;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(pkcs7_get_content_data);
diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c
index 7e8c2338ae25..265351075b0e 100644
--- a/crypto/asymmetric_keys/verify_pefile.c
+++ b/crypto/asymmetric_keys/verify_pefile.c
@@ -16,7 +16,7 @@
 #include <linux/err.h>
 #include <linux/pe.h>
 #include <linux/asn1.h>
-#include <crypto/pkcs7.h>
+#include <linux/verification.h>
 #include <crypto/hash.h>
 #include "verify_pefile.h"
 
@@ -392,9 +392,8 @@ error_no_desc:
  * verify_pefile_signature - Verify the signature on a PE binary image
  * @pebuf: Buffer containing the PE binary image
  * @pelen: Length of the binary image
- * @trust_keyring: Signing certificates to use as starting points
+ * @trust_keys: Signing certificate(s) to use as starting points
  * @usage: The use to which the key is being put.
- * @_trusted: Set to true if trustworth, false otherwise
  *
  * Validate that the certificate chain inside the PKCS#7 message inside the PE
  * binary image intersects keys we already know and trust.
@@ -418,14 +417,10 @@ error_no_desc:
  * May also return -ENOMEM.
  */
 int verify_pefile_signature(const void *pebuf, unsigned pelen,
-			    struct key *trusted_keyring,
-			    enum key_being_used_for usage,
-			    bool *_trusted)
+			    struct key *trusted_keys,
+			    enum key_being_used_for usage)
 {
-	struct pkcs7_message *pkcs7;
 	struct pefile_context ctx;
-	const void *data;
-	size_t datalen;
 	int ret;
 
 	kenter("");
@@ -439,19 +434,10 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen,
 	if (ret < 0)
 		return ret;
 
-	pkcs7 = pkcs7_parse_message(pebuf + ctx.sig_offset, ctx.sig_len);
-	if (IS_ERR(pkcs7))
-		return PTR_ERR(pkcs7);
-	ctx.pkcs7 = pkcs7;
-
-	ret = pkcs7_get_content_data(ctx.pkcs7, &data, &datalen, false);
-	if (ret < 0 || datalen == 0) {
-		pr_devel("PKCS#7 message does not contain data\n");
-		ret = -EBADMSG;
-		goto error;
-	}
-
-	ret = mscode_parse(&ctx);
+	ret = verify_pkcs7_signature(NULL, 0,
+				     pebuf + ctx.sig_offset, ctx.sig_len,
+				     trusted_keys, -EKEYREJECTED, usage,
+				     mscode_parse, &ctx);
 	if (ret < 0)
 		goto error;
 
@@ -462,16 +448,8 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen,
 	 * contents.
 	 */
 	ret = pefile_digest_pe(pebuf, pelen, &ctx);
-	if (ret < 0)
-		goto error;
-
-	ret = pkcs7_verify(pkcs7, usage);
-	if (ret < 0)
-		goto error;
-
-	ret = pkcs7_validate_trust(pkcs7, trusted_keyring, _trusted);
 
 error:
-	pkcs7_free_message(ctx.pkcs7);
+	kfree(ctx.digest);
 	return ret;
 }
diff --git a/crypto/asymmetric_keys/verify_pefile.h b/crypto/asymmetric_keys/verify_pefile.h
index a133eb81a492..cd4d20930728 100644
--- a/crypto/asymmetric_keys/verify_pefile.h
+++ b/crypto/asymmetric_keys/verify_pefile.h
@@ -9,7 +9,6 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 
-#include <linux/verify_pefile.h>
 #include <crypto/pkcs7.h>
 #include <crypto/hash_info.h>
 
@@ -23,7 +22,6 @@ struct pefile_context {
 	unsigned	sig_offset;
 	unsigned	sig_len;
 	const struct section_header *secs;
-	struct pkcs7_message *pkcs7;
 
 	/* PKCS#7 MS Individual Code Signing content */
 	const void	*digest;		/* Digest */
@@ -39,4 +37,5 @@ struct pefile_context {
 /*
  * mscode_parser.c
  */
-extern int mscode_parse(struct pefile_context *ctx);
+extern int mscode_parse(void *_ctx, const void *content_data, size_t data_len,
+			size_t asn1hdrlen);
diff --git a/include/crypto/pkcs7.h b/include/crypto/pkcs7.h
index 441aff9b5aa7..8323e3e57131 100644
--- a/include/crypto/pkcs7.h
+++ b/include/crypto/pkcs7.h
@@ -12,6 +12,7 @@
 #ifndef _CRYPTO_PKCS7_H
 #define _CRYPTO_PKCS7_H
 
+#include <linux/verification.h>
 #include <crypto/public_key.h>
 
 struct key;
@@ -26,7 +27,7 @@ extern void pkcs7_free_message(struct pkcs7_message *pkcs7);
 
 extern int pkcs7_get_content_data(const struct pkcs7_message *pkcs7,
 				  const void **_data, size_t *_datalen,
-				  bool want_wrapper);
+				  size_t *_headerlen);
 
 /*
  * pkcs7_trust.c
diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h
index 2f5de5c1a3a0..b3928e801b8c 100644
--- a/include/crypto/public_key.h
+++ b/include/crypto/public_key.h
@@ -14,20 +14,6 @@
 #ifndef _LINUX_PUBLIC_KEY_H
 #define _LINUX_PUBLIC_KEY_H
 
-/*
- * The use to which an asymmetric key is being put.
- */
-enum key_being_used_for {
-	VERIFYING_MODULE_SIGNATURE,
-	VERIFYING_FIRMWARE_SIGNATURE,
-	VERIFYING_KEXEC_PE_SIGNATURE,
-	VERIFYING_KEY_SIGNATURE,
-	VERIFYING_KEY_SELF_SIGNATURE,
-	VERIFYING_UNSPECIFIED_SIGNATURE,
-	NR__KEY_BEING_USED_FOR
-};
-extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR];
-
 /*
  * Cryptographic data for the public-key subtype of the asymmetric key type.
  *
diff --git a/include/keys/asymmetric-type.h b/include/keys/asymmetric-type.h
index 70a8775bb444..d1e23dda4363 100644
--- a/include/keys/asymmetric-type.h
+++ b/include/keys/asymmetric-type.h
@@ -15,6 +15,7 @@
 #define _KEYS_ASYMMETRIC_TYPE_H
 
 #include <linux/key-type.h>
+#include <linux/verification.h>
 
 extern struct key_type key_type_asymmetric;
 
diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h
index 39fd38cfa8c9..b2d645ac35a0 100644
--- a/include/keys/system_keyring.h
+++ b/include/keys/system_keyring.h
@@ -15,6 +15,7 @@
 #ifdef CONFIG_SYSTEM_TRUSTED_KEYRING
 
 #include <linux/key.h>
+#include <linux/verification.h>
 #include <crypto/public_key.h>
 
 extern struct key *system_trusted_keyring;
@@ -29,12 +30,6 @@ static inline struct key *get_system_trusted_keyring(void)
 }
 #endif
 
-#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
-extern int system_verify_data(const void *data, unsigned long len,
-			      const void *raw_pkcs7, size_t pkcs7_len,
-			      enum key_being_used_for usage);
-#endif
-
 #ifdef CONFIG_IMA_MOK_KEYRING
 extern struct key *ima_mok_keyring;
 extern struct key *ima_blacklist_keyring;
diff --git a/include/linux/verification.h b/include/linux/verification.h
new file mode 100644
index 000000000000..bb0fcf941cb7
--- /dev/null
+++ b/include/linux/verification.h
@@ -0,0 +1,50 @@
+/* Signature verification
+ *
+ * Copyright (C) 2014 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _LINUX_VERIFICATION_H
+#define _LINUX_VERIFICATION_H
+
+/*
+ * The use to which an asymmetric key is being put.
+ */
+enum key_being_used_for {
+	VERIFYING_MODULE_SIGNATURE,
+	VERIFYING_FIRMWARE_SIGNATURE,
+	VERIFYING_KEXEC_PE_SIGNATURE,
+	VERIFYING_KEY_SIGNATURE,
+	VERIFYING_KEY_SELF_SIGNATURE,
+	VERIFYING_UNSPECIFIED_SIGNATURE,
+	NR__KEY_BEING_USED_FOR
+};
+extern const char *const key_being_used_for[NR__KEY_BEING_USED_FOR];
+
+#ifdef CONFIG_SYSTEM_DATA_VERIFICATION
+
+struct key;
+
+extern int verify_pkcs7_signature(const void *data, size_t len,
+				  const void *raw_pkcs7, size_t pkcs7_len,
+				  struct key *trusted_keys,
+				  int untrusted_error,
+				  enum key_being_used_for usage,
+				  int (*view_content)(void *ctx,
+						      const void *data, size_t len,
+						      size_t asn1hdrlen),
+				  void *ctx);
+
+#ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION
+extern int verify_pefile_signature(const void *pebuf, unsigned pelen,
+				   struct key *trusted_keys,
+				   enum key_being_used_for usage);
+#endif
+
+#endif /* CONFIG_SYSTEM_DATA_VERIFICATION */
+#endif /* _LINUX_VERIFY_PEFILE_H */
diff --git a/include/linux/verify_pefile.h b/include/linux/verify_pefile.h
deleted file mode 100644
index da2049b5161c..000000000000
--- a/include/linux/verify_pefile.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Signed PE file verification
- *
- * Copyright (C) 2014 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#ifndef _LINUX_VERIFY_PEFILE_H
-#define _LINUX_VERIFY_PEFILE_H
-
-#include <crypto/public_key.h>
-
-extern int verify_pefile_signature(const void *pebuf, unsigned pelen,
-				   struct key *trusted_keyring,
-				   enum key_being_used_for usage,
-				   bool *_trusted);
-
-#endif /* _LINUX_VERIFY_PEFILE_H */
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 64b9dead4a07..593aace88a02 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -80,6 +80,7 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
 		return -EBADMSG;
 	}
 
-	return system_verify_data(mod, modlen, mod + modlen, sig_len,
-				  VERIFYING_MODULE_SIGNATURE);
+	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
+				      NULL, -ENOKEY, VERIFYING_MODULE_SIGNATURE,
+				      NULL, NULL);
 }
-- 
cgit v1.2.3


From bda850cd214e90b1be0cc25bc48c4f6ac53eb543 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Apr 2016 16:14:24 +0100
Subject: PKCS#7: Make trust determination dependent on contents of trust
 keyring

Make the determination of the trustworthiness of a key dependent on whether
a key that can verify it is present in the supplied ring of trusted keys
rather than whether or not the verifying key has KEY_FLAG_TRUSTED set.

verify_pkcs7_signature() will return -ENOKEY if the PKCS#7 message trust
chain cannot be verified.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 certs/system_keyring.c                  | 13 ++++---------
 crypto/asymmetric_keys/pkcs7_key_type.c |  2 +-
 crypto/asymmetric_keys/pkcs7_parser.h   |  1 -
 crypto/asymmetric_keys/pkcs7_trust.c    | 18 +++---------------
 crypto/asymmetric_keys/verify_pefile.c  |  2 +-
 crypto/asymmetric_keys/x509_parser.h    |  1 -
 include/crypto/pkcs7.h                  |  3 +--
 include/linux/verification.h            |  1 -
 kernel/module_signing.c                 |  2 +-
 9 files changed, 11 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index a83bffedc0aa..dc18869ff680 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -121,7 +121,6 @@ late_initcall(load_system_certificate_list);
 int verify_pkcs7_signature(const void *data, size_t len,
 			   const void *raw_pkcs7, size_t pkcs7_len,
 			   struct key *trusted_keys,
-			   int untrusted_error,
 			   enum key_being_used_for usage,
 			   int (*view_content)(void *ctx,
 					       const void *data, size_t len,
@@ -129,7 +128,6 @@ int verify_pkcs7_signature(const void *data, size_t len,
 			   void *ctx)
 {
 	struct pkcs7_message *pkcs7;
-	bool trusted;
 	int ret;
 
 	pkcs7 = pkcs7_parse_message(raw_pkcs7, pkcs7_len);
@@ -149,13 +147,10 @@ int verify_pkcs7_signature(const void *data, size_t len,
 
 	if (!trusted_keys)
 		trusted_keys = system_trusted_keyring;
-	ret = pkcs7_validate_trust(pkcs7, trusted_keys, &trusted);
-	if (ret < 0)
-		goto error;
-
-	if (!trusted && untrusted_error) {
-		pr_err("PKCS#7 signature not signed with a trusted key\n");
-		ret = untrusted_error;
+	ret = pkcs7_validate_trust(pkcs7, trusted_keys);
+	if (ret < 0) {
+		if (ret == -ENOKEY)
+			pr_err("PKCS#7 signature not signed with a trusted key\n");
 		goto error;
 	}
 
diff --git a/crypto/asymmetric_keys/pkcs7_key_type.c b/crypto/asymmetric_keys/pkcs7_key_type.c
index ab9bf5363ecd..3b92523882e5 100644
--- a/crypto/asymmetric_keys/pkcs7_key_type.c
+++ b/crypto/asymmetric_keys/pkcs7_key_type.c
@@ -62,7 +62,7 @@ static int pkcs7_preparse(struct key_preparsed_payload *prep)
 
 	return verify_pkcs7_signature(NULL, 0,
 				      prep->data, prep->datalen,
-				      NULL, -ENOKEY, usage,
+				      NULL, usage,
 				      pkcs7_view_content, prep);
 }
 
diff --git a/crypto/asymmetric_keys/pkcs7_parser.h b/crypto/asymmetric_keys/pkcs7_parser.h
index d5eec31e95b6..f4e81074f5e0 100644
--- a/crypto/asymmetric_keys/pkcs7_parser.h
+++ b/crypto/asymmetric_keys/pkcs7_parser.h
@@ -22,7 +22,6 @@ struct pkcs7_signed_info {
 	struct pkcs7_signed_info *next;
 	struct x509_certificate *signer; /* Signing certificate (in msg->certs) */
 	unsigned	index;
-	bool		trusted;
 	bool		unsupported_crypto;	/* T if not usable due to missing crypto */
 
 	/* Message digest - the digest of the Content Data (or NULL) */
diff --git a/crypto/asymmetric_keys/pkcs7_trust.c b/crypto/asymmetric_keys/pkcs7_trust.c
index b9a5487cd82d..36e77cb07bd0 100644
--- a/crypto/asymmetric_keys/pkcs7_trust.c
+++ b/crypto/asymmetric_keys/pkcs7_trust.c
@@ -30,7 +30,6 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7,
 	struct public_key_signature *sig = sinfo->sig;
 	struct x509_certificate *x509, *last = NULL, *p;
 	struct key *key;
-	bool trusted;
 	int ret;
 
 	kenter(",%u,", sinfo->index);
@@ -42,10 +41,8 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7,
 
 	for (x509 = sinfo->signer; x509; x509 = x509->signer) {
 		if (x509->seen) {
-			if (x509->verified) {
-				trusted = x509->trusted;
+			if (x509->verified)
 				goto verified;
-			}
 			kleave(" = -ENOKEY [cached]");
 			return -ENOKEY;
 		}
@@ -122,7 +119,6 @@ static int pkcs7_validate_trust_one(struct pkcs7_message *pkcs7,
 
 matched:
 	ret = verify_signature(key, sig);
-	trusted = test_bit(KEY_FLAG_TRUSTED, &key->flags);
 	key_put(key);
 	if (ret < 0) {
 		if (ret == -ENOMEM)
@@ -134,12 +130,9 @@ matched:
 verified:
 	if (x509) {
 		x509->verified = true;
-		for (p = sinfo->signer; p != x509; p = p->signer) {
+		for (p = sinfo->signer; p != x509; p = p->signer)
 			p->verified = true;
-			p->trusted = trusted;
-		}
 	}
-	sinfo->trusted = trusted;
 	kleave(" = 0");
 	return 0;
 }
@@ -148,7 +141,6 @@ verified:
  * pkcs7_validate_trust - Validate PKCS#7 trust chain
  * @pkcs7: The PKCS#7 certificate to validate
  * @trust_keyring: Signing certificates to use as starting points
- * @_trusted: Set to true if trustworth, false otherwise
  *
  * Validate that the certificate chain inside the PKCS#7 message intersects
  * keys we already know and trust.
@@ -170,16 +162,13 @@ verified:
  * May also return -ENOMEM.
  */
 int pkcs7_validate_trust(struct pkcs7_message *pkcs7,
-			 struct key *trust_keyring,
-			 bool *_trusted)
+			 struct key *trust_keyring)
 {
 	struct pkcs7_signed_info *sinfo;
 	struct x509_certificate *p;
 	int cached_ret = -ENOKEY;
 	int ret;
 
-	*_trusted = false;
-
 	for (p = pkcs7->certs; p; p = p->next)
 		p->seen = false;
 
@@ -193,7 +182,6 @@ int pkcs7_validate_trust(struct pkcs7_message *pkcs7,
 				cached_ret = -ENOPKG;
 			continue;
 		case 0:
-			*_trusted |= sinfo->trusted;
 			cached_ret = 0;
 			continue;
 		default:
diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c
index 265351075b0e..672a94c2c3ff 100644
--- a/crypto/asymmetric_keys/verify_pefile.c
+++ b/crypto/asymmetric_keys/verify_pefile.c
@@ -436,7 +436,7 @@ int verify_pefile_signature(const void *pebuf, unsigned pelen,
 
 	ret = verify_pkcs7_signature(NULL, 0,
 				     pebuf + ctx.sig_offset, ctx.sig_len,
-				     trusted_keys, -EKEYREJECTED, usage,
+				     trusted_keys, usage,
 				     mscode_parse, &ctx);
 	if (ret < 0)
 		goto error;
diff --git a/crypto/asymmetric_keys/x509_parser.h b/crypto/asymmetric_keys/x509_parser.h
index f24f4d808e7f..05eef1c68881 100644
--- a/crypto/asymmetric_keys/x509_parser.h
+++ b/crypto/asymmetric_keys/x509_parser.h
@@ -39,7 +39,6 @@ struct x509_certificate {
 	unsigned	index;
 	bool		seen;			/* Infinite recursion prevention */
 	bool		verified;
-	bool		trusted;
 	bool		self_signed;		/* T if self-signed (check unsupported_sig too) */
 	bool		unsupported_key;	/* T if key uses unsupported crypto */
 	bool		unsupported_sig;	/* T if signature uses unsupported crypto */
diff --git a/include/crypto/pkcs7.h b/include/crypto/pkcs7.h
index 8323e3e57131..583f199400a3 100644
--- a/include/crypto/pkcs7.h
+++ b/include/crypto/pkcs7.h
@@ -33,8 +33,7 @@ extern int pkcs7_get_content_data(const struct pkcs7_message *pkcs7,
  * pkcs7_trust.c
  */
 extern int pkcs7_validate_trust(struct pkcs7_message *pkcs7,
-				struct key *trust_keyring,
-				bool *_trusted);
+				struct key *trust_keyring);
 
 /*
  * pkcs7_verify.c
diff --git a/include/linux/verification.h b/include/linux/verification.h
index bb0fcf941cb7..a10549a6c7cd 100644
--- a/include/linux/verification.h
+++ b/include/linux/verification.h
@@ -33,7 +33,6 @@ struct key;
 extern int verify_pkcs7_signature(const void *data, size_t len,
 				  const void *raw_pkcs7, size_t pkcs7_len,
 				  struct key *trusted_keys,
-				  int untrusted_error,
 				  enum key_being_used_for usage,
 				  int (*view_content)(void *ctx,
 						      const void *data, size_t len,
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 593aace88a02..6a64e03b9f44 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -81,6 +81,6 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
 	}
 
 	return verify_pkcs7_signature(mod, modlen, mod + modlen, sig_len,
-				      NULL, -ENOKEY, VERIFYING_MODULE_SIGNATURE,
+				      NULL, VERIFYING_MODULE_SIGNATURE,
 				      NULL, NULL);
 }
-- 
cgit v1.2.3


From 61bf12d3304d78ff499245ea995858c3bedb162e Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Thu, 7 Apr 2016 18:06:25 +0200
Subject: livepatch: robustify klp_register_patch() API error checking

Commit 425595a7fc20 ("livepatch: reuse module loader code to write
relocations") adds a possibility of dereferncing pointers supplied by the
consumer of the livepatch API before sanity (NULL) checking them (patch
and patch->mod).

Spotted by smatch tool.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Jessica Yu <jeyu@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index eb5db6e837aa..28c37fa3d3f9 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -876,6 +876,9 @@ int klp_register_patch(struct klp_patch *patch)
 {
 	int ret;
 
+	if (!patch || !patch->mod)
+		return -EINVAL;
+
 	if (!is_livepatch_module(patch->mod)) {
 		pr_err("module %s is not marked as a livepatch module",
 		       patch->mod->name);
@@ -885,9 +888,6 @@ int klp_register_patch(struct klp_patch *patch)
 	if (!klp_initialized())
 		return -ENODEV;
 
-	if (!patch || !patch->mod)
-		return -EINVAL;
-
 	/*
 	 * A reference is taken on the patch module to prevent it from being
 	 * unloaded.  Right now, we don't allow patch modules to unload since
-- 
cgit v1.2.3


From ec5e099d6e941668d121ea9ca7057f4fa00830b0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:22 -0700
Subject: perf: optimize perf_fetch_caller_regs

avoid memset in perf_fetch_caller_regs, since it's the critical path of all tracepoints.
It's called from perf_sw_event_sched, perf_event_task_sched_in and all of perf_trace_##call
with this_cpu_ptr(&__perf_regs[..]) which are zero initialized by perpcu init logic and
subsequent call to perf_arch_fetch_caller_regs initializes the same fields on all archs,
so we can safely drop memset from all of the above cases and move it into
perf_ftrace_function_call that calls it with stack allocated pt_regs.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h      | 2 --
 kernel/trace/trace_event_perf.c | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f291275ffd71..e89f7199c223 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -882,8 +882,6 @@ static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned lo
  */
 static inline void perf_fetch_caller_regs(struct pt_regs *regs)
 {
-	memset(regs, 0, sizeof(*regs));
-
 	perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
 }
 
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 00df25fd86ef..7a68afca8249 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -316,6 +316,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 
 	BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
 
+	memset(&regs, 0, sizeof(regs));
 	perf_fetch_caller_regs(&regs);
 
 	entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
-- 
cgit v1.2.3


From 1e1dcd93b468901e114f279c94a0b356adc5e7cd Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:24 -0700
Subject: perf: split perf_trace_buf_prepare into alloc and update parts

split allows to move expensive update of 'struct trace_entry' to later phase.
Repurpose unused 1st argument of perf_tp_event() to indicate event type.

While splitting use temp variable 'rctx' instead of '*rctx' to avoid
unnecessary loads done by the compiler due to -fno-strict-aliasing

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h      |  2 +-
 include/linux/trace_events.h    |  8 ++++----
 include/trace/perf.h            |  8 ++++----
 kernel/events/core.c            |  6 ++++--
 kernel/trace/trace_event_perf.c | 39 ++++++++++++++++++++-------------------
 kernel/trace/trace_kprobe.c     | 10 ++++++----
 kernel/trace/trace_syscalls.c   | 13 +++++++------
 kernel/trace/trace_uprobe.c     |  5 +++--
 8 files changed, 49 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e89f7199c223..eb41b535ef38 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1016,7 +1016,7 @@ static inline bool perf_paranoid_kernel(void)
 }
 
 extern void perf_event_init(void);
-extern void perf_tp_event(u64 addr, u64 count, void *record,
+extern void perf_tp_event(u16 event_type, u64 count, void *record,
 			  int entry_size, struct pt_regs *regs,
 			  struct hlist_head *head, int rctx,
 			  struct task_struct *task);
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 0810f81b6db2..56f795e6a093 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -605,15 +605,15 @@ extern void perf_trace_del(struct perf_event *event, int flags);
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
 extern void ftrace_profile_free_filter(struct perf_event *event);
-extern void *perf_trace_buf_prepare(int size, unsigned short type,
-				    struct pt_regs **regs, int *rctxp);
+void perf_trace_buf_update(void *record, u16 type);
+void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 
 static inline void
-perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr,
+perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 		       u64 count, struct pt_regs *regs, void *head,
 		       struct task_struct *task)
 {
-	perf_tp_event(addr, count, raw_data, size, regs, head, rctx, task);
+	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task);
 }
 #endif
 
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 6f7e37869065..77cd9043b7e4 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -53,8 +53,7 @@ perf_trace_##call(void *__data, proto)					\
 			     sizeof(u64));				\
 	__entry_size -= sizeof(u32);					\
 									\
-	entry = perf_trace_buf_prepare(__entry_size,			\
-			event_call->event.type, &__regs, &rctx);	\
+	entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx);	\
 	if (!entry)							\
 		return;							\
 									\
@@ -64,8 +63,9 @@ perf_trace_##call(void *__data, proto)					\
 									\
 	{ assign; }							\
 									\
-	perf_trace_buf_submit(entry, __entry_size, rctx, 0,		\
-		__count, __regs, head, __task);				\
+	perf_trace_buf_submit(entry, __entry_size, rctx,		\
+			      event_call->event.type, __count, __regs,	\
+			      head, __task);				\
 }
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index de24fbce5277..d8512883c0a0 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6987,7 +6987,7 @@ static int perf_tp_event_match(struct perf_event *event,
 	return 1;
 }
 
-void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 		   struct pt_regs *regs, struct hlist_head *head, int rctx,
 		   struct task_struct *task)
 {
@@ -6999,9 +6999,11 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 		.data = record,
 	};
 
-	perf_sample_data_init(&data, addr, 0);
+	perf_sample_data_init(&data, 0, 0);
 	data.raw = &raw;
 
+	perf_trace_buf_update(record, event_type);
+
 	hlist_for_each_entry_rcu(event, head, hlist_entry) {
 		if (perf_tp_event_match(event, &data, regs))
 			perf_swevent_event(event, count, &data, regs);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 7a68afca8249..5a927075977f 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -260,42 +260,43 @@ void perf_trace_del(struct perf_event *p_event, int flags)
 	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
 }
 
-void *perf_trace_buf_prepare(int size, unsigned short type,
-			     struct pt_regs **regs, int *rctxp)
+void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
 {
-	struct trace_entry *entry;
-	unsigned long flags;
 	char *raw_data;
-	int pc;
+	int rctx;
 
 	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
 
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
-			"perf buffer not large enough"))
+		      "perf buffer not large enough"))
 		return NULL;
 
-	pc = preempt_count();
-
-	*rctxp = perf_swevent_get_recursion_context();
-	if (*rctxp < 0)
+	*rctxp = rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
 		return NULL;
 
 	if (regs)
-		*regs = this_cpu_ptr(&__perf_regs[*rctxp]);
-	raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
+		*regs = this_cpu_ptr(&__perf_regs[rctx]);
+	raw_data = this_cpu_ptr(perf_trace_buf[rctx]);
 
 	/* zero the dead bytes from align to not leak stack to user */
 	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
+	return raw_data;
+}
+EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
+NOKPROBE_SYMBOL(perf_trace_buf_alloc);
+
+void perf_trace_buf_update(void *record, u16 type)
+{
+	struct trace_entry *entry = record;
+	int pc = preempt_count();
+	unsigned long flags;
 
-	entry = (struct trace_entry *)raw_data;
 	local_save_flags(flags);
 	tracing_generic_entry_update(entry, flags, pc);
 	entry->type = type;
-
-	return raw_data;
 }
-EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
-NOKPROBE_SYMBOL(perf_trace_buf_prepare);
+NOKPROBE_SYMBOL(perf_trace_buf_update);
 
 #ifdef CONFIG_FUNCTION_TRACER
 static void
@@ -319,13 +320,13 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 	memset(&regs, 0, sizeof(regs));
 	perf_fetch_caller_regs(&regs);
 
-	entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
+	entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
-	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
+	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
 			      1, &regs, head, NULL);
 
 #undef ENTRY_SIZE
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 919e0ddd8fcc..5546eec0505f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1149,14 +1149,15 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1184,14 +1185,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	size = ALIGN(__size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		return;
 
 	entry->func = (unsigned long)tk->rp.kp.addr;
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kretprobe_perf_func);
 #endif	/* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index e78f364cc192..b2b6efc083a4 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -587,15 +587,16 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	size = ALIGN(size + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
-				sys_data->enter_event->event.type, NULL, &rctx);
+	rec = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!rec)
 		return;
 
 	rec->nr = syscall_nr;
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(rec, size, rctx,
+			      sys_data->enter_event->event.type, 1, regs,
+			      head, NULL);
 }
 
 static int perf_sysenter_enable(struct trace_event_call *call)
@@ -660,14 +661,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
 	size -= sizeof(u32);
 
-	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
-				sys_data->exit_event->event.type, NULL, &rctx);
+	rec = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!rec)
 		return;
 
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
-	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
+			      1, regs, head, NULL);
 }
 
 static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 7915142c89e4..c53485441c88 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1131,7 +1131,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 	if (hlist_empty(head))
 		goto out;
 
-	entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx);
+	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
 		goto out;
 
@@ -1152,7 +1152,8 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 		memset(data + len, 0, size - esize - len);
 	}
 
-	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
+	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
+			      head, NULL);
  out:
 	preempt_enable();
 }
-- 
cgit v1.2.3


From 98b5c2c65c2951772a8fc661f50d675e450e8bce Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:25 -0700
Subject: perf, bpf: allow bpf programs attach to tracepoints

introduce BPF_PROG_TYPE_TRACEPOINT program type and allow it to be attached
to the perf tracepoint handler, which will copy the arguments into
the per-cpu buffer and pass it to the bpf program as its first argument.
The layout of the fields can be discovered by doing
'cat /sys/kernel/debug/tracing/events/sched/sched_switch/format'
prior to the compilation of the program with exception that first 8 bytes
are reserved and not accessible to the program. This area is used to store
the pointer to 'struct pt_regs' which some of the bpf helpers will use:
+---------+
| 8 bytes | hidden 'struct pt_regs *' (inaccessible to bpf program)
+---------+
| N bytes | static tracepoint fields defined in tracepoint/format (bpf readonly)
+---------+
| dynamic | __dynamic_array bytes of tracepoint (inaccessible to bpf yet)
+---------+

Not that all of the fields are already dumped to user space via perf ring buffer
and broken application access it directly without consulting tracepoint/format.
Same rule applies here: static tracepoint fields should only be accessed
in a format defined in tracepoint/format. The order of fields and
field sizes are not an ABI.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/perf.h     | 10 +++++++++-
 include/uapi/linux/bpf.h |  1 +
 kernel/events/core.c     | 13 +++++++++----
 3 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/trace/perf.h b/include/trace/perf.h
index 77cd9043b7e4..a182306eefd7 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -34,6 +34,7 @@ perf_trace_##call(void *__data, proto)					\
 	struct trace_event_call *event_call = __data;			\
 	struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
 	struct trace_event_raw_##call *entry;				\
+	struct bpf_prog *prog = event_call->prog;			\
 	struct pt_regs *__regs;						\
 	u64 __count = 1;						\
 	struct task_struct *__task = NULL;				\
@@ -45,7 +46,7 @@ perf_trace_##call(void *__data, proto)					\
 	__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
 									\
 	head = this_cpu_ptr(event_call->perf_events);			\
-	if (__builtin_constant_p(!__task) && !__task &&			\
+	if (!prog && __builtin_constant_p(!__task) && !__task &&	\
 				hlist_empty(head))			\
 		return;							\
 									\
@@ -63,6 +64,13 @@ perf_trace_##call(void *__data, proto)					\
 									\
 	{ assign; }							\
 									\
+	if (prog) {							\
+		*(struct pt_regs **)entry = __regs;			\
+		if (!trace_call_bpf(prog, entry) || hlist_empty(head)) { \
+			perf_swevent_put_recursion_context(rctx);	\
+			return;						\
+		}							\
+	}								\
 	perf_trace_buf_submit(entry, __entry_size, rctx,		\
 			      event_call->event.type, __count, __regs,	\
 			      head, __task);				\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 23917bb47bf3..70eda5aeb304 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -92,6 +92,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_KPROBE,
 	BPF_PROG_TYPE_SCHED_CLS,
 	BPF_PROG_TYPE_SCHED_ACT,
+	BPF_PROG_TYPE_TRACEPOINT,
 };
 
 #define BPF_PSEUDO_MAP_FD	1
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d8512883c0a0..e5ffe97d6166 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6725,12 +6725,13 @@ int perf_swevent_get_recursion_context(void)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
-inline void perf_swevent_put_recursion_context(int rctx)
+void perf_swevent_put_recursion_context(int rctx)
 {
 	struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
 
 	put_recursion_context(swhash->recursion, rctx);
 }
+EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
@@ -7106,6 +7107,7 @@ static void perf_event_free_filter(struct perf_event *event)
 
 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 {
+	bool is_kprobe, is_tracepoint;
 	struct bpf_prog *prog;
 
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
@@ -7114,15 +7116,18 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 	if (event->tp_event->prog)
 		return -EEXIST;
 
-	if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
-		/* bpf programs can only be attached to u/kprobes */
+	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
+	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
+	if (!is_kprobe && !is_tracepoint)
+		/* bpf programs can only be attached to u/kprobe or tracepoint */
 		return -EINVAL;
 
 	prog = bpf_prog_get(prog_fd);
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	if (prog->type != BPF_PROG_TYPE_KPROBE) {
+	if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
+	    (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
 		/* valid fd, but invalid bpf program type */
 		bpf_prog_put(prog);
 		return -EINVAL;
-- 
cgit v1.2.3


From 9fd82b610ba3351f05a59c3e9117cfefe82f7751 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:26 -0700
Subject: bpf: register BPF_PROG_TYPE_TRACEPOINT program type

register tracepoint bpf program type and let it call the same set
of helper functions as BPF_PROG_TYPE_KPROBE

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 45 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 43 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3e4ffb3ace5f..3e5ebe3254d2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -268,7 +268,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 	.arg5_type	= ARG_CONST_STACK_SIZE,
 };
 
-static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
 	case BPF_FUNC_map_lookup_elem:
@@ -295,12 +295,20 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
+	default:
+		return NULL;
+	}
+}
+
+static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
 	case BPF_FUNC_perf_event_output:
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
 	default:
-		return NULL;
+		return tracing_func_proto(func_id);
 	}
 }
 
@@ -332,9 +340,42 @@ static struct bpf_prog_type_list kprobe_tl = {
 	.type	= BPF_PROG_TYPE_KPROBE,
 };
 
+static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_perf_event_output:
+	case BPF_FUNC_get_stackid:
+		return NULL;
+	default:
+		return tracing_func_proto(func_id);
+	}
+}
+
+static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
+		return false;
+	if (type != BPF_READ)
+		return false;
+	if (off % size != 0)
+		return false;
+	return true;
+}
+
+static const struct bpf_verifier_ops tracepoint_prog_ops = {
+	.get_func_proto  = tp_prog_func_proto,
+	.is_valid_access = tp_prog_is_valid_access,
+};
+
+static struct bpf_prog_type_list tracepoint_tl = {
+	.ops	= &tracepoint_prog_ops,
+	.type	= BPF_PROG_TYPE_TRACEPOINT,
+};
+
 static int __init register_kprobe_prog_ops(void)
 {
 	bpf_register_prog_type(&kprobe_tl);
+	bpf_register_prog_type(&tracepoint_tl);
 	return 0;
 }
 late_initcall(register_kprobe_prog_ops);
-- 
cgit v1.2.3


From 9940d67c93b5bb7ddcf862b41b1847cb728186c4 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:27 -0700
Subject: bpf: support bpf_get_stackid() and bpf_perf_event_output() in
 tracepoint programs

needs two wrapper functions to fetch 'struct pt_regs *' to convert
tracepoint bpf context into kprobe bpf context to reuse existing
helper functions

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  1 +
 kernel/bpf/stackmap.c    |  2 +-
 kernel/trace/bpf_trace.c | 42 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 43 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 21ee41b92e8a..198f6ace70ec 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -160,6 +160,7 @@ struct bpf_array {
 #define MAX_TAIL_CALL_CNT 32
 
 u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..35114725cf30 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -116,7 +116,7 @@ free_smap:
 	return ERR_PTR(err);
 }
 
-static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 {
 	struct pt_regs *regs = (struct pt_regs *) (long) r1;
 	struct bpf_map *map = (struct bpf_map *) (long) r2;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3e5ebe3254d2..413ec5614180 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -340,12 +340,52 @@ static struct bpf_prog_type_list kprobe_tl = {
 	.type	= BPF_PROG_TYPE_KPROBE,
 };
 
+static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+{
+	/*
+	 * r1 points to perf tracepoint buffer where first 8 bytes are hidden
+	 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
+	 * from there and call the same bpf_perf_event_output() helper
+	 */
+	u64 ctx = *(long *)r1;
+
+	return bpf_perf_event_output(ctx, r2, index, r4, size);
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
+	.func		= bpf_perf_event_output_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_STACK,
+	.arg5_type	= ARG_CONST_STACK_SIZE,
+};
+
+static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	u64 ctx = *(long *)r1;
+
+	return bpf_get_stackid(ctx, r2, r3, r4, r5);
+}
+
+static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
+	.func		= bpf_get_stackid_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
 	case BPF_FUNC_perf_event_output:
+		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
-		return NULL;
+		return &bpf_get_stackid_proto_tp;
 	default:
 		return tracing_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 32bbe0078afe86a8bf4c67c6b3477781b15e94dc Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 18:43:28 -0700
Subject: bpf: sanitize bpf tracepoint access

during bpf program loading remember the last byte of ctx access
and at the time of attaching the program to tracepoint check that
the program doesn't access bytes beyond defined in tracepoint fields

This also disallows access to __dynamic_array fields, but can be
relaxed in the future.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h          |  1 +
 include/linux/trace_events.h |  1 +
 kernel/bpf/verifier.c        |  6 +++++-
 kernel/events/core.c         |  8 ++++++++
 kernel/trace/trace_events.c  | 18 ++++++++++++++++++
 5 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 198f6ace70ec..b2365a6eba3d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -131,6 +131,7 @@ struct bpf_prog_type_list {
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
+	u32 max_ctx_offset;
 	const struct bpf_verifier_ops *ops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 56f795e6a093..fe6441203b59 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -569,6 +569,7 @@ extern int trace_define_field(struct trace_event_call *call, const char *type,
 			      int is_signed, int filter_type);
 extern int trace_add_event_call(struct trace_event_call *call);
 extern int trace_remove_event_call(struct trace_event_call *call);
+extern int trace_event_get_offsets(struct trace_event_call *call);
 
 #define is_signed_type(type)	(((type)(-1)) < (type)1)
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2e08f8e9b771..58792fed5678 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -652,8 +652,12 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
 			    enum bpf_access_type t)
 {
 	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t))
+	    env->prog->aux->ops->is_valid_access(off, size, t)) {
+		/* remember the offset of last byte accessed in ctx */
+		if (env->prog->aux->max_ctx_offset < off + size)
+			env->prog->aux->max_ctx_offset = off + size;
 		return 0;
+	}
 
 	verbose("invalid bpf_context access off=%d size=%d\n", off, size);
 	return -EACCES;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e5ffe97d6166..9a01019ff7c8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7133,6 +7133,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return -EINVAL;
 	}
 
+	if (is_tracepoint) {
+		int off = trace_event_get_offsets(event->tp_event);
+
+		if (prog->aux->max_ctx_offset > off) {
+			bpf_prog_put(prog);
+			return -EACCES;
+		}
+	}
 	event->tp_event->prog = prog;
 
 	return 0;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05ddc0820771..ced963049e0a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -204,6 +204,24 @@ static void trace_destroy_fields(struct trace_event_call *call)
 	}
 }
 
+/*
+ * run-time version of trace_event_get_offsets_<call>() that returns the last
+ * accessible offset of trace fields excluding __dynamic_array bytes
+ */
+int trace_event_get_offsets(struct trace_event_call *call)
+{
+	struct ftrace_event_field *tail;
+	struct list_head *head;
+
+	head = trace_get_fields(call);
+	/*
+	 * head->next points to the last field with the largest offset,
+	 * since it was added last by trace_define_field()
+	 */
+	tail = list_first_entry(head, struct ftrace_event_field, link);
+	return tail->offset + tail->size;
+}
+
 int trace_event_raw_init(struct trace_event_call *call)
 {
 	int id;
-- 
cgit v1.2.3


From 07016151a446d25397b24588df4ed5cf777a69bb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 5 Apr 2016 22:33:17 +0200
Subject: bpf, verifier: further improve search pruning

The verifier needs to go through every path of the program in
order to check that it terminates safely, which can be quite a
lot of instructions that need to be processed f.e. in cases with
more branchy programs. With search pruning from f1bca824dabb ("bpf:
add search pruning optimization to verifier") the search space can
already be reduced significantly when the verifier detects that
a previously walked path with same register and stack contents
terminated already (see verifier's states_equal()), so the search
can skip walking those states.

When working with larger programs of > ~2000 (out of max 4096)
insns, we found that the current limit of 32k instructions is easily
hit. For example, a case we ran into is that the search space cannot
be pruned due to branches at the beginning of the program that make
use of certain stack space slots (STACK_MISC), which are never used
in the remaining program (STACK_INVALID). Therefore, the verifier
needs to walk paths for the slots in STACK_INVALID state, but also
all remaining paths with a stack structure, where the slots are in
STACK_MISC, which can nearly double the search space needed. After
various experiments, we find that a limit of 64k processed insns is
a more reasonable choice when dealing with larger programs in practice.
This still allows to reject extreme crafted cases that can have a
much higher complexity (f.e. > ~300k) within the 4096 insns limit
due to search pruning not being able to take effect.

Furthermore, we found that a lot of states can be pruned after a
call instruction, f.e. we were able to reduce the search state by
~35% in some cases with this heuristic, trade-off is to keep a bit
more states in env->explored_states. Usually, call instructions
have a number of preceding register assignments and/or stack stores,
where search pruning has a better chance to suceed in states_equal()
test. The current code marks the branch targets with STATE_LIST_MARK
in case of conditional jumps, and the next (t + 1) instruction in
case of unconditional jump so that f.e. a backjump will walk it. We
also did experiments with using t + insns[t].off + 1 as a marker in
the unconditionally jump case instead of t + 1 with the rationale
that these two branches of execution that converge after the label
might have more potential of pruning. We found that it was a bit
better, but not necessarily significantly better than the current
state, perhaps also due to clang not generating back jumps often.
Hence, we left that as is for now.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 58792fed5678..8233021538d3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -202,6 +202,9 @@ struct verifier_env {
 	bool allow_ptr_leaks;
 };
 
+#define BPF_COMPLEXITY_LIMIT_INSNS	65536
+#define BPF_COMPLEXITY_LIMIT_STACK	1024
+
 /* verbose verifier prints what it's seeing
  * bpf_check() is called under lock, so no race to access these global vars
  */
@@ -454,7 +457,7 @@ static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
 	elem->next = env->head;
 	env->head = elem;
 	env->stack_size++;
-	if (env->stack_size > 1024) {
+	if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
 		verbose("BPF program is too complex\n");
 		goto err;
 	}
@@ -1543,6 +1546,8 @@ peek_stack:
 				goto peek_stack;
 			else if (ret < 0)
 				goto err_free;
+			if (t + 1 < insn_cnt)
+				env->explored_states[t + 1] = STATE_LIST_MARK;
 		} else if (opcode == BPF_JA) {
 			if (BPF_SRC(insns[t].code) != BPF_K) {
 				ret = -EINVAL;
@@ -1747,7 +1752,7 @@ static int do_check(struct verifier_env *env)
 		insn = &insns[insn_idx];
 		class = BPF_CLASS(insn->code);
 
-		if (++insn_processed > 32768) {
+		if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
 			verbose("BPF program is too large. Proccessed %d insn\n",
 				insn_processed);
 			return -E2BIG;
-- 
cgit v1.2.3


From 6c9d9c81924b4b63c7a487e90fddb3b2d0f7d458 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 7 Apr 2016 23:38:46 +0200
Subject: cpufreq: Call cpufreq_disable_fast_switch() in sugov_exit()

Due to differences in the cpufreq core's handling of runtime CPU
offline and nonboot CPUs disabling during system suspend-to-RAM,
fast frequency switching gets disabled after a suspend-to-RAM and
resume cycle on all of the nonboot CPUs.

To prevent that from happening, move the invocation of
cpufreq_disable_fast_switch() from cpufreq_exit_governor() to
sugov_exit(), as the schedutil governor is the only user of fast
frequency switching today anyway.

That simply prevents cpufreq_disable_fast_switch() from being called
without invoking the ->governor callback for the CPUFREQ_GOV_POLICY_EXIT
event (which happens during system suspend now).

Fixes: b7898fda5bc7 (cpufreq: Support for fast frequency switching)
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq.c        | 19 +++++++++++--------
 include/linux/cpufreq.h          |  1 +
 kernel/sched/cpufreq_schedutil.c |  2 ++
 3 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index a5b7d77d4816..77d77a4e3b74 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -77,7 +77,11 @@ static inline bool has_target(void)
 static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event);
 static unsigned int __cpufreq_get(struct cpufreq_policy *policy);
 static int cpufreq_start_governor(struct cpufreq_policy *policy);
-static int cpufreq_exit_governor(struct cpufreq_policy *policy);
+
+static inline int cpufreq_exit_governor(struct cpufreq_policy *policy)
+{
+	return cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
+}
 
 /**
  * Two notifier lists: the "policy" list is involved in the
@@ -482,7 +486,11 @@ void cpufreq_enable_fast_switch(struct cpufreq_policy *policy)
 }
 EXPORT_SYMBOL_GPL(cpufreq_enable_fast_switch);
 
-static void cpufreq_disable_fast_switch(struct cpufreq_policy *policy)
+/**
+ * cpufreq_disable_fast_switch - Disable fast frequency switching for policy.
+ * @policy: cpufreq policy to disable fast frequency switching for.
+ */
+void cpufreq_disable_fast_switch(struct cpufreq_policy *policy)
 {
 	mutex_lock(&cpufreq_fast_switch_lock);
 	if (policy->fast_switch_enabled) {
@@ -492,6 +500,7 @@ static void cpufreq_disable_fast_switch(struct cpufreq_policy *policy)
 	}
 	mutex_unlock(&cpufreq_fast_switch_lock);
 }
+EXPORT_SYMBOL_GPL(cpufreq_disable_fast_switch);
 
 /*********************************************************************
  *                          SYSFS INTERFACE                          *
@@ -2060,12 +2069,6 @@ static int cpufreq_start_governor(struct cpufreq_policy *policy)
 	return ret ? ret : cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
 }
 
-static int cpufreq_exit_governor(struct cpufreq_policy *policy)
-{
-	cpufreq_disable_fast_switch(policy);
-	return cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
-}
-
 int cpufreq_register_governor(struct cpufreq_governor *governor)
 {
 	int err;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 55e69ebb035c..4e81e08db752 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -168,6 +168,7 @@ int cpufreq_update_policy(unsigned int cpu);
 bool have_governor_per_policy(void);
 struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
 void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
+void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
 #else
 static inline unsigned int cpufreq_get(unsigned int cpu)
 {
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index d27ae064b476..154ae3a51e86 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -398,6 +398,8 @@ static int sugov_exit(struct cpufreq_policy *policy)
 	struct sugov_tunables *tunables = sg_policy->tunables;
 	unsigned int count;
 
+	cpufreq_disable_fast_switch(policy);
+
 	mutex_lock(&global_tunables_lock);
 
 	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
-- 
cgit v1.2.3


From fc64005c93090c052637f63578d810b037abb1a1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 10 Apr 2016 01:33:30 -0400
Subject: don't bother with ->d_inode->i_sb - it's always equal to ->d_sb

... and neither can ever be NULL

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/vfs_inode.c                 |  2 +-
 fs/btrfs/tree-log.c               |  6 +++---
 fs/cifs/cifs_dfs_ref.c            |  2 +-
 fs/cifs/inode.c                   |  3 +--
 fs/cifs/readdir.c                 |  2 +-
 fs/cifs/xattr.c                   | 16 ++++------------
 fs/efs/namei.c                    |  2 +-
 fs/exofs/super.c                  |  2 +-
 fs/ext2/namei.c                   |  2 +-
 fs/ext4/namei.c                   |  4 ++--
 fs/f2fs/namei.c                   |  2 +-
 fs/gfs2/ops_fstype.c              |  2 +-
 fs/gfs2/super.c                   |  2 +-
 fs/jffs2/dir.c                    |  2 +-
 fs/jffs2/super.c                  |  2 +-
 fs/jfs/namei.c                    |  2 +-
 fs/namei.c                        |  4 ++--
 fs/nfs/direct.c                   |  2 +-
 fs/nfsd/nfs3proc.c                |  4 ++--
 fs/nfsd/nfs3xdr.c                 |  2 +-
 fs/nfsd/nfsfh.c                   |  2 +-
 fs/nilfs2/namei.c                 |  2 +-
 fs/ocfs2/file.c                   |  2 +-
 fs/udf/namei.c                    |  2 +-
 fs/ufs/super.c                    |  2 +-
 include/trace/events/ext4.h       |  6 +++---
 kernel/audit_watch.c              |  2 +-
 security/integrity/evm/evm_main.c |  4 ++--
 security/selinux/hooks.c          |  2 +-
 security/smack/smack_lsm.c        |  2 +-
 30 files changed, 41 insertions(+), 50 deletions(-)

(limited to 'kernel')

diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 3a08b3e6ff1d..f4645c515262 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1071,7 +1071,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	if (IS_ERR(st))
 		return PTR_ERR(st);
 
-	v9fs_stat2inode(st, d_inode(dentry), d_inode(dentry)->i_sb);
+	v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb);
 	generic_fillattr(d_inode(dentry), stat);
 
 	p9stat_free(st);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 24d03c751149..a82e20ecbee1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4851,7 +4851,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 			goto out;
 
 	if (!S_ISDIR(inode->i_mode)) {
-		if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
 			goto out;
 		inode = d_inode(parent);
 	}
@@ -4872,7 +4872,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
 			break;
 		}
 
-		if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
 			break;
 
 		if (IS_ROOT(parent))
@@ -5285,7 +5285,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	}
 
 	while (1) {
-		if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb)
+		if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
 			break;
 
 		inode = d_inode(parent);
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index e956cba94338..94f2c8a9ae6d 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -302,7 +302,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 	if (full_path == NULL)
 		goto cdda_exit;
 
-	cifs_sb = CIFS_SB(d_inode(mntpt)->i_sb);
+	cifs_sb = CIFS_SB(mntpt->d_sb);
 	tlink = cifs_sb_tlink(cifs_sb);
 	if (IS_ERR(tlink)) {
 		mnt = ERR_CAST(tlink);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index aeb26dbfa1bf..4cd4705ebfad 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2418,8 +2418,7 @@ cifs_setattr_exit:
 int
 cifs_setattr(struct dentry *direntry, struct iattr *attrs)
 {
-	struct inode *inode = d_inode(direntry);
-	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
 	struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb);
 
 	if (pTcon->unix_ext)
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b30a4a6d98a0..4cfb4d9f88e2 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -78,7 +78,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
 {
 	struct dentry *dentry, *alias;
 	struct inode *inode;
-	struct super_block *sb = d_inode(parent)->i_sb;
+	struct super_block *sb = parent->d_sb;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 
 	cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index f5dc2f0df4ad..612de933431f 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -52,9 +52,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
 		return -EIO;
 	if (d_really_is_negative(direntry))
 		return -EIO;
-	sb = d_inode(direntry)->i_sb;
-	if (sb == NULL)
-		return -EIO;
+	sb = direntry->d_sb;
 
 	cifs_sb = CIFS_SB(sb);
 	tlink = cifs_sb_tlink(cifs_sb);
@@ -113,9 +111,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 		return -EIO;
 	if (d_really_is_negative(direntry))
 		return -EIO;
-	sb = d_inode(direntry)->i_sb;
-	if (sb == NULL)
-		return -EIO;
+	sb = direntry->d_sb;
 
 	cifs_sb = CIFS_SB(sb);
 	tlink = cifs_sb_tlink(cifs_sb);
@@ -248,9 +244,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 		return -EIO;
 	if (d_really_is_negative(direntry))
 		return -EIO;
-	sb = d_inode(direntry)->i_sb;
-	if (sb == NULL)
-		return -EIO;
+	sb = direntry->d_sb;
 
 	cifs_sb = CIFS_SB(sb);
 	tlink = cifs_sb_tlink(cifs_sb);
@@ -384,9 +378,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
 		return -EIO;
 	if (d_really_is_negative(direntry))
 		return -EIO;
-	sb = d_inode(direntry)->i_sb;
-	if (sb == NULL)
-		return -EIO;
+	sb = direntry->d_sb;
 
 	cifs_sb = CIFS_SB(sb);
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index 40ba9cc41bf7..d34a40edcdb2 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -113,7 +113,7 @@ struct dentry *efs_get_parent(struct dentry *child)
 
 	ino = efs_find_entry(d_inode(child), "..", 2);
 	if (ino)
-		parent = d_obtain_alias(efs_iget(d_inode(child)->i_sb, ino));
+		parent = d_obtain_alias(efs_iget(child->d_sb, ino));
 
 	return parent;
 }
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 6658a50530a0..192373653dfb 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -958,7 +958,7 @@ static struct dentry *exofs_get_parent(struct dentry *child)
 	if (!ino)
 		return ERR_PTR(-ESTALE);
 
-	return d_obtain_alias(exofs_iget(d_inode(child)->i_sb, ino));
+	return d_obtain_alias(exofs_iget(child->d_sb, ino));
 }
 
 static struct inode *exofs_nfs_get_inode(struct super_block *sb,
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 7a2be8f7f3c3..1a7eb49a115d 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -82,7 +82,7 @@ struct dentry *ext2_get_parent(struct dentry *child)
 	unsigned long ino = ext2_inode_by_name(d_inode(child), &dotdot);
 	if (!ino)
 		return ERR_PTR(-ENOENT);
-	return d_obtain_alias(ext2_iget(d_inode(child)->i_sb, ino));
+	return d_obtain_alias(ext2_iget(child->d_sb, ino));
 } 
 
 /*
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 48e4b8907826..5611ec9348d7 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1638,13 +1638,13 @@ struct dentry *ext4_get_parent(struct dentry *child)
 	ino = le32_to_cpu(de->inode);
 	brelse(bh);
 
-	if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) {
+	if (!ext4_valid_inum(child->d_sb, ino)) {
 		EXT4_ERROR_INODE(d_inode(child),
 				 "bad parent inode number: %u", ino);
 		return ERR_PTR(-EFSCORRUPTED);
 	}
 
-	return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino));
+	return d_obtain_alias(ext4_iget_normal(child->d_sb, ino));
 }
 
 /*
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 7876f1052101..db874ad3514a 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -202,7 +202,7 @@ struct dentry *f2fs_get_parent(struct dentry *child)
 	unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot);
 	if (!ino)
 		return ERR_PTR(-ENOENT);
-	return d_obtain_alias(f2fs_iget(d_inode(child)->i_sb, ino));
+	return d_obtain_alias(f2fs_iget(child->d_sb, ino));
 }
 
 static int __recover_dot_dentries(struct inode *dir, nid_t pino)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 49b0bff18fe3..c09c63dcd7a2 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1360,7 +1360,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
 		return ERR_PTR(error);
 	}
 	s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags,
-		 d_inode(path.dentry)->i_sb->s_bdev);
+		 path.dentry->d_sb->s_bdev);
 	path_put(&path);
 	if (IS_ERR(s)) {
 		pr_warn("gfs2 mount does not exist\n");
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index f8a0cd821290..9b2ff353e45f 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1176,7 +1176,7 @@ static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *s
 
 static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct super_block *sb = d_inode(dentry)->i_sb;
+	struct super_block *sb = dentry->d_sb;
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_statfs_change_host sc;
 	int error;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 30c4c9ebb693..7a9579e03dbb 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -241,7 +241,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry)
 
 static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct dentry *dentry)
 {
-	struct jffs2_sb_info *c = JFFS2_SB_INFO(d_inode(old_dentry)->i_sb);
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dentry->d_sb);
 	struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry));
 	struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i);
 	int ret;
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0a9a114bb9d1..5ef21f4c4c77 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -147,7 +147,7 @@ static struct dentry *jffs2_get_parent(struct dentry *child)
 	JFFS2_DEBUG("Parent of directory ino #%u is #%u\n",
 		    f->inocache->ino, pino);
 
-	return d_obtain_alias(jffs2_iget(d_inode(child)->i_sb, pino));
+	return d_obtain_alias(jffs2_iget(child->d_sb, pino));
 }
 
 static const struct export_operations jffs2_export_ops = {
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 701f89370de7..8a40941ac9a6 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1524,7 +1524,7 @@ struct dentry *jfs_get_parent(struct dentry *dentry)
 	parent_ino =
 		le32_to_cpu(JFS_IP(d_inode(dentry))->i_dtroot.header.idotdot);
 
-	return d_obtain_alias(jfs_iget(d_inode(dentry)->i_sb, parent_ino));
+	return d_obtain_alias(jfs_iget(dentry->d_sb, parent_ino));
 }
 
 const struct inode_operations jfs_dir_inode_operations = {
diff --git a/fs/namei.c b/fs/namei.c
index 3498d53de26f..c0d551fc43a0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2653,7 +2653,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 		return NULL;
 	}
 
-	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
+	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
 
 	p = d_ancestor(p2, p1);
 	if (p) {
@@ -2680,7 +2680,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 	inode_unlock(p1->d_inode);
 	if (p1 != p2) {
 		inode_unlock(p2->d_inode);
-		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
+		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
 	}
 }
 EXPORT_SYMBOL(unlock_rename);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 7a0cfd3266e5..e5daa932b823 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -396,7 +396,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
 static void nfs_direct_readpage_release(struct nfs_page *req)
 {
 	dprintk("NFS: direct read done (%s/%llu %d@%lld)\n",
-		d_inode(req->wb_context->dentry)->i_sb->s_id,
+		req->wb_context->dentry->d_sb->s_id,
 		(unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)),
 		req->wb_bytes,
 		(long long)req_offset(req));
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 51c3b06e8036..d818e4ffd79f 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -552,7 +552,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle    *argp,
 	 * different read/write sizes for file systems known to have
 	 * problems with large blocks */
 	if (nfserr == 0) {
-		struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb;
+		struct super_block *sb = argp->fh.fh_dentry->d_sb;
 
 		/* Note that we don't care for remote fs's here */
 		if (sb->s_magic == MSDOS_SUPER_MAGIC) {
@@ -588,7 +588,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle      *argp,
 	nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
 
 	if (nfserr == 0) {
-		struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb;
+		struct super_block *sb = argp->fh.fh_dentry->d_sb;
 
 		/* Note that we don't care for remote fs's here */
 		switch (sb->s_magic) {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 2246454dec76..93d5853f8c99 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -146,7 +146,7 @@ static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp)
 	default:
 	case FSIDSOURCE_DEV:
 		p = xdr_encode_hyper(p, (u64)huge_encode_dev
-				     (d_inode(fhp->fh_dentry)->i_sb->s_dev));
+				     (fhp->fh_dentry->d_sb->s_dev));
 		break;
 	case FSIDSOURCE_FSID:
 		p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid);
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index c1681ce894c5..a8919444c460 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -426,7 +426,7 @@ static bool is_root_export(struct svc_export *exp)
 
 static struct super_block *exp_sb(struct svc_export *exp)
 {
-	return d_inode(exp->ex_path.dentry)->i_sb;
+	return exp->ex_path.dentry->d_sb;
 }
 
 static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp)
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 7ccdb961eea9..38d67f3e25bc 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -457,7 +457,7 @@ static struct dentry *nilfs_get_parent(struct dentry *child)
 
 	root = NILFS_I(d_inode(child))->i_root;
 
-	inode = nilfs_iget(d_inode(child)->i_sb, root, ino);
+	inode = nilfs_iget(child->d_sb, root, ino);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c18ab45f8d21..c6fdcbd46bba 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1290,7 +1290,7 @@ int ocfs2_getattr(struct vfsmount *mnt,
 		  struct kstat *stat)
 {
 	struct inode *inode = d_inode(dentry);
-	struct super_block *sb = d_inode(dentry)->i_sb;
+	struct super_block *sb = dentry->d_sb;
 	struct ocfs2_super *osb = sb->s_fs_info;
 	int err;
 
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index a2ba11eca995..c3e5c9679371 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1250,7 +1250,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
 	brelse(fibh.sbh);
 
 	tloc = lelb_to_cpu(cfi.icb.extLocation);
-	inode = udf_iget(d_inode(child)->i_sb, &tloc);
+	inode = udf_iget(child->d_sb, &tloc);
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 442fd52ebffe..f04ab232d08d 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -132,7 +132,7 @@ static struct dentry *ufs_get_parent(struct dentry *child)
 	ino = ufs_inode_by_name(d_inode(child), &dot_dot);
 	if (!ino)
 		return ERR_PTR(-ENOENT);
-	return d_obtain_alias(ufs_iget(d_inode(child)->i_sb, ino));
+	return d_obtain_alias(ufs_iget(child->d_sb, ino));
 }
 
 static const struct export_operations ufs_export_ops = {
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 4e4b2fa78609..09c71e9aaebf 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -872,7 +872,7 @@ TRACE_EVENT(ext4_sync_file_enter,
 	TP_fast_assign(
 		struct dentry *dentry = file->f_path.dentry;
 
-		__entry->dev		= d_inode(dentry)->i_sb->s_dev;
+		__entry->dev		= dentry->d_sb->s_dev;
 		__entry->ino		= d_inode(dentry)->i_ino;
 		__entry->datasync	= datasync;
 		__entry->parent		= d_inode(dentry->d_parent)->i_ino;
@@ -1451,7 +1451,7 @@ TRACE_EVENT(ext4_unlink_enter,
 	),
 
 	TP_fast_assign(
-		__entry->dev		= d_inode(dentry)->i_sb->s_dev;
+		__entry->dev		= dentry->d_sb->s_dev;
 		__entry->ino		= d_inode(dentry)->i_ino;
 		__entry->parent		= parent->i_ino;
 		__entry->size		= d_inode(dentry)->i_size;
@@ -1475,7 +1475,7 @@ TRACE_EVENT(ext4_unlink_exit,
 	),
 
 	TP_fast_assign(
-		__entry->dev		= d_inode(dentry)->i_sb->s_dev;
+		__entry->dev		= dentry->d_sb->s_dev;
 		__entry->ino		= d_inode(dentry)->i_ino;
 		__entry->ret		= ret;
 	),
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 3cf1c5978d39..d6709eb70970 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -367,7 +367,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
 	inode_unlock(d_backing_inode(parent->dentry));
 	if (d_is_positive(d)) {
 		/* update watch filter fields */
-		watch->dev = d_backing_inode(d)->i_sb->s_dev;
+		watch->dev = d->d_sb->s_dev;
 		watch->ino = d_backing_inode(d)->i_ino;
 	}
 	dput(d);
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index e6ea9d4b1de9..84c6d11fc096 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -299,8 +299,8 @@ static int evm_protect_xattr(struct dentry *dentry, const char *xattr_name,
 			return 0;
 
 		/* exception for pseudo filesystems */
-		if (dentry->d_inode->i_sb->s_magic == TMPFS_MAGIC
-		    || dentry->d_inode->i_sb->s_magic == SYSFS_MAGIC)
+		if (dentry->d_sb->s_magic == TMPFS_MAGIC
+		    || dentry->d_sb->s_magic == SYSFS_MAGIC)
 			return 0;
 
 		integrity_audit_msg(AUDIT_INTEGRITY_METADATA,
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 912deee3f01e..889cd59ca5a7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1316,7 +1316,7 @@ static int selinux_genfs_get_sid(struct dentry *dentry,
 				 u32 *sid)
 {
 	int rc;
-	struct super_block *sb = dentry->d_inode->i_sb;
+	struct super_block *sb = dentry->d_sb;
 	char *buffer, *path;
 
 	buffer = (char *)__get_free_page(GFP_KERNEL);
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 11f79013ae1f..50bcca26c0b7 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1444,7 +1444,7 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name)
 	 *	XATTR_NAME_SMACKIPOUT
 	 */
 	if (strcmp(name, XATTR_NAME_SMACK) == 0) {
-		struct super_block *sbp = d_backing_inode(dentry)->i_sb;
+		struct super_block *sbp = dentry->d_sb;
 		struct superblock_smack *sbsp = sbp->s_security;
 
 		isp->smk_inode = sbsp->smk_default;
-- 
cgit v1.2.3


From 4923ec0b10d998349c2ac4b38aa4674e539e6f92 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 6 Apr 2016 19:39:21 -0700
Subject: bpf: simplify verifier register state assignments

verifier is using the following structure to track the state of registers:
struct reg_state {
    enum bpf_reg_type type;
    union {
        int imm;
        struct bpf_map *map_ptr;
    };
};
and later on in states_equal() does memcmp(&old->regs[i], &cur->regs[i],..)
to find equivalent states.
Throughout the code of verifier there are assignements to 'imm' and 'map_ptr'
fields and it's not obvious that most of the assignments into 'imm' don't
need to clear extra 4 bytes (like mark_reg_unknown_value() does) to make sure
that memcmp doesn't go over junk left from 'map_ptr' assignment.

Simplify the code by converting 'int' into 'long'

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8233021538d3..6c5d7cd4cb0e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -142,7 +142,7 @@ struct reg_state {
 	enum bpf_reg_type type;
 	union {
 		/* valid when type == CONST_IMM | PTR_TO_STACK */
-		int imm;
+		long imm;
 
 		/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
 		 *   PTR_TO_MAP_VALUE_OR_NULL
@@ -263,7 +263,7 @@ static void print_verifier_state(struct verifier_env *env)
 			continue;
 		verbose(" R%d=%s", i, reg_type_str[t]);
 		if (t == CONST_IMM || t == PTR_TO_STACK)
-			verbose("%d", env->cur_state.regs[i].imm);
+			verbose("%ld", env->cur_state.regs[i].imm);
 		else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
 			 t == PTR_TO_MAP_VALUE_OR_NULL)
 			verbose("(ks=%d,vs=%d)",
@@ -480,7 +480,6 @@ static void init_reg_state(struct reg_state *regs)
 	for (i = 0; i < MAX_BPF_REG; i++) {
 		regs[i].type = NOT_INIT;
 		regs[i].imm = 0;
-		regs[i].map_ptr = NULL;
 	}
 
 	/* frame pointer */
@@ -495,7 +494,6 @@ static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
 	BUG_ON(regno >= MAX_BPF_REG);
 	regs[regno].type = UNKNOWN_VALUE;
 	regs[regno].imm = 0;
-	regs[regno].map_ptr = NULL;
 }
 
 enum reg_arg_type {
-- 
cgit v1.2.3


From a511e1af8b12f44c6e55786c463c9f093c214fb6 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 6 Apr 2016 16:14:26 +0100
Subject: KEYS: Move the point of trust determination to __key_link()

Move the point at which a key is determined to be trustworthy to
__key_link() so that we use the contents of the keyring being linked in to
to determine whether the key being linked in is trusted or not.

What is 'trusted' then becomes a matter of what's in the keyring.

Currently, the test is done when the key is parsed, but given that at that
point we can only sensibly refer to the contents of the system trusted
keyring, we can only use that as the basis for working out the
trustworthiness of a new key.

With this change, a trusted keyring is a set of keys that once the
trusted-only flag is set cannot be added to except by verification through
one of the contained keys.

Further, adding a key into a trusted keyring, whilst it might grant
trustworthiness in the context of that keyring, does not automatically
grant trustworthiness in the context of a second keyring to which it could
be secondarily linked.

To accomplish this, the authentication data associated with the key source
must now be retained.  For an X.509 cert, this means the contents of the
AuthorityKeyIdentifier and the signature data.


If system keyrings are disabled then restrict_link_by_builtin_trusted()
resolves to restrict_link_reject().  The integrity digital signature code
still works correctly with this as it was previously using
KEY_FLAG_TRUSTED_ONLY, which doesn't permit anything to be added if there
is no system keyring against which trust can be determined.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 certs/system_keyring.c                   | 20 +++++++++--
 crypto/asymmetric_keys/restrict.c        | 62 ++++++++++++++++----------------
 crypto/asymmetric_keys/x509_parser.h     |  6 ----
 crypto/asymmetric_keys/x509_public_key.c | 21 +----------
 include/crypto/public_key.h              |  7 ++++
 include/keys/system_keyring.h            | 19 ++++------
 kernel/module_signing.c                  |  2 +-
 security/integrity/digsig.c              | 33 ++++++++++++++++-
 security/integrity/ima/ima_mok.c         |  6 ++--
 9 files changed, 100 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/certs/system_keyring.c b/certs/system_keyring.c
index 417d65882870..4e2fa8ab01d6 100644
--- a/certs/system_keyring.c
+++ b/certs/system_keyring.c
@@ -18,12 +18,26 @@
 #include <keys/system_keyring.h>
 #include <crypto/pkcs7.h>
 
-struct key *system_trusted_keyring;
-EXPORT_SYMBOL_GPL(system_trusted_keyring);
+static struct key *system_trusted_keyring;
 
 extern __initconst const u8 system_certificate_list[];
 extern __initconst const unsigned long system_certificate_list_size;
 
+/**
+ * restrict_link_by_builtin_trusted - Restrict keyring addition by system CA
+ *
+ * Restrict the addition of keys into a keyring based on the key-to-be-added
+ * being vouched for by a key in the system keyring.
+ */
+int restrict_link_by_builtin_trusted(struct key *keyring,
+				     const struct key_type *type,
+				     unsigned long flags,
+				     const union key_payload *payload)
+{
+	return restrict_link_by_signature(system_trusted_keyring,
+					  type, payload);
+}
+
 /*
  * Load the compiled-in keys
  */
@@ -37,7 +51,7 @@ static __init int system_trusted_keyring_init(void)
 			      ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
 			      KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
 			      KEY_ALLOC_NOT_IN_QUOTA,
-			      keyring_restrict_trusted_only, NULL);
+			      restrict_link_by_builtin_trusted, NULL);
 	if (IS_ERR(system_trusted_keyring))
 		panic("Can't allocate system trusted keyring\n");
 	return 0;
diff --git a/crypto/asymmetric_keys/restrict.c b/crypto/asymmetric_keys/restrict.c
index b4c10f2f5034..ac4bddf669de 100644
--- a/crypto/asymmetric_keys/restrict.c
+++ b/crypto/asymmetric_keys/restrict.c
@@ -1,6 +1,6 @@
 /* Instantiate a public key crypto key from an X.509 Certificate
  *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2012, 2016 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,20 +9,12 @@
  * 2 of the Licence, or (at your option) any later version.
  */
 
-#define pr_fmt(fmt) "X.509: "fmt
+#define pr_fmt(fmt) "ASYM: "fmt
 #include <linux/module.h>
 #include <linux/kernel.h>
-#include <linux/slab.h>
 #include <linux/err.h>
-#include <linux/mpi.h>
-#include <linux/asn1_decoder.h>
-#include <keys/asymmetric-subtype.h>
-#include <keys/asymmetric-parser.h>
-#include <keys/system_keyring.h>
-#include <crypto/hash.h>
 #include <crypto/public_key.h>
 #include "asymmetric_keys.h"
-#include "x509_parser.h"
 
 static bool use_builtin_keys;
 static struct asymmetric_key_id *ca_keyid;
@@ -62,45 +54,55 @@ static int __init ca_keys_setup(char *str)
 __setup("ca_keys=", ca_keys_setup);
 #endif
 
-/*
+/**
+ * restrict_link_by_signature - Restrict additions to a ring of public keys
+ * @trust_keyring: A ring of keys that can be used to vouch for the new cert.
+ * @type: The type of key being added.
+ * @payload: The payload of the new key.
+ *
  * Check the new certificate against the ones in the trust keyring.  If one of
  * those is the signing key and validates the new certificate, then mark the
  * new certificate as being trusted.
  *
- * Return 0 if the new certificate was successfully validated, 1 if we couldn't
- * find a matching parent certificate in the trusted list and an error if there
- * is a matching certificate but the signature check fails.
+ * Returns 0 if the new certificate was accepted, -ENOKEY if we couldn't find a
+ * matching parent certificate in the trusted list, -EKEYREJECTED if the
+ * signature check fails or the key is blacklisted and some other error if
+ * there is a matching certificate but the signature check cannot be performed.
  */
-int x509_validate_trust(struct x509_certificate *cert,
-			struct key *trust_keyring)
+int restrict_link_by_signature(struct key *trust_keyring,
+			       const struct key_type *type,
+			       const union key_payload *payload)
 {
-	struct public_key_signature *sig = cert->sig;
+	const struct public_key_signature *sig;
 	struct key *key;
-	int ret = 1;
+	int ret;
 
-	if (!sig->auth_ids[0] && !sig->auth_ids[1])
-		return 1;
+	pr_devel("==>%s()\n", __func__);
 
 	if (!trust_keyring)
+		return -ENOKEY;
+
+	if (type != &key_type_asymmetric)
 		return -EOPNOTSUPP;
+
+	sig = payload->data[asym_auth];
+	if (!sig->auth_ids[0] && !sig->auth_ids[1])
+		return 0;
+
 	if (ca_keyid && !asymmetric_key_id_partial(sig->auth_ids[1], ca_keyid))
 		return -EPERM;
-	if (cert->unsupported_sig)
-		return -ENOPKG;
 
+	/* See if we have a key that signed this one. */
 	key = find_asymmetric_key(trust_keyring,
 				  sig->auth_ids[0], sig->auth_ids[1],
 				  false);
 	if (IS_ERR(key))
-		return PTR_ERR(key);
+		return -ENOKEY;
 
-	if (!use_builtin_keys ||
-	    test_bit(KEY_FLAG_BUILTIN, &key->flags)) {
-		ret = verify_signature(key, cert->sig);
-		if (ret == -ENOPKG)
-			cert->unsupported_sig = true;
-	}
+	if (use_builtin_keys && !test_bit(KEY_FLAG_BUILTIN, &key->flags))
+		ret = -ENOKEY;
+	else
+		ret = verify_signature(key, sig);
 	key_put(key);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(x509_validate_trust);
diff --git a/crypto/asymmetric_keys/x509_parser.h b/crypto/asymmetric_keys/x509_parser.h
index 7a802b09a509..05eef1c68881 100644
--- a/crypto/asymmetric_keys/x509_parser.h
+++ b/crypto/asymmetric_keys/x509_parser.h
@@ -58,9 +58,3 @@ extern int x509_decode_time(time64_t *_t,  size_t hdrlen,
  */
 extern int x509_get_sig_params(struct x509_certificate *cert);
 extern int x509_check_for_self_signed(struct x509_certificate *cert);
-
-/*
- * public_key_trust.c
- */
-extern int x509_validate_trust(struct x509_certificate *cert,
-			       struct key *trust_keyring);
diff --git a/crypto/asymmetric_keys/x509_public_key.c b/crypto/asymmetric_keys/x509_public_key.c
index 6d7f42f0de9a..fb732296cd36 100644
--- a/crypto/asymmetric_keys/x509_public_key.c
+++ b/crypto/asymmetric_keys/x509_public_key.c
@@ -178,31 +178,12 @@ static int x509_key_preparse(struct key_preparsed_payload *prep)
 
 	cert->pub->id_type = "X509";
 
-	/* See if we can derive the trustability of this certificate.
-	 *
-	 * When it comes to self-signed certificates, we cannot evaluate
-	 * trustedness except by the fact that we obtained it from a trusted
-	 * location.  So we just rely on x509_validate_trust() failing in this
-	 * case.
-	 *
-	 * Note that there's a possibility of a self-signed cert matching a
-	 * cert that we have (most likely a duplicate that we already trust) -
-	 * in which case it will be marked trusted.
-	 */
-	if (cert->unsupported_sig || cert->self_signed) {
+	if (cert->unsupported_sig) {
 		public_key_signature_free(cert->sig);
 		cert->sig = NULL;
 	} else {
 		pr_devel("Cert Signature: %s + %s\n",
 			 cert->sig->pkey_algo, cert->sig->hash_algo);
-
-		ret = x509_validate_trust(cert, get_system_trusted_keyring());
-		if (ret)
-			ret = x509_validate_trust(cert, get_ima_mok_keyring());
-		if (ret == -EKEYREJECTED)
-			goto error_free_cert;
-		if (!ret)
-			prep->trusted = true;
 	}
 
 	/* Propose a description */
diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h
index 96ef27b8dd41..882ca0e1e7a5 100644
--- a/include/crypto/public_key.h
+++ b/include/crypto/public_key.h
@@ -47,6 +47,13 @@ extern void public_key_signature_free(struct public_key_signature *sig);
 extern struct asymmetric_key_subtype public_key_subtype;
 
 struct key;
+struct key_type;
+union key_payload;
+
+extern int restrict_link_by_signature(struct key *trust_keyring,
+				      const struct key_type *type,
+				      const union key_payload *payload);
+
 extern int verify_signature(const struct key *key,
 			    const struct public_key_signature *sig);
 
diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h
index b2d645ac35a0..93715913a0b1 100644
--- a/include/keys/system_keyring.h
+++ b/include/keys/system_keyring.h
@@ -12,22 +12,17 @@
 #ifndef _KEYS_SYSTEM_KEYRING_H
 #define _KEYS_SYSTEM_KEYRING_H
 
+#include <linux/key.h>
+
 #ifdef CONFIG_SYSTEM_TRUSTED_KEYRING
 
-#include <linux/key.h>
-#include <linux/verification.h>
-#include <crypto/public_key.h>
+extern int restrict_link_by_builtin_trusted(struct key *keyring,
+					    const struct key_type *type,
+					    unsigned long flags,
+					    const union key_payload *payload);
 
-extern struct key *system_trusted_keyring;
-static inline struct key *get_system_trusted_keyring(void)
-{
-	return system_trusted_keyring;
-}
 #else
-static inline struct key *get_system_trusted_keyring(void)
-{
-	return NULL;
-}
+#define restrict_link_by_builtin_trusted restrict_link_reject
 #endif
 
 #ifdef CONFIG_IMA_MOK_KEYRING
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index 6a64e03b9f44..937c844bee4a 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -12,7 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/string.h>
-#include <keys/system_keyring.h>
+#include <linux/verification.h>
 #include <crypto/public_key.h>
 #include "module-internal.h"
 
diff --git a/security/integrity/digsig.c b/security/integrity/digsig.c
index 659566c2200b..d647178c6bbd 100644
--- a/security/integrity/digsig.c
+++ b/security/integrity/digsig.c
@@ -18,6 +18,8 @@
 #include <linux/cred.h>
 #include <linux/key-type.h>
 #include <linux/digsig.h>
+#include <crypto/public_key.h>
+#include <keys/system_keyring.h>
 
 #include "integrity.h"
 
@@ -40,6 +42,35 @@ static bool init_keyring __initdata = true;
 static bool init_keyring __initdata;
 #endif
 
+#ifdef CONFIG_SYSTEM_TRUSTED_KEYRING
+/*
+ * Restrict the addition of keys into the IMA keyring.
+ *
+ * Any key that needs to go in .ima keyring must be signed by CA in
+ * either .system or .ima_mok keyrings.
+ */
+static int restrict_link_by_ima_mok(struct key *keyring,
+				    const struct key_type *type,
+				    unsigned long flags,
+				    const union key_payload *payload)
+{
+	int ret;
+
+	ret = restrict_link_by_builtin_trusted(keyring, type, flags, payload);
+	if (ret != -ENOKEY)
+		return ret;
+
+	return restrict_link_by_signature(get_ima_mok_keyring(),
+					  type, payload);
+}
+#else
+/*
+ * If there's no system trusted keyring, then keys cannot be loaded into
+ * .ima_mok and added keys cannot be marked trusted.
+ */
+#define restrict_link_by_ima_mok restrict_link_reject
+#endif
+
 int integrity_digsig_verify(const unsigned int id, const char *sig, int siglen,
 			    const char *digest, int digestlen)
 {
@@ -84,7 +115,7 @@ int __init integrity_init_keyring(const unsigned int id)
 				     KEY_USR_VIEW | KEY_USR_READ |
 				     KEY_USR_WRITE | KEY_USR_SEARCH),
 				    KEY_ALLOC_NOT_IN_QUOTA,
-				    NULL, NULL);
+				    restrict_link_by_ima_mok, NULL);
 	if (IS_ERR(keyring[id])) {
 		err = PTR_ERR(keyring[id]);
 		pr_info("Can't allocate %s keyring (%d)\n",
diff --git a/security/integrity/ima/ima_mok.c b/security/integrity/ima/ima_mok.c
index ef91248cb934..2988726d30d6 100644
--- a/security/integrity/ima/ima_mok.c
+++ b/security/integrity/ima/ima_mok.c
@@ -17,7 +17,7 @@
 #include <linux/cred.h>
 #include <linux/err.h>
 #include <linux/init.h>
-#include <keys/asymmetric-type.h>
+#include <keys/system_keyring.h>
 
 
 struct key *ima_mok_keyring;
@@ -36,7 +36,7 @@ __init int ima_mok_init(void)
 			      KEY_USR_VIEW | KEY_USR_READ |
 			      KEY_USR_WRITE | KEY_USR_SEARCH,
 			      KEY_ALLOC_NOT_IN_QUOTA,
-			      keyring_restrict_trusted_only, NULL);
+			      restrict_link_by_builtin_trusted, NULL);
 
 	ima_blacklist_keyring = keyring_alloc(".ima_blacklist",
 				KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
@@ -44,7 +44,7 @@ __init int ima_mok_init(void)
 				KEY_USR_VIEW | KEY_USR_READ |
 				KEY_USR_WRITE | KEY_USR_SEARCH,
 				KEY_ALLOC_NOT_IN_QUOTA,
-				keyring_restrict_trusted_only, NULL);
+				restrict_link_by_builtin_trusted, NULL);
 
 	if (IS_ERR(ima_mok_keyring) || IS_ERR(ima_blacklist_keyring))
 		panic("Can't allocate IMA MOK or blacklist keyrings.");
-- 
cgit v1.2.3


From 1f190931893a98ffd5d4cfdfbfc2452ad0ed3e1b Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 12 Apr 2016 08:47:17 -0700
Subject: locking/locktorture: Fix deboosting NULL pointer dereference

For the case of rtmutex torturing we will randomly call into the
boost() handler, including upon module exiting when the tasks are
deboosted before stopping. In such cases the task may or may not have
already been boosted, and therefore the NULL being explicitly passed
can occur anywhere. Currently we only assume that the task will is
at a higher prio, and in consequence, dereference a NULL pointer.

This patch fixes the case of a rmmod locktorture exploding while
pounding on the rtmutex lock (partial trace):

 task: ffff88081026cf80 ti: ffff880816120000 task.ti: ffff880816120000
 RSP: 0018:ffff880816123eb0  EFLAGS: 00010206
 RAX: ffff88081026cf80 RBX: ffff880816bfa630 RCX: 0000000000160d1b
 RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000
 RBP: ffff88081026cf80 R08: 000000000000001f R09: ffff88017c20ca80
 R10: 0000000000000000 R11: 000000000048c316 R12: ffffffffa05d1840
 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
 FS:  0000000000000000(0000) GS:ffff88203f880000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000008 CR3: 0000000001c0a000 CR4: 00000000000406e0
 Stack:
  ffffffffa05d141d ffff880816bfa630 ffffffffa05d1922 ffff88081e70c2c0
  ffff880816bfa630 ffffffff81095fed 0000000000000000 ffffffff8107bf60
  ffff880816bfa630 ffffffff00000000 ffff880800000000 ffff880816123f08
 Call Trace:
  [<ffffffff81095fed>] kthread+0xbd/0xe0
  [<ffffffff815cf40f>] ret_from_fork+0x3f/0x70

This patch ensures that if the random state pointer is not NULL and current
is not boosted, then do nothing.

 RIP: 0010:[<ffffffffa05c6185>]  [<ffffffffa05c6185>] torture_random+0x5/0x60 [torture]
  [<ffffffffa05d141d>] torture_rtmutex_boost+0x1d/0x90 [locktorture]
  [<ffffffffa05d1922>] lock_torture_writer+0xe2/0x170 [locktorture]

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bobby.prani@gmail.com
Cc: dhowells@redhat.com
Cc: dipankar@in.ibm.com
Cc: dvhart@linux.intel.com
Cc: edumazet@google.com
Cc: fweisbec@gmail.com
Cc: jiangshanlai@gmail.com
Cc: josh@joshtriplett.org
Cc: mathieu.desnoyers@efficios.com
Cc: oleg@redhat.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/1460476038-27060-1-git-send-email-paulmck@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/locktorture.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 8ef1919d63b2..9e9c5f454f5c 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -394,12 +394,12 @@ static void torture_rtmutex_boost(struct torture_random_state *trsp)
 
 	if (!rt_task(current)) {
 		/*
-		 * (1) Boost priority once every ~50k operations. When the
+		 * Boost priority once every ~50k operations. When the
 		 * task tries to take the lock, the rtmutex it will account
 		 * for the new priority, and do any corresponding pi-dance.
 		 */
-		if (!(torture_random(trsp) %
-		      (cxt.nrealwriters_stress * factor))) {
+		if (trsp && !(torture_random(trsp) %
+			      (cxt.nrealwriters_stress * factor))) {
 			policy = SCHED_FIFO;
 			param.sched_priority = MAX_RT_PRIO - 1;
 		} else /* common case, do nothing */
-- 
cgit v1.2.3


From c1c33b92db4fb274dfbff778ccf2459e4bebd48e Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 12 Apr 2016 08:47:18 -0700
Subject: locking/locktorture: Fix NULL pointer dereference for cleanup paths

It has been found that paths that invoke cleanups through
lock_torture_cleanup() can trigger NULL pointer dereferencing
bugs during the statistics printing phase. This is mainly
because we should not be calling into statistics before we are
sure things have been set up correctly.

Specifically, early checks (and the need for handling this in
the cleanup call) only include parameter checks and basic
statistics allocation. Once we start write/read kthreads
we then consider the test as started. As such, update the function
in question to check for cxt.lwsa writer stats, if not set,
we either have a bogus parameter or -ENOMEM situation and
therefore only need to deal with general torture calls.

Reported-and-tested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bobby.prani@gmail.com
Cc: dhowells@redhat.com
Cc: dipankar@in.ibm.com
Cc: dvhart@linux.intel.com
Cc: edumazet@google.com
Cc: fweisbec@gmail.com
Cc: jiangshanlai@gmail.com
Cc: josh@joshtriplett.org
Cc: mathieu.desnoyers@efficios.com
Cc: oleg@redhat.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/1460476038-27060-2-git-send-email-paulmck@linux.vnet.ibm.com
[ Improved the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/locktorture.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 9e9c5f454f5c..d066a50dc87e 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -748,6 +748,15 @@ static void lock_torture_cleanup(void)
 	if (torture_cleanup_begin())
 		return;
 
+	/*
+	 * Indicates early cleanup, meaning that the test has not run,
+	 * such as when passing bogus args when loading the module. As
+	 * such, only perform the underlying torture-specific cleanups,
+	 * and avoid anything related to locktorture.
+	 */
+	if (!cxt.lwsa)
+		goto end;
+
 	if (writer_tasks) {
 		for (i = 0; i < cxt.nrealwriters_stress; i++)
 			torture_stop_kthread(lock_torture_writer,
@@ -776,6 +785,7 @@ static void lock_torture_cleanup(void)
 	else
 		lock_torture_print_module_parms(cxt.cur_ops,
 						"End of test: SUCCESS");
+end:
 	torture_cleanup_end();
 }
 
@@ -870,6 +880,7 @@ static int __init lock_torture_init(void)
 			VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
 			firsterr = -ENOMEM;
 			kfree(cxt.lwsa);
+			cxt.lwsa = NULL;
 			goto unwind;
 		}
 
@@ -878,6 +889,7 @@ static int __init lock_torture_init(void)
 			cxt.lrsa[i].n_lock_acquired = 0;
 		}
 	}
+
 	lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
 
 	/* Prepare torture context. */
-- 
cgit v1.2.3


From c003ed928962a55eb446e78c544b1d7c4f6cb88a Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Fri, 8 Apr 2016 20:58:46 +0200
Subject: locking/lockdep: Deinline register_lock_class(), save 2328 bytes

This function compiles to 1328 bytes of machine code. Three callsites.

Registering a new lock class is definitely not *that* time-critical to inline it.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1460141926-13069-5-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index ed9410936a22..7cc43ef856c1 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -708,7 +708,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
  * yet. Otherwise we look it up. We cache the result in the lock object
  * itself, so actual lookup of the hash should be once per lock object.
  */
-static inline struct lock_class *
+static struct lock_class *
 register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 {
 	struct lockdep_subclass_key *key;
-- 
cgit v1.2.3


From f8e04d854506ddfeba9cb41b601972b28521f104 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 7 Apr 2016 17:12:21 +0200
Subject: locking/rwsem: Get rid of __down_write_nested()

This is no longer used anywhere and all callers (__down_write()) use
0 as a subclass. Ditch __down_write_nested() to make the code easier
to follow.

This shouldn't introduce any functional change.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Davidlohr Bueso <dave@stgolabs.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Zankel <chris@zankel.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Signed-off-by: Jason Low <jason.low2@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-alpha@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-ia64@vger.kernel.org
Cc: linux-s390@vger.kernel.org
Cc: linux-sh@vger.kernel.org
Cc: linux-xtensa@linux-xtensa.org
Cc: sparclinux@vger.kernel.org
Link: http://lkml.kernel.org/r/1460041951-22347-2-git-send-email-mhocko@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/s390/include/asm/rwsem.h   | 7 +------
 arch/sh/include/asm/rwsem.h     | 5 -----
 arch/sparc/include/asm/rwsem.h  | 7 +------
 arch/x86/include/asm/rwsem.h    | 7 +------
 include/asm-generic/rwsem.h     | 7 +------
 include/linux/rwsem-spinlock.h  | 1 -
 kernel/locking/rwsem-spinlock.c | 7 +------
 7 files changed, 5 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index fead491dfc28..555d23b6b6d1 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -90,7 +90,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 /*
  * lock for writing
  */
-static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
+static inline void __down_write(struct rw_semaphore *sem)
 {
 	signed long old, new, tmp;
 
@@ -108,11 +108,6 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
 		rwsem_down_write_failed(sem);
 }
 
-static inline void __down_write(struct rw_semaphore *sem)
-{
-	__down_write_nested(sem, 0);
-}
-
 /*
  * trylock for writing -- returns 1 if successful, 0 if contention
  */
diff --git a/arch/sh/include/asm/rwsem.h b/arch/sh/include/asm/rwsem.h
index edab57265293..a5104bebd1eb 100644
--- a/arch/sh/include/asm/rwsem.h
+++ b/arch/sh/include/asm/rwsem.h
@@ -114,11 +114,6 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 		rwsem_downgrade_wake(sem);
 }
 
-static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
-{
-	__down_write(sem);
-}
-
 /*
  * implement exchange and add functionality
  */
diff --git a/arch/sparc/include/asm/rwsem.h b/arch/sparc/include/asm/rwsem.h
index 069bf4d663a1..e5a0d575bc7f 100644
--- a/arch/sparc/include/asm/rwsem.h
+++ b/arch/sparc/include/asm/rwsem.h
@@ -45,7 +45,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 /*
  * lock for writing
  */
-static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
+static inline void __down_write(struct rw_semaphore *sem)
 {
 	long tmp;
 
@@ -55,11 +55,6 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
 		rwsem_down_write_failed(sem);
 }
 
-static inline void __down_write(struct rw_semaphore *sem)
-{
-	__down_write_nested(sem, 0);
-}
-
 static inline int __down_write_trylock(struct rw_semaphore *sem)
 {
 	long tmp;
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index ceec86eb68e9..4a8292a0d6e1 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -99,7 +99,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 /*
  * lock for writing
  */
-static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
+static inline void __down_write(struct rw_semaphore *sem)
 {
 	long tmp;
 	asm volatile("# beginning down_write\n\t"
@@ -116,11 +116,6 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
 		     : "memory", "cc");
 }
 
-static inline void __down_write(struct rw_semaphore *sem)
-{
-	__down_write_nested(sem, 0);
-}
-
 /*
  * trylock for writing -- returns 1 if successful, 0 if contention
  */
diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h
index d6d5dc98d7da..b8d8a6cf4ca8 100644
--- a/include/asm-generic/rwsem.h
+++ b/include/asm-generic/rwsem.h
@@ -53,7 +53,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 /*
  * lock for writing
  */
-static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
+static inline void __down_write(struct rw_semaphore *sem)
 {
 	long tmp;
 
@@ -63,11 +63,6 @@ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
 		rwsem_down_write_failed(sem);
 }
 
-static inline void __down_write(struct rw_semaphore *sem)
-{
-	__down_write_nested(sem, 0);
-}
-
 static inline int __down_write_trylock(struct rw_semaphore *sem)
 {
 	long tmp;
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index 561e8615528d..a733a5467e6c 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -34,7 +34,6 @@ struct rw_semaphore {
 extern void __down_read(struct rw_semaphore *sem);
 extern int __down_read_trylock(struct rw_semaphore *sem);
 extern void __down_write(struct rw_semaphore *sem);
-extern void __down_write_nested(struct rw_semaphore *sem, int subclass);
 extern int __down_write_trylock(struct rw_semaphore *sem);
 extern void __up_read(struct rw_semaphore *sem);
 extern void __up_write(struct rw_semaphore *sem);
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 3a5048572065..bab26104a5d0 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -191,7 +191,7 @@ int __down_read_trylock(struct rw_semaphore *sem)
 /*
  * get a write lock on the semaphore
  */
-void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
+void __sched __down_write(struct rw_semaphore *sem)
 {
 	struct rwsem_waiter waiter;
 	struct task_struct *tsk;
@@ -227,11 +227,6 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
 	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
 }
 
-void __sched __down_write(struct rw_semaphore *sem)
-{
-	__down_write_nested(sem, 0);
-}
-
 /*
  * trylock for writing -- returns 1 if successful, 0 if contention
  */
-- 
cgit v1.2.3


From d47996082f52baa0ca8b48d26b3cbef5ede70a73 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 7 Apr 2016 17:12:26 +0200
Subject: locking/rwsem: Introduce basis for down_write_killable()

Introduce a generic implementation necessary for down_write_killable().

This is a trivial extension of the already existing down_write() call
which can be interrupted by SIGKILL.  This patch doesn't provide
down_write_killable() yet because arches have to provide the necessary
pieces before.

rwsem_down_write_failed() which is a generic slow path for the
write lock is extended to take a task state and renamed to
__rwsem_down_write_failed_common(). The return value is either a valid
semaphore pointer or ERR_PTR(-EINTR).

rwsem_down_write_failed_killable() is exported as a new way to wait for
the lock and be killable.

For rwsem-spinlock implementation the current __down_write() it updated
in a similar way as __rwsem_down_write_failed_common() except it doesn't
need new exports just visible __down_write_killable().

Architectures which are not using the generic rwsem implementation are
supposed to provide their __down_write_killable() implementation and
use rwsem_down_write_failed_killable() for the slow path.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Zankel <chris@zankel.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Signed-off-by: Jason Low <jason.low2@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-alpha@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-ia64@vger.kernel.org
Cc: linux-s390@vger.kernel.org
Cc: linux-sh@vger.kernel.org
Cc: linux-xtensa@linux-xtensa.org
Cc: sparclinux@vger.kernel.org
Link: http://lkml.kernel.org/r/1460041951-22347-7-git-send-email-mhocko@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/rwsem.h     | 12 ++++++++++++
 include/linux/rwsem-spinlock.h  |  1 +
 include/linux/rwsem.h           |  2 ++
 kernel/locking/rwsem-spinlock.c | 22 ++++++++++++++++++++--
 kernel/locking/rwsem-xadd.c     | 31 +++++++++++++++++++++++++------
 5 files changed, 60 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h
index b8d8a6cf4ca8..3fc94a046bf5 100644
--- a/include/asm-generic/rwsem.h
+++ b/include/asm-generic/rwsem.h
@@ -63,6 +63,18 @@ static inline void __down_write(struct rw_semaphore *sem)
 		rwsem_down_write_failed(sem);
 }
 
+static inline int __down_write_killable(struct rw_semaphore *sem)
+{
+	long tmp;
+
+	tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
+				     (atomic_long_t *)&sem->count);
+	if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
+		if (IS_ERR(rwsem_down_write_failed_killable(sem)))
+			return -EINTR;
+	return 0;
+}
+
 static inline int __down_write_trylock(struct rw_semaphore *sem)
 {
 	long tmp;
diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index a733a5467e6c..ae0528b834cd 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -34,6 +34,7 @@ struct rw_semaphore {
 extern void __down_read(struct rw_semaphore *sem);
 extern int __down_read_trylock(struct rw_semaphore *sem);
 extern void __down_write(struct rw_semaphore *sem);
+extern int __must_check __down_write_killable(struct rw_semaphore *sem);
 extern int __down_write_trylock(struct rw_semaphore *sem);
 extern void __up_read(struct rw_semaphore *sem);
 extern void __up_write(struct rw_semaphore *sem);
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 8f498cdde280..7d7ae029dac5 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -14,6 +14,7 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/atomic.h>
+#include <linux/err.h>
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 #include <linux/osq_lock.h>
 #endif
@@ -43,6 +44,7 @@ struct rw_semaphore {
 
 extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
+extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem);
 extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
 extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
 
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index bab26104a5d0..1591f6b3539f 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -191,11 +191,12 @@ int __down_read_trylock(struct rw_semaphore *sem)
 /*
  * get a write lock on the semaphore
  */
-void __sched __down_write(struct rw_semaphore *sem)
+int __sched __down_write_common(struct rw_semaphore *sem, int state)
 {
 	struct rwsem_waiter waiter;
 	struct task_struct *tsk;
 	unsigned long flags;
+	int ret = 0;
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
@@ -215,16 +216,33 @@ void __sched __down_write(struct rw_semaphore *sem)
 		 */
 		if (sem->count == 0)
 			break;
-		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+		if (signal_pending_state(state, current)) {
+			ret = -EINTR;
+			goto out;
+		}
+		set_task_state(tsk, state);
 		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
 		schedule();
 		raw_spin_lock_irqsave(&sem->wait_lock, flags);
 	}
 	/* got the lock */
 	sem->count = -1;
+out:
 	list_del(&waiter.list);
 
 	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+	return ret;
+}
+
+void __sched __down_write(struct rw_semaphore *sem)
+{
+	__down_write_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+int __sched __down_write_killable(struct rw_semaphore *sem)
+{
+	return __down_write_common(sem, TASK_KILLABLE);
 }
 
 /*
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index a4d4de05b2d1..df4dcb883b50 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -433,12 +433,13 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
 /*
  * Wait until we successfully acquire the write lock
  */
-__visible
-struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
+static inline struct rw_semaphore *
+__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
 {
 	long count;
 	bool waiting = true; /* any queued threads before us */
 	struct rwsem_waiter waiter;
+	struct rw_semaphore *ret = sem;
 
 	/* undo write bias from down_write operation, stop active locking */
 	count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
@@ -478,7 +479,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
 		count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
 
 	/* wait until we successfully acquire the lock */
-	set_current_state(TASK_UNINTERRUPTIBLE);
+	set_current_state(state);
 	while (true) {
 		if (rwsem_try_write_lock(count, sem))
 			break;
@@ -486,21 +487,39 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
 
 		/* Block until there are no active lockers. */
 		do {
+			if (signal_pending_state(state, current)) {
+				raw_spin_lock_irq(&sem->wait_lock);
+				ret = ERR_PTR(-EINTR);
+				goto out;
+			}
 			schedule();
-			set_current_state(TASK_UNINTERRUPTIBLE);
+			set_current_state(state);
 		} while ((count = sem->count) & RWSEM_ACTIVE_MASK);
 
 		raw_spin_lock_irq(&sem->wait_lock);
 	}
+out:
 	__set_current_state(TASK_RUNNING);
-
 	list_del(&waiter.list);
 	raw_spin_unlock_irq(&sem->wait_lock);
 
-	return sem;
+	return ret;
+}
+
+__visible struct rw_semaphore * __sched
+rwsem_down_write_failed(struct rw_semaphore *sem)
+{
+	return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(rwsem_down_write_failed);
 
+__visible struct rw_semaphore * __sched
+rwsem_down_write_failed_killable(struct rw_semaphore *sem)
+{
+	return __rwsem_down_write_failed_common(sem, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(rwsem_down_write_failed_killable);
+
 /*
  * handle waking up a waiter on the semaphore
  * - up_read/up_write has decremented the active part of count if we come here
-- 
cgit v1.2.3


From fb90a6e93c0684ab2629a42462400603aa829b9c Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabinv@axis.com>
Date: Mon, 4 Apr 2016 15:42:02 +0200
Subject: sched/debug: Don't dump sched debug info in SysRq-W

sysrq_sched_debug_show() can dump a lot of information.  Don't print out
all that if we're just trying to get a list of blocked tasks (SysRq-W).
The information is still accessible with SysRq-T.

Signed-off-by: Rabin Vincent <rabinv@axis.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1459777322-30902-1-git-send-email-rabin.vincent@axis.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 11594230ef4d..06efbb9c9544 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5050,7 +5050,8 @@ void show_state_filter(unsigned long state_filter)
 	touch_all_softlockup_watchdogs();
 
 #ifdef CONFIG_SCHED_DEBUG
-	sysrq_sched_debug_show();
+	if (!state_filter)
+		sysrq_sched_debug_show();
 #endif
 	rcu_read_unlock();
 	/*
-- 
cgit v1.2.3


From c78b17e28cc2c2df74264afc408bdc6aaf3fbcc8 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Mon, 11 Apr 2016 16:38:33 +0200
Subject: sched/clock: Remove pointless test in cpu_clock/local_clock

In case the HAVE_UNSTABLE_SCHED_CLOCK config is set, the cpu_clock() version
checks if sched_clock_stable() is not set and calls sched_clock_cpu(),
otherwise it calls sched_clock().

sched_clock_cpu() checks also if sched_clock_stable() is set and, if true,
calls sched_clock().

sched_clock() will be called in sched_clock_cpu() if sched_clock_stable() is
true.

Remove the duplicate test by directly calling sched_clock_cpu() and let the
static key act in this function instead.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1460385514-14700-1-git-send-email-daniel.lezcano@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/clock.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index fedb967a9841..30c4b202f0ba 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -375,10 +375,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
  */
 u64 cpu_clock(int cpu)
 {
-	if (!sched_clock_stable())
-		return sched_clock_cpu(cpu);
-
-	return sched_clock();
+	return sched_clock_cpu(cpu);
 }
 
 /*
@@ -390,10 +387,7 @@ u64 cpu_clock(int cpu)
  */
 u64 local_clock(void)
 {
-	if (!sched_clock_stable())
-		return sched_clock_cpu(raw_smp_processor_id());
-
-	return sched_clock();
+	return sched_clock_cpu(raw_smp_processor_id());
 }
 
 #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
-- 
cgit v1.2.3


From 2c923e94cd9c6acff3b22f0ae29cfe65e2658b40 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Mon, 11 Apr 2016 16:38:34 +0200
Subject: sched/clock: Make local_clock()/cpu_clock() inline

The local_clock/cpu_clock functions were changed to prevent a double
identical test with sched_clock_cpu() when HAVE_UNSTABLE_SCHED_CLOCK
is set. That resulted in one line functions.

As these functions are in all the cases one line functions and in the
hot path, it is useful to specify them as static inline in order to
give a strong hint to the compiler.

After verification, it appears the compiler does not inline them
without this hint. Change those functions to static inline.

sched_clock_cpu() is called via the inlined local_clock()/cpu_clock()
functions from sched.h. So any module code including sched.h will
reference sched_clock_cpu(). Thus it must be exported with the
EXPORT_SYMBOL_GPL macro.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1460385514-14700-2-git-send-email-daniel.lezcano@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 32 ++++++++++++++++++++++++++++++--
 kernel/sched/clock.c  | 42 +-----------------------------------------
 2 files changed, 31 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 52c4847b05e2..13c1c1d07270 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2303,8 +2303,6 @@ extern unsigned long long notrace sched_clock(void);
 /*
  * See the comment in kernel/sched/clock.c
  */
-extern u64 cpu_clock(int cpu);
-extern u64 local_clock(void);
 extern u64 running_clock(void);
 extern u64 sched_clock_cpu(int cpu);
 
@@ -2323,6 +2321,16 @@ static inline void sched_clock_idle_sleep_event(void)
 static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
 {
 }
+
+static inline u64 cpu_clock(int cpu)
+{
+	return sched_clock();
+}
+
+static inline u64 local_clock(void)
+{
+	return sched_clock();
+}
 #else
 /*
  * Architectures can set this to 1 if they have specified
@@ -2337,6 +2345,26 @@ extern void clear_sched_clock_stable(void);
 extern void sched_clock_tick(void);
 extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+
+/*
+ * As outlined in clock.c, provides a fast, high resolution, nanosecond
+ * time source that is monotonic per cpu argument and has bounded drift
+ * between cpus.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !!                                                  #
+ * ####################################################################
+ */
+static inline u64 cpu_clock(int cpu)
+{
+	return sched_clock_cpu(cpu);
+}
+
+static inline u64 local_clock(void)
+{
+	return sched_clock_cpu(raw_smp_processor_id());
+}
 #endif
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 30c4b202f0ba..e85a725e5c34 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -318,6 +318,7 @@ u64 sched_clock_cpu(int cpu)
 
 	return clock;
 }
+EXPORT_SYMBOL_GPL(sched_clock_cpu);
 
 void sched_clock_tick(void)
 {
@@ -363,33 +364,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
-/*
- * As outlined at the top, provides a fast, high resolution, nanosecond
- * time source that is monotonic per cpu argument and has bounded drift
- * between cpus.
- *
- * ######################### BIG FAT WARNING ##########################
- * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
- * # go backwards !!                                                  #
- * ####################################################################
- */
-u64 cpu_clock(int cpu)
-{
-	return sched_clock_cpu(cpu);
-}
-
-/*
- * Similar to cpu_clock() for the current cpu. Time will only be observed
- * to be monotonic if care is taken to only compare timestampt taken on the
- * same CPU.
- *
- * See cpu_clock().
- */
-u64 local_clock(void)
-{
-	return sched_clock_cpu(raw_smp_processor_id());
-}
-
 #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
 void sched_clock_init(void)
@@ -404,22 +378,8 @@ u64 sched_clock_cpu(int cpu)
 
 	return sched_clock();
 }
-
-u64 cpu_clock(int cpu)
-{
-	return sched_clock();
-}
-
-u64 local_clock(void)
-{
-	return sched_clock();
-}
-
 #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
-EXPORT_SYMBOL_GPL(cpu_clock);
-EXPORT_SYMBOL_GPL(local_clock);
-
 /*
  * Running clock - returns the time that has elapsed while a guest has been
  * running.
-- 
cgit v1.2.3


From bd92883051a0228cc34996b8e766111ba10c9aac Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 6 Apr 2016 21:59:50 +1000
Subject: sched/cpuacct: Check for NULL when using task_pt_regs()

task_pt_regs() can return NULL for kernel threads, so add a check.
This fixes an oops at boot on ppc64.

Reported-and-Tested-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Tested-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Anton Blanchard <anton@samba.org>
Acked-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: efault@gmx.de
Cc: htejun@gmail.com
Cc: linuxppc-dev@lists.ozlabs.org
Cc: tj@kernel.org
Cc: yangds.fnst@cn.fujitsu.com
Link: http://lkml.kernel.org/r/20160406215950.04bc3f0b@kryten
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cpuacct.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index df947e07aac1..41f85c4d0938 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -316,12 +316,11 @@ static struct cftype files[] = {
 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
 	struct cpuacct *ca;
-	int index;
+	int index = CPUACCT_USAGE_SYSTEM;
+	struct pt_regs *regs = task_pt_regs(tsk);
 
-	if (user_mode(task_pt_regs(tsk)))
+	if (regs && user_mode(regs))
 		index = CPUACCT_USAGE_USER;
-	else
-		index = CPUACCT_USAGE_SYSTEM;
 
 	rcu_read_lock();
 
-- 
cgit v1.2.3


From d82bccc69041a51f7b7b9b4a36db0772f4cdba21 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 12 Apr 2016 10:26:19 -0700
Subject: bpf/verifier: reject invalid LD_ABS | BPF_DW instruction

verifier must check for reserved size bits in instruction opcode and
reject BPF_LD | BPF_ABS | BPF_DW and BPF_LD | BPF_IND | BPF_DW instructions,
otherwise interpreter will WARN_RATELIMIT on them during execution.

Fixes: ddd872bc3098 ("bpf: verifier: add checks for BPF_ABS | BPF_IND instructions")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2e08f8e9b771..618ef77c302a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1374,6 +1374,7 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
 	}
 
 	if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+	    BPF_SIZE(insn->code) == BPF_DW ||
 	    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
 		verbose("BPF_LD_ABS uses reserved fields\n");
 		return -EINVAL;
-- 
cgit v1.2.3


From 04cf31a759ef575f750a63777cee95500e410994 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 24 Mar 2016 22:04:01 +1100
Subject: ftrace: Make ftrace_location_range() global

In order to support live patching on powerpc we would like to call
ftrace_location_range(), so make it global.

Signed-off-by: Torsten Duwe <duwe@suse.de>
Signed-off-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 include/linux/ftrace.h |  1 +
 kernel/trace/ftrace.c  | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 81de7123959d..3481a8e405f9 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -455,6 +455,7 @@ int ftrace_update_record(struct dyn_ftrace *rec, int enable);
 int ftrace_test_record(struct dyn_ftrace *rec, int enable);
 void ftrace_run_stop_machine(int command);
 unsigned long ftrace_location(unsigned long ip);
+unsigned long ftrace_location_range(unsigned long start, unsigned long end);
 unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec);
 unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec);
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eca592f977b2..e1b3f2312db0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1533,7 +1533,19 @@ static int ftrace_cmp_recs(const void *a, const void *b)
 	return 0;
 }
 
-static unsigned long ftrace_location_range(unsigned long start, unsigned long end)
+/**
+ * ftrace_location_range - return the first address of a traced location
+ *	if it touches the given ip range
+ * @start: start of range to search.
+ * @end: end of range to search (inclusive). @end points to the last byte
+ *	to check.
+ *
+ * Returns rec->ip if the related ftrace location is a least partly within
+ * the given address range. That is, the first address of the instruction
+ * that is either a NOP or call to the function tracer. It checks the ftrace
+ * internal tables to determine if the address belongs or not.
+ */
+unsigned long ftrace_location_range(unsigned long start, unsigned long end)
 {
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
-- 
cgit v1.2.3


From 28e7cbd3e0f5fefec892842d1391ebd508fdb5ce Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 24 Mar 2016 22:04:02 +1100
Subject: livepatch: Allow architectures to specify an alternate ftrace
 location

When livepatch tries to patch a function it takes the function address
and asks ftrace to install the livepatch handler at that location.
ftrace will look for an mcount call site at that exact address.

On powerpc the mcount location is not the first instruction of the
function, and in fact it's not at a constant offset from the start of
the function. To accommodate this add a hook which arch code can
override to customise the behaviour.

Signed-off-by: Torsten Duwe <duwe@suse.de>
Signed-off-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 kernel/livepatch/core.c | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index bc2c85c064c1..35ee2c5b7176 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -298,6 +298,19 @@ unlock:
 	rcu_read_unlock();
 }
 
+/*
+ * Convert a function address into the appropriate ftrace location.
+ *
+ * Usually this is just the address of the function, but on some architectures
+ * it's more complicated so allow them to provide a custom behaviour.
+ */
+#ifndef klp_get_ftrace_location
+static unsigned long klp_get_ftrace_location(unsigned long faddr)
+{
+	return faddr;
+}
+#endif
+
 static void klp_disable_func(struct klp_func *func)
 {
 	struct klp_ops *ops;
@@ -312,8 +325,14 @@ static void klp_disable_func(struct klp_func *func)
 		return;
 
 	if (list_is_singular(&ops->func_stack)) {
+		unsigned long ftrace_loc;
+
+		ftrace_loc = klp_get_ftrace_location(func->old_addr);
+		if (WARN_ON(!ftrace_loc))
+			return;
+
 		WARN_ON(unregister_ftrace_function(&ops->fops));
-		WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0));
+		WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0));
 
 		list_del_rcu(&func->stack_node);
 		list_del(&ops->node);
@@ -338,6 +357,15 @@ static int klp_enable_func(struct klp_func *func)
 
 	ops = klp_find_ops(func->old_addr);
 	if (!ops) {
+		unsigned long ftrace_loc;
+
+		ftrace_loc = klp_get_ftrace_location(func->old_addr);
+		if (!ftrace_loc) {
+			pr_err("failed to find location for function '%s'\n",
+				func->old_name);
+			return -EINVAL;
+		}
+
 		ops = kzalloc(sizeof(*ops), GFP_KERNEL);
 		if (!ops)
 			return -ENOMEM;
@@ -352,7 +380,7 @@ static int klp_enable_func(struct klp_func *func)
 		INIT_LIST_HEAD(&ops->func_stack);
 		list_add_rcu(&func->stack_node, &ops->func_stack);
 
-		ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0);
+		ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0);
 		if (ret) {
 			pr_err("failed to set ftrace filter for function '%s' (%d)\n",
 			       func->old_name, ret);
@@ -363,7 +391,7 @@ static int klp_enable_func(struct klp_func *func)
 		if (ret) {
 			pr_err("failed to register ftrace handler for function '%s' (%d)\n",
 			       func->old_name, ret);
-			ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0);
+			ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0);
 			goto err;
 		}
 
-- 
cgit v1.2.3


From 51d7b120418e99d6b3bf8df9eb3cc31e8171dee4 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 14 Apr 2016 12:05:37 -0700
Subject: /proc/iomem: only expose physical resource addresses to privileged
 users

In commit c4004b02f8e5b ("x86: remove the kernel code/data/bss resources
from /proc/iomem") I was hoping to remove the phyiscal kernel address
data from /proc/iomem entirely, but that had to be reverted because some
system programs actually use it.

This limits all the detailed resource information to properly
credentialed users instead.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/resource.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 2e78ead30934..9b5f04404152 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -105,16 +105,25 @@ static int r_show(struct seq_file *m, void *v)
 {
 	struct resource *root = m->private;
 	struct resource *r = v, *p;
+	unsigned long long start, end;
 	int width = root->end < 0x10000 ? 4 : 8;
 	int depth;
 
 	for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
 		if (p->parent == root)
 			break;
+
+	if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) {
+		start = r->start;
+		end = r->end;
+	} else {
+		start = end = 0;
+	}
+
 	seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
 			depth * 2, "",
-			width, (unsigned long long) r->start,
-			width, (unsigned long long) r->end,
+			width, start,
+			width, end,
 			r->name ? r->name : "<BAD>");
 	return 0;
 }
-- 
cgit v1.2.3


From 33ff9823c569f3aceb071071914919177a6bed6a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 13 Apr 2016 00:10:50 +0200
Subject: bpf, verifier: add bpf_call_arg_meta for passing meta data

Currently, when the verifier checks calls in check_call() function, we
call check_func_arg() for all 5 arguments e.g. to make sure expected types
are correct. In some cases, we collect meta data (here: map pointer) to
perform additional checks such as checking stack boundary on key/value
sizes for subsequent arguments. As we're going to extend the meta data,
add a generic struct bpf_call_arg_meta that we can use for passing into
check_func_arg().

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6c5d7cd4cb0e..202f8f738542 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -205,6 +205,10 @@ struct verifier_env {
 #define BPF_COMPLEXITY_LIMIT_INSNS	65536
 #define BPF_COMPLEXITY_LIMIT_STACK	1024
 
+struct bpf_call_arg_meta {
+	struct bpf_map *map_ptr;
+};
+
 /* verbose verifier prints what it's seeing
  * bpf_check() is called under lock, so no race to access these global vars
  */
@@ -822,7 +826,8 @@ static int check_stack_boundary(struct verifier_env *env, int regno,
 }
 
 static int check_func_arg(struct verifier_env *env, u32 regno,
-			  enum bpf_arg_type arg_type, struct bpf_map **mapp)
+			  enum bpf_arg_type arg_type,
+			  struct bpf_call_arg_meta *meta)
 {
 	struct reg_state *reg = env->cur_state.regs + regno;
 	enum bpf_reg_type expected_type;
@@ -875,14 +880,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 
 	if (arg_type == ARG_CONST_MAP_PTR) {
 		/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
-		*mapp = reg->map_ptr;
-
+		meta->map_ptr = reg->map_ptr;
 	} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
 		/* bpf_map_xxx(..., map_ptr, ..., key) call:
 		 * check that [key, key + map->key_size) are within
 		 * stack limits and initialized
 		 */
-		if (!*mapp) {
+		if (!meta->map_ptr) {
 			/* in function declaration map_ptr must come before
 			 * map_key, so that it's verified and known before
 			 * we have to check map_key here. Otherwise it means
@@ -891,19 +895,19 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			verbose("invalid map_ptr to access map->key\n");
 			return -EACCES;
 		}
-		err = check_stack_boundary(env, regno, (*mapp)->key_size,
+		err = check_stack_boundary(env, regno, meta->map_ptr->key_size,
 					   false);
 	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
 		/* bpf_map_xxx(..., map_ptr, ..., value) call:
 		 * check [value, value + map->value_size) validity
 		 */
-		if (!*mapp) {
+		if (!meta->map_ptr) {
 			/* kernel subsystem misconfigured verifier */
 			verbose("invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
-		err = check_stack_boundary(env, regno, (*mapp)->value_size,
-					   false);
+		err = check_stack_boundary(env, regno,
+					   meta->map_ptr->value_size, false);
 	} else if (arg_type == ARG_CONST_STACK_SIZE ||
 		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
@@ -954,8 +958,8 @@ static int check_call(struct verifier_env *env, int func_id)
 	struct verifier_state *state = &env->cur_state;
 	const struct bpf_func_proto *fn = NULL;
 	struct reg_state *regs = state->regs;
-	struct bpf_map *map = NULL;
 	struct reg_state *reg;
+	struct bpf_call_arg_meta meta;
 	int i, err;
 
 	/* find function prototype */
@@ -978,20 +982,22 @@ static int check_call(struct verifier_env *env, int func_id)
 		return -EINVAL;
 	}
 
+	memset(&meta, 0, sizeof(meta));
+
 	/* check args */
-	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map);
+	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
 	if (err)
 		return err;
-	err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
+	err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
 	if (err)
 		return err;
-	err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
+	err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
 	if (err)
 		return err;
-	err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map);
+	err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta);
 	if (err)
 		return err;
-	err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map);
+	err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta);
 	if (err)
 		return err;
 
@@ -1013,18 +1019,18 @@ static int check_call(struct verifier_env *env, int func_id)
 		 * can check 'value_size' boundary of memory access
 		 * to map element returned from bpf_map_lookup_elem()
 		 */
-		if (map == NULL) {
+		if (meta.map_ptr == NULL) {
 			verbose("kernel subsystem misconfigured verifier\n");
 			return -EINVAL;
 		}
-		regs[BPF_REG_0].map_ptr = map;
+		regs[BPF_REG_0].map_ptr = meta.map_ptr;
 	} else {
 		verbose("unknown return type %d of func %d\n",
 			fn->ret_type, func_id);
 		return -EINVAL;
 	}
 
-	err = check_map_func_compatibility(map, func_id);
+	err = check_map_func_compatibility(meta.map_ptr, func_id);
 	if (err)
 		return err;
 
-- 
cgit v1.2.3


From 435faee1aae9c1ac231f89e4faf0437bfe29f425 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 13 Apr 2016 00:10:51 +0200
Subject: bpf, verifier: add ARG_PTR_TO_RAW_STACK type

When passing buffers from eBPF stack space into a helper function, we have
ARG_PTR_TO_STACK argument type for helpers available. The verifier makes sure
that such buffers are initialized, within boundaries, etc.

However, the downside with this is that we have a couple of helper functions
such as bpf_skb_load_bytes() that fill out the passed buffer in the expected
success case anyway, so zero initializing them prior to the helper call is
unneeded/wasted instructions in the eBPF program that can be avoided.

Therefore, add a new helper function argument type called ARG_PTR_TO_RAW_STACK.
The idea is to skip the STACK_MISC check in check_stack_boundary() and color
the related stack slots as STACK_MISC after we checked all call arguments.

Helper functions using ARG_PTR_TO_RAW_STACK must make sure that every path of
the helper function will fill the provided buffer area, so that we cannot leak
any uninitialized stack memory. This f.e. means that error paths need to
memset() the buffers, but the expected fast-path doesn't have to do this
anymore.

Since there's no such helper needing more than at most one ARG_PTR_TO_RAW_STACK
argument, we can keep it simple and don't need to check for multiple areas.
Should in future such a use-case really appear, we have check_raw_mode() that
will make sure we implement support for it first.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   |  5 +++++
 kernel/bpf/verifier.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 59 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b2365a6eba3d..5fb3c610fa96 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -66,6 +66,11 @@ enum bpf_arg_type {
 	 * functions that access data on eBPF program stack
 	 */
 	ARG_PTR_TO_STACK,	/* any pointer to eBPF program stack */
+	ARG_PTR_TO_RAW_STACK,	/* any pointer to eBPF program stack, area does not
+				 * need to be initialized, helper function must fill
+				 * all bytes or clear them in error case.
+				 */
+
 	ARG_CONST_STACK_SIZE,	/* number of bytes accessed from stack */
 	ARG_CONST_STACK_SIZE_OR_ZERO, /* number of bytes accessed from stack or 0 */
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 202f8f738542..9c843a5417da 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -207,6 +207,9 @@ struct verifier_env {
 
 struct bpf_call_arg_meta {
 	struct bpf_map *map_ptr;
+	bool raw_mode;
+	int regno;
+	int access_size;
 };
 
 /* verbose verifier prints what it's seeing
@@ -789,7 +792,8 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
  * and all elements of stack are initialized
  */
 static int check_stack_boundary(struct verifier_env *env, int regno,
-				int access_size, bool zero_size_allowed)
+				int access_size, bool zero_size_allowed,
+				struct bpf_call_arg_meta *meta)
 {
 	struct verifier_state *state = &env->cur_state;
 	struct reg_state *regs = state->regs;
@@ -815,6 +819,12 @@ static int check_stack_boundary(struct verifier_env *env, int regno,
 		return -EACCES;
 	}
 
+	if (meta && meta->raw_mode) {
+		meta->access_size = access_size;
+		meta->regno = regno;
+		return 0;
+	}
+
 	for (i = 0; i < access_size; i++) {
 		if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
 			verbose("invalid indirect read from stack off %d+%d size %d\n",
@@ -859,7 +869,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		expected_type = CONST_PTR_TO_MAP;
 	} else if (arg_type == ARG_PTR_TO_CTX) {
 		expected_type = PTR_TO_CTX;
-	} else if (arg_type == ARG_PTR_TO_STACK) {
+	} else if (arg_type == ARG_PTR_TO_STACK ||
+		   arg_type == ARG_PTR_TO_RAW_STACK) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
 		 * passed in as argument, it's a CONST_IMM type. Final test
@@ -867,6 +878,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		 */
 		if (reg->type == CONST_IMM && reg->imm == 0)
 			expected_type = CONST_IMM;
+		meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
 	} else {
 		verbose("unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
@@ -896,7 +908,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		err = check_stack_boundary(env, regno, meta->map_ptr->key_size,
-					   false);
+					   false, NULL);
 	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
 		/* bpf_map_xxx(..., map_ptr, ..., value) call:
 		 * check [value, value + map->value_size) validity
@@ -907,7 +919,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		err = check_stack_boundary(env, regno,
-					   meta->map_ptr->value_size, false);
+					   meta->map_ptr->value_size,
+					   false, NULL);
 	} else if (arg_type == ARG_CONST_STACK_SIZE ||
 		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
@@ -922,7 +935,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 			return -EACCES;
 		}
 		err = check_stack_boundary(env, regno - 1, reg->imm,
-					   zero_size_allowed);
+					   zero_size_allowed, meta);
 	}
 
 	return err;
@@ -953,6 +966,24 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 	return 0;
 }
 
+static int check_raw_mode(const struct bpf_func_proto *fn)
+{
+	int count = 0;
+
+	if (fn->arg1_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg2_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg3_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg4_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+	if (fn->arg5_type == ARG_PTR_TO_RAW_STACK)
+		count++;
+
+	return count > 1 ? -EINVAL : 0;
+}
+
 static int check_call(struct verifier_env *env, int func_id)
 {
 	struct verifier_state *state = &env->cur_state;
@@ -984,6 +1015,15 @@ static int check_call(struct verifier_env *env, int func_id)
 
 	memset(&meta, 0, sizeof(meta));
 
+	/* We only support one arg being in raw mode at the moment, which
+	 * is sufficient for the helper functions we have right now.
+	 */
+	err = check_raw_mode(fn);
+	if (err) {
+		verbose("kernel subsystem misconfigured func %d\n", func_id);
+		return err;
+	}
+
 	/* check args */
 	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
 	if (err)
@@ -1001,6 +1041,15 @@ static int check_call(struct verifier_env *env, int func_id)
 	if (err)
 		return err;
 
+	/* Mark slots with STACK_MISC in case of raw mode, stack offset
+	 * is inferred from register state.
+	 */
+	for (i = 0; i < meta.access_size; i++) {
+		err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1);
+		if (err)
+			return err;
+	}
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		reg = regs + caller_saved[i];
-- 
cgit v1.2.3


From 074f528eed408b467516e142fa4c45e5b0d2ba16 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 13 Apr 2016 00:10:52 +0200
Subject: bpf: convert relevant helper args to ARG_PTR_TO_RAW_STACK

This patch converts all helpers that can use ARG_PTR_TO_RAW_STACK as argument
type. For tc programs this is bpf_skb_load_bytes(), bpf_skb_get_tunnel_key(),
bpf_skb_get_tunnel_opt(). For tracing, this optimizes bpf_get_current_comm()
and bpf_probe_read(). The check in bpf_skb_load_bytes() for MAX_BPF_STACK can
also be removed since the verifier already makes sure we stay within bounds
on stack buffers.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/helpers.c     | 17 +++++++++++----
 kernel/trace/bpf_trace.c | 10 ++++++---
 net/core/filter.c        | 57 +++++++++++++++++++++++++++++++++---------------
 3 files changed, 60 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 50da680c479f..ad7a0573f71b 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -163,17 +163,26 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5)
 	struct task_struct *task = current;
 	char *buf = (char *) (long) r1;
 
-	if (!task)
-		return -EINVAL;
+	if (unlikely(!task))
+		goto err_clear;
 
-	strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm)));
+	strncpy(buf, task->comm, size);
+
+	/* Verifier guarantees that size > 0. For task->comm exceeding
+	 * size, guarantee that buf is %NUL-terminated. Unconditionally
+	 * done here to save the size test.
+	 */
+	buf[size - 1] = 0;
 	return 0;
+err_clear:
+	memset(buf, 0, size);
+	return -EINVAL;
 }
 
 const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.func		= bpf_get_current_comm,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_STACK,
+	.arg1_type	= ARG_PTR_TO_RAW_STACK,
 	.arg2_type	= ARG_CONST_STACK_SIZE,
 };
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 413ec5614180..685587885374 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -62,17 +62,21 @@ EXPORT_SYMBOL_GPL(trace_call_bpf);
 static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
 	void *dst = (void *) (long) r1;
-	int size = (int) r2;
+	int ret, size = (int) r2;
 	void *unsafe_ptr = (void *) (long) r3;
 
-	return probe_kernel_read(dst, unsafe_ptr, size);
+	ret = probe_kernel_read(dst, unsafe_ptr, size);
+	if (unlikely(ret < 0))
+		memset(dst, 0, size);
+
+	return ret;
 }
 
 static const struct bpf_func_proto bpf_probe_read_proto = {
 	.func		= bpf_probe_read,
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_STACK,
+	.arg1_type	= ARG_PTR_TO_RAW_STACK,
 	.arg2_type	= ARG_CONST_STACK_SIZE,
 	.arg3_type	= ARG_ANYTHING,
 };
diff --git a/net/core/filter.c b/net/core/filter.c
index e8486ba601ea..5d2ac2b9d1c4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1409,16 +1409,19 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	unsigned int len = (unsigned int) r4;
 	void *ptr;
 
-	if (unlikely((u32) offset > 0xffff || len > MAX_BPF_STACK))
-		return -EFAULT;
+	if (unlikely((u32) offset > 0xffff))
+		goto err_clear;
 
 	ptr = skb_header_pointer(skb, offset, len, to);
 	if (unlikely(!ptr))
-		return -EFAULT;
+		goto err_clear;
 	if (ptr != to)
 		memcpy(to, ptr, len);
 
 	return 0;
+err_clear:
+	memset(to, 0, len);
+	return -EFAULT;
 }
 
 static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
@@ -1427,7 +1430,7 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_ANYTHING,
-	.arg3_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_PTR_TO_RAW_STACK,
 	.arg4_type	= ARG_CONST_STACK_SIZE,
 };
 
@@ -1756,12 +1759,19 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
 	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
 	u8 compat[sizeof(struct bpf_tunnel_key)];
+	void *to_orig = to;
+	int err;
 
-	if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6))))
-		return -EINVAL;
-	if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags))
-		return -EPROTO;
+	if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
+		err = -EINVAL;
+		goto err_clear;
+	}
+	if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
+		err = -EPROTO;
+		goto err_clear;
+	}
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
+		err = -EINVAL;
 		switch (size) {
 		case offsetof(struct bpf_tunnel_key, tunnel_label):
 		case offsetof(struct bpf_tunnel_key, tunnel_ext):
@@ -1771,12 +1781,12 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 			 * a common path later on.
 			 */
 			if (ip_tunnel_info_af(info) != AF_INET)
-				return -EINVAL;
+				goto err_clear;
 set_compat:
 			to = (struct bpf_tunnel_key *)compat;
 			break;
 		default:
-			return -EINVAL;
+			goto err_clear;
 		}
 	}
 
@@ -1793,9 +1803,12 @@ set_compat:
 	}
 
 	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
-		memcpy((void *)(long) r2, to, size);
+		memcpy(to_orig, to, size);
 
 	return 0;
+err_clear:
+	memset(to_orig, 0, size);
+	return err;
 }
 
 static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
@@ -1803,7 +1816,7 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg2_type	= ARG_PTR_TO_RAW_STACK,
 	.arg3_type	= ARG_CONST_STACK_SIZE,
 	.arg4_type	= ARG_ANYTHING,
 };
@@ -1813,16 +1826,26 @@ static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
 	u8 *to = (u8 *) (long) r2;
 	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+	int err;
 
 	if (unlikely(!info ||
-		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)))
-		return -ENOENT;
-	if (unlikely(size < info->options_len))
-		return -ENOMEM;
+		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
+		err = -ENOENT;
+		goto err_clear;
+	}
+	if (unlikely(size < info->options_len)) {
+		err = -ENOMEM;
+		goto err_clear;
+	}
 
 	ip_tunnel_info_opts_get(to, info);
+	if (size > info->options_len)
+		memset(to + info->options_len, 0, size - info->options_len);
 
 	return info->options_len;
+err_clear:
+	memset(to, 0, size);
+	return err;
 }
 
 static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
@@ -1830,7 +1853,7 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg2_type	= ARG_PTR_TO_RAW_STACK,
 	.arg3_type	= ARG_CONST_STACK_SIZE,
 };
 
-- 
cgit v1.2.3


From 266a0a790fb545fa1802a899ac44f61b1d6335a7 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 16 Apr 2016 22:29:33 +0200
Subject: bpf: avoid warning for wrong pointer cast

Two new functions in bpf contain a cast from a 'u64' to a
pointer. This works on 64-bit architectures but causes a warning
on all 32-bit architectures:

kernel/trace/bpf_trace.c: In function 'bpf_perf_event_output_tp':
kernel/trace/bpf_trace.c:350:13: error: cast to pointer from integer of different size [-Werror=int-to-pointer-cast]
  u64 ctx = *(long *)r1;

This changes the cast to first convert the u64 argument into a uintptr_t,
which is guaranteed to be the same size as a pointer.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Fixes: 9940d67c93b5 ("bpf: support bpf_get_stackid() and bpf_perf_event_output() in tracepoint programs")
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 685587885374..f389629dade7 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -351,7 +351,7 @@ static u64 bpf_perf_event_output_tp(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
 	 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
 	 * from there and call the same bpf_perf_event_output() helper
 	 */
-	u64 ctx = *(long *)r1;
+	u64 ctx = *(long *)(uintptr_t)r1;
 
 	return bpf_perf_event_output(ctx, r2, index, r4, size);
 }
@@ -369,7 +369,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
 
 static u64 bpf_get_stackid_tp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
-	u64 ctx = *(long *)r1;
+	u64 ctx = *(long *)(uintptr_t)r1;
 
 	return bpf_get_stackid(ctx, r2, r3, r4, r5);
 }
-- 
cgit v1.2.3


From 6687659568e2ec5b3ac24b39c5d26ce8b9d90434 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Sun, 17 Apr 2016 23:31:41 -0700
Subject: locking/pvqspinlock: Fix division by zero in qstat_read()

While playing with the qstat statistics (in <debugfs>/qlockstat/) I ran into
the following splat on a VM when opening pv_hash_hops:

  divide error: 0000 [#1] SMP
  ...
  RIP: 0010:[<ffffffff810b61fe>]  [<ffffffff810b61fe>] qstat_read+0x12e/0x1e0
  ...
  Call Trace:
    [<ffffffff811cad7c>] ? mem_cgroup_commit_charge+0x6c/0xd0
    [<ffffffff8119750c>] ? page_add_new_anon_rmap+0x8c/0xd0
    [<ffffffff8118d3b9>] ? handle_mm_fault+0x1439/0x1b40
    [<ffffffff811937a9>] ? do_mmap+0x449/0x550
    [<ffffffff811d3de3>] ? __vfs_read+0x23/0xd0
    [<ffffffff811d4ab2>] ? rw_verify_area+0x52/0xd0
    [<ffffffff811d4bb1>] ? vfs_read+0x81/0x120
    [<ffffffff811d5f12>] ? SyS_read+0x42/0xa0
    [<ffffffff815720f6>] ? entry_SYSCALL_64_fastpath+0x1e/0xa8

Fix this by verifying that qstat_pv_kick_unlock is in fact non-zero,
similarly to what the qstat_pv_latency_wake case does, as if nothing
else, this can come from resetting the statistics, thus having 0 kicks
should be quite valid in this context.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Reviewed-by: Waiman Long <Waiman.Long@hpe.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Cc: waiman.long@hpe.com
Link: http://lkml.kernel.org/r/1460961103-24953-1-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock_stat.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index eb2a2c9bc3fc..d734b7502001 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -136,10 +136,12 @@ static ssize_t qstat_read(struct file *file, char __user *user_buf,
 	}
 
 	if (counter == qstat_pv_hash_hops) {
-		u64 frac;
+		u64 frac = 0;
 
-		frac = 100ULL * do_div(stat, kicks);
-		frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+		if (kicks) {
+			frac = 100ULL * do_div(stat, kicks);
+			frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+		}
 
 		/*
 		 * Return a X.XX decimal number
-- 
cgit v1.2.3


From 9ebc57cfaad21aacbc363eecead579269a46b493 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 12 Apr 2016 20:39:55 -0400
Subject: tracing: Rename check_ignore_pid() to ignore_this_task()

The name "check_ignore_pid" is confusing in trying to figure out if the pid
should be ignored or not. Rename it to "ignore_this_task" which is pretty
straight forward, as a task (not a pid) is passed in, and should if true
should be ignored.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05ddc0820771..598a18675a6b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -484,7 +484,7 @@ static int cmp_pid(const void *key, const void *elt)
 }
 
 static bool
-check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task)
+ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 {
 	pid_t search_pid;
 	pid_t *pid;
@@ -517,8 +517,8 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, prev) &&
-		       check_ignore_pid(pid_list, next));
+		       ignore_this_task(pid_list, prev) &&
+		       ignore_this_task(pid_list, next));
 }
 
 static void
@@ -531,7 +531,7 @@ event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, next));
+		       ignore_this_task(pid_list, next));
 }
 
 static void
@@ -547,7 +547,7 @@ event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, task));
+		       ignore_this_task(pid_list, task));
 }
 
 static void
@@ -564,7 +564,7 @@ event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
 
 	/* Set tracing if current is enabled */
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, current));
+		       ignore_this_task(pid_list, current));
 }
 
 static void __ftrace_clear_event_pids(struct trace_array *tr)
@@ -1561,7 +1561,7 @@ static void ignore_task_cpu(void *data)
 					     mutex_is_locked(&event_mutex));
 
 	this_cpu_write(tr->trace_buffer.data->ignore_pid,
-		       check_ignore_pid(pid_list, current));
+		       ignore_this_task(pid_list, current));
 }
 
 static ssize_t
-- 
cgit v1.2.3


From f4d34a87e9c10f0ffd03d3548db6bfb200d06cdf Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 13 Apr 2016 16:27:49 -0400
Subject: tracing: Use pid bitmap instead of a pid array for set_event_pid

In order to add the ability to let tasks that are filtered by the events
have their children also be traced on fork (and then not traced on exit),
convert the array into a pid bitmask. Most of the time the number of pids is
only 32768 pids or a 4k bitmask, which is the same size as the default list
currently is, and that list could grow if more pids are listed.

This also greatly simplifies the code.

Suggested-by: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h        |   5 +-
 kernel/trace/trace_events.c | 221 ++++++++++++++++++++------------------------
 2 files changed, 102 insertions(+), 124 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3fff4adfd431..68cbb8e10aea 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -177,9 +177,8 @@ struct trace_options {
 };
 
 struct trace_pid_list {
-	unsigned int			nr_pids;
-	int				order;
-	pid_t				*pids;
+	int				pid_max;
+	unsigned long			*pids;
 };
 
 /*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 598a18675a6b..45f7cc72bf25 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,7 +15,7 @@
 #include <linux/kthread.h>
 #include <linux/tracefs.h>
 #include <linux/uaccess.h>
-#include <linux/bsearch.h>
+#include <linux/vmalloc.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
 #include <linux/sort.h>
@@ -471,23 +471,13 @@ static void ftrace_clear_events(struct trace_array *tr)
 	mutex_unlock(&event_mutex);
 }
 
-static int cmp_pid(const void *key, const void *elt)
-{
-	const pid_t *search_pid = key;
-	const pid_t *pid = elt;
-
-	if (*search_pid == *pid)
-		return 0;
-	if (*search_pid < *pid)
-		return -1;
-	return 1;
-}
+/* Shouldn't this be in a header? */
+extern int pid_max;
 
 static bool
 ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 {
-	pid_t search_pid;
-	pid_t *pid;
+	pid_t pid;
 
 	/*
 	 * Return false, because if filtered_pids does not exist,
@@ -496,15 +486,16 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 	if (!filtered_pids)
 		return false;
 
-	search_pid = task->pid;
+	pid = task->pid;
 
-	pid = bsearch(&search_pid, filtered_pids->pids,
-		      filtered_pids->nr_pids, sizeof(pid_t),
-		      cmp_pid);
-	if (!pid)
+	/*
+	 * If pid_max changed after filtered_pids was created, we
+	 * by default ignore all pids greater than the previous pid_max.
+	 */
+	if (task->pid >= filtered_pids->pid_max)
 		return true;
 
-	return false;
+	return !test_bit(task->pid, filtered_pids->pids);
 }
 
 static void
@@ -602,7 +593,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
 	/* Wait till all users are no longer using pid filtering */
 	synchronize_sched();
 
-	free_pages((unsigned long)pid_list->pids, pid_list->order);
+	vfree(pid_list->pids);
 	kfree(pid_list);
 }
 
@@ -946,11 +937,32 @@ static void t_stop(struct seq_file *m, void *p)
 	mutex_unlock(&event_mutex);
 }
 
+static void *
+p_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct trace_array *tr = m->private;
+	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
+	unsigned long pid = (unsigned long)v;
+
+	(*pos)++;
+
+	/* pid already is +1 of the actual prevous bit */
+	pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+
+	/* Return pid + 1 to allow zero to be represented */
+	if (pid < pid_list->pid_max)
+		return (void *)(pid + 1);
+
+	return NULL;
+}
+
 static void *p_start(struct seq_file *m, loff_t *pos)
 	__acquires(RCU)
 {
 	struct trace_pid_list *pid_list;
 	struct trace_array *tr = m->private;
+	unsigned long pid;
+	loff_t l = 0;
 
 	/*
 	 * Grab the mutex, to keep calls to p_next() having the same
@@ -963,10 +975,18 @@ static void *p_start(struct seq_file *m, loff_t *pos)
 
 	pid_list = rcu_dereference_sched(tr->filtered_pids);
 
-	if (!pid_list || *pos >= pid_list->nr_pids)
+	if (!pid_list)
+		return NULL;
+
+	pid = find_first_bit(pid_list->pids, pid_list->pid_max);
+	if (pid >= pid_list->pid_max)
 		return NULL;
 
-	return (void *)&pid_list->pids[*pos];
+	/* Return pid + 1 so that zero can be the exit value */
+	for (pid++; pid && l < *pos;
+	     pid = (unsigned long)p_next(m, (void *)pid, &l))
+		;
+	return (void *)pid;
 }
 
 static void p_stop(struct seq_file *m, void *p)
@@ -976,25 +996,11 @@ static void p_stop(struct seq_file *m, void *p)
 	mutex_unlock(&event_mutex);
 }
 
-static void *
-p_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	struct trace_array *tr = m->private;
-	struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
-
-	(*pos)++;
-
-	if (*pos >= pid_list->nr_pids)
-		return NULL;
-
-	return (void *)&pid_list->pids[*pos];
-}
-
 static int p_show(struct seq_file *m, void *v)
 {
-	pid_t *pid = v;
+	unsigned long pid = (unsigned long)v - 1;
 
-	seq_printf(m, "%d\n", *pid);
+	seq_printf(m, "%lu\n", pid);
 	return 0;
 }
 
@@ -1543,11 +1549,6 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
 	return r;
 }
 
-static int max_pids(struct trace_pid_list *pid_list)
-{
-	return (PAGE_SIZE << pid_list->order) / sizeof(pid_t);
-}
-
 static void ignore_task_cpu(void *data)
 {
 	struct trace_array *tr = data;
@@ -1571,7 +1572,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	struct seq_file *m = filp->private_data;
 	struct trace_array *tr = m->private;
 	struct trace_pid_list *filtered_pids = NULL;
-	struct trace_pid_list *pid_list = NULL;
+	struct trace_pid_list *pid_list;
 	struct trace_event_file *file;
 	struct trace_parser parser;
 	unsigned long val;
@@ -1579,7 +1580,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	ssize_t read = 0;
 	ssize_t ret = 0;
 	pid_t pid;
-	int i;
+	int nr_pids = 0;
 
 	if (!cnt)
 		return 0;
@@ -1592,10 +1593,43 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 		return -ENOMEM;
 
 	mutex_lock(&event_mutex);
+	filtered_pids = rcu_dereference_protected(tr->filtered_pids,
+					     lockdep_is_held(&event_mutex));
+
 	/*
-	 * Load as many pids into the array before doing a
-	 * swap from the tr->filtered_pids to the new list.
+	 * Always recreate a new array. The write is an all or nothing
+	 * operation. Always create a new array when adding new pids by
+	 * the user. If the operation fails, then the current list is
+	 * not modified.
 	 */
+	pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+	if (!pid_list) {
+		read = -ENOMEM;
+		goto out;
+	}
+	pid_list->pid_max = READ_ONCE(pid_max);
+	/* Only truncating will shrink pid_max */
+	if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
+		pid_list->pid_max = filtered_pids->pid_max;
+	pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
+	if (!pid_list->pids) {
+		kfree(pid_list);
+		read = -ENOMEM;
+		goto out;
+	}
+	if (filtered_pids) {
+		/* copy the current bits to the new max */
+		pid = find_first_bit(filtered_pids->pids,
+				     filtered_pids->pid_max);
+		while (pid < filtered_pids->pid_max) {
+			set_bit(pid, pid_list->pids);
+			pid = find_next_bit(filtered_pids->pids,
+					    filtered_pids->pid_max,
+					    pid + 1);
+			nr_pids++;
+		}
+	}
+
 	while (cnt > 0) {
 
 		this_pos = 0;
@@ -1613,92 +1647,35 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 		ret = -EINVAL;
 		if (kstrtoul(parser.buffer, 0, &val))
 			break;
-		if (val > INT_MAX)
+		if (val >= pid_list->pid_max)
 			break;
 
 		pid = (pid_t)val;
 
-		ret = -ENOMEM;
-		if (!pid_list) {
-			pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
-			if (!pid_list)
-				break;
-
-			filtered_pids = rcu_dereference_protected(tr->filtered_pids,
-							lockdep_is_held(&event_mutex));
-			if (filtered_pids)
-				pid_list->order = filtered_pids->order;
-			else
-				pid_list->order = 0;
-
-			pid_list->pids = (void *)__get_free_pages(GFP_KERNEL,
-								  pid_list->order);
-			if (!pid_list->pids)
-				break;
-
-			if (filtered_pids) {
-				pid_list->nr_pids = filtered_pids->nr_pids;
-				memcpy(pid_list->pids, filtered_pids->pids,
-				       pid_list->nr_pids * sizeof(pid_t));
-			} else
-				pid_list->nr_pids = 0;
-		}
-
-		if (pid_list->nr_pids >= max_pids(pid_list)) {
-			pid_t *pid_page;
-
-			pid_page = (void *)__get_free_pages(GFP_KERNEL,
-							    pid_list->order + 1);
-			if (!pid_page)
-				break;
-			memcpy(pid_page, pid_list->pids,
-			       pid_list->nr_pids * sizeof(pid_t));
-			free_pages((unsigned long)pid_list->pids, pid_list->order);
-
-			pid_list->order++;
-			pid_list->pids = pid_page;
-		}
+		set_bit(pid, pid_list->pids);
+		nr_pids++;
 
-		pid_list->pids[pid_list->nr_pids++] = pid;
 		trace_parser_clear(&parser);
 		ret = 0;
 	}
 	trace_parser_put(&parser);
 
 	if (ret < 0) {
-		if (pid_list)
-			free_pages((unsigned long)pid_list->pids, pid_list->order);
+		vfree(pid_list->pids);
 		kfree(pid_list);
-		mutex_unlock(&event_mutex);
-		return ret;
-	}
-
-	if (!pid_list) {
-		mutex_unlock(&event_mutex);
-		return ret;
+		read = ret;
+		goto out;
 	}
 
-	sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL);
-
-	/* Remove duplicates */
-	for (i = 1; i < pid_list->nr_pids; i++) {
-		int start = i;
-
-		while (i < pid_list->nr_pids &&
-		       pid_list->pids[i - 1] == pid_list->pids[i])
-			i++;
-
-		if (start != i) {
-			if (i < pid_list->nr_pids) {
-				memmove(&pid_list->pids[start], &pid_list->pids[i],
-					(pid_list->nr_pids - i) * sizeof(pid_t));
-				pid_list->nr_pids -= i - start;
-				i = start;
-			} else
-				pid_list->nr_pids = start;
-		}
+	if (!nr_pids) {
+		/* Cleared the list of pids */
+		vfree(pid_list->pids);
+		kfree(pid_list);
+		read = ret;
+		if (!filtered_pids)
+			goto out;
+		pid_list = NULL;
 	}
-
 	rcu_assign_pointer(tr->filtered_pids, pid_list);
 
 	list_for_each_entry(file, &tr->events, list) {
@@ -1708,7 +1685,7 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	if (filtered_pids) {
 		synchronize_sched();
 
-		free_pages((unsigned long)filtered_pids->pids, filtered_pids->order);
+		vfree(filtered_pids->pids);
 		kfree(filtered_pids);
 	} else {
 		/*
@@ -1745,10 +1722,12 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
 	 */
 	on_each_cpu(ignore_task_cpu, tr, 1);
 
+ out:
 	mutex_unlock(&event_mutex);
 
 	ret = read;
-	*ppos += read;
+	if (read > 0)
+		*ppos += read;
 
 	return ret;
 }
-- 
cgit v1.2.3


From c37775d57830a36382a9774bb84eca4ce3d019cc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 13 Apr 2016 16:59:18 -0400
Subject: tracing: Add infrastructure to allow set_event_pid to follow children

Add the infrastructure needed to have the PIDs in set_event_pid to
automatically add PIDs of the children of the tasks that have their PIDs in
set_event_pid. This will also remove PIDs from set_event_pid when a task
exits

This is implemented by adding hooks into the fork and exit tracepoints. On
fork, the PIDs are added to the list, and on exit, they are removed.

Add a new option called event_fork that when set, PIDs in set_event_pid will
automatically get their children PIDs added when they fork, as well as any
task that exits will have its PID removed from set_event_pid.

This works for instances as well.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        |  3 ++
 kernel/trace/trace.h        |  2 ++
 kernel/trace/trace_events.c | 84 +++++++++++++++++++++++++++++++++++++++------
 3 files changed, 79 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a2f0b9f33e9b..0d12dbde8399 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3571,6 +3571,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
 	if (mask == TRACE_ITER_RECORD_CMD)
 		trace_event_enable_cmd_record(enabled);
 
+	if (mask == TRACE_ITER_EVENT_FORK)
+		trace_event_follow_fork(tr, enabled);
+
 	if (mask == TRACE_ITER_OVERWRITE) {
 		ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
 #ifdef CONFIG_TRACER_MAX_TRACE
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 68cbb8e10aea..2525042760e6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -655,6 +655,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
 extern cycle_t ftrace_now(int cpu);
 
 extern void trace_find_cmdline(int pid, char comm[]);
+extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern unsigned long ftrace_update_tot_cnt;
@@ -966,6 +967,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 		C(STOP_ON_FREE,		"disable_on_free"),	\
 		C(IRQ_INFO,		"irq-info"),		\
 		C(MARKERS,		"markers"),		\
+		C(EVENT_FORK,		"event-fork"),		\
 		FUNCTION_FLAGS					\
 		FGRAPH_FLAGS					\
 		STACK_FLAGS					\
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 45f7cc72bf25..add81dff7520 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -474,11 +474,23 @@ static void ftrace_clear_events(struct trace_array *tr)
 /* Shouldn't this be in a header? */
 extern int pid_max;
 
+/* Returns true if found in filter */
 static bool
-ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid)
 {
-	pid_t pid;
+	/*
+	 * If pid_max changed after filtered_pids was created, we
+	 * by default ignore all pids greater than the previous pid_max.
+	 */
+	if (search_pid >= filtered_pids->pid_max)
+		return false;
+
+	return test_bit(search_pid, filtered_pids->pids);
+}
 
+static bool
+ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
+{
 	/*
 	 * Return false, because if filtered_pids does not exist,
 	 * all pids are good to trace.
@@ -486,16 +498,68 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 	if (!filtered_pids)
 		return false;
 
-	pid = task->pid;
+	return !find_filtered_pid(filtered_pids, task->pid);
+}
 
-	/*
-	 * If pid_max changed after filtered_pids was created, we
-	 * by default ignore all pids greater than the previous pid_max.
-	 */
-	if (task->pid >= filtered_pids->pid_max)
-		return true;
+static void filter_add_remove_task(struct trace_pid_list *pid_list,
+				   struct task_struct *self,
+				   struct task_struct *task)
+{
+	if (!pid_list)
+		return;
+
+	/* For forks, we only add if the forking task is listed */
+	if (self) {
+		if (!find_filtered_pid(pid_list, self->pid))
+			return;
+	}
+
+	/* Sorry, but we don't support pid_max changing after setting */
+	if (task->pid >= pid_list->pid_max)
+		return;
+
+	/* "self" is set for forks, and NULL for exits */
+	if (self)
+		set_bit(task->pid, pid_list->pids);
+	else
+		clear_bit(task->pid, pid_list->pids);
+}
+
+static void
+event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
+{
+	struct trace_pid_list *pid_list;
+	struct trace_array *tr = data;
+
+	pid_list = rcu_dereference_sched(tr->filtered_pids);
+	filter_add_remove_task(pid_list, NULL, task);
+}
 
-	return !test_bit(task->pid, filtered_pids->pids);
+static void
+event_filter_pid_sched_process_fork(void *data,
+				    struct task_struct *self,
+				    struct task_struct *task)
+{
+	struct trace_pid_list *pid_list;
+	struct trace_array *tr = data;
+
+	pid_list = rcu_dereference_sched(tr->filtered_pids);
+	filter_add_remove_task(pid_list, self, task);
+}
+
+void trace_event_follow_fork(struct trace_array *tr, bool enable)
+{
+	if (enable) {
+		register_trace_prio_sched_process_fork(event_filter_pid_sched_process_fork,
+						       tr, INT_MIN);
+		register_trace_prio_sched_process_exit(event_filter_pid_sched_process_exit,
+						       tr, INT_MAX);
+	} else {
+		unregister_trace_sched_process_fork(event_filter_pid_sched_process_fork,
+						    tr);
+		unregister_trace_sched_process_exit(event_filter_pid_sched_process_exit,
+						    tr);
+	}
 }
 
 static void
-- 
cgit v1.2.3


From 08d43a5fa063e03c860f2f391a30c388bcbc948e Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 10 Dec 2015 12:50:50 -0600
Subject: tracing: Add lock-free tracing_map

Add tracing_map, a special-purpose lock-free map for tracing.

tracing_map is designed to aggregate or 'sum' one or more values
associated with a specific object of type tracing_map_elt, which
is associated by the map to a given key.

It provides various hooks allowing per-tracer customization and is
separated out into a separate file in order to allow it to be shared
between multiple tracers, but isn't meant to be generally used outside
of that context.

The tracing_map implementation was inspired by lock-free map
algorithms originated by Dr. Cliff Click:

 http://www.azulsystems.com/blog/cliff/2007-03-26-non-blocking-hashtable
 http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf

Link: http://lkml.kernel.org/r/b43d68d1add33582a396f553c8ef705a33a6a748.1449767187.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig       |   13 +
 kernel/trace/Makefile      |    1 +
 kernel/trace/tracing_map.c | 1058 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/tracing_map.h |  282 ++++++++++++
 4 files changed, 1354 insertions(+)
 create mode 100644 kernel/trace/tracing_map.c
 create mode 100644 kernel/trace/tracing_map.h

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e45db6b0d878..48b732943e0e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -528,6 +528,19 @@ config MMIOTRACE
 	  See Documentation/trace/mmiotrace.txt.
 	  If you are not helping to develop drivers, say N.
 
+config TRACING_MAP
+	bool
+	depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
+	default n
+	help
+	  tracing_map is a special-purpose lock-free map for tracing,
+	  separated out as a stand-alone facility in order to allow it
+	  to be shared between multiple tracers.  It isn't meant to be
+	  generally used outside of that context, and is normally
+	  selected by tracers that use it.
+
+	  If in doubt, say N.
+
 config MMIOTRACE_TEST
 	tristate "Test module for mmiotrace"
 	depends on MMIOTRACE && m
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 9b1044e936a6..4255c4057aaa 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_seq.o
 obj-$(CONFIG_TRACING) += trace_stat.o
 obj-$(CONFIG_TRACING) += trace_printk.o
+obj-$(CONFIG_TRACING_MAP) += tracing_map.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
new file mode 100644
index 000000000000..dd6401dd145e
--- /dev/null
+++ b/kernel/trace/tracing_map.c
@@ -0,0 +1,1058 @@
+/*
+ * tracing_map - lock-free map for tracing
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com>
+ *
+ * tracing_map implementation inspired by lock-free map algorithms
+ * originated by Dr. Cliff Click:
+ *
+ * http://www.azulsystems.com/blog/cliff/2007-03-26-non-blocking-hashtable
+ * http://www.azulsystems.com/events/javaone_2007/2007_LockFreeHash.pdf
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+#include "tracing_map.h"
+#include "trace.h"
+
+/*
+ * NOTE: For a detailed description of the data structures used by
+ * these functions (such as tracing_map_elt) please see the overview
+ * of tracing_map data structures at the beginning of tracing_map.h.
+ */
+
+/**
+ * tracing_map_update_sum - Add a value to a tracing_map_elt's sum field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given sum associated with the tracing_map_elt
+ * @n: The value to add to the sum
+ *
+ * Add n to sum i associated with the specified tracing_map_elt
+ * instance.  The index i is the index returned by the call to
+ * tracing_map_add_sum_field() when the tracing map was set up.
+ */
+void tracing_map_update_sum(struct tracing_map_elt *elt, unsigned int i, u64 n)
+{
+	atomic64_add(n, &elt->fields[i].sum);
+}
+
+/**
+ * tracing_map_read_sum - Return the value of a tracing_map_elt's sum field
+ * @elt: The tracing_map_elt
+ * @i: The index of the given sum associated with the tracing_map_elt
+ *
+ * Retrieve the value of the sum i associated with the specified
+ * tracing_map_elt instance.  The index i is the index returned by the
+ * call to tracing_map_add_sum_field() when the tracing map was set
+ * up.
+ *
+ * Return: The sum associated with field i for elt.
+ */
+u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i)
+{
+	return (u64)atomic64_read(&elt->fields[i].sum);
+}
+
+int tracing_map_cmp_string(void *val_a, void *val_b)
+{
+	char *a = val_a;
+	char *b = val_b;
+
+	return strcmp(a, b);
+}
+
+int tracing_map_cmp_none(void *val_a, void *val_b)
+{
+	return 0;
+}
+
+static int tracing_map_cmp_atomic64(void *val_a, void *val_b)
+{
+	u64 a = atomic64_read((atomic64_t *)val_a);
+	u64 b = atomic64_read((atomic64_t *)val_b);
+
+	return (a > b) ? 1 : ((a < b) ? -1 : 0);
+}
+
+#define DEFINE_TRACING_MAP_CMP_FN(type)					\
+static int tracing_map_cmp_##type(void *val_a, void *val_b)		\
+{									\
+	type a = *(type *)val_a;					\
+	type b = *(type *)val_b;					\
+									\
+	return (a > b) ? 1 : ((a < b) ? -1 : 0);			\
+}
+
+DEFINE_TRACING_MAP_CMP_FN(s64);
+DEFINE_TRACING_MAP_CMP_FN(u64);
+DEFINE_TRACING_MAP_CMP_FN(s32);
+DEFINE_TRACING_MAP_CMP_FN(u32);
+DEFINE_TRACING_MAP_CMP_FN(s16);
+DEFINE_TRACING_MAP_CMP_FN(u16);
+DEFINE_TRACING_MAP_CMP_FN(s8);
+DEFINE_TRACING_MAP_CMP_FN(u8);
+
+tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size,
+					 int field_is_signed)
+{
+	tracing_map_cmp_fn_t fn = tracing_map_cmp_none;
+
+	switch (field_size) {
+	case 8:
+		if (field_is_signed)
+			fn = tracing_map_cmp_s64;
+		else
+			fn = tracing_map_cmp_u64;
+		break;
+	case 4:
+		if (field_is_signed)
+			fn = tracing_map_cmp_s32;
+		else
+			fn = tracing_map_cmp_u32;
+		break;
+	case 2:
+		if (field_is_signed)
+			fn = tracing_map_cmp_s16;
+		else
+			fn = tracing_map_cmp_u16;
+		break;
+	case 1:
+		if (field_is_signed)
+			fn = tracing_map_cmp_s8;
+		else
+			fn = tracing_map_cmp_u8;
+		break;
+	}
+
+	return fn;
+}
+
+static int tracing_map_add_field(struct tracing_map *map,
+				 tracing_map_cmp_fn_t cmp_fn)
+{
+	int ret = -EINVAL;
+
+	if (map->n_fields < TRACING_MAP_FIELDS_MAX) {
+		ret = map->n_fields;
+		map->fields[map->n_fields++].cmp_fn = cmp_fn;
+	}
+
+	return ret;
+}
+
+/**
+ * tracing_map_add_sum_field - Add a field describing a tracing_map sum
+ * @map: The tracing_map
+ *
+ * Add a sum field to the key and return the index identifying it in
+ * the map and associated tracing_map_elts.  This is the index used
+ * for instance to update a sum for a particular tracing_map_elt using
+ * tracing_map_update_sum() or reading it via tracing_map_read_sum().
+ *
+ * Return: The index identifying the field in the map and associated
+ * tracing_map_elts.
+ */
+int tracing_map_add_sum_field(struct tracing_map *map)
+{
+	return tracing_map_add_field(map, tracing_map_cmp_atomic64);
+}
+
+/**
+ * tracing_map_add_key_field - Add a field describing a tracing_map key
+ * @map: The tracing_map
+ * @offset: The offset within the key
+ * @cmp_fn: The comparison function that will be used to sort on the key
+ *
+ * Let the map know there is a key and that if it's used as a sort key
+ * to use cmp_fn.
+ *
+ * A key can be a subset of a compound key; for that purpose, the
+ * offset param is used to describe where within the the compound key
+ * the key referenced by this key field resides.
+ *
+ * Return: The index identifying the field in the map and associated
+ * tracing_map_elts.
+ */
+int tracing_map_add_key_field(struct tracing_map *map,
+			      unsigned int offset,
+			      tracing_map_cmp_fn_t cmp_fn)
+
+{
+	int idx = tracing_map_add_field(map, cmp_fn);
+
+	if (idx < 0)
+		return idx;
+
+	map->fields[idx].offset = offset;
+
+	map->key_idx[map->n_keys++] = idx;
+
+	return idx;
+}
+
+void tracing_map_array_clear(struct tracing_map_array *a)
+{
+	unsigned int i;
+
+	if (!a->pages)
+		return;
+
+	for (i = 0; i < a->n_pages; i++)
+		memset(a->pages[i], 0, PAGE_SIZE);
+}
+
+void tracing_map_array_free(struct tracing_map_array *a)
+{
+	unsigned int i;
+
+	if (!a)
+		return;
+
+	if (!a->pages) {
+		kfree(a);
+		return;
+	}
+
+	for (i = 0; i < a->n_pages; i++) {
+		if (!a->pages[i])
+			break;
+		free_page((unsigned long)a->pages[i]);
+	}
+}
+
+struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
+						  unsigned int entry_size)
+{
+	struct tracing_map_array *a;
+	unsigned int i;
+
+	a = kzalloc(sizeof(*a), GFP_KERNEL);
+	if (!a)
+		return NULL;
+
+	a->entry_size_shift = fls(roundup_pow_of_two(entry_size) - 1);
+	a->entries_per_page = PAGE_SIZE / (1 << a->entry_size_shift);
+	a->n_pages = n_elts / a->entries_per_page;
+	if (!a->n_pages)
+		a->n_pages = 1;
+	a->entry_shift = fls(a->entries_per_page) - 1;
+	a->entry_mask = (1 << a->entry_shift) - 1;
+
+	a->pages = kcalloc(a->n_pages, sizeof(void *), GFP_KERNEL);
+	if (!a->pages)
+		goto free;
+
+	for (i = 0; i < a->n_pages; i++) {
+		a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+		if (!a->pages[i])
+			goto free;
+	}
+ out:
+	return a;
+ free:
+	tracing_map_array_free(a);
+	a = NULL;
+
+	goto out;
+}
+
+static void tracing_map_elt_clear(struct tracing_map_elt *elt)
+{
+	unsigned i;
+
+	for (i = 0; i < elt->map->n_fields; i++)
+		if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
+			atomic64_set(&elt->fields[i].sum, 0);
+
+	if (elt->map->ops && elt->map->ops->elt_clear)
+		elt->map->ops->elt_clear(elt);
+}
+
+static void tracing_map_elt_init_fields(struct tracing_map_elt *elt)
+{
+	unsigned int i;
+
+	tracing_map_elt_clear(elt);
+
+	for (i = 0; i < elt->map->n_fields; i++) {
+		elt->fields[i].cmp_fn = elt->map->fields[i].cmp_fn;
+
+		if (elt->fields[i].cmp_fn != tracing_map_cmp_atomic64)
+			elt->fields[i].offset = elt->map->fields[i].offset;
+	}
+}
+
+static void tracing_map_elt_free(struct tracing_map_elt *elt)
+{
+	if (!elt)
+		return;
+
+	if (elt->map->ops && elt->map->ops->elt_free)
+		elt->map->ops->elt_free(elt);
+	kfree(elt->fields);
+	kfree(elt->key);
+	kfree(elt);
+}
+
+static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
+{
+	struct tracing_map_elt *elt;
+	int err = 0;
+
+	elt = kzalloc(sizeof(*elt), GFP_KERNEL);
+	if (!elt)
+		return ERR_PTR(-ENOMEM);
+
+	elt->map = map;
+
+	elt->key = kzalloc(map->key_size, GFP_KERNEL);
+	if (!elt->key) {
+		err = -ENOMEM;
+		goto free;
+	}
+
+	elt->fields = kcalloc(map->n_fields, sizeof(*elt->fields), GFP_KERNEL);
+	if (!elt->fields) {
+		err = -ENOMEM;
+		goto free;
+	}
+
+	tracing_map_elt_init_fields(elt);
+
+	if (map->ops && map->ops->elt_alloc) {
+		err = map->ops->elt_alloc(elt);
+		if (err)
+			goto free;
+	}
+	return elt;
+ free:
+	tracing_map_elt_free(elt);
+
+	return ERR_PTR(err);
+}
+
+static struct tracing_map_elt *get_free_elt(struct tracing_map *map)
+{
+	struct tracing_map_elt *elt = NULL;
+	int idx;
+
+	idx = atomic_inc_return(&map->next_elt);
+	if (idx < map->max_elts) {
+		elt = *(TRACING_MAP_ELT(map->elts, idx));
+		if (map->ops && map->ops->elt_init)
+			map->ops->elt_init(elt);
+	}
+
+	return elt;
+}
+
+static void tracing_map_free_elts(struct tracing_map *map)
+{
+	unsigned int i;
+
+	if (!map->elts)
+		return;
+
+	for (i = 0; i < map->max_elts; i++)
+		tracing_map_elt_free(*(TRACING_MAP_ELT(map->elts, i)));
+
+	tracing_map_array_free(map->elts);
+}
+
+static int tracing_map_alloc_elts(struct tracing_map *map)
+{
+	unsigned int i;
+
+	map->elts = tracing_map_array_alloc(map->max_elts,
+					    sizeof(struct tracing_map_elt *));
+	if (!map->elts)
+		return -ENOMEM;
+
+	for (i = 0; i < map->max_elts; i++) {
+		*(TRACING_MAP_ELT(map->elts, i)) = tracing_map_elt_alloc(map);
+		if (!(*(TRACING_MAP_ELT(map->elts, i)))) {
+			tracing_map_free_elts(map);
+
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+static inline bool keys_match(void *key, void *test_key, unsigned key_size)
+{
+	bool match = true;
+
+	if (memcmp(key, test_key, key_size))
+		match = false;
+
+	return match;
+}
+
+static inline struct tracing_map_elt *
+__tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
+{
+	u32 idx, key_hash, test_key;
+	struct tracing_map_entry *entry;
+
+	key_hash = jhash(key, map->key_size, 0);
+	if (key_hash == 0)
+		key_hash = 1;
+	idx = key_hash >> (32 - (map->map_bits + 1));
+
+	while (1) {
+		idx &= (map->map_size - 1);
+		entry = TRACING_MAP_ENTRY(map->map, idx);
+		test_key = entry->key;
+
+		if (test_key && test_key == key_hash && entry->val &&
+		    keys_match(key, entry->val->key, map->key_size)) {
+			atomic64_inc(&map->hits);
+			return entry->val;
+		}
+
+		if (!test_key) {
+			if (lookup_only)
+				break;
+
+			if (!cmpxchg(&entry->key, 0, key_hash)) {
+				struct tracing_map_elt *elt;
+
+				elt = get_free_elt(map);
+				if (!elt) {
+					atomic64_inc(&map->drops);
+					entry->key = 0;
+					break;
+				}
+
+				memcpy(elt->key, key, map->key_size);
+				entry->val = elt;
+				atomic64_inc(&map->hits);
+
+				return entry->val;
+			}
+		}
+
+		idx++;
+	}
+
+	return NULL;
+}
+
+/**
+ * tracing_map_insert - Insert key and/or retrieve val from a tracing_map
+ * @map: The tracing_map to insert into
+ * @key: The key to insert
+ *
+ * Inserts a key into a tracing_map and creates and returns a new
+ * tracing_map_elt for it, or if the key has already been inserted by
+ * a previous call, returns the tracing_map_elt already associated
+ * with it.  When the map was created, the number of elements to be
+ * allocated for the map was specified (internally maintained as
+ * 'max_elts' in struct tracing_map), and that number of
+ * tracing_map_elts was created by tracing_map_init().  This is the
+ * pre-allocated pool of tracing_map_elts that tracing_map_insert()
+ * will allocate from when adding new keys.  Once that pool is
+ * exhausted, tracing_map_insert() is useless and will return NULL to
+ * signal that state.  There are two user-visible tracing_map
+ * variables, 'hits' and 'drops', which are updated by this function.
+ * Every time an element is either successfully inserted or retrieved,
+ * the 'hits' value is incrememented.  Every time an element insertion
+ * fails, the 'drops' value is incremented.
+ *
+ * This is a lock-free tracing map insertion function implementing a
+ * modified form of Cliff Click's basic insertion algorithm.  It
+ * requires the table size be a power of two.  To prevent any
+ * possibility of an infinite loop we always make the internal table
+ * size double the size of the requested table size (max_elts * 2).
+ * Likewise, we never reuse a slot or resize or delete elements - when
+ * we've reached max_elts entries, we simply return NULL once we've
+ * run out of entries.  Readers can at any point in time traverse the
+ * tracing map and safely access the key/val pairs.
+ *
+ * Return: the tracing_map_elt pointer val associated with the key.
+ * If this was a newly inserted key, the val will be a newly allocated
+ * and associated tracing_map_elt pointer val.  If the key wasn't
+ * found and the pool of tracing_map_elts has been exhausted, NULL is
+ * returned and no further insertions will succeed.
+ */
+struct tracing_map_elt *tracing_map_insert(struct tracing_map *map, void *key)
+{
+	return __tracing_map_insert(map, key, false);
+}
+
+/**
+ * tracing_map_lookup - Retrieve val from a tracing_map
+ * @map: The tracing_map to perform the lookup on
+ * @key: The key to look up
+ *
+ * Looks up key in tracing_map and if found returns the matching
+ * tracing_map_elt.  This is a lock-free lookup; see
+ * tracing_map_insert() for details on tracing_map and how it works.
+ * Every time an element is retrieved, the 'hits' value is
+ * incrememented.  There is one user-visible tracing_map variable,
+ * 'hits', which is updated by this function.  Every time an element
+ * is successfully retrieved, the 'hits' value is incrememented.  The
+ * 'drops' value is never updated by this function.
+ *
+ * Return: the tracing_map_elt pointer val associated with the key.
+ * If the key wasn't found, NULL is returned.
+ */
+struct tracing_map_elt *tracing_map_lookup(struct tracing_map *map, void *key)
+{
+	return __tracing_map_insert(map, key, true);
+}
+
+/**
+ * tracing_map_destroy - Destroy a tracing_map
+ * @map: The tracing_map to destroy
+ *
+ * Frees a tracing_map along with its associated array of
+ * tracing_map_elts.
+ *
+ * Callers should make sure there are no readers or writers actively
+ * reading or inserting into the map before calling this.
+ */
+void tracing_map_destroy(struct tracing_map *map)
+{
+	if (!map)
+		return;
+
+	tracing_map_free_elts(map);
+
+	tracing_map_array_free(map->map);
+	kfree(map);
+}
+
+/**
+ * tracing_map_clear - Clear a tracing_map
+ * @map: The tracing_map to clear
+ *
+ * Resets the tracing map to a cleared or initial state.  The
+ * tracing_map_elts are all cleared, and the array of struct
+ * tracing_map_entry is reset to an initialized state.
+ *
+ * Callers should make sure there are no writers actively inserting
+ * into the map before calling this.
+ */
+void tracing_map_clear(struct tracing_map *map)
+{
+	unsigned int i;
+
+	atomic_set(&map->next_elt, -1);
+	atomic64_set(&map->hits, 0);
+	atomic64_set(&map->drops, 0);
+
+	tracing_map_array_clear(map->map);
+
+	for (i = 0; i < map->max_elts; i++)
+		tracing_map_elt_clear(*(TRACING_MAP_ELT(map->elts, i)));
+}
+
+static void set_sort_key(struct tracing_map *map,
+			 struct tracing_map_sort_key *sort_key)
+{
+	map->sort_key = *sort_key;
+}
+
+/**
+ * tracing_map_create - Create a lock-free map and element pool
+ * @map_bits: The size of the map (2 ** map_bits)
+ * @key_size: The size of the key for the map in bytes
+ * @ops: Optional client-defined tracing_map_ops instance
+ * @private_data: Client data associated with the map
+ *
+ * Creates and sets up a map to contain 2 ** map_bits number of
+ * elements (internally maintained as 'max_elts' in struct
+ * tracing_map).  Before using, map fields should be added to the map
+ * with tracing_map_add_sum_field() and tracing_map_add_key_field().
+ * tracing_map_init() should then be called to allocate the array of
+ * tracing_map_elts, in order to avoid allocating anything in the map
+ * insertion path.  The user-specified map size reflects the maximum
+ * number of elements that can be contained in the table requested by
+ * the user - internally we double that in order to keep the table
+ * sparse and keep collisions manageable.
+ *
+ * A tracing_map is a special-purpose map designed to aggregate or
+ * 'sum' one or more values associated with a specific object of type
+ * tracing_map_elt, which is attached by the map to a given key.
+ *
+ * tracing_map_create() sets up the map itself, and provides
+ * operations for inserting tracing_map_elts, but doesn't allocate the
+ * tracing_map_elts themselves, or provide a means for describing the
+ * keys or sums associated with the tracing_map_elts.  All
+ * tracing_map_elts for a given map have the same set of sums and
+ * keys, which are defined by the client using the functions
+ * tracing_map_add_key_field() and tracing_map_add_sum_field().  Once
+ * the fields are defined, the pool of elements allocated for the map
+ * can be created, which occurs when the client code calls
+ * tracing_map_init().
+ *
+ * When tracing_map_init() returns, tracing_map_elt elements can be
+ * inserted into the map using tracing_map_insert().  When called,
+ * tracing_map_insert() grabs a free tracing_map_elt from the pool, or
+ * finds an existing match in the map and in either case returns it.
+ * The client can then use tracing_map_update_sum() and
+ * tracing_map_read_sum() to update or read a given sum field for the
+ * tracing_map_elt.
+ *
+ * The client can at any point retrieve and traverse the current set
+ * of inserted tracing_map_elts in a tracing_map, via
+ * tracing_map_sort_entries().  Sorting can be done on any field,
+ * including keys.
+ *
+ * See tracing_map.h for a description of tracing_map_ops.
+ *
+ * Return: the tracing_map pointer if successful, ERR_PTR if not.
+ */
+struct tracing_map *tracing_map_create(unsigned int map_bits,
+				       unsigned int key_size,
+				       const struct tracing_map_ops *ops,
+				       void *private_data)
+{
+	struct tracing_map *map;
+	unsigned int i;
+
+	if (map_bits < TRACING_MAP_BITS_MIN ||
+	    map_bits > TRACING_MAP_BITS_MAX)
+		return ERR_PTR(-EINVAL);
+
+	map = kzalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return ERR_PTR(-ENOMEM);
+
+	map->map_bits = map_bits;
+	map->max_elts = (1 << map_bits);
+	atomic_set(&map->next_elt, -1);
+
+	map->map_size = (1 << (map_bits + 1));
+	map->ops = ops;
+
+	map->private_data = private_data;
+
+	map->map = tracing_map_array_alloc(map->map_size,
+					   sizeof(struct tracing_map_entry));
+	if (!map->map)
+		goto free;
+
+	map->key_size = key_size;
+	for (i = 0; i < TRACING_MAP_KEYS_MAX; i++)
+		map->key_idx[i] = -1;
+ out:
+	return map;
+ free:
+	tracing_map_destroy(map);
+	map = ERR_PTR(-ENOMEM);
+
+	goto out;
+}
+
+/**
+ * tracing_map_init - Allocate and clear a map's tracing_map_elts
+ * @map: The tracing_map to initialize
+ *
+ * Allocates a clears a pool of tracing_map_elts equal to the
+ * user-specified size of 2 ** map_bits (internally maintained as
+ * 'max_elts' in struct tracing_map).  Before using, the map fields
+ * should be added to the map with tracing_map_add_sum_field() and
+ * tracing_map_add_key_field().  tracing_map_init() should then be
+ * called to allocate the array of tracing_map_elts, in order to avoid
+ * allocating anything in the map insertion path.  The user-specified
+ * map size reflects the max number of elements requested by the user
+ * - internally we double that in order to keep the table sparse and
+ * keep collisions manageable.
+ *
+ * See tracing_map.h for a description of tracing_map_ops.
+ *
+ * Return: the tracing_map pointer if successful, ERR_PTR if not.
+ */
+int tracing_map_init(struct tracing_map *map)
+{
+	int err;
+
+	if (map->n_fields < 2)
+		return -EINVAL; /* need at least 1 key and 1 val */
+
+	err = tracing_map_alloc_elts(map);
+	if (err)
+		return err;
+
+	tracing_map_clear(map);
+
+	return err;
+}
+
+static int cmp_entries_dup(const struct tracing_map_sort_entry **a,
+			   const struct tracing_map_sort_entry **b)
+{
+	int ret = 0;
+
+	if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size))
+		ret = 1;
+
+	return ret;
+}
+
+static int cmp_entries_sum(const struct tracing_map_sort_entry **a,
+			   const struct tracing_map_sort_entry **b)
+{
+	const struct tracing_map_elt *elt_a, *elt_b;
+	struct tracing_map_sort_key *sort_key;
+	struct tracing_map_field *field;
+	tracing_map_cmp_fn_t cmp_fn;
+	void *val_a, *val_b;
+	int ret = 0;
+
+	elt_a = (*a)->elt;
+	elt_b = (*b)->elt;
+
+	sort_key = &elt_a->map->sort_key;
+
+	field = &elt_a->fields[sort_key->field_idx];
+	cmp_fn = field->cmp_fn;
+
+	val_a = &elt_a->fields[sort_key->field_idx].sum;
+	val_b = &elt_b->fields[sort_key->field_idx].sum;
+
+	ret = cmp_fn(val_a, val_b);
+	if (sort_key->descending)
+		ret = -ret;
+
+	return ret;
+}
+
+static int cmp_entries_key(const struct tracing_map_sort_entry **a,
+			   const struct tracing_map_sort_entry **b)
+{
+	const struct tracing_map_elt *elt_a, *elt_b;
+	struct tracing_map_sort_key *sort_key;
+	struct tracing_map_field *field;
+	tracing_map_cmp_fn_t cmp_fn;
+	void *val_a, *val_b;
+	int ret = 0;
+
+	elt_a = (*a)->elt;
+	elt_b = (*b)->elt;
+
+	sort_key = &elt_a->map->sort_key;
+
+	field = &elt_a->fields[sort_key->field_idx];
+
+	cmp_fn = field->cmp_fn;
+
+	val_a = elt_a->key + field->offset;
+	val_b = elt_b->key + field->offset;
+
+	ret = cmp_fn(val_a, val_b);
+	if (sort_key->descending)
+		ret = -ret;
+
+	return ret;
+}
+
+static void destroy_sort_entry(struct tracing_map_sort_entry *entry)
+{
+	if (!entry)
+		return;
+
+	if (entry->elt_copied)
+		tracing_map_elt_free(entry->elt);
+
+	kfree(entry);
+}
+
+/**
+ * tracing_map_destroy_sort_entries - Destroy an array of sort entries
+ * @entries: The entries to destroy
+ * @n_entries: The number of entries in the array
+ *
+ * Destroy the elements returned by a tracing_map_sort_entries() call.
+ */
+void tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries,
+				      unsigned int n_entries)
+{
+	unsigned int i;
+
+	for (i = 0; i < n_entries; i++)
+		destroy_sort_entry(entries[i]);
+
+	vfree(entries);
+}
+
+static struct tracing_map_sort_entry *
+create_sort_entry(void *key, struct tracing_map_elt *elt)
+{
+	struct tracing_map_sort_entry *sort_entry;
+
+	sort_entry = kzalloc(sizeof(*sort_entry), GFP_KERNEL);
+	if (!sort_entry)
+		return NULL;
+
+	sort_entry->key = key;
+	sort_entry->elt = elt;
+
+	return sort_entry;
+}
+
+static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
+{
+	struct tracing_map_elt *dup_elt;
+	unsigned int i;
+
+	dup_elt = tracing_map_elt_alloc(elt->map);
+	if (!dup_elt)
+		return NULL;
+
+	if (elt->map->ops && elt->map->ops->elt_copy)
+		elt->map->ops->elt_copy(dup_elt, elt);
+
+	dup_elt->private_data = elt->private_data;
+	memcpy(dup_elt->key, elt->key, elt->map->key_size);
+
+	for (i = 0; i < elt->map->n_fields; i++) {
+		atomic64_set(&dup_elt->fields[i].sum,
+			     atomic64_read(&elt->fields[i].sum));
+		dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
+	}
+
+	return dup_elt;
+}
+
+static int merge_dup(struct tracing_map_sort_entry **sort_entries,
+		     unsigned int target, unsigned int dup)
+{
+	struct tracing_map_elt *target_elt, *elt;
+	bool first_dup = (target - dup) == 1;
+	int i;
+
+	if (first_dup) {
+		elt = sort_entries[target]->elt;
+		target_elt = copy_elt(elt);
+		if (!target_elt)
+			return -ENOMEM;
+		sort_entries[target]->elt = target_elt;
+		sort_entries[target]->elt_copied = true;
+	} else
+		target_elt = sort_entries[target]->elt;
+
+	elt = sort_entries[dup]->elt;
+
+	for (i = 0; i < elt->map->n_fields; i++)
+		atomic64_add(atomic64_read(&elt->fields[i].sum),
+			     &target_elt->fields[i].sum);
+
+	sort_entries[dup]->dup = true;
+
+	return 0;
+}
+
+static int merge_dups(struct tracing_map_sort_entry **sort_entries,
+		      int n_entries, unsigned int key_size)
+{
+	unsigned int dups = 0, total_dups = 0;
+	int err, i, j;
+	void *key;
+
+	if (n_entries < 2)
+		return total_dups;
+
+	sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
+	     (int (*)(const void *, const void *))cmp_entries_dup, NULL);
+
+	key = sort_entries[0]->key;
+	for (i = 1; i < n_entries; i++) {
+		if (!memcmp(sort_entries[i]->key, key, key_size)) {
+			dups++; total_dups++;
+			err = merge_dup(sort_entries, i - dups, i);
+			if (err)
+				return err;
+			continue;
+		}
+		key = sort_entries[i]->key;
+		dups = 0;
+	}
+
+	if (!total_dups)
+		return total_dups;
+
+	for (i = 0, j = 0; i < n_entries; i++) {
+		if (!sort_entries[i]->dup) {
+			sort_entries[j] = sort_entries[i];
+			if (j++ != i)
+				sort_entries[i] = NULL;
+		} else {
+			destroy_sort_entry(sort_entries[i]);
+			sort_entries[i] = NULL;
+		}
+	}
+
+	return total_dups;
+}
+
+static bool is_key(struct tracing_map *map, unsigned int field_idx)
+{
+	unsigned int i;
+
+	for (i = 0; i < map->n_keys; i++)
+		if (map->key_idx[i] == field_idx)
+			return true;
+	return false;
+}
+
+static void sort_secondary(struct tracing_map *map,
+			   const struct tracing_map_sort_entry **entries,
+			   unsigned int n_entries,
+			   struct tracing_map_sort_key *primary_key,
+			   struct tracing_map_sort_key *secondary_key)
+{
+	int (*primary_fn)(const struct tracing_map_sort_entry **,
+			  const struct tracing_map_sort_entry **);
+	int (*secondary_fn)(const struct tracing_map_sort_entry **,
+			    const struct tracing_map_sort_entry **);
+	unsigned i, start = 0, n_sub = 1;
+
+	if (is_key(map, primary_key->field_idx))
+		primary_fn = cmp_entries_key;
+	else
+		primary_fn = cmp_entries_sum;
+
+	if (is_key(map, secondary_key->field_idx))
+		secondary_fn = cmp_entries_key;
+	else
+		secondary_fn = cmp_entries_sum;
+
+	for (i = 0; i < n_entries - 1; i++) {
+		const struct tracing_map_sort_entry **a = &entries[i];
+		const struct tracing_map_sort_entry **b = &entries[i + 1];
+
+		if (primary_fn(a, b) == 0) {
+			n_sub++;
+			if (i < n_entries - 2)
+				continue;
+		}
+
+		if (n_sub < 2) {
+			start = i + 1;
+			n_sub = 1;
+			continue;
+		}
+
+		set_sort_key(map, secondary_key);
+		sort(&entries[start], n_sub,
+		     sizeof(struct tracing_map_sort_entry *),
+		     (int (*)(const void *, const void *))secondary_fn, NULL);
+		set_sort_key(map, primary_key);
+
+		start = i + 1;
+		n_sub = 1;
+	}
+}
+
+/**
+ * tracing_map_sort_entries - Sort the current set of tracing_map_elts in a map
+ * @map: The tracing_map
+ * @sort_key: The sort key to use for sorting
+ * @sort_entries: outval: pointer to allocated and sorted array of entries
+ *
+ * tracing_map_sort_entries() sorts the current set of entries in the
+ * map and returns the list of tracing_map_sort_entries containing
+ * them to the client in the sort_entries param.  The client can
+ * access the struct tracing_map_elt element of interest directly as
+ * the 'elt' field of a returned struct tracing_map_sort_entry object.
+ *
+ * The sort_key has only two fields: idx and descending.  'idx' refers
+ * to the index of the field added via tracing_map_add_sum_field() or
+ * tracing_map_add_key_field() when the tracing_map was initialized.
+ * 'descending' is a flag that if set reverses the sort order, which
+ * by default is ascending.
+ *
+ * The client should not hold on to the returned array but should use
+ * it and call tracing_map_destroy_sort_entries() when done.
+ *
+ * Return: the number of sort_entries in the struct tracing_map_sort_entry
+ * array, negative on error
+ */
+int tracing_map_sort_entries(struct tracing_map *map,
+			     struct tracing_map_sort_key *sort_keys,
+			     unsigned int n_sort_keys,
+			     struct tracing_map_sort_entry ***sort_entries)
+{
+	int (*cmp_entries_fn)(const struct tracing_map_sort_entry **,
+			      const struct tracing_map_sort_entry **);
+	struct tracing_map_sort_entry *sort_entry, **entries;
+	int i, n_entries, ret;
+
+	entries = vmalloc(map->max_elts * sizeof(sort_entry));
+	if (!entries)
+		return -ENOMEM;
+
+	for (i = 0, n_entries = 0; i < map->map_size; i++) {
+		struct tracing_map_entry *entry;
+
+		entry = TRACING_MAP_ENTRY(map->map, i);
+
+		if (!entry->key || !entry->val)
+			continue;
+
+		entries[n_entries] = create_sort_entry(entry->val->key,
+						       entry->val);
+		if (!entries[n_entries++]) {
+			ret = -ENOMEM;
+			goto free;
+		}
+	}
+
+	if (n_entries == 0) {
+		ret = 0;
+		goto free;
+	}
+
+	if (n_entries == 1) {
+		*sort_entries = entries;
+		return 1;
+	}
+
+	ret = merge_dups(entries, n_entries, map->key_size);
+	if (ret < 0)
+		goto free;
+	n_entries -= ret;
+
+	if (is_key(map, sort_keys[0].field_idx))
+		cmp_entries_fn = cmp_entries_key;
+	else
+		cmp_entries_fn = cmp_entries_sum;
+
+	set_sort_key(map, &sort_keys[0]);
+
+	sort(entries, n_entries, sizeof(struct tracing_map_sort_entry *),
+	     (int (*)(const void *, const void *))cmp_entries_fn, NULL);
+
+	if (n_sort_keys > 1)
+		sort_secondary(map,
+			       (const struct tracing_map_sort_entry **)entries,
+			       n_entries,
+			       &sort_keys[0],
+			       &sort_keys[1]);
+
+	*sort_entries = entries;
+
+	return n_entries;
+ free:
+	tracing_map_destroy_sort_entries(entries, n_entries);
+
+	return ret;
+}
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
new file mode 100644
index 000000000000..75718255a8ad
--- /dev/null
+++ b/kernel/trace/tracing_map.h
@@ -0,0 +1,282 @@
+#ifndef __TRACING_MAP_H
+#define __TRACING_MAP_H
+
+#define TRACING_MAP_BITS_DEFAULT	11
+#define TRACING_MAP_BITS_MAX		17
+#define TRACING_MAP_BITS_MIN		7
+
+#define TRACING_MAP_FIELDS_MAX		4
+#define TRACING_MAP_KEYS_MAX		2
+
+#define TRACING_MAP_SORT_KEYS_MAX	2
+
+typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
+
+/*
+ * This is an overview of the tracing_map data structures and how they
+ * relate to the tracing_map API.  The details of the algorithms
+ * aren't discussed here - this is just a general overview of the data
+ * structures and how they interact with the API.
+ *
+ * The central data structure of the tracing_map is an initially
+ * zeroed array of struct tracing_map_entry (stored in the map field
+ * of struct tracing_map).  tracing_map_entry is a very simple data
+ * structure containing only two fields: a 32-bit unsigned 'key'
+ * variable and a pointer named 'val'.  This array of struct
+ * tracing_map_entry is essentially a hash table which will be
+ * modified by a single function, tracing_map_insert(), but which can
+ * be traversed and read by a user at any time (though the user does
+ * this indirectly via an array of tracing_map_sort_entry - see the
+ * explanation of that data structure in the discussion of the
+ * sorting-related data structures below).
+ *
+ * The central function of the tracing_map API is
+ * tracing_map_insert().  tracing_map_insert() hashes the
+ * arbitrarily-sized key passed into it into a 32-bit unsigned key.
+ * It then uses this key, truncated to the array size, as an index
+ * into the array of tracing_map_entries.  If the value of the 'key'
+ * field of the tracing_map_entry found at that location is 0, then
+ * that entry is considered to be free and can be claimed, by
+ * replacing the 0 in the 'key' field of the tracing_map_entry with
+ * the new 32-bit hashed key.  Once claimed, that tracing_map_entry's
+ * 'val' field is then used to store a unique element which will be
+ * forever associated with that 32-bit hashed key in the
+ * tracing_map_entry.
+ *
+ * That unique element now in the tracing_map_entry's 'val' field is
+ * an instance of tracing_map_elt, where 'elt' in the latter part of
+ * that variable name is short for 'element'.  The purpose of a
+ * tracing_map_elt is to hold values specific to the the particular
+ * 32-bit hashed key it's assocated with.  Things such as the unique
+ * set of aggregated sums associated with the 32-bit hashed key, along
+ * with a copy of the full key associated with the entry, and which
+ * was used to produce the 32-bit hashed key.
+ *
+ * When tracing_map_create() is called to create the tracing map, the
+ * user specifies (indirectly via the map_bits param, the details are
+ * unimportant for this discussion) the maximum number of elements
+ * that the map can hold (stored in the max_elts field of struct
+ * tracing_map).  This is the maximum possible number of
+ * tracing_map_entries in the tracing_map_entry array which can be
+ * 'claimed' as described in the above discussion, and therefore is
+ * also the maximum number of tracing_map_elts that can be associated
+ * with the tracing_map_entry array in the tracing_map.  Because of
+ * the way the insertion algorithm works, the size of the allocated
+ * tracing_map_entry array is always twice the maximum number of
+ * elements (2 * max_elts).  This value is stored in the map_size
+ * field of struct tracing_map.
+ *
+ * Because tracing_map_insert() needs to work from any context,
+ * including from within the memory allocation functions themselves,
+ * both the tracing_map_entry array and a pool of max_elts
+ * tracing_map_elts are pre-allocated before any call is made to
+ * tracing_map_insert().
+ *
+ * The tracing_map_entry array is allocated as a single block by
+ * tracing_map_create().
+ *
+ * Because the tracing_map_elts are much larger objects and can't
+ * generally be allocated together as a single large array without
+ * failure, they're allocated individually, by tracing_map_init().
+ *
+ * The pool of tracing_map_elts are allocated by tracing_map_init()
+ * rather than by tracing_map_create() because at the time
+ * tracing_map_create() is called, there isn't enough information to
+ * create the tracing_map_elts.  Specifically,the user first needs to
+ * tell the tracing_map implementation how many fields the
+ * tracing_map_elts contain, and which types of fields they are (key
+ * or sum).  The user does this via the tracing_map_add_sum_field()
+ * and tracing_map_add_key_field() functions, following which the user
+ * calls tracing_map_init() to finish up the tracing map setup.  The
+ * array holding the pointers which make up the pre-allocated pool of
+ * tracing_map_elts is allocated as a single block and is stored in
+ * the elts field of struct tracing_map.
+ *
+ * There is also a set of structures used for sorting that might
+ * benefit from some minimal explanation.
+ *
+ * struct tracing_map_sort_key is used to drive the sort at any given
+ * time.  By 'any given time' we mean that a different
+ * tracing_map_sort_key will be used at different times depending on
+ * whether the sort currently being performed is a primary or a
+ * secondary sort.
+ *
+ * The sort key is very simple, consisting of the field index of the
+ * tracing_map_elt field to sort on (which the user saved when adding
+ * the field), and whether the sort should be done in an ascending or
+ * descending order.
+ *
+ * For the convenience of the sorting code, a tracing_map_sort_entry
+ * is created for each tracing_map_elt, again individually allocated
+ * to avoid failures that might be expected if allocated as a single
+ * large array of struct tracing_map_sort_entry.
+ * tracing_map_sort_entry instances are the objects expected by the
+ * various internal sorting functions, and are also what the user
+ * ultimately receives after calling tracing_map_sort_entries().
+ * Because it doesn't make sense for users to access an unordered and
+ * sparsely populated tracing_map directly, the
+ * tracing_map_sort_entries() function is provided so that users can
+ * retrieve a sorted list of all existing elements.  In addition to
+ * the associated tracing_map_elt 'elt' field contained within the
+ * tracing_map_sort_entry, which is the object of interest to the
+ * user, tracing_map_sort_entry objects contain a number of additional
+ * fields which are used for caching and internal purposes and can
+ * safely be ignored.
+*/
+
+struct tracing_map_field {
+	tracing_map_cmp_fn_t		cmp_fn;
+	union {
+		atomic64_t			sum;
+		unsigned int			offset;
+	};
+};
+
+struct tracing_map_elt {
+	struct tracing_map		*map;
+	struct tracing_map_field	*fields;
+	void				*key;
+	void				*private_data;
+};
+
+struct tracing_map_entry {
+	u32				key;
+	struct tracing_map_elt		*val;
+};
+
+struct tracing_map_sort_key {
+	unsigned int			field_idx;
+	bool				descending;
+};
+
+struct tracing_map_sort_entry {
+	void				*key;
+	struct tracing_map_elt		*elt;
+	bool				elt_copied;
+	bool				dup;
+};
+
+struct tracing_map_array {
+	unsigned int entries_per_page;
+	unsigned int entry_size_shift;
+	unsigned int entry_shift;
+	unsigned int entry_mask;
+	unsigned int n_pages;
+	void **pages;
+};
+
+#define TRACING_MAP_ARRAY_ELT(array, idx)				\
+	(array->pages[idx >> array->entry_shift] +			\
+	 ((idx & array->entry_mask) << array->entry_size_shift))
+
+#define TRACING_MAP_ENTRY(array, idx)					\
+	((struct tracing_map_entry *)TRACING_MAP_ARRAY_ELT(array, idx))
+
+#define TRACING_MAP_ELT(array, idx)					\
+	((struct tracing_map_elt **)TRACING_MAP_ARRAY_ELT(array, idx))
+
+struct tracing_map {
+	unsigned int			key_size;
+	unsigned int			map_bits;
+	unsigned int			map_size;
+	unsigned int			max_elts;
+	atomic_t			next_elt;
+	struct tracing_map_array	*elts;
+	struct tracing_map_array	*map;
+	const struct tracing_map_ops	*ops;
+	void				*private_data;
+	struct tracing_map_field	fields[TRACING_MAP_FIELDS_MAX];
+	unsigned int			n_fields;
+	int				key_idx[TRACING_MAP_KEYS_MAX];
+	unsigned int			n_keys;
+	struct tracing_map_sort_key	sort_key;
+	atomic64_t			hits;
+	atomic64_t			drops;
+};
+
+/**
+ * struct tracing_map_ops - callbacks for tracing_map
+ *
+ * The methods in this structure define callback functions for various
+ * operations on a tracing_map or objects related to a tracing_map.
+ *
+ * For a detailed description of tracing_map_elt objects please see
+ * the overview of tracing_map data structures at the beginning of
+ * this file.
+ *
+ * All the methods below are optional.
+ *
+ * @elt_alloc: When a tracing_map_elt is allocated, this function, if
+ *	defined, will be called and gives clients the opportunity to
+ *	allocate additional data and attach it to the element
+ *	(tracing_map_elt->private_data is meant for that purpose).
+ *	Element allocation occurs before tracing begins, when the
+ *	tracing_map_init() call is made by client code.
+ *
+ * @elt_copy: At certain points in the lifetime of an element, it may
+ *	need to be copied.  The copy should include a copy of the
+ *	client-allocated data, which can be copied into the 'to'
+ *	element from the 'from' element.
+ *
+ * @elt_free: When a tracing_map_elt is freed, this function is called
+ *	and allows client-allocated per-element data to be freed.
+ *
+ * @elt_clear: This callback allows per-element client-defined data to
+ *	be cleared, if applicable.
+ *
+ * @elt_init: This callback allows per-element client-defined data to
+ *	be initialized when used i.e. when the element is actually
+ *	claimed by tracing_map_insert() in the context of the map
+ *	insertion.
+ */
+struct tracing_map_ops {
+	int			(*elt_alloc)(struct tracing_map_elt *elt);
+	void			(*elt_copy)(struct tracing_map_elt *to,
+					    struct tracing_map_elt *from);
+	void			(*elt_free)(struct tracing_map_elt *elt);
+	void			(*elt_clear)(struct tracing_map_elt *elt);
+	void			(*elt_init)(struct tracing_map_elt *elt);
+};
+
+extern struct tracing_map *
+tracing_map_create(unsigned int map_bits,
+		   unsigned int key_size,
+		   const struct tracing_map_ops *ops,
+		   void *private_data);
+extern int tracing_map_init(struct tracing_map *map);
+
+extern int tracing_map_add_sum_field(struct tracing_map *map);
+extern int tracing_map_add_key_field(struct tracing_map *map,
+				     unsigned int offset,
+				     tracing_map_cmp_fn_t cmp_fn);
+
+extern void tracing_map_destroy(struct tracing_map *map);
+extern void tracing_map_clear(struct tracing_map *map);
+
+extern struct tracing_map_elt *
+tracing_map_insert(struct tracing_map *map, void *key);
+extern struct tracing_map_elt *
+tracing_map_lookup(struct tracing_map *map, void *key);
+
+extern tracing_map_cmp_fn_t tracing_map_cmp_num(int field_size,
+						int field_is_signed);
+extern int tracing_map_cmp_string(void *val_a, void *val_b);
+extern int tracing_map_cmp_none(void *val_a, void *val_b);
+
+extern void tracing_map_update_sum(struct tracing_map_elt *elt,
+				   unsigned int i, u64 n);
+extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
+extern void tracing_map_set_field_descr(struct tracing_map *map,
+					unsigned int i,
+					unsigned int key_offset,
+					tracing_map_cmp_fn_t cmp_fn);
+extern int
+tracing_map_sort_entries(struct tracing_map *map,
+			 struct tracing_map_sort_key *sort_keys,
+			 unsigned int n_sort_keys,
+			 struct tracing_map_sort_entry ***sort_entries);
+
+extern void
+tracing_map_destroy_sort_entries(struct tracing_map_sort_entry **entries,
+				 unsigned int n_entries);
+#endif /* __TRACING_MAP_H */
-- 
cgit v1.2.3


From 8d44f2f34fb50f40185ef6404b0047223a27ba82 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 19 Feb 2016 14:47:00 -0500
Subject: tracing: Fix TRACING_MAP Kconfig

The config option for TRACING_MAP has "default n", which is not needed
because the default of configs is 'n'.

Also, since the TRACING_MAP has no config prompt, there's no reason to
include "If in doubt, say N" in the help text.

Fixed a typo in the comments of tracing_map.h.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig       | 3 ---
 kernel/trace/tracing_map.h | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 48b732943e0e..d39556fd863a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -531,7 +531,6 @@ config MMIOTRACE
 config TRACING_MAP
 	bool
 	depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
-	default n
 	help
 	  tracing_map is a special-purpose lock-free map for tracing,
 	  separated out as a stand-alone facility in order to allow it
@@ -539,8 +538,6 @@ config TRACING_MAP
 	  generally used outside of that context, and is normally
 	  selected by tracers that use it.
 
-	  If in doubt, say N.
-
 config MMIOTRACE_TEST
 	tristate "Test module for mmiotrace"
 	depends on MMIOTRACE && m
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 75718255a8ad..1f7eda548787 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -46,7 +46,7 @@ typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
  * That unique element now in the tracing_map_entry's 'val' field is
  * an instance of tracing_map_elt, where 'elt' in the latter part of
  * that variable name is short for 'element'.  The purpose of a
- * tracing_map_elt is to hold values specific to the the particular
+ * tracing_map_elt is to hold values specific to the particular
  * 32-bit hashed key it's assocated with.  Things such as the unique
  * set of aggregated sums associated with the 32-bit hashed key, along
  * with a copy of the full key associated with the entry, and which
-- 
cgit v1.2.3


From 3b772b96b8338bca2532839b2cd7802800e66037 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:41 -0600
Subject: tracing: Update some tracing_map constants and comments

Make it clear exactly how many keys and values are supported through
better defines, and add 1 to the vals count, since normally clients
want support for at least a hitcount and two other values.

Also, note the error return value for tracing_map_add_key/val_field()
in the comments.

Link: http://lkml.kernel.org/r/6696fa02ebc716aa344c27a571a2afaa25e5b4d4.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/tracing_map.c | 4 ++--
 kernel/trace/tracing_map.h | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index dd6401dd145e..e0f172932eca 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -163,7 +163,7 @@ static int tracing_map_add_field(struct tracing_map *map,
  * tracing_map_update_sum() or reading it via tracing_map_read_sum().
  *
  * Return: The index identifying the field in the map and associated
- * tracing_map_elts.
+ * tracing_map_elts, or -EINVAL on error.
  */
 int tracing_map_add_sum_field(struct tracing_map *map)
 {
@@ -184,7 +184,7 @@ int tracing_map_add_sum_field(struct tracing_map *map)
  * the key referenced by this key field resides.
  *
  * Return: The index identifying the field in the map and associated
- * tracing_map_elts.
+ * tracing_map_elts, or -EINVAL on error.
  */
 int tracing_map_add_key_field(struct tracing_map *map,
 			      unsigned int offset,
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 1f7eda548787..618838f5f30a 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -5,9 +5,10 @@
 #define TRACING_MAP_BITS_MAX		17
 #define TRACING_MAP_BITS_MIN		7
 
-#define TRACING_MAP_FIELDS_MAX		4
 #define TRACING_MAP_KEYS_MAX		2
-
+#define TRACING_MAP_VALS_MAX		3
+#define TRACING_MAP_FIELDS_MAX		(TRACING_MAP_KEYS_MAX + \
+					 TRACING_MAP_VALS_MAX)
 #define TRACING_MAP_SORT_KEYS_MAX	2
 
 typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
-- 
cgit v1.2.3


From 7ef224d1d0e3a1ade02d02c01ce1dcffb736d2c3 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:42 -0600
Subject: tracing: Add 'hist' event trigger command

'hist' triggers allow users to continually aggregate trace events,
which can then be viewed afterwards by simply reading a 'hist' file
containing the aggregation in a human-readable format.

The basic idea is very simple and boils down to a mechanism whereby
trace events, rather than being exhaustively dumped in raw form and
viewed directly, are automatically 'compressed' into meaningful tables
completely defined by the user.

This is done strictly via single-line command-line commands and
without the aid of any kind of programming language or interpreter.

A surprising number of typical use cases can be accomplished by users
via this simple mechanism.  In fact, a large number of the tasks that
users typically do using the more complicated script-based tracing
tools, at least during the initial stages of an investigation, can be
accomplished by simply specifying a set of keys and values to be used
in the creation of a hash table.

The Linux kernel trace event subsystem happens to provide an extensive
list of keys and values ready-made for such a purpose in the form of
the event format files associated with each trace event.  By simply
consulting the format file for field names of interest and by plugging
them into the hist trigger command, users can create an endless number
of useful aggregations to help with investigating various properties
of the system.  See Documentation/trace/events.txt for examples.

hist triggers are implemented on top of the existing event trigger
infrastructure, and as such are consistent with the existing triggers
from a user's perspective as well.

The basic syntax follows the existing trigger syntax.  Users start an
aggregation by writing a 'hist' trigger to the event of interest's
trigger file:

  # echo hist:keys=xxx [ if filter] > event/trigger

Once a hist trigger has been set up, by default it continually
aggregates every matching event into a hash table using the event key
and a value field named 'hitcount'.

To view the aggregation at any point in time, simply read the 'hist'
file in the same directory as the 'trigger' file:

  # cat event/hist

The detailed syntax provides additional options for user control, and
is described exhaustively in Documentation/trace/events.txt and in the
virtual tracing/README file in the tracing subsystem.

Link: http://lkml.kernel.org/r/72d263b5e1853fe9c314953b65833c3aa75479f2.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h        |   1 +
 kernel/trace/Kconfig                |  16 +
 kernel/trace/Makefile               |   1 +
 kernel/trace/trace.c                |  17 +
 kernel/trace/trace.h                |   7 +
 kernel/trace/trace_events.c         |   4 +
 kernel/trace/trace_events_hist.c    | 849 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events_trigger.c |   1 +
 8 files changed, 896 insertions(+)
 create mode 100644 kernel/trace/trace_events_hist.c

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 0810f81b6db2..404603720650 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -407,6 +407,7 @@ enum event_trigger_type {
 	ETT_SNAPSHOT		= (1 << 1),
 	ETT_STACKTRACE		= (1 << 2),
 	ETT_EVENT_ENABLE	= (1 << 3),
+	ETT_EVENT_HIST		= (1 << 4),
 };
 
 extern int filter_match_preds(struct event_filter *filter, void *rec);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d39556fd863a..fafeaf803bd0 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -538,6 +538,22 @@ config TRACING_MAP
 	  generally used outside of that context, and is normally
 	  selected by tracers that use it.
 
+config HIST_TRIGGERS
+	bool "Histogram triggers"
+	depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
+	select TRACING_MAP
+	default n
+	help
+	  Hist triggers allow one or more arbitrary trace event fields
+	  to be aggregated into hash tables and dumped to stdout by
+	  reading a debugfs/tracefs file.  They're useful for
+	  gathering quick and dirty (though precise) summaries of
+	  event activity as an initial guide for further investigation
+	  using more advanced tools.
+
+	  See Documentation/trace/events.txt.
+	  If in doubt, say N.
+
 config MMIOTRACE_TEST
 	tristate "Test module for mmiotrace"
 	depends on MMIOTRACE && m
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 4255c4057aaa..979e7bfbde7a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -54,6 +54,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
 obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0d12dbde8399..6cf8fd03b028 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3812,6 +3812,9 @@ static const char readme_msg[] =
 #endif
 #ifdef CONFIG_TRACER_SNAPSHOT
 	"\t\t    snapshot\n"
+#endif
+#ifdef CONFIG_HIST_TRIGGERS
+	"\t\t    hist (see below)\n"
 #endif
 	"\t   example: echo traceoff > events/block/block_unplug/trigger\n"
 	"\t            echo traceoff:3 > events/block/block_unplug/trigger\n"
@@ -3828,6 +3831,20 @@ static const char readme_msg[] =
 	"\t   To remove a trigger with a count:\n"
 	"\t     echo '!<trigger>:0 > <system>/<event>/trigger\n"
 	"\t   Filters can be ignored when removing a trigger.\n"
+#ifdef CONFIG_HIST_TRIGGERS
+	"      hist trigger\t- If set, event hits are aggregated into a hash table\n"
+	"\t    Format: hist:keys=<field1>\n"
+	"\t            [:size=#entries]\n"
+	"\t            [if <filter>]\n\n"
+	"\t    When a matching event is hit, an entry is added to a hash\n"
+	"\t    table using the key named, and the value of a sum called\n"
+	"\t    'hitcount' is incremented.  Keys correspond to fields in the\n"
+	"\t    event's format description.  Keys can be any field.  The\n"
+	"\t    'size' parameter can be  used to specify more or fewer than\n"
+	"\t    the default 2048 entries for the hashtable size.\n\n"
+	"\t    Reading the 'hist' file for the event will dump the hash\n"
+	"\t    table in its entirety to stdout."
+#endif
 ;
 
 static ssize_t
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2525042760e6..505f8a45f426 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1162,6 +1162,13 @@ extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
 
 extern const struct file_operations event_trigger_fops;
+extern const struct file_operations event_hist_fops;
+
+#ifdef CONFIG_HIST_TRIGGERS
+extern int register_trigger_hist_cmd(void);
+#else
+static inline int register_trigger_hist_cmd(void) { return 0; }
+#endif
 
 extern int register_trigger_cmds(void);
 extern void clear_event_triggers(struct trace_array *tr);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index add81dff7520..e7cb983ee93c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2141,6 +2141,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 	trace_create_file("trigger", 0644, file->dir, file,
 			  &event_trigger_fops);
 
+#ifdef CONFIG_HIST_TRIGGERS
+	trace_create_file("hist", 0444, file->dir, file,
+			  &event_hist_fops);
+#endif
 	trace_create_file("format", 0444, file->dir, call,
 			  &ftrace_event_format_fops);
 
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
new file mode 100644
index 000000000000..23b45e462117
--- /dev/null
+++ b/kernel/trace/trace_events_hist.c
@@ -0,0 +1,849 @@
+/*
+ * trace_events_hist - trace event hist triggers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Copyright (C) 2015 Tom Zanussi <tom.zanussi@linux.intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+
+#include "tracing_map.h"
+#include "trace.h"
+
+struct hist_field;
+
+typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
+
+struct hist_field {
+	struct ftrace_event_field	*field;
+	unsigned long			flags;
+	hist_field_fn_t			fn;
+	unsigned int			size;
+};
+
+static u64 hist_field_counter(struct hist_field *field, void *event)
+{
+	return 1;
+}
+
+static u64 hist_field_string(struct hist_field *hist_field, void *event)
+{
+	char *addr = (char *)(event + hist_field->field->offset);
+
+	return (u64)(unsigned long)addr;
+}
+
+#define DEFINE_HIST_FIELD_FN(type)					\
+static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
+{									\
+	type *addr = (type *)(event + hist_field->field->offset);	\
+									\
+	return (u64)*addr;						\
+}
+
+DEFINE_HIST_FIELD_FN(s64);
+DEFINE_HIST_FIELD_FN(u64);
+DEFINE_HIST_FIELD_FN(s32);
+DEFINE_HIST_FIELD_FN(u32);
+DEFINE_HIST_FIELD_FN(s16);
+DEFINE_HIST_FIELD_FN(u16);
+DEFINE_HIST_FIELD_FN(s8);
+DEFINE_HIST_FIELD_FN(u8);
+
+#define for_each_hist_field(i, hist_data)	\
+	for ((i) = 0; (i) < (hist_data)->n_fields; (i)++)
+
+#define for_each_hist_val_field(i, hist_data)	\
+	for ((i) = 0; (i) < (hist_data)->n_vals; (i)++)
+
+#define for_each_hist_key_field(i, hist_data)	\
+	for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++)
+
+#define HITCOUNT_IDX		0
+#define HIST_KEY_MAX		1
+#define HIST_KEY_SIZE_MAX	MAX_FILTER_STR_VAL
+
+enum hist_field_flags {
+	HIST_FIELD_FL_HITCOUNT	= 1,
+	HIST_FIELD_FL_KEY	= 2,
+	HIST_FIELD_FL_STRING	= 4,
+};
+
+struct hist_trigger_attrs {
+	char		*keys_str;
+	unsigned int	map_bits;
+};
+
+struct hist_trigger_data {
+	struct hist_field               *fields[TRACING_MAP_FIELDS_MAX];
+	unsigned int			n_vals;
+	unsigned int			n_keys;
+	unsigned int			n_fields;
+	unsigned int			key_size;
+	struct tracing_map_sort_key	sort_keys[TRACING_MAP_SORT_KEYS_MAX];
+	unsigned int			n_sort_keys;
+	struct trace_event_file		*event_file;
+	struct hist_trigger_attrs	*attrs;
+	struct tracing_map		*map;
+};
+
+static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
+{
+	hist_field_fn_t fn = NULL;
+
+	switch (field_size) {
+	case 8:
+		if (field_is_signed)
+			fn = hist_field_s64;
+		else
+			fn = hist_field_u64;
+		break;
+	case 4:
+		if (field_is_signed)
+			fn = hist_field_s32;
+		else
+			fn = hist_field_u32;
+		break;
+	case 2:
+		if (field_is_signed)
+			fn = hist_field_s16;
+		else
+			fn = hist_field_u16;
+		break;
+	case 1:
+		if (field_is_signed)
+			fn = hist_field_s8;
+		else
+			fn = hist_field_u8;
+		break;
+	}
+
+	return fn;
+}
+
+static int parse_map_size(char *str)
+{
+	unsigned long size, map_bits;
+	int ret;
+
+	strsep(&str, "=");
+	if (!str) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = kstrtoul(str, 0, &size);
+	if (ret)
+		goto out;
+
+	map_bits = ilog2(roundup_pow_of_two(size));
+	if (map_bits < TRACING_MAP_BITS_MIN ||
+	    map_bits > TRACING_MAP_BITS_MAX)
+		ret = -EINVAL;
+	else
+		ret = map_bits;
+ out:
+	return ret;
+}
+
+static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
+{
+	if (!attrs)
+		return;
+
+	kfree(attrs->keys_str);
+	kfree(attrs);
+}
+
+static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
+{
+	struct hist_trigger_attrs *attrs;
+	int ret = 0;
+
+	attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
+	if (!attrs)
+		return ERR_PTR(-ENOMEM);
+
+	while (trigger_str) {
+		char *str = strsep(&trigger_str, ":");
+
+		if ((strncmp(str, "key=", strlen("key=")) == 0) ||
+		    (strncmp(str, "keys=", strlen("keys=")) == 0))
+			attrs->keys_str = kstrdup(str, GFP_KERNEL);
+		else if (strncmp(str, "size=", strlen("size=")) == 0) {
+			int map_bits = parse_map_size(str);
+
+			if (map_bits < 0) {
+				ret = map_bits;
+				goto free;
+			}
+			attrs->map_bits = map_bits;
+		} else {
+			ret = -EINVAL;
+			goto free;
+		}
+	}
+
+	if (!attrs->keys_str) {
+		ret = -EINVAL;
+		goto free;
+	}
+
+	return attrs;
+ free:
+	destroy_hist_trigger_attrs(attrs);
+
+	return ERR_PTR(ret);
+}
+
+static void destroy_hist_field(struct hist_field *hist_field)
+{
+	kfree(hist_field);
+}
+
+static struct hist_field *create_hist_field(struct ftrace_event_field *field,
+					    unsigned long flags)
+{
+	struct hist_field *hist_field;
+
+	if (field && is_function_field(field))
+		return NULL;
+
+	hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
+	if (!hist_field)
+		return NULL;
+
+	if (flags & HIST_FIELD_FL_HITCOUNT) {
+		hist_field->fn = hist_field_counter;
+		goto out;
+	}
+
+	if (is_string_field(field)) {
+		flags |= HIST_FIELD_FL_STRING;
+		hist_field->fn = hist_field_string;
+	} else {
+		hist_field->fn = select_value_fn(field->size,
+						 field->is_signed);
+		if (!hist_field->fn) {
+			destroy_hist_field(hist_field);
+			return NULL;
+		}
+	}
+ out:
+	hist_field->field = field;
+	hist_field->flags = flags;
+
+	return hist_field;
+}
+
+static void destroy_hist_fields(struct hist_trigger_data *hist_data)
+{
+	unsigned int i;
+
+	for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
+		if (hist_data->fields[i]) {
+			destroy_hist_field(hist_data->fields[i]);
+			hist_data->fields[i] = NULL;
+		}
+	}
+}
+
+static int create_hitcount_val(struct hist_trigger_data *hist_data)
+{
+	hist_data->fields[HITCOUNT_IDX] =
+		create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT);
+	if (!hist_data->fields[HITCOUNT_IDX])
+		return -ENOMEM;
+
+	hist_data->n_vals++;
+
+	if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int create_val_fields(struct hist_trigger_data *hist_data,
+			     struct trace_event_file *file)
+{
+	int ret;
+
+	ret = create_hitcount_val(hist_data);
+
+	return ret;
+}
+
+static int create_key_field(struct hist_trigger_data *hist_data,
+			    unsigned int key_idx,
+			    struct trace_event_file *file,
+			    char *field_str)
+{
+	struct ftrace_event_field *field = NULL;
+	unsigned long flags = 0;
+	unsigned int key_size;
+	int ret = 0;
+
+	if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
+		return -EINVAL;
+
+	flags |= HIST_FIELD_FL_KEY;
+
+	field = trace_find_event_field(file->event_call, field_str);
+	if (!field) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	key_size = field->size;
+
+	hist_data->fields[key_idx] = create_hist_field(field, flags);
+	if (!hist_data->fields[key_idx]) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key_size = ALIGN(key_size, sizeof(u64));
+	hist_data->fields[key_idx]->size = key_size;
+	hist_data->key_size = key_size;
+	if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	hist_data->n_keys++;
+
+	if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX))
+		return -EINVAL;
+
+	ret = key_size;
+ out:
+	return ret;
+}
+
+static int create_key_fields(struct hist_trigger_data *hist_data,
+			     struct trace_event_file *file)
+{
+	unsigned int i, n_vals = hist_data->n_vals;
+	char *fields_str, *field_str;
+	int ret = -EINVAL;
+
+	fields_str = hist_data->attrs->keys_str;
+	if (!fields_str)
+		goto out;
+
+	strsep(&fields_str, "=");
+	if (!fields_str)
+		goto out;
+
+	for (i = n_vals; i < n_vals + HIST_KEY_MAX; i++) {
+		field_str = strsep(&fields_str, ",");
+		if (!field_str)
+			break;
+		ret = create_key_field(hist_data, i, file, field_str);
+		if (ret < 0)
+			goto out;
+	}
+	if (fields_str) {
+		ret = -EINVAL;
+		goto out;
+	}
+	ret = 0;
+ out:
+	return ret;
+}
+
+static int create_hist_fields(struct hist_trigger_data *hist_data,
+			      struct trace_event_file *file)
+{
+	int ret;
+
+	ret = create_val_fields(hist_data, file);
+	if (ret)
+		goto out;
+
+	ret = create_key_fields(hist_data, file);
+	if (ret)
+		goto out;
+
+	hist_data->n_fields = hist_data->n_vals + hist_data->n_keys;
+ out:
+	return ret;
+}
+
+static int create_sort_keys(struct hist_trigger_data *hist_data)
+{
+	int ret = 0;
+
+	hist_data->n_sort_keys = 1; /* sort_keys[0] is always hitcount */
+
+	return ret;
+}
+
+static void destroy_hist_data(struct hist_trigger_data *hist_data)
+{
+	destroy_hist_trigger_attrs(hist_data->attrs);
+	destroy_hist_fields(hist_data);
+	tracing_map_destroy(hist_data->map);
+	kfree(hist_data);
+}
+
+static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
+{
+	struct tracing_map *map = hist_data->map;
+	struct ftrace_event_field *field;
+	struct hist_field *hist_field;
+	unsigned int i, idx;
+
+	for_each_hist_field(i, hist_data) {
+		hist_field = hist_data->fields[i];
+		if (hist_field->flags & HIST_FIELD_FL_KEY) {
+			tracing_map_cmp_fn_t cmp_fn;
+
+			field = hist_field->field;
+
+			if (is_string_field(field))
+				cmp_fn = tracing_map_cmp_string;
+			else
+				cmp_fn = tracing_map_cmp_num(field->size,
+							     field->is_signed);
+			idx = tracing_map_add_key_field(map, 0, cmp_fn);
+		} else
+			idx = tracing_map_add_sum_field(map);
+
+		if (idx < 0)
+			return idx;
+	}
+
+	return 0;
+}
+
+static struct hist_trigger_data *
+create_hist_data(unsigned int map_bits,
+		 struct hist_trigger_attrs *attrs,
+		 struct trace_event_file *file)
+{
+	struct hist_trigger_data *hist_data;
+	int ret = 0;
+
+	hist_data = kzalloc(sizeof(*hist_data), GFP_KERNEL);
+	if (!hist_data)
+		return ERR_PTR(-ENOMEM);
+
+	hist_data->attrs = attrs;
+
+	ret = create_hist_fields(hist_data, file);
+	if (ret)
+		goto free;
+
+	ret = create_sort_keys(hist_data);
+	if (ret)
+		goto free;
+
+	hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
+					    NULL, hist_data);
+	if (IS_ERR(hist_data->map)) {
+		ret = PTR_ERR(hist_data->map);
+		hist_data->map = NULL;
+		goto free;
+	}
+
+	ret = create_tracing_map_fields(hist_data);
+	if (ret)
+		goto free;
+
+	ret = tracing_map_init(hist_data->map);
+	if (ret)
+		goto free;
+
+	hist_data->event_file = file;
+ out:
+	return hist_data;
+ free:
+	hist_data->attrs = NULL;
+
+	destroy_hist_data(hist_data);
+
+	hist_data = ERR_PTR(ret);
+
+	goto out;
+}
+
+static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
+				    struct tracing_map_elt *elt,
+				    void *rec)
+{
+	struct hist_field *hist_field;
+	unsigned int i;
+	u64 hist_val;
+
+	for_each_hist_val_field(i, hist_data) {
+		hist_field = hist_data->fields[i];
+		hist_val = hist_field->fn(hist_field, rec);
+		tracing_map_update_sum(elt, i, hist_val);
+	}
+}
+
+static void event_hist_trigger(struct event_trigger_data *data, void *rec)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+	struct hist_field *key_field;
+	struct tracing_map_elt *elt;
+	u64 field_contents;
+	void *key = NULL;
+	unsigned int i;
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+
+		field_contents = key_field->fn(key_field, rec);
+		if (key_field->flags & HIST_FIELD_FL_STRING)
+			key = (void *)(unsigned long)field_contents;
+		else
+			key = (void *)&field_contents;
+	}
+
+	elt = tracing_map_insert(hist_data->map, key);
+	if (elt)
+		hist_trigger_elt_update(hist_data, elt, rec);
+}
+
+static void
+hist_trigger_entry_print(struct seq_file *m,
+			 struct hist_trigger_data *hist_data, void *key,
+			 struct tracing_map_elt *elt)
+{
+	struct hist_field *key_field;
+	unsigned int i;
+	u64 uval;
+
+	seq_puts(m, "{ ");
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+
+		if (i > hist_data->n_vals)
+			seq_puts(m, ", ");
+
+		if (key_field->flags & HIST_FIELD_FL_STRING) {
+			seq_printf(m, "%s: %-50s", key_field->field->name,
+				   (char *)key);
+		} else {
+			uval = *(u64 *)key;
+			seq_printf(m, "%s: %10llu",
+				   key_field->field->name, uval);
+		}
+	}
+
+	seq_puts(m, " }");
+
+	seq_printf(m, " hitcount: %10llu",
+		   tracing_map_read_sum(elt, HITCOUNT_IDX));
+
+	seq_puts(m, "\n");
+}
+
+static int print_entries(struct seq_file *m,
+			 struct hist_trigger_data *hist_data)
+{
+	struct tracing_map_sort_entry **sort_entries = NULL;
+	struct tracing_map *map = hist_data->map;
+	unsigned int i, n_entries;
+
+	n_entries = tracing_map_sort_entries(map, hist_data->sort_keys,
+					     hist_data->n_sort_keys,
+					     &sort_entries);
+	if (n_entries < 0)
+		return n_entries;
+
+	for (i = 0; i < n_entries; i++)
+		hist_trigger_entry_print(m, hist_data,
+					 sort_entries[i]->key,
+					 sort_entries[i]->elt);
+
+	tracing_map_destroy_sort_entries(sort_entries, n_entries);
+
+	return n_entries;
+}
+
+static int hist_show(struct seq_file *m, void *v)
+{
+	struct event_trigger_data *test, *data = NULL;
+	struct trace_event_file *event_file;
+	struct hist_trigger_data *hist_data;
+	int n_entries, ret = 0;
+
+	mutex_lock(&event_mutex);
+
+	event_file = event_file_data(m->private);
+	if (unlikely(!event_file)) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	list_for_each_entry_rcu(test, &event_file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			data = test;
+			break;
+		}
+	}
+	if (!data)
+		goto out_unlock;
+
+	seq_puts(m, "# event histogram\n#\n# trigger info: ");
+	data->ops->print(m, data->ops, data);
+	seq_puts(m, "\n");
+
+	hist_data = data->private_data;
+	n_entries = print_entries(m, hist_data);
+	if (n_entries < 0) {
+		ret = n_entries;
+		n_entries = 0;
+	}
+
+	seq_printf(m, "\nTotals:\n    Hits: %llu\n    Entries: %u\n    Dropped: %llu\n",
+		   (u64)atomic64_read(&hist_data->map->hits),
+		   n_entries, (u64)atomic64_read(&hist_data->map->drops));
+ out_unlock:
+	mutex_unlock(&event_mutex);
+
+	return ret;
+}
+
+static int event_hist_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, hist_show, file);
+}
+
+const struct file_operations event_hist_fops = {
+	.open = event_hist_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
+{
+	seq_printf(m, "%s", hist_field->field->name);
+}
+
+static int event_hist_trigger_print(struct seq_file *m,
+				    struct event_trigger_ops *ops,
+				    struct event_trigger_data *data)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+	struct hist_field *key_field;
+	unsigned int i;
+
+	seq_puts(m, "hist:keys=");
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+
+		if (i > hist_data->n_vals)
+			seq_puts(m, ",");
+
+		hist_field_print(m, key_field);
+	}
+
+	seq_puts(m, ":vals=");
+	seq_puts(m, "hitcount");
+
+	seq_puts(m, ":sort=");
+	seq_puts(m, "hitcount");
+
+	seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
+
+	if (data->filter_str)
+		seq_printf(m, " if %s", data->filter_str);
+
+	seq_puts(m, " [active]");
+
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+static void event_hist_trigger_free(struct event_trigger_ops *ops,
+				    struct event_trigger_data *data)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+
+	if (WARN_ON_ONCE(data->ref <= 0))
+		return;
+
+	data->ref--;
+	if (!data->ref) {
+		trigger_data_free(data);
+		destroy_hist_data(hist_data);
+	}
+}
+
+static struct event_trigger_ops event_hist_trigger_ops = {
+	.func			= event_hist_trigger,
+	.print			= event_hist_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_hist_trigger_free,
+};
+
+static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
+							    char *param)
+{
+	return &event_hist_trigger_ops;
+}
+
+static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
+				 struct event_trigger_data *data,
+				 struct trace_event_file *file)
+{
+	struct event_trigger_data *test;
+	int ret = 0;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			ret = -EEXIST;
+			goto out;
+		}
+	}
+
+	if (data->ops->init) {
+		ret = data->ops->init(data->ops, data);
+		if (ret < 0)
+			goto out;
+	}
+
+	list_add_rcu(&data->list, &file->triggers);
+	ret++;
+
+	update_cond_flag(file);
+	if (trace_event_trigger_enable_disable(file, 1) < 0) {
+		list_del_rcu(&data->list);
+		update_cond_flag(file);
+		ret--;
+	}
+ out:
+	return ret;
+}
+
+static int event_hist_trigger_func(struct event_command *cmd_ops,
+				   struct trace_event_file *file,
+				   char *glob, char *cmd, char *param)
+{
+	unsigned int hist_trigger_bits = TRACING_MAP_BITS_DEFAULT;
+	struct event_trigger_data *trigger_data;
+	struct hist_trigger_attrs *attrs;
+	struct event_trigger_ops *trigger_ops;
+	struct hist_trigger_data *hist_data;
+	char *trigger;
+	int ret = 0;
+
+	if (!param)
+		return -EINVAL;
+
+	/* separate the trigger from the filter (k:v [if filter]) */
+	trigger = strsep(&param, " \t");
+	if (!trigger)
+		return -EINVAL;
+
+	attrs = parse_hist_trigger_attrs(trigger);
+	if (IS_ERR(attrs))
+		return PTR_ERR(attrs);
+
+	if (attrs->map_bits)
+		hist_trigger_bits = attrs->map_bits;
+
+	hist_data = create_hist_data(hist_trigger_bits, attrs, file);
+	if (IS_ERR(hist_data)) {
+		destroy_hist_trigger_attrs(attrs);
+		return PTR_ERR(hist_data);
+	}
+
+	trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
+
+	ret = -ENOMEM;
+	trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
+	if (!trigger_data)
+		goto out_free;
+
+	trigger_data->count = -1;
+	trigger_data->ops = trigger_ops;
+	trigger_data->cmd_ops = cmd_ops;
+
+	INIT_LIST_HEAD(&trigger_data->list);
+	RCU_INIT_POINTER(trigger_data->filter, NULL);
+
+	trigger_data->private_data = hist_data;
+
+	if (glob[0] == '!') {
+		cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
+		ret = 0;
+		goto out_free;
+	}
+
+	if (!param) /* if param is non-empty, it's supposed to be a filter */
+		goto out_reg;
+
+	if (!cmd_ops->set_filter)
+		goto out_reg;
+
+	ret = cmd_ops->set_filter(param, trigger_data, file);
+	if (ret < 0)
+		goto out_free;
+ out_reg:
+	ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
+	/*
+	 * The above returns on success the # of triggers registered,
+	 * but if it didn't register any it returns zero.  Consider no
+	 * triggers registered a failure too.
+	 */
+	if (!ret) {
+		ret = -ENOENT;
+		goto out_free;
+	} else if (ret < 0)
+		goto out_free;
+	/* Just return zero, not the number of registered triggers */
+	ret = 0;
+ out:
+	return ret;
+ out_free:
+	if (cmd_ops->set_filter)
+		cmd_ops->set_filter(NULL, trigger_data, NULL);
+
+	kfree(trigger_data);
+
+	destroy_hist_data(hist_data);
+	goto out;
+}
+
+static struct event_command trigger_hist_cmd = {
+	.name			= "hist",
+	.trigger_type		= ETT_EVENT_HIST,
+	.flags			= EVENT_CMD_FL_NEEDS_REC,
+	.func			= event_hist_trigger_func,
+	.reg			= hist_register_trigger,
+	.unreg			= unregister_trigger,
+	.get_trigger_ops	= event_hist_get_trigger_ops,
+	.set_filter		= set_trigger_filter,
+};
+
+__init int register_trigger_hist_cmd(void)
+{
+	int ret;
+
+	ret = register_event_command(&trigger_hist_cmd);
+	WARN_ON(ret < 0);
+
+	return ret;
+}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d67992f3bb0e..d29092afe005 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1447,6 +1447,7 @@ __init int register_trigger_cmds(void)
 	register_trigger_snapshot_cmd();
 	register_trigger_stacktrace_cmd();
 	register_trigger_enable_disable_cmds();
+	register_trigger_hist_cmd();
 
 	return 0;
 }
-- 
cgit v1.2.3


From f2606835d70d2a2e6a134f01821da8149e124796 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:43 -0600
Subject: tracing: Add hist trigger support for multiple values ('vals=' param)

Allow users to specify trace event fields to use in aggregated sums
via a new 'vals=' keyword.  Before this addition, the only aggregated
sum supported was the implied value 'hitcount'.  With this addition,
'hitcount' is also supported as an explicit value field, as is any
numeric trace event field.

This expands the hist trigger syntax from this:

  # echo hist:keys=xxx [ if filter] > event/trigger

to this:

  # echo hist:keys=xxx:vals=yyy [ if filter] > event/trigger

Link: http://lkml.kernel.org/r/2a5d1adb5ba6c65d7bb2148e379f2fed47f29a68.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             | 12 +++---
 kernel/trace/trace_events_hist.c | 79 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 85 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6cf8fd03b028..bb62f54c5480 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3834,14 +3834,16 @@ static const char readme_msg[] =
 #ifdef CONFIG_HIST_TRIGGERS
 	"      hist trigger\t- If set, event hits are aggregated into a hash table\n"
 	"\t    Format: hist:keys=<field1>\n"
+	"\t            [:values=<field1[,field2,...]>]\n"
 	"\t            [:size=#entries]\n"
 	"\t            [if <filter>]\n\n"
 	"\t    When a matching event is hit, an entry is added to a hash\n"
-	"\t    table using the key named, and the value of a sum called\n"
-	"\t    'hitcount' is incremented.  Keys correspond to fields in the\n"
-	"\t    event's format description.  Keys can be any field.  The\n"
-	"\t    'size' parameter can be  used to specify more or fewer than\n"
-	"\t    the default 2048 entries for the hashtable size.\n\n"
+	"\t    table using the key(s) and value(s) named, and the value of a\n"
+	"\t    sum called 'hitcount' is incremented.  Keys and values\n"
+	"\t    correspond to fields in the event's format description.  Keys\n"
+	"\t    can be any field.  Values must correspond to numeric fields.\n"
+	"\t    The 'size' parameter can be  used to specify more or fewer\n"
+	"\t    than the default 2048 entries for the hashtable size.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
 	"\t    table in its entirety to stdout."
 #endif
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 23b45e462117..1b33590b9c8f 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -84,6 +84,7 @@ enum hist_field_flags {
 
 struct hist_trigger_attrs {
 	char		*keys_str;
+	char		*vals_str;
 	unsigned int	map_bits;
 };
 
@@ -165,6 +166,7 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
 		return;
 
 	kfree(attrs->keys_str);
+	kfree(attrs->vals_str);
 	kfree(attrs);
 }
 
@@ -183,6 +185,10 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 		if ((strncmp(str, "key=", strlen("key=")) == 0) ||
 		    (strncmp(str, "keys=", strlen("keys=")) == 0))
 			attrs->keys_str = kstrdup(str, GFP_KERNEL);
+		else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
+			 (strncmp(str, "vals=", strlen("vals=")) == 0) ||
+			 (strncmp(str, "values=", strlen("values=")) == 0))
+			attrs->vals_str = kstrdup(str, GFP_KERNEL);
 		else if (strncmp(str, "size=", strlen("size=")) == 0) {
 			int map_bits = parse_map_size(str);
 
@@ -276,13 +282,70 @@ static int create_hitcount_val(struct hist_trigger_data *hist_data)
 	return 0;
 }
 
+static int create_val_field(struct hist_trigger_data *hist_data,
+			    unsigned int val_idx,
+			    struct trace_event_file *file,
+			    char *field_str)
+{
+	struct ftrace_event_field *field = NULL;
+	unsigned long flags = 0;
+	int ret = 0;
+
+	if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
+		return -EINVAL;
+	field = trace_find_event_field(file->event_call, field_str);
+	if (!field) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	hist_data->fields[val_idx] = create_hist_field(field, flags);
+	if (!hist_data->fields[val_idx]) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	++hist_data->n_vals;
+
+	if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
+		ret = -EINVAL;
+ out:
+	return ret;
+}
+
 static int create_val_fields(struct hist_trigger_data *hist_data,
 			     struct trace_event_file *file)
 {
+	char *fields_str, *field_str;
+	unsigned int i, j;
 	int ret;
 
 	ret = create_hitcount_val(hist_data);
+	if (ret)
+		goto out;
 
+	fields_str = hist_data->attrs->vals_str;
+	if (!fields_str)
+		goto out;
+
+	strsep(&fields_str, "=");
+	if (!fields_str)
+		goto out;
+
+	for (i = 0, j = 1; i < TRACING_MAP_VALS_MAX &&
+		     j < TRACING_MAP_VALS_MAX; i++) {
+		field_str = strsep(&fields_str, ",");
+		if (!field_str)
+			break;
+		if (strcmp(field_str, "hitcount") == 0)
+			continue;
+		ret = create_val_field(hist_data, j++, file, field_str);
+		if (ret)
+			goto out;
+	}
+	if (fields_str && (strcmp(fields_str, "hitcount") != 0))
+		ret = -EINVAL;
+ out:
 	return ret;
 }
 
@@ -552,6 +615,12 @@ hist_trigger_entry_print(struct seq_file *m,
 	seq_printf(m, " hitcount: %10llu",
 		   tracing_map_read_sum(elt, HITCOUNT_IDX));
 
+	for (i = 1; i < hist_data->n_vals; i++) {
+		seq_printf(m, "  %s: %10llu",
+			   hist_data->fields[i]->field->name,
+			   tracing_map_read_sum(elt, i));
+	}
+
 	seq_puts(m, "\n");
 }
 
@@ -659,7 +728,15 @@ static int event_hist_trigger_print(struct seq_file *m,
 	}
 
 	seq_puts(m, ":vals=");
-	seq_puts(m, "hitcount");
+
+	for_each_hist_val_field(i, hist_data) {
+		if (i == HITCOUNT_IDX)
+			seq_puts(m, "hitcount");
+		else {
+			seq_puts(m, ",");
+			hist_field_print(m, hist_data->fields[i]);
+		}
+	}
 
 	seq_puts(m, ":sort=");
 	seq_puts(m, "hitcount");
-- 
cgit v1.2.3


From 76a3b0c8ac344e1d0f436160cbb59b670b086947 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:44 -0600
Subject: tracing: Add hist trigger support for compound keys

Allow users to specify multiple trace event fields to use in keys by
allowing multiple fields in the 'keys=' keyword.  With this addition,
any unique combination of any of the fields named in the 'keys'
keyword will result in a new entry being added to the hash table.

Link: http://lkml.kernel.org/r/0cfa24e6ac3b0dcece7737d94aa1f322ae3afc4b.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             | 10 ++++++----
 kernel/trace/trace_events_hist.c | 41 +++++++++++++++++++++++++++++-----------
 2 files changed, 36 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bb62f54c5480..7a4c4dcf0bfe 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3833,7 +3833,7 @@ static const char readme_msg[] =
 	"\t   Filters can be ignored when removing a trigger.\n"
 #ifdef CONFIG_HIST_TRIGGERS
 	"      hist trigger\t- If set, event hits are aggregated into a hash table\n"
-	"\t    Format: hist:keys=<field1>\n"
+	"\t    Format: hist:keys=<field1[,field2,...]>\n"
 	"\t            [:values=<field1[,field2,...]>]\n"
 	"\t            [:size=#entries]\n"
 	"\t            [if <filter>]\n\n"
@@ -3841,9 +3841,11 @@ static const char readme_msg[] =
 	"\t    table using the key(s) and value(s) named, and the value of a\n"
 	"\t    sum called 'hitcount' is incremented.  Keys and values\n"
 	"\t    correspond to fields in the event's format description.  Keys\n"
-	"\t    can be any field.  Values must correspond to numeric fields.\n"
-	"\t    The 'size' parameter can be  used to specify more or fewer\n"
-	"\t    than the default 2048 entries for the hashtable size.\n\n"
+	"\t    can be any field.  Compound keys consisting of up to two\n"
+	"\t    fields can be specified by the 'keys' keyword.  Values must\n"
+	"\t    correspond to numeric fields.  The 'size' parameter can be\n"
+	"\t    used to specify more or fewer than the default 2048 entries\n"
+	"\t    for the hashtable size.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
 	"\t    table in its entirety to stdout."
 #endif
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1b33590b9c8f..65fdfc6cb633 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -32,6 +32,7 @@ struct hist_field {
 	unsigned long			flags;
 	hist_field_fn_t			fn;
 	unsigned int			size;
+	unsigned int			offset;
 };
 
 static u64 hist_field_counter(struct hist_field *field, void *event)
@@ -73,8 +74,7 @@ DEFINE_HIST_FIELD_FN(u8);
 	for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++)
 
 #define HITCOUNT_IDX		0
-#define HIST_KEY_MAX		1
-#define HIST_KEY_SIZE_MAX	MAX_FILTER_STR_VAL
+#define HIST_KEY_SIZE_MAX	(MAX_FILTER_STR_VAL + sizeof(u64))
 
 enum hist_field_flags {
 	HIST_FIELD_FL_HITCOUNT	= 1,
@@ -351,6 +351,7 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
 
 static int create_key_field(struct hist_trigger_data *hist_data,
 			    unsigned int key_idx,
+			    unsigned int key_offset,
 			    struct trace_event_file *file,
 			    char *field_str)
 {
@@ -380,7 +381,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 
 	key_size = ALIGN(key_size, sizeof(u64));
 	hist_data->fields[key_idx]->size = key_size;
-	hist_data->key_size = key_size;
+	hist_data->fields[key_idx]->offset = key_offset;
+	hist_data->key_size += key_size;
 	if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
 		ret = -EINVAL;
 		goto out;
@@ -399,7 +401,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 static int create_key_fields(struct hist_trigger_data *hist_data,
 			     struct trace_event_file *file)
 {
-	unsigned int i, n_vals = hist_data->n_vals;
+	unsigned int i, key_offset = 0, n_vals = hist_data->n_vals;
 	char *fields_str, *field_str;
 	int ret = -EINVAL;
 
@@ -411,13 +413,15 @@ static int create_key_fields(struct hist_trigger_data *hist_data,
 	if (!fields_str)
 		goto out;
 
-	for (i = n_vals; i < n_vals + HIST_KEY_MAX; i++) {
+	for (i = n_vals; i < n_vals + TRACING_MAP_KEYS_MAX; i++) {
 		field_str = strsep(&fields_str, ",");
 		if (!field_str)
 			break;
-		ret = create_key_field(hist_data, i, file, field_str);
+		ret = create_key_field(hist_data, i, key_offset,
+				       file, field_str);
 		if (ret < 0)
 			goto out;
+		key_offset += ret;
 	}
 	if (fields_str) {
 		ret = -EINVAL;
@@ -482,7 +486,10 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
 			else
 				cmp_fn = tracing_map_cmp_num(field->size,
 							     field->is_signed);
-			idx = tracing_map_add_key_field(map, 0, cmp_fn);
+			idx = tracing_map_add_key_field(map,
+							hist_field->offset,
+							cmp_fn);
+
 		} else
 			idx = tracing_map_add_sum_field(map);
 
@@ -562,12 +569,16 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
 static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
+	char compound_key[HIST_KEY_SIZE_MAX];
 	struct hist_field *key_field;
 	struct tracing_map_elt *elt;
 	u64 field_contents;
 	void *key = NULL;
 	unsigned int i;
 
+	if (hist_data->n_keys > 1)
+		memset(compound_key, 0, hist_data->key_size);
+
 	for_each_hist_key_field(i, hist_data) {
 		key_field = hist_data->fields[i];
 
@@ -576,8 +587,16 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 			key = (void *)(unsigned long)field_contents;
 		else
 			key = (void *)&field_contents;
+
+		if (hist_data->n_keys > 1) {
+			memcpy(compound_key + key_field->offset, key,
+			       key_field->size);
+		}
 	}
 
+	if (hist_data->n_keys > 1)
+		key = compound_key;
+
 	elt = tracing_map_insert(hist_data->map, key);
 	if (elt)
 		hist_trigger_elt_update(hist_data, elt, rec);
@@ -602,11 +621,11 @@ hist_trigger_entry_print(struct seq_file *m,
 
 		if (key_field->flags & HIST_FIELD_FL_STRING) {
 			seq_printf(m, "%s: %-50s", key_field->field->name,
-				   (char *)key);
+				   (char *)(key + key_field->offset));
 		} else {
-			uval = *(u64 *)key;
-			seq_printf(m, "%s: %10llu",
-				   key_field->field->name, uval);
+			uval = *(u64 *)(key + key_field->offset);
+			seq_printf(m, "%s: %10llu", key_field->field->name,
+				   uval);
 		}
 	}
 
-- 
cgit v1.2.3


From e62347d2453474fa514fbbbc4636313d34d3c850 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:45 -0600
Subject: tracing: Add hist trigger support for user-defined sorting ('sort='
 param)

Allow users to specify keys and/or values to sort on.  With this
addition, keys and values specified using the 'keys=' and 'vals='
keywords can be used to sort the hist trigger output via a new 'sort='
keyword.  If multiple sort keys are specified, the output will be
sorted using the second key as a secondary sort key, etc.  The default
sort order is ascending; if the user wants a different sort order,
'.descending' can be appended to the specific sort key.  Before this
addition, output was always sorted by 'hitcount' in ascending order.

This expands the hist trigger syntax from this:

    # echo hist:keys=xxx:vals=yyy \
          [ if filter] > event/trigger

to this:

    # echo hist:keys=xxx:vals=yyy:sort=zzz.descending \
          [ if filter] > event/trigger

Link: http://lkml.kernel.org/r/b30a41db66ba486979c4f987aff5fab500ea53b3.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |   6 ++-
 kernel/trace/trace_events_hist.c | 112 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 114 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7a4c4dcf0bfe..d16fa8e1cdbf 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3835,6 +3835,7 @@ static const char readme_msg[] =
 	"      hist trigger\t- If set, event hits are aggregated into a hash table\n"
 	"\t    Format: hist:keys=<field1[,field2,...]>\n"
 	"\t            [:values=<field1[,field2,...]>]\n"
+	"\t            [:sort=<field1[,field2,...]>]\n"
 	"\t            [:size=#entries]\n"
 	"\t            [if <filter>]\n\n"
 	"\t    When a matching event is hit, an entry is added to a hash\n"
@@ -3843,7 +3844,10 @@ static const char readme_msg[] =
 	"\t    correspond to fields in the event's format description.  Keys\n"
 	"\t    can be any field.  Compound keys consisting of up to two\n"
 	"\t    fields can be specified by the 'keys' keyword.  Values must\n"
-	"\t    correspond to numeric fields.  The 'size' parameter can be\n"
+	"\t    correspond to numeric fields.  Sort keys consisting of up to\n"
+	"\t    two fields can be specified using the 'sort' keyword.  The\n"
+	"\t    sort direction can be modified by appending '.descending' or\n"
+	"\t    '.ascending' to a sort field.  The 'size' parameter can be\n"
 	"\t    used to specify more or fewer than the default 2048 entries\n"
 	"\t    for the hashtable size.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 65fdfc6cb633..03ba4530c08c 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -85,6 +85,7 @@ enum hist_field_flags {
 struct hist_trigger_attrs {
 	char		*keys_str;
 	char		*vals_str;
+	char		*sort_key_str;
 	unsigned int	map_bits;
 };
 
@@ -165,6 +166,7 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
 	if (!attrs)
 		return;
 
+	kfree(attrs->sort_key_str);
 	kfree(attrs->keys_str);
 	kfree(attrs->vals_str);
 	kfree(attrs);
@@ -189,6 +191,8 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 			 (strncmp(str, "vals=", strlen("vals=")) == 0) ||
 			 (strncmp(str, "values=", strlen("values=")) == 0))
 			attrs->vals_str = kstrdup(str, GFP_KERNEL);
+		else if (strncmp(str, "sort=", strlen("sort=")) == 0)
+			attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
 		else if (strncmp(str, "size=", strlen("size=")) == 0) {
 			int map_bits = parse_map_size(str);
 
@@ -450,12 +454,92 @@ static int create_hist_fields(struct hist_trigger_data *hist_data,
 	return ret;
 }
 
+static int is_descending(const char *str)
+{
+	if (!str)
+		return 0;
+
+	if (strcmp(str, "descending") == 0)
+		return 1;
+
+	if (strcmp(str, "ascending") == 0)
+		return 0;
+
+	return -EINVAL;
+}
+
 static int create_sort_keys(struct hist_trigger_data *hist_data)
 {
-	int ret = 0;
+	char *fields_str = hist_data->attrs->sort_key_str;
+	struct ftrace_event_field *field = NULL;
+	struct tracing_map_sort_key *sort_key;
+	int descending, ret = 0;
+	unsigned int i, j;
+
+	hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */
+
+	if (!fields_str)
+		goto out;
+
+	strsep(&fields_str, "=");
+	if (!fields_str) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
+		char *field_str, *field_name;
+
+		sort_key = &hist_data->sort_keys[i];
+
+		field_str = strsep(&fields_str, ",");
+		if (!field_str) {
+			if (i == 0)
+				ret = -EINVAL;
+			break;
+		}
+
+		if ((i == TRACING_MAP_SORT_KEYS_MAX - 1) && fields_str) {
+			ret = -EINVAL;
+			break;
+		}
 
-	hist_data->n_sort_keys = 1; /* sort_keys[0] is always hitcount */
+		field_name = strsep(&field_str, ".");
+		if (!field_name) {
+			ret = -EINVAL;
+			break;
+		}
+
+		if (strcmp(field_name, "hitcount") == 0) {
+			descending = is_descending(field_str);
+			if (descending < 0) {
+				ret = descending;
+				break;
+			}
+			sort_key->descending = descending;
+			continue;
+		}
 
+		for (j = 1; j < hist_data->n_fields; j++) {
+			field = hist_data->fields[j]->field;
+			if (field && (strcmp(field_name, field->name) == 0)) {
+				sort_key->field_idx = j;
+				descending = is_descending(field_str);
+				if (descending < 0) {
+					ret = descending;
+					goto out;
+				}
+				sort_key->descending = descending;
+				break;
+			}
+		}
+		if (j == hist_data->n_fields) {
+			ret = -EINVAL;
+			break;
+		}
+	}
+	hist_data->n_sort_keys = i;
+ out:
 	return ret;
 }
 
@@ -758,7 +842,29 @@ static int event_hist_trigger_print(struct seq_file *m,
 	}
 
 	seq_puts(m, ":sort=");
-	seq_puts(m, "hitcount");
+
+	for (i = 0; i < hist_data->n_sort_keys; i++) {
+		struct tracing_map_sort_key *sort_key;
+
+		sort_key = &hist_data->sort_keys[i];
+
+		if (i > 0)
+			seq_puts(m, ",");
+
+		if (sort_key->field_idx == HITCOUNT_IDX)
+			seq_puts(m, "hitcount");
+		else {
+			unsigned int idx = sort_key->field_idx;
+
+			if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX))
+				return -EINVAL;
+
+			hist_field_print(m, hist_data->fields[idx]);
+		}
+
+		if (sort_key->descending)
+			seq_puts(m, ".descending");
+	}
 
 	seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
 
-- 
cgit v1.2.3


From 83e99914c9e2677d8a80f2a23eca0d215d5bfb0f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:46 -0600
Subject: tracing: Add hist trigger support for pausing and continuing a trace

Allow users to append 'pause' or 'continue' to an existing trigger in
order to have it paused or to have a paused trace continue.

This expands the hist trigger syntax from this:
    # echo hist:keys=xxx:vals=yyy:sort=zzz.descending \
          [ if filter] >> event/trigger

to this:

    # echo hist:keys=xxx:vals=yyy:sort=zzz.descending:pause or cont \
          [ if filter] >> event/trigger

Link: http://lkml.kernel.org/r/b672a92c14702cb924cdf6fc27ea1809bed04907.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  7 ++++++-
 kernel/trace/trace_events_hist.c | 31 ++++++++++++++++++++++++++++---
 2 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d16fa8e1cdbf..7d5c63dd410a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3837,6 +3837,7 @@ static const char readme_msg[] =
 	"\t            [:values=<field1[,field2,...]>]\n"
 	"\t            [:sort=<field1[,field2,...]>]\n"
 	"\t            [:size=#entries]\n"
+	"\t            [:pause][:continue]\n"
 	"\t            [if <filter>]\n\n"
 	"\t    When a matching event is hit, an entry is added to a hash\n"
 	"\t    table using the key(s) and value(s) named, and the value of a\n"
@@ -3851,7 +3852,11 @@ static const char readme_msg[] =
 	"\t    used to specify more or fewer than the default 2048 entries\n"
 	"\t    for the hashtable size.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
-	"\t    table in its entirety to stdout."
+	"\t    table in its entirety to stdout.\n\n"
+	"\t    The 'pause' parameter can be used to pause an existing hist\n"
+	"\t    trigger or to start a hist trigger but not log any events\n"
+	"\t    until told to do so.  'continue' can be used to start or\n"
+	"\t    restart a paused hist trigger.\n\n"
 #endif
 ;
 
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 03ba4530c08c..e3fa9c89f125 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -86,6 +86,8 @@ struct hist_trigger_attrs {
 	char		*keys_str;
 	char		*vals_str;
 	char		*sort_key_str;
+	bool		pause;
+	bool		cont;
 	unsigned int	map_bits;
 };
 
@@ -193,6 +195,11 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 			attrs->vals_str = kstrdup(str, GFP_KERNEL);
 		else if (strncmp(str, "sort=", strlen("sort=")) == 0)
 			attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
+		else if (strcmp(str, "pause") == 0)
+			attrs->pause = true;
+		else if ((strcmp(str, "cont") == 0) ||
+			 (strcmp(str, "continue") == 0))
+			attrs->cont = true;
 		else if (strncmp(str, "size=", strlen("size=")) == 0) {
 			int map_bits = parse_map_size(str);
 
@@ -871,7 +878,10 @@ static int event_hist_trigger_print(struct seq_file *m,
 	if (data->filter_str)
 		seq_printf(m, " if %s", data->filter_str);
 
-	seq_puts(m, " [active]");
+	if (data->paused)
+		seq_puts(m, " [paused]");
+	else
+		seq_puts(m, " [active]");
 
 	seq_putc(m, '\n');
 
@@ -910,16 +920,30 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 				 struct event_trigger_data *data,
 				 struct trace_event_file *file)
 {
+	struct hist_trigger_data *hist_data = data->private_data;
 	struct event_trigger_data *test;
 	int ret = 0;
 
 	list_for_each_entry_rcu(test, &file->triggers, list) {
 		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
-			ret = -EEXIST;
+			if (hist_data->attrs->pause)
+				test->paused = true;
+			else if (hist_data->attrs->cont)
+				test->paused = false;
+			else
+				ret = -EEXIST;
 			goto out;
 		}
 	}
 
+	if (hist_data->attrs->cont) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	if (hist_data->attrs->pause)
+		data->paused = true;
+
 	if (data->ops->init) {
 		ret = data->ops->init(data->ops, data);
 		if (ret < 0)
@@ -1011,7 +1035,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
 	 * triggers registered a failure too.
 	 */
 	if (!ret) {
-		ret = -ENOENT;
+		if (!(attrs->pause || attrs->cont))
+			ret = -ENOENT;
 		goto out_free;
 	} else if (ret < 0)
 		goto out_free;
-- 
cgit v1.2.3


From e86ae9baacfa9efe957df4fc037f079772636d76 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:47 -0600
Subject: tracing: Add hist trigger support for clearing a trace

Allow users to append 'clear' to an existing trigger in order to have
the hash table cleared.

This expands the hist trigger syntax from this:
    # echo hist:keys=xxx:vals=yyy:sort=zzz.descending:pause/cont \
           [ if filter] >> event/trigger

to this:

    # echo hist:keys=xxx:vals=yyy:sort=zzz.descending:pause/cont/clear \
          [ if filter] >> event/trigger

Link: http://lkml.kernel.org/r/ae15dd0d9b2f7af07a37c1ff682063e2dbcdf160.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  5 ++++-
 kernel/trace/trace_events_hist.c | 24 ++++++++++++++++++++++--
 2 files changed, 26 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7d5c63dd410a..4097cd3763f7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3837,7 +3837,7 @@ static const char readme_msg[] =
 	"\t            [:values=<field1[,field2,...]>]\n"
 	"\t            [:sort=<field1[,field2,...]>]\n"
 	"\t            [:size=#entries]\n"
-	"\t            [:pause][:continue]\n"
+	"\t            [:pause][:continue][:clear]\n"
 	"\t            [if <filter>]\n\n"
 	"\t    When a matching event is hit, an entry is added to a hash\n"
 	"\t    table using the key(s) and value(s) named, and the value of a\n"
@@ -3857,6 +3857,9 @@ static const char readme_msg[] =
 	"\t    trigger or to start a hist trigger but not log any events\n"
 	"\t    until told to do so.  'continue' can be used to start or\n"
 	"\t    restart a paused hist trigger.\n\n"
+	"\t    The 'clear' parameter will clear the contents of a running\n"
+	"\t    hist trigger and leave its current paused/active state\n"
+	"\t    unchanged.\n\n"
 #endif
 ;
 
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index e3fa9c89f125..83cb25e067ad 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -88,6 +88,7 @@ struct hist_trigger_attrs {
 	char		*sort_key_str;
 	bool		pause;
 	bool		cont;
+	bool		clear;
 	unsigned int	map_bits;
 };
 
@@ -200,6 +201,8 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 		else if ((strcmp(str, "cont") == 0) ||
 			 (strcmp(str, "continue") == 0))
 			attrs->cont = true;
+		else if (strcmp(str, "clear") == 0)
+			attrs->clear = true;
 		else if (strncmp(str, "size=", strlen("size=")) == 0) {
 			int map_bits = parse_map_size(str);
 
@@ -916,6 +919,21 @@ static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
 	return &event_hist_trigger_ops;
 }
 
+static void hist_clear(struct event_trigger_data *data)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+	bool paused;
+
+	paused = data->paused;
+	data->paused = true;
+
+	synchronize_sched();
+
+	tracing_map_clear(hist_data->map);
+
+	data->paused = paused;
+}
+
 static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 				 struct event_trigger_data *data,
 				 struct trace_event_file *file)
@@ -930,13 +948,15 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 				test->paused = true;
 			else if (hist_data->attrs->cont)
 				test->paused = false;
+			else if (hist_data->attrs->clear)
+				hist_clear(test);
 			else
 				ret = -EEXIST;
 			goto out;
 		}
 	}
 
-	if (hist_data->attrs->cont) {
+	if (hist_data->attrs->cont || hist_data->attrs->clear) {
 		ret = -ENOENT;
 		goto out;
 	}
@@ -1035,7 +1055,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
 	 * triggers registered a failure too.
 	 */
 	if (!ret) {
-		if (!(attrs->pause || attrs->cont))
+		if (!(attrs->pause || attrs->cont || attrs->clear))
 			ret = -ENOENT;
 		goto out_free;
 	} else if (ret < 0)
-- 
cgit v1.2.3


From 0c4a6b4666e8eb86dead3f09b40bb8ca4f614e4f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:48 -0600
Subject: tracing: Add hist trigger 'hex' modifier for displaying numeric
 fields

Allow users to have numeric fields displayed as hex values in the
output by appending '.hex' to field names:

   # echo hist:keys=aaa,bbb.hex:vals=ccc.hex ... \
              [ if filter] > event/trigger

Link: http://lkml.kernel.org/r/67bd431edda2af5798d7694818f7e8d71b6b3463.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  5 +++-
 kernel/trace/trace_events_hist.c | 62 ++++++++++++++++++++++++++++++++++++----
 2 files changed, 60 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4097cd3763f7..2eb2eafbe7f5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3852,7 +3852,10 @@ static const char readme_msg[] =
 	"\t    used to specify more or fewer than the default 2048 entries\n"
 	"\t    for the hashtable size.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
-	"\t    table in its entirety to stdout.\n\n"
+	"\t    table in its entirety to stdout.  The default format used to\n"
+	"\t    display a given field can be modified by appending any of the\n"
+	"\t    following modifiers to the field name, as applicable:\n\n"
+	"\t            .hex        display a number as a hex value\n\n"
 	"\t    The 'pause' parameter can be used to pause an existing hist\n"
 	"\t    trigger or to start a hist trigger but not log any events\n"
 	"\t    until told to do so.  'continue' can be used to start or\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 83cb25e067ad..8e49eeef8867 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -80,6 +80,7 @@ enum hist_field_flags {
 	HIST_FIELD_FL_HITCOUNT	= 1,
 	HIST_FIELD_FL_KEY	= 2,
 	HIST_FIELD_FL_STRING	= 4,
+	HIST_FIELD_FL_HEX	= 8,
 };
 
 struct hist_trigger_attrs {
@@ -303,11 +304,23 @@ static int create_val_field(struct hist_trigger_data *hist_data,
 {
 	struct ftrace_event_field *field = NULL;
 	unsigned long flags = 0;
+	char *field_name;
 	int ret = 0;
 
 	if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
 		return -EINVAL;
-	field = trace_find_event_field(file->event_call, field_str);
+
+	field_name = strsep(&field_str, ".");
+	if (field_str) {
+		if (strcmp(field_str, "hex") == 0)
+			flags |= HIST_FIELD_FL_HEX;
+		else {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	field = trace_find_event_field(file->event_call, field_name);
 	if (!field) {
 		ret = -EINVAL;
 		goto out;
@@ -372,6 +385,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 	struct ftrace_event_field *field = NULL;
 	unsigned long flags = 0;
 	unsigned int key_size;
+	char *field_name;
 	int ret = 0;
 
 	if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
@@ -379,7 +393,17 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 
 	flags |= HIST_FIELD_FL_KEY;
 
-	field = trace_find_event_field(file->event_call, field_str);
+	field_name = strsep(&field_str, ".");
+	if (field_str) {
+		if (strcmp(field_str, "hex") == 0)
+			flags |= HIST_FIELD_FL_HEX;
+		else {
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	field = trace_find_event_field(file->event_call, field_name);
 	if (!field) {
 		ret = -EINVAL;
 		goto out;
@@ -713,7 +737,11 @@ hist_trigger_entry_print(struct seq_file *m,
 		if (i > hist_data->n_vals)
 			seq_puts(m, ", ");
 
-		if (key_field->flags & HIST_FIELD_FL_STRING) {
+		if (key_field->flags & HIST_FIELD_FL_HEX) {
+			uval = *(u64 *)(key + key_field->offset);
+			seq_printf(m, "%s: %llx",
+				   key_field->field->name, uval);
+		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
 			seq_printf(m, "%s: %-50s", key_field->field->name,
 				   (char *)(key + key_field->offset));
 		} else {
@@ -729,9 +757,15 @@ hist_trigger_entry_print(struct seq_file *m,
 		   tracing_map_read_sum(elt, HITCOUNT_IDX));
 
 	for (i = 1; i < hist_data->n_vals; i++) {
-		seq_printf(m, "  %s: %10llu",
-			   hist_data->fields[i]->field->name,
-			   tracing_map_read_sum(elt, i));
+		if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
+			seq_printf(m, "  %s: %10llx",
+				   hist_data->fields[i]->field->name,
+				   tracing_map_read_sum(elt, i));
+		} else {
+			seq_printf(m, "  %s: %10llu",
+				   hist_data->fields[i]->field->name,
+				   tracing_map_read_sum(elt, i));
+		}
 	}
 
 	seq_puts(m, "\n");
@@ -816,9 +850,25 @@ const struct file_operations event_hist_fops = {
 	.release = single_release,
 };
 
+static const char *get_hist_field_flags(struct hist_field *hist_field)
+{
+	const char *flags_str = NULL;
+
+	if (hist_field->flags & HIST_FIELD_FL_HEX)
+		flags_str = "hex";
+
+	return flags_str;
+}
+
 static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
 {
 	seq_printf(m, "%s", hist_field->field->name);
+	if (hist_field->flags) {
+		const char *flags_str = get_hist_field_flags(hist_field);
+
+		if (flags_str)
+			seq_printf(m, ".%s", flags_str);
+	}
 }
 
 static int event_hist_trigger_print(struct seq_file *m,
-- 
cgit v1.2.3


From c6afad49d127f6d7c9957319f55173a2198b1ba8 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:49 -0600
Subject: tracing: Add hist trigger 'sym' and 'sym-offset' modifiers

Allow users to have address fields displayed as symbols in the output
by appending '.sym' or 'sym-offset' to field names:

   # echo hist:keys=aaa.sym,bbb.sym-offset ... \
              [ if filter] > event/trigger

Link: http://lkml.kernel.org/r/87d4935821491c0275513f0fbfb9bab8d3d3f079.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  4 +++-
 kernel/trace/trace_events_hist.c | 29 +++++++++++++++++++++++++----
 2 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2eb2eafbe7f5..34459eb0c844 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3855,7 +3855,9 @@ static const char readme_msg[] =
 	"\t    table in its entirety to stdout.  The default format used to\n"
 	"\t    display a given field can be modified by appending any of the\n"
 	"\t    following modifiers to the field name, as applicable:\n\n"
-	"\t            .hex        display a number as a hex value\n\n"
+	"\t            .hex        display a number as a hex value\n"
+	"\t            .sym        display an address as a symbol\n"
+	"\t            .sym-offset display an address as a symbol and offset\n\n"
 	"\t    The 'pause' parameter can be used to pause an existing hist\n"
 	"\t    trigger or to start a hist trigger but not log any events\n"
 	"\t    until told to do so.  'continue' can be used to start or\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 8e49eeef8867..808cc06cdb61 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -77,10 +77,12 @@ DEFINE_HIST_FIELD_FN(u8);
 #define HIST_KEY_SIZE_MAX	(MAX_FILTER_STR_VAL + sizeof(u64))
 
 enum hist_field_flags {
-	HIST_FIELD_FL_HITCOUNT	= 1,
-	HIST_FIELD_FL_KEY	= 2,
-	HIST_FIELD_FL_STRING	= 4,
-	HIST_FIELD_FL_HEX	= 8,
+	HIST_FIELD_FL_HITCOUNT		= 1,
+	HIST_FIELD_FL_KEY		= 2,
+	HIST_FIELD_FL_STRING		= 4,
+	HIST_FIELD_FL_HEX		= 8,
+	HIST_FIELD_FL_SYM		= 16,
+	HIST_FIELD_FL_SYM_OFFSET	= 32,
 };
 
 struct hist_trigger_attrs {
@@ -397,6 +399,10 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 	if (field_str) {
 		if (strcmp(field_str, "hex") == 0)
 			flags |= HIST_FIELD_FL_HEX;
+		else if (strcmp(field_str, "sym") == 0)
+			flags |= HIST_FIELD_FL_SYM;
+		else if (strcmp(field_str, "sym-offset") == 0)
+			flags |= HIST_FIELD_FL_SYM_OFFSET;
 		else {
 			ret = -EINVAL;
 			goto out;
@@ -726,6 +732,7 @@ hist_trigger_entry_print(struct seq_file *m,
 			 struct tracing_map_elt *elt)
 {
 	struct hist_field *key_field;
+	char str[KSYM_SYMBOL_LEN];
 	unsigned int i;
 	u64 uval;
 
@@ -741,6 +748,16 @@ hist_trigger_entry_print(struct seq_file *m,
 			uval = *(u64 *)(key + key_field->offset);
 			seq_printf(m, "%s: %llx",
 				   key_field->field->name, uval);
+		} else if (key_field->flags & HIST_FIELD_FL_SYM) {
+			uval = *(u64 *)(key + key_field->offset);
+			sprint_symbol_no_offset(str, uval);
+			seq_printf(m, "%s: [%llx] %-45s",
+				   key_field->field->name, uval, str);
+		} else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
+			uval = *(u64 *)(key + key_field->offset);
+			sprint_symbol(str, uval);
+			seq_printf(m, "%s: [%llx] %-55s",
+				   key_field->field->name, uval, str);
 		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
 			seq_printf(m, "%s: %-50s", key_field->field->name,
 				   (char *)(key + key_field->offset));
@@ -856,6 +873,10 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
 
 	if (hist_field->flags & HIST_FIELD_FL_HEX)
 		flags_str = "hex";
+	else if (hist_field->flags & HIST_FIELD_FL_SYM)
+		flags_str = "sym";
+	else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
+		flags_str = "sym-offset";
 
 	return flags_str;
 }
-- 
cgit v1.2.3


From 6b4827ad028a1ab2fc4dcf1f5e6e077018d1b770 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:50 -0600
Subject: tracing: Add hist trigger 'execname' modifier

Allow users to have common_pid field values displayed as program names
in the output by appending '.execname' to a common_pid field name:

   # echo hist:keys=common_pid.execname ... \
              [ if filter] > event/trigger

Link: http://lkml.kernel.org/r/e172e81f10f5b8d1f08450e3763c850f39fbf698.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |   3 +-
 kernel/trace/trace_events_hist.c | 100 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 101 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 34459eb0c844..58a8bee50c6e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3857,7 +3857,8 @@ static const char readme_msg[] =
 	"\t    following modifiers to the field name, as applicable:\n\n"
 	"\t            .hex        display a number as a hex value\n"
 	"\t            .sym        display an address as a symbol\n"
-	"\t            .sym-offset display an address as a symbol and offset\n\n"
+	"\t            .sym-offset display an address as a symbol and offset\n"
+	"\t            .execname   display a common_pid as a program name\n\n"
 	"\t    The 'pause' parameter can be used to pause an existing hist\n"
 	"\t    trigger or to start a hist trigger but not log any events\n"
 	"\t    until told to do so.  'continue' can be used to start or\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 808cc06cdb61..227e6c21b1eb 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -83,6 +83,7 @@ enum hist_field_flags {
 	HIST_FIELD_FL_HEX		= 8,
 	HIST_FIELD_FL_SYM		= 16,
 	HIST_FIELD_FL_SYM_OFFSET	= 32,
+	HIST_FIELD_FL_EXECNAME		= 64,
 };
 
 struct hist_trigger_attrs {
@@ -232,6 +233,73 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 	return ERR_PTR(ret);
 }
 
+static inline void save_comm(char *comm, struct task_struct *task)
+{
+	if (!task->pid) {
+		strcpy(comm, "<idle>");
+		return;
+	}
+
+	if (WARN_ON_ONCE(task->pid < 0)) {
+		strcpy(comm, "<XXX>");
+		return;
+	}
+
+	memcpy(comm, task->comm, TASK_COMM_LEN);
+}
+
+static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt)
+{
+	kfree((char *)elt->private_data);
+}
+
+static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt)
+{
+	struct hist_trigger_data *hist_data = elt->map->private_data;
+	struct hist_field *key_field;
+	unsigned int i;
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+
+		if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
+			unsigned int size = TASK_COMM_LEN + 1;
+
+			elt->private_data = kzalloc(size, GFP_KERNEL);
+			if (!elt->private_data)
+				return -ENOMEM;
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to,
+				       struct tracing_map_elt *from)
+{
+	char *comm_from = from->private_data;
+	char *comm_to = to->private_data;
+
+	if (comm_from)
+		memcpy(comm_to, comm_from, TASK_COMM_LEN + 1);
+}
+
+static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt)
+{
+	char *comm = elt->private_data;
+
+	if (comm)
+		save_comm(comm, current);
+}
+
+static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
+	.elt_alloc	= hist_trigger_elt_comm_alloc,
+	.elt_copy	= hist_trigger_elt_comm_copy,
+	.elt_free	= hist_trigger_elt_comm_free,
+	.elt_init	= hist_trigger_elt_comm_init,
+};
+
 static void destroy_hist_field(struct hist_field *hist_field)
 {
 	kfree(hist_field);
@@ -403,6 +471,9 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 			flags |= HIST_FIELD_FL_SYM;
 		else if (strcmp(field_str, "sym-offset") == 0)
 			flags |= HIST_FIELD_FL_SYM_OFFSET;
+		else if ((strcmp(field_str, "execname") == 0) &&
+			 (strcmp(field_name, "common_pid") == 0))
+			flags |= HIST_FIELD_FL_EXECNAME;
 		else {
 			ret = -EINVAL;
 			goto out;
@@ -624,11 +695,27 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
 	return 0;
 }
 
+static bool need_tracing_map_ops(struct hist_trigger_data *hist_data)
+{
+	struct hist_field *key_field;
+	unsigned int i;
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+
+		if (key_field->flags & HIST_FIELD_FL_EXECNAME)
+			return true;
+	}
+
+	return false;
+}
+
 static struct hist_trigger_data *
 create_hist_data(unsigned int map_bits,
 		 struct hist_trigger_attrs *attrs,
 		 struct trace_event_file *file)
 {
+	const struct tracing_map_ops *map_ops = NULL;
 	struct hist_trigger_data *hist_data;
 	int ret = 0;
 
@@ -646,8 +733,11 @@ create_hist_data(unsigned int map_bits,
 	if (ret)
 		goto free;
 
+	if (need_tracing_map_ops(hist_data))
+		map_ops = &hist_trigger_elt_comm_ops;
+
 	hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
-					    NULL, hist_data);
+					    map_ops, hist_data);
 	if (IS_ERR(hist_data->map)) {
 		ret = PTR_ERR(hist_data->map);
 		hist_data->map = NULL;
@@ -758,6 +848,12 @@ hist_trigger_entry_print(struct seq_file *m,
 			sprint_symbol(str, uval);
 			seq_printf(m, "%s: [%llx] %-55s",
 				   key_field->field->name, uval, str);
+		} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
+			char *comm = elt->private_data;
+
+			uval = *(u64 *)(key + key_field->offset);
+			seq_printf(m, "%s: %-16s[%10llu]",
+				   key_field->field->name, comm, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
 			seq_printf(m, "%s: %-50s", key_field->field->name,
 				   (char *)(key + key_field->offset));
@@ -877,6 +973,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
 		flags_str = "sym";
 	else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
 		flags_str = "sym-offset";
+	else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
+		flags_str = "execname";
 
 	return flags_str;
 }
-- 
cgit v1.2.3


From 316961988b5ec71bbf4b2ad447662770349aec13 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:51 -0600
Subject: tracing: Add hist trigger 'syscall' modifier

Allow users to have syscall id fields displayed as syscall names in
the output by appending '.syscall' to field names:

   # echo hist:keys=aaa.syscall ... \
              [ if filter] > event/trigger

Link: http://lkml.kernel.org/r/2bab1e59933d76a14b545bd2e02f80b8b08ac4d3.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  3 ++-
 kernel/trace/trace_events_hist.c | 15 +++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 58a8bee50c6e..0dd23c5b1c34 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3858,7 +3858,8 @@ static const char readme_msg[] =
 	"\t            .hex        display a number as a hex value\n"
 	"\t            .sym        display an address as a symbol\n"
 	"\t            .sym-offset display an address as a symbol and offset\n"
-	"\t            .execname   display a common_pid as a program name\n\n"
+	"\t            .execname   display a common_pid as a program name\n"
+	"\t            .syscall    display a syscall id as a syscall name\n\n"
 	"\t    The 'pause' parameter can be used to pause an existing hist\n"
 	"\t    trigger or to start a hist trigger but not log any events\n"
 	"\t    until told to do so.  'continue' can be used to start or\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 227e6c21b1eb..f9e7ecb33847 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -84,6 +84,7 @@ enum hist_field_flags {
 	HIST_FIELD_FL_SYM		= 16,
 	HIST_FIELD_FL_SYM_OFFSET	= 32,
 	HIST_FIELD_FL_EXECNAME		= 64,
+	HIST_FIELD_FL_SYSCALL		= 128,
 };
 
 struct hist_trigger_attrs {
@@ -474,6 +475,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 		else if ((strcmp(field_str, "execname") == 0) &&
 			 (strcmp(field_name, "common_pid") == 0))
 			flags |= HIST_FIELD_FL_EXECNAME;
+		else if (strcmp(field_str, "syscall") == 0)
+			flags |= HIST_FIELD_FL_SYSCALL;
 		else {
 			ret = -EINVAL;
 			goto out;
@@ -854,6 +857,16 @@ hist_trigger_entry_print(struct seq_file *m,
 			uval = *(u64 *)(key + key_field->offset);
 			seq_printf(m, "%s: %-16s[%10llu]",
 				   key_field->field->name, comm, uval);
+		} else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
+			const char *syscall_name;
+
+			uval = *(u64 *)(key + key_field->offset);
+			syscall_name = get_syscall_name(uval);
+			if (!syscall_name)
+				syscall_name = "unknown_syscall";
+
+			seq_printf(m, "%s: %-30s[%3llu]",
+				   key_field->field->name, syscall_name, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
 			seq_printf(m, "%s: %-50s", key_field->field->name,
 				   (char *)(key + key_field->offset));
@@ -975,6 +988,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
 		flags_str = "sym-offset";
 	else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
 		flags_str = "execname";
+	else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
+		flags_str = "syscall";
 
 	return flags_str;
 }
-- 
cgit v1.2.3


From 69a0200c2e25d61c50091549d00cfeb426c258f5 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:52 -0600
Subject: tracing: Add hist trigger support for stacktraces as keys

It's often useful to be able to use a stacktrace as a hash key, for
keeping a count of the number of times a particular call path resulted
in a trace event, for instance.  Add a special key named 'stacktrace'
which can be used as key in a 'keys=' param for this purpose:

    # echo hist:keys=stacktrace ... \
               [ if filter] > event/trigger

Link: http://lkml.kernel.org/r/87515e90b3785232a874a12156174635a348edb1.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  16 ++---
 kernel/trace/trace_events_hist.c | 135 +++++++++++++++++++++++++++++----------
 2 files changed, 109 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0dd23c5b1c34..2238bfde799b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3843,14 +3843,14 @@ static const char readme_msg[] =
 	"\t    table using the key(s) and value(s) named, and the value of a\n"
 	"\t    sum called 'hitcount' is incremented.  Keys and values\n"
 	"\t    correspond to fields in the event's format description.  Keys\n"
-	"\t    can be any field.  Compound keys consisting of up to two\n"
-	"\t    fields can be specified by the 'keys' keyword.  Values must\n"
-	"\t    correspond to numeric fields.  Sort keys consisting of up to\n"
-	"\t    two fields can be specified using the 'sort' keyword.  The\n"
-	"\t    sort direction can be modified by appending '.descending' or\n"
-	"\t    '.ascending' to a sort field.  The 'size' parameter can be\n"
-	"\t    used to specify more or fewer than the default 2048 entries\n"
-	"\t    for the hashtable size.\n\n"
+	"\t    can be any field, or the special string 'stacktrace'.\n"
+	"\t    Compound keys consisting of up to two fields can be specified\n"
+	"\t    by the 'keys' keyword.  Values must correspond to numeric\n"
+	"\t    fields.  Sort keys consisting of up to two fields can be\n"
+	"\t    specified using the 'sort' keyword.  The sort direction can\n"
+	"\t    be modified by appending '.descending' or '.ascending' to a\n"
+	"\t    sort field.  The 'size' parameter can be used to specify more\n"
+	"\t    or fewer than the default 2048 entries for the hashtable size.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
 	"\t    table in its entirety to stdout.  The default format used to\n"
 	"\t    display a given field can be modified by appending any of the\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f9e7ecb33847..21cae42b54da 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -35,6 +35,11 @@ struct hist_field {
 	unsigned int			offset;
 };
 
+static u64 hist_field_none(struct hist_field *field, void *event)
+{
+	return 0;
+}
+
 static u64 hist_field_counter(struct hist_field *field, void *event)
 {
 	return 1;
@@ -73,8 +78,12 @@ DEFINE_HIST_FIELD_FN(u8);
 #define for_each_hist_key_field(i, hist_data)	\
 	for ((i) = (hist_data)->n_vals; (i) < (hist_data)->n_fields; (i)++)
 
+#define HIST_STACKTRACE_DEPTH	16
+#define HIST_STACKTRACE_SIZE	(HIST_STACKTRACE_DEPTH * sizeof(unsigned long))
+#define HIST_STACKTRACE_SKIP	5
+
 #define HITCOUNT_IDX		0
-#define HIST_KEY_SIZE_MAX	(MAX_FILTER_STR_VAL + sizeof(u64))
+#define HIST_KEY_SIZE_MAX	(MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
 
 enum hist_field_flags {
 	HIST_FIELD_FL_HITCOUNT		= 1,
@@ -85,6 +94,7 @@ enum hist_field_flags {
 	HIST_FIELD_FL_SYM_OFFSET	= 32,
 	HIST_FIELD_FL_EXECNAME		= 64,
 	HIST_FIELD_FL_SYSCALL		= 128,
+	HIST_FIELD_FL_STACKTRACE	= 256,
 };
 
 struct hist_trigger_attrs {
@@ -323,6 +333,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 		goto out;
 	}
 
+	if (flags & HIST_FIELD_FL_STACKTRACE) {
+		hist_field->fn = hist_field_none;
+		goto out;
+	}
+
 	if (is_string_field(field)) {
 		flags |= HIST_FIELD_FL_STRING;
 		hist_field->fn = hist_field_string;
@@ -456,7 +471,6 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 	struct ftrace_event_field *field = NULL;
 	unsigned long flags = 0;
 	unsigned int key_size;
-	char *field_name;
 	int ret = 0;
 
 	if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX))
@@ -464,33 +478,39 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 
 	flags |= HIST_FIELD_FL_KEY;
 
-	field_name = strsep(&field_str, ".");
-	if (field_str) {
-		if (strcmp(field_str, "hex") == 0)
-			flags |= HIST_FIELD_FL_HEX;
-		else if (strcmp(field_str, "sym") == 0)
-			flags |= HIST_FIELD_FL_SYM;
-		else if (strcmp(field_str, "sym-offset") == 0)
-			flags |= HIST_FIELD_FL_SYM_OFFSET;
-		else if ((strcmp(field_str, "execname") == 0) &&
-			 (strcmp(field_name, "common_pid") == 0))
-			flags |= HIST_FIELD_FL_EXECNAME;
-		else if (strcmp(field_str, "syscall") == 0)
-			flags |= HIST_FIELD_FL_SYSCALL;
-		else {
+	if (strcmp(field_str, "stacktrace") == 0) {
+		flags |= HIST_FIELD_FL_STACKTRACE;
+		key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH;
+	} else {
+		char *field_name = strsep(&field_str, ".");
+
+		if (field_str) {
+			if (strcmp(field_str, "hex") == 0)
+				flags |= HIST_FIELD_FL_HEX;
+			else if (strcmp(field_str, "sym") == 0)
+				flags |= HIST_FIELD_FL_SYM;
+			else if (strcmp(field_str, "sym-offset") == 0)
+				flags |= HIST_FIELD_FL_SYM_OFFSET;
+			else if ((strcmp(field_str, "execname") == 0) &&
+				 (strcmp(field_name, "common_pid") == 0))
+				flags |= HIST_FIELD_FL_EXECNAME;
+			else if (strcmp(field_str, "syscall") == 0)
+				flags |= HIST_FIELD_FL_SYSCALL;
+			else {
+				ret = -EINVAL;
+				goto out;
+			}
+		}
+
+		field = trace_find_event_field(file->event_call, field_name);
+		if (!field) {
 			ret = -EINVAL;
 			goto out;
 		}
-	}
 
-	field = trace_find_event_field(file->event_call, field_name);
-	if (!field) {
-		ret = -EINVAL;
-		goto out;
+		key_size = field->size;
 	}
 
-	key_size = field->size;
-
 	hist_data->fields[key_idx] = create_hist_field(field, flags);
 	if (!hist_data->fields[key_idx]) {
 		ret = -ENOMEM;
@@ -679,7 +699,9 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
 
 			field = hist_field->field;
 
-			if (is_string_field(field))
+			if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
+				cmp_fn = tracing_map_cmp_none;
+			else if (is_string_field(field))
 				cmp_fn = tracing_map_cmp_string;
 			else
 				cmp_fn = tracing_map_cmp_num(field->size,
@@ -786,7 +808,9 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
 static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
+	unsigned long entries[HIST_STACKTRACE_DEPTH];
 	char compound_key[HIST_KEY_SIZE_MAX];
+	struct stack_trace stacktrace;
 	struct hist_field *key_field;
 	struct tracing_map_elt *elt;
 	u64 field_contents;
@@ -799,15 +823,27 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 	for_each_hist_key_field(i, hist_data) {
 		key_field = hist_data->fields[i];
 
-		field_contents = key_field->fn(key_field, rec);
-		if (key_field->flags & HIST_FIELD_FL_STRING)
-			key = (void *)(unsigned long)field_contents;
-		else
-			key = (void *)&field_contents;
+		if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
+			stacktrace.max_entries = HIST_STACKTRACE_DEPTH;
+			stacktrace.entries = entries;
+			stacktrace.nr_entries = 0;
+			stacktrace.skip = HIST_STACKTRACE_SKIP;
 
-		if (hist_data->n_keys > 1) {
-			memcpy(compound_key + key_field->offset, key,
-			       key_field->size);
+			memset(stacktrace.entries, 0, HIST_STACKTRACE_SIZE);
+			save_stack_trace(&stacktrace);
+
+			key = entries;
+		} else {
+			field_contents = key_field->fn(key_field, rec);
+			if (key_field->flags & HIST_FIELD_FL_STRING)
+				key = (void *)(unsigned long)field_contents;
+			else
+				key = (void *)&field_contents;
+
+			if (hist_data->n_keys > 1) {
+				memcpy(compound_key + key_field->offset, key,
+				       key_field->size);
+			}
 		}
 	}
 
@@ -819,6 +855,24 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 		hist_trigger_elt_update(hist_data, elt, rec);
 }
 
+static void hist_trigger_stacktrace_print(struct seq_file *m,
+					  unsigned long *stacktrace_entries,
+					  unsigned int max_entries)
+{
+	char str[KSYM_SYMBOL_LEN];
+	unsigned int spaces = 8;
+	unsigned int i;
+
+	for (i = 0; i < max_entries; i++) {
+		if (stacktrace_entries[i] == ULONG_MAX)
+			return;
+
+		seq_printf(m, "%*c", 1 + spaces, ' ');
+		sprint_symbol(str, stacktrace_entries[i]);
+		seq_printf(m, "%s\n", str);
+	}
+}
+
 static void
 hist_trigger_entry_print(struct seq_file *m,
 			 struct hist_trigger_data *hist_data, void *key,
@@ -826,6 +880,7 @@ hist_trigger_entry_print(struct seq_file *m,
 {
 	struct hist_field *key_field;
 	char str[KSYM_SYMBOL_LEN];
+	bool multiline = false;
 	unsigned int i;
 	u64 uval;
 
@@ -867,6 +922,12 @@ hist_trigger_entry_print(struct seq_file *m,
 
 			seq_printf(m, "%s: %-30s[%3llu]",
 				   key_field->field->name, syscall_name, uval);
+		} else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
+			seq_puts(m, "stacktrace:\n");
+			hist_trigger_stacktrace_print(m,
+						      key + key_field->offset,
+						      HIST_STACKTRACE_DEPTH);
+			multiline = true;
 		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
 			seq_printf(m, "%s: %-50s", key_field->field->name,
 				   (char *)(key + key_field->offset));
@@ -877,7 +938,10 @@ hist_trigger_entry_print(struct seq_file *m,
 		}
 	}
 
-	seq_puts(m, " }");
+	if (!multiline)
+		seq_puts(m, " ");
+
+	seq_puts(m, "}");
 
 	seq_printf(m, " hitcount: %10llu",
 		   tracing_map_read_sum(elt, HITCOUNT_IDX));
@@ -1021,7 +1085,10 @@ static int event_hist_trigger_print(struct seq_file *m,
 		if (i > hist_data->n_vals)
 			seq_puts(m, ",");
 
-		hist_field_print(m, key_field);
+		if (key_field->flags & HIST_FIELD_FL_STACKTRACE)
+			seq_puts(m, "stacktrace");
+		else
+			hist_field_print(m, key_field);
 	}
 
 	seq_puts(m, ":vals=");
-- 
cgit v1.2.3


From 79e577cbce4c4c2bf0d64ec315adb04eda40736b Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 3 Mar 2016 12:54:53 -0600
Subject: tracing: Support string type key properly

The string in a trace event is usually recorded as dynamic array which
is variable length.  But current hist code only support fixed length
array so it cannot support most strings.

This patch fixes it by checking filter_type of the field and get
proper pointer with it.  With this, it can get a histogram of exec()
based on filenames like below:

  # cd /sys/kernel/tracing/events/sched/sched_process_exec
  # cat 'hist:key=filename' > trigger
  # ps
   PID TTY       TIME CMD
     1 ?     00:00:00 init
    29 ?     00:00:00 sh
    38 ?     00:00:00 ps
  # ls
  enable  filter  format  hist  id  trigger
  # cat hist
  # trigger info: hist:keys=filename:vals=hitcount:sort=hitcount:size=2048 [active]

  { filename: /usr/bin/ps                         } hitcount:          1
  { filename: /usr/bin/ls                         } hitcount:          1
  { filename: /usr/bin/cat                        } hitcount:          1

  Totals:
      Hits: 3
      Entries: 3
      Dropped: 0

Link: http://lkml.kernel.org/r/610180d6df0cfdf11ee205452f3b241dea657233.1457029949.git.tom.zanussi@linux.intel.com

Cc: Tom Zanussi <tom.zanussi@linux.intel.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
[ Added (unsigned long) typecast to fix compile warning ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 49 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 21cae42b54da..418ff27cbbaf 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -52,12 +52,28 @@ static u64 hist_field_string(struct hist_field *hist_field, void *event)
 	return (u64)(unsigned long)addr;
 }
 
+static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
+{
+	u32 str_item = *(u32 *)(event + hist_field->field->offset);
+	int str_loc = str_item & 0xffff;
+	char *addr = (char *)(event + str_loc);
+
+	return (u64)(unsigned long)addr;
+}
+
+static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
+{
+	char **addr = (char **)(event + hist_field->field->offset);
+
+	return (u64)(unsigned long)*addr;
+}
+
 #define DEFINE_HIST_FIELD_FN(type)					\
 static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
 {									\
 	type *addr = (type *)(event + hist_field->field->offset);	\
 									\
-	return (u64)*addr;						\
+	return (u64)(unsigned long)*addr;				\
 }
 
 DEFINE_HIST_FIELD_FN(s64);
@@ -340,7 +356,13 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 
 	if (is_string_field(field)) {
 		flags |= HIST_FIELD_FL_STRING;
-		hist_field->fn = hist_field_string;
+
+		if (field->filter_type == FILTER_STATIC_STRING)
+			hist_field->fn = hist_field_string;
+		else if (field->filter_type == FILTER_DYN_STRING)
+			hist_field->fn = hist_field_dynstring;
+		else
+			hist_field->fn = hist_field_pstring;
 	} else {
 		hist_field->fn = select_value_fn(field->size,
 						 field->is_signed);
@@ -508,7 +530,10 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 			goto out;
 		}
 
-		key_size = field->size;
+		if (is_string_field(field)) /* should be last key field */
+			key_size = HIST_KEY_SIZE_MAX - key_offset;
+		else
+			key_size = field->size;
 	}
 
 	hist_data->fields[key_idx] = create_hist_field(field, flags);
@@ -841,8 +866,24 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 				key = (void *)&field_contents;
 
 			if (hist_data->n_keys > 1) {
+				/* ensure NULL-termination */
+				size_t size = key_field->size - 1;
+
+				if (key_field->flags & HIST_FIELD_FL_STRING) {
+					struct ftrace_event_field *field;
+
+					field = key_field->field;
+					if (field->filter_type == FILTER_DYN_STRING)
+						size = *(u32 *)(rec + field->offset) >> 16;
+					else if (field->filter_type == FILTER_PTR_STRING)
+						size = strlen(key);
+
+					if (size > key_field->size - 1)
+						size = key_field->size - 1;
+				}
+
 				memcpy(compound_key + key_field->offset, key,
-				       key_field->size);
+				       size);
 			}
 		}
 	}
-- 
cgit v1.2.3


From 6a475cb17fe161dbde62eca7e4c2cf59edff182f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:54 -0600
Subject: tracing: Remove restriction on string position in hist trigger keys

If we assume the maximum size for a string field, we don't have to
worry about its position.  Since we only allow two keys in a compound
key and having more than one string key in a given compound key
doesn't make much sense anyway, trading a bit of extra space instead
of introducing an arbitrary restriction makes more sense.

We also need to use the event field size for static strings when
copying the contents, otherwise we get random garbage in the key.

Also, cast string return values to avoid warnings on 32-bit compiles.

Finally, rearrange the code without changing any functionality by
moving the compound key updating code into a separate function.

Link: http://lkml.kernel.org/r/8976e1ab04b66bc2700ad1ed0768a2de85ac1983.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 63 ++++++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 418ff27cbbaf..4f4041d76926 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -530,8 +530,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 			goto out;
 		}
 
-		if (is_string_field(field)) /* should be last key field */
-			key_size = HIST_KEY_SIZE_MAX - key_offset;
+		if (is_string_field(field))
+			key_size = MAX_FILTER_STR_VAL;
 		else
 			key_size = field->size;
 	}
@@ -830,9 +830,34 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
 	}
 }
 
+static inline void add_to_key(char *compound_key, void *key,
+			      struct hist_field *key_field, void *rec)
+{
+	size_t size = key_field->size;
+
+	if (key_field->flags & HIST_FIELD_FL_STRING) {
+		struct ftrace_event_field *field;
+
+		field = key_field->field;
+		if (field->filter_type == FILTER_DYN_STRING)
+			size = *(u32 *)(rec + field->offset) >> 16;
+		else if (field->filter_type == FILTER_PTR_STRING)
+			size = strlen(key);
+		else if (field->filter_type == FILTER_STATIC_STRING)
+			size = field->size;
+
+		/* ensure NULL-termination */
+		if (size > key_field->size - 1)
+			size = key_field->size - 1;
+	}
+
+	memcpy(compound_key + key_field->offset, key, size);
+}
+
 static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
+	bool use_compound_key = (hist_data->n_keys > 1);
 	unsigned long entries[HIST_STACKTRACE_DEPTH];
 	char compound_key[HIST_KEY_SIZE_MAX];
 	struct stack_trace stacktrace;
@@ -842,8 +867,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 	void *key = NULL;
 	unsigned int i;
 
-	if (hist_data->n_keys > 1)
-		memset(compound_key, 0, hist_data->key_size);
+	memset(compound_key, 0, hist_data->key_size);
 
 	for_each_hist_key_field(i, hist_data) {
 		key_field = hist_data->fields[i];
@@ -860,35 +884,18 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
 			key = entries;
 		} else {
 			field_contents = key_field->fn(key_field, rec);
-			if (key_field->flags & HIST_FIELD_FL_STRING)
+			if (key_field->flags & HIST_FIELD_FL_STRING) {
 				key = (void *)(unsigned long)field_contents;
-			else
+				use_compound_key = true;
+			} else
 				key = (void *)&field_contents;
-
-			if (hist_data->n_keys > 1) {
-				/* ensure NULL-termination */
-				size_t size = key_field->size - 1;
-
-				if (key_field->flags & HIST_FIELD_FL_STRING) {
-					struct ftrace_event_field *field;
-
-					field = key_field->field;
-					if (field->filter_type == FILTER_DYN_STRING)
-						size = *(u32 *)(rec + field->offset) >> 16;
-					else if (field->filter_type == FILTER_PTR_STRING)
-						size = strlen(key);
-
-					if (size > key_field->size - 1)
-						size = key_field->size - 1;
-				}
-
-				memcpy(compound_key + key_field->offset, key,
-				       size);
-			}
 		}
+
+		if (use_compound_key)
+			add_to_key(compound_key, key, key_field, rec);
 	}
 
-	if (hist_data->n_keys > 1)
+	if (use_compound_key)
 		key = compound_key;
 
 	elt = tracing_map_insert(hist_data->map, key);
-- 
cgit v1.2.3


From d0bad49bb0a094a1beb06640785f95cb256b7272 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:55 -0600
Subject: tracing: Add enable_hist/disable_hist triggers

Similar to enable_event/disable_event triggers, these triggers enable
and disable the aggregation of events into maps rather than enabling
and disabling their writing into the trace buffer.

They can be used to automatically start and stop hist triggers based
on a matching filter condition.

If there's a paused hist trigger on system:event, the following would
start it when the filter condition was hit:

  # echo enable_hist:system:event [ if filter] > event/trigger

And the following would disable a running system:event hist trigger:

  # echo disable_hist:system:event [ if filter] > event/trigger

See Documentation/trace/events.txt for real examples.

Link: http://lkml.kernel.org/r/f812f086e52c8b7c8ad5443487375e03c96a601f.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h        |   1 +
 kernel/trace/trace.c                |   8 +++
 kernel/trace/trace.h                |  32 ++++++++++
 kernel/trace/trace_events_hist.c    | 115 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events_trigger.c |  71 ++++++++++++----------
 5 files changed, 196 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 404603720650..5f89a5b0c7e6 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -408,6 +408,7 @@ enum event_trigger_type {
 	ETT_STACKTRACE		= (1 << 2),
 	ETT_EVENT_ENABLE	= (1 << 3),
 	ETT_EVENT_HIST		= (1 << 4),
+	ETT_HIST_ENABLE		= (1 << 5),
 };
 
 extern int filter_match_preds(struct event_filter *filter, void *rec);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2238bfde799b..8430145bea12 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3807,6 +3807,10 @@ static const char readme_msg[] =
 	"\t   trigger: traceon, traceoff\n"
 	"\t            enable_event:<system>:<event>\n"
 	"\t            disable_event:<system>:<event>\n"
+#ifdef CONFIG_HIST_TRIGGERS
+	"\t            enable_hist:<system>:<event>\n"
+	"\t            disable_hist:<system>:<event>\n"
+#endif
 #ifdef CONFIG_STACKTRACE
 	"\t\t    stacktrace\n"
 #endif
@@ -3867,6 +3871,10 @@ static const char readme_msg[] =
 	"\t    The 'clear' parameter will clear the contents of a running\n"
 	"\t    hist trigger and leave its current paused/active state\n"
 	"\t    unchanged.\n\n"
+	"\t    The enable_hist and disable_hist triggers can be used to\n"
+	"\t    have one event conditionally start and stop another event's\n"
+	"\t    already-attached hist trigger.  The syntax is analagous to\n"
+	"\t    the enable_event and disable_event triggers.\n"
 #endif
 ;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 505f8a45f426..cab1f4bfe85b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1166,8 +1166,10 @@ extern const struct file_operations event_hist_fops;
 
 #ifdef CONFIG_HIST_TRIGGERS
 extern int register_trigger_hist_cmd(void);
+extern int register_trigger_hist_enable_disable_cmds(void);
 #else
 static inline int register_trigger_hist_cmd(void) { return 0; }
+static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; }
 #endif
 
 extern int register_trigger_cmds(void);
@@ -1185,6 +1187,34 @@ struct event_trigger_data {
 	struct list_head		list;
 };
 
+/* Avoid typos */
+#define ENABLE_EVENT_STR	"enable_event"
+#define DISABLE_EVENT_STR	"disable_event"
+#define ENABLE_HIST_STR		"enable_hist"
+#define DISABLE_HIST_STR	"disable_hist"
+
+struct enable_trigger_data {
+	struct trace_event_file		*file;
+	bool				enable;
+	bool				hist;
+};
+
+extern int event_enable_trigger_print(struct seq_file *m,
+				      struct event_trigger_ops *ops,
+				      struct event_trigger_data *data);
+extern void event_enable_trigger_free(struct event_trigger_ops *ops,
+				      struct event_trigger_data *data);
+extern int event_enable_trigger_func(struct event_command *cmd_ops,
+				     struct trace_event_file *file,
+				     char *glob, char *cmd, char *param);
+extern int event_enable_register_trigger(char *glob,
+					 struct event_trigger_ops *ops,
+					 struct event_trigger_data *data,
+					 struct trace_event_file *file);
+extern void event_enable_unregister_trigger(char *glob,
+					    struct event_trigger_ops *ops,
+					    struct event_trigger_data *test,
+					    struct trace_event_file *file);
 extern void trigger_data_free(struct event_trigger_data *data);
 extern int event_trigger_init(struct event_trigger_ops *ops,
 			      struct event_trigger_data *data);
@@ -1198,6 +1228,8 @@ extern int set_trigger_filter(char *filter_str,
 			      struct event_trigger_data *trigger_data,
 			      struct trace_event_file *file);
 extern int register_event_command(struct event_command *cmd);
+extern int unregister_event_command(struct event_command *cmd);
+extern int register_trigger_hist_enable_disable_cmds(void);
 
 /**
  * struct event_trigger_ops - callbacks for trace event triggers
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 4f4041d76926..5d4f02792440 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1393,3 +1393,118 @@ __init int register_trigger_hist_cmd(void)
 
 	return ret;
 }
+
+static void
+hist_enable_trigger(struct event_trigger_data *data, void *rec)
+{
+	struct enable_trigger_data *enable_data = data->private_data;
+	struct event_trigger_data *test;
+
+	list_for_each_entry_rcu(test, &enable_data->file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			if (enable_data->enable)
+				test->paused = false;
+			else
+				test->paused = true;
+			break;
+		}
+	}
+}
+
+static void
+hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
+{
+	if (!data->count)
+		return;
+
+	if (data->count != -1)
+		(data->count)--;
+
+	hist_enable_trigger(data, rec);
+}
+
+static struct event_trigger_ops hist_enable_trigger_ops = {
+	.func			= hist_enable_trigger,
+	.print			= event_enable_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_enable_trigger_free,
+};
+
+static struct event_trigger_ops hist_enable_count_trigger_ops = {
+	.func			= hist_enable_count_trigger,
+	.print			= event_enable_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_enable_trigger_free,
+};
+
+static struct event_trigger_ops hist_disable_trigger_ops = {
+	.func			= hist_enable_trigger,
+	.print			= event_enable_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_enable_trigger_free,
+};
+
+static struct event_trigger_ops hist_disable_count_trigger_ops = {
+	.func			= hist_enable_count_trigger,
+	.print			= event_enable_trigger_print,
+	.init			= event_trigger_init,
+	.free			= event_enable_trigger_free,
+};
+
+static struct event_trigger_ops *
+hist_enable_get_trigger_ops(char *cmd, char *param)
+{
+	struct event_trigger_ops *ops;
+	bool enable;
+
+	enable = (strcmp(cmd, ENABLE_HIST_STR) == 0);
+
+	if (enable)
+		ops = param ? &hist_enable_count_trigger_ops :
+			&hist_enable_trigger_ops;
+	else
+		ops = param ? &hist_disable_count_trigger_ops :
+			&hist_disable_trigger_ops;
+
+	return ops;
+}
+
+static struct event_command trigger_hist_enable_cmd = {
+	.name			= ENABLE_HIST_STR,
+	.trigger_type		= ETT_HIST_ENABLE,
+	.func			= event_enable_trigger_func,
+	.reg			= event_enable_register_trigger,
+	.unreg			= event_enable_unregister_trigger,
+	.get_trigger_ops	= hist_enable_get_trigger_ops,
+	.set_filter		= set_trigger_filter,
+};
+
+static struct event_command trigger_hist_disable_cmd = {
+	.name			= DISABLE_HIST_STR,
+	.trigger_type		= ETT_HIST_ENABLE,
+	.func			= event_enable_trigger_func,
+	.reg			= event_enable_register_trigger,
+	.unreg			= event_enable_unregister_trigger,
+	.get_trigger_ops	= hist_enable_get_trigger_ops,
+	.set_filter		= set_trigger_filter,
+};
+
+static __init void unregister_trigger_hist_enable_disable_cmds(void)
+{
+	unregister_event_command(&trigger_hist_enable_cmd);
+	unregister_event_command(&trigger_hist_disable_cmd);
+}
+
+__init int register_trigger_hist_enable_disable_cmds(void)
+{
+	int ret;
+
+	ret = register_event_command(&trigger_hist_enable_cmd);
+	if (WARN_ON(ret < 0))
+		return ret;
+	ret = register_event_command(&trigger_hist_disable_cmd);
+	if (WARN_ON(ret < 0))
+		unregister_trigger_hist_enable_disable_cmds();
+
+	return ret;
+}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d29092afe005..d133f2094566 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -347,7 +347,7 @@ __init int register_event_command(struct event_command *cmd)
  * Currently we only unregister event commands from __init, so mark
  * this __init too.
  */
-static __init int unregister_event_command(struct event_command *cmd)
+__init int unregister_event_command(struct event_command *cmd)
 {
 	struct event_command *p, *n;
 	int ret = -ENODEV;
@@ -1062,15 +1062,6 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
 	unregister_event_command(&trigger_traceoff_cmd);
 }
 
-/* Avoid typos */
-#define ENABLE_EVENT_STR	"enable_event"
-#define DISABLE_EVENT_STR	"disable_event"
-
-struct enable_trigger_data {
-	struct trace_event_file		*file;
-	bool				enable;
-};
-
 static void
 event_enable_trigger(struct event_trigger_data *data, void *rec)
 {
@@ -1100,14 +1091,16 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec)
 	event_enable_trigger(data, rec);
 }
 
-static int
-event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
-			   struct event_trigger_data *data)
+int event_enable_trigger_print(struct seq_file *m,
+			       struct event_trigger_ops *ops,
+			       struct event_trigger_data *data)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 
 	seq_printf(m, "%s:%s:%s",
-		   enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
+		   enable_data->hist ?
+		   (enable_data->enable ? ENABLE_HIST_STR : DISABLE_HIST_STR) :
+		   (enable_data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR),
 		   enable_data->file->event_call->class->system,
 		   trace_event_name(enable_data->file->event_call));
 
@@ -1124,9 +1117,8 @@ event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops,
 	return 0;
 }
 
-static void
-event_enable_trigger_free(struct event_trigger_ops *ops,
-			  struct event_trigger_data *data)
+void event_enable_trigger_free(struct event_trigger_ops *ops,
+			       struct event_trigger_data *data)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 
@@ -1171,10 +1163,9 @@ static struct event_trigger_ops event_disable_count_trigger_ops = {
 	.free			= event_enable_trigger_free,
 };
 
-static int
-event_enable_trigger_func(struct event_command *cmd_ops,
-			  struct trace_event_file *file,
-			  char *glob, char *cmd, char *param)
+int event_enable_trigger_func(struct event_command *cmd_ops,
+			      struct trace_event_file *file,
+			      char *glob, char *cmd, char *param)
 {
 	struct trace_event_file *event_enable_file;
 	struct enable_trigger_data *enable_data;
@@ -1183,6 +1174,7 @@ event_enable_trigger_func(struct event_command *cmd_ops,
 	struct trace_array *tr = file->tr;
 	const char *system;
 	const char *event;
+	bool hist = false;
 	char *trigger;
 	char *number;
 	bool enable;
@@ -1207,8 +1199,15 @@ event_enable_trigger_func(struct event_command *cmd_ops,
 	if (!event_enable_file)
 		goto out;
 
-	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+#ifdef CONFIG_HIST_TRIGGERS
+	hist = ((strcmp(cmd, ENABLE_HIST_STR) == 0) ||
+		(strcmp(cmd, DISABLE_HIST_STR) == 0));
 
+	enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) ||
+		  (strcmp(cmd, ENABLE_HIST_STR) == 0));
+#else
+	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+#endif
 	trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
 
 	ret = -ENOMEM;
@@ -1228,6 +1227,7 @@ event_enable_trigger_func(struct event_command *cmd_ops,
 	INIT_LIST_HEAD(&trigger_data->list);
 	RCU_INIT_POINTER(trigger_data->filter, NULL);
 
+	enable_data->hist = hist;
 	enable_data->enable = enable;
 	enable_data->file = event_enable_file;
 	trigger_data->private_data = enable_data;
@@ -1305,10 +1305,10 @@ event_enable_trigger_func(struct event_command *cmd_ops,
 	goto out;
 }
 
-static int event_enable_register_trigger(char *glob,
-					 struct event_trigger_ops *ops,
-					 struct event_trigger_data *data,
-					 struct trace_event_file *file)
+int event_enable_register_trigger(char *glob,
+				  struct event_trigger_ops *ops,
+				  struct event_trigger_data *data,
+				  struct trace_event_file *file)
 {
 	struct enable_trigger_data *enable_data = data->private_data;
 	struct enable_trigger_data *test_enable_data;
@@ -1318,6 +1318,8 @@ static int event_enable_register_trigger(char *glob,
 	list_for_each_entry_rcu(test, &file->triggers, list) {
 		test_enable_data = test->private_data;
 		if (test_enable_data &&
+		    (test->cmd_ops->trigger_type ==
+		     data->cmd_ops->trigger_type) &&
 		    (test_enable_data->file == enable_data->file)) {
 			ret = -EEXIST;
 			goto out;
@@ -1343,10 +1345,10 @@ out:
 	return ret;
 }
 
-static void event_enable_unregister_trigger(char *glob,
-					    struct event_trigger_ops *ops,
-					    struct event_trigger_data *test,
-					    struct trace_event_file *file)
+void event_enable_unregister_trigger(char *glob,
+				     struct event_trigger_ops *ops,
+				     struct event_trigger_data *test,
+				     struct trace_event_file *file)
 {
 	struct enable_trigger_data *test_enable_data = test->private_data;
 	struct enable_trigger_data *enable_data;
@@ -1356,6 +1358,8 @@ static void event_enable_unregister_trigger(char *glob,
 	list_for_each_entry_rcu(data, &file->triggers, list) {
 		enable_data = data->private_data;
 		if (enable_data &&
+		    (data->cmd_ops->trigger_type ==
+		     test->cmd_ops->trigger_type) &&
 		    (enable_data->file == test_enable_data->file)) {
 			unregistered = true;
 			list_del_rcu(&data->list);
@@ -1375,8 +1379,12 @@ event_enable_get_trigger_ops(char *cmd, char *param)
 	struct event_trigger_ops *ops;
 	bool enable;
 
+#ifdef CONFIG_HIST_TRIGGERS
+	enable = ((strcmp(cmd, ENABLE_EVENT_STR) == 0) ||
+		  (strcmp(cmd, ENABLE_HIST_STR) == 0));
+#else
 	enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
-
+#endif
 	if (enable)
 		ops = param ? &event_enable_count_trigger_ops :
 			&event_enable_trigger_ops;
@@ -1447,6 +1455,7 @@ __init int register_trigger_cmds(void)
 	register_trigger_snapshot_cmd();
 	register_trigger_stacktrace_cmd();
 	register_trigger_enable_disable_cmds();
+	register_trigger_hist_enable_disable_cmds();
 	register_trigger_hist_cmd();
 
 	return 0;
-- 
cgit v1.2.3


From 52a7f16dedff8f23d03df3ea556dec95b92a5801 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:57 -0600
Subject: tracing: Add support for multiple hist triggers per event

Allow users to define any number of hist triggers per trace event.
Any number of hist triggers may be added for a given event, which may
differ by key, value, or filter.

Reading the event's 'hist' file will display the output of all the
hist triggers defined on an event concatenated in the order they were
defined.

Link: http://lkml.kernel.org/r/48a0c8dd34c344571de880fb35e211c6d9a28961.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 Documentation/trace/events.txt   | 154 +++++++++++++++++++++++++++++++++--
 kernel/trace/trace.c             |   8 +-
 kernel/trace/trace_events_hist.c | 172 +++++++++++++++++++++++++++++++--------
 3 files changed, 293 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index db223e85bd2f..cc0216760b78 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -550,12 +550,14 @@ The following commands are supported:
 
   'hist' triggers add a 'hist' file to each event's subdirectory.
   Reading the 'hist' file for the event will dump the hash table in
-  its entirety to stdout.  Each printed hash table entry is a simple
-  list of the keys and values comprising the entry; keys are printed
-  first and are delineated by curly braces, and are followed by the
-  set of value fields for the entry.  By default, numeric fields are
-  displayed as base-10 integers.  This can be modified by appending
-  any of the following modifiers to the field name:
+  its entirety to stdout.  If there are multiple hist triggers
+  attached to an event, there will be a table for each trigger in the
+  output.  Each printed hash table entry is a simple list of the keys
+  and values comprising the entry; keys are printed first and are
+  delineated by curly braces, and are followed by the set of value
+  fields for the entry.  By default, numeric fields are displayed as
+  base-10 integers.  This can be modified by appending any of the
+  following modifiers to the field name:
 
         .hex        display a number as a hex value
 	.sym        display an address as a symbol
@@ -1667,3 +1669,143 @@ The following commands are supported:
     .
     .
     .
+
+  The following example demonstrates how multiple hist triggers can be
+  attached to a given event.  This capability can be useful for
+  creating a set of different summaries derived from the same set of
+  events, or for comparing the effects of different filters, among
+  other things.
+
+    # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
+           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
+    # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
+           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
+    # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
+           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
+    # echo 'hist:keys=skbaddr.hex:vals=len' >> \
+           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
+    # echo 'hist:keys=len:vals=common_preempt_count' >> \
+           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
+
+  The above set of commands create four triggers differing only in
+  their filters, along with a completely different though fairly
+  nonsensical trigger.  Note that in order to append multiple hist
+  triggers to the same file, you should use the '>>' operator to
+  append them ('>' will also add the new hist trigger, but will remove
+  any existing hist triggers beforehand).
+
+  Displaying the contents of the 'hist' file for the event shows the
+  contents of all five histograms:
+
+    # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
+
+    # event histogram
+    #
+    # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
+    #
+
+    { len:        176 } hitcount:          1  common_preempt_count:          0
+    { len:        223 } hitcount:          1  common_preempt_count:          0
+    { len:       4854 } hitcount:          1  common_preempt_count:          0
+    { len:        395 } hitcount:          1  common_preempt_count:          0
+    { len:        177 } hitcount:          1  common_preempt_count:          0
+    { len:        446 } hitcount:          1  common_preempt_count:          0
+    { len:       1601 } hitcount:          1  common_preempt_count:          0
+    .
+    .
+    .
+    { len:       1280 } hitcount:         66  common_preempt_count:          0
+    { len:        116 } hitcount:         81  common_preempt_count:         40
+    { len:        708 } hitcount:        112  common_preempt_count:          0
+    { len:         46 } hitcount:        221  common_preempt_count:          0
+    { len:       1264 } hitcount:        458  common_preempt_count:          0
+
+    Totals:
+        Hits: 1428
+        Entries: 147
+        Dropped: 0
+
+
+    # event histogram
+    #
+    # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
+    #
+
+    { skbaddr: ffff8800baee5e00 } hitcount:          1  len:        130
+    { skbaddr: ffff88005f3d5600 } hitcount:          1  len:       1280
+    { skbaddr: ffff88005f3d4900 } hitcount:          1  len:       1280
+    { skbaddr: ffff88009fed6300 } hitcount:          1  len:        115
+    { skbaddr: ffff88009fe0ad00 } hitcount:          1  len:        115
+    { skbaddr: ffff88008cdb1900 } hitcount:          1  len:         46
+    { skbaddr: ffff880064b5ef00 } hitcount:          1  len:        118
+    { skbaddr: ffff880044e3c700 } hitcount:          1  len:         60
+    { skbaddr: ffff880100065900 } hitcount:          1  len:         46
+    { skbaddr: ffff8800d46bd500 } hitcount:          1  len:        116
+    { skbaddr: ffff88005f3d5f00 } hitcount:          1  len:       1280
+    { skbaddr: ffff880100064700 } hitcount:          1  len:        365
+    { skbaddr: ffff8800badb6f00 } hitcount:          1  len:         60
+    .
+    .
+    .
+    { skbaddr: ffff88009fe0be00 } hitcount:         27  len:      24677
+    { skbaddr: ffff88009fe0a400 } hitcount:         27  len:      23052
+    { skbaddr: ffff88009fe0b700 } hitcount:         31  len:      25589
+    { skbaddr: ffff88009fe0b600 } hitcount:         32  len:      27326
+    { skbaddr: ffff88006a462800 } hitcount:         68  len:      71678
+    { skbaddr: ffff88006a463700 } hitcount:         70  len:      72678
+    { skbaddr: ffff88006a462b00 } hitcount:         71  len:      77589
+    { skbaddr: ffff88006a463600 } hitcount:         73  len:      71307
+    { skbaddr: ffff88006a462200 } hitcount:         81  len:      81032
+
+    Totals:
+        Hits: 1451
+        Entries: 318
+        Dropped: 0
+
+
+    # event histogram
+    #
+    # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
+    #
+
+
+    Totals:
+        Hits: 0
+        Entries: 0
+        Dropped: 0
+
+
+    # event histogram
+    #
+    # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
+    #
+
+    { skbaddr: ffff88009fd2c300 } hitcount:          1  len:       7212
+    { skbaddr: ffff8800d2bcce00 } hitcount:          1  len:       7212
+    { skbaddr: ffff8800d2bcd700 } hitcount:          1  len:       7212
+    { skbaddr: ffff8800d2bcda00 } hitcount:          1  len:      21492
+    { skbaddr: ffff8800ae2e2d00 } hitcount:          1  len:       7212
+    { skbaddr: ffff8800d2bcdb00 } hitcount:          1  len:       7212
+    { skbaddr: ffff88006a4df500 } hitcount:          1  len:       4854
+    { skbaddr: ffff88008ce47b00 } hitcount:          1  len:      18636
+    { skbaddr: ffff8800ae2e2200 } hitcount:          1  len:      12924
+    { skbaddr: ffff88005f3e1000 } hitcount:          1  len:       4356
+    { skbaddr: ffff8800d2bcdc00 } hitcount:          2  len:      24420
+    { skbaddr: ffff8800d2bcc200 } hitcount:          2  len:      12996
+
+    Totals:
+        Hits: 14
+        Entries: 12
+        Dropped: 0
+
+
+    # event histogram
+    #
+    # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
+    #
+
+
+    Totals:
+        Hits: 0
+        Entries: 0
+        Dropped: 0
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8430145bea12..e89b2ed76d2d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3856,9 +3856,11 @@ static const char readme_msg[] =
 	"\t    sort field.  The 'size' parameter can be used to specify more\n"
 	"\t    or fewer than the default 2048 entries for the hashtable size.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
-	"\t    table in its entirety to stdout.  The default format used to\n"
-	"\t    display a given field can be modified by appending any of the\n"
-	"\t    following modifiers to the field name, as applicable:\n\n"
+	"\t    table in its entirety to stdout.  If there are multiple hist\n"
+	"\t    triggers attached to an event, there will be a table for each\n"
+	"\t    trigger in the output.  The default format used to display a\n"
+	"\t    given field can be modified by appending any of the following\n"
+	"\t    modifiers to the field name, as applicable:\n\n"
 	"\t            .hex        display a number as a hex value\n"
 	"\t            .sym        display an address as a symbol\n"
 	"\t            .sym-offset display an address as a symbol and offset\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5d4f02792440..4b02f8ab4dd3 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1032,33 +1032,18 @@ static int print_entries(struct seq_file *m,
 	return n_entries;
 }
 
-static int hist_show(struct seq_file *m, void *v)
+static void hist_trigger_show(struct seq_file *m,
+			      struct event_trigger_data *data, int n)
 {
-	struct event_trigger_data *test, *data = NULL;
-	struct trace_event_file *event_file;
 	struct hist_trigger_data *hist_data;
 	int n_entries, ret = 0;
 
-	mutex_lock(&event_mutex);
-
-	event_file = event_file_data(m->private);
-	if (unlikely(!event_file)) {
-		ret = -ENODEV;
-		goto out_unlock;
-	}
-
-	list_for_each_entry_rcu(test, &event_file->triggers, list) {
-		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
-			data = test;
-			break;
-		}
-	}
-	if (!data)
-		goto out_unlock;
+	if (n > 0)
+		seq_puts(m, "\n\n");
 
 	seq_puts(m, "# event histogram\n#\n# trigger info: ");
 	data->ops->print(m, data->ops, data);
-	seq_puts(m, "\n");
+	seq_puts(m, "#\n\n");
 
 	hist_data = data->private_data;
 	n_entries = print_entries(m, hist_data);
@@ -1070,6 +1055,27 @@ static int hist_show(struct seq_file *m, void *v)
 	seq_printf(m, "\nTotals:\n    Hits: %llu\n    Entries: %u\n    Dropped: %llu\n",
 		   (u64)atomic64_read(&hist_data->map->hits),
 		   n_entries, (u64)atomic64_read(&hist_data->map->drops));
+}
+
+static int hist_show(struct seq_file *m, void *v)
+{
+	struct event_trigger_data *data;
+	struct trace_event_file *event_file;
+	int n = 0, ret = 0;
+
+	mutex_lock(&event_mutex);
+
+	event_file = event_file_data(m->private);
+	if (unlikely(!event_file)) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	list_for_each_entry_rcu(data, &event_file->triggers, list) {
+		if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
+			hist_trigger_show(m, data, n++);
+	}
+
  out_unlock:
 	mutex_unlock(&event_mutex);
 
@@ -1233,6 +1239,54 @@ static void hist_clear(struct event_trigger_data *data)
 	data->paused = paused;
 }
 
+static bool hist_trigger_match(struct event_trigger_data *data,
+			       struct event_trigger_data *data_test)
+{
+	struct tracing_map_sort_key *sort_key, *sort_key_test;
+	struct hist_trigger_data *hist_data, *hist_data_test;
+	struct hist_field *key_field, *key_field_test;
+	unsigned int i;
+
+	hist_data = data->private_data;
+	hist_data_test = data_test->private_data;
+
+	if (hist_data->n_vals != hist_data_test->n_vals ||
+	    hist_data->n_fields != hist_data_test->n_fields ||
+	    hist_data->n_sort_keys != hist_data_test->n_sort_keys)
+		return false;
+
+	if ((data->filter_str && !data_test->filter_str) ||
+	    (!data->filter_str && data_test->filter_str))
+		return false;
+
+	for_each_hist_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+		key_field_test = hist_data_test->fields[i];
+
+		if (key_field->flags != key_field_test->flags)
+			return false;
+		if (key_field->field != key_field_test->field)
+			return false;
+		if (key_field->offset != key_field_test->offset)
+			return false;
+	}
+
+	for (i = 0; i < hist_data->n_sort_keys; i++) {
+		sort_key = &hist_data->sort_keys[i];
+		sort_key_test = &hist_data_test->sort_keys[i];
+
+		if (sort_key->field_idx != sort_key_test->field_idx ||
+		    sort_key->descending != sort_key_test->descending)
+			return false;
+	}
+
+	if (data->filter_str &&
+	    (strcmp(data->filter_str, data_test->filter_str) != 0))
+		return false;
+
+	return true;
+}
+
 static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 				 struct event_trigger_data *data,
 				 struct trace_event_file *file)
@@ -1243,6 +1297,8 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 
 	list_for_each_entry_rcu(test, &file->triggers, list) {
 		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			if (!hist_trigger_match(data, test))
+				continue;
 			if (hist_data->attrs->pause)
 				test->paused = true;
 			else if (hist_data->attrs->cont)
@@ -1282,6 +1338,44 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 	return ret;
 }
 
+static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
+				    struct event_trigger_data *data,
+				    struct trace_event_file *file)
+{
+	struct event_trigger_data *test;
+	bool unregistered = false;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			if (!hist_trigger_match(data, test))
+				continue;
+			unregistered = true;
+			list_del_rcu(&test->list);
+			trace_event_trigger_enable_disable(file, 0);
+			update_cond_flag(file);
+			break;
+		}
+	}
+
+	if (unregistered && test->ops->free)
+		test->ops->free(test->ops, test);
+}
+
+static void hist_unreg_all(struct trace_event_file *file)
+{
+	struct event_trigger_data *test;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			list_del_rcu(&test->list);
+			trace_event_trigger_enable_disable(file, 0);
+			update_cond_flag(file);
+			if (test->ops->free)
+				test->ops->free(test->ops, test);
+		}
+	}
+}
+
 static int event_hist_trigger_func(struct event_command *cmd_ops,
 				   struct trace_event_file *file,
 				   char *glob, char *cmd, char *param)
@@ -1331,22 +1425,19 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
 
 	trigger_data->private_data = hist_data;
 
+	/* if param is non-empty, it's supposed to be a filter */
+	if (param && cmd_ops->set_filter) {
+		ret = cmd_ops->set_filter(param, trigger_data, file);
+		if (ret < 0)
+			goto out_free;
+	}
+
 	if (glob[0] == '!') {
 		cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
 		ret = 0;
 		goto out_free;
 	}
 
-	if (!param) /* if param is non-empty, it's supposed to be a filter */
-		goto out_reg;
-
-	if (!cmd_ops->set_filter)
-		goto out_reg;
-
-	ret = cmd_ops->set_filter(param, trigger_data, file);
-	if (ret < 0)
-		goto out_free;
- out_reg:
 	ret = cmd_ops->reg(glob, trigger_ops, trigger_data, file);
 	/*
 	 * The above returns on success the # of triggers registered,
@@ -1379,7 +1470,8 @@ static struct event_command trigger_hist_cmd = {
 	.flags			= EVENT_CMD_FL_NEEDS_REC,
 	.func			= event_hist_trigger_func,
 	.reg			= hist_register_trigger,
-	.unreg			= unregister_trigger,
+	.unreg			= hist_unregister_trigger,
+	.unreg_all		= hist_unreg_all,
 	.get_trigger_ops	= event_hist_get_trigger_ops,
 	.set_filter		= set_trigger_filter,
 };
@@ -1406,7 +1498,6 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec)
 				test->paused = false;
 			else
 				test->paused = true;
-			break;
 		}
 	}
 }
@@ -1469,12 +1560,28 @@ hist_enable_get_trigger_ops(char *cmd, char *param)
 	return ops;
 }
 
+static void hist_enable_unreg_all(struct trace_event_file *file)
+{
+	struct event_trigger_data *test;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_HIST_ENABLE) {
+			list_del_rcu(&test->list);
+			update_cond_flag(file);
+			trace_event_trigger_enable_disable(file, 0);
+			if (test->ops->free)
+				test->ops->free(test->ops, test);
+		}
+	}
+}
+
 static struct event_command trigger_hist_enable_cmd = {
 	.name			= ENABLE_HIST_STR,
 	.trigger_type		= ETT_HIST_ENABLE,
 	.func			= event_enable_trigger_func,
 	.reg			= event_enable_register_trigger,
 	.unreg			= event_enable_unregister_trigger,
+	.unreg_all		= hist_enable_unreg_all,
 	.get_trigger_ops	= hist_enable_get_trigger_ops,
 	.set_filter		= set_trigger_filter,
 };
@@ -1485,6 +1592,7 @@ static struct event_command trigger_hist_disable_cmd = {
 	.func			= event_enable_trigger_func,
 	.reg			= event_enable_register_trigger,
 	.unreg			= event_enable_unregister_trigger,
+	.unreg_all		= hist_enable_unreg_all,
 	.get_trigger_ops	= hist_enable_get_trigger_ops,
 	.set_filter		= set_trigger_filter,
 };
-- 
cgit v1.2.3


From db1388b4ffa9e31e9ff0abacc3bdb121bec8c688 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:58 -0600
Subject: tracing: Add support for named triggers

Named triggers are sets of triggers that share a common set of trigger
data.  An example of functionality that could benefit from this type
of capability would be a set of inlined probes that would each
contribute event counts, for example, to a shared counter data
structure.

The first named trigger registered with a given name owns the common
trigger data that the others subsequently registered with the same
name will reference.  The functions defined here allow users to add,
delete, and find named triggers.

It also adds functions to pause and unpause named triggers; since
named triggers act upon common data, they should also be paused and
unpaused as a group.

Link: http://lkml.kernel.org/r/c09ff648360f65b10a3e321eddafe18060b4a04f.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.h                |  13 ++++
 kernel/trace/trace_events_trigger.c | 143 ++++++++++++++++++++++++++++++++++++
 2 files changed, 156 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cab1f4bfe85b..727a3d28bce5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1184,7 +1184,11 @@ struct event_trigger_data {
 	char				*filter_str;
 	void				*private_data;
 	bool				paused;
+	bool				paused_tmp;
 	struct list_head		list;
+	char				*name;
+	struct list_head		named_list;
+	struct event_trigger_data	*named_data;
 };
 
 /* Avoid typos */
@@ -1227,6 +1231,15 @@ extern void unregister_trigger(char *glob, struct event_trigger_ops *ops,
 extern int set_trigger_filter(char *filter_str,
 			      struct event_trigger_data *trigger_data,
 			      struct trace_event_file *file);
+extern struct event_trigger_data *find_named_trigger(const char *name);
+extern bool is_named_trigger(struct event_trigger_data *test);
+extern int save_named_trigger(const char *name,
+			      struct event_trigger_data *data);
+extern void del_named_trigger(struct event_trigger_data *data);
+extern void pause_named_trigger(struct event_trigger_data *data);
+extern void unpause_named_trigger(struct event_trigger_data *data);
+extern void set_named_trigger_data(struct event_trigger_data *data,
+				   struct event_trigger_data *named_data);
 extern int register_event_command(struct event_command *cmd);
 extern int unregister_event_command(struct event_command *cmd);
 extern int register_trigger_hist_enable_disable_cmds(void);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d133f2094566..a975571cde24 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -641,6 +641,7 @@ event_trigger_callback(struct event_command *cmd_ops,
 	trigger_data->ops = trigger_ops;
 	trigger_data->cmd_ops = cmd_ops;
 	INIT_LIST_HEAD(&trigger_data->list);
+	INIT_LIST_HEAD(&trigger_data->named_list);
 
 	if (glob[0] == '!') {
 		cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
@@ -764,6 +765,148 @@ int set_trigger_filter(char *filter_str,
 	return ret;
 }
 
+static LIST_HEAD(named_triggers);
+
+/**
+ * find_named_trigger - Find the common named trigger associated with @name
+ * @name: The name of the set of named triggers to find the common data for
+ *
+ * Named triggers are sets of triggers that share a common set of
+ * trigger data.  The first named trigger registered with a given name
+ * owns the common trigger data that the others subsequently
+ * registered with the same name will reference.  This function
+ * returns the common trigger data associated with that first
+ * registered instance.
+ *
+ * Return: the common trigger data for the given named trigger on
+ * success, NULL otherwise.
+ */
+struct event_trigger_data *find_named_trigger(const char *name)
+{
+	struct event_trigger_data *data;
+
+	if (!name)
+		return NULL;
+
+	list_for_each_entry(data, &named_triggers, named_list) {
+		if (data->named_data)
+			continue;
+		if (strcmp(data->name, name) == 0)
+			return data;
+	}
+
+	return NULL;
+}
+
+/**
+ * is_named_trigger - determine if a given trigger is a named trigger
+ * @test: The trigger data to test
+ *
+ * Return: true if 'test' is a named trigger, false otherwise.
+ */
+bool is_named_trigger(struct event_trigger_data *test)
+{
+	struct event_trigger_data *data;
+
+	list_for_each_entry(data, &named_triggers, named_list) {
+		if (test == data)
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * save_named_trigger - save the trigger in the named trigger list
+ * @name: The name of the named trigger set
+ * @data: The trigger data to save
+ *
+ * Return: 0 if successful, negative error otherwise.
+ */
+int save_named_trigger(const char *name, struct event_trigger_data *data)
+{
+	data->name = kstrdup(name, GFP_KERNEL);
+	if (!data->name)
+		return -ENOMEM;
+
+	list_add(&data->named_list, &named_triggers);
+
+	return 0;
+}
+
+/**
+ * del_named_trigger - delete a trigger from the named trigger list
+ * @data: The trigger data to delete
+ */
+void del_named_trigger(struct event_trigger_data *data)
+{
+	kfree(data->name);
+	data->name = NULL;
+
+	list_del(&data->named_list);
+}
+
+static void __pause_named_trigger(struct event_trigger_data *data, bool pause)
+{
+	struct event_trigger_data *test;
+
+	list_for_each_entry(test, &named_triggers, named_list) {
+		if (strcmp(test->name, data->name) == 0) {
+			if (pause) {
+				test->paused_tmp = test->paused;
+				test->paused = true;
+			} else {
+				test->paused = test->paused_tmp;
+			}
+		}
+	}
+}
+
+/**
+ * pause_named_trigger - Pause all named triggers with the same name
+ * @data: The trigger data of a named trigger to pause
+ *
+ * Pauses a named trigger along with all other triggers having the
+ * same name.  Because named triggers share a common set of data,
+ * pausing only one is meaningless, so pausing one named trigger needs
+ * to pause all triggers with the same name.
+ */
+void pause_named_trigger(struct event_trigger_data *data)
+{
+	__pause_named_trigger(data, true);
+}
+
+/**
+ * unpause_named_trigger - Un-pause all named triggers with the same name
+ * @data: The trigger data of a named trigger to unpause
+ *
+ * Un-pauses a named trigger along with all other triggers having the
+ * same name.  Because named triggers share a common set of data,
+ * unpausing only one is meaningless, so unpausing one named trigger
+ * needs to unpause all triggers with the same name.
+ */
+void unpause_named_trigger(struct event_trigger_data *data)
+{
+	__pause_named_trigger(data, false);
+}
+
+/**
+ * set_named_trigger_data - Associate common named trigger data
+ * @data: The trigger data of a named trigger to unpause
+ *
+ * Named triggers are sets of triggers that share a common set of
+ * trigger data.  The first named trigger registered with a given name
+ * owns the common trigger data that the others subsequently
+ * registered with the same name will reference.  This function
+ * associates the common trigger data from the first trigger with the
+ * given trigger.
+ */
+void set_named_trigger_data(struct event_trigger_data *data,
+			    struct event_trigger_data *named_data)
+{
+	data->named_data = named_data;
+}
+
 static void
 traceon_trigger(struct event_trigger_data *data, void *rec)
 {
-- 
cgit v1.2.3


From 5463bfda327b1f7310556ef3136533e27c774f13 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Thu, 3 Mar 2016 12:54:59 -0600
Subject: tracing: Add support for named hist triggers

Allow users to define 'named' hist triggers.  All triggers created
with the same 'name=xxx' option will update the same shared histogram
data.

This expands the hist trigger syntax from this:

    # echo hist:keys=xxx ... [ if filter] > event/trigger

to this:

    # echo hist:name=xxx:keys=xxx ... [ if filter] > event/trigger

Named histograms must use a 'compatible' set of keys and values, which
means each event added to a set of named triggers must have the same
names and types.

Reading the 'hist' file of any of the participating events will
produce the same output as any other participating event, which is to
be expected since they share the same data.

Link: http://lkml.kernel.org/r/1dbc84ee3322a75daaf5b3ef1d0cc0a2fb682fc7.1457029949.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Tested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Reviewed-by: Namhyung Kim <namhyung@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 Documentation/trace/events.txt   | 274 +++++++++++++++++++++++++++++++++++++--
 kernel/trace/trace.c             |  14 +-
 kernel/trace/trace_events_hist.c | 148 ++++++++++++++++++---
 3 files changed, 407 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index cc0216760b78..08d74d75150d 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -524,7 +524,7 @@ The following commands are supported:
 
         hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
           [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
-          [:clear] [if <filter>]
+          [:clear][:name=histname1] [if <filter>]
 
   When a matching event is hit, an entry is added to a hash table
   using the key(s) and value(s) named.  Keys and values correspond to
@@ -546,18 +546,28 @@ The following commands are supported:
   specified by the 'sort' keyword.  If more than one field is
   specified, the result will be a 'sort within a sort': the first key
   is taken to be the primary sort key and the second the secondary
-  key.
+  key.  If a hist trigger is given a name using the 'name' parameter,
+  its histogram data will be shared with other triggers of the same
+  name, and trigger hits will update this common data.  Only triggers
+  with 'compatible' fields can be combined in this way; triggers are
+  'compatible' if the fields named in the trigger share the same
+  number and type of fields and those fields also have the same names.
+  Note that any two events always share the compatible 'hitcount' and
+  'stacktrace' fields and can therefore be combined using those
+  fields, however pointless that may be.
 
   'hist' triggers add a 'hist' file to each event's subdirectory.
   Reading the 'hist' file for the event will dump the hash table in
   its entirety to stdout.  If there are multiple hist triggers
   attached to an event, there will be a table for each trigger in the
-  output.  Each printed hash table entry is a simple list of the keys
-  and values comprising the entry; keys are printed first and are
-  delineated by curly braces, and are followed by the set of value
-  fields for the entry.  By default, numeric fields are displayed as
-  base-10 integers.  This can be modified by appending any of the
-  following modifiers to the field name:
+  output.  The table displayed for a named trigger will be the same as
+  any other instance having the same name. Each printed hash table
+  entry is a simple list of the keys and values comprising the entry;
+  keys are printed first and are delineated by curly braces, and are
+  followed by the set of value fields for the entry.  By default,
+  numeric fields are displayed as base-10 integers.  This can be
+  modified by appending any of the following modifiers to the field
+  name:
 
         .hex        display a number as a hex value
 	.sym        display an address as a symbol
@@ -1809,3 +1819,251 @@ The following commands are supported:
         Hits: 0
         Entries: 0
         Dropped: 0
+
+  Named triggers can be used to have triggers share a common set of
+  histogram data.  This capability is mostly useful for combining the
+  output of events generated by tracepoints contained inside inline
+  functions, but names can be used in a hist trigger on any event.
+  For example, these two triggers when hit will update the same 'len'
+  field in the shared 'foo' histogram data:
+
+    # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
+           /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
+    # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
+           /sys/kernel/debug/tracing/events/net/netif_rx/trigger
+
+  You can see that they're updating common histogram data by reading
+  each event's hist files at the same time:
+
+    # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
+      cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
+
+    # event histogram
+    #
+    # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
+    #
+
+    { skbaddr: ffff88000ad53500 } hitcount:          1  len:         46
+    { skbaddr: ffff8800af5a1500 } hitcount:          1  len:         76
+    { skbaddr: ffff8800d62a1900 } hitcount:          1  len:         46
+    { skbaddr: ffff8800d2bccb00 } hitcount:          1  len:        468
+    { skbaddr: ffff8800d3c69900 } hitcount:          1  len:         46
+    { skbaddr: ffff88009ff09100 } hitcount:          1  len:         52
+    { skbaddr: ffff88010f13ab00 } hitcount:          1  len:        168
+    { skbaddr: ffff88006a54f400 } hitcount:          1  len:         46
+    { skbaddr: ffff8800d2bcc500 } hitcount:          1  len:        260
+    { skbaddr: ffff880064505000 } hitcount:          1  len:         46
+    { skbaddr: ffff8800baf24e00 } hitcount:          1  len:         32
+    { skbaddr: ffff88009fe0ad00 } hitcount:          1  len:         46
+    { skbaddr: ffff8800d3edff00 } hitcount:          1  len:         44
+    { skbaddr: ffff88009fe0b400 } hitcount:          1  len:        168
+    { skbaddr: ffff8800a1c55a00 } hitcount:          1  len:         40
+    { skbaddr: ffff8800d2bcd100 } hitcount:          1  len:         40
+    { skbaddr: ffff880064505f00 } hitcount:          1  len:        174
+    { skbaddr: ffff8800a8bff200 } hitcount:          1  len:        160
+    { skbaddr: ffff880044e3cc00 } hitcount:          1  len:         76
+    { skbaddr: ffff8800a8bfe700 } hitcount:          1  len:         46
+    { skbaddr: ffff8800d2bcdc00 } hitcount:          1  len:         32
+    { skbaddr: ffff8800a1f64800 } hitcount:          1  len:         46
+    { skbaddr: ffff8800d2bcde00 } hitcount:          1  len:        988
+    { skbaddr: ffff88006a5dea00 } hitcount:          1  len:         46
+    { skbaddr: ffff88002e37a200 } hitcount:          1  len:         44
+    { skbaddr: ffff8800a1f32c00 } hitcount:          2  len:        676
+    { skbaddr: ffff88000ad52600 } hitcount:          2  len:        107
+    { skbaddr: ffff8800a1f91e00 } hitcount:          2  len:         92
+    { skbaddr: ffff8800af5a0200 } hitcount:          2  len:        142
+    { skbaddr: ffff8800d2bcc600 } hitcount:          2  len:        220
+    { skbaddr: ffff8800ba36f500 } hitcount:          2  len:         92
+    { skbaddr: ffff8800d021f800 } hitcount:          2  len:         92
+    { skbaddr: ffff8800a1f33600 } hitcount:          2  len:        675
+    { skbaddr: ffff8800a8bfff00 } hitcount:          3  len:        138
+    { skbaddr: ffff8800d62a1300 } hitcount:          3  len:        138
+    { skbaddr: ffff88002e37a100 } hitcount:          4  len:        184
+    { skbaddr: ffff880064504400 } hitcount:          4  len:        184
+    { skbaddr: ffff8800a8bfec00 } hitcount:          4  len:        184
+    { skbaddr: ffff88000ad53700 } hitcount:          5  len:        230
+    { skbaddr: ffff8800d2bcdb00 } hitcount:          5  len:        196
+    { skbaddr: ffff8800a1f90000 } hitcount:          6  len:        276
+    { skbaddr: ffff88006a54f900 } hitcount:          6  len:        276
+
+    Totals:
+        Hits: 81
+        Entries: 42
+        Dropped: 0
+    # event histogram
+    #
+    # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
+    #
+
+    { skbaddr: ffff88000ad53500 } hitcount:          1  len:         46
+    { skbaddr: ffff8800af5a1500 } hitcount:          1  len:         76
+    { skbaddr: ffff8800d62a1900 } hitcount:          1  len:         46
+    { skbaddr: ffff8800d2bccb00 } hitcount:          1  len:        468
+    { skbaddr: ffff8800d3c69900 } hitcount:          1  len:         46
+    { skbaddr: ffff88009ff09100 } hitcount:          1  len:         52
+    { skbaddr: ffff88010f13ab00 } hitcount:          1  len:        168
+    { skbaddr: ffff88006a54f400 } hitcount:          1  len:         46
+    { skbaddr: ffff8800d2bcc500 } hitcount:          1  len:        260
+    { skbaddr: ffff880064505000 } hitcount:          1  len:         46
+    { skbaddr: ffff8800baf24e00 } hitcount:          1  len:         32
+    { skbaddr: ffff88009fe0ad00 } hitcount:          1  len:         46
+    { skbaddr: ffff8800d3edff00 } hitcount:          1  len:         44
+    { skbaddr: ffff88009fe0b400 } hitcount:          1  len:        168
+    { skbaddr: ffff8800a1c55a00 } hitcount:          1  len:         40
+    { skbaddr: ffff8800d2bcd100 } hitcount:          1  len:         40
+    { skbaddr: ffff880064505f00 } hitcount:          1  len:        174
+    { skbaddr: ffff8800a8bff200 } hitcount:          1  len:        160
+    { skbaddr: ffff880044e3cc00 } hitcount:          1  len:         76
+    { skbaddr: ffff8800a8bfe700 } hitcount:          1  len:         46
+    { skbaddr: ffff8800d2bcdc00 } hitcount:          1  len:         32
+    { skbaddr: ffff8800a1f64800 } hitcount:          1  len:         46
+    { skbaddr: ffff8800d2bcde00 } hitcount:          1  len:        988
+    { skbaddr: ffff88006a5dea00 } hitcount:          1  len:         46
+    { skbaddr: ffff88002e37a200 } hitcount:          1  len:         44
+    { skbaddr: ffff8800a1f32c00 } hitcount:          2  len:        676
+    { skbaddr: ffff88000ad52600 } hitcount:          2  len:        107
+    { skbaddr: ffff8800a1f91e00 } hitcount:          2  len:         92
+    { skbaddr: ffff8800af5a0200 } hitcount:          2  len:        142
+    { skbaddr: ffff8800d2bcc600 } hitcount:          2  len:        220
+    { skbaddr: ffff8800ba36f500 } hitcount:          2  len:         92
+    { skbaddr: ffff8800d021f800 } hitcount:          2  len:         92
+    { skbaddr: ffff8800a1f33600 } hitcount:          2  len:        675
+    { skbaddr: ffff8800a8bfff00 } hitcount:          3  len:        138
+    { skbaddr: ffff8800d62a1300 } hitcount:          3  len:        138
+    { skbaddr: ffff88002e37a100 } hitcount:          4  len:        184
+    { skbaddr: ffff880064504400 } hitcount:          4  len:        184
+    { skbaddr: ffff8800a8bfec00 } hitcount:          4  len:        184
+    { skbaddr: ffff88000ad53700 } hitcount:          5  len:        230
+    { skbaddr: ffff8800d2bcdb00 } hitcount:          5  len:        196
+    { skbaddr: ffff8800a1f90000 } hitcount:          6  len:        276
+    { skbaddr: ffff88006a54f900 } hitcount:          6  len:        276
+
+    Totals:
+        Hits: 81
+        Entries: 42
+        Dropped: 0
+
+  And here's an example that shows how to combine histogram data from
+  any two events even if they don't share any 'compatible' fields
+  other than 'hitcount' and 'stacktrace'.  These commands create a
+  couple of triggers named 'bar' using those fields:
+
+    # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
+           /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
+    # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
+          /sys/kernel/debug/tracing/events/net/netif_rx/trigger
+
+  And displaying the output of either shows some interesting if
+  somewhat confusing output:
+
+    # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
+    # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
+
+    # event histogram
+    #
+    # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
+    #
+
+    { stacktrace:
+             _do_fork+0x18e/0x330
+             kernel_thread+0x29/0x30
+             kthreadd+0x154/0x1b0
+             ret_from_fork+0x3f/0x70
+    } hitcount:          1
+    { stacktrace:
+             netif_rx_internal+0xb2/0xd0
+             netif_rx_ni+0x20/0x70
+             dev_loopback_xmit+0xaa/0xd0
+             ip_mc_output+0x126/0x240
+             ip_local_out_sk+0x31/0x40
+             igmp_send_report+0x1e9/0x230
+             igmp_timer_expire+0xe9/0x120
+             call_timer_fn+0x39/0xf0
+             run_timer_softirq+0x1e1/0x290
+             __do_softirq+0xfd/0x290
+             irq_exit+0x98/0xb0
+             smp_apic_timer_interrupt+0x4a/0x60
+             apic_timer_interrupt+0x6d/0x80
+             cpuidle_enter+0x17/0x20
+             call_cpuidle+0x3b/0x60
+             cpu_startup_entry+0x22d/0x310
+    } hitcount:          1
+    { stacktrace:
+             netif_rx_internal+0xb2/0xd0
+             netif_rx_ni+0x20/0x70
+             dev_loopback_xmit+0xaa/0xd0
+             ip_mc_output+0x17f/0x240
+             ip_local_out_sk+0x31/0x40
+             ip_send_skb+0x1a/0x50
+             udp_send_skb+0x13e/0x270
+             udp_sendmsg+0x2bf/0x980
+             inet_sendmsg+0x67/0xa0
+             sock_sendmsg+0x38/0x50
+             SYSC_sendto+0xef/0x170
+             SyS_sendto+0xe/0x10
+             entry_SYSCALL_64_fastpath+0x12/0x6a
+    } hitcount:          2
+    { stacktrace:
+             netif_rx_internal+0xb2/0xd0
+             netif_rx+0x1c/0x60
+             loopback_xmit+0x6c/0xb0
+             dev_hard_start_xmit+0x219/0x3a0
+             __dev_queue_xmit+0x415/0x4f0
+             dev_queue_xmit_sk+0x13/0x20
+             ip_finish_output2+0x237/0x340
+             ip_finish_output+0x113/0x1d0
+             ip_output+0x66/0xc0
+             ip_local_out_sk+0x31/0x40
+             ip_send_skb+0x1a/0x50
+             udp_send_skb+0x16d/0x270
+             udp_sendmsg+0x2bf/0x980
+             inet_sendmsg+0x67/0xa0
+             sock_sendmsg+0x38/0x50
+             ___sys_sendmsg+0x14e/0x270
+    } hitcount:         76
+    { stacktrace:
+             netif_rx_internal+0xb2/0xd0
+             netif_rx+0x1c/0x60
+             loopback_xmit+0x6c/0xb0
+             dev_hard_start_xmit+0x219/0x3a0
+             __dev_queue_xmit+0x415/0x4f0
+             dev_queue_xmit_sk+0x13/0x20
+             ip_finish_output2+0x237/0x340
+             ip_finish_output+0x113/0x1d0
+             ip_output+0x66/0xc0
+             ip_local_out_sk+0x31/0x40
+             ip_send_skb+0x1a/0x50
+             udp_send_skb+0x16d/0x270
+             udp_sendmsg+0x2bf/0x980
+             inet_sendmsg+0x67/0xa0
+             sock_sendmsg+0x38/0x50
+             ___sys_sendmsg+0x269/0x270
+    } hitcount:         77
+    { stacktrace:
+             netif_rx_internal+0xb2/0xd0
+             netif_rx+0x1c/0x60
+             loopback_xmit+0x6c/0xb0
+             dev_hard_start_xmit+0x219/0x3a0
+             __dev_queue_xmit+0x415/0x4f0
+             dev_queue_xmit_sk+0x13/0x20
+             ip_finish_output2+0x237/0x340
+             ip_finish_output+0x113/0x1d0
+             ip_output+0x66/0xc0
+             ip_local_out_sk+0x31/0x40
+             ip_send_skb+0x1a/0x50
+             udp_send_skb+0x16d/0x270
+             udp_sendmsg+0x2bf/0x980
+             inet_sendmsg+0x67/0xa0
+             sock_sendmsg+0x38/0x50
+             SYSC_sendto+0xef/0x170
+    } hitcount:         88
+    { stacktrace:
+             _do_fork+0x18e/0x330
+             SyS_clone+0x19/0x20
+             entry_SYSCALL_64_fastpath+0x12/0x6a
+    } hitcount:        244
+
+    Totals:
+        Hits: 489
+        Entries: 7
+        Dropped: 0
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e89b2ed76d2d..4e342d354c12 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3842,6 +3842,7 @@ static const char readme_msg[] =
 	"\t            [:sort=<field1[,field2,...]>]\n"
 	"\t            [:size=#entries]\n"
 	"\t            [:pause][:continue][:clear]\n"
+	"\t            [:name=histname1]\n"
 	"\t            [if <filter>]\n\n"
 	"\t    When a matching event is hit, an entry is added to a hash\n"
 	"\t    table using the key(s) and value(s) named, and the value of a\n"
@@ -3854,13 +3855,18 @@ static const char readme_msg[] =
 	"\t    specified using the 'sort' keyword.  The sort direction can\n"
 	"\t    be modified by appending '.descending' or '.ascending' to a\n"
 	"\t    sort field.  The 'size' parameter can be used to specify more\n"
-	"\t    or fewer than the default 2048 entries for the hashtable size.\n\n"
+	"\t    or fewer than the default 2048 entries for the hashtable size.\n"
+	"\t    If a hist trigger is given a name using the 'name' parameter,\n"
+	"\t    its histogram data will be shared with other triggers of the\n"
+	"\t    same name, and trigger hits will update this common data.\n\n"
 	"\t    Reading the 'hist' file for the event will dump the hash\n"
 	"\t    table in its entirety to stdout.  If there are multiple hist\n"
 	"\t    triggers attached to an event, there will be a table for each\n"
-	"\t    trigger in the output.  The default format used to display a\n"
-	"\t    given field can be modified by appending any of the following\n"
-	"\t    modifiers to the field name, as applicable:\n\n"
+	"\t    trigger in the output.  The table displayed for a named\n"
+	"\t    trigger will be the same as any other instance having the\n"
+	"\t    same name.  The default format used to display a given field\n"
+	"\t    can be modified by appending any of the following modifiers\n"
+	"\t    to the field name, as applicable:\n\n"
 	"\t            .hex        display a number as a hex value\n"
 	"\t            .sym        display an address as a symbol\n"
 	"\t            .sym-offset display an address as a symbol and offset\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 4b02f8ab4dd3..bdea5603cbde 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -117,6 +117,7 @@ struct hist_trigger_attrs {
 	char		*keys_str;
 	char		*vals_str;
 	char		*sort_key_str;
+	char		*name;
 	bool		pause;
 	bool		cont;
 	bool		clear;
@@ -200,6 +201,7 @@ static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
 	if (!attrs)
 		return;
 
+	kfree(attrs->name);
 	kfree(attrs->sort_key_str);
 	kfree(attrs->keys_str);
 	kfree(attrs->vals_str);
@@ -227,6 +229,8 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
 			attrs->vals_str = kstrdup(str, GFP_KERNEL);
 		else if (strncmp(str, "sort=", strlen("sort=")) == 0)
 			attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
+		else if (strncmp(str, "name=", strlen("name=")) == 0)
+			attrs->name = kstrdup(str, GFP_KERNEL);
 		else if (strcmp(str, "pause") == 0)
 			attrs->pause = true;
 		else if ((strcmp(str, "cont") == 0) ||
@@ -1131,7 +1135,12 @@ static int event_hist_trigger_print(struct seq_file *m,
 	struct hist_field *key_field;
 	unsigned int i;
 
-	seq_puts(m, "hist:keys=");
+	seq_puts(m, "hist:");
+
+	if (data->name)
+		seq_printf(m, "%s:", data->name);
+
+	seq_puts(m, "keys=");
 
 	for_each_hist_key_field(i, hist_data) {
 		key_field = hist_data->fields[i];
@@ -1196,6 +1205,19 @@ static int event_hist_trigger_print(struct seq_file *m,
 	return 0;
 }
 
+static int event_hist_trigger_init(struct event_trigger_ops *ops,
+				   struct event_trigger_data *data)
+{
+	struct hist_trigger_data *hist_data = data->private_data;
+
+	if (!data->ref && hist_data->attrs->name)
+		save_named_trigger(hist_data->attrs->name, data);
+
+	data->ref++;
+
+	return 0;
+}
+
 static void event_hist_trigger_free(struct event_trigger_ops *ops,
 				    struct event_trigger_data *data)
 {
@@ -1206,6 +1228,8 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops,
 
 	data->ref--;
 	if (!data->ref) {
+		if (data->name)
+			del_named_trigger(data);
 		trigger_data_free(data);
 		destroy_hist_data(hist_data);
 	}
@@ -1214,10 +1238,44 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops,
 static struct event_trigger_ops event_hist_trigger_ops = {
 	.func			= event_hist_trigger,
 	.print			= event_hist_trigger_print,
-	.init			= event_trigger_init,
+	.init			= event_hist_trigger_init,
 	.free			= event_hist_trigger_free,
 };
 
+static int event_hist_trigger_named_init(struct event_trigger_ops *ops,
+					 struct event_trigger_data *data)
+{
+	data->ref++;
+
+	save_named_trigger(data->named_data->name, data);
+
+	event_hist_trigger_init(ops, data->named_data);
+
+	return 0;
+}
+
+static void event_hist_trigger_named_free(struct event_trigger_ops *ops,
+					  struct event_trigger_data *data)
+{
+	if (WARN_ON_ONCE(data->ref <= 0))
+		return;
+
+	event_hist_trigger_free(ops, data->named_data);
+
+	data->ref--;
+	if (!data->ref) {
+		del_named_trigger(data);
+		trigger_data_free(data);
+	}
+}
+
+static struct event_trigger_ops event_hist_trigger_named_ops = {
+	.func			= event_hist_trigger,
+	.print			= event_hist_trigger_print,
+	.init			= event_hist_trigger_named_init,
+	.free			= event_hist_trigger_named_free,
+};
+
 static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
 							    char *param)
 {
@@ -1227,26 +1285,54 @@ static struct event_trigger_ops *event_hist_get_trigger_ops(char *cmd,
 static void hist_clear(struct event_trigger_data *data)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
-	bool paused;
 
-	paused = data->paused;
-	data->paused = true;
+	if (data->name)
+		pause_named_trigger(data);
 
 	synchronize_sched();
 
 	tracing_map_clear(hist_data->map);
 
-	data->paused = paused;
+	if (data->name)
+		unpause_named_trigger(data);
+}
+
+static bool compatible_field(struct ftrace_event_field *field,
+			     struct ftrace_event_field *test_field)
+{
+	if (field == test_field)
+		return true;
+	if (field == NULL || test_field == NULL)
+		return false;
+	if (strcmp(field->name, test_field->name) != 0)
+		return false;
+	if (strcmp(field->type, test_field->type) != 0)
+		return false;
+	if (field->size != test_field->size)
+		return false;
+	if (field->is_signed != test_field->is_signed)
+		return false;
+
+	return true;
 }
 
 static bool hist_trigger_match(struct event_trigger_data *data,
-			       struct event_trigger_data *data_test)
+			       struct event_trigger_data *data_test,
+			       struct event_trigger_data *named_data,
+			       bool ignore_filter)
 {
 	struct tracing_map_sort_key *sort_key, *sort_key_test;
 	struct hist_trigger_data *hist_data, *hist_data_test;
 	struct hist_field *key_field, *key_field_test;
 	unsigned int i;
 
+	if (named_data && (named_data != data_test) &&
+	    (named_data != data_test->named_data))
+		return false;
+
+	if (!named_data && is_named_trigger(data_test))
+		return false;
+
 	hist_data = data->private_data;
 	hist_data_test = data_test->private_data;
 
@@ -1255,9 +1341,11 @@ static bool hist_trigger_match(struct event_trigger_data *data,
 	    hist_data->n_sort_keys != hist_data_test->n_sort_keys)
 		return false;
 
-	if ((data->filter_str && !data_test->filter_str) ||
-	    (!data->filter_str && data_test->filter_str))
-		return false;
+	if (!ignore_filter) {
+		if ((data->filter_str && !data_test->filter_str) ||
+		   (!data->filter_str && data_test->filter_str))
+			return false;
+	}
 
 	for_each_hist_field(i, hist_data) {
 		key_field = hist_data->fields[i];
@@ -1265,7 +1353,7 @@ static bool hist_trigger_match(struct event_trigger_data *data,
 
 		if (key_field->flags != key_field_test->flags)
 			return false;
-		if (key_field->field != key_field_test->field)
+		if (!compatible_field(key_field->field, key_field_test->field))
 			return false;
 		if (key_field->offset != key_field_test->offset)
 			return false;
@@ -1280,7 +1368,7 @@ static bool hist_trigger_match(struct event_trigger_data *data,
 			return false;
 	}
 
-	if (data->filter_str &&
+	if (!ignore_filter && data->filter_str &&
 	    (strcmp(data->filter_str, data_test->filter_str) != 0))
 		return false;
 
@@ -1292,12 +1380,26 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 				 struct trace_event_file *file)
 {
 	struct hist_trigger_data *hist_data = data->private_data;
-	struct event_trigger_data *test;
+	struct event_trigger_data *test, *named_data = NULL;
 	int ret = 0;
 
+	if (hist_data->attrs->name) {
+		named_data = find_named_trigger(hist_data->attrs->name);
+		if (named_data) {
+			if (!hist_trigger_match(data, named_data, named_data,
+						true)) {
+				ret = -EINVAL;
+				goto out;
+			}
+		}
+	}
+
+	if (hist_data->attrs->name && !named_data)
+		goto new;
+
 	list_for_each_entry_rcu(test, &file->triggers, list) {
 		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
-			if (!hist_trigger_match(data, test))
+			if (!hist_trigger_match(data, test, named_data, false))
 				continue;
 			if (hist_data->attrs->pause)
 				test->paused = true;
@@ -1310,12 +1412,19 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 			goto out;
 		}
 	}
-
+ new:
 	if (hist_data->attrs->cont || hist_data->attrs->clear) {
 		ret = -ENOENT;
 		goto out;
 	}
 
+	if (named_data) {
+		destroy_hist_data(data->private_data);
+		data->private_data = named_data->private_data;
+		set_named_trigger_data(data, named_data);
+		data->ops = &event_hist_trigger_named_ops;
+	}
+
 	if (hist_data->attrs->pause)
 		data->paused = true;
 
@@ -1329,6 +1438,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
 	ret++;
 
 	update_cond_flag(file);
+
 	if (trace_event_trigger_enable_disable(file, 1) < 0) {
 		list_del_rcu(&data->list);
 		update_cond_flag(file);
@@ -1342,12 +1452,16 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
 				    struct event_trigger_data *data,
 				    struct trace_event_file *file)
 {
-	struct event_trigger_data *test;
+	struct hist_trigger_data *hist_data = data->private_data;
+	struct event_trigger_data *test, *named_data = NULL;
 	bool unregistered = false;
 
+	if (hist_data->attrs->name)
+		named_data = find_named_trigger(hist_data->attrs->name);
+
 	list_for_each_entry_rcu(test, &file->triggers, list) {
 		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
-			if (!hist_trigger_match(data, test))
+			if (!hist_trigger_match(data, test, named_data, false))
 				continue;
 			unregistered = true;
 			list_del_rcu(&test->list);
-- 
cgit v1.2.3


From 4b94f5b7b4a5ffd885609bd033af2ecca0c9cc54 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@kernel.org>
Date: Thu, 3 Mar 2016 12:55:02 -0600
Subject: tracing: Add hist trigger 'log2' modifier

Allow users to have numeric fields displayed as log2 values in case
value range is very wide by appending '.log2' to field names.

For example,

  # echo 'hist:key=bytes_req' > kmalloc/trigger
  # cat kmalloc/hist

  { bytes_req:        504 } hitcount:          1
  { bytes_req:         11 } hitcount:          1
  { bytes_req:        104 } hitcount:          1
  { bytes_req:         48 } hitcount:          1
  { bytes_req:       2048 } hitcount:          1
  { bytes_req:       4096 } hitcount:          1
  { bytes_req:        240 } hitcount:          1
  { bytes_req:        392 } hitcount:          1
  { bytes_req:         13 } hitcount:          1
  { bytes_req:         28 } hitcount:          1
  { bytes_req:         12 } hitcount:          1
  { bytes_req:         64 } hitcount:          2
  { bytes_req:        128 } hitcount:          2
  { bytes_req:         32 } hitcount:          2
  { bytes_req:          8 } hitcount:         11
  { bytes_req:         10 } hitcount:         13
  { bytes_req:         24 } hitcount:         25
  { bytes_req:        160 } hitcount:         29
  { bytes_req:         16 } hitcount:         33
  { bytes_req:         80 } hitcount:         36

When using '.log2' modifier, the output looks like:

  # echo 'hist:key=bytes_req.log2' > kmalloc/trigger
  # cat kmalloc/hist

  { bytes_req: ~ 2^12 } hitcount:          1
  { bytes_req: ~ 2^11 } hitcount:          1
  { bytes_req: ~ 2^9  } hitcount:          2
  { bytes_req: ~ 2^6  } hitcount:          3
  { bytes_req: ~ 2^3  } hitcount:         13
  { bytes_req: ~ 2^5  } hitcount:         19
  { bytes_req: ~ 2^8  } hitcount:         49
  { bytes_req: ~ 2^7  } hitcount:         57
  { bytes_req: ~ 2^4  } hitcount:         74

Link: http://lkml.kernel.org/r/7ff396b246c6a881f46b979735fddf05a0d6c71a.1457029949.git.tom.zanussi@linux.intel.com

Cc: Tom Zanussi <tom.zanussi@linux.intel.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Namhyung Kim <namhyung@kernel.org>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c             |  1 +
 kernel/trace/trace_events_hist.c | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4e342d354c12..988a35263fdd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3872,6 +3872,7 @@ static const char readme_msg[] =
 	"\t            .sym-offset display an address as a symbol and offset\n"
 	"\t            .execname   display a common_pid as a program name\n"
 	"\t            .syscall    display a syscall id as a syscall name\n\n"
+	"\t            .log2       display log2 value rather than raw number\n\n"
 	"\t    The 'pause' parameter can be used to pause an existing hist\n"
 	"\t    trigger or to start a hist trigger but not log any events\n"
 	"\t    until told to do so.  'continue' can be used to start or\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index bdea5603cbde..23df38aab503 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -68,6 +68,13 @@ static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
 	return (u64)(unsigned long)*addr;
 }
 
+static u64 hist_field_log2(struct hist_field *hist_field, void *event)
+{
+	u64 val = *(u64 *)(event + hist_field->field->offset);
+
+	return (u64) ilog2(roundup_pow_of_two(val));
+}
+
 #define DEFINE_HIST_FIELD_FN(type)					\
 static u64 hist_field_##type(struct hist_field *hist_field, void *event)\
 {									\
@@ -111,6 +118,7 @@ enum hist_field_flags {
 	HIST_FIELD_FL_EXECNAME		= 64,
 	HIST_FIELD_FL_SYSCALL		= 128,
 	HIST_FIELD_FL_STACKTRACE	= 256,
+	HIST_FIELD_FL_LOG2		= 512,
 };
 
 struct hist_trigger_attrs {
@@ -358,6 +366,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 		goto out;
 	}
 
+	if (flags & HIST_FIELD_FL_LOG2) {
+		hist_field->fn = hist_field_log2;
+		goto out;
+	}
+
 	if (is_string_field(field)) {
 		flags |= HIST_FIELD_FL_STRING;
 
@@ -522,6 +535,8 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 				flags |= HIST_FIELD_FL_EXECNAME;
 			else if (strcmp(field_str, "syscall") == 0)
 				flags |= HIST_FIELD_FL_SYSCALL;
+			else if (strcmp(field_str, "log2") == 0)
+				flags |= HIST_FIELD_FL_LOG2;
 			else {
 				ret = -EINVAL;
 				goto out;
@@ -980,6 +995,9 @@ hist_trigger_entry_print(struct seq_file *m,
 						      key + key_field->offset,
 						      HIST_STACKTRACE_DEPTH);
 			multiline = true;
+		} else if (key_field->flags & HIST_FIELD_FL_LOG2) {
+			seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
+				   *(u64 *)(key + key_field->offset));
 		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
 			seq_printf(m, "%s: %-50s", key_field->field->name,
 				   (char *)(key + key_field->offset));
@@ -1112,6 +1130,8 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
 		flags_str = "execname";
 	else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
 		flags_str = "syscall";
+	else if (hist_field->flags & HIST_FIELD_FL_LOG2)
+		flags_str = "log2";
 
 	return flags_str;
 }
-- 
cgit v1.2.3


From d50c744ecde7ee3ba4d7ffb0e1c55e7a2f6bbc8e Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 8 Mar 2016 17:17:15 -0500
Subject: tracing: Fix unsigned comparison to zero in hist trigger code

Fengguang Wu's bot found two comparisons of unsigned integers to zero. These
were real bugs, as it would miss error conditions returned to zero.

trace_events_hist.c:426:6-9: WARNING: Unsigned expression compared with zero: idx < 0
trace_events_hist.c:568:5-14: WARNING: Unsigned expression compared with zero: n_entries < 0

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 23df38aab503..f98b6b3a2804 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -734,7 +734,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
 	struct tracing_map *map = hist_data->map;
 	struct ftrace_event_field *field;
 	struct hist_field *hist_field;
-	unsigned int i, idx;
+	int i, idx;
 
 	for_each_hist_field(i, hist_data) {
 		hist_field = hist_data->fields[i];
@@ -1036,7 +1036,7 @@ static int print_entries(struct seq_file *m,
 {
 	struct tracing_map_sort_entry **sort_entries = NULL;
 	struct tracing_map *map = hist_data->map;
-	unsigned int i, n_entries;
+	int i, n_entries;
 
 	n_entries = tracing_map_sort_entries(map, hist_data->sort_keys,
 					     hist_data->n_sort_keys,
-- 
cgit v1.2.3


From 1e33759c788c78f31d4d6f65bac647b23624734c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 18 Apr 2016 21:01:23 +0200
Subject: bpf, trace: add BPF_F_CURRENT_CPU flag for bpf_perf_event_output

Add a BPF_F_CURRENT_CPU flag to optimize the use-case where user space has
per-CPU ring buffers and the eBPF program pushes the data into the current
CPU's ring buffer which saves us an extra helper function call in eBPF.
Also, make sure to properly reserve the remaining flags which are not used.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 4 ++++
 kernel/trace/bpf_trace.c | 7 ++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 70eda5aeb304..b7b0fb1292e7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -347,6 +347,10 @@ enum bpf_func_id {
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
 #define BPF_F_DONT_FRAGMENT		(1ULL << 2)
 
+/* BPF_FUNC_perf_event_output flags. */
+#define BPF_F_INDEX_MASK		0xffffffffULL
+#define BPF_F_CURRENT_CPU		BPF_F_INDEX_MASK
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f389629dade7..b3cc24cb4321 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -225,11 +225,12 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
-static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 {
 	struct pt_regs *regs = (struct pt_regs *) (long) r1;
 	struct bpf_map *map = (struct bpf_map *) (long) r2;
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u64 index = flags & BPF_F_INDEX_MASK;
 	void *data = (void *) (long) r4;
 	struct perf_sample_data sample_data;
 	struct perf_event *event;
@@ -239,6 +240,10 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
 		.data = data,
 	};
 
+	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+		return -EINVAL;
+	if (index == BPF_F_CURRENT_CPU)
+		index = raw_smp_processor_id();
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
-- 
cgit v1.2.3


From bd570ff970a54df653b48ed0cfb373f2ebed083d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 18 Apr 2016 21:01:24 +0200
Subject: bpf: add event output helper for notifications/sampling/logging

This patch adds a new helper for cls/act programs that can push events
to user space applications. For networking, this can be f.e. for sampling,
debugging, logging purposes or pushing of arbitrary wake-up events. The
idea is similar to a43eec304259 ("bpf: introduce bpf_perf_event_output()
helper") and 39111695b1b8 ("samples: bpf: add bpf_perf_event_output example").

The eBPF program utilizes a perf event array map that user space populates
with fds from perf_event_open(), the eBPF program calls into the helper
f.e. as skb_event_output(skb, &my_map, BPF_F_CURRENT_CPU, raw, sizeof(raw))
so that the raw data is pushed into the fd f.e. at the map index of the
current CPU.

User space can poll/mmap/etc on this and has a data channel for receiving
events that can be post-processed. The nice thing is that since the eBPF
program and user space application making use of it are tightly coupled,
they can define their own arbitrary raw data format and what/when they
want to push.

While f.e. packet headers could be one part of the meta data that is being
pushed, this is not a substitute for things like packet sockets as whole
packet is not being pushed and push is only done in a single direction.
Intention is more of a generically usable, efficient event pipe to applications.
Workflow is that tc can pin the map and applications can attach themselves
e.g. after cls/act setup to one or multiple map slots, demuxing is done by
the eBPF program.

Adding this facility is with minimal effort, it reuses the helper
introduced in a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
and we get its functionality for free by overloading its BPF_FUNC_ identifier
for cls/act programs, ctx is currently unused, but will be made use of in
future. Example will be added to iproute2's BPF example files.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  2 ++
 kernel/bpf/core.c        |  7 +++++++
 kernel/trace/bpf_trace.c | 27 +++++++++++++++++++++++++++
 net/core/filter.c        |  2 ++
 4 files changed, 38 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5fb3c610fa96..f63afdc43bec 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -169,7 +169,9 @@ u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_fd_array_map_clear(struct bpf_map *map);
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
+
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
+const struct bpf_func_proto *bpf_get_event_output_proto(void);
 
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index be0abf669ced..e4248fe79513 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -764,14 +764,21 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
 const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+
 const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
+
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
 	return NULL;
 }
 
+const struct bpf_func_proto * __weak bpf_get_event_output_proto(void)
+{
+	return NULL;
+}
+
 /* Always built-in helper functions. */
 const struct bpf_func_proto bpf_tail_call_proto = {
 	.func		= NULL,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b3cc24cb4321..780bcbe1d4de 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -277,6 +277,33 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 	.arg5_type	= ARG_CONST_STACK_SIZE,
 };
 
+static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
+
+static u64 bpf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
+{
+	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
+
+	perf_fetch_caller_regs(regs);
+
+	return bpf_perf_event_output((long)regs, r2, flags, r4, size);
+}
+
+static const struct bpf_func_proto bpf_event_output_proto = {
+	.func		= bpf_event_output,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_STACK,
+	.arg5_type	= ARG_CONST_STACK_SIZE,
+};
+
+const struct bpf_func_proto *bpf_get_event_output_proto(void)
+{
+	return &bpf_event_output_proto;
+}
+
 static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
diff --git a/net/core/filter.c b/net/core/filter.c
index 5d2ac2b9d1c4..218e5de8c402 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2039,6 +2039,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_redirect_proto;
 	case BPF_FUNC_get_route_realm:
 		return &bpf_get_route_realm_proto;
+	case BPF_FUNC_perf_event_output:
+		return bpf_get_event_output_proto();
 	default:
 		return sk_filter_func_proto(func_id);
 	}
-- 
cgit v1.2.3


From 89e9e66ba1b3bde9d8ea90566c2aee20697ad681 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 15 Apr 2016 14:35:39 +0200
Subject: futex: Handle unlock_pi race gracefully

If userspace calls UNLOCK_PI unconditionally without trying the TID -> 0
transition in user space first then the user space value might not have the
waiters bit set. This opens the following race:

CPU0	    	      	    CPU1
uval = get_user(futex)
			    lock(hb)
lock(hb)
			    futex |= FUTEX_WAITERS
			    ....
			    unlock(hb)

cmpxchg(futex, uval, newval)

So the cmpxchg fails and returns -EINVAL to user space, which is wrong because
the futex value is valid.

To handle this (yes, yet another) corner case gracefully, check for a flag
change and retry.

[ tglx: Massaged changelog and slightly reworked implementation ]

Fixes: ccf9e6a80d9e ("futex: Make unlock_pi more robust")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Darren Hart <dvhart@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1460723739-5195-1-git-send-email-bigeasy@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index a5d2e74c89e0..fd204e1670c9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1295,10 +1295,20 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
 	if (unlikely(should_fail_futex(true)))
 		ret = -EFAULT;
 
-	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
 		ret = -EFAULT;
-	else if (curval != uval)
-		ret = -EINVAL;
+	} else if (curval != uval) {
+		/*
+		 * If a unconditional UNLOCK_PI operation (user space did not
+		 * try the TID->0 transition) raced with a waiter setting the
+		 * FUTEX_WAITERS flag between get_user() and locking the hash
+		 * bucket lock, retry the operation.
+		 */
+		if ((FUTEX_TID_MASK & curval) == uval)
+			ret = -EAGAIN;
+		else
+			ret = -EINVAL;
+	}
 	if (ret) {
 		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
 		return ret;
@@ -2622,6 +2632,15 @@ retry:
 		 */
 		if (ret == -EFAULT)
 			goto pi_faulted;
+		/*
+		 * A unconditional UNLOCK_PI op raced against a waiter
+		 * setting the FUTEX_WAITERS bit. Try again.
+		 */
+		if (ret == -EAGAIN) {
+			spin_unlock(&hb->lock);
+			put_futex_key(&key);
+			goto retry;
+		}
 		/*
 		 * wake_futex_pi has detected invalid state. Tell user
 		 * space.
-- 
cgit v1.2.3


From fe1bce9e2107ba3a8faffe572483b6974201a0e6 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Wed, 20 Apr 2016 20:09:24 -0700
Subject: futex: Acknowledge a new waiter in counter before plist

Otherwise an incoming waker on the dest hash bucket can miss
the waiter adding itself to the plist during the lockless
check optimization (small window but still the correct way
of doing this); similarly to the decrement counterpart.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: bigeasy@linutronix.de
Cc: dvhart@infradead.org
Cc: stable@kernel.org
Link: http://lkml.kernel.org/r/1461208164-29150-1-git-send-email-dave@stgolabs.net
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index fd204e1670c9..c20f06f38ef3 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1535,8 +1535,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 	if (likely(&hb1->chain != &hb2->chain)) {
 		plist_del(&q->list, &hb1->chain);
 		hb_waiters_dec(hb1);
-		plist_add(&q->list, &hb2->chain);
 		hb_waiters_inc(hb2);
+		plist_add(&q->list, &hb2->chain);
 		q->lock_ptr = &hb2->lock;
 	}
 	get_futex_key_refs(key2);
-- 
cgit v1.2.3


From 4589f450fb285ab85f7513b6649e51ec2a820653 Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@imgtec.com>
Date: Thu, 21 Apr 2016 10:08:32 +0100
Subject: genirq: Dont allow affinity mask to be updated on IPIs

The IPI domain re-purposes the IRQ affinity to signify the mask of CPUs
that this IPI will deliver to. This must not be modified before the IPI
is destroyed again, so set the IRQ_NO_BALANCING flag to prevent the
affinity being overwritten by setup_affinity().

Without this, if an IPI is reserved for a single target CPU, then
allocated using __setup_irq(), the affinity is overwritten with
cpu_online_mask. When ipi_destroy() is subsequently called on a
multi-cpu system, it will attempt to free cpumask_weight() IRQs
that were never allocated, and crash.

Fixes: d17bf24e6952 ("genirq: Add a new generic IPI reservation code to irq core")
Signed-off-by: Matt Redfearn <matt.redfearn@imgtec.com>
Cc: linux-mips@linux-mips.org
Cc: jason@lakedaemon.net
Cc: marc.zyngier@arm.com
Cc: ralf@linux-mips.org
Cc: Qais Yousef <qsyousef@gmail.com>
Cc: lisa.parratt@imgtec.com
Link: http://lkml.kernel.org/r/1461229712-13057-1-git-send-email-matt.redfearn@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/ipi.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index c37f34b00a11..14777af8e097 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -94,6 +94,7 @@ unsigned int irq_reserve_ipi(struct irq_domain *domain,
 		data = irq_get_irq_data(virq + i);
 		cpumask_copy(data->common->affinity, dest);
 		data->common->ipi_offset = offset;
+		irq_set_status_flags(virq + i, IRQ_NO_BALANCING);
 	}
 	return virq;
 
-- 
cgit v1.2.3


From 85b67bcb7e4a23ced05e7020bf5843b9857f6881 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 18 Apr 2016 20:11:50 -0700
Subject: perf, bpf: minimize the size of perf_trace_() tracepoint handler

move trace_call_bpf() into helper function to minimize the size
of perf_trace_*() tracepoint handlers.
    text	   data	    bss	    dec	 	   hex	filename
10541679	5526646	2945024	19013349	1221ee5	vmlinux_before
10509422	5526646	2945024	18981092	121a0e4	vmlinux_after

It may seem that perf_fetch_caller_regs() can also be moved,
but that is incorrect, since ip/sp will be wrong.

bpf+tracepoint performance is not affected, since
perf_swevent_put_recursion_context() is now inlined.
export_symbol_gpl can also be dropped.

No measurable change in normal perf tracepoints.

Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/trace_events.h |  5 +++++
 include/trace/perf.h         | 13 +++----------
 kernel/events/core.c         | 20 +++++++++++++++++++-
 3 files changed, 27 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index fe6441203b59..222f6aa0418f 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -609,6 +609,11 @@ extern void ftrace_profile_free_filter(struct perf_event *event);
 void perf_trace_buf_update(void *record, u16 type);
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 
+void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
+			       struct trace_event_call *call, u64 count,
+			       struct pt_regs *regs, struct hlist_head *head,
+			       struct task_struct *task);
+
 static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 		       u64 count, struct pt_regs *regs, void *head,
diff --git a/include/trace/perf.h b/include/trace/perf.h
index a182306eefd7..88de5c205e86 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -64,16 +64,9 @@ perf_trace_##call(void *__data, proto)					\
 									\
 	{ assign; }							\
 									\
-	if (prog) {							\
-		*(struct pt_regs **)entry = __regs;			\
-		if (!trace_call_bpf(prog, entry) || hlist_empty(head)) { \
-			perf_swevent_put_recursion_context(rctx);	\
-			return;						\
-		}							\
-	}								\
-	perf_trace_buf_submit(entry, __entry_size, rctx,		\
-			      event_call->event.type, __count, __regs,	\
-			      head, __task);				\
+	perf_trace_run_bpf_submit(entry, __entry_size, rctx,		\
+				  event_call, __count, __regs,		\
+				  head, __task);			\
 }
 
 /*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5056abffef27..9eb23dc27462 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6741,7 +6741,6 @@ void perf_swevent_put_recursion_context(int rctx)
 
 	put_recursion_context(swhash->recursion, rctx);
 }
-EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
 
 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
 {
@@ -6998,6 +6997,25 @@ static int perf_tp_event_match(struct perf_event *event,
 	return 1;
 }
 
+void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
+			       struct trace_event_call *call, u64 count,
+			       struct pt_regs *regs, struct hlist_head *head,
+			       struct task_struct *task)
+{
+	struct bpf_prog *prog = call->prog;
+
+	if (prog) {
+		*(struct pt_regs **)raw_data = regs;
+		if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
+			perf_swevent_put_recursion_context(rctx);
+			return;
+		}
+	}
+	perf_tp_event(call->event.type, count, raw_data, size, regs, head,
+		      rctx, task);
+}
+EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
+
 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 		   struct pt_regs *regs, struct hlist_head *head, int rctx,
 		   struct task_struct *task)
-- 
cgit v1.2.3


From e9fb365a8847dfe8a9fccae0dce77abf7276b5da Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 30 Mar 2016 11:20:48 -0700
Subject: rcutorture: Dump trace buffer upon shutdown

When running from the scripts, rcutorture is completely headless,
so there is no way to to manually dump the trace buffer.  This commit
therefore unconditionally dumps the trace buffer upon timed shutdown.
However, if you are using rmmod to end the test, it is still up to you
to manually dump the trace buffer.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/torture.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/torture.c b/kernel/torture.c
index e912ccd960f0..fa0bdeee17ac 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -451,6 +451,7 @@ static int torture_shutdown(void *arg)
 		torture_shutdown_hook();
 	else
 		VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
+	ftrace_dump(DUMP_ALL);
 	kernel_power_off();	/* Shut down the system. */
 	return 0;
 }
-- 
cgit v1.2.3


From 0aa67e75b3d59cfe412bfa54ca23797e6c2e3270 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 30 Mar 2016 11:40:44 -0700
Subject: rcutorture: Add irqs-disabled test for call_rcu()

Mutation testing carried out by Iftekhar Ahmed of Oregon State
University showed that rcutorture is failing to test invocations
of call_rcu() having interrupts disabled.  This commit therefore
adds interrupt disabling around one of the existing invocations
of call_rcu() (and friends).

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 633a68a09440..084a28a732eb 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1478,7 +1478,9 @@ static int rcu_torture_barrier_cbs(void *arg)
 		 * The above smp_load_acquire() ensures barrier_phase load
 		 * is ordered before the folloiwng ->call().
 		 */
+		local_irq_disable(); /* Just to test no-irq call_rcu(). */
 		cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+		local_irq_enable();
 		if (atomic_dec_and_test(&barrier_cbs_count))
 			wake_up(&barrier_wq);
 	} while (!torture_must_stop());
-- 
cgit v1.2.3


From 916633a403702549d37ea353e63a68e5b0dc27ad Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 7 Apr 2016 17:12:31 +0200
Subject: locking/rwsem: Provide down_write_killable()

Now that all the architectures implement the necessary glue code
we can introduce down_write_killable(). The only difference wrt. regular
down_write() is that the slow path waits in TASK_KILLABLE state and the
interruption by the fatal signal is reported as -EINTR to the caller.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Chris Zankel <chris@zankel.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Signed-off-by: Jason Low <jason.low2@hp.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-alpha@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: linux-ia64@vger.kernel.org
Cc: linux-s390@vger.kernel.org
Cc: linux-sh@vger.kernel.org
Cc: linux-xtensa@linux-xtensa.org
Cc: sparclinux@vger.kernel.org
Link: http://lkml.kernel.org/r/1460041951-22347-12-git-send-email-mhocko@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/rwsem.h |  6 +++---
 include/linux/lockdep.h      | 15 +++++++++++++++
 include/linux/rwsem.h        |  1 +
 kernel/locking/rwsem.c       | 19 +++++++++++++++++++
 4 files changed, 38 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index d759c5f70f49..453744c1d347 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -102,9 +102,9 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 #define ____down_write(sem, slow_path)			\
 ({							\
 	long tmp;					\
-	struct rw_semaphore* ret = sem;			\
+	struct rw_semaphore* ret;			\
 	asm volatile("# beginning down_write\n\t"	\
-		     LOCK_PREFIX "  xadd      %1,(%2)\n\t"	\
+		     LOCK_PREFIX "  xadd      %1,(%3)\n\t"	\
 		     /* adds 0xffff0001, returns the old value */ \
 		     "  test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \
 		     /* was the active mask 0 before? */\
@@ -112,7 +112,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
 		     "  call " slow_path "\n"		\
 		     "1:\n"				\
 		     "# ending down_write"		\
-		     : "+m" (sem->count), "=d" (tmp), "+a" (ret)	\
+		     : "+m" (sem->count), "=d" (tmp), "=a" (ret)	\
 		     : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \
 		     : "memory", "cc");			\
 	ret;						\
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index d026b190c530..accfe56d8c51 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -444,6 +444,18 @@ do {								\
 	lock_acquired(&(_lock)->dep_map, _RET_IP_);			\
 } while (0)
 
+#define LOCK_CONTENDED_RETURN(_lock, try, lock)			\
+({								\
+	int ____err = 0;					\
+	if (!try(_lock)) {					\
+		lock_contended(&(_lock)->dep_map, _RET_IP_);	\
+		____err = lock(_lock);				\
+	}							\
+	if (!____err)						\
+		lock_acquired(&(_lock)->dep_map, _RET_IP_);	\
+	____err;						\
+})
+
 #else /* CONFIG_LOCK_STAT */
 
 #define lock_contended(lockdep_map, ip) do {} while (0)
@@ -452,6 +464,9 @@ do {								\
 #define LOCK_CONTENDED(_lock, try, lock) \
 	lock(_lock)
 
+#define LOCK_CONTENDED_RETURN(_lock, try, lock) \
+	lock(_lock)
+
 #endif /* CONFIG_LOCK_STAT */
 
 #ifdef CONFIG_LOCKDEP
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 7d7ae029dac5..d1c12d160ace 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -118,6 +118,7 @@ extern int down_read_trylock(struct rw_semaphore *sem);
  * lock for writing
  */
 extern void down_write(struct rw_semaphore *sem);
+extern int __must_check down_write_killable(struct rw_semaphore *sem);
 
 /*
  * trylock for writing -- returns 1 if successful, 0 if contention
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 205be0ce34de..c817216c1615 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -54,6 +54,25 @@ void __sched down_write(struct rw_semaphore *sem)
 
 EXPORT_SYMBOL(down_write);
 
+/*
+ * lock for writing
+ */
+int __sched down_write_killable(struct rw_semaphore *sem)
+{
+	might_sleep();
+	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
+	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
+		rwsem_release(&sem->dep_map, 1, _RET_IP_);
+		return -EINTR;
+	}
+
+	rwsem_set_owner(sem);
+	return 0;
+}
+
+EXPORT_SYMBOL(down_write_killable);
+
 /*
  * trylock for writing -- returns 1 if successful, 0 if contention
  */
-- 
cgit v1.2.3


From 3b9d6da67e11ca8f78fde887918983523a36b0fa Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 8 Apr 2016 14:40:15 +0200
Subject: cpu/hotplug: Fix rollback during error-out in __cpu_disable()

The recent introduction of the hotplug thread which invokes the callbacks on
the plugged cpu, cased the following regression:

If takedown_cpu() fails, then we run into several issues:

 1) The rollback of the target cpu states is not invoked. That leaves the smp
    threads and the hotplug thread in disabled state.

 2) notify_online() is executed due to a missing skip_onerr flag. That causes
    that both CPU_DOWN_FAILED and CPU_ONLINE notifications are invoked which
    confuses quite some notifiers.

 3) The CPU_DOWN_FAILED notification is not invoked on the target CPU. That's
    not an issue per se, but it is inconsistent and in consequence blocks the
    patches which rely on these states being invoked on the target CPU and not
    on the controlling cpu. It also does not preserve the strict call order on
    rollback which is problematic for the ongoing state machine conversion as
    well.

To fix this we add a rollback flag to the remote callback machinery and invoke
the rollback including the CPU_DOWN_FAILED notification on the remote
cpu. Further mark the notify online state with 'skip_onerr' so we don't get a
double invokation.

This workaround will go away once we moved the unplug invocation to the target
cpu itself.

[ tglx: Massaged changelog and moved the CPU_DOWN_FAILED notifiaction to the
  	target cpu ]

Fixes: 4cb28ced23c4 ("cpu/hotplug: Create hotplug threads")
Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: linux-s390@vger.kernel.org
Cc: rt@linutronix.de
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Anna-Maria Gleixner <anna-maria@linutronix.de>
Link: http://lkml.kernel.org/r/20160408124015.GA21960@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpu.c | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ea42e8da861..3e3f6e49eabb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -36,6 +36,7 @@
  * @target:	The target state
  * @thread:	Pointer to the hotplug thread
  * @should_run:	Thread should execute
+ * @rollback:	Perform a rollback
  * @cb_stat:	The state for a single callback (install/uninstall)
  * @cb:		Single callback function (install/uninstall)
  * @result:	Result of the operation
@@ -47,6 +48,7 @@ struct cpuhp_cpu_state {
 #ifdef CONFIG_SMP
 	struct task_struct	*thread;
 	bool			should_run;
+	bool			rollback;
 	enum cpuhp_state	cb_state;
 	int			(*cb)(unsigned int cpu);
 	int			result;
@@ -301,6 +303,11 @@ static int cpu_notify(unsigned long val, unsigned int cpu)
 	return __cpu_notify(val, cpu, -1, NULL);
 }
 
+static void cpu_notify_nofail(unsigned long val, unsigned int cpu)
+{
+	BUG_ON(cpu_notify(val, cpu));
+}
+
 /* Notifier wrappers for transitioning to state machine */
 static int notify_prepare(unsigned int cpu)
 {
@@ -477,6 +484,16 @@ static void cpuhp_thread_fun(unsigned int cpu)
 		} else {
 			ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb);
 		}
+	} else if (st->rollback) {
+		BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
+
+		undo_cpu_down(cpu, st, cpuhp_ap_states);
+		/*
+		 * This is a momentary workaround to keep the notifier users
+		 * happy. Will go away once we got rid of the notifiers.
+		 */
+		cpu_notify_nofail(CPU_DOWN_FAILED, cpu);
+		st->rollback = false;
 	} else {
 		/* Cannot happen .... */
 		BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
@@ -636,11 +653,6 @@ static inline void check_for_tasks(int dead_cpu)
 	read_unlock(&tasklist_lock);
 }
 
-static void cpu_notify_nofail(unsigned long val, unsigned int cpu)
-{
-	BUG_ON(cpu_notify(val, cpu));
-}
-
 static int notify_down_prepare(unsigned int cpu)
 {
 	int err, nr_calls = 0;
@@ -721,9 +733,10 @@ static int takedown_cpu(unsigned int cpu)
 	 */
 	err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu));
 	if (err) {
-		/* CPU didn't die: tell everyone.  Can't complain. */
-		cpu_notify_nofail(CPU_DOWN_FAILED, cpu);
+		/* CPU refused to die */
 		irq_unlock_sparse();
+		/* Unpark the hotplug thread so we can rollback there */
+		kthread_unpark(per_cpu_ptr(&cpuhp_state, cpu)->thread);
 		return err;
 	}
 	BUG_ON(cpu_online(cpu));
@@ -832,6 +845,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 	 * to do the further cleanups.
 	 */
 	ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target);
+	if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
+		st->target = prev_state;
+		st->rollback = true;
+		cpuhp_kick_ap_work(cpu);
+	}
 
 	hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE;
 out:
@@ -1249,6 +1267,7 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 		.name			= "notify:online",
 		.startup		= notify_online,
 		.teardown		= notify_down_prepare,
+		.skip_onerr		= true,
 	},
 #endif
 	/*
-- 
cgit v1.2.3


From 86d3473224b004f920c107206d181d37db735145 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linaro.org>
Date: Fri, 8 Apr 2016 14:02:12 +0800
Subject: time: Introduce do_sys_settimeofday64()

The do_sys_settimeofday() function uses a timespec, which is not year
2038 safe on 32bit systems.

Thus this patch introduces do_sys_settimeofday64(), which allows us to
transition users of do_sys_settimeofday() to using 64bit time types.

Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Baolin Wang <baolin.wang@linaro.org>
[jstultz: Include errno-base.h to avoid build issue on some arches]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/timekeeping.h | 17 +++++++++++++++--
 kernel/time/time.c          |  8 ++++----
 2 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 96f37bee3bc1..37dbacf84849 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_TIMEKEEPING_H
 #define _LINUX_TIMEKEEPING_H
 
+#include <asm-generic/errno-base.h>
+
 /* Included from linux/ktime.h */
 
 void timekeeping_init(void);
@@ -11,8 +13,19 @@ extern int timekeeping_suspended;
  */
 extern void do_gettimeofday(struct timeval *tv);
 extern int do_settimeofday64(const struct timespec64 *ts);
-extern int do_sys_settimeofday(const struct timespec *tv,
-			       const struct timezone *tz);
+extern int do_sys_settimeofday64(const struct timespec64 *tv,
+				 const struct timezone *tz);
+static inline int do_sys_settimeofday(const struct timespec *tv,
+				      const struct timezone *tz)
+{
+	struct timespec64 ts64;
+
+	if (!tv)
+		return -EINVAL;
+
+	ts64 = timespec_to_timespec64(*tv);
+	return do_sys_settimeofday64(&ts64, tz);
+}
 
 /*
  * Kernel time accessors
diff --git a/kernel/time/time.c b/kernel/time/time.c
index be115b020d27..a4064b612066 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -160,15 +160,15 @@ static inline void warp_clock(void)
  * various programs will get confused when the clock gets warped.
  */
 
-int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
+int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz)
 {
 	static int firsttime = 1;
 	int error = 0;
 
-	if (tv && !timespec_valid(tv))
+	if (tv && !timespec64_valid(tv))
 		return -EINVAL;
 
-	error = security_settime(tv, tz);
+	error = security_settime64(tv, tz);
 	if (error)
 		return error;
 
@@ -186,7 +186,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
 		}
 	}
 	if (tv)
-		return do_settimeofday(tv);
+		return do_settimeofday64(tv);
 	return 0;
 }
 
-- 
cgit v1.2.3


From b303e7c15d53cd8ef6b349b702e07eee3f102792 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 4 Apr 2016 09:57:40 +0200
Subject: perf/core: Make sysctl_perf_cpu_time_max_percent conform to
 documentation

Markus reported that 0 should also disable the throttling we per
Documentation/sysctl/kernel.txt.

Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Fixes: 91a612eea9a3 ("perf/core: Fix dynamic interrupt throttle")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 52bedc5a5aaa..2c78b6f47339 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -412,7 +412,8 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 	if (ret || !write)
 		return ret;
 
-	if (sysctl_perf_cpu_time_max_percent == 100) {
+	if (sysctl_perf_cpu_time_max_percent == 100 ||
+	    sysctl_perf_cpu_time_max_percent == 0) {
 		printk(KERN_WARNING
 		       "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
 		WRITE_ONCE(perf_sample_allowed_ns, 0);
-- 
cgit v1.2.3


From c24697566298df04cac9913e0601501b5ee2b3f5 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Tue, 16 Feb 2016 13:57:40 +0800
Subject: locking/lockdep: Fix ->irq_context calculation

task_irq_context() returns the encoded irq_context of the task, the
return value is encoded in the same as ->irq_context of held_lock.

Always return 0 if !(CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING)

Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: sasha.levin@oracle.com
Link: http://lkml.kernel.org/r/1455602265-16490-2-git-send-email-boqun.feng@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index ed9410936a22..beb06f604420 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2932,6 +2932,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
 	return 1;
 }
 
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+	return 2 * !!task->hardirq_context + !!task->softirq_context;
+}
+
 static int separate_irq_context(struct task_struct *curr,
 		struct held_lock *hlock)
 {
@@ -2940,8 +2945,6 @@ static int separate_irq_context(struct task_struct *curr,
 	/*
 	 * Keep track of points where we cross into an interrupt context:
 	 */
-	hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
-				curr->softirq_context;
 	if (depth) {
 		struct held_lock *prev_hlock;
 
@@ -2973,6 +2976,11 @@ static inline int mark_irqflags(struct task_struct *curr,
 	return 1;
 }
 
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+	return 0;
+}
+
 static inline int separate_irq_context(struct task_struct *curr,
 		struct held_lock *hlock)
 {
@@ -3241,6 +3249,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	hlock->acquire_ip = ip;
 	hlock->instance = lock;
 	hlock->nest_lock = nest_lock;
+	hlock->irq_context = task_irq_context(curr);
 	hlock->trylock = trylock;
 	hlock->read = read;
 	hlock->check = check;
-- 
cgit v1.2.3


From 75dd602a5198a6e5f75534db52b6e6fbaabb33d1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 30 Mar 2016 11:36:59 +0200
Subject: lockdep: Fix lock_chain::base size

lock_chain::base is used to store an index into the chain_hlocks[]
array, however that array contains more elements than can be indexed
using the u16.

Change the lock_chain structure to use a bitfield to encode the data
it needs and add BUILD_BUG_ON() assertions to check the fields are
wide enough.

Also, for DEBUG_LOCKDEP, assert that we don't run out of elements of
that array; as that would wreck the collision detectoring.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alfredo Alvarez Fernandez <alfredoalvarezfernandez@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sedat Dilek <sedat.dilek@gmail.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160330093659.GS3408@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h       |  8 +++++---
 kernel/locking/lockdep.c      | 24 +++++++++++++++++++++++-
 kernel/locking/lockdep_proc.c |  2 ++
 3 files changed, 30 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index d026b190c530..d10ef06971b5 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -196,9 +196,11 @@ struct lock_list {
  * We record lock dependency chains, so that we can cache them:
  */
 struct lock_chain {
-	u8				irq_context;
-	u8				depth;
-	u16				base;
+	/* see BUILD_BUG_ON()s in lookup_chain_cache() */
+	unsigned int			irq_context :  2,
+					depth       :  6,
+					base	    : 24;
+	/* 4 byte hole */
 	struct hlist_node		entry;
 	u64				chain_key;
 };
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index beb06f604420..78c1c0ee6dc1 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2176,15 +2176,37 @@ cache_hit:
 	chain->irq_context = hlock->irq_context;
 	i = get_first_held_lock(curr, hlock);
 	chain->depth = curr->lockdep_depth + 1 - i;
+
+	BUILD_BUG_ON((1UL << 24) <= ARRAY_SIZE(chain_hlocks));
+	BUILD_BUG_ON((1UL << 6)  <= ARRAY_SIZE(curr->held_locks));
+	BUILD_BUG_ON((1UL << 8*sizeof(chain_hlocks[0])) <= ARRAY_SIZE(lock_classes));
+
 	if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
 		chain->base = nr_chain_hlocks;
-		nr_chain_hlocks += chain->depth;
 		for (j = 0; j < chain->depth - 1; j++, i++) {
 			int lock_id = curr->held_locks[i].class_idx - 1;
 			chain_hlocks[chain->base + j] = lock_id;
 		}
 		chain_hlocks[chain->base + j] = class - lock_classes;
 	}
+
+	if (nr_chain_hlocks < MAX_LOCKDEP_CHAIN_HLOCKS)
+		nr_chain_hlocks += chain->depth;
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+	/*
+	 * Important for check_no_collision().
+	 */
+	if (unlikely(nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)) {
+		if (debug_locks_off_graph_unlock())
+			return 0;
+
+		print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
+		dump_stack();
+		return 0;
+	}
+#endif
+
 	hlist_add_head_rcu(&chain->entry, hash_head);
 	debug_atomic_inc(chain_lookup_misses);
 	inc_chains();
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index dbb61a302548..a0f61effad25 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -141,6 +141,8 @@ static int lc_show(struct seq_file *m, void *v)
 	int i;
 
 	if (v == SEQ_START_TOKEN) {
+		if (nr_chain_hlocks > MAX_LOCKDEP_CHAIN_HLOCKS)
+			seq_printf(m, "(buggered) ");
 		seq_printf(m, "all lock chains:\n");
 		return 0;
 	}
-- 
cgit v1.2.3


From 9ecda41acb971ebd07c8fb35faf24005c0baea12 Mon Sep 17 00:00:00 2001
From: Wang Nan <wangnan0@huawei.com>
Date: Tue, 5 Apr 2016 14:11:18 +0000
Subject: perf/core: Add ::write_backward attribute to perf event

This patch introduces 'write_backward' bit to perf_event_attr, which
controls the direction of a ring buffer. After set, the corresponding
ring buffer is written from end to beginning. This feature is design to
support reading from overwritable ring buffer.

Ring buffer can be created by mapping a perf event fd. Kernel puts event
records into ring buffer, user tooling like perf fetch them from
address returned by mmap(). To prevent racing between kernel and tooling,
they communicate to each other through 'head' and 'tail' pointers.
Kernel maintains 'head' pointer, points it to the next free area (tail
of the last record). Tooling maintains 'tail' pointer, points it to the
tail of last consumed record (record has already been fetched). Kernel
determines the available space in a ring buffer using these two
pointers to avoid overwrite unfetched records.

By mapping without 'PROT_WRITE', an overwritable ring buffer is created.
Different from normal ring buffer, tooling is unable to maintain 'tail'
pointer because writing is forbidden. Therefore, for this type of ring
buffers, kernel overwrite old records unconditionally, works like flight
recorder. This feature would be useful if reading from overwritable ring
buffer were as easy as reading from normal ring buffer. However,
there's an obscure problem.

The following figure demonstrates a full overwritable ring buffer. In
this figure, the 'head' pointer points to the end of last record, and a
long record 'E' is pending. For a normal ring buffer, a 'tail' pointer
would have pointed to position (X), so kernel knows there's no more
space in the ring buffer. However, for an overwritable ring buffer,
kernel ignore the 'tail' pointer.

   (X)                              head
    .                                |
    .                                V
    +------+-------+----------+------+---+
    |A....A|B.....B|C........C|D....D|   |
    +------+-------+----------+------+---+

Record 'A' is overwritten by event 'E':

      head
       |
       V
    +--+---+-------+----------+------+---+
    |.E|..A|B.....B|C........C|D....D|E..|
    +--+---+-------+----------+------+---+

Now tooling decides to read from this ring buffer. However, none of these
two natural positions, 'head' and the start of this ring buffer, are
pointing to the head of a record. Even the full ring buffer can be
accessed by tooling, it is unable to find a position to start decoding.

The first attempt tries to solve this problem AFAIK can be found from
[1]. It makes kernel to maintain 'tail' pointer: updates it when ring
buffer is half full. However, this approach introduces overhead to
fast path. Test result shows a 1% overhead [2]. In addition, this method
utilizes no more tham 50% records.

Another attempt can be found from [3], which allows putting the size of
an event at the end of each record. This approach allows tooling to find
records in a backward manner from 'head' pointer by reading size of a
record from its tail. However, because of alignment requirement, it
needs 8 bytes to record the size of a record, which is a huge waste. Its
performance is also not good, because more data need to be written.
This approach also introduces some extra branch instructions to fast
path.

'write_backward' is a better solution to this problem.

Following figure demonstrates the state of the overwritable ring buffer
when 'write_backward' is set before overwriting:

       head
        |
        V
    +---+------+----------+-------+------+
    |   |D....D|C........C|B.....B|A....A|
    +---+------+----------+-------+------+

and after overwriting:
                                     head
                                      |
                                      V
    +---+------+----------+-------+---+--+
    |..E|D....D|C........C|B.....B|A..|E.|
    +---+------+----------+-------+---+--+

In each situation, 'head' points to the beginning of the newest record.
From this record, tooling can iterate over the full ring buffer and fetch
records one by one.

The only limitation that needs to be considered is back-to-back reading.
Due to the non-deterministic of user programs, it is impossible to ensure
the ring buffer keeps stable during reading. Consider an extreme situation:
tooling is scheduled out after reading record 'D', then a burst of events
come, eat up the whole ring buffer (one or multiple rounds). When the
tooling process comes back, reading after 'D' is incorrect now.

To prevent this problem, we need to find a way to ensure the ring buffer
is stable during reading. ioctl(PERF_EVENT_IOC_PAUSE_OUTPUT) is
suggested because its overhead is lower than
ioctl(PERF_EVENT_IOC_ENABLE).

By carefully verifying 'header' pointer, reader can avoid pausing the
ring-buffer. For example:

    /* A union of all possible events */
    union perf_event event;

    p = head = perf_mmap__read_head();
    while (true) {
        /* copy header of next event */
        fetch(&event.header, p, sizeof(event.header));

        /* read 'head' pointer */
        head = perf_mmap__read_head();

        /* check overwritten: is the header good? */
        if (!verify(sizeof(event.header), p, head))
            break;

        /* copy the whole event */
        fetch(&event, p, event.header.size);

        /* read 'head' pointer again */
        head = perf_mmap__read_head();

        /* is the whole event good? */
        if (!verify(event.header.size, p, head))
            break;
        p += event.header.size;
    }

However, the overhead is high because:

 a) In-place decoding is not safe.
    Copying-verifying-decoding is required.
 b) Fetching 'head' pointer requires additional synchronization.

(From Alexei Starovoitov:

Even when this trick works, pause is needed for more than stability of
reading. When we collect the events into overwrite buffer we're waiting
for some other trigger (like all cpu utilization spike or just one cpu
running and all others are idle) and when it happens the buffer has
valuable info from the past. At this point new events are no longer
interesting and buffer should be paused, events read and unpaused until
next trigger comes.)

This patch utilizes event's default overflow_handler introduced
previously. perf_event_output_backward() is created as the default
overflow handler for backward ring buffers. To avoid extra overhead to
fast path, original perf_event_output() becomes __perf_event_output()
and marked '__always_inline'. In theory, there's no extra overhead
introduced to fast path.

Performance testing:

Calling 3000000 times of 'close(-1)', use gettimeofday() to check
duration.  Use 'perf record -o /dev/null -e raw_syscalls:*' to capture
system calls. In ns.

Testing environment:

  CPU    : Intel(R) Core(TM) i7-4790 CPU @ 3.60GHz
  Kernel : v4.5.0
                    MEAN         STDVAR
 BASE            800214.950    2853.083
 PRE1           2253846.700    9997.014
 PRE2           2257495.540    8516.293
 POST           2250896.100    8933.921

Where 'BASE' is pure performance without capturing. 'PRE1' is test
result of pure 'v4.5.0' kernel. 'PRE2' is test result before this
patch. 'POST' is test result after this patch. See [4] for the detailed
experimental setup.

Considering the stdvar, this patch doesn't introduce performance
overhead to the fast path.

 [1] http://lkml.iu.edu/hypermail/linux/kernel/1304.1/04584.html
 [2] http://lkml.iu.edu/hypermail/linux/kernel/1307.1/00535.html
 [3] http://lkml.iu.edu/hypermail/linux/kernel/1512.0/01265.html
 [4] http://lkml.kernel.org/g/56F89DCD.1040202@huawei.com

Signed-off-by: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: <acme@kernel.org>
Cc: <pi3orama@163.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/1459865478-53413-1-git-send-email-wangnan0@huawei.com
[ Fixed the changelog some more. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h      | 28 +++++++++++++++++++++---
 include/uapi/linux/perf_event.h |  3 ++-
 kernel/events/core.c            | 48 ++++++++++++++++++++++++++++++++++++-----
 kernel/events/ring_buffer.c     | 16 +++++++++++++-
 4 files changed, 85 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b8b195fbe787..85749ae8cb5f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -834,14 +834,24 @@ extern int perf_event_overflow(struct perf_event *event,
 				 struct perf_sample_data *data,
 				 struct pt_regs *regs);
 
+extern void perf_event_output_forward(struct perf_event *event,
+				     struct perf_sample_data *data,
+				     struct pt_regs *regs);
+extern void perf_event_output_backward(struct perf_event *event,
+				       struct perf_sample_data *data,
+				       struct pt_regs *regs);
 extern void perf_event_output(struct perf_event *event,
-				struct perf_sample_data *data,
-				struct pt_regs *regs);
+			      struct perf_sample_data *data,
+			      struct pt_regs *regs);
 
 static inline bool
 is_default_overflow_handler(struct perf_event *event)
 {
-	return (event->overflow_handler == perf_event_output);
+	if (likely(event->overflow_handler == perf_event_output_forward))
+		return true;
+	if (unlikely(event->overflow_handler == perf_event_output_backward))
+		return true;
+	return false;
 }
 
 extern void
@@ -1051,8 +1061,20 @@ static inline bool has_aux(struct perf_event *event)
 	return event->pmu->setup_aux;
 }
 
+static inline bool is_write_backward(struct perf_event *event)
+{
+	return !!event->attr.write_backward;
+}
+
 extern int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_event *event, unsigned int size);
+extern int perf_output_begin_forward(struct perf_output_handle *handle,
+				    struct perf_event *event,
+				    unsigned int size);
+extern int perf_output_begin_backward(struct perf_output_handle *handle,
+				      struct perf_event *event,
+				      unsigned int size);
+
 extern void perf_output_end(struct perf_output_handle *handle);
 extern unsigned int perf_output_copy(struct perf_output_handle *handle,
 			     const void *buf, unsigned int len);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index a3c19034d5f8..43fc8d213472 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -340,7 +340,8 @@ struct perf_event_attr {
 				comm_exec      :  1, /* flag comm events that are due to an exec */
 				use_clockid    :  1, /* use @clockid for time fields */
 				context_switch :  1, /* context switch data */
-				__reserved_1   : 37;
+				write_backward :  1, /* Write ring buffer from end to beginning */
+				__reserved_1   : 36;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 21ba024c9ed1..eabeb2aec00f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5694,9 +5694,13 @@ void perf_prepare_sample(struct perf_event_header *header,
 	}
 }
 
-void perf_event_output(struct perf_event *event,
-			struct perf_sample_data *data,
-			struct pt_regs *regs)
+static void __always_inline
+__perf_event_output(struct perf_event *event,
+		    struct perf_sample_data *data,
+		    struct pt_regs *regs,
+		    int (*output_begin)(struct perf_output_handle *,
+					struct perf_event *,
+					unsigned int))
 {
 	struct perf_output_handle handle;
 	struct perf_event_header header;
@@ -5706,7 +5710,7 @@ void perf_event_output(struct perf_event *event,
 
 	perf_prepare_sample(&header, data, event, regs);
 
-	if (perf_output_begin(&handle, event, header.size))
+	if (output_begin(&handle, event, header.size))
 		goto exit;
 
 	perf_output_sample(&handle, &header, data, event);
@@ -5717,6 +5721,30 @@ exit:
 	rcu_read_unlock();
 }
 
+void
+perf_event_output_forward(struct perf_event *event,
+			 struct perf_sample_data *data,
+			 struct pt_regs *regs)
+{
+	__perf_event_output(event, data, regs, perf_output_begin_forward);
+}
+
+void
+perf_event_output_backward(struct perf_event *event,
+			   struct perf_sample_data *data,
+			   struct pt_regs *regs)
+{
+	__perf_event_output(event, data, regs, perf_output_begin_backward);
+}
+
+void
+perf_event_output(struct perf_event *event,
+		  struct perf_sample_data *data,
+		  struct pt_regs *regs)
+{
+	__perf_event_output(event, data, regs, perf_output_begin);
+}
+
 /*
  * read event_id
  */
@@ -8153,8 +8181,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (overflow_handler) {
 		event->overflow_handler	= overflow_handler;
 		event->overflow_handler_context = context;
+	} else if (is_write_backward(event)){
+		event->overflow_handler = perf_event_output_backward;
+		event->overflow_handler_context = NULL;
 	} else {
-		event->overflow_handler = perf_event_output;
+		event->overflow_handler = perf_event_output_forward;
 		event->overflow_handler_context = NULL;
 	}
 
@@ -8388,6 +8419,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	if (output_event->clock != event->clock)
 		goto out;
 
+	/*
+	 * Either writing ring buffer from beginning or from end.
+	 * Mixing is not allowed.
+	 */
+	if (is_write_backward(output_event) != is_write_backward(event))
+		goto out;
+
 	/*
 	 * If both events generate aux data, they must be on the same PMU
 	 */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 60be55a64040..c49bab42dc57 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -230,10 +230,24 @@ out:
 	return -ENOSPC;
 }
 
+int perf_output_begin_forward(struct perf_output_handle *handle,
+			     struct perf_event *event, unsigned int size)
+{
+	return __perf_output_begin(handle, event, size, false);
+}
+
+int perf_output_begin_backward(struct perf_output_handle *handle,
+			       struct perf_event *event, unsigned int size)
+{
+	return __perf_output_begin(handle, event, size, true);
+}
+
 int perf_output_begin(struct perf_output_handle *handle,
 		      struct perf_event *event, unsigned int size)
 {
-	return __perf_output_begin(handle, event, size, false);
+
+	return __perf_output_begin(handle, event, size,
+				   unlikely(is_write_backward(event)));
 }
 
 unsigned int perf_output_copy(struct perf_output_handle *handle,
-- 
cgit v1.2.3


From 1f621e028baf391f6684003e32e009bc934b750f Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Date: Wed, 6 Apr 2016 18:47:40 +0530
Subject: sched/fair: Fix asym packing to select correct CPU

When asymmetric packing is set in the sched_domain and target CPU is
busy, update_sd_pick_busiest() may not select the busiest runqueue.
When target CPU is busy, find_busiest_group() will ignore checks for
asym packing and may continue to load balance using the currently
selected not-the-busiest runqueue as source runqueue.
Selecting the busiest runqueue as source when the target CPU is busy,
should result in achieving much better load balance.

Also when target CPU is not busy and asymmetric packing is set in sd,
select higher CPU as source CPU for load balancing.

While doing this change, move the check to see if target CPU is busy
into check_asym_packing().

The extent of performance benefit from this change decreases with the
increasing load. However there is benefit in undercommit as well as
overcommit conditions.

1. Record per second ebizzy (32 threads) on a 64 CPU power 7 box. (5 iterations)
4.6.0-rc2
	Testcase:         Min         Max         Avg      StdDev
	  ebizzy:  5223767.00 10368236.00  7946971.00  1753094.76

4.6.0-rc2+asym-changes
	Testcase:         Min         Max         Avg      StdDev     %Change
	  ebizzy:  8617191.00 13872356.00 11383980.00  1783400.89     +24.78%

2. Record per second ebizzy (64 threads) on a 64 CPU power 7 box. (5 iterations)
4.6.0-rc2
	Testcase:         Min         Max         Avg      StdDev
	  ebizzy:  6497666.00 18399783.00 10818093.20  4051452.08

4.6.0-rc2+asym-changes
	Testcase:         Min         Max         Avg      StdDev     %Change
	  ebizzy:  7567365.00 19456937.00 11674063.60  4295407.48      +4.40%

3. Record per second ebizzy (128 threads) on a 64 CPU power 7 box. (5 iterations)
4.6.0-rc2
	Testcase:         Min         Max         Avg      StdDev
	  ebizzy: 37073983.00 40341911.00 38776241.80  1259766.82

4.6.0-rc2+asym-changes
	Testcase:         Min         Max         Avg      StdDev     %Change
	  ebizzy: 38030399.00 41333378.00 39827404.40  1255001.86      +2.54%

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Michael Neuling <mikey@neuling.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/1459948660-16073-1-git-send-email-srikar@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b8cc1c35cd7c..6e371f43fc80 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6679,6 +6679,9 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	if (!(env->sd->flags & SD_ASYM_PACKING))
 		return true;
 
+	/* No ASYM_PACKING if target cpu is already busy */
+	if (env->idle == CPU_NOT_IDLE)
+		return true;
 	/*
 	 * ASYM_PACKING needs to move all the work to the lowest
 	 * numbered CPUs in the group, therefore mark all groups
@@ -6688,7 +6691,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 		if (!sds->busiest)
 			return true;
 
-		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
+		/* Prefer to move from highest possible cpu's work */
+		if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
 			return true;
 	}
 
@@ -6834,6 +6838,9 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
 	if (!(env->sd->flags & SD_ASYM_PACKING))
 		return 0;
 
+	if (env->idle == CPU_NOT_IDLE)
+		return 0;
+
 	if (!sds->busiest)
 		return 0;
 
@@ -7026,8 +7033,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	busiest = &sds.busiest_stat;
 
 	/* ASYM feature bypasses nice load balance check */
-	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
-	    check_asym_packing(env, &sds))
+	if (check_asym_packing(env, &sds))
 		return sds.busiest;
 
 	/* There is no busy sibling group to pull tasks from */
-- 
cgit v1.2.3


From 21e96f88776deead303ecd30a17d1d7c2a1776e3 Mon Sep 17 00:00:00 2001
From: Steve Muckle <steve.muckle@linaro.org>
Date: Mon, 21 Mar 2016 17:21:07 -0700
Subject: sched/fair: Move cpufreq hook to update_cfs_rq_load_avg()

The cpufreq hook should be called whenever the root cfs_rq
utilization changes so update_cfs_rq_load_avg() is a better
place for it. The current location is not invoked in the
enqueue_entity() or update_blocked_averages() paths.

Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Steve Muckle <smuckle@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Juri Lelli <Juri.Lelli@arm.com>
Cc: Michael Turquette <mturquette@baylibre.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1458606068-7476-1-git-send-email-smuckle@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 50 ++++++++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e371f43fc80..6df80d47a525 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2878,7 +2878,9 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
 	struct sched_avg *sa = &cfs_rq->avg;
+	struct rq *rq = rq_of(cfs_rq);
 	int decayed, removed = 0;
+	int cpu = cpu_of(rq);
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
@@ -2893,7 +2895,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
 	}
 
-	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+	decayed = __update_load_avg(now, cpu, sa,
 		scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
 
 #ifndef CONFIG_64BIT
@@ -2901,28 +2903,6 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 	cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
-	return decayed || removed;
-}
-
-/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
-{
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	u64 now = cfs_rq_clock_task(cfs_rq);
-	struct rq *rq = rq_of(cfs_rq);
-	int cpu = cpu_of(rq);
-
-	/*
-	 * Track task load average for carrying it to new CPU after migrated, and
-	 * track group sched_entity load average for task_h_load calc in migration
-	 */
-	__update_load_avg(now, cpu, &se->avg,
-			  se->on_rq * scale_load_down(se->load.weight),
-			  cfs_rq->curr == se, NULL);
-
-	if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
-		update_tg_load_avg(cfs_rq, 0);
-
 	if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
 		unsigned long max = rq->cpu_capacity_orig;
 
@@ -2943,8 +2923,30 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
 		 * See cpu_util().
 		 */
 		cpufreq_update_util(rq_clock(rq),
-				    min(cfs_rq->avg.util_avg, max), max);
+				    min(sa->util_avg, max), max);
 	}
+
+	return decayed || removed;
+}
+
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct sched_entity *se, int update_tg)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	u64 now = cfs_rq_clock_task(cfs_rq);
+	struct rq *rq = rq_of(cfs_rq);
+	int cpu = cpu_of(rq);
+
+	/*
+	 * Track task load average for carrying it to new CPU after migrated, and
+	 * track group sched_entity load average for task_h_load calc in migration
+	 */
+	__update_load_avg(now, cpu, &se->avg,
+			  se->on_rq * scale_load_down(se->load.weight),
+			  cfs_rq->curr == se, NULL);
+
+	if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+		update_tg_load_avg(cfs_rq, 0);
 }
 
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-- 
cgit v1.2.3


From 41e0d37f7ac81297c07ba311e4ad39465b8c8295 Mon Sep 17 00:00:00 2001
From: Steve Muckle <steve.muckle@linaro.org>
Date: Mon, 21 Mar 2016 17:21:08 -0700
Subject: sched/fair: Do not call cpufreq hook unless util changed

There's no reason to call the cpufreq hook if the root cfs_rq
utilization has not been modified.

Signed-off-by: Steve Muckle <smuckle@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Juri Lelli <Juri.Lelli@arm.com>
Cc: Michael Turquette <mturquette@baylibre.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: http://lkml.kernel.org/r/1458606068-7476-2-git-send-email-smuckle@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6df80d47a525..81552819444c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2879,20 +2879,21 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
 	struct sched_avg *sa = &cfs_rq->avg;
 	struct rq *rq = rq_of(cfs_rq);
-	int decayed, removed = 0;
+	int decayed, removed_load = 0, removed_util = 0;
 	int cpu = cpu_of(rq);
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
 		sa->load_avg = max_t(long, sa->load_avg - r, 0);
 		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
-		removed = 1;
+		removed_load = 1;
 	}
 
 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
 		sa->util_avg = max_t(long, sa->util_avg - r, 0);
 		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+		removed_util = 1;
 	}
 
 	decayed = __update_load_avg(now, cpu, sa,
@@ -2903,7 +2904,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 	cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
-	if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
+	if (cpu == smp_processor_id() && &rq->cfs == cfs_rq &&
+	    (decayed || removed_util)) {
 		unsigned long max = rq->cpu_capacity_orig;
 
 		/*
@@ -2926,7 +2928,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 				    min(sa->util_avg, max), max);
 	}
 
-	return decayed || removed;
+	return decayed || removed_load;
 }
 
 /* Update task and its cfs_rq load average */
-- 
cgit v1.2.3


From a2c6c91f98247fef0fe75216d607812485aeb0df Mon Sep 17 00:00:00 2001
From: Steve Muckle <steve.muckle@linaro.org>
Date: Thu, 24 Mar 2016 15:26:07 -0700
Subject: sched/fair: Call cpufreq hook in additional paths

The cpufreq hook should be called any time the root CFS rq utilization
changes. This can occur when a task is switched to or from the fair
class, or a task moves between groups or CPUs, but these paths
currently do not call the cpufreq hook.

Fix this by adding the hook to attach_entity_load_avg() and
detach_entity_load_avg().

Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Steve Muckle <smuckle@linaro.org>
[ Added the .update_freq argument to update_cfs_rq_load_avg() to avoid a double cpufreq call. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Juri Lelli <Juri.Lelli@arm.com>
Cc: Michael Turquette <mturquette@baylibre.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Patrick Bellasi <patrick.bellasi@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1458858367-2831-1-git-send-email-smuckle@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 73 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 42 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 81552819444c..c328bd77fe35 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2874,13 +2874,41 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
 
 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
 
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	int cpu = cpu_of(rq);
+
+	if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
+		unsigned long max = rq->cpu_capacity_orig;
+
+		/*
+		 * There are a few boundary cases this might miss but it should
+		 * get called often enough that that should (hopefully) not be
+		 * a real problem -- added to that it only calls on the local
+		 * CPU, so if we enqueue remotely we'll miss an update, but
+		 * the next tick/schedule should update.
+		 *
+		 * It will not get called when we go idle, because the idle
+		 * thread is a different class (!fair), nor will the utilization
+		 * number include things like RT tasks.
+		 *
+		 * As is, the util number is not freq-invariant (we'd have to
+		 * implement arch_scale_freq_capacity() for that).
+		 *
+		 * See cpu_util().
+		 */
+		cpufreq_update_util(rq_clock(rq),
+				    min(cfs_rq->avg.util_avg, max), max);
+	}
+}
+
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
-static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 {
 	struct sched_avg *sa = &cfs_rq->avg;
-	struct rq *rq = rq_of(cfs_rq);
 	int decayed, removed_load = 0, removed_util = 0;
-	int cpu = cpu_of(rq);
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
@@ -2896,7 +2924,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 		removed_util = 1;
 	}
 
-	decayed = __update_load_avg(now, cpu, sa,
+	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
 		scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
 
 #ifndef CONFIG_64BIT
@@ -2904,29 +2932,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 	cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
-	if (cpu == smp_processor_id() && &rq->cfs == cfs_rq &&
-	    (decayed || removed_util)) {
-		unsigned long max = rq->cpu_capacity_orig;
-
-		/*
-		 * There are a few boundary cases this might miss but it should
-		 * get called often enough that that should (hopefully) not be
-		 * a real problem -- added to that it only calls on the local
-		 * CPU, so if we enqueue remotely we'll miss an update, but
-		 * the next tick/schedule should update.
-		 *
-		 * It will not get called when we go idle, because the idle
-		 * thread is a different class (!fair), nor will the utilization
-		 * number include things like RT tasks.
-		 *
-		 * As is, the util number is not freq-invariant (we'd have to
-		 * implement arch_scale_freq_capacity() for that).
-		 *
-		 * See cpu_util().
-		 */
-		cpufreq_update_util(rq_clock(rq),
-				    min(sa->util_avg, max), max);
-	}
+	if (update_freq && (decayed || removed_util))
+		cfs_rq_util_change(cfs_rq);
 
 	return decayed || removed_load;
 }
@@ -2947,7 +2954,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
 			  se->on_rq * scale_load_down(se->load.weight),
 			  cfs_rq->curr == se, NULL);
 
-	if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+	if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
 		update_tg_load_avg(cfs_rq, 0);
 }
 
@@ -2976,6 +2983,8 @@ skip_aging:
 	cfs_rq->avg.load_sum += se->avg.load_sum;
 	cfs_rq->avg.util_avg += se->avg.util_avg;
 	cfs_rq->avg.util_sum += se->avg.util_sum;
+
+	cfs_rq_util_change(cfs_rq);
 }
 
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2988,6 +2997,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
 	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
 	cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+
+	cfs_rq_util_change(cfs_rq);
 }
 
 /* Add the load generated by se into cfs_rq's load average */
@@ -3005,7 +3016,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			cfs_rq->curr == se, NULL);
 	}
 
-	decayed = update_cfs_rq_load_avg(now, cfs_rq);
+	decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
 
 	cfs_rq->runnable_load_avg += sa->load_avg;
 	cfs_rq->runnable_load_sum += sa->load_sum;
@@ -6213,7 +6224,7 @@ static void update_blocked_averages(int cpu)
 		if (throttled_hierarchy(cfs_rq))
 			continue;
 
-		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
 			update_tg_load_avg(cfs_rq, 0);
 	}
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6274,7 +6285,7 @@ static inline void update_blocked_averages(int cpu)
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
 	update_rq_clock(rq);
-	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
-- 
cgit v1.2.3


From cee1afce3053e7aa0793fbd5f2e845fa2cef9e33 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 13 Apr 2016 15:56:50 +0200
Subject: sched/fair: Gather CPU load functions under a more conventional
 namespace

The CPU load update related functions have a weak naming convention
currently, starting with update_cpu_load_*() which isn't ideal as
"update" is a very generic concept.

Since two of these functions are public already (and a third is to come)
that's enough to introduce a more conventional naming scheme. So let's
do the following rename instead:

	update_cpu_load_*() -> cpu_load_update_*()

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1460555812-25375-2-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/trace/ftrace.txt | 10 +++++-----
 include/linux/sched.h          |  4 ++--
 kernel/sched/core.c            |  2 +-
 kernel/sched/fair.c            | 24 ++++++++++++------------
 kernel/sched/sched.h           |  4 ++--
 kernel/time/tick-sched.c       |  2 +-
 6 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index f52f297cb406..9857606dd7b7 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -1562,12 +1562,12 @@ Doing the same with chrt -r 5 and function-trace set.
   <idle>-0       3dN.1   12us : menu_hrtimer_cancel <-tick_nohz_idle_exit
   <idle>-0       3dN.1   12us : ktime_get <-tick_nohz_idle_exit
   <idle>-0       3dN.1   12us : tick_do_update_jiffies64 <-tick_nohz_idle_exit
-  <idle>-0       3dN.1   13us : update_cpu_load_nohz <-tick_nohz_idle_exit
-  <idle>-0       3dN.1   13us : _raw_spin_lock <-update_cpu_load_nohz
+  <idle>-0       3dN.1   13us : cpu_load_update_nohz <-tick_nohz_idle_exit
+  <idle>-0       3dN.1   13us : _raw_spin_lock <-cpu_load_update_nohz
   <idle>-0       3dN.1   13us : add_preempt_count <-_raw_spin_lock
-  <idle>-0       3dN.2   13us : __update_cpu_load <-update_cpu_load_nohz
-  <idle>-0       3dN.2   14us : sched_avg_update <-__update_cpu_load
-  <idle>-0       3dN.2   14us : _raw_spin_unlock <-update_cpu_load_nohz
+  <idle>-0       3dN.2   13us : __cpu_load_update <-cpu_load_update_nohz
+  <idle>-0       3dN.2   14us : sched_avg_update <-__cpu_load_update
+  <idle>-0       3dN.2   14us : _raw_spin_unlock <-cpu_load_update_nohz
   <idle>-0       3dN.2   14us : sub_preempt_count <-_raw_spin_unlock
   <idle>-0       3dN.1   15us : calc_load_exit_idle <-tick_nohz_idle_exit
   <idle>-0       3dN.1   15us : touch_softlockup_watchdog <-tick_nohz_idle_exit
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 13c1c1d07270..0b7f6028a50b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -178,9 +178,9 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void update_cpu_load_nohz(int active);
+extern void cpu_load_update_nohz(int active);
 #else
-static inline void update_cpu_load_nohz(int active) { }
+static inline void cpu_load_update_nohz(int active) { }
 #endif
 
 extern void dump_cpu_task(int cpu);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 06efbb9c9544..c98a2688f390 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2917,7 +2917,7 @@ void scheduler_tick(void)
 	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
-	update_cpu_load_active(rq);
+	cpu_load_update_active(rq);
 	calc_global_load_tick(rq);
 	raw_spin_unlock(&rq->lock);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c328bd77fe35..ecd81c4ebb56 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4559,7 +4559,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
 }
 
 /**
- * __update_cpu_load - update the rq->cpu_load[] statistics
+ * __cpu_load_update - update the rq->cpu_load[] statistics
  * @this_rq: The rq to update statistics for
  * @this_load: The current load
  * @pending_updates: The number of missed updates
@@ -4594,7 +4594,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
  * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
  * term. See the @active paramter.
  */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+static void __cpu_load_update(struct rq *this_rq, unsigned long this_load,
 			      unsigned long pending_updates, int active)
 {
 	unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
@@ -4642,7 +4642,7 @@ static unsigned long weighted_cpuload(const int cpu)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
-static void __update_cpu_load_nohz(struct rq *this_rq,
+static void __cpu_load_update_nohz(struct rq *this_rq,
 				   unsigned long curr_jiffies,
 				   unsigned long load,
 				   int active)
@@ -4657,7 +4657,7 @@ static void __update_cpu_load_nohz(struct rq *this_rq,
 		 * In the NOHZ_FULL case, we were non-idle, we should consider
 		 * its weighted load.
 		 */
-		__update_cpu_load(this_rq, load, pending_updates, active);
+		__cpu_load_update(this_rq, load, pending_updates, active);
 	}
 }
 
@@ -4678,7 +4678,7 @@ static void __update_cpu_load_nohz(struct rq *this_rq,
  * Called from nohz_idle_balance() to update the load ratings before doing the
  * idle balance.
  */
-static void update_cpu_load_idle(struct rq *this_rq)
+static void cpu_load_update_idle(struct rq *this_rq)
 {
 	/*
 	 * bail if there's load or we're actually up-to-date.
@@ -4686,13 +4686,13 @@ static void update_cpu_load_idle(struct rq *this_rq)
 	if (weighted_cpuload(cpu_of(this_rq)))
 		return;
 
-	__update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
+	__cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
 }
 
 /*
  * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
  */
-void update_cpu_load_nohz(int active)
+void cpu_load_update_nohz(int active)
 {
 	struct rq *this_rq = this_rq();
 	unsigned long curr_jiffies = READ_ONCE(jiffies);
@@ -4702,7 +4702,7 @@ void update_cpu_load_nohz(int active)
 		return;
 
 	raw_spin_lock(&this_rq->lock);
-	__update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
+	__cpu_load_update_nohz(this_rq, curr_jiffies, load, active);
 	raw_spin_unlock(&this_rq->lock);
 }
 #endif /* CONFIG_NO_HZ */
@@ -4710,14 +4710,14 @@ void update_cpu_load_nohz(int active)
 /*
  * Called from scheduler_tick()
  */
-void update_cpu_load_active(struct rq *this_rq)
+void cpu_load_update_active(struct rq *this_rq)
 {
 	unsigned long load = weighted_cpuload(cpu_of(this_rq));
 	/*
-	 * See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
+	 * See the mess around cpu_load_update_idle() / cpu_load_update_nohz().
 	 */
 	this_rq->last_load_update_tick = jiffies;
-	__update_cpu_load(this_rq, load, 1, 1);
+	__cpu_load_update(this_rq, load, 1, 1);
 }
 
 /*
@@ -8031,7 +8031,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 		if (time_after_eq(jiffies, rq->next_balance)) {
 			raw_spin_lock_irq(&rq->lock);
 			update_rq_clock(rq);
-			update_cpu_load_idle(rq);
+			cpu_load_update_idle(rq);
 			raw_spin_unlock_irq(&rq->lock);
 			rebalance_domains(rq, CPU_IDLE);
 		}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a7cbad7b3ad2..32d9e22cfacf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -31,9 +31,9 @@ extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq);
 
 #ifdef CONFIG_SMP
-extern void update_cpu_load_active(struct rq *this_rq);
+extern void cpu_load_update_active(struct rq *this_rq);
 #else
-static inline void update_cpu_load_active(struct rq *this_rq) { }
+static inline void cpu_load_update_active(struct rq *this_rq) { }
 #endif
 
 /*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 58e3310c9b21..66bdc9acc283 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -806,7 +806,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int
 {
 	/* Update jiffies first */
 	tick_do_update_jiffies64(now);
-	update_cpu_load_nohz(active);
+	cpu_load_update_nohz(active);
 
 	calc_load_exit_idle();
 	touch_softlockup_watchdog_sched();
-- 
cgit v1.2.3


From 1f41906a6fda1114debd3898668bd7ab6470ee41 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 13 Apr 2016 15:56:51 +0200
Subject: sched/fair: Correctly handle nohz ticks CPU load accounting

Ticks can happen while the CPU is in dynticks-idle or dynticks-singletask
mode. In fact "nohz" or "dynticks" only mean that we exit the periodic
mode and we try to minimize the ticks as much as possible. The nohz
subsystem uses a confusing terminology with the internal state
"ts->tick_stopped" which is also available through its public interface
with tick_nohz_tick_stopped(). This is a misnomer as the tick is instead
reduced with the best effort rather than stopped. In the best case the
tick can indeed be actually stopped but there is no guarantee about that.
If a timer needs to fire one second later, a tick will fire while the
CPU is in nohz mode and this is a very common scenario.

Now this confusion happens to be a problem with CPU load updates:
cpu_load_update_active() doesn't handle nohz ticks correctly because it
assumes that ticks are completely stopped in nohz mode and that
cpu_load_update_active() can't be called in dynticks mode. When that
happens, the whole previous tickless load is ignored and the function
just records the load for the current tick, ignoring potentially long
idle periods behind.

In order to solve this, we could account the current load for the
previous nohz time but there is a risk that we account the load of a
task that got freshly enqueued for the whole nohz period.

So instead, lets record the dynticks load on nohz frame entry so we know
what to record in case of nohz ticks, then use this record to account
the tickless load on nohz ticks and nohz frame end.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1460555812-25375-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h    |  6 ++-
 kernel/sched/fair.c      | 97 +++++++++++++++++++++++++++++++-----------------
 kernel/time/tick-sched.c |  9 +++--
 3 files changed, 72 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0b7f6028a50b..d894f2d61388 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -178,9 +178,11 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void cpu_load_update_nohz(int active);
+extern void cpu_load_update_nohz_start(void);
+extern void cpu_load_update_nohz_stop(void);
 #else
-static inline void cpu_load_update_nohz(int active) { }
+static inline void cpu_load_update_nohz_start(void) { }
+static inline void cpu_load_update_nohz_stop(void) { }
 #endif
 
 extern void dump_cpu_task(int cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ecd81c4ebb56..b70367a3e1ef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4563,7 +4563,6 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
  * @this_rq: The rq to update statistics for
  * @this_load: The current load
  * @pending_updates: The number of missed updates
- * @active: !0 for NOHZ_FULL
  *
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
@@ -4592,12 +4591,12 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
  *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
  *
  * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
- * term. See the @active paramter.
+ * term.
  */
-static void __cpu_load_update(struct rq *this_rq, unsigned long this_load,
-			      unsigned long pending_updates, int active)
+static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
+			    unsigned long pending_updates)
 {
-	unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
+	unsigned long tickless_load = this_rq->cpu_load[0];
 	int i, scale;
 
 	this_rq->nr_load_updates++;
@@ -4642,10 +4641,23 @@ static unsigned long weighted_cpuload(const int cpu)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
-static void __cpu_load_update_nohz(struct rq *this_rq,
-				   unsigned long curr_jiffies,
-				   unsigned long load,
-				   int active)
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we need to avoid the delta approach from the regular tick when
+ * possible since that would seriously skew the load calculation. This is why we
+ * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
+ * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
+ * loop exit, nohz_idle_balance, nohz full exit...)
+ *
+ * This means we might still be one tick off for nohz periods.
+ */
+
+static void cpu_load_update_nohz(struct rq *this_rq,
+				 unsigned long curr_jiffies,
+				 unsigned long load)
 {
 	unsigned long pending_updates;
 
@@ -4657,23 +4669,10 @@ static void __cpu_load_update_nohz(struct rq *this_rq,
 		 * In the NOHZ_FULL case, we were non-idle, we should consider
 		 * its weighted load.
 		 */
-		__cpu_load_update(this_rq, load, pending_updates, active);
+		cpu_load_update(this_rq, load, pending_updates);
 	}
 }
 
-/*
- * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
- *
- * Therefore we cannot use the delta approach from the regular tick since that
- * would seriously skew the load calculation. However we'll make do for those
- * updates happening while idle (nohz_idle_balance) or coming out of idle
- * (tick_nohz_idle_exit).
- *
- * This means we might still be one tick off for nohz periods.
- */
-
 /*
  * Called from nohz_idle_balance() to update the load ratings before doing the
  * idle balance.
@@ -4686,26 +4685,56 @@ static void cpu_load_update_idle(struct rq *this_rq)
 	if (weighted_cpuload(cpu_of(this_rq)))
 		return;
 
-	__cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
+	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
 }
 
 /*
- * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
+ * Record CPU load on nohz entry so we know the tickless load to account
+ * on nohz exit. cpu_load[0] happens then to be updated more frequently
+ * than other cpu_load[idx] but it should be fine as cpu_load readers
+ * shouldn't rely into synchronized cpu_load[*] updates.
  */
-void cpu_load_update_nohz(int active)
+void cpu_load_update_nohz_start(void)
 {
 	struct rq *this_rq = this_rq();
+
+	/*
+	 * This is all lockless but should be fine. If weighted_cpuload changes
+	 * concurrently we'll exit nohz. And cpu_load write can race with
+	 * cpu_load_update_idle() but both updater would be writing the same.
+	 */
+	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+}
+
+/*
+ * Account the tickless load in the end of a nohz frame.
+ */
+void cpu_load_update_nohz_stop(void)
+{
 	unsigned long curr_jiffies = READ_ONCE(jiffies);
-	unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
+	struct rq *this_rq = this_rq();
+	unsigned long load;
 
 	if (curr_jiffies == this_rq->last_load_update_tick)
 		return;
 
+	load = weighted_cpuload(cpu_of(this_rq));
 	raw_spin_lock(&this_rq->lock);
-	__cpu_load_update_nohz(this_rq, curr_jiffies, load, active);
+	cpu_load_update_nohz(this_rq, curr_jiffies, load);
 	raw_spin_unlock(&this_rq->lock);
 }
-#endif /* CONFIG_NO_HZ */
+#else /* !CONFIG_NO_HZ_COMMON */
+static inline void cpu_load_update_nohz(struct rq *this_rq,
+					unsigned long curr_jiffies,
+					unsigned long load) { }
+#endif /* CONFIG_NO_HZ_COMMON */
+
+static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
+{
+	/* See the mess around cpu_load_update_nohz(). */
+	this_rq->last_load_update_tick = READ_ONCE(jiffies);
+	cpu_load_update(this_rq, load, 1);
+}
 
 /*
  * Called from scheduler_tick()
@@ -4713,11 +4742,11 @@ void cpu_load_update_nohz(int active)
 void cpu_load_update_active(struct rq *this_rq)
 {
 	unsigned long load = weighted_cpuload(cpu_of(this_rq));
-	/*
-	 * See the mess around cpu_load_update_idle() / cpu_load_update_nohz().
-	 */
-	this_rq->last_load_update_tick = jiffies;
-	__cpu_load_update(this_rq, load, 1, 1);
+
+	if (tick_nohz_tick_stopped())
+		cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
+	else
+		cpu_load_update_periodic(this_rq, load);
 }
 
 /*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 66bdc9acc283..31872bc53bc4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -776,6 +776,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	if (!ts->tick_stopped) {
 		nohz_balance_enter_idle(cpu);
 		calc_load_enter_idle();
+		cpu_load_update_nohz_start();
 
 		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
 		ts->tick_stopped = 1;
@@ -802,11 +803,11 @@ out:
 	return tick;
 }
 
-static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active)
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 {
 	/* Update jiffies first */
 	tick_do_update_jiffies64(now);
-	cpu_load_update_nohz(active);
+	cpu_load_update_nohz_stop();
 
 	calc_load_exit_idle();
 	touch_softlockup_watchdog_sched();
@@ -833,7 +834,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
 	if (can_stop_full_tick(ts))
 		tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
 	else if (ts->tick_stopped)
-		tick_nohz_restart_sched_tick(ts, ktime_get(), 1);
+		tick_nohz_restart_sched_tick(ts, ktime_get());
 #endif
 }
 
@@ -1024,7 +1025,7 @@ void tick_nohz_idle_exit(void)
 		tick_nohz_stop_idle(ts, now);
 
 	if (ts->tick_stopped) {
-		tick_nohz_restart_sched_tick(ts, now, 0);
+		tick_nohz_restart_sched_tick(ts, now);
 		tick_nohz_account_idle_ticks(ts);
 	}
 
-- 
cgit v1.2.3


From 9fd81dd5ce0b12341c9f83346f8d32ac68bd3841 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 19 Apr 2016 17:36:51 +0200
Subject: sched/fair: Optimize !CONFIG_NO_HZ_COMMON CPU load updates

Some code in CPU load update only concern NO_HZ configs but it is
built on all configurations. When NO_HZ isn't built, that code is harmless
but just happens to take some useless ressources in CPU and memory:

1) one useless field in struct rq
2) jiffies record on every tick that is never used (cpu_load_update_periodic)
3) decay_load_missed is called two times on every tick to eventually
   return immediately with no action taken. And that function is dead
   code.

For pure optimization purposes, lets conditionally build the NO_HZ
related code.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1461080211-16271-1-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 5 ++---
 kernel/sched/fair.c  | 9 +++++++--
 kernel/sched/sched.h | 6 ++++--
 3 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c98a2688f390..71dffbb27ce6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7381,8 +7381,6 @@ void __init sched_init(void)
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
 
-		rq->last_load_update_tick = jiffies;
-
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
 		rq->rd = NULL;
@@ -7401,12 +7399,13 @@ void __init sched_init(void)
 
 		rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
+		rq->last_load_update_tick = jiffies;
 		rq->nohz_flags = 0;
 #endif
 #ifdef CONFIG_NO_HZ_FULL
 		rq->last_sched_tick = 0;
 #endif
-#endif
+#endif /* CONFIG_SMP */
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b70367a3e1ef..b8a33abce650 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4491,7 +4491,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
-
+#ifdef CONFIG_NO_HZ_COMMON
 /*
  * per rq 'load' arrray crap; XXX kill this.
  */
@@ -4557,6 +4557,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
 	}
 	return load;
 }
+#endif /* CONFIG_NO_HZ_COMMON */
 
 /**
  * __cpu_load_update - update the rq->cpu_load[] statistics
@@ -4596,7 +4597,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
 static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
 			    unsigned long pending_updates)
 {
-	unsigned long tickless_load = this_rq->cpu_load[0];
+	unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
 	int i, scale;
 
 	this_rq->nr_load_updates++;
@@ -4609,6 +4610,7 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
 		/* scale is effectively 1 << i now, and >> i divides by scale */
 
 		old_load = this_rq->cpu_load[i];
+#ifdef CONFIG_NO_HZ_COMMON
 		old_load = decay_load_missed(old_load, pending_updates - 1, i);
 		if (tickless_load) {
 			old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
@@ -4619,6 +4621,7 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
 			 */
 			old_load += tickless_load;
 		}
+#endif
 		new_load = this_load;
 		/*
 		 * Round up the averaging division if load is increasing. This
@@ -4731,8 +4734,10 @@ static inline void cpu_load_update_nohz(struct rq *this_rq,
 
 static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
 {
+#ifdef CONFIG_NO_HZ_COMMON
 	/* See the mess around cpu_load_update_nohz(). */
 	this_rq->last_load_update_tick = READ_ONCE(jiffies);
+#endif
 	cpu_load_update(this_rq, load, 1);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 32d9e22cfacf..69da6fcaa0e8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -585,11 +585,13 @@ struct rq {
 #endif
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
-	unsigned long last_load_update_tick;
 #ifdef CONFIG_NO_HZ_COMMON
+#ifdef CONFIG_SMP
+	unsigned long last_load_update_tick;
+#endif /* CONFIG_SMP */
 	u64 nohz_stamp;
 	unsigned long nohz_flags;
-#endif
+#endif /* CONFIG_NO_HZ_COMMON */
 #ifdef CONFIG_NO_HZ_FULL
 	unsigned long last_sched_tick;
 #endif
-- 
cgit v1.2.3


From fec148c000d0f9ac21679601722811eb60b4cc52 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Thu, 14 Apr 2016 20:19:28 +0800
Subject: sched/deadline: Fix a bug in dl_overflow()

I got a minus(very big) dl_b->total_bw during my deadline tests.

    # grep dl /proc/sched_debug
    dl_rq[0]:
    .dl_nr_running                 : 0
    .dl_bw->bw                     : 996147
    .dl_bw->total_bw               : -222297900

Something unusual must have happened.

After some digging, I finally noticed that when changing a deadline
task to normal(cfs), and changing it back to deadline immediately,
after it died, we will got the wrong dl_bw->total_bw.

The root cause is in dl_overflow(), it has:
    if (new_bw == p->dl.dl_bw)
	return 0;

1) When a deadline task is changed to !deadline task, it will start
   dl timer in switched_from_dl(), and retain previous deadline parameter
   till the timer expires.

2) If we change it back to deadline with the same bandwidth parameter
   before the timer expires, as it keeps the old bandwidth although it
   is not a deadline task. dl_overflow() simply returns success without
   updating the right data, and got the wrong dl_bw->total_bw.

The solution is simple, if @p is not deadline, don't return.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Juri Lelli <juri.lelli@arm.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1460636368-1993-1-git-send-email-xlpang@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 71dffbb27ce6..9d84d6004745 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2378,7 +2378,8 @@ static int dl_overflow(struct task_struct *p, int policy,
 	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
 	int cpus, err = -1;
 
-	if (new_bw == p->dl.dl_bw)
+	/* !deadline task may carry old deadline bandwidth */
+	if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
 		return 0;
 
 	/*
-- 
cgit v1.2.3


From 80df554275c21edca22ece02448bdb378c2ee9f1 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 22 Apr 2016 17:31:24 +0200
Subject: taskstats: use the libnl API to align nlattr on 64-bit

Goal of this patch is to use the new libnl API to align netlink attribute
when needed.
The layout of the netlink message will be a bit different after the patch,
because the padattr (TASKSTATS_TYPE_STATS) will be inside the nested
attribute instead of before it.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/taskstats.c | 37 +++++--------------------------------
 1 file changed, 5 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 21f82c29c914..b3f05ee20d18 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -357,10 +357,6 @@ static int parse(struct nlattr *na, struct cpumask *mask)
 	return ret;
 }
 
-#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
-#define TASKSTATS_NEEDS_PADDING 1
-#endif
-
 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
 {
 	struct nlattr *na, *ret;
@@ -370,29 +366,6 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
 			? TASKSTATS_TYPE_AGGR_PID
 			: TASKSTATS_TYPE_AGGR_TGID;
 
-	/*
-	 * The taskstats structure is internally aligned on 8 byte
-	 * boundaries but the layout of the aggregrate reply, with
-	 * two NLA headers and the pid (each 4 bytes), actually
-	 * force the entire structure to be unaligned. This causes
-	 * the kernel to issue unaligned access warnings on some
-	 * architectures like ia64. Unfortunately, some software out there
-	 * doesn't properly unroll the NLA packet and assumes that the start
-	 * of the taskstats structure will always be 20 bytes from the start
-	 * of the netlink payload. Aligning the start of the taskstats
-	 * structure breaks this software, which we don't want. So, for now
-	 * the alignment only happens on architectures that require it
-	 * and those users will have to update to fixed versions of those
-	 * packages. Space is reserved in the packet only when needed.
-	 * This ifdef should be removed in several years e.g. 2012 once
-	 * we can be confident that fixed versions are installed on most
-	 * systems. We add the padding before the aggregate since the
-	 * aggregate is already a defined type.
-	 */
-#ifdef TASKSTATS_NEEDS_PADDING
-	if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
-		goto err;
-#endif
 	na = nla_nest_start(skb, aggr);
 	if (!na)
 		goto err;
@@ -401,7 +374,8 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
 		nla_nest_cancel(skb, na);
 		goto err;
 	}
-	ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
+	ret = nla_reserve_64bit(skb, TASKSTATS_TYPE_STATS,
+				sizeof(struct taskstats), TASKSTATS_TYPE_NULL);
 	if (!ret) {
 		nla_nest_cancel(skb, na);
 		goto err;
@@ -500,10 +474,9 @@ static size_t taskstats_packet_size(void)
 	size_t size;
 
 	size = nla_total_size(sizeof(u32)) +
-		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
-#ifdef TASKSTATS_NEEDS_PADDING
-	size += nla_total_size(0); /* Padding for alignment */
-#endif
+		nla_total_size_64bit(sizeof(struct taskstats)) +
+		nla_total_size(0);
+
 	return size;
 }
 
-- 
cgit v1.2.3


From 5cf1cacb49aee39c3e02ae87068fc3c6430659b0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 21 Apr 2016 19:06:48 -0400
Subject: cgroup, cpuset: replace cpuset_post_attach_flush() with
 cgroup_subsys->post_attach callback

Since e93ad19d0564 ("cpuset: make mm migration asynchronous"), cpuset
kicks off asynchronous NUMA node migration if necessary during task
migration and flushes it from cpuset_post_attach_flush() which is
called at the end of __cgroup_procs_write().  This is to avoid
performing migration with cgroup_threadgroup_rwsem write-locked which
can lead to deadlock through dependency on kworker creation.

memcg has a similar issue with charge moving, so let's convert it to
an official callback rather than the current one-off cpuset specific
function.  This patch adds cgroup_subsys->post_attach callback and
makes cpuset register cpuset_post_attach_flush() as its ->post_attach.

The conversion is mostly one-to-one except that the new callback is
called under cgroup_mutex.  This is to guarantee that no other
migration operations are started before ->post_attach callbacks are
finished.  cgroup_mutex is one of the outermost mutex in the system
and has never been and shouldn't be a problem.  We can add specialized
synchronization around __cgroup_procs_write() but I don't think
there's any noticeable benefit.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: <stable@vger.kernel.org> # 4.4+ prerequisite for the next patch
---
 include/linux/cgroup-defs.h | 1 +
 include/linux/cpuset.h      | 6 ------
 kernel/cgroup.c             | 7 +++++--
 kernel/cpuset.c             | 4 ++--
 4 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 3e39ae5bc799..5b17de62c962 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -444,6 +444,7 @@ struct cgroup_subsys {
 	int (*can_attach)(struct cgroup_taskset *tset);
 	void (*cancel_attach)(struct cgroup_taskset *tset);
 	void (*attach)(struct cgroup_taskset *tset);
+	void (*post_attach)(void);
 	int (*can_fork)(struct task_struct *task);
 	void (*cancel_fork)(struct task_struct *task);
 	void (*fork)(struct task_struct *task);
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index fea160ee5803..85a868ccb493 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -137,8 +137,6 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 	task_unlock(current);
 }
 
-extern void cpuset_post_attach_flush(void);
-
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
@@ -245,10 +243,6 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
 	return false;
 }
 
-static inline void cpuset_post_attach_flush(void)
-{
-}
-
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 671dc05c0b0f..909a7d31ffd3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2825,9 +2825,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 				    size_t nbytes, loff_t off, bool threadgroup)
 {
 	struct task_struct *tsk;
+	struct cgroup_subsys *ss;
 	struct cgroup *cgrp;
 	pid_t pid;
-	int ret;
+	int ssid, ret;
 
 	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
 		return -EINVAL;
@@ -2875,8 +2876,10 @@ out_unlock_rcu:
 	rcu_read_unlock();
 out_unlock_threadgroup:
 	percpu_up_write(&cgroup_threadgroup_rwsem);
+	for_each_subsys(ss, ssid)
+		if (ss->post_attach)
+			ss->post_attach();
 	cgroup_kn_unlock(of->kn);
-	cpuset_post_attach_flush();
 	return ret ?: nbytes;
 }
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 00ab5c2b7c5b..1902956baba1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -58,7 +58,6 @@
 #include <asm/uaccess.h>
 #include <linux/atomic.h>
 #include <linux/mutex.h>
-#include <linux/workqueue.h>
 #include <linux/cgroup.h>
 #include <linux/wait.h>
 
@@ -1016,7 +1015,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 	}
 }
 
-void cpuset_post_attach_flush(void)
+static void cpuset_post_attach(void)
 {
 	flush_workqueue(cpuset_migrate_mm_wq);
 }
@@ -2087,6 +2086,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 	.can_attach	= cpuset_can_attach,
 	.cancel_attach	= cpuset_cancel_attach,
 	.attach		= cpuset_attach,
+	.post_attach	= cpuset_post_attach,
 	.bind		= cpuset_bind,
 	.legacy_cftypes	= files,
 	.early_init	= true,
-- 
cgit v1.2.3


From 205506228b69be4efbb3809957a69420f26a8d9f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Mon, 25 Apr 2016 22:40:12 -0400
Subject: tracing: Do not inherit event-fork option for instances

As the event-fork option requires doing work when enabled and disabled, it
can not be passed down to created instances. The instance must clear this
flag when it is created, and must clear it when its removed.

As more options may be created with this need, a macro ZEROED_TRACE_FLAGS is
created that holds the flags that must not be inherited by the top level
instance, and must be cleared on removal of instances.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 988a35263fdd..5e3ad3481e4b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -253,6 +253,9 @@ unsigned long long ns2usecs(cycle_t nsec)
 #define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK |			\
 	       TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD)
 
+/* trace_flags that are default zero for instances */
+#define ZEROED_TRACE_FLAGS \
+	TRACE_ITER_EVENT_FORK
 
 /*
  * The global_trace is the descriptor that holds the tracing
@@ -6710,7 +6713,7 @@ static int instance_mkdir(const char *name)
 	if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
 		goto out_free_tr;
 
-	tr->trace_flags = global_trace.trace_flags;
+	tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
 
 	cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
 
@@ -6784,6 +6787,12 @@ static int instance_rmdir(const char *name)
 
 	list_del(&tr->list);
 
+	/* Disable all the flags that were enabled coming in */
+	for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++) {
+		if ((1 << i) & ZEROED_TRACE_FLAGS)
+			set_tracer_flag(tr, 1 << i, 0);
+	}
+
 	tracing_set_nop(tr);
 	event_trace_del_tracer(tr);
 	ftrace_destroy_function_files(tr);
-- 
cgit v1.2.3


From c8585c6fcaf2011de54c3592e80a634a2b9e1a7f Mon Sep 17 00:00:00 2001
From: Daeho Jeong <daeho.jeong@samsung.com>
Date: Mon, 25 Apr 2016 23:22:35 -0400
Subject: ext4: fix races between changing inode journal mode and
 ext4_writepages

In ext4, there is a race condition between changing inode journal mode
and ext4_writepages(). While ext4_writepages() is executed on a
non-journalled mode inode, the inode's journal mode could be enabled
by ioctl() and then, some pages dirtied after switching the journal
mode will be still exposed to ext4_writepages() in non-journaled mode.
To resolve this problem, we use fs-wide per-cpu rw semaphore by Jan
Kara's suggestion because we don't want to waste ext4_inode_info's
space for this extra rare case.

Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h                |  4 ++++
 fs/ext4/inode.c               | 15 ++++++++++++---
 fs/ext4/super.c               |  4 ++++
 kernel/locking/percpu-rwsem.c |  1 +
 4 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cb00b1119ec9..ba5aecc07fbc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,6 +33,7 @@
 #include <linux/ratelimit.h>
 #include <crypto/hash.h>
 #include <linux/falloc.h>
+#include <linux/percpu-rwsem.h>
 #ifdef __KERNEL__
 #include <linux/compat.h>
 #endif
@@ -1508,6 +1509,9 @@ struct ext4_sb_info {
 	struct ratelimit_state s_err_ratelimit_state;
 	struct ratelimit_state s_warning_ratelimit_state;
 	struct ratelimit_state s_msg_ratelimit_state;
+
+	/* Barrier between changing inodes' journal flags and writepages ops. */
+	struct percpu_rw_semaphore s_journal_flag_rwsem;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 779ef4c11bc1..4d8ebbe00456 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2612,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping,
 	struct blk_plug plug;
 	bool give_up_on_write = false;
 
+	percpu_down_read(&sbi->s_journal_flag_rwsem);
 	trace_ext4_writepages(inode, wbc);
 
-	if (dax_mapping(mapping))
-		return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
-						   wbc);
+	if (dax_mapping(mapping)) {
+		ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+						  wbc);
+		goto out_writepages;
+	}
 
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
@@ -2786,6 +2789,7 @@ retry:
 out_writepages:
 	trace_ext4_writepages_result(inode, wbc, ret,
 				     nr_to_write - wbc->nr_to_write);
+	percpu_up_read(&sbi->s_journal_flag_rwsem);
 	return ret;
 }
 
@@ -5436,6 +5440,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	journal_t *journal;
 	handle_t *handle;
 	int err;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
 	/*
 	 * We have to be very careful here: changing a data block's
@@ -5475,6 +5480,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		}
 	}
 
+	percpu_down_write(&sbi->s_journal_flag_rwsem);
 	jbd2_journal_lock_updates(journal);
 
 	/*
@@ -5491,6 +5497,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		err = jbd2_journal_flush(journal);
 		if (err < 0) {
 			jbd2_journal_unlock_updates(journal);
+			percpu_up_write(&sbi->s_journal_flag_rwsem);
 			ext4_inode_resume_unlocked_dio(inode);
 			return err;
 		}
@@ -5499,6 +5506,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	ext4_set_aops(inode);
 
 	jbd2_journal_unlock_updates(journal);
+	percpu_up_write(&sbi->s_journal_flag_rwsem);
+
 	if (val)
 		up_write(&EXT4_I(inode)->i_mmap_sem);
 	ext4_inode_resume_unlocked_dio(inode);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 304c712dbe12..20c5d52253b4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+	percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -3930,6 +3931,9 @@ no_journal:
 	if (!err)
 		err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
 					  GFP_KERNEL);
+	if (!err)
+		err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
+
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "insufficient memory");
 		goto failed_mount6;
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f231e0bb311c..bec0b647f9cc 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
 	free_percpu(brw->fast_read_ctr);
 	brw->fast_read_ctr = NULL; /* catch use after free bugs */
 }
+EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
 /*
  * This is the fast-path for down_read/up_read. If it succeeds we rely
-- 
cgit v1.2.3


From 4812952f9c94f67b3cc78ad0a6482bf182aa5a44 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Sat, 23 Apr 2016 13:23:47 +0300
Subject: tracing: checking for NULL instead of IS_ERR()

tracing_map_elt_alloc() returns ERR_PTRs on error, never NULL.

Fixes: 08d43a5fa063 ('tracing: Add lock-free tracing_map')
Link: http://lkml.kernel.org/r/20160423102347.GA11136@mwanda

Acked-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/tracing_map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index e0f172932eca..e7dfc5eec669 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -814,7 +814,7 @@ static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt)
 	unsigned int i;
 
 	dup_elt = tracing_map_elt_alloc(elt->map);
-	if (!dup_elt)
+	if (IS_ERR(dup_elt))
 		return NULL;
 
 	if (elt->map->ops && elt->map->ops->elt_copy)
-- 
cgit v1.2.3


From 432480c58219eff32904b879eb3fcc1d268a3b06 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 25 Apr 2016 14:01:27 -0500
Subject: tracing: Add check for NULL event field when creating hist field

Smatch flagged create_hist_field() as possibly being able to
dereference a NULL pointer, although the current code exits in all
cases where the event field could be NULL, so it's not actually a
problem.

Still, to prevent future changes to the code from overlooking new
cases, make the NULL pointer check explicit and warn once in that
case.

Link: http://lkml.kernel.org/r/cfbc003f534a3e441b4313272fd412310aba6336.1461610073.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f98b6b3a2804..0c05b8a99806 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -371,6 +371,9 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 		goto out;
 	}
 
+	if (WARN_ON_ONCE(!field))
+		goto out;
+
 	if (is_string_field(field)) {
 		flags |= HIST_FIELD_FL_STRING;
 
-- 
cgit v1.2.3


From 6e4cf657defa8fb06dbf9ed91adb78414758f259 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Mon, 25 Apr 2016 14:01:28 -0500
Subject: tracing: Handle tracing_map_alloc_elts() error path correctly

If tracing_map_elt_alloc() fails, it will return ERR_PTR() instead of
NULL, so change the check to IS_ERROR().  We also need to set the
failed entry in the map->elts array to NULL instead of ERR_PTR() so
tracing_map_free_elts() doesn't try freeing an ERR_PTR().

tracing_map_free_elts() should also zero out what it frees so a
reentrant call won't find previously freed elements.

Link: http://lkml.kernel.org/r/f29d03b00bce3aac8cf151a8a30e6c83e5fee66d.1461610073.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/tracing_map.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index e7dfc5eec669..0a689bbb78ef 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -366,10 +366,13 @@ static void tracing_map_free_elts(struct tracing_map *map)
 	if (!map->elts)
 		return;
 
-	for (i = 0; i < map->max_elts; i++)
+	for (i = 0; i < map->max_elts; i++) {
 		tracing_map_elt_free(*(TRACING_MAP_ELT(map->elts, i)));
+		*(TRACING_MAP_ELT(map->elts, i)) = NULL;
+	}
 
 	tracing_map_array_free(map->elts);
+	map->elts = NULL;
 }
 
 static int tracing_map_alloc_elts(struct tracing_map *map)
@@ -383,7 +386,8 @@ static int tracing_map_alloc_elts(struct tracing_map *map)
 
 	for (i = 0; i < map->max_elts; i++) {
 		*(TRACING_MAP_ELT(map->elts, i)) = tracing_map_elt_alloc(map);
-		if (!(*(TRACING_MAP_ELT(map->elts, i)))) {
+		if (IS_ERR(*(TRACING_MAP_ELT(map->elts, i)))) {
+			*(TRACING_MAP_ELT(map->elts, i)) = NULL;
 			tracing_map_free_elts(map);
 
 			return -ENOMEM;
-- 
cgit v1.2.3


From 346c09f80459a3ad97df1816d6d606169a51001a Mon Sep 17 00:00:00 2001
From: Roman Pen <roman.penyaev@profitbricks.com>
Date: Tue, 26 Apr 2016 13:15:35 +0200
Subject: workqueue: fix ghost PENDING flag while doing MQ IO

The bug in a workqueue leads to a stalled IO request in MQ ctx->rq_list
with the following backtrace:

[  601.347452] INFO: task kworker/u129:5:1636 blocked for more than 120 seconds.
[  601.347574]       Tainted: G           O    4.4.5-1-storage+ #6
[  601.347651] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  601.348142] kworker/u129:5  D ffff880803077988     0  1636      2 0x00000000
[  601.348519] Workqueue: ibnbd_server_fileio_wq ibnbd_dev_file_submit_io_worker [ibnbd_server]
[  601.348999]  ffff880803077988 ffff88080466b900 ffff8808033f9c80 ffff880803078000
[  601.349662]  ffff880807c95000 7fffffffffffffff ffffffff815b0920 ffff880803077ad0
[  601.350333]  ffff8808030779a0 ffffffff815b01d5 0000000000000000 ffff880803077a38
[  601.350965] Call Trace:
[  601.351203]  [<ffffffff815b0920>] ? bit_wait+0x60/0x60
[  601.351444]  [<ffffffff815b01d5>] schedule+0x35/0x80
[  601.351709]  [<ffffffff815b2dd2>] schedule_timeout+0x192/0x230
[  601.351958]  [<ffffffff812d43f7>] ? blk_flush_plug_list+0xc7/0x220
[  601.352208]  [<ffffffff810bd737>] ? ktime_get+0x37/0xa0
[  601.352446]  [<ffffffff815b0920>] ? bit_wait+0x60/0x60
[  601.352688]  [<ffffffff815af784>] io_schedule_timeout+0xa4/0x110
[  601.352951]  [<ffffffff815b3a4e>] ? _raw_spin_unlock_irqrestore+0xe/0x10
[  601.353196]  [<ffffffff815b093b>] bit_wait_io+0x1b/0x70
[  601.353440]  [<ffffffff815b056d>] __wait_on_bit+0x5d/0x90
[  601.353689]  [<ffffffff81127bd0>] wait_on_page_bit+0xc0/0xd0
[  601.353958]  [<ffffffff81096db0>] ? autoremove_wake_function+0x40/0x40
[  601.354200]  [<ffffffff81127cc4>] __filemap_fdatawait_range+0xe4/0x140
[  601.354441]  [<ffffffff81127d34>] filemap_fdatawait_range+0x14/0x30
[  601.354688]  [<ffffffff81129a9f>] filemap_write_and_wait_range+0x3f/0x70
[  601.354932]  [<ffffffff811ced3b>] blkdev_fsync+0x1b/0x50
[  601.355193]  [<ffffffff811c82d9>] vfs_fsync_range+0x49/0xa0
[  601.355432]  [<ffffffff811cf45a>] blkdev_write_iter+0xca/0x100
[  601.355679]  [<ffffffff81197b1a>] __vfs_write+0xaa/0xe0
[  601.355925]  [<ffffffff81198379>] vfs_write+0xa9/0x1a0
[  601.356164]  [<ffffffff811c59d8>] kernel_write+0x38/0x50

The underlying device is a null_blk, with default parameters:

  queue_mode    = MQ
  submit_queues = 1

Verification that nullb0 has something inflight:

root@pserver8:~# cat /sys/block/nullb0/inflight
       0        1
root@pserver8:~# find /sys/block/nullb0/mq/0/cpu* -name rq_list -print -exec cat {} \;
...
/sys/block/nullb0/mq/0/cpu2/rq_list
CTX pending:
        ffff8838038e2400
...

During debug it became clear that stalled request is always inserted in
the rq_list from the following path:

   save_stack_trace_tsk + 34
   blk_mq_insert_requests + 231
   blk_mq_flush_plug_list + 281
   blk_flush_plug_list + 199
   wait_on_page_bit + 192
   __filemap_fdatawait_range + 228
   filemap_fdatawait_range + 20
   filemap_write_and_wait_range + 63
   blkdev_fsync + 27
   vfs_fsync_range + 73
   blkdev_write_iter + 202
   __vfs_write + 170
   vfs_write + 169
   kernel_write + 56

So blk_flush_plug_list() was called with from_schedule == true.

If from_schedule is true, that means that finally blk_mq_insert_requests()
offloads execution of __blk_mq_run_hw_queue() and uses kblockd workqueue,
i.e. it calls kblockd_schedule_delayed_work_on().

That means, that we race with another CPU, which is about to execute
__blk_mq_run_hw_queue() work.

Further debugging shows the following traces from different CPUs:

  CPU#0                                  CPU#1
  ----------------------------------     -------------------------------
  reqeust A inserted
  STORE hctx->ctx_map[0] bit marked
  kblockd_schedule...() returns 1
  <schedule to kblockd workqueue>
                                         request B inserted
                                         STORE hctx->ctx_map[1] bit marked
                                         kblockd_schedule...() returns 0
  *** WORK PENDING bit is cleared ***
  flush_busy_ctxs() is executed, but
  bit 1, set by CPU#1, is not observed

As a result request B pended forever.

This behaviour can be explained by speculative LOAD of hctx->ctx_map on
CPU#0, which is reordered with clear of PENDING bit and executed _before_
actual STORE of bit 1 on CPU#1.

The proper fix is an explicit full barrier <mfence>, which guarantees
that clear of PENDING bit is to be executed before all possible
speculative LOADS or STORES inside actual work function.

Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
Cc: Gioh Kim <gi-oh.kim@profitbricks.com>
Cc: Michael Wang <yun.wang@profitbricks.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: stable@vger.kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 16e13d8628a3..801a698564d4 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -666,6 +666,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
 	 */
 	smp_wmb();
 	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+	/*
+	 * The following mb guarantees that previous clear of a PENDING bit
+	 * will not be reordered with any speculative LOADS or STORES from
+	 * work->current_func, which is executed afterwards.  This possible
+	 * reordering can lead to a missed execution on attempt to qeueue
+	 * the same @work.  E.g. consider this case:
+	 *
+	 *   CPU#0                         CPU#1
+	 *   ----------------------------  --------------------------------
+	 *
+	 * 1  STORE event_indicated
+	 * 2  queue_work_on() {
+	 * 3    test_and_set_bit(PENDING)
+	 * 4 }                             set_..._and_clear_pending() {
+	 * 5                                 set_work_data() # clear bit
+	 * 6                                 smp_mb()
+	 * 7                               work->current_func() {
+	 * 8				      LOAD event_indicated
+	 *				   }
+	 *
+	 * Without an explicit full barrier speculative LOAD on line 8 can
+	 * be executed before CPU#0 does STORE on line 1.  If that happens,
+	 * CPU#0 observes the PENDING bit is still set and new execution of
+	 * a @work is not queued in a hope, that CPU#1 will eventually
+	 * finish the queued @work.  Meanwhile CPU#1 does not see
+	 * event_indicated is set, because speculative LOAD was executed
+	 * before actual STORE.
+	 */
+	smp_mb();
 }
 
 static void clear_work_data(struct work_struct *work)
-- 
cgit v1.2.3


From 4afe6495e5cb3c352d95f07512cbb227e607e2ce Mon Sep 17 00:00:00 2001
From: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
Date: Mon, 18 Apr 2016 15:23:29 +0800
Subject: tracing: Don't use the address of the buffer array name in
 copy_from_user

With the following code snippet:

    ...
    char buf[64];
    ...
    if (copy_from_user(&buf, ubuf, cnt))
    ...

Even though the value of "&buf" equals "buf", but there is no need
to get the address of the "buf" again. Use "buf" instead of "&buf".

Link: http://lkml.kernel.org/r/20160418152329.18b72bea@debian

Signed-off-by: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5e3ad3481e4b..46028d47d252 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3664,7 +3664,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 	if (cnt >= sizeof(buf))
 		return -EINVAL;
 
-	if (copy_from_user(&buf, ubuf, cnt))
+	if (copy_from_user(buf, ubuf, cnt))
 		return -EFAULT;
 
 	buf[cnt] = 0;
@@ -4537,7 +4537,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
 	if (cnt > MAX_TRACER_SIZE)
 		cnt = MAX_TRACER_SIZE;
 
-	if (copy_from_user(&buf, ubuf, cnt))
+	if (copy_from_user(buf, ubuf, cnt))
 		return -EFAULT;
 
 	buf[cnt] = 0;
@@ -5327,7 +5327,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
 	if (cnt >= sizeof(buf))
 		return -EINVAL;
 
-	if (copy_from_user(&buf, ubuf, cnt))
+	if (copy_from_user(buf, ubuf, cnt))
 		return -EFAULT;
 
 	buf[cnt] = 0;
-- 
cgit v1.2.3


From db0a6fb5d97afe01fd9c47d37c6daa82d4d4001d Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Thu, 21 Apr 2016 14:14:01 -0400
Subject: audit: add tty field to LOGIN event

The tty field was missing from AUDIT_LOGIN events.

Refactor code to create a new function audit_get_tty(), using it to
replace the call in audit_log_task_info() and to add it to
audit_log_set_loginuid().  Lock and bump the kref to protect it, adding
audit_put_tty() alias to decrement it.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h | 24 ++++++++++++++++++++++++
 kernel/audit.c        | 18 +++++-------------
 kernel/auditsc.c      |  8 ++++++--
 3 files changed, 35 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index b40ed5df5542..32cdafb312d8 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -26,6 +26,7 @@
 #include <linux/sched.h>
 #include <linux/ptrace.h>
 #include <uapi/linux/audit.h>
+#include <linux/tty.h>
 
 #define AUDIT_INO_UNSET ((unsigned long)-1)
 #define AUDIT_DEV_UNSET ((dev_t)-1)
@@ -343,6 +344,23 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 	return tsk->sessionid;
 }
 
+static inline struct tty_struct *audit_get_tty(struct task_struct *tsk)
+{
+	struct tty_struct *tty = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tsk->sighand->siglock, flags);
+	if (tsk->signal)
+		tty = tty_kref_get(tsk->signal->tty);
+	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+	return tty;
+}
+
+static inline void audit_put_tty(struct tty_struct *tty)
+{
+	tty_kref_put(tty);
+}
+
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
 extern void __audit_bprm(struct linux_binprm *bprm);
@@ -500,6 +518,12 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 {
 	return -1;
 }
+static inline struct tty_struct *audit_get_tty(struct task_struct *tsk)
+{
+	return NULL;
+}
+static inline void audit_put_tty(struct tty_struct *tty)
+{ }
 static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 { }
 static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
diff --git a/kernel/audit.c b/kernel/audit.c
index f52fbefede09..384374a1d232 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -64,7 +64,6 @@
 #include <linux/security.h>
 #endif
 #include <linux/freezer.h>
-#include <linux/tty.h>
 #include <linux/pid_namespace.h>
 #include <net/netns/generic.h>
 
@@ -1871,21 +1870,14 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 {
 	const struct cred *cred;
 	char comm[sizeof(tsk->comm)];
-	char *tty;
+	struct tty_struct *tty;
 
 	if (!ab)
 		return;
 
 	/* tsk == current */
 	cred = current_cred();
-
-	spin_lock_irq(&tsk->sighand->siglock);
-	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
-		tty = tsk->signal->tty->name;
-	else
-		tty = "(none)";
-	spin_unlock_irq(&tsk->sighand->siglock);
-
+	tty = audit_get_tty(tsk);
 	audit_log_format(ab,
 			 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
 			 " euid=%u suid=%u fsuid=%u"
@@ -1901,11 +1893,11 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 			 from_kgid(&init_user_ns, cred->egid),
 			 from_kgid(&init_user_ns, cred->sgid),
 			 from_kgid(&init_user_ns, cred->fsgid),
-			 tty, audit_get_sessionid(tsk));
-
+			 tty ? tty_name(tty) : "(none)",
+			 audit_get_sessionid(tsk));
+	audit_put_tty(tty);
 	audit_log_format(ab, " comm=");
 	audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
-
 	audit_log_d_path_exe(ab, tsk->mm);
 	audit_log_task_context(ab);
 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 195ffaee50b9..71e14d836e69 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1980,6 +1980,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
 {
 	struct audit_buffer *ab;
 	uid_t uid, oldloginuid, loginuid;
+	struct tty_struct *tty;
 
 	if (!audit_enabled)
 		return;
@@ -1987,14 +1988,17 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
 	uid = from_kuid(&init_user_ns, task_uid(current));
 	oldloginuid = from_kuid(&init_user_ns, koldloginuid);
 	loginuid = from_kuid(&init_user_ns, kloginuid),
+	tty = audit_get_tty(current);
 
 	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
 	if (!ab)
 		return;
 	audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
 	audit_log_task_context(ab);
-	audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d",
-			 oldloginuid, loginuid, oldsessionid, sessionid, !rc);
+	audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
+			 oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
+			 oldsessionid, sessionid, !rc);
+	audit_put_tty(tty);
 	audit_log_end(ab);
 }
 
-- 
cgit v1.2.3


From 8358b02bf67d3a5d8a825070e1aa73f25fb2e4c7 Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Tue, 26 Apr 2016 22:26:26 +0200
Subject: bpf: fix double-fdput in replace_map_fd_with_map_ptr()

When bpf(BPF_PROG_LOAD, ...) was invoked with a BPF program whose bytecode
references a non-map file descriptor as a map file descriptor, the error
handling code called fdput() twice instead of once (in __bpf_map_get() and
in replace_map_fd_with_map_ptr()). If the file descriptor table of the
current task is shared, this causes f_count to be decremented too much,
allowing the struct file to be freed while it is still in use
(use-after-free). This can be exploited to gain root privileges by an
unprivileged user.

This bug was introduced in
commit 0246e64d9a5f ("bpf: handle pseudo BPF_LD_IMM64 insn"), but is only
exploitable since
commit 1be7f75d1668 ("bpf: enable non-root eBPF programs") because
previously, CAP_SYS_ADMIN was required to reach the vulnerable code.

(posted publicly according to request by maintainer)

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 618ef77c302a..db2574e7b8b0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2030,7 +2030,6 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
 			if (IS_ERR(map)) {
 				verbose("fd %d is not pointing to valid bpf_map\n",
 					insn->imm);
-				fdput(f);
 				return PTR_ERR(map);
 			}
 
-- 
cgit v1.2.3


From 7132e2d669bd42c3783327f301aaac5f4463299b Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
Date: Mon, 25 Apr 2016 18:56:14 -0300
Subject: ftrace: Match dot symbols when searching functions on ppc64

In the ppc64 big endian ABI, function symbols point to function
descriptors. The symbols which point to the function entry points
have a dot in front of the function name. Consequently, when the
ftrace filter mechanism searches for the symbol corresponding to
an entry point address, it gets the dot symbol.

As a result, ftrace filter users have to be aware of this ABI detail on
ppc64 and prepend a dot to the function name when setting the filter.

The perf probe command insulates the user from this by ignoring the dot
in front of the symbol name when matching function names to symbols,
but the sysfs interface does not. This patch makes the ftrace filter
mechanism do the same when searching symbols.

Fixes the following failure in ftracetest's kprobe_ftrace.tc:

  .../kprobe_ftrace.tc: line 9: echo: write error: Invalid argument

That failure is on this line of kprobe_ftrace.tc:

  echo _do_fork > set_ftrace_filter

This is because there's no _do_fork entry in the functions list:

  # cat available_filter_functions | grep _do_fork
  ._do_fork

This change introduces no regressions on the perf and ftracetest
testsuite results.

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/ftrace.c | 10 ++++++++++
 kernel/trace/ftrace.c        | 12 ++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index 9dac18dabd03..1123a4d8d8dd 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -607,3 +607,13 @@ unsigned long __init arch_syscall_addr(int nr)
 	return sys_call_table[nr*2];
 }
 #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */
+
+#if defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2)
+char *arch_ftrace_match_adjust(char *str, const char *search)
+{
+	if (str[0] == '.' && search[0] != '.')
+		return str + 1;
+	else
+		return str;
+}
+#endif /* defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2) */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7e8d792da963..a6c8252d7776 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3456,11 +3456,23 @@ struct ftrace_glob {
 	int type;
 };
 
+/*
+ * If symbols in an architecture don't correspond exactly to the user-visible
+ * name of what they represent, it is possible to define this function to
+ * perform the necessary adjustments.
+*/
+char * __weak arch_ftrace_match_adjust(char *str, const char *search)
+{
+	return str;
+}
+
 static int ftrace_match(char *str, struct ftrace_glob *g)
 {
 	int matched = 0;
 	int slen;
 
+	str = arch_ftrace_match_adjust(str, g->search);
+
 	switch (g->type) {
 	case MATCH_FULL:
 		if (strcmp(str, g->search) == 0)
-- 
cgit v1.2.3


From dad56ee742a3abbb5d9e8108f8537d412bff3f57 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 26 Apr 2016 21:22:22 -0400
Subject: tracing: Move event_trigger_unlock_commit{_regs}() to local header

The functions event_trigger_unlock_commit() and
event_trigger_unlock_commit_regs() are no longer used outside the tracing
system. Move them out of the generic headers and into the local one.

Along with __event_trigger_test_discard() that is only used by them.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 94 --------------------------------------------
 kernel/trace/trace.h         | 94 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+), 94 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 5f89a5b0c7e6..70a181cb3585 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -452,100 +452,6 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
 	return false;
 }
 
-/*
- * Helper function for event_trigger_unlock_commit{_regs}().
- * If there are event triggers attached to this event that requires
- * filtering against its fields, then they wil be called as the
- * entry already holds the field information of the current event.
- *
- * It also checks if the event should be discarded or not.
- * It is to be discarded if the event is soft disabled and the
- * event was only recorded to process triggers, or if the event
- * filter is active and this event did not match the filters.
- *
- * Returns true if the event is discarded, false otherwise.
- */
-static inline bool
-__event_trigger_test_discard(struct trace_event_file *file,
-			     struct ring_buffer *buffer,
-			     struct ring_buffer_event *event,
-			     void *entry,
-			     enum event_trigger_type *tt)
-{
-	unsigned long eflags = file->flags;
-
-	if (eflags & EVENT_FILE_FL_TRIGGER_COND)
-		*tt = event_triggers_call(file, entry);
-
-	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags))
-		ring_buffer_discard_commit(buffer, event);
-	else if (!filter_check_discard(file, entry, buffer, event))
-		return false;
-
-	return true;
-}
-
-/**
- * event_trigger_unlock_commit - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
- * @buffer: The ring buffer that the event is being written to
- * @event: The event meta data in the ring buffer
- * @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
- *
- * This is a helper function to handle triggers that require data
- * from the event itself. It also tests the event against filters and
- * if the event is soft disabled and should be discarded.
- */
-static inline void
-event_trigger_unlock_commit(struct trace_event_file *file,
-			    struct ring_buffer *buffer,
-			    struct ring_buffer_event *event,
-			    void *entry, unsigned long irq_flags, int pc)
-{
-	enum event_trigger_type tt = ETT_NONE;
-
-	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
-		trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
-
-	if (tt)
-		event_triggers_post_call(file, tt, entry);
-}
-
-/**
- * event_trigger_unlock_commit_regs - handle triggers and finish event commit
- * @file: The file pointer assoctiated to the event
- * @buffer: The ring buffer that the event is being written to
- * @event: The event meta data in the ring buffer
- * @entry: The event itself
- * @irq_flags: The state of the interrupts at the start of the event
- * @pc: The state of the preempt count at the start of the event.
- *
- * This is a helper function to handle triggers that require data
- * from the event itself. It also tests the event against filters and
- * if the event is soft disabled and should be discarded.
- *
- * Same as event_trigger_unlock_commit() but calls
- * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
- */
-static inline void
-event_trigger_unlock_commit_regs(struct trace_event_file *file,
-				 struct ring_buffer *buffer,
-				 struct ring_buffer_event *event,
-				 void *entry, unsigned long irq_flags, int pc,
-				 struct pt_regs *regs)
-{
-	enum event_trigger_type tt = ETT_NONE;
-
-	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
-		trace_buffer_unlock_commit_regs(file->tr, buffer, event,
-						irq_flags, pc, regs);
-
-	if (tt)
-		event_triggers_post_call(file, tt, entry);
-}
-
 #ifdef CONFIG_BPF_EVENTS
 unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
 #else
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 727a3d28bce5..c0eac7b1e5a6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1065,6 +1065,100 @@ struct trace_subsystem_dir {
 	int				nr_events;
 };
 
+/*
+ * Helper function for event_trigger_unlock_commit{_regs}().
+ * If there are event triggers attached to this event that requires
+ * filtering against its fields, then they wil be called as the
+ * entry already holds the field information of the current event.
+ *
+ * It also checks if the event should be discarded or not.
+ * It is to be discarded if the event is soft disabled and the
+ * event was only recorded to process triggers, or if the event
+ * filter is active and this event did not match the filters.
+ *
+ * Returns true if the event is discarded, false otherwise.
+ */
+static inline bool
+__event_trigger_test_discard(struct trace_event_file *file,
+			     struct ring_buffer *buffer,
+			     struct ring_buffer_event *event,
+			     void *entry,
+			     enum event_trigger_type *tt)
+{
+	unsigned long eflags = file->flags;
+
+	if (eflags & EVENT_FILE_FL_TRIGGER_COND)
+		*tt = event_triggers_call(file, entry);
+
+	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags))
+		ring_buffer_discard_commit(buffer, event);
+	else if (!filter_check_discard(file, entry, buffer, event))
+		return false;
+
+	return true;
+}
+
+/**
+ * event_trigger_unlock_commit - handle triggers and finish event commit
+ * @file: The file pointer assoctiated to the event
+ * @buffer: The ring buffer that the event is being written to
+ * @event: The event meta data in the ring buffer
+ * @entry: The event itself
+ * @irq_flags: The state of the interrupts at the start of the event
+ * @pc: The state of the preempt count at the start of the event.
+ *
+ * This is a helper function to handle triggers that require data
+ * from the event itself. It also tests the event against filters and
+ * if the event is soft disabled and should be discarded.
+ */
+static inline void
+event_trigger_unlock_commit(struct trace_event_file *file,
+			    struct ring_buffer *buffer,
+			    struct ring_buffer_event *event,
+			    void *entry, unsigned long irq_flags, int pc)
+{
+	enum event_trigger_type tt = ETT_NONE;
+
+	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
+		trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
+
+	if (tt)
+		event_triggers_post_call(file, tt, entry);
+}
+
+/**
+ * event_trigger_unlock_commit_regs - handle triggers and finish event commit
+ * @file: The file pointer assoctiated to the event
+ * @buffer: The ring buffer that the event is being written to
+ * @event: The event meta data in the ring buffer
+ * @entry: The event itself
+ * @irq_flags: The state of the interrupts at the start of the event
+ * @pc: The state of the preempt count at the start of the event.
+ *
+ * This is a helper function to handle triggers that require data
+ * from the event itself. It also tests the event against filters and
+ * if the event is soft disabled and should be discarded.
+ *
+ * Same as event_trigger_unlock_commit() but calls
+ * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
+ */
+static inline void
+event_trigger_unlock_commit_regs(struct trace_event_file *file,
+				 struct ring_buffer *buffer,
+				 struct ring_buffer_event *event,
+				 void *entry, unsigned long irq_flags, int pc,
+				 struct pt_regs *regs)
+{
+	enum event_trigger_type tt = ETT_NONE;
+
+	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
+		trace_buffer_unlock_commit_regs(file->tr, buffer, event,
+						irq_flags, pc, regs);
+
+	if (tt)
+		event_triggers_post_call(file, tt, entry);
+}
+
 #define FILTER_PRED_INVALID	((unsigned short)-1)
 #define FILTER_PRED_IS_RIGHT	(1 << 15)
 #define FILTER_PRED_FOLD	(1 << 15)
-- 
cgit v1.2.3


From c5dfd78eb79851e278b7973031b9ca363da87a7e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 21 Apr 2016 12:28:50 -0300
Subject: perf core: Allow setting up max frame stack depth via sysctl

The default remains 127, which is good for most cases, and not even hit
most of the time, but then for some cases, as reported by Brendan, 1024+
deep frames are appearing on the radar for things like groovy, ruby.

And in some workloads putting a _lower_ cap on this may make sense. One
that is per event still needs to be put in place tho.

The new file is:

  # cat /proc/sys/kernel/perf_event_max_stack
  127

Chaging it:

  # echo 256 > /proc/sys/kernel/perf_event_max_stack
  # cat /proc/sys/kernel/perf_event_max_stack
  256

But as soon as there is some event using callchains we get:

  # echo 512 > /proc/sys/kernel/perf_event_max_stack
  -bash: echo: write error: Device or resource busy
  #

Because we only allocate the callchain percpu data structures when there
is a user, which allows for changing the max easily, its just a matter
of having no callchain users at that point.

Reported-and-Tested-by: Brendan Gregg <brendan.d.gregg@gmail.com>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: David Ahern <dsahern@gmail.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/r/20160426002928.GB16708@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 Documentation/sysctl/kernel.txt    | 14 ++++++++++++++
 arch/arm/kernel/perf_callchain.c   |  2 +-
 arch/arm64/kernel/perf_callchain.c |  4 ++--
 arch/metag/kernel/perf_callchain.c |  2 +-
 arch/mips/kernel/perf_event.c      |  4 ++--
 arch/powerpc/perf/callchain.c      |  4 ++--
 arch/sparc/kernel/perf_event.c     |  6 +++---
 arch/x86/events/core.c             |  4 ++--
 arch/xtensa/kernel/perf_event.c    |  4 ++--
 include/linux/perf_event.h         |  8 ++++++--
 kernel/bpf/stackmap.c              |  8 ++++----
 kernel/events/callchain.c          | 35 +++++++++++++++++++++++++++++++++--
 kernel/sysctl.c                    | 12 ++++++++++++
 13 files changed, 84 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57653a44b128..260cde08e92e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -60,6 +60,7 @@ show up in /proc/sys/kernel:
 - panic_on_warn
 - perf_cpu_time_max_percent
 - perf_event_paranoid
+- perf_event_max_stack
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -654,6 +655,19 @@ users (without CAP_SYS_ADMIN).  The default value is 1.
 
 ==============================================================
 
+perf_event_max_stack:
+
+Controls maximum number of stack frames to copy for (attr.sample_type &
+PERF_SAMPLE_CALLCHAIN) configured events, for instance, when using
+'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 127.
+
+==============================================================
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 4e02ae5950ff..27563befa8a2 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+	while ((entry->nr < sysctl_perf_event_max_stack) &&
 	       tail && !((unsigned long)tail & 0x3))
 		tail = user_backtrace(tail, entry);
 }
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index ff4665462a02..32c3c6e70119 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct frame_tail __user *)regs->regs[29];
 
-		while (entry->nr < PERF_MAX_STACK_DEPTH &&
+		while (entry->nr < sysctl_perf_event_max_stack &&
 		       tail && !((unsigned long)tail & 0xf))
 			tail = user_backtrace(tail, entry);
 	} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-		while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
+		while ((entry->nr < sysctl_perf_event_max_stack) &&
 			tail && !((unsigned long)tail & 0x3))
 			tail = compat_user_backtrace(tail, entry);
 #endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 315633461a94..252abc12a5a3 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	--frame;
 
-	while ((entry->nr < PERF_MAX_STACK_DEPTH) && frame)
+	while ((entry->nr < sysctl_perf_event_max_stack) && frame)
 		frame = user_backtrace(frame, entry);
 }
 
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index c1cf9c6c3f77..5021c546ad07 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
 		addr = *sp++;
 		if (__kernel_text_address(addr)) {
 			perf_callchain_store(entry, addr);
-			if (entry->nr >= PERF_MAX_STACK_DEPTH)
+			if (entry->nr >= sysctl_perf_event_max_stack)
 				break;
 		}
 	}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 	}
 	do {
 		perf_callchain_store(entry, pc);
-		if (entry->nr >= PERF_MAX_STACK_DEPTH)
+		if (entry->nr >= sysctl_perf_event_max_stack)
 			break;
 		pc = unwind_stack(current, &sp, pc, &ra);
 	} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index e04a6752b399..22d9015c1acc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned long __user *) sp;
 		if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
 			return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		fp = (unsigned int __user *) (unsigned long) sp;
 		if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
 			return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 6596f66ce112..a4b8b5aed21c 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			}
 		}
 #endif
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 		pc = sf.callers_pc;
 		ufp = (unsigned long)sf.fp + STACK_BIAS;
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 static void perf_callchain_user_32(struct perf_callchain_entry *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 			ufp = (unsigned long)sf.fp;
 		}
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < PERF_MAX_STACK_DEPTH);
+	} while (entry->nr < sysctl_perf_event_max_stack);
 }
 
 void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 041e442a3e28..41d93d0e972b 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2277,7 +2277,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
 	fp = compat_ptr(ss_base + regs->bp);
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
 		frame.return_address = 0;
@@ -2337,7 +2337,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 
 	pagefault_disable();
-	while (entry->nr < PERF_MAX_STACK_DEPTH) {
+	while (entry->nr < sysctl_perf_event_max_stack) {
 		unsigned long bytes;
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index 54f01188c29c..a6b00b3af429 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -332,14 +332,14 @@ static int callchain_trace(struct stackframe *frame, void *data)
 void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			   struct pt_regs *regs)
 {
-	xtensa_backtrace_kernel(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
 				callchain_trace, NULL, entry);
 }
 
 void perf_callchain_user(struct perf_callchain_entry *entry,
 			 struct pt_regs *regs)
 {
-	xtensa_backtrace_user(regs, PERF_MAX_STACK_DEPTH,
+	xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
 			      callchain_trace, entry);
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 85749ae8cb5f..a090700cccca 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -58,7 +58,7 @@ struct perf_guest_info_callbacks {
 
 struct perf_callchain_entry {
 	__u64				nr;
-	__u64				ip[PERF_MAX_STACK_DEPTH];
+	__u64				ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
 };
 
 struct perf_raw_record {
@@ -993,9 +993,11 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
+extern int sysctl_perf_event_max_stack;
+
 static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
 {
-	if (entry->nr < PERF_MAX_STACK_DEPTH) {
+	if (entry->nr < sysctl_perf_event_max_stack) {
 		entry->ip[entry->nr++] = ip;
 		return 0;
 	} else {
@@ -1017,6 +1019,8 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
 
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos);
 
 static inline bool perf_paranoid_tracepoint_raw(void)
 {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..f5a19548be12 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    value_size < 8 || value_size % 8 ||
-	    value_size / 8 > PERF_MAX_STACK_DEPTH)
+	    value_size / 8 > sysctl_perf_event_max_stack)
 		return ERR_PTR(-EINVAL);
 
 	/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
 	u32 max_depth = map->value_size / 8;
-	/* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
-	u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	u32 hash, id, trace_nr, trace_len;
 	bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 		return -EFAULT;
 
 	/* get_perf_callchain() guarantees that trace->nr >= init_nr
-	 * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
 	 */
 	trace_nr = trace->nr - init_nr;
 
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f5e867..b9325e7dcba1 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
 	struct perf_callchain_entry	*cpu_entries[0];
 };
 
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+	return (sizeof(struct perf_callchain_entry) +
+		sizeof(__u64) * sysctl_perf_event_max_stack);
+}
+
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
 static atomic_t nr_callchain_events;
 static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
 	if (!entries)
 		return -ENOMEM;
 
-	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+	size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
 
 	for_each_possible_cpu(cpu) {
 		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
 
 	cpu = smp_processor_id();
 
-	return &entries->cpu_entries[cpu][*rctx];
+	return (((void *)entries->cpu_entries[cpu]) +
+		(*rctx * perf_callchain_entry__sizeof()));
 }
 
 static void
@@ -215,3 +224,25 @@ exit_put:
 
 	return entry;
 }
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int new_value = sysctl_perf_event_max_stack, ret;
+	struct ctl_table new_table = *table;
+
+	new_table.data = &new_value;
+	ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+	if (ret || !write)
+		return ret;
+
+	mutex_lock(&callchain_mutex);
+	if (atomic_read(&nr_callchain_events))
+		ret = -EBUSY;
+	else
+		sysctl_perf_event_max_stack = new_value;
+
+	mutex_unlock(&callchain_mutex);
+
+	return ret;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 725587f10667..c8b318663525 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -130,6 +130,9 @@ static int one_thousand = 1000;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
+#ifdef CONFIG_PERF_EVENTS
+static int six_hundred_forty_kb = 640 * 1024;
+#endif
 
 /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
 static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
@@ -1144,6 +1147,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.procname	= "perf_event_max_stack",
+		.data		= NULL, /* filled in by handler */
+		.maxlen		= sizeof(sysctl_perf_event_max_stack),
+		.mode		= 0644,
+		.proc_handler	= perf_event_max_stack_handler,
+		.extra1		= &zero,
+		.extra2		= &six_hundred_forty_kb,
+	},
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{
-- 
cgit v1.2.3


From 65da9a0a3bf2202c2432f42d41eb908f2fa30579 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 27 Apr 2016 10:13:46 -0400
Subject: tracing: Make filter_check_discard() local

Nothing outside of the tracing directory calls filter_check_discard() or
check_filter_check_discard(). They should not be called by modules. Move
their prototypes into the local tracing header and remove their
EXPORT_SYMBOL() macros.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 6 ------
 kernel/trace/trace.c         | 2 --
 kernel/trace/trace.h         | 6 ++++++
 3 files changed, 6 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 70a181cb3585..bb383af35cc7 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -413,12 +413,6 @@ enum event_trigger_type {
 
 extern int filter_match_preds(struct event_filter *filter, void *rec);
 
-extern int filter_check_discard(struct trace_event_file *file, void *rec,
-				struct ring_buffer *buffer,
-				struct ring_buffer_event *event);
-extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
-				     struct ring_buffer *buffer,
-				     struct ring_buffer_event *event);
 extern enum event_trigger_type event_triggers_call(struct trace_event_file *file,
 						   void *rec);
 extern void event_triggers_post_call(struct trace_event_file *file,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 46028d47d252..02f5a5f51d49 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -318,7 +318,6 @@ int filter_check_discard(struct trace_event_file *file, void *rec,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(filter_check_discard);
 
 int call_filter_check_discard(struct trace_event_call *call, void *rec,
 			      struct ring_buffer *buffer,
@@ -332,7 +331,6 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(call_filter_check_discard);
 
 static cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
 {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c0eac7b1e5a6..ee8691c66bfe 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1065,6 +1065,12 @@ struct trace_subsystem_dir {
 	int				nr_events;
 };
 
+extern int filter_check_discard(struct trace_event_file *file, void *rec,
+				struct ring_buffer *buffer,
+				struct ring_buffer_event *event);
+extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
+				     struct ring_buffer *buffer,
+				     struct ring_buffer_event *event);
 /*
  * Helper function for event_trigger_unlock_commit{_regs}().
  * If there are event triggers attached to this event that requires
-- 
cgit v1.2.3


From 2548d546d40c0014efdde88a53bf7896e917dcce Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 21 Apr 2016 18:03:15 +0200
Subject: nohz/full, sched/rt: Fix missed tick-reenabling bug in
 sched_can_stop_tick()

Chris Metcalf reported a that sched_can_stop_tick() sometimes fails to
re-enable the tick.

His observed problem is that rq->cfs.nr_running can be 1 even though
there are multiple runnable CFS tasks. This happens in the cgroup
case, in which case cfs.nr_running is the number of runnable entities
for that level.

If there is a single runnable cgroup (which can have an arbitrary
number of runnable child entries itself) rq->cfs.nr_running will be 1.

However, looking at that function I think there's more problems with it.

It seems to assume that if there's FIFO tasks, those will run. This is
incorrect. The FIFO task can have a lower prio than an RR task, in which
case the RR task will run.

So the whole fifo_nr_running test seems misplaced, it should go after
the rr_nr_running tests. That is, only if !rr_nr_running, can we use
fifo_nr_running like this.

Reported-by: Chris Metcalf <cmetcalf@mellanox.com>
Tested-by: Chris Metcalf <cmetcalf@mellanox.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Wanpeng Li <kernellwp@gmail.com>
Fixes: 76d92ac305f2 ("sched: Migrate sched to use new tick dependency mask model")
Link: http://lkml.kernel.org/r/20160421160315.GK24771@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b489fcac37b..d1f7149f8704 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -596,17 +596,8 @@ bool sched_can_stop_tick(struct rq *rq)
 		return false;
 
 	/*
-	 * FIFO realtime policy runs the highest priority task (after DEADLINE).
-	 * Other runnable tasks are of a lower priority. The scheduler tick
-	 * isn't needed.
-	 */
-	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
-	if (fifo_nr_running)
-		return true;
-
-	/*
-	 * Round-robin realtime tasks time slice with other tasks at the same
-	 * realtime priority.
+	 * If there are more than one RR tasks, we need the tick to effect the
+	 * actual RR behaviour.
 	 */
 	if (rq->rt.rr_nr_running) {
 		if (rq->rt.rr_nr_running == 1)
@@ -615,8 +606,20 @@ bool sched_can_stop_tick(struct rq *rq)
 			return false;
 	}
 
-	/* Normal multitasking need periodic preemption checks */
-	if (rq->cfs.nr_running > 1)
+	/*
+	 * If there's no RR tasks, but FIFO tasks, we can skip the tick, no
+	 * forced preemption between FIFO tasks.
+	 */
+	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
+	if (fifo_nr_running)
+		return true;
+
+	/*
+	 * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
+	 * if there's more than one we need the tick for involuntary
+	 * preemption.
+	 */
+	if (rq->nr_running > 1)
 		return false;
 
 	return true;
-- 
cgit v1.2.3


From 79c9ce57eb2d5f1497546a3946b4ae21b6fdc438 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 26 Apr 2016 11:36:53 +0200
Subject: perf/core: Fix perf_event_open() vs. execve() race

Jann reported that the ptrace_may_access() check in
find_lively_task_by_vpid() is racy against exec().

Specifically:

  perf_event_open()		execve()

  ptrace_may_access()
				commit_creds()
  ...				if (get_dumpable() != SUID_DUMP_USER)
				  perf_event_exit_task();
  perf_install_in_context()

would result in installing a counter across the creds boundary.

Fix this by wrapping lots of perf_event_open() in cred_guard_mutex.
This should be fine as perf_event_exit_task() is already called with
cred_guard_mutex held, so all perf locks already nest inside it.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 52 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2c78b6f47339..4e2ebf6f2f1f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1106,6 +1106,7 @@ static void put_ctx(struct perf_event_context *ctx)
  * function.
  *
  * Lock order:
+ *    cred_guard_mutex
  *	task_struct::perf_event_mutex
  *	  perf_event_context::mutex
  *	    perf_event::child_mutex;
@@ -3421,7 +3422,6 @@ static struct task_struct *
 find_lively_task_by_vpid(pid_t vpid)
 {
 	struct task_struct *task;
-	int err;
 
 	rcu_read_lock();
 	if (!vpid)
@@ -3435,16 +3435,7 @@ find_lively_task_by_vpid(pid_t vpid)
 	if (!task)
 		return ERR_PTR(-ESRCH);
 
-	/* Reuse ptrace permission checks for now. */
-	err = -EACCES;
-	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
-		goto errout;
-
 	return task;
-errout:
-	put_task_struct(task);
-	return ERR_PTR(err);
-
 }
 
 /*
@@ -8414,6 +8405,24 @@ SYSCALL_DEFINE5(perf_event_open,
 
 	get_online_cpus();
 
+	if (task) {
+		err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
+		if (err)
+			goto err_cpus;
+
+		/*
+		 * Reuse ptrace permission checks for now.
+		 *
+		 * We must hold cred_guard_mutex across this and any potential
+		 * perf_install_in_context() call for this new event to
+		 * serialize against exec() altering our credentials (and the
+		 * perf_event_exit_task() that could imply).
+		 */
+		err = -EACCES;
+		if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
+			goto err_cred;
+	}
+
 	if (flags & PERF_FLAG_PID_CGROUP)
 		cgroup_fd = pid;
 
@@ -8421,7 +8430,7 @@ SYSCALL_DEFINE5(perf_event_open,
 				 NULL, NULL, cgroup_fd);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
-		goto err_cpus;
+		goto err_cred;
 	}
 
 	if (is_sampling_event(event)) {
@@ -8480,11 +8489,6 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_context;
 	}
 
-	if (task) {
-		put_task_struct(task);
-		task = NULL;
-	}
-
 	/*
 	 * Look up the group leader (we will attach this event to it):
 	 */
@@ -8582,6 +8586,11 @@ SYSCALL_DEFINE5(perf_event_open,
 
 	WARN_ON_ONCE(ctx->parent_ctx);
 
+	/*
+	 * This is the point on no return; we cannot fail hereafter. This is
+	 * where we start modifying current state.
+	 */
+
 	if (move_group) {
 		/*
 		 * See perf_event_ctx_lock() for comments on the details
@@ -8653,6 +8662,11 @@ SYSCALL_DEFINE5(perf_event_open,
 		mutex_unlock(&gctx->mutex);
 	mutex_unlock(&ctx->mutex);
 
+	if (task) {
+		mutex_unlock(&task->signal->cred_guard_mutex);
+		put_task_struct(task);
+	}
+
 	put_online_cpus();
 
 	mutex_lock(&current->perf_event_mutex);
@@ -8685,6 +8699,9 @@ err_alloc:
 	 */
 	if (!event_file)
 		free_event(event);
+err_cred:
+	if (task)
+		mutex_unlock(&task->signal->cred_guard_mutex);
 err_cpus:
 	put_online_cpus();
 err_task:
@@ -8969,6 +8986,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 
 /*
  * When a child task exits, feed back event values to parent events.
+ *
+ * Can be called with cred_guard_mutex held when called from
+ * install_exec_creds().
  */
 void perf_event_exit_task(struct task_struct *child)
 {
-- 
cgit v1.2.3


From 594dd290cf5403a9a5818619dfff42d8e8e0518e Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Fri, 22 Apr 2016 17:07:24 +0800
Subject: sched/cpufreq: Optimize cpufreq update kicker to avoid update
 multiple times

Sometimes delta_exec is 0 due to update_curr() is called multiple times,
this is captured by:

	u64 delta_exec = rq_clock_task(rq) - curr->se.exec_start;

This patch optimizes the cpufreq update kicker by bailing out when nothing
changed, it will benefit the upcoming schedutil, since otherwise it will
(over)react to the special util/max combination.

Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1461316044-9520-1-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 8 ++++----
 kernel/sched/rt.c       | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index affd97ec9f65..8f9b5af4e857 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -717,10 +717,6 @@ static void update_curr_dl(struct rq *rq)
 	if (!dl_task(curr) || !on_dl_rq(dl_se))
 		return;
 
-	/* Kick cpufreq (see the comment in linux/cpufreq.h). */
-	if (cpu_of(rq) == smp_processor_id())
-		cpufreq_trigger_update(rq_clock(rq));
-
 	/*
 	 * Consumed budget is computed considering the time as
 	 * observed by schedulable tasks (excluding time spent
@@ -736,6 +732,10 @@ static void update_curr_dl(struct rq *rq)
 		return;
 	}
 
+	/* kick cpufreq (see the comment in linux/cpufreq.h). */
+	if (cpu_of(rq) == smp_processor_id())
+		cpufreq_trigger_update(rq_clock(rq));
+
 	schedstat_set(curr->se.statistics.exec_max,
 		      max(curr->se.statistics.exec_max, delta_exec));
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c41ea7ac1764..19e13060fcd5 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -953,14 +953,14 @@ static void update_curr_rt(struct rq *rq)
 	if (curr->sched_class != &rt_sched_class)
 		return;
 
-	/* Kick cpufreq (see the comment in linux/cpufreq.h). */
-	if (cpu_of(rq) == smp_processor_id())
-		cpufreq_trigger_update(rq_clock(rq));
-
 	delta_exec = rq_clock_task(rq) - curr->se.exec_start;
 	if (unlikely((s64)delta_exec <= 0))
 		return;
 
+	/* Kick cpufreq (see the comment in linux/cpufreq.h). */
+	if (cpu_of(rq) == smp_processor_id())
+		cpufreq_trigger_update(rq_clock(rq));
+
 	schedstat_set(curr->se.statistics.exec_max,
 		      max(curr->se.statistics.exec_max, delta_exec));
 
-- 
cgit v1.2.3


From 5db4298133d99b3dfc60d6899ac9df169769c899 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 26 Apr 2016 10:22:08 -0700
Subject: lcoking/locktorture: Simplify the torture_runnable computation

This commit replaces an #ifdef with IS_ENABLED(), saving five lines.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: corbet@lwn.net
Cc: dave@stgolabs.net
Cc: dhowells@redhat.com
Cc: linux-doc@vger.kernel.org
Cc: will.deacon@arm.com
Link: http://lkml.kernel.org/r/1461691328-5429-4-git-send-email-paulmck@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/locktorture.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index d066a50dc87e..f8c5af52a131 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -75,12 +75,7 @@ struct lock_stress_stats {
 	long n_lock_acquired;
 };
 
-#if defined(MODULE)
-#define LOCKTORTURE_RUNNABLE_INIT 1
-#else
-#define LOCKTORTURE_RUNNABLE_INIT 0
-#endif
-int torture_runnable = LOCKTORTURE_RUNNABLE_INIT;
+int torture_runnable = IS_ENABLED(MODULE);
 module_param(torture_runnable, int, 0444);
 MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init");
 
-- 
cgit v1.2.3


From f98db6013c557c216da5038d9c52045be55cd039 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 26 Apr 2016 09:39:06 -0700
Subject: sched/core: Add switch_mm_irqs_off() and use it in the scheduler

By default, this is the same thing as switch_mm().

x86 will override it as an optimization.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/df401df47bdd6be3e389c6f1e3f5310d70e81b2c.1461688545.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mmu_context.h | 7 +++++++
 kernel/sched/core.c         | 6 +++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index 70fffeba7495..a4441784503b 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -1,9 +1,16 @@
 #ifndef _LINUX_MMU_CONTEXT_H
 #define _LINUX_MMU_CONTEXT_H
 
+#include <asm/mmu_context.h>
+
 struct mm_struct;
 
 void use_mm(struct mm_struct *mm);
 void unuse_mm(struct mm_struct *mm);
 
+/* Architectures that care about IRQ state in switch_mm can override this. */
+#ifndef switch_mm_irqs_off
+# define switch_mm_irqs_off switch_mm
+#endif
+
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9d84d6004745..adcafda7ffc8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -33,7 +33,7 @@
 #include <linux/init.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
-#include <asm/mmu_context.h>
+#include <linux/mmu_context.h>
 #include <linux/interrupt.h>
 #include <linux/capability.h>
 #include <linux/completion.h>
@@ -2733,7 +2733,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		atomic_inc(&oldmm->mm_count);
 		enter_lazy_tlb(oldmm, next);
 	} else
-		switch_mm(oldmm, mm, next);
+		switch_mm_irqs_off(oldmm, mm, next);
 
 	if (!prev->mm) {
 		prev->active_mm = NULL;
@@ -5274,7 +5274,7 @@ void idle_task_exit(void)
 	BUG_ON(cpu_online(smp_processor_id()));
 
 	if (mm != &init_mm) {
-		switch_mm(mm, &init_mm, current);
+		switch_mm_irqs_off(mm, &init_mm, current);
 		finish_arch_post_lock_switch();
 	}
 	mmdrop(mm);
-- 
cgit v1.2.3


From f6cf0545ec697ddc278b7457b7d0c0d86a2ea88e Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Wed, 27 Apr 2016 17:47:11 +0100
Subject: PM / Hibernate: Call flush_icache_range() on pages restored in-place

Some architectures require code written to memory as if it were data to be
'cleaned' from any data caches before the processor can fetch them as new
instructions.

During resume from hibernate, the snapshot code copies some pages directly,
meaning these architectures do not get a chance to perform their cache
maintenance. Modify the read and decompress code to call
flush_icache_range() on all pages that are restored, so that the restored
in-place pages are guaranteed to be executable on these architectures.

Signed-off-by: James Morse <james.morse@arm.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Rafael J. Wysocki <rjw@rjwysocki.net>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
[will: make clean_pages_on_* static and remove initialisers]
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 kernel/power/swap.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 12cd989dadf6..160e1006640d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -36,6 +36,14 @@
 
 #define HIBERNATE_SIG	"S1SUSPEND"
 
+/*
+ * When reading an {un,}compressed image, we may restore pages in place,
+ * in which case some architectures need these pages cleaning before they
+ * can be executed. We don't know which pages these may be, so clean the lot.
+ */
+static bool clean_pages_on_read;
+static bool clean_pages_on_decompress;
+
 /*
  *	The swap map is a data structure used for keeping track of each page
  *	written to a swap partition.  It consists of many swap_map_page
@@ -241,6 +249,9 @@ static void hib_end_io(struct bio *bio)
 
 	if (bio_data_dir(bio) == WRITE)
 		put_page(page);
+	else if (clean_pages_on_read)
+		flush_icache_range((unsigned long)page_address(page),
+				   (unsigned long)page_address(page) + PAGE_SIZE);
 
 	if (bio->bi_error && !hb->error)
 		hb->error = bio->bi_error;
@@ -1049,6 +1060,7 @@ static int load_image(struct swap_map_handle *handle,
 
 	hib_init_batch(&hb);
 
+	clean_pages_on_read = true;
 	printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
 		nr_to_read);
 	m = nr_to_read / 10;
@@ -1124,6 +1136,10 @@ static int lzo_decompress_threadfn(void *data)
 		d->unc_len = LZO_UNC_SIZE;
 		d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
 		                               d->unc, &d->unc_len);
+		if (clean_pages_on_decompress)
+			flush_icache_range((unsigned long)d->unc,
+					   (unsigned long)d->unc + d->unc_len);
+
 		atomic_set(&d->stop, 1);
 		wake_up(&d->done);
 	}
@@ -1189,6 +1205,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	}
 	memset(crc, 0, offsetof(struct crc_data, go));
 
+	clean_pages_on_decompress = true;
+
 	/*
 	 * Start the decompression threads.
 	 */
-- 
cgit v1.2.3


From 92117d8443bc5afacc8d5ba82e541946310f106e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 27 Apr 2016 18:56:20 -0700
Subject: bpf: fix refcnt overflow

On a system with >32Gbyte of phyiscal memory and infinite RLIMIT_MEMLOCK,
the malicious application may overflow 32-bit bpf program refcnt.
It's also possible to overflow map refcnt on 1Tb system.
Impose 32k hard limit which means that the same bpf program or
map cannot be shared by more than 32k processes.

Fixes: 1be7f75d1668 ("bpf: enable non-root eBPF programs")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   |  3 ++-
 kernel/bpf/inode.c    |  7 ++++---
 kernel/bpf/syscall.c  | 24 ++++++++++++++++++++----
 kernel/bpf/verifier.c | 11 +++++++----
 4 files changed, 33 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 21ee41b92e8a..f1d5c5acc8dd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -171,12 +171,13 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl);
 void bpf_register_map_type(struct bpf_map_type_list *tl);
 
 struct bpf_prog *bpf_prog_get(u32 ufd);
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog);
 void bpf_prog_put(struct bpf_prog *prog);
 void bpf_prog_put_rcu(struct bpf_prog *prog);
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd);
 struct bpf_map *__bpf_map_get(struct fd f);
-void bpf_map_inc(struct bpf_map *map, bool uref);
+struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_precharge_memlock(u32 pages);
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index f2ece3c174a5..8f94ca1860cf 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -31,10 +31,10 @@ static void *bpf_any_get(void *raw, enum bpf_type type)
 {
 	switch (type) {
 	case BPF_TYPE_PROG:
-		atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt);
+		raw = bpf_prog_inc(raw);
 		break;
 	case BPF_TYPE_MAP:
-		bpf_map_inc(raw, true);
+		raw = bpf_map_inc(raw, true);
 		break;
 	default:
 		WARN_ON_ONCE(1);
@@ -297,7 +297,8 @@ static void *bpf_obj_do_get(const struct filename *pathname,
 		goto out;
 
 	raw = bpf_any_get(inode->i_private, *type);
-	touch_atime(&path);
+	if (!IS_ERR(raw))
+		touch_atime(&path);
 
 	path_put(&path);
 	return raw;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index adc5e4bd74f8..cf5e9f7ad13a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -218,11 +218,18 @@ struct bpf_map *__bpf_map_get(struct fd f)
 	return f.file->private_data;
 }
 
-void bpf_map_inc(struct bpf_map *map, bool uref)
+/* prog's and map's refcnt limit */
+#define BPF_MAX_REFCNT 32768
+
+struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref)
 {
-	atomic_inc(&map->refcnt);
+	if (atomic_inc_return(&map->refcnt) > BPF_MAX_REFCNT) {
+		atomic_dec(&map->refcnt);
+		return ERR_PTR(-EBUSY);
+	}
 	if (uref)
 		atomic_inc(&map->usercnt);
+	return map;
 }
 
 struct bpf_map *bpf_map_get_with_uref(u32 ufd)
@@ -234,7 +241,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 	if (IS_ERR(map))
 		return map;
 
-	bpf_map_inc(map, true);
+	map = bpf_map_inc(map, true);
 	fdput(f);
 
 	return map;
@@ -658,6 +665,15 @@ static struct bpf_prog *__bpf_prog_get(struct fd f)
 	return f.file->private_data;
 }
 
+struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
+{
+	if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) {
+		atomic_dec(&prog->aux->refcnt);
+		return ERR_PTR(-EBUSY);
+	}
+	return prog;
+}
+
 /* called by sockets/tracing/seccomp before attaching program to an event
  * pairs with bpf_prog_put()
  */
@@ -670,7 +686,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 	if (IS_ERR(prog))
 		return prog;
 
-	atomic_inc(&prog->aux->refcnt);
+	prog = bpf_prog_inc(prog);
 	fdput(f);
 
 	return prog;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index db2574e7b8b0..89bcaa0966da 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2049,15 +2049,18 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
 				return -E2BIG;
 			}
 
-			/* remember this map */
-			env->used_maps[env->used_map_cnt++] = map;
-
 			/* hold the map. If the program is rejected by verifier,
 			 * the map will be released by release_maps() or it
 			 * will be used by the valid program until it's unloaded
 			 * and all maps are released in free_bpf_prog_info()
 			 */
-			bpf_map_inc(map, false);
+			map = bpf_map_inc(map, false);
+			if (IS_ERR(map)) {
+				fdput(f);
+				return PTR_ERR(map);
+			}
+			env->used_maps[env->used_map_cnt++] = map;
+
 			fdput(f);
 next_insn:
 			insn++;
-- 
cgit v1.2.3


From 6aff67c85c9e5a4bc99e5211c1bac547936626ca Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 27 Apr 2016 18:56:21 -0700
Subject: bpf: fix check_map_func_compatibility logic

The commit 35578d798400 ("bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter")
introduced clever way to check bpf_helper<->map_type compatibility.
Later on commit a43eec304259 ("bpf: introduce bpf_perf_event_output() helper") adjusted
the logic and inadvertently broke it.
Get rid of the clever bool compare and go back to two-way check
from map and from helper perspective.

Fixes: a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 65 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 40 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 89bcaa0966da..c5c17a62f509 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -239,16 +239,6 @@ static const char * const reg_type_str[] = {
 	[CONST_IMM]		= "imm",
 };
 
-static const struct {
-	int map_type;
-	int func_id;
-} func_limit[] = {
-	{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
-	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
-	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
-	{BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid},
-};
-
 static void print_verifier_state(struct verifier_env *env)
 {
 	enum bpf_reg_type t;
@@ -921,27 +911,52 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 
 static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 {
-	bool bool_map, bool_func;
-	int i;
-
 	if (!map)
 		return 0;
 
-	for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
-		bool_map = (map->map_type == func_limit[i].map_type);
-		bool_func = (func_id == func_limit[i].func_id);
-		/* only when map & func pair match it can continue.
-		 * don't allow any other map type to be passed into
-		 * the special func;
-		 */
-		if (bool_func && bool_map != bool_func) {
-			verbose("cannot pass map_type %d into func %d\n",
-				map->map_type, func_id);
-			return -EINVAL;
-		}
+	/* We need a two way check, first is from map perspective ... */
+	switch (map->map_type) {
+	case BPF_MAP_TYPE_PROG_ARRAY:
+		if (func_id != BPF_FUNC_tail_call)
+			goto error;
+		break;
+	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+		if (func_id != BPF_FUNC_perf_event_read &&
+		    func_id != BPF_FUNC_perf_event_output)
+			goto error;
+		break;
+	case BPF_MAP_TYPE_STACK_TRACE:
+		if (func_id != BPF_FUNC_get_stackid)
+			goto error;
+		break;
+	default:
+		break;
+	}
+
+	/* ... and second from the function itself. */
+	switch (func_id) {
+	case BPF_FUNC_tail_call:
+		if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+			goto error;
+		break;
+	case BPF_FUNC_perf_event_read:
+	case BPF_FUNC_perf_event_output:
+		if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+			goto error;
+		break;
+	case BPF_FUNC_get_stackid:
+		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
+			goto error;
+		break;
+	default:
+		break;
 	}
 
 	return 0;
+error:
+	verbose("cannot pass map_type %d into func %d\n",
+		map->map_type, func_id);
+	return -EINVAL;
 }
 
 static int check_call(struct verifier_env *env, int func_id)
-- 
cgit v1.2.3


From 8639a847b0e11f8d2daa3eafe15a9609c91fd357 Mon Sep 17 00:00:00 2001
From: Atsushi Kumagai <ats-kumagai@wm.jp.nec.com>
Date: Thu, 28 Apr 2016 16:18:18 -0700
Subject: kexec: update VMCOREINFO for compound_order/dtor

makedumpfile refers page.lru.next to get the order of compound pages for
page filtering.

However, now the order is stored in page.compound_order, hence
VMCOREINFO should be updated to export the offset of
page.compound_order.

The fact is, page.compound_order was introduced already in kernel 4.0,
but the offset of it was the same as page.lru.next until kernel 4.3, so
this was not actual problem.

The above can be said also for page.lru.prev and page.compound_dtor,
it's necessary to detect hugetlbfs pages.  Further, the content was
changed from direct address to the ID which means dtor.

The problem is that unnecessary hugepages won't be removed from a dump
file in kernels 4.4.x and later.  This means that extra disk space would
be consumed.  It's a problem, but not critical.

Signed-off-by: Atsushi Kumagai <ats-kumagai@wm.jp.nec.com>
Acked-by: Dave Young <dyoung@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec_core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 8d34308ea449..cbbb4c724149 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1415,6 +1415,8 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_OFFSET(page, lru);
 	VMCOREINFO_OFFSET(page, _mapcount);
 	VMCOREINFO_OFFSET(page, private);
+	VMCOREINFO_OFFSET(page, compound_dtor);
+	VMCOREINFO_OFFSET(page, compound_order);
 	VMCOREINFO_OFFSET(pglist_data, node_zones);
 	VMCOREINFO_OFFSET(pglist_data, nr_zones);
 #ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -1447,8 +1449,8 @@ static int __init crash_save_vmcoreinfo_init(void)
 #ifdef CONFIG_X86
 	VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
 #endif
-#ifdef CONFIG_HUGETLBFS
-	VMCOREINFO_SYMBOL(free_huge_page);
+#ifdef CONFIG_HUGETLB_PAGE
+	VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
 #endif
 
 	arch_crash_save_vmcoreinfo();
-- 
cgit v1.2.3


From d7f53518f713d3d9bf5ed150f943853fb94e7473 Mon Sep 17 00:00:00 2001
From: Atsushi Kumagai <ats-kumagai@wm.jp.nec.com>
Date: Thu, 28 Apr 2016 16:18:21 -0700
Subject: kexec: export OFFSET(page.compound_head) to find out compound tail
 page

PageAnon() always look at head page to check PAGE_MAPPING_ANON and tail
page's page->mapping has just a poisoned data since commit 1c290f642101
("mm: sanitize page->mapping for tail pages").

If makedumpfile checks page->mapping of a compound tail page to
distinguish anonymous page as usual, it must fail in newer kernel.  So
it's necessary to export OFFSET(page.compound_head) to avoid checking
compound tail pages.

The problem is that unnecessary hugepages won't be removed from a dump
file in kernels 4.5.x and later.  This means that extra disk space would
be consumed.  It's a problem, but not critical.

Signed-off-by: Atsushi Kumagai <ats-kumagai@wm.jp.nec.com>
Acked-by: Dave Young <dyoung@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec_core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index cbbb4c724149..1391d3ee3b86 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1417,6 +1417,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_OFFSET(page, private);
 	VMCOREINFO_OFFSET(page, compound_dtor);
 	VMCOREINFO_OFFSET(page, compound_order);
+	VMCOREINFO_OFFSET(page, compound_head);
 	VMCOREINFO_OFFSET(pglist_data, node_zones);
 	VMCOREINFO_OFFSET(pglist_data, nr_zones);
 #ifdef CONFIG_FLAT_NODE_MEM_MAP
-- 
cgit v1.2.3


From bdab42dfc974d15303afbf259f340f374a453974 Mon Sep 17 00:00:00 2001
From: James Morse <james.morse@arm.com>
Date: Thu, 28 Apr 2016 16:18:52 -0700
Subject: kcov: don't trace the code coverage code

Kcov causes the compiler to add a call to __sanitizer_cov_trace_pc() in
every basic block.  Ftrace patches in a call to _mcount() to each
function it has annotated.

Letting these mechanisms annotate each other is a bad thing.  Break the
loop by adding 'notrace' to __sanitizer_cov_trace_pc() so that ftrace
won't try to patch this code.

This patch lets arm64 with KCOV and STACK_TRACER boot.

Signed-off-by: James Morse <james.morse@arm.com>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Alexander Potapenko <glider@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kcov.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index 3efbee0834a8..78bed7125515 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -43,7 +43,7 @@ struct kcov {
  * Entry point from instrumented code.
  * This is called once per basic-block/edge.
  */
-void __sanitizer_cov_trace_pc(void)
+void notrace __sanitizer_cov_trace_pc(void)
 {
 	struct task_struct *t;
 	enum kcov_mode mode;
-- 
cgit v1.2.3


From 36f05ae8bce904b4c8105363e6227a79d343bda6 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Thu, 28 Apr 2016 16:18:55 -0700
Subject: kcov: don't profile branches in kcov

Profiling 'if' statements in __sanitizer_cov_trace_pc() leads to
unbound recursion and crash:

	__sanitizer_cov_trace_pc() ->
		ftrace_likely_update ->
			__sanitizer_cov_trace_pc() ...

Define DISABLE_BRANCH_PROFILING to disable this tracer.

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kcov.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index 78bed7125515..a02f2dddd1d7 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -1,5 +1,6 @@
 #define pr_fmt(fmt) "kcov: " fmt
 
+#define DISABLE_BRANCH_PROFILING
 #include <linux/compiler.h>
 #include <linux/types.h>
 #include <linux/file.h>
-- 
cgit v1.2.3


From 9cbb1506ab2db987c160e7fc50665bf47b5b6fa1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Wed, 27 Apr 2016 11:09:42 -0400
Subject: tracing: Fold filter_check_discard() into its only user

The function filter_check_discard() is small and only called by one user,
its code can be folded into that one caller and make the code a bit less
comlplex.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 13 -------------
 kernel/trace/trace.h | 13 ++++++-------
 2 files changed, 6 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 02f5a5f51d49..1ba54e241c8d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -306,19 +306,6 @@ void trace_array_put(struct trace_array *this_tr)
 	mutex_unlock(&trace_types_lock);
 }
 
-int filter_check_discard(struct trace_event_file *file, void *rec,
-			 struct ring_buffer *buffer,
-			 struct ring_buffer_event *event)
-{
-	if (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
-	    !filter_match_preds(file->filter, rec)) {
-		ring_buffer_discard_commit(buffer, event);
-		return 1;
-	}
-
-	return 0;
-}
-
 int call_filter_check_discard(struct trace_event_call *call, void *rec,
 			      struct ring_buffer *buffer,
 			      struct ring_buffer_event *event)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ee8691c66bfe..0862e7559548 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1065,9 +1065,6 @@ struct trace_subsystem_dir {
 	int				nr_events;
 };
 
-extern int filter_check_discard(struct trace_event_file *file, void *rec,
-				struct ring_buffer *buffer,
-				struct ring_buffer_event *event);
 extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event);
@@ -1096,12 +1093,14 @@ __event_trigger_test_discard(struct trace_event_file *file,
 	if (eflags & EVENT_FILE_FL_TRIGGER_COND)
 		*tt = event_triggers_call(file, entry);
 
-	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags))
+	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
+	    (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
+	     !filter_match_preds(file->filter, entry))) {
 		ring_buffer_discard_commit(buffer, event);
-	else if (!filter_check_discard(file, entry, buffer, event))
-		return false;
+		return true;
+	}
 
-	return true;
+	return false;
 }
 
 /**
-- 
cgit v1.2.3


From fa66ddb870ca022342fe6d1312ef76d2f7233a1d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 28 Apr 2016 12:04:13 -0400
Subject: tracing: Move trace_buffer_unlock_commit{_regs}() to local header

The functions trace_buffer_unlock_commit() and the _regs() version are only
used within the kernel/trace directory. Move them to the local header and
remove the export as well.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h |  9 ---------
 kernel/trace/trace.c         |  2 --
 kernel/trace/trace.h         | 10 ++++++++++
 3 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index bb383af35cc7..48cc5e19c5f5 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -158,15 +158,6 @@ struct ring_buffer_event *
 trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer,
 				  int type, unsigned long len,
 				  unsigned long flags, int pc);
-void trace_buffer_unlock_commit(struct trace_array *tr,
-				struct ring_buffer *buffer,
-				struct ring_buffer_event *event,
-				unsigned long flags, int pc);
-void trace_buffer_unlock_commit_regs(struct trace_array *tr,
-				     struct ring_buffer *buffer,
-				     struct ring_buffer_event *event,
-				     unsigned long flags, int pc,
-				     struct pt_regs *regs);
 void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
 					 struct ring_buffer_event *event);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1ba54e241c8d..94e7e4d11b79 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1696,7 +1696,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 	ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
 	ftrace_trace_userstack(buffer, flags, pc);
 }
-EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
 
 static struct ring_buffer *temp_buffer;
 
@@ -1748,7 +1747,6 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 	ftrace_trace_stack(tr, buffer, flags, 0, pc, regs);
 	ftrace_trace_userstack(buffer, flags, pc);
 }
-EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
 
 void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
 					 struct ring_buffer_event *event)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0862e7559548..bd5ae56dec7a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1068,6 +1068,16 @@ struct trace_subsystem_dir {
 extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event);
+
+void trace_buffer_unlock_commit(struct trace_array *tr,
+				struct ring_buffer *buffer,
+				struct ring_buffer_event *event,
+				unsigned long flags, int pc);
+void trace_buffer_unlock_commit_regs(struct trace_array *tr,
+				     struct ring_buffer *buffer,
+				     struct ring_buffer_event *event,
+				     unsigned long flags, int pc,
+				     struct pt_regs *regs);
 /*
  * Helper function for event_trigger_unlock_commit{_regs}().
  * If there are event triggers attached to this event that requires
-- 
cgit v1.2.3


From a9fe48dcde88fd48e210e4280f19cb9300ec9112 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 29 Apr 2016 16:12:39 -0400
Subject: tracing: Remove unused function trace_current_buffer_discard_commit()

The function trace_current_buffer_discard_commit() has no callers, remove
it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h | 2 --
 kernel/trace/trace.c         | 8 --------
 2 files changed, 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 48cc5e19c5f5..356c39b3abbb 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -158,8 +158,6 @@ struct ring_buffer_event *
 trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer,
 				  int type, unsigned long len,
 				  unsigned long flags, int pc);
-void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
-					 struct ring_buffer_event *event);
 
 void tracing_record_cmdline(struct task_struct *tsk);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 94e7e4d11b79..e5bdb9accf52 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1734,7 +1734,6 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
 	return trace_buffer_lock_reserve(*current_rb,
 					 type, len, flags, pc);
 }
-EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
 
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 				     struct ring_buffer *buffer,
@@ -1748,13 +1747,6 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 	ftrace_trace_userstack(buffer, flags, pc);
 }
 
-void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
-					 struct ring_buffer_event *event)
-{
-	ring_buffer_discard_commit(buffer, event);
-}
-EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
-
 void
 trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
-- 
cgit v1.2.3


From 33fddff24d05d71f97722cb7deec4964d39d10dc Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 29 Apr 2016 17:44:01 -0400
Subject: tracing: Have trace_buffer_unlock_commit() call the _regs version
 with NULL

There's no real difference between trace_buffer_unlock_commit() and
trace_buffer_unlock_commit_regs() except that the former passes NULL to
ftrace_stack_trace() instead of regs. Have the former be a static inline of
the latter which passes NULL for regs.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 11 -----------
 kernel/trace/trace.h | 13 +++++++++----
 2 files changed, 9 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e5bdb9accf52..41bf14412666 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1686,17 +1686,6 @@ __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *eve
 	ring_buffer_unlock_commit(buffer, event);
 }
 
-void trace_buffer_unlock_commit(struct trace_array *tr,
-				struct ring_buffer *buffer,
-				struct ring_buffer_event *event,
-				unsigned long flags, int pc)
-{
-	__buffer_unlock_commit(buffer, event);
-
-	ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
-	ftrace_trace_userstack(buffer, flags, pc);
-}
-
 static struct ring_buffer *temp_buffer;
 
 struct ring_buffer_event *
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index bd5ae56dec7a..10156a09103f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1069,15 +1069,20 @@ extern int call_filter_check_discard(struct trace_event_call *call, void *rec,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event);
 
-void trace_buffer_unlock_commit(struct trace_array *tr,
-				struct ring_buffer *buffer,
-				struct ring_buffer_event *event,
-				unsigned long flags, int pc);
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event,
 				     unsigned long flags, int pc,
 				     struct pt_regs *regs);
+
+static inline void trace_buffer_unlock_commit(struct trace_array *tr,
+					      struct ring_buffer *buffer,
+					      struct ring_buffer_event *event,
+					      unsigned long flags, int pc)
+{
+	trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL);
+}
+
 /*
  * Helper function for event_trigger_unlock_commit{_regs}().
  * If there are event triggers attached to this event that requires
-- 
cgit v1.2.3


From f09d90864eb7cc00cadbbfd083b4eff84c167981 Mon Sep 17 00:00:00 2001
From: Miroslav Benes <mbenes@suse.cz>
Date: Thu, 28 Apr 2016 16:34:08 +0200
Subject: livepatch: make object/func-walking helpers more robust

Current object-walking helper checks the presence of obj->funcs to
determine the end of objs array in klp_object structure. This is
somewhat fragile because one can easily forget about funcs definition
during livepatch creation. In such a case the livepatch module is
successfully loaded and all objects after the incorrect one are omitted.
This is very confusing. Let's make the helper more robust and check also
for the other external member, name. Thus the helper correctly stops on
an empty item of the array. We need to have a check for obj->funcs in
klp_init_object() to make it work.

The same applies to a func-walking helper.

As a benefit we'll check for new_func member definition during the
livepatch initialization. There is no such check anywhere in the code
now.

[jkosina@suse.cz: fix shortlog]
Signed-off-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Jessica Yu <jeyu@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/livepatch.h | 6 ++++--
 kernel/livepatch/core.c   | 3 +++
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index bd830d590465..90843ccb7852 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -124,10 +124,12 @@ struct klp_patch {
 };
 
 #define klp_for_each_object(patch, obj) \
-	for (obj = patch->objs; obj->funcs; obj++)
+	for (obj = patch->objs; obj->funcs || obj->name; obj++)
 
 #define klp_for_each_func(obj, func) \
-	for (func = obj->funcs; func->old_name; func++)
+	for (func = obj->funcs; \
+	     func->old_name || func->new_func || func->old_sympos; \
+	     func++)
 
 int klp_register_patch(struct klp_patch *);
 int klp_unregister_patch(struct klp_patch *);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index d68fbf63b083..ddef649a2914 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -683,6 +683,9 @@ static void klp_free_patch(struct klp_patch *patch)
 
 static int klp_init_func(struct klp_object *obj, struct klp_func *func)
 {
+	if (!func->old_name || !func->new_func)
+		return -EINVAL;
+
 	INIT_LIST_HEAD(&func->stack_node);
 	func->state = KLP_DISABLED;
 
-- 
cgit v1.2.3


From 9b9db275051cd9191e7776c4fd79ccd4318aa2dc Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 29 Apr 2016 18:10:21 -0400
Subject: tracing: Remove one use of trace_current_buffer_lock_reserve()

The only user of trace_current_buffer_lock_reserve() is in the boot up self
tests. Restructure the code a little to have that code use what everything
else uses: trace_event_buffer_lock_reserve().

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e7cb983ee93c..da1eeb6190e3 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3392,7 +3392,7 @@ static __init void event_trace_self_tests(void)
 
 static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
 
-static struct trace_array *event_tr;
+static struct trace_event_file event_trace_file __initdata;
 
 static void __init
 function_test_events_call(unsigned long ip, unsigned long parent_ip,
@@ -3416,17 +3416,17 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
 
 	local_save_flags(flags);
 
-	event = trace_current_buffer_lock_reserve(&buffer,
-						  TRACE_FN, sizeof(*entry),
-						  flags, pc);
+	event = trace_event_buffer_lock_reserve(&buffer, &event_trace_file,
+						TRACE_FN, sizeof(*entry),
+						flags, pc);
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
 
-	trace_buffer_unlock_commit(event_tr, buffer, event, flags, pc);
-
+	event_trigger_unlock_commit(&event_trace_file, buffer, event,
+				    entry, flags, pc);
  out:
 	atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
 	preempt_enable_notrace();
@@ -3441,9 +3441,11 @@ static struct ftrace_ops trace_ops __initdata  =
 static __init void event_trace_self_test_with_function(void)
 {
 	int ret;
-	event_tr = top_trace_array();
-	if (WARN_ON(!event_tr))
+
+	event_trace_file.tr = top_trace_array();
+	if (WARN_ON(!event_trace_file.tr))
 		return;
+
 	ret = register_ftrace_function(&trace_ops);
 	if (WARN_ON(ret < 0)) {
 		pr_info("Failed to enable function tracer for event tests\n");
-- 
cgit v1.2.3


From 904d1857ad09b43f514897dd42daffe200d1ca50 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 29 Apr 2016 18:11:54 -0400
Subject: tracing: Remove unused function trace_current_buffer_lock_reserve()

trace_current_buffer_lock_reserve() has no more users. Remove it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h |  4 ----
 kernel/trace/trace.c         | 10 ----------
 2 files changed, 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 356c39b3abbb..3111a1efdad6 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -154,10 +154,6 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer,
 				struct trace_event_file *trace_file,
 				int type, unsigned long len,
 				unsigned long flags, int pc);
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer,
-				  int type, unsigned long len,
-				  unsigned long flags, int pc);
 
 void tracing_record_cmdline(struct task_struct *tsk);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 41bf14412666..c09e8ffadc73 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1714,16 +1714,6 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
 }
 EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
 
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
-				  int type, unsigned long len,
-				  unsigned long flags, int pc)
-{
-	*current_rb = global_trace.trace_buffer.buffer;
-	return trace_buffer_lock_reserve(*current_rb,
-					 type, len, flags, pc);
-}
-
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event,
-- 
cgit v1.2.3


From 01292cea0df86ed4a1eb6450d6eda375ef925716 Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@imgtec.com>
Date: Mon, 25 Apr 2016 08:14:23 +0100
Subject: genirq: Make irq_destroy_ipi take a cpumask of IPIs to destroy

Previously irq_destroy_ipi() would destroy IPIs to all CPUs that were
configured by irq_reserve_ipi(). This change makes it possible to
destroy just a subset of the IPIs. This may be useful to remove IPIs to
CPUs that have been hot removed so that the IRQ numbers allocated within
the IPI domain can be re-used.

The original behaviour is restored by passing the complete mask that the
IPI was created with.

There are currently no users of this function that would break from the
API change.

Signed-off-by: Matt Redfearn <matt.redfearn@imgtec.com>
Cc: linux-mips@linux-mips.org
Cc: jason@lakedaemon.net
Cc: marc.zyngier@arm.com
Cc: ralf@linux-mips.org
Cc: Qais Yousef <qsyousef@gmail.com>
Cc: lisa.parratt@imgtec.com
Cc: jiang.liu@linux.intel.com
Link: http://lkml.kernel.org/r/1461568464-31701-1-git-send-email-matt.redfearn@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdomain.h |  2 +-
 kernel/irq/ipi.c          | 18 ++++++++++++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 2aed04396210..e1b81d35e7a3 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -348,7 +348,7 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr,
 /* IPI functions */
 unsigned int irq_reserve_ipi(struct irq_domain *domain,
 			     const struct cpumask *dest);
-void irq_destroy_ipi(unsigned int irq);
+void irq_destroy_ipi(unsigned int irq, const struct cpumask *dest);
 
 /* V2 interfaces to support hierarchy IRQ domains. */
 extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 14777af8e097..bedc995ae214 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -106,11 +106,12 @@ free_descs:
 /**
  * irq_destroy_ipi() - unreserve an IPI that was previously allocated
  * @irq:	linux irq number to be destroyed
+ * @dest:	cpumask of cpus which should have the IPI removed
  *
  * Return the IPIs allocated with irq_reserve_ipi() to the system destroying
  * all virqs associated with them.
  */
-void irq_destroy_ipi(unsigned int irq)
+void irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
 {
 	struct irq_data *data = irq_get_irq_data(irq);
 	struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
@@ -129,10 +130,19 @@ void irq_destroy_ipi(unsigned int irq)
 		return;
 	}
 
-	if (irq_domain_is_ipi_per_cpu(domain))
-		nr_irqs = cpumask_weight(ipimask);
-	else
+	if (WARN_ON(!cpumask_subset(dest, ipimask)))
+		/*
+		 * Must be destroying a subset of CPUs to which this IPI
+		 * was set up to target
+		 */
+		return;
+
+	if (irq_domain_is_ipi_per_cpu(domain)) {
+		irq = irq + cpumask_first(dest) - data->common->ipi_offset;
+		nr_irqs = cpumask_weight(dest);
+	} else {
 		nr_irqs = 1;
+	}
 
 	irq_domain_free_irqs(irq, nr_irqs);
 }
-- 
cgit v1.2.3


From 7cec18a3906b52e855c9386650c0226bbe594a4c Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@imgtec.com>
Date: Mon, 25 Apr 2016 08:14:24 +0100
Subject: genirq: Add error code reporting to irq_{reserve,destroy}_ipi

Make these functions return appropriate error codes when something goes
wrong.

Previously irq_destroy_ipi returned void making it impossible to notify
the caller if the request could not be fulfilled. Patch 1 in the series
added another condition in which this could fail in addition to the
existing ones. irq_reserve_ipi returned an unsigned int meaning it could
only return 0 on failure and give the caller no indication as to why the
request failed.

As time goes on there are likely to be further conditions added in which
these functions can fail. These APIs and the IPI IRQ domain are new in
4.6 and the number of existing call sites are low, changing the API now
has little impact on the code, while making it easier for these
functions to grow over time.

Signed-off-by: Matt Redfearn <matt.redfearn@imgtec.com>
Cc: linux-mips@linux-mips.org
Cc: jason@lakedaemon.net
Cc: marc.zyngier@arm.com
Cc: ralf@linux-mips.org
Cc: Qais Yousef <qsyousef@gmail.com>
Cc: lisa.parratt@imgtec.com
Cc: jiang.liu@linux.intel.com
Link: http://lkml.kernel.org/r/1461568464-31701-2-git-send-email-matt.redfearn@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdomain.h |  5 ++---
 kernel/irq/ipi.c          | 31 +++++++++++++++++--------------
 2 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index e1b81d35e7a3..736abd74c135 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -346,9 +346,8 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr,
 			irq_hw_number_t *out_hwirq, unsigned int *out_type);
 
 /* IPI functions */
-unsigned int irq_reserve_ipi(struct irq_domain *domain,
-			     const struct cpumask *dest);
-void irq_destroy_ipi(unsigned int irq, const struct cpumask *dest);
+int irq_reserve_ipi(struct irq_domain *domain, const struct cpumask *dest);
+int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest);
 
 /* V2 interfaces to support hierarchy IRQ domains. */
 extern struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain,
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index bedc995ae214..c42742208e5e 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -19,9 +19,9 @@
  *
  * Allocate a virq that can be used to send IPI to any CPU in dest mask.
  *
- * On success it'll return linux irq number and 0 on failure
+ * On success it'll return linux irq number and error code on failure
  */
-unsigned int irq_reserve_ipi(struct irq_domain *domain,
+int irq_reserve_ipi(struct irq_domain *domain,
 			     const struct cpumask *dest)
 {
 	unsigned int nr_irqs, offset;
@@ -30,18 +30,18 @@ unsigned int irq_reserve_ipi(struct irq_domain *domain,
 
 	if (!domain ||!irq_domain_is_ipi(domain)) {
 		pr_warn("Reservation on a non IPI domain\n");
-		return 0;
+		return -EINVAL;
 	}
 
 	if (!cpumask_subset(dest, cpu_possible_mask)) {
 		pr_warn("Reservation is not in possible_cpu_mask\n");
-		return 0;
+		return -EINVAL;
 	}
 
 	nr_irqs = cpumask_weight(dest);
 	if (!nr_irqs) {
 		pr_warn("Reservation for empty destination mask\n");
-		return 0;
+		return -EINVAL;
 	}
 
 	if (irq_domain_is_ipi_single(domain)) {
@@ -72,14 +72,14 @@ unsigned int irq_reserve_ipi(struct irq_domain *domain,
 			next = cpumask_next(next, dest);
 		if (next < nr_cpu_ids) {
 			pr_warn("Destination mask has holes\n");
-			return 0;
+			return -EINVAL;
 		}
 	}
 
 	virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE);
 	if (virq <= 0) {
 		pr_warn("Can't reserve IPI, failed to alloc descs\n");
-		return 0;
+		return -ENOMEM;
 	}
 
 	virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
@@ -100,7 +100,7 @@ unsigned int irq_reserve_ipi(struct irq_domain *domain,
 
 free_descs:
 	irq_free_descs(virq, nr_irqs);
-	return 0;
+	return -EBUSY;
 }
 
 /**
@@ -108,10 +108,12 @@ free_descs:
  * @irq:	linux irq number to be destroyed
  * @dest:	cpumask of cpus which should have the IPI removed
  *
- * Return the IPIs allocated with irq_reserve_ipi() to the system destroying
- * all virqs associated with them.
+ * The IPIs allocated with irq_reserve_ipi() are retuerned to the system
+ * destroying all virqs associated with them.
+ *
+ * Return 0 on success or error code on failure.
  */
-void irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
+int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
 {
 	struct irq_data *data = irq_get_irq_data(irq);
 	struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
@@ -119,7 +121,7 @@ void irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
 	unsigned int nr_irqs;
 
 	if (!irq || !data || !ipimask)
-		return;
+		return -EINVAL;
 
 	domain = data->domain;
 	if (WARN_ON(domain == NULL))
@@ -127,7 +129,7 @@ void irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
 
 	if (!irq_domain_is_ipi(domain)) {
 		pr_warn("Trying to destroy a non IPI domain!\n");
-		return;
+		return -EINVAL;
 	}
 
 	if (WARN_ON(!cpumask_subset(dest, ipimask)))
@@ -135,7 +137,7 @@ void irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
 		 * Must be destroying a subset of CPUs to which this IPI
 		 * was set up to target
 		 */
-		return;
+		return -EINVAL;
 
 	if (irq_domain_is_ipi_per_cpu(domain)) {
 		irq = irq + cpumask_first(dest) - data->common->ipi_offset;
@@ -145,6 +147,7 @@ void irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
 	}
 
 	irq_domain_free_irqs(irq, nr_irqs);
+	return 0;
 }
 
 /**
-- 
cgit v1.2.3


From 651e8b54abdeeaa36f5f54ffa05c18707a3cc1d0 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 11 Apr 2016 09:57:51 +0100
Subject: irqdomain: Allow domain matching on irq_fwspec

When iterating over the irq domain list, we try to match a domain
either by calling a match() function or by comparing a number
of fields passed as parameters.

Both approaches are a bit restrictive:
- match() is DT specific and only takes a device node
- the fallback case only deals with the fwnode_handle

It would be useful if we had a per-domain function that would
actually perform the matching check on the whole of the
irq_fwspec structure. This would allow for a domain to triage
matching attempts that need to extend beyond the fwnode.

Let's introduce irq_find_matching_fwspec(), which takes a full
blown irq_fwspec structure, and call into a select() function
implemented by the irqdomain. irq_find_matching_fwnode() is
made a wrapper around irq_find_matching_fwspec in order to
preserve compatibility.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: devicetree@vger.kernel.org
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Rob Herring <robh+dt@kernel.org>
Link: http://lkml.kernel.org/r/1460365075-7316-2-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdomain.h | 15 ++++++++++++++-
 kernel/irq/irqdomain.c    | 19 ++++++++++---------
 2 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 736abd74c135..f1f36e04d885 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -96,6 +96,8 @@ enum irq_domain_bus_token {
 struct irq_domain_ops {
 	int (*match)(struct irq_domain *d, struct device_node *node,
 		     enum irq_domain_bus_token bus_token);
+	int (*select)(struct irq_domain *d, struct irq_fwspec *fwspec,
+		      enum irq_domain_bus_token bus_token);
 	int (*map)(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw);
 	void (*unmap)(struct irq_domain *d, unsigned int virq);
 	int (*xlate)(struct irq_domain *d, struct device_node *node,
@@ -211,7 +213,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
 					 irq_hw_number_t first_hwirq,
 					 const struct irq_domain_ops *ops,
 					 void *host_data);
-extern struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
+extern struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
 						   enum irq_domain_bus_token bus_token);
 extern void irq_set_default_host(struct irq_domain *host);
 extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
@@ -227,6 +229,17 @@ static inline bool is_fwnode_irqchip(struct fwnode_handle *fwnode)
 	return fwnode && fwnode->type == FWNODE_IRQCHIP;
 }
 
+static inline
+struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
+					    enum irq_domain_bus_token bus_token)
+{
+	struct irq_fwspec fwspec = {
+		.fwnode = fwnode,
+	};
+
+	return irq_find_matching_fwspec(&fwspec, bus_token);
+}
+
 static inline struct irq_domain *irq_find_matching_host(struct device_node *node,
 							enum irq_domain_bus_token bus_token)
 {
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3a519a01118b..503c5b9dd030 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -243,14 +243,15 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
 EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
 
 /**
- * irq_find_matching_fwnode() - Locates a domain for a given fwnode
- * @fwnode: FW descriptor of the interrupt controller
+ * irq_find_matching_fwspec() - Locates a domain for a given fwspec
+ * @fwspec: FW specifier for an interrupt
  * @bus_token: domain-specific data
  */
-struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
+struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec,
 					    enum irq_domain_bus_token bus_token)
 {
 	struct irq_domain *h, *found = NULL;
+	struct fwnode_handle *fwnode = fwspec->fwnode;
 	int rc;
 
 	/* We might want to match the legacy controller last since
@@ -264,7 +265,9 @@ struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
 	 */
 	mutex_lock(&irq_domain_mutex);
 	list_for_each_entry(h, &irq_domain_list, link) {
-		if (h->ops->match)
+		if (h->ops->select && fwspec->param_count)
+			rc = h->ops->select(h, fwspec, bus_token);
+		else if (h->ops->match)
 			rc = h->ops->match(h, to_of_node(fwnode), bus_token);
 		else
 			rc = ((fwnode != NULL) && (h->fwnode == fwnode) &&
@@ -279,7 +282,7 @@ struct irq_domain *irq_find_matching_fwnode(struct fwnode_handle *fwnode,
 	mutex_unlock(&irq_domain_mutex);
 	return found;
 }
-EXPORT_SYMBOL_GPL(irq_find_matching_fwnode);
+EXPORT_SYMBOL_GPL(irq_find_matching_fwspec);
 
 /**
  * irq_set_default_host() - Set a "default" irq domain
@@ -574,11 +577,9 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
 	int virq;
 
 	if (fwspec->fwnode) {
-		domain = irq_find_matching_fwnode(fwspec->fwnode,
-						  DOMAIN_BUS_WIRED);
+		domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED);
 		if (!domain)
-			domain = irq_find_matching_fwnode(fwspec->fwnode,
-							  DOMAIN_BUS_ANY);
+			domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_ANY);
 	} else {
 		domain = irq_default_domain;
 	}
-- 
cgit v1.2.3


From 222df54fd8b7641dcc81476f157806bb3144ee1d Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Mon, 11 Apr 2016 09:57:52 +0100
Subject: genirq: Allow the affinity of a percpu interrupt to be set/retrieved

In order to prepare the genirq layer for the concept of partitionned
percpu interrupts, let's allow an affinity to be associated with
such an interrupt. We introduce:

- irq_set_percpu_devid_partition: flag an interrupt as a percpu-devid
  interrupt, and associate it with an affinity
- irq_get_percpu_devid_partition: allow the affinity of that interrupt
  to be retrieved.

This will allow a driver to discover which CPUs the per-cpu interrupt
can actually fire on.

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: devicetree@vger.kernel.org
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Rob Herring <robh+dt@kernel.org>
Link: http://lkml.kernel.org/r/1460365075-7316-3-git-send-email-marc.zyngier@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h     |  4 ++++
 include/linux/irqdesc.h |  1 +
 kernel/irq/irqdesc.c    | 26 +++++++++++++++++++++++++-
 3 files changed, 30 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index c4de62348ff2..4d758a7c604a 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -530,6 +530,10 @@ static inline void irq_set_chip_and_handler(unsigned int irq, struct irq_chip *c
 }
 
 extern int irq_set_percpu_devid(unsigned int irq);
+extern int irq_set_percpu_devid_partition(unsigned int irq,
+					  const struct cpumask *affinity);
+extern int irq_get_percpu_devid_partition(unsigned int irq,
+					  struct cpumask *affinity);
 
 extern void
 __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index dcca77c4b9d2..b51beebf9804 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -66,6 +66,7 @@ struct irq_desc {
 	int			threads_handled_last;
 	raw_spinlock_t		lock;
 	struct cpumask		*percpu_enabled;
+	const struct cpumask	*percpu_affinity;
 #ifdef CONFIG_SMP
 	const struct cpumask	*affinity_hint;
 	struct irq_affinity_notify *affinity_notify;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 0ccd028817d7..8731e1c5d1e7 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -595,7 +595,8 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
 		chip_bus_sync_unlock(desc);
 }
 
-int irq_set_percpu_devid(unsigned int irq)
+int irq_set_percpu_devid_partition(unsigned int irq,
+				   const struct cpumask *affinity)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
@@ -610,10 +611,33 @@ int irq_set_percpu_devid(unsigned int irq)
 	if (!desc->percpu_enabled)
 		return -ENOMEM;
 
+	if (affinity)
+		desc->percpu_affinity = affinity;
+	else
+		desc->percpu_affinity = cpu_possible_mask;
+
 	irq_set_percpu_devid_flags(irq);
 	return 0;
 }
 
+int irq_set_percpu_devid(unsigned int irq)
+{
+	return irq_set_percpu_devid_partition(irq, NULL);
+}
+
+int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (!desc || !desc->percpu_enabled)
+		return -EINVAL;
+
+	if (affinity)
+		cpumask_copy(affinity, desc->percpu_affinity);
+
+	return 0;
+}
+
 void kstat_incr_irq_this_cpu(unsigned int irq)
 {
 	kstat_incr_irqs_this_cpu(irq_to_desc(irq));
-- 
cgit v1.2.3


From dcb0b5575d24a32f51a3f1003312fb94ed4e214a Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Mon, 2 May 2016 21:30:04 -0400
Subject: tracing: Remove TRACE_EVENT_FL_USE_CALL_FILTER logic

Nothing sets TRACE_EVENT_FL_USE_CALL_FILTER anymore. Remove it.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_events.h       |  3 --
 kernel/trace/trace_events_filter.c | 71 ++++++--------------------------------
 2 files changed, 10 insertions(+), 64 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 3111a1efdad6..ba6456302e34 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -214,7 +214,6 @@ enum {
 	TRACE_EVENT_FL_NO_SET_FILTER_BIT,
 	TRACE_EVENT_FL_IGNORE_ENABLE_BIT,
 	TRACE_EVENT_FL_WAS_ENABLED_BIT,
-	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
 	TRACE_EVENT_FL_TRACEPOINT_BIT,
 	TRACE_EVENT_FL_KPROBE_BIT,
 	TRACE_EVENT_FL_UPROBE_BIT,
@@ -229,7 +228,6 @@ enum {
  *  WAS_ENABLED   - Set and stays set when an event was ever enabled
  *                    (used for module unloading, if a module event is enabled,
  *                     it is best to clear the buffers that used it).
- *  USE_CALL_FILTER - For trace internal events, don't use file filter
  *  TRACEPOINT    - Event is a tracepoint
  *  KPROBE        - Event is a kprobe
  *  UPROBE        - Event is a uprobe
@@ -240,7 +238,6 @@ enum {
 	TRACE_EVENT_FL_NO_SET_FILTER	= (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT),
 	TRACE_EVENT_FL_IGNORE_ENABLE	= (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT),
 	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
-	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
 	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
 	TRACE_EVENT_FL_KPROBE		= (1 << TRACE_EVENT_FL_KPROBE_BIT),
 	TRACE_EVENT_FL_UPROBE		= (1 << TRACE_EVENT_FL_UPROBE_BIT),
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index b3f5051cd4e9..d1d27bf37a19 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -689,10 +689,7 @@ static void append_filter_err(struct filter_parse_state *ps,
 
 static inline struct event_filter *event_filter(struct trace_event_file *file)
 {
-	if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		return file->event_call->filter;
-	else
-		return file->filter;
+	return file->filter;
 }
 
 /* caller must hold event_mutex */
@@ -826,12 +823,7 @@ static void __free_preds(struct event_filter *filter)
 
 static void filter_disable(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		call->flags &= ~TRACE_EVENT_FL_FILTERED;
-	else
-		file->flags &= ~EVENT_FILE_FL_FILTERED;
+	file->flags &= ~EVENT_FILE_FL_FILTERED;
 }
 
 static void __free_filter(struct event_filter *filter)
@@ -883,13 +875,8 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
 
 static inline void __remove_filter(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
 	filter_disable(file);
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		remove_filter_string(call->filter);
-	else
-		remove_filter_string(file->filter);
+	remove_filter_string(file->filter);
 }
 
 static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
@@ -906,15 +893,8 @@ static void filter_free_subsystem_preds(struct trace_subsystem_dir *dir,
 
 static inline void __free_subsystem_filter(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) {
-		__free_filter(call->filter);
-		call->filter = NULL;
-	} else {
-		__free_filter(file->filter);
-		file->filter = NULL;
-	}
+	__free_filter(file->filter);
+	file->filter = NULL;
 }
 
 static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
@@ -1718,69 +1698,38 @@ fail:
 
 static inline void event_set_filtered_flag(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		call->flags |= TRACE_EVENT_FL_FILTERED;
-	else
-		file->flags |= EVENT_FILE_FL_FILTERED;
+	file->flags |= EVENT_FILE_FL_FILTERED;
 }
 
 static inline void event_set_filter(struct trace_event_file *file,
 				    struct event_filter *filter)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		rcu_assign_pointer(call->filter, filter);
-	else
-		rcu_assign_pointer(file->filter, filter);
+	rcu_assign_pointer(file->filter, filter);
 }
 
 static inline void event_clear_filter(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		RCU_INIT_POINTER(call->filter, NULL);
-	else
-		RCU_INIT_POINTER(file->filter, NULL);
+	RCU_INIT_POINTER(file->filter, NULL);
 }
 
 static inline void
 event_set_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
-	else
-		file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
+	file->flags |= EVENT_FILE_FL_NO_SET_FILTER;
 }
 
 static inline void
 event_clear_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
-	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
-		call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
-	else
-		file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
+	file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER;
 }
 
 static inline bool
 event_no_set_filter_flag(struct trace_event_file *file)
 {
-	struct trace_event_call *call = file->event_call;
-
 	if (file->flags & EVENT_FILE_FL_NO_SET_FILTER)
 		return true;
 
-	if ((call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) &&
-	    (call->flags & TRACE_EVENT_FL_NO_SET_FILTER))
-		return true;
-
 	return false;
 }
 
-- 
cgit v1.2.3


From 407bc16ad1769f5cb8ad9555611cb198187ef4cd Mon Sep 17 00:00:00 2001
From: Stas Sergeev <stsp@list.ru>
Date: Thu, 14 Apr 2016 23:20:03 +0300
Subject: signals/sigaltstack: Prepare to add new SS_xxx flags

This patch adds SS_FLAG_BITS - the mask that splits sigaltstack
mode values and bit-flags. Since there is no bit-flags yet, the
mask is defined to 0. The flags are added by subsequent patches.
With every new flag, the mask should have the appropriate bit cleared.

This makes sure if some flag is tried on a kernel that doesn't
support it, the -EINVAL error will be returned, because such a
flag will be treated as an invalid mode rather than the bit-flag.

That way the existence of the particular features can be probed
at run-time.

This change was suggested by Andy Lutomirski:

  https://lkml.org/lkml/2016/3/6/158

Signed-off-by: Stas Sergeev <stsp@list.ru>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Amanieu d'Antras <amanieu@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: linux-api@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1460665206-13646-3-git-send-email-stsp@list.ru
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/signal.h |  3 +++
 kernel/signal.c             | 16 ++++++----------
 2 files changed, 9 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/signal.h b/include/uapi/linux/signal.h
index e1bd50c29ded..7c73165d11ce 100644
--- a/include/uapi/linux/signal.h
+++ b/include/uapi/linux/signal.h
@@ -7,4 +7,7 @@
 #define SS_ONSTACK	1
 #define SS_DISABLE	2
 
+/* mask for all SS_xxx flags */
+#define SS_FLAG_BITS	0
+
 #endif /* _UAPI_LINUX_SIGNAL_H */
diff --git a/kernel/signal.c b/kernel/signal.c
index aa9bf00749c1..b1c6eb4df2a8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3104,7 +3104,8 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
 	if (uss) {
 		void __user *ss_sp;
 		size_t ss_size;
-		int ss_flags;
+		unsigned ss_flags;
+		int ss_mode;
 
 		error = -EFAULT;
 		if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
@@ -3119,18 +3120,13 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
 		if (on_sig_stack(sp))
 			goto out;
 
+		ss_mode = ss_flags & ~SS_FLAG_BITS;
 		error = -EINVAL;
-		/*
-		 * Note - this code used to test ss_flags incorrectly:
-		 *  	  old code may have been written using ss_flags==0
-		 *	  to mean ss_flags==SS_ONSTACK (as this was the only
-		 *	  way that worked) - this fix preserves that older
-		 *	  mechanism.
-		 */
-		if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
+		if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
+				ss_mode != 0)
 			goto out;
 
-		if (ss_flags == SS_DISABLE) {
+		if (ss_mode == SS_DISABLE) {
 			ss_size = 0;
 			ss_sp = NULL;
 		} else {
-- 
cgit v1.2.3


From 2a74213838104a41588d86fd5e8d344972891ace Mon Sep 17 00:00:00 2001
From: Stas Sergeev <stsp@list.ru>
Date: Thu, 14 Apr 2016 23:20:04 +0300
Subject: signals/sigaltstack: Implement SS_AUTODISARM flag

This patch implements the SS_AUTODISARM flag that can be OR-ed with
SS_ONSTACK when forming ss_flags.

When this flag is set, sigaltstack will be disabled when entering
the signal handler; more precisely, after saving sas to uc_stack.
When leaving the signal handler, the sigaltstack is restored by
uc_stack.

When this flag is used, it is safe to switch from sighandler with
swapcontext(). Without this flag, the subsequent signal will corrupt
the state of the switched-away sighandler.

To detect the support of this functionality, one can do:

  err = sigaltstack(SS_DISABLE | SS_AUTODISARM);
  if (err && errno == EINVAL)
	unsupported();

Signed-off-by: Stas Sergeev <stsp@list.ru>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Amanieu d'Antras <amanieu@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Heinrich Schuchardt <xypron.glpk@gmx.de>
Cc: Jason Low <jason.low2@hp.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Moore <pmoore@redhat.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: linux-api@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1460665206-13646-4-git-send-email-stsp@list.ru
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h       |  8 ++++++++
 include/linux/signal.h      |  4 +++-
 include/uapi/linux/signal.h |  4 +++-
 kernel/fork.c               |  2 +-
 kernel/signal.c             | 10 ++++++++--
 5 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 52c4847b05e2..2950c5cd3005 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1596,6 +1596,7 @@ struct task_struct {
 
 	unsigned long sas_ss_sp;
 	size_t sas_ss_size;
+	unsigned sas_ss_flags;
 
 	struct callback_head *task_works;
 
@@ -2592,6 +2593,13 @@ static inline int sas_ss_flags(unsigned long sp)
 	return on_sig_stack(sp) ? SS_ONSTACK : 0;
 }
 
+static inline void sas_ss_reset(struct task_struct *p)
+{
+	p->sas_ss_sp = 0;
+	p->sas_ss_size = 0;
+	p->sas_ss_flags = SS_DISABLE;
+}
+
 static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
 {
 	if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp))
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 92557bbce7e7..3fbe81444d31 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -432,8 +432,10 @@ int __save_altstack(stack_t __user *, unsigned long);
 	stack_t __user *__uss = uss; \
 	struct task_struct *t = current; \
 	put_user_ex((void __user *)t->sas_ss_sp, &__uss->ss_sp); \
-	put_user_ex(sas_ss_flags(sp), &__uss->ss_flags); \
+	put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \
 	put_user_ex(t->sas_ss_size, &__uss->ss_size); \
+	if (t->sas_ss_flags & SS_AUTODISARM) \
+		sas_ss_reset(t); \
 } while (0);
 
 #ifdef CONFIG_PROC_FS
diff --git a/include/uapi/linux/signal.h b/include/uapi/linux/signal.h
index 7c73165d11ce..738826048af2 100644
--- a/include/uapi/linux/signal.h
+++ b/include/uapi/linux/signal.h
@@ -7,7 +7,9 @@
 #define SS_ONSTACK	1
 #define SS_DISABLE	2
 
+/* bit-flags */
+#define SS_AUTODISARM	(1 << 4)	/* disable sas during sighandling */
 /* mask for all SS_xxx flags */
-#define SS_FLAG_BITS	0
+#define SS_FLAG_BITS	SS_AUTODISARM
 
 #endif /* _UAPI_LINUX_SIGNAL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index d277e83ed3e0..3e8451527cbe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1494,7 +1494,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 * sigaltstack should be cleared when sharing the same VM
 	 */
 	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
-		p->sas_ss_sp = p->sas_ss_size = 0;
+		sas_ss_reset(p);
 
 	/*
 	 * Syscall tracing and stepping should be turned off in the
diff --git a/kernel/signal.c b/kernel/signal.c
index b1c6eb4df2a8..bf97ea5775ae 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3137,6 +3137,7 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
 
 		current->sas_ss_sp = (unsigned long) ss_sp;
 		current->sas_ss_size = ss_size;
+		current->sas_ss_flags = ss_flags;
 	}
 
 	error = 0;
@@ -3167,9 +3168,14 @@ int restore_altstack(const stack_t __user *uss)
 int __save_altstack(stack_t __user *uss, unsigned long sp)
 {
 	struct task_struct *t = current;
-	return  __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
-		__put_user(sas_ss_flags(sp), &uss->ss_flags) |
+	int err = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
+		__put_user(t->sas_ss_flags, &uss->ss_flags) |
 		__put_user(t->sas_ss_size, &uss->ss_size);
+	if (err)
+		return err;
+	if (t->sas_ss_flags & SS_AUTODISARM)
+		sas_ss_reset(t);
+	return 0;
 }
 
 #ifdef CONFIG_COMPAT
-- 
cgit v1.2.3


From 854145e0a8e9a05f7366d240e2f99d9c1ca6d6dd Mon Sep 17 00:00:00 2001
From: Chunyu Hu <chuhu@redhat.com>
Date: Tue, 3 May 2016 19:34:34 +0800
Subject: tracing: Don't display trigger file for events that can't be enabled

Currently register functions for events will be called
through the 'reg' field of event class directly without
any check when seting up triggers.

Triggers for events that don't support register through
debug fs (events under events/ftrace are for trace-cmd to
read event format, and most of them don't have a register
function except events/ftrace/functionx) can't be enabled
at all, and an oops will be hit when setting up trigger
for those events, so just not creating them is an easy way
to avoid the oops.

Link: http://lkml.kernel.org/r/1462275274-3911-1-git-send-email-chuhu@redhat.com

Cc: stable@vger.kernel.org # 3.14+
Fixes: 85f2b08268c01 ("tracing: Add basic event trigger framework")
Signed-off-by: Chunyu Hu <chuhu@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05ddc0820771..6f965864cc02 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2095,8 +2095,13 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
 	trace_create_file("filter", 0644, file->dir, file,
 			  &ftrace_event_filter_fops);
 
-	trace_create_file("trigger", 0644, file->dir, file,
-			  &event_trigger_fops);
+	/*
+	 * Only event directories that can be enabled should have
+	 * triggers.
+	 */
+	if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
+		trace_create_file("trigger", 0644, file->dir, file,
+				  &event_trigger_fops);
 
 	trace_create_file("format", 0444, file->dir, call,
 			  &ftrace_event_format_fops);
-- 
cgit v1.2.3


From 0fc1b09ff1ff404ddf753f5ffa5cd0adc8fdcdc9 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Tue, 3 May 2016 17:15:43 -0400
Subject: tracing: Use temp buffer when filtering events

Filtering of events requires the data to be written to the ring buffer
before it can be decided to filter or not. This is because the parameters of
the filter are based on the result that is written to the ring buffer and
not on the parameters that are passed into the trace functions.

The ftrace ring buffer is optimized for writing into the ring buffer and
committing. The discard procedure used when filtering decides the event
should be discarded is much more heavy weight. Thus, using a temporary
filter when filtering events can speed things up drastically.

Without a temp buffer we have:

 # trace-cmd start -p nop
 # perf stat -r 10 hackbench 50
       0.790706626 seconds time elapsed ( +-  0.71% )

 # trace-cmd start -e all
 # perf stat -r 10 hackbench 50
       1.566904059 seconds time elapsed ( +-  0.27% )

 # trace-cmd start -e all -f 'common_preempt_count==20'
 # perf stat -r 10 hackbench 50
       1.690598511 seconds time elapsed ( +-  0.19% )

 # trace-cmd start -e all -f 'common_preempt_count!=20'
 # perf stat -r 10 hackbench 50
       1.707486364 seconds time elapsed ( +-  0.30% )

The first run above is without any tracing, just to get a based figure.
hackbench takes ~0.79 seconds to run on the system.

The second run enables tracing all events where nothing is filtered. This
increases the time by 100% and hackbench takes 1.57 seconds to run.

The third run filters all events where the preempt count will equal "20"
(this should never happen) thus all events are discarded. This takes 1.69
seconds to run. This is 10% slower than just committing the events!

The last run enables all events and filters where the filter will commit all
events, and this takes 1.70 seconds to run. The filtering overhead is
approximately 10%. Thus, the discard and commit of an event from the ring
buffer may be about the same time.

With this patch, the numbers change:

 # trace-cmd start -p nop
 # perf stat -r 10 hackbench 50
       0.778233033 seconds time elapsed ( +-  0.38% )

 # trace-cmd start -e all
 # perf stat -r 10 hackbench 50
       1.582102692 seconds time elapsed ( +-  0.28% )

 # trace-cmd start -e all -f 'common_preempt_count==20'
 # perf stat -r 10 hackbench 50
       1.309230710 seconds time elapsed ( +-  0.22% )

 # trace-cmd start -e all -f 'common_preempt_count!=20'
 # perf stat -r 10 hackbench 50
       1.786001924 seconds time elapsed ( +-  0.20% )

The first run is again the base with no tracing.

The second run is all tracing with no filtering. It is a little slower, but
that may be well within the noise.

The third run shows that discarding all events only took 1.3 seconds. This
is a speed up of 23%! The discard is much faster than even the commit.

The one downside is shown in the last run. Events that are not discarded by
the filter will take longer to add, this is due to the extra copy of the
event.

Cc: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c               | 154 +++++++++++++++++++++++++++++++++++--
 kernel/trace/trace.h               |  19 ++++-
 kernel/trace/trace_events.c        |  10 +++
 kernel/trace/trace_events_filter.c |  10 +++
 4 files changed, 185 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c09e8ffadc73..8a4bd6b68a0b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -312,7 +312,7 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec,
 {
 	if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
 	    !filter_match_preds(call->filter, rec)) {
-		ring_buffer_discard_commit(buffer, event);
+		__trace_event_discard_commit(buffer, event);
 		return 1;
 	}
 
@@ -1660,6 +1660,16 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 }
 EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
 
+static __always_inline void
+trace_event_setup(struct ring_buffer_event *event,
+		  int type, unsigned long flags, int pc)
+{
+	struct trace_entry *ent = ring_buffer_event_data(event);
+
+	tracing_generic_entry_update(ent, flags, pc);
+	ent->type = type;
+}
+
 struct ring_buffer_event *
 trace_buffer_lock_reserve(struct ring_buffer *buffer,
 			  int type,
@@ -1669,21 +1679,136 @@ trace_buffer_lock_reserve(struct ring_buffer *buffer,
 	struct ring_buffer_event *event;
 
 	event = ring_buffer_lock_reserve(buffer, len);
-	if (event != NULL) {
-		struct trace_entry *ent = ring_buffer_event_data(event);
+	if (event != NULL)
+		trace_event_setup(event, type, flags, pc);
+
+	return event;
+}
+
+DEFINE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
+DEFINE_PER_CPU(int, trace_buffered_event_cnt);
+static int trace_buffered_event_ref;
+
+/**
+ * trace_buffered_event_enable - enable buffering events
+ *
+ * When events are being filtered, it is quicker to use a temporary
+ * buffer to write the event data into if there's a likely chance
+ * that it will not be committed. The discard of the ring buffer
+ * is not as fast as committing, and is much slower than copying
+ * a commit.
+ *
+ * When an event is to be filtered, allocate per cpu buffers to
+ * write the event data into, and if the event is filtered and discarded
+ * it is simply dropped, otherwise, the entire data is to be committed
+ * in one shot.
+ */
+void trace_buffered_event_enable(void)
+{
+	struct ring_buffer_event *event;
+	struct page *page;
+	int cpu;
 
-		tracing_generic_entry_update(ent, flags, pc);
-		ent->type = type;
+	WARN_ON_ONCE(!mutex_is_locked(&event_mutex));
+
+	if (trace_buffered_event_ref++)
+		return;
+
+	for_each_tracing_cpu(cpu) {
+		page = alloc_pages_node(cpu_to_node(cpu),
+					GFP_KERNEL | __GFP_NORETRY, 0);
+		if (!page)
+			goto failed;
+
+		event = page_address(page);
+		memset(event, 0, sizeof(*event));
+
+		per_cpu(trace_buffered_event, cpu) = event;
+
+		preempt_disable();
+		if (cpu == smp_processor_id() &&
+		    this_cpu_read(trace_buffered_event) !=
+		    per_cpu(trace_buffered_event, cpu))
+			WARN_ON_ONCE(1);
+		preempt_enable();
 	}
 
-	return event;
+	return;
+ failed:
+	trace_buffered_event_disable();
+}
+
+static void enable_trace_buffered_event(void *data)
+{
+	/* Probably not needed, but do it anyway */
+	smp_rmb();
+	this_cpu_dec(trace_buffered_event_cnt);
+}
+
+static void disable_trace_buffered_event(void *data)
+{
+	this_cpu_inc(trace_buffered_event_cnt);
+}
+
+/**
+ * trace_buffered_event_disable - disable buffering events
+ *
+ * When a filter is removed, it is faster to not use the buffered
+ * events, and to commit directly into the ring buffer. Free up
+ * the temp buffers when there are no more users. This requires
+ * special synchronization with current events.
+ */
+void trace_buffered_event_disable(void)
+{
+	int cpu;
+
+	WARN_ON_ONCE(!mutex_is_locked(&event_mutex));
+
+	if (WARN_ON_ONCE(!trace_buffered_event_ref))
+		return;
+
+	if (--trace_buffered_event_ref)
+		return;
+
+	preempt_disable();
+	/* For each CPU, set the buffer as used. */
+	smp_call_function_many(tracing_buffer_mask,
+			       disable_trace_buffered_event, NULL, 1);
+	preempt_enable();
+
+	/* Wait for all current users to finish */
+	synchronize_sched();
+
+	for_each_tracing_cpu(cpu) {
+		free_page((unsigned long)per_cpu(trace_buffered_event, cpu));
+		per_cpu(trace_buffered_event, cpu) = NULL;
+	}
+	/*
+	 * Make sure trace_buffered_event is NULL before clearing
+	 * trace_buffered_event_cnt.
+	 */
+	smp_wmb();
+
+	preempt_disable();
+	/* Do the work on each cpu */
+	smp_call_function_many(tracing_buffer_mask,
+			       enable_trace_buffered_event, NULL, 1);
+	preempt_enable();
 }
 
 void
 __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
 {
 	__this_cpu_write(trace_cmdline_save, true);
-	ring_buffer_unlock_commit(buffer, event);
+
+	/* If this is the temp buffer, we need to commit fully */
+	if (this_cpu_read(trace_buffered_event) == event) {
+		/* Length is in event->array[0] */
+		ring_buffer_write(buffer, event->array[0], &event->array[1]);
+		/* Release the temp buffer */
+		this_cpu_dec(trace_buffered_event_cnt);
+	} else
+		ring_buffer_unlock_commit(buffer, event);
 }
 
 static struct ring_buffer *temp_buffer;
@@ -1695,8 +1820,23 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
 			  unsigned long flags, int pc)
 {
 	struct ring_buffer_event *entry;
+	int val;
 
 	*current_rb = trace_file->tr->trace_buffer.buffer;
+
+	if ((trace_file->flags &
+	     (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
+	    (entry = this_cpu_read(trace_buffered_event))) {
+		/* Try to use the per cpu buffer first */
+		val = this_cpu_inc_return(trace_buffered_event_cnt);
+		if (val == 1) {
+			trace_event_setup(entry, type, flags, pc);
+			entry->array[0] = len;
+			return entry;
+		}
+		this_cpu_dec(trace_buffered_event_cnt);
+	}
+
 	entry = trace_buffer_lock_reserve(*current_rb,
 					 type, len, flags, pc);
 	/*
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 10156a09103f..5167c366d6b7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1083,6 +1083,23 @@ static inline void trace_buffer_unlock_commit(struct trace_array *tr,
 	trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL);
 }
 
+DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event);
+DECLARE_PER_CPU(int, trace_buffered_event_cnt);
+void trace_buffered_event_disable(void);
+void trace_buffered_event_enable(void);
+
+static inline void
+__trace_event_discard_commit(struct ring_buffer *buffer,
+			     struct ring_buffer_event *event)
+{
+	if (this_cpu_read(trace_buffered_event) == event) {
+		/* Simply release the temp buffer */
+		this_cpu_dec(trace_buffered_event_cnt);
+		return;
+	}
+	ring_buffer_discard_commit(buffer, event);
+}
+
 /*
  * Helper function for event_trigger_unlock_commit{_regs}().
  * If there are event triggers attached to this event that requires
@@ -1111,7 +1128,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
 	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
 	    (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
 	     !filter_match_preds(file->filter, entry))) {
-		ring_buffer_discard_commit(buffer, event);
+		__trace_event_discard_commit(buffer, event);
 		return true;
 	}
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index da1eeb6190e3..4d006707b947 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -363,6 +363,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 {
 	struct trace_event_call *call = file->event_call;
 	struct trace_array *tr = file->tr;
+	unsigned long file_flags = file->flags;
 	int ret = 0;
 	int disable;
 
@@ -445,6 +446,15 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
 		break;
 	}
 
+	/* Enable or disable use of trace_buffered_event */
+	if ((file_flags & EVENT_FILE_FL_SOFT_DISABLED) !=
+	    (file->flags & EVENT_FILE_FL_SOFT_DISABLED)) {
+		if (file->flags & EVENT_FILE_FL_SOFT_DISABLED)
+			trace_buffered_event_enable();
+		else
+			trace_buffered_event_disable();
+	}
+
 	return ret;
 }
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d1d27bf37a19..9daa9b3bc6d9 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -823,7 +823,12 @@ static void __free_preds(struct event_filter *filter)
 
 static void filter_disable(struct trace_event_file *file)
 {
+	unsigned long old_flags = file->flags;
+
 	file->flags &= ~EVENT_FILE_FL_FILTERED;
+
+	if (old_flags != file->flags)
+		trace_buffered_event_disable();
 }
 
 static void __free_filter(struct event_filter *filter)
@@ -1698,7 +1703,12 @@ fail:
 
 static inline void event_set_filtered_flag(struct trace_event_file *file)
 {
+	unsigned long old_flags = file->flags;
+
 	file->flags |= EVENT_FILE_FL_FILTERED;
+
+	if (old_flags != file->flags)
+		trace_buffered_event_enable();
 }
 
 static inline void event_set_filter(struct trace_event_file *file,
-- 
cgit v1.2.3


From 0318bc8a919ded355eaa5078689924a15c1bf52a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 3 May 2016 10:31:51 -0700
Subject: signals/sigaltstack: Report current flag bits in sigaltstack()

sigaltstack()'s reported previous state uses a somewhat odd
convention, but the concept of flag bits is new, and we can do the
flag bits sensibly.  Specifically, let's just report them directly.

This will allow saving and restoring the sigaltstack state using
sigaltstack() to work correctly.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Amanieu d'Antras <amanieu@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Stas Sergeev <stsp@list.ru>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: linux-api@vger.kernel.org
Link: http://lkml.kernel.org/r/94b291ec9fd47741a9264851e316e158ded0b00d.1462296606.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/signal.c                           |  3 ++-
 tools/testing/selftests/sigaltstack/sas.c | 19 ++++++++++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index bf97ea5775ae..ab122a2cee41 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3099,7 +3099,8 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
 
 	oss.ss_sp = (void __user *) current->sas_ss_sp;
 	oss.ss_size = current->sas_ss_size;
-	oss.ss_flags = sas_ss_flags(sp);
+	oss.ss_flags = sas_ss_flags(sp) |
+		(current->sas_ss_flags & SS_FLAG_BITS);
 
 	if (uss) {
 		void __user *ss_sp;
diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c
index a98c3ef8141f..4280d0699792 100644
--- a/tools/testing/selftests/sigaltstack/sas.c
+++ b/tools/testing/selftests/sigaltstack/sas.c
@@ -113,6 +113,19 @@ int main(void)
 		perror("mmap()");
 		return EXIT_FAILURE;
 	}
+
+	err = sigaltstack(NULL, &stk);
+	if (err) {
+		perror("[FAIL]\tsigaltstack()");
+		exit(EXIT_FAILURE);
+	}
+	if (stk.ss_flags == SS_DISABLE) {
+		printf("[OK]\tInitial sigaltstack state was SS_DISABLE\n");
+	} else {
+		printf("[FAIL]\tInitial sigaltstack state was %i; should have been SS_DISABLE\n", stk.ss_flags);
+		return EXIT_FAILURE;
+	}
+
 	stk.ss_sp = sstack;
 	stk.ss_size = SIGSTKSZ;
 	stk.ss_flags = SS_ONSTACK | SS_AUTODISARM;
@@ -151,12 +164,12 @@ int main(void)
 		perror("[FAIL]\tsigaltstack()");
 		exit(EXIT_FAILURE);
 	}
-	if (stk.ss_flags != 0) {
-		printf("[FAIL]\tss_flags=%i, should be 0\n",
+	if (stk.ss_flags != SS_AUTODISARM) {
+		printf("[FAIL]\tss_flags=%i, should be SS_AUTODISARM\n",
 				stk.ss_flags);
 		exit(EXIT_FAILURE);
 	}
-	printf("[OK]\tsigaltstack is enabled after signal\n");
+	printf("[OK]\tsigaltstack is still SS_AUTODISARM after signal\n");
 
 	printf("[OK]\tTest passed\n");
 	return 0;
-- 
cgit v1.2.3


From 470bf1f27a1472264d18c84b324389509f0e30b3 Mon Sep 17 00:00:00 2001
From: Mickaël Salaün <mic@digikod.net>
Date: Thu, 24 Mar 2016 02:46:33 +0100
Subject: seccomp: Fix comment typo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop accidentally repeated word in comment.

Signed-off-by: Mickaël Salaün <mic@digikod.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will Drewry <wad@chromium.org>
---
 kernel/seccomp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e1e5a354854e..6c9bb62ed046 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -915,7 +915,7 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
 
 	fprog = filter->prog->orig_prog;
 	if (!fprog) {
-		/* This must be a new non-cBPF filter, since we save every
+		/* This must be a new non-cBPF filter, since we save
 		 * every cBPF filter's orig_prog above when
 		 * CONFIG_CHECKPOINT_RESTORE is enabled.
 		 */
-- 
cgit v1.2.3


From 3e71a462dd483ce508a723356b293731e7d788ea Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 28 Apr 2016 16:16:33 +0200
Subject: sched/core: Move task_rq_lock() out of line

Its a rather large function, inline doesn't seems to make much sense:

 $ size defconfig-build/kernel/sched/core.o{.orig,}
    text    data     bss     dec     hex filename
   56533   21037    2320   79890   13812 defconfig-build/kernel/sched/core.o.orig
   55733   21037    2320   79090   134f2 defconfig-build/kernel/sched/core.o

The 'perf bench sched messaging' micro-benchmark shows a visible improvement
of 4-5%:

  $ for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor ; do echo performance > $i ; done
  $ perf stat --null --repeat 25 -- perf bench sched messaging -g 40 -l 5000

  pre:
       4.582798193 seconds time elapsed          ( +-  1.41% )
       4.733374877 seconds time elapsed          ( +-  2.10% )
       4.560955136 seconds time elapsed          ( +-  1.43% )
       4.631062303 seconds time elapsed          ( +-  1.40% )

  post:
       4.364765213 seconds time elapsed          ( +-  0.91% )
       4.454442734 seconds time elapsed          ( +-  1.18% )
       4.448893817 seconds time elapsed          ( +-  1.41% )
       4.424346872 seconds time elapsed          ( +-  0.97% )

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h | 67 ++++------------------------------------------------
 2 files changed, 69 insertions(+), 63 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c82ca6eccfec..1b609a886795 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -170,6 +170,71 @@ static struct rq *this_rq_lock(void)
 	return rq;
 }
 
+/*
+ * __task_rq_lock - lock the rq @p resides on.
+ */
+struct rq *__task_rq_lock(struct task_struct *p)
+	__acquires(rq->lock)
+{
+	struct rq *rq;
+
+	lockdep_assert_held(&p->pi_lock);
+
+	for (;;) {
+		rq = task_rq(p);
+		raw_spin_lock(&rq->lock);
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+			lockdep_pin_lock(&rq->lock);
+			return rq;
+		}
+		raw_spin_unlock(&rq->lock);
+
+		while (unlikely(task_on_rq_migrating(p)))
+			cpu_relax();
+	}
+}
+
+/*
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
+ */
+struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(p->pi_lock)
+	__acquires(rq->lock)
+{
+	struct rq *rq;
+
+	for (;;) {
+		raw_spin_lock_irqsave(&p->pi_lock, *flags);
+		rq = task_rq(p);
+		raw_spin_lock(&rq->lock);
+		/*
+		 *	move_queued_task()		task_rq_lock()
+		 *
+		 *	ACQUIRE (rq->lock)
+		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
+		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
+		 *	[S] ->cpu = new_cpu		[L] task_rq()
+		 *					[L] ->on_rq
+		 *	RELEASE (rq->lock)
+		 *
+		 * If we observe the old cpu in task_rq_lock, the acquire of
+		 * the old rq->lock will fully serialize against the stores.
+		 *
+		 * If we observe the new cpu in task_rq_lock, the acquire will
+		 * pair with the WMB to ensure we must then also see migrating.
+		 */
+		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+			lockdep_pin_lock(&rq->lock);
+			return rq;
+		}
+		raw_spin_unlock(&rq->lock);
+		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+
+		while (unlikely(task_on_rq_migrating(p)))
+			cpu_relax();
+	}
+}
+
 #ifdef CONFIG_SCHED_HRTICK
 /*
  * Use HR-timers to deliver accurate preemption points.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 69da6fcaa0e8..aab4cf05d48a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1451,70 +1451,11 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
 static inline void sched_avg_update(struct rq *rq) { }
 #endif
 
-/*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
-	__acquires(rq->lock)
-{
-	struct rq *rq;
-
-	lockdep_assert_held(&p->pi_lock);
-
-	for (;;) {
-		rq = task_rq(p);
-		raw_spin_lock(&rq->lock);
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
-			lockdep_pin_lock(&rq->lock);
-			return rq;
-		}
-		raw_spin_unlock(&rq->lock);
-
-		while (unlikely(task_on_rq_migrating(p)))
-			cpu_relax();
-	}
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+struct rq *__task_rq_lock(struct task_struct *p)
+	__acquires(rq->lock);
+struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	__acquires(p->pi_lock)
-	__acquires(rq->lock)
-{
-	struct rq *rq;
-
-	for (;;) {
-		raw_spin_lock_irqsave(&p->pi_lock, *flags);
-		rq = task_rq(p);
-		raw_spin_lock(&rq->lock);
-		/*
-		 *	move_queued_task()		task_rq_lock()
-		 *
-		 *	ACQUIRE (rq->lock)
-		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
-		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
-		 *	[S] ->cpu = new_cpu		[L] task_rq()
-		 *					[L] ->on_rq
-		 *	RELEASE (rq->lock)
-		 *
-		 * If we observe the old cpu in task_rq_lock, the acquire of
-		 * the old rq->lock will fully serialize against the stores.
-		 *
-		 * If we observe the new cpu in task_rq_lock, the acquire will
-		 * pair with the WMB to ensure we must then also see migrating.
-		 */
-		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
-			lockdep_pin_lock(&rq->lock);
-			return rq;
-		}
-		raw_spin_unlock(&rq->lock);
-		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
-		while (unlikely(task_on_rq_migrating(p)))
-			cpu_relax();
-	}
-}
+	__acquires(rq->lock);
 
 static inline void __task_rq_unlock(struct rq *rq)
 	__releases(rq->lock)
-- 
cgit v1.2.3


From eb58075149b7f0300ff19142e6245fe75db2a081 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 31 Jul 2015 21:28:18 +0200
Subject: sched/core: Introduce 'struct rq_flags'

In order to be able to pass around more than just the IRQ flags in the
future, add a rq_flags structure.

No difference in code generation for the x86_64-defconfig build I
tested.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c     | 98 +++++++++++++++++++++++++------------------------
 kernel/sched/deadline.c |  6 +--
 kernel/sched/sched.h    | 14 ++++---
 3 files changed, 62 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1b609a886795..7c7db60115b4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -173,7 +173,7 @@ static struct rq *this_rq_lock(void)
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
-struct rq *__task_rq_lock(struct task_struct *p)
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
@@ -197,14 +197,14 @@ struct rq *__task_rq_lock(struct task_struct *p)
 /*
  * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
  */
-struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 	__acquires(p->pi_lock)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 
 	for (;;) {
-		raw_spin_lock_irqsave(&p->pi_lock, *flags);
+		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		/*
@@ -228,7 +228,7 @@ struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 			return rq;
 		}
 		raw_spin_unlock(&rq->lock);
-		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 
 		while (unlikely(task_on_rq_migrating(p)))
 			cpu_relax();
@@ -1150,12 +1150,12 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 static int __set_cpus_allowed_ptr(struct task_struct *p,
 				  const struct cpumask *new_mask, bool check)
 {
-	unsigned long flags;
-	struct rq *rq;
 	unsigned int dest_cpu;
+	struct rq_flags rf;
+	struct rq *rq;
 	int ret = 0;
 
-	rq = task_rq_lock(p, &flags);
+	rq = task_rq_lock(p, &rf);
 
 	/*
 	 * Must re-check here, to close a race against __kthread_bind(),
@@ -1184,7 +1184,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rq, p, &flags);
+		task_rq_unlock(rq, p, &rf);
 		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 		tlb_migrate_finish(p->mm);
 		return 0;
@@ -1198,7 +1198,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 		lockdep_pin_lock(&rq->lock);
 	}
 out:
-	task_rq_unlock(rq, p, &flags);
+	task_rq_unlock(rq, p, &rf);
 
 	return ret;
 }
@@ -1382,8 +1382,8 @@ out:
  */
 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 {
-	unsigned long flags;
 	int running, queued;
+	struct rq_flags rf;
 	unsigned long ncsw;
 	struct rq *rq;
 
@@ -1418,14 +1418,14 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		 * lock now, to be *sure*. If we're wrong, we'll
 		 * just go back and repeat.
 		 */
-		rq = task_rq_lock(p, &flags);
+		rq = task_rq_lock(p, &rf);
 		trace_sched_wait_task(p);
 		running = task_running(rq, p);
 		queued = task_on_rq_queued(p);
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-		task_rq_unlock(rq, p, &flags);
+		task_rq_unlock(rq, p, &rf);
 
 		/*
 		 * If it changed from the expected state, bail out now.
@@ -1723,17 +1723,18 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
  */
 static int ttwu_remote(struct task_struct *p, int wake_flags)
 {
+	struct rq_flags rf;
 	struct rq *rq;
 	int ret = 0;
 
-	rq = __task_rq_lock(p);
+	rq = __task_rq_lock(p, &rf);
 	if (task_on_rq_queued(p)) {
 		/* check_preempt_curr() may use rq clock */
 		update_rq_clock(rq);
 		ttwu_do_wakeup(rq, p, wake_flags);
 		ret = 1;
 	}
-	__task_rq_unlock(rq);
+	__task_rq_unlock(rq, &rf);
 
 	return ret;
 }
@@ -2486,12 +2487,12 @@ extern void init_dl_bw(struct dl_bw *dl_b);
  */
 void wake_up_new_task(struct task_struct *p)
 {
-	unsigned long flags;
+	struct rq_flags rf;
 	struct rq *rq;
 
-	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	/* Initialize new task's runnable average */
 	init_entity_runnable_average(&p->se);
+	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
 #ifdef CONFIG_SMP
 	/*
 	 * Fork balancing, do it here and not earlier because:
@@ -2503,7 +2504,7 @@ void wake_up_new_task(struct task_struct *p)
 	/* Post initialize new task's util average when its cfs_rq is set */
 	post_init_entity_util_avg(&p->se);
 
-	rq = __task_rq_lock(p);
+	rq = __task_rq_lock(p, &rf);
 	activate_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
 	trace_sched_wakeup_new(p);
@@ -2519,7 +2520,7 @@ void wake_up_new_task(struct task_struct *p)
 		lockdep_pin_lock(&rq->lock);
 	}
 #endif
-	task_rq_unlock(rq, p, &flags);
+	task_rq_unlock(rq, p, &rf);
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2935,7 +2936,7 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
  */
 unsigned long long task_sched_runtime(struct task_struct *p)
 {
-	unsigned long flags;
+	struct rq_flags rf;
 	struct rq *rq;
 	u64 ns;
 
@@ -2955,7 +2956,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 		return p->se.sum_exec_runtime;
 #endif
 
-	rq = task_rq_lock(p, &flags);
+	rq = task_rq_lock(p, &rf);
 	/*
 	 * Must be ->curr _and_ ->on_rq.  If dequeued, we would
 	 * project cycles that may never be accounted to this
@@ -2966,7 +2967,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 		p->sched_class->update_curr(rq);
 	}
 	ns = p->se.sum_exec_runtime;
-	task_rq_unlock(rq, p, &flags);
+	task_rq_unlock(rq, p, &rf);
 
 	return ns;
 }
@@ -3524,12 +3525,13 @@ EXPORT_SYMBOL(default_wake_function);
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
 	int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
-	struct rq *rq;
 	const struct sched_class *prev_class;
+	struct rq_flags rf;
+	struct rq *rq;
 
 	BUG_ON(prio > MAX_PRIO);
 
-	rq = __task_rq_lock(p);
+	rq = __task_rq_lock(p, &rf);
 
 	/*
 	 * Idle task boosting is a nono in general. There is one
@@ -3605,7 +3607,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
 	preempt_disable(); /* avoid rq from going away on us */
-	__task_rq_unlock(rq);
+	__task_rq_unlock(rq, &rf);
 
 	balance_callback(rq);
 	preempt_enable();
@@ -3615,7 +3617,7 @@ out_unlock:
 void set_user_nice(struct task_struct *p, long nice)
 {
 	int old_prio, delta, queued;
-	unsigned long flags;
+	struct rq_flags rf;
 	struct rq *rq;
 
 	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
@@ -3624,7 +3626,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	 * We have to be careful, if called from sys_setpriority(),
 	 * the task might be in the middle of scheduling on another CPU.
 	 */
-	rq = task_rq_lock(p, &flags);
+	rq = task_rq_lock(p, &rf);
 	/*
 	 * The RT priorities are set via sched_setscheduler(), but we still
 	 * allow the 'normal' nice value to be set - but as expected
@@ -3655,7 +3657,7 @@ void set_user_nice(struct task_struct *p, long nice)
 			resched_curr(rq);
 	}
 out_unlock:
-	task_rq_unlock(rq, p, &flags);
+	task_rq_unlock(rq, p, &rf);
 }
 EXPORT_SYMBOL(set_user_nice);
 
@@ -3952,11 +3954,11 @@ static int __sched_setscheduler(struct task_struct *p,
 		      MAX_RT_PRIO - 1 - attr->sched_priority;
 	int retval, oldprio, oldpolicy = -1, queued, running;
 	int new_effective_prio, policy = attr->sched_policy;
-	unsigned long flags;
 	const struct sched_class *prev_class;
-	struct rq *rq;
+	struct rq_flags rf;
 	int reset_on_fork;
 	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+	struct rq *rq;
 
 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
@@ -4051,13 +4053,13 @@ recheck:
 	 * To be able to change p->policy safely, the appropriate
 	 * runqueue lock must be held.
 	 */
-	rq = task_rq_lock(p, &flags);
+	rq = task_rq_lock(p, &rf);
 
 	/*
 	 * Changing the policy of the stop threads its a very bad idea
 	 */
 	if (p == rq->stop) {
-		task_rq_unlock(rq, p, &flags);
+		task_rq_unlock(rq, p, &rf);
 		return -EINVAL;
 	}
 
@@ -4074,7 +4076,7 @@ recheck:
 			goto change;
 
 		p->sched_reset_on_fork = reset_on_fork;
-		task_rq_unlock(rq, p, &flags);
+		task_rq_unlock(rq, p, &rf);
 		return 0;
 	}
 change:
@@ -4088,7 +4090,7 @@ change:
 		if (rt_bandwidth_enabled() && rt_policy(policy) &&
 				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
 				!task_group_is_autogroup(task_group(p))) {
-			task_rq_unlock(rq, p, &flags);
+			task_rq_unlock(rq, p, &rf);
 			return -EPERM;
 		}
 #endif
@@ -4103,7 +4105,7 @@ change:
 			 */
 			if (!cpumask_subset(span, &p->cpus_allowed) ||
 			    rq->rd->dl_bw.bw == 0) {
-				task_rq_unlock(rq, p, &flags);
+				task_rq_unlock(rq, p, &rf);
 				return -EPERM;
 			}
 		}
@@ -4113,7 +4115,7 @@ change:
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
 		policy = oldpolicy = -1;
-		task_rq_unlock(rq, p, &flags);
+		task_rq_unlock(rq, p, &rf);
 		goto recheck;
 	}
 
@@ -4123,7 +4125,7 @@ change:
 	 * is available.
 	 */
 	if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
-		task_rq_unlock(rq, p, &flags);
+		task_rq_unlock(rq, p, &rf);
 		return -EBUSY;
 	}
 
@@ -4168,7 +4170,7 @@ change:
 
 	check_class_changed(rq, p, prev_class, oldprio);
 	preempt_disable(); /* avoid rq from going away on us */
-	task_rq_unlock(rq, p, &flags);
+	task_rq_unlock(rq, p, &rf);
 
 	if (pi)
 		rt_mutex_adjust_pi(p);
@@ -5021,10 +5023,10 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 {
 	struct task_struct *p;
 	unsigned int time_slice;
-	unsigned long flags;
+	struct rq_flags rf;
+	struct timespec t;
 	struct rq *rq;
 	int retval;
-	struct timespec t;
 
 	if (pid < 0)
 		return -EINVAL;
@@ -5039,11 +5041,11 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 	if (retval)
 		goto out_unlock;
 
-	rq = task_rq_lock(p, &flags);
+	rq = task_rq_lock(p, &rf);
 	time_slice = 0;
 	if (p->sched_class->get_rr_interval)
 		time_slice = p->sched_class->get_rr_interval(rq, p);
-	task_rq_unlock(rq, p, &flags);
+	task_rq_unlock(rq, p, &rf);
 
 	rcu_read_unlock();
 	jiffies_to_timespec(time_slice, &t);
@@ -5307,11 +5309,11 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
  */
 void sched_setnuma(struct task_struct *p, int nid)
 {
-	struct rq *rq;
-	unsigned long flags;
 	bool queued, running;
+	struct rq_flags rf;
+	struct rq *rq;
 
-	rq = task_rq_lock(p, &flags);
+	rq = task_rq_lock(p, &rf);
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 
@@ -5326,7 +5328,7 @@ void sched_setnuma(struct task_struct *p, int nid)
 		p->sched_class->set_curr_task(rq);
 	if (queued)
 		enqueue_task(rq, p, ENQUEUE_RESTORE);
-	task_rq_unlock(rq, p, &flags);
+	task_rq_unlock(rq, p, &rf);
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
@@ -7757,10 +7759,10 @@ void sched_move_task(struct task_struct *tsk)
 {
 	struct task_group *tg;
 	int queued, running;
-	unsigned long flags;
+	struct rq_flags rf;
 	struct rq *rq;
 
-	rq = task_rq_lock(tsk, &flags);
+	rq = task_rq_lock(tsk, &rf);
 
 	running = task_current(rq, tsk);
 	queued = task_on_rq_queued(tsk);
@@ -7792,7 +7794,7 @@ void sched_move_task(struct task_struct *tsk)
 	if (queued)
 		enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
 
-	task_rq_unlock(rq, tsk, &flags);
+	task_rq_unlock(rq, tsk, &rf);
 }
 #endif /* CONFIG_CGROUP_SCHED */
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 8f9b5af4e857..738e3c84dfe1 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -591,10 +591,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 						     struct sched_dl_entity,
 						     dl_timer);
 	struct task_struct *p = dl_task_of(dl_se);
-	unsigned long flags;
+	struct rq_flags rf;
 	struct rq *rq;
 
-	rq = task_rq_lock(p, &flags);
+	rq = task_rq_lock(p, &rf);
 
 	/*
 	 * The task might have changed its scheduling policy to something
@@ -677,7 +677,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 #endif
 
 unlock:
-	task_rq_unlock(rq, p, &flags);
+	task_rq_unlock(rq, p, &rf);
 
 	/*
 	 * This can free the task_struct, including this hrtimer, do not touch
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index aab4cf05d48a..a5eecb1e5e4b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1451,13 +1451,17 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
 static inline void sched_avg_update(struct rq *rq) { }
 #endif
 
-struct rq *__task_rq_lock(struct task_struct *p)
+struct rq_flags {
+	unsigned long flags;
+};
+
+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 	__acquires(rq->lock);
-struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 	__acquires(p->pi_lock)
 	__acquires(rq->lock);
 
-static inline void __task_rq_unlock(struct rq *rq)
+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
 	__releases(rq->lock)
 {
 	lockdep_unpin_lock(&rq->lock);
@@ -1465,13 +1469,13 @@ static inline void __task_rq_unlock(struct rq *rq)
 }
 
 static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
 	__releases(rq->lock)
 	__releases(p->pi_lock)
 {
 	lockdep_unpin_lock(&rq->lock);
 	raw_spin_unlock(&rq->lock);
-	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 }
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From e7904a28f5331c21d17af638cb477c83662e3cb6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 1 Aug 2015 19:25:08 +0200
Subject: locking/lockdep, sched/core: Implement a better lock pinning scheme

The problem with the existing lock pinning is that each pin is of
value 1; this mean you can simply unpin if you know its pinned,
without having any extra information.

This scheme generates a random (16 bit) cookie for each pin and
requires this same cookie to unpin. This means you have to keep the
cookie in context.

No objsize difference for !LOCKDEP kernels.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h  | 23 ++++++++++----
 kernel/locking/lockdep.c | 71 +++++++++++++++++++++++++++++++++++++------
 kernel/sched/core.c      | 79 +++++++++++++++++++++++++-----------------------
 kernel/sched/deadline.c  | 11 ++++---
 kernel/sched/fair.c      |  6 ++--
 kernel/sched/idle_task.c |  2 +-
 kernel/sched/rt.c        |  6 ++--
 kernel/sched/sched.h     |  8 +++--
 kernel/sched/stop_task.c |  2 +-
 9 files changed, 140 insertions(+), 68 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index d026b190c530..cf9bf9612702 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -354,8 +354,13 @@ extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
 extern void lockdep_clear_current_reclaim_state(void);
 extern void lockdep_trace_alloc(gfp_t mask);
 
-extern void lock_pin_lock(struct lockdep_map *lock);
-extern void lock_unpin_lock(struct lockdep_map *lock);
+struct pin_cookie { unsigned int val; };
+
+#define NIL_COOKIE (struct pin_cookie){ .val = 0U, }
+
+extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock);
+extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie);
+extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
 
 # define INIT_LOCKDEP				.lockdep_recursion = 0, .lockdep_reclaim_gfp = 0,
 
@@ -371,8 +376,9 @@ extern void lock_unpin_lock(struct lockdep_map *lock);
 
 #define lockdep_recursing(tsk)	((tsk)->lockdep_recursion)
 
-#define lockdep_pin_lock(l)		lock_pin_lock(&(l)->dep_map)
-#define lockdep_unpin_lock(l)	lock_unpin_lock(&(l)->dep_map)
+#define lockdep_pin_lock(l)	lock_pin_lock(&(l)->dep_map)
+#define lockdep_repin_lock(l,c)	lock_repin_lock(&(l)->dep_map, (c))
+#define lockdep_unpin_lock(l,c)	lock_unpin_lock(&(l)->dep_map, (c))
 
 #else /* !CONFIG_LOCKDEP */
 
@@ -425,8 +431,13 @@ struct lock_class_key { };
 
 #define lockdep_recursing(tsk)			(0)
 
-#define lockdep_pin_lock(l)				do { (void)(l); } while (0)
-#define lockdep_unpin_lock(l)			do { (void)(l); } while (0)
+struct pin_cookie { };
+
+#define NIL_COOKIE (struct pin_cookie){ }
+
+#define lockdep_pin_lock(l)			({ struct pin_cookie cookie; cookie; })
+#define lockdep_repin_lock(l, c)		do { (void)(l); (void)(c); } while (0)
+#define lockdep_unpin_lock(l, c)		do { (void)(l); (void)(c); } while (0)
 
 #endif /* !LOCKDEP */
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index ed9410936a22..d7f94f4c811d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -45,6 +45,7 @@
 #include <linux/bitops.h>
 #include <linux/gfp.h>
 #include <linux/kmemcheck.h>
+#include <linux/random.h>
 
 #include <asm/sections.h>
 
@@ -3554,7 +3555,35 @@ static int __lock_is_held(struct lockdep_map *lock)
 	return 0;
 }
 
-static void __lock_pin_lock(struct lockdep_map *lock)
+static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
+{
+	struct pin_cookie cookie = NIL_COOKIE;
+	struct task_struct *curr = current;
+	int i;
+
+	if (unlikely(!debug_locks))
+		return cookie;
+
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		struct held_lock *hlock = curr->held_locks + i;
+
+		if (match_held_lock(hlock, lock)) {
+			/*
+			 * Grab 16bits of randomness; this is sufficient to not
+			 * be guessable and still allows some pin nesting in
+			 * our u32 pin_count.
+			 */
+			cookie.val = 1 + (prandom_u32() >> 16);
+			hlock->pin_count += cookie.val;
+			return cookie;
+		}
+	}
+
+	WARN(1, "pinning an unheld lock\n");
+	return cookie;
+}
+
+static void __lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
 {
 	struct task_struct *curr = current;
 	int i;
@@ -3566,7 +3595,7 @@ static void __lock_pin_lock(struct lockdep_map *lock)
 		struct held_lock *hlock = curr->held_locks + i;
 
 		if (match_held_lock(hlock, lock)) {
-			hlock->pin_count++;
+			hlock->pin_count += cookie.val;
 			return;
 		}
 	}
@@ -3574,7 +3603,7 @@ static void __lock_pin_lock(struct lockdep_map *lock)
 	WARN(1, "pinning an unheld lock\n");
 }
 
-static void __lock_unpin_lock(struct lockdep_map *lock)
+static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
 {
 	struct task_struct *curr = current;
 	int i;
@@ -3589,7 +3618,11 @@ static void __lock_unpin_lock(struct lockdep_map *lock)
 			if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n"))
 				return;
 
-			hlock->pin_count--;
+			hlock->pin_count -= cookie.val;
+
+			if (WARN((int)hlock->pin_count < 0, "pin count corrupted\n"))
+				hlock->pin_count = 0;
+
 			return;
 		}
 	}
@@ -3720,24 +3753,44 @@ int lock_is_held(struct lockdep_map *lock)
 }
 EXPORT_SYMBOL_GPL(lock_is_held);
 
-void lock_pin_lock(struct lockdep_map *lock)
+struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
 {
+	struct pin_cookie cookie = NIL_COOKIE;
 	unsigned long flags;
 
 	if (unlikely(current->lockdep_recursion))
-		return;
+		return cookie;
 
 	raw_local_irq_save(flags);
 	check_flags(flags);
 
 	current->lockdep_recursion = 1;
-	__lock_pin_lock(lock);
+	cookie = __lock_pin_lock(lock);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
+
+	return cookie;
 }
 EXPORT_SYMBOL_GPL(lock_pin_lock);
 
-void lock_unpin_lock(struct lockdep_map *lock)
+void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	__lock_repin_lock(lock, cookie);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_repin_lock);
+
+void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
 {
 	unsigned long flags;
 
@@ -3748,7 +3801,7 @@ void lock_unpin_lock(struct lockdep_map *lock)
 	check_flags(flags);
 
 	current->lockdep_recursion = 1;
-	__lock_unpin_lock(lock);
+	__lock_unpin_lock(lock, cookie);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7c7db60115b4..71c5a753e6e9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -184,7 +184,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
-			lockdep_pin_lock(&rq->lock);
+			rf->cookie = lockdep_pin_lock(&rq->lock);
 			return rq;
 		}
 		raw_spin_unlock(&rq->lock);
@@ -224,7 +224,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 		 * pair with the WMB to ensure we must then also see migrating.
 		 */
 		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
-			lockdep_pin_lock(&rq->lock);
+			rf->cookie = lockdep_pin_lock(&rq->lock);
 			return rq;
 		}
 		raw_spin_unlock(&rq->lock);
@@ -1193,9 +1193,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 		 * OK, since we're going to drop the lock immediately
 		 * afterwards anyway.
 		 */
-		lockdep_unpin_lock(&rq->lock);
+		lockdep_unpin_lock(&rq->lock, rf.cookie);
 		rq = move_queued_task(rq, p, dest_cpu);
-		lockdep_pin_lock(&rq->lock);
+		lockdep_repin_lock(&rq->lock, rf.cookie);
 	}
 out:
 	task_rq_unlock(rq, p, &rf);
@@ -1669,8 +1669,8 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl
 /*
  * Mark the task runnable and perform wakeup-preemption.
  */
-static void
-ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
+			   struct pin_cookie cookie)
 {
 	check_preempt_curr(rq, p, wake_flags);
 	p->state = TASK_RUNNING;
@@ -1682,9 +1682,9 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 		 * Our task @p is fully woken up and running; so its safe to
 		 * drop the rq->lock, hereafter rq is only used for statistics.
 		 */
-		lockdep_unpin_lock(&rq->lock);
+		lockdep_unpin_lock(&rq->lock, cookie);
 		p->sched_class->task_woken(rq, p);
-		lockdep_pin_lock(&rq->lock);
+		lockdep_repin_lock(&rq->lock, cookie);
 	}
 
 	if (rq->idle_stamp) {
@@ -1702,7 +1702,8 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 }
 
 static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+		 struct pin_cookie cookie)
 {
 	lockdep_assert_held(&rq->lock);
 
@@ -1712,7 +1713,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
 #endif
 
 	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
-	ttwu_do_wakeup(rq, p, wake_flags);
+	ttwu_do_wakeup(rq, p, wake_flags, cookie);
 }
 
 /*
@@ -1731,7 +1732,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 	if (task_on_rq_queued(p)) {
 		/* check_preempt_curr() may use rq clock */
 		update_rq_clock(rq);
-		ttwu_do_wakeup(rq, p, wake_flags);
+		ttwu_do_wakeup(rq, p, wake_flags, rf.cookie);
 		ret = 1;
 	}
 	__task_rq_unlock(rq, &rf);
@@ -1744,6 +1745,7 @@ void sched_ttwu_pending(void)
 {
 	struct rq *rq = this_rq();
 	struct llist_node *llist = llist_del_all(&rq->wake_list);
+	struct pin_cookie cookie;
 	struct task_struct *p;
 	unsigned long flags;
 
@@ -1751,15 +1753,15 @@ void sched_ttwu_pending(void)
 		return;
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
-	lockdep_pin_lock(&rq->lock);
+	cookie = lockdep_pin_lock(&rq->lock);
 
 	while (llist) {
 		p = llist_entry(llist, struct task_struct, wake_entry);
 		llist = llist_next(llist);
-		ttwu_do_activate(rq, p, 0);
+		ttwu_do_activate(rq, p, 0, cookie);
 	}
 
-	lockdep_unpin_lock(&rq->lock);
+	lockdep_unpin_lock(&rq->lock, cookie);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -1846,6 +1848,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 static void ttwu_queue(struct task_struct *p, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
+	struct pin_cookie cookie;
 
 #if defined(CONFIG_SMP)
 	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
@@ -1856,9 +1859,9 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 #endif
 
 	raw_spin_lock(&rq->lock);
-	lockdep_pin_lock(&rq->lock);
-	ttwu_do_activate(rq, p, 0);
-	lockdep_unpin_lock(&rq->lock);
+	cookie = lockdep_pin_lock(&rq->lock);
+	ttwu_do_activate(rq, p, 0, cookie);
+	lockdep_unpin_lock(&rq->lock, cookie);
 	raw_spin_unlock(&rq->lock);
 }
 
@@ -2055,7 +2058,7 @@ out:
  * ensure that this_rq() is locked, @p is bound to this_rq() and not
  * the current task.
  */
-static void try_to_wake_up_local(struct task_struct *p)
+static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie)
 {
 	struct rq *rq = task_rq(p);
 
@@ -2072,11 +2075,11 @@ static void try_to_wake_up_local(struct task_struct *p)
 		 * disabled avoiding further scheduler activity on it and we've
 		 * not yet picked a replacement task.
 		 */
-		lockdep_unpin_lock(&rq->lock);
+		lockdep_unpin_lock(&rq->lock, cookie);
 		raw_spin_unlock(&rq->lock);
 		raw_spin_lock(&p->pi_lock);
 		raw_spin_lock(&rq->lock);
-		lockdep_pin_lock(&rq->lock);
+		lockdep_repin_lock(&rq->lock, cookie);
 	}
 
 	if (!(p->state & TASK_NORMAL))
@@ -2087,7 +2090,7 @@ static void try_to_wake_up_local(struct task_struct *p)
 	if (!task_on_rq_queued(p))
 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
 
-	ttwu_do_wakeup(rq, p, 0);
+	ttwu_do_wakeup(rq, p, 0, cookie);
 	if (schedstat_enabled())
 		ttwu_stat(p, smp_processor_id(), 0);
 out:
@@ -2515,9 +2518,9 @@ void wake_up_new_task(struct task_struct *p)
 		 * Nothing relies on rq->lock after this, so its fine to
 		 * drop it.
 		 */
-		lockdep_unpin_lock(&rq->lock);
+		lockdep_unpin_lock(&rq->lock, rf.cookie);
 		p->sched_class->task_woken(rq, p);
-		lockdep_pin_lock(&rq->lock);
+		lockdep_repin_lock(&rq->lock, rf.cookie);
 	}
 #endif
 	task_rq_unlock(rq, p, &rf);
@@ -2782,7 +2785,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
  */
 static __always_inline struct rq *
 context_switch(struct rq *rq, struct task_struct *prev,
-	       struct task_struct *next)
+	       struct task_struct *next, struct pin_cookie cookie)
 {
 	struct mm_struct *mm, *oldmm;
 
@@ -2814,7 +2817,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * of the scheduler it's an obvious special-case), so we
 	 * do an early lockdep release here:
 	 */
-	lockdep_unpin_lock(&rq->lock);
+	lockdep_unpin_lock(&rq->lock, cookie);
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 
 	/* Here we just switch the register state and the stack. */
@@ -3154,7 +3157,7 @@ static inline void schedule_debug(struct task_struct *prev)
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
 {
 	const struct sched_class *class = &fair_sched_class;
 	struct task_struct *p;
@@ -3165,20 +3168,20 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
 	 */
 	if (likely(prev->sched_class == class &&
 		   rq->nr_running == rq->cfs.h_nr_running)) {
-		p = fair_sched_class.pick_next_task(rq, prev);
+		p = fair_sched_class.pick_next_task(rq, prev, cookie);
 		if (unlikely(p == RETRY_TASK))
 			goto again;
 
 		/* assumes fair_sched_class->next == idle_sched_class */
 		if (unlikely(!p))
-			p = idle_sched_class.pick_next_task(rq, prev);
+			p = idle_sched_class.pick_next_task(rq, prev, cookie);
 
 		return p;
 	}
 
 again:
 	for_each_class(class) {
-		p = class->pick_next_task(rq, prev);
+		p = class->pick_next_task(rq, prev, cookie);
 		if (p) {
 			if (unlikely(p == RETRY_TASK))
 				goto again;
@@ -3232,6 +3235,7 @@ static void __sched notrace __schedule(bool preempt)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
+	struct pin_cookie cookie;
 	struct rq *rq;
 	int cpu;
 
@@ -3265,7 +3269,7 @@ static void __sched notrace __schedule(bool preempt)
 	 */
 	smp_mb__before_spinlock();
 	raw_spin_lock(&rq->lock);
-	lockdep_pin_lock(&rq->lock);
+	cookie = lockdep_pin_lock(&rq->lock);
 
 	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
 
@@ -3287,7 +3291,7 @@ static void __sched notrace __schedule(bool preempt)
 
 				to_wakeup = wq_worker_sleeping(prev);
 				if (to_wakeup)
-					try_to_wake_up_local(to_wakeup);
+					try_to_wake_up_local(to_wakeup, cookie);
 			}
 		}
 		switch_count = &prev->nvcsw;
@@ -3296,7 +3300,7 @@ static void __sched notrace __schedule(bool preempt)
 	if (task_on_rq_queued(prev))
 		update_rq_clock(rq);
 
-	next = pick_next_task(rq, prev);
+	next = pick_next_task(rq, prev, cookie);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 	rq->clock_skip_update = 0;
@@ -3307,9 +3311,9 @@ static void __sched notrace __schedule(bool preempt)
 		++*switch_count;
 
 		trace_sched_switch(preempt, prev, next);
-		rq = context_switch(rq, prev, next); /* unlocks the rq */
+		rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */
 	} else {
-		lockdep_unpin_lock(&rq->lock);
+		lockdep_unpin_lock(&rq->lock, cookie);
 		raw_spin_unlock_irq(&rq->lock);
 	}
 
@@ -5392,6 +5396,7 @@ static void migrate_tasks(struct rq *dead_rq)
 {
 	struct rq *rq = dead_rq;
 	struct task_struct *next, *stop = rq->stop;
+	struct pin_cookie cookie;
 	int dest_cpu;
 
 	/*
@@ -5423,8 +5428,8 @@ static void migrate_tasks(struct rq *dead_rq)
 		/*
 		 * pick_next_task assumes pinned rq->lock.
 		 */
-		lockdep_pin_lock(&rq->lock);
-		next = pick_next_task(rq, &fake_task);
+		cookie = lockdep_pin_lock(&rq->lock);
+		next = pick_next_task(rq, &fake_task, cookie);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
@@ -5437,7 +5442,7 @@ static void migrate_tasks(struct rq *dead_rq)
 		 * because !cpu_active at this point, which means load-balance
 		 * will not interfere. Also, stop-machine.
 		 */
-		lockdep_unpin_lock(&rq->lock);
+		lockdep_unpin_lock(&rq->lock, cookie);
 		raw_spin_unlock(&rq->lock);
 		raw_spin_lock(&next->pi_lock);
 		raw_spin_lock(&rq->lock);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 738e3c84dfe1..ba53a87bb978 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -670,9 +670,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 		 * Nothing relies on rq->lock after this, so its safe to drop
 		 * rq->lock.
 		 */
-		lockdep_unpin_lock(&rq->lock);
+		lockdep_unpin_lock(&rq->lock, rf.cookie);
 		push_dl_task(rq);
-		lockdep_pin_lock(&rq->lock);
+		lockdep_repin_lock(&rq->lock, rf.cookie);
 	}
 #endif
 
@@ -1125,7 +1125,8 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
 	return rb_entry(left, struct sched_dl_entity, rb_node);
 }
 
-struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
+struct task_struct *
+pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
 {
 	struct sched_dl_entity *dl_se;
 	struct task_struct *p;
@@ -1140,9 +1141,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 		 * disabled avoiding further scheduler activity on it and we're
 		 * being very careful to re-start the picking loop.
 		 */
-		lockdep_unpin_lock(&rq->lock);
+		lockdep_unpin_lock(&rq->lock, cookie);
 		pull_dl_task(rq);
-		lockdep_pin_lock(&rq->lock);
+		lockdep_repin_lock(&rq->lock, cookie);
 		/*
 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
 		 * means a stop task can slip in, in which case we need to
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b8a33abce650..91395e1552ae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5542,7 +5542,7 @@ preempt:
 }
 
 static struct task_struct *
-pick_next_task_fair(struct rq *rq, struct task_struct *prev)
+pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
 {
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
@@ -5655,9 +5655,9 @@ idle:
 	 * further scheduler activity on it and we're being very careful to
 	 * re-start the picking loop.
 	 */
-	lockdep_unpin_lock(&rq->lock);
+	lockdep_unpin_lock(&rq->lock, cookie);
 	new_tasks = idle_balance(rq);
-	lockdep_pin_lock(&rq->lock);
+	lockdep_repin_lock(&rq->lock, cookie);
 	/*
 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
 	 * possible for any higher priority task to appear. In that case we
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 47ce94931f1b..2ce5458bbe1d 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 }
 
 static struct task_struct *
-pick_next_task_idle(struct rq *rq, struct task_struct *prev)
+pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
 {
 	put_prev_task(rq, prev);
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 19e13060fcd5..68deaf901a12 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1524,7 +1524,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 }
 
 static struct task_struct *
-pick_next_task_rt(struct rq *rq, struct task_struct *prev)
+pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
 {
 	struct task_struct *p;
 	struct rt_rq *rt_rq = &rq->rt;
@@ -1536,9 +1536,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 		 * disabled avoiding further scheduler activity on it and we're
 		 * being very careful to re-start the picking loop.
 		 */
-		lockdep_unpin_lock(&rq->lock);
+		lockdep_unpin_lock(&rq->lock, cookie);
 		pull_rt_task(rq);
-		lockdep_pin_lock(&rq->lock);
+		lockdep_repin_lock(&rq->lock, cookie);
 		/*
 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
 		 * means a dl or stop task can slip in, in which case we need
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a5eecb1e5e4b..0b6a838e9e73 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1202,7 +1202,8 @@ struct sched_class {
 	 * tasks.
 	 */
 	struct task_struct * (*pick_next_task) (struct rq *rq,
-						struct task_struct *prev);
+						struct task_struct *prev,
+						struct pin_cookie cookie);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
@@ -1453,6 +1454,7 @@ static inline void sched_avg_update(struct rq *rq) { }
 
 struct rq_flags {
 	unsigned long flags;
+	struct pin_cookie cookie;
 };
 
 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
@@ -1464,7 +1466,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf)
 	__releases(rq->lock)
 {
-	lockdep_unpin_lock(&rq->lock);
+	lockdep_unpin_lock(&rq->lock, rf->cookie);
 	raw_spin_unlock(&rq->lock);
 }
 
@@ -1473,7 +1475,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
 	__releases(rq->lock)
 	__releases(p->pi_lock)
 {
-	lockdep_unpin_lock(&rq->lock);
+	lockdep_unpin_lock(&rq->lock, rf->cookie);
 	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 }
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index cbc67da10954..604297a08b3a 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
 }
 
 static struct task_struct *
-pick_next_task_stop(struct rq *rq, struct task_struct *prev)
+pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
 {
 	struct task_struct *stop = rq->stop;
 
-- 
cgit v1.2.3


From 2159197d66770ec01f75c93fb11dc66df81fd45b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 28 Apr 2016 12:49:38 +0200
Subject: sched/core: Enable increased load resolution on 64-bit kernels

Mike ran into the low load resolution limitation on his big machine.

So reenable these bits; nobody could ever reproduce/analyze the
reported power usage claim and Google has been running with this for
years as well.

Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/sched.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0b6a838e9e73..066a4c2d2695 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -49,11 +49,13 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
  * and does not change the user-interface for setting shares/weights.
  *
  * We increase resolution only if we have enough bits to allow this increased
- * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
- * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
- * increased costs.
+ * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are
+ * pretty high and the returns do not justify the increased costs.
+ *
+ * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to
+ * increase coverage and consistency always enable it on 64bit platforms.
  */
-#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load  */
+#ifdef CONFIG_64BIT
 # define SCHED_LOAD_RESOLUTION	10
 # define scale_load(w)		((w) << SCHED_LOAD_RESOLUTION)
 # define scale_load_down(w)	((w) >> SCHED_LOAD_RESOLUTION)
-- 
cgit v1.2.3


From 6ecdd74962f246dfe8750b7bea481a1c0816315d Mon Sep 17 00:00:00 2001
From: Yuyang Du <yuyang.du@intel.com>
Date: Tue, 5 Apr 2016 12:12:26 +0800
Subject: sched/fair: Generalize the load/util averages resolution definition

Integer metric needs fixed point arithmetic. In sched/fair, a few
metrics, e.g., weight, load, load_avg, util_avg, freq, and capacity,
may have different fixed point ranges, which makes their update and
usage error-prone.

In order to avoid the errors relating to the fixed point range, we
definie a basic fixed point range, and then formalize all metrics to
base on the basic range.

The basic range is 1024 or (1 << 10). Further, one can recursively
apply the basic range to have larger range.

Pointed out by Ben Segall, weight (visible to user, e.g., NICE-0 has
1024) and load (e.g., NICE_0_LOAD) have independent ranges, but they
must be well calibrated.

Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: lizefan@huawei.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1459829551-21625-2-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 16 +++++++++++++---
 kernel/sched/fair.c   |  4 ----
 kernel/sched/sched.h  | 15 ++++++++++-----
 3 files changed, 23 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d894f2d61388..7d779d70a3a5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -936,10 +936,20 @@ enum cpu_idle_type {
 	CPU_MAX_IDLE_TYPES
 };
 
+/*
+ * Integer metrics need fixed point arithmetic, e.g., sched/fair
+ * has a few: load, load_avg, util_avg, freq, and capacity.
+ *
+ * We define a basic fixed point arithmetic range, and then formalize
+ * all these metrics based on that basic range.
+ */
+# define SCHED_FIXEDPOINT_SHIFT	10
+# define SCHED_FIXEDPOINT_SCALE	(1L << SCHED_FIXEDPOINT_SHIFT)
+
 /*
  * Increase resolution of cpu_capacity calculations
  */
-#define SCHED_CAPACITY_SHIFT	10
+#define SCHED_CAPACITY_SHIFT	SCHED_FIXEDPOINT_SHIFT
 #define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)
 
 /*
@@ -1205,8 +1215,8 @@ struct load_weight {
  * 1) load_avg factors frequency scaling into the amount of time that a
  * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
  * aggregated such weights of all runnable and blocked sched_entities.
- * 2) util_avg factors frequency and cpu scaling into the amount of time
- * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
+ * 2) util_avg factors frequency and cpu capacity scaling into the amount of time
+ * that a sched_entity is running on a CPU, in the range [0..SCHED_CAPACITY_SCALE].
  * For cfs_rq, it is the aggregated such times of all runnable and
  * blocked sched_entities.
  * The 64 bit load_sum can:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 91395e1552ae..76ca86e9fc20 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2662,10 +2662,6 @@ static u32 __compute_runnable_contrib(u64 n)
 	return contrib + runnable_avg_yN_sum[n];
 }
 
-#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
-#error "load tracking assumes 2^10 as unit"
-#endif
-
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 066a4c2d2695..ad83361f9e67 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -56,18 +56,23 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
  * increase coverage and consistency always enable it on 64bit platforms.
  */
 #ifdef CONFIG_64BIT
-# define SCHED_LOAD_RESOLUTION	10
-# define scale_load(w)		((w) << SCHED_LOAD_RESOLUTION)
-# define scale_load_down(w)	((w) >> SCHED_LOAD_RESOLUTION)
+# define SCHED_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
+# define scale_load(w)		((w) << SCHED_FIXEDPOINT_SHIFT)
+# define scale_load_down(w)	((w) >> SCHED_FIXEDPOINT_SHIFT)
 #else
-# define SCHED_LOAD_RESOLUTION	0
+# define SCHED_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT)
 # define scale_load(w)		(w)
 # define scale_load_down(w)	(w)
 #endif
 
-#define SCHED_LOAD_SHIFT	(10 + SCHED_LOAD_RESOLUTION)
 #define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
 
+/*
+ * NICE_0's weight (visible to users) and its load (invisible to users) have
+ * independent ranges, but they should be well calibrated. We use scale_load()
+ * and scale_load_down(w) to convert between them, and the following must be true:
+ * scale_load(sched_prio_to_weight[20]) == NICE_0_LOAD
+ */
 #define NICE_0_LOAD		SCHED_LOAD_SCALE
 #define NICE_0_SHIFT		SCHED_LOAD_SHIFT
 
-- 
cgit v1.2.3


From 172895e6b5216eba3e0880460829a8baeefd55f3 Mon Sep 17 00:00:00 2001
From: Yuyang Du <yuyang.du@intel.com>
Date: Tue, 5 Apr 2016 12:12:27 +0800
Subject: sched/fair: Rename SCHED_LOAD_SHIFT to NICE_0_LOAD_SHIFT and remove
 SCHED_LOAD_SCALE

After cleaning up the sched metrics, there are two definitions that are
ambiguous and confusing: SCHED_LOAD_SHIFT and SCHED_LOAD_SHIFT.

Resolve this:

 - Rename SCHED_LOAD_SHIFT to NICE_0_LOAD_SHIFT, which better reflects what
   it is.

 - Replace SCHED_LOAD_SCALE use with SCHED_CAPACITY_SCALE and remove SCHED_LOAD_SCALE.

Suggested-by: Ben Segall <bsegall@google.com>
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dietmar.eggemann@arm.com
Cc: lizefan@huawei.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1459829551-21625-3-git-send-email-yuyang.du@intel.com
[ Rewrote the changelog and fixed the build on 32-bit kernels. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c  |  4 ++--
 kernel/sched/sched.h | 22 +++++++++++-----------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 76ca86e9fc20..e1485710d1ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -719,7 +719,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	struct sched_avg *sa = &se->avg;
-	long cap = (long)(scale_load_down(SCHED_LOAD_SCALE) - cfs_rq->avg.util_avg) / 2;
+	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
 
 	if (cap > 0) {
 		if (cfs_rq->avg.util_avg != 0) {
@@ -7010,7 +7010,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	if (busiest->group_type == group_overloaded &&
 	    local->group_type   == group_overloaded) {
 		load_above_capacity = busiest->sum_nr_running *
-					SCHED_LOAD_SCALE;
+				      scale_load_down(NICE_0_LOAD);
 		if (load_above_capacity > busiest->group_capacity)
 			load_above_capacity -= busiest->group_capacity;
 		else
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ad83361f9e67..d24e91b0a722 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -56,25 +56,25 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
  * increase coverage and consistency always enable it on 64bit platforms.
  */
 #ifdef CONFIG_64BIT
-# define SCHED_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
+# define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
 # define scale_load(w)		((w) << SCHED_FIXEDPOINT_SHIFT)
 # define scale_load_down(w)	((w) >> SCHED_FIXEDPOINT_SHIFT)
 #else
-# define SCHED_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT)
+# define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT)
 # define scale_load(w)		(w)
 # define scale_load_down(w)	(w)
 #endif
 
-#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
-
 /*
- * NICE_0's weight (visible to users) and its load (invisible to users) have
- * independent ranges, but they should be well calibrated. We use scale_load()
- * and scale_load_down(w) to convert between them, and the following must be true:
- * scale_load(sched_prio_to_weight[20]) == NICE_0_LOAD
+ * Task weight (visible to users) and its load (invisible to users) have
+ * independent resolution, but they should be well calibrated. We use
+ * scale_load() and scale_load_down(w) to convert between them. The
+ * following must be true:
+ *
+ *  scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD
+ *
  */
-#define NICE_0_LOAD		SCHED_LOAD_SCALE
-#define NICE_0_SHIFT		SCHED_LOAD_SHIFT
+#define NICE_0_LOAD		(1L << NICE_0_LOAD_SHIFT)
 
 /*
  * Single value that decides SCHED_DEADLINE internal math precision.
@@ -863,7 +863,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_asym);
 struct sched_group_capacity {
 	atomic_t ref;
 	/*
-	 * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
+	 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
 	 * for a single CPU.
 	 */
 	unsigned int capacity;
-- 
cgit v1.2.3


From 7b20b916e953cabef569541f991a0a583bc344cb Mon Sep 17 00:00:00 2001
From: Yuyang Du <yuyang.du@intel.com>
Date: Tue, 3 May 2016 05:54:27 +0800
Subject: sched/fair: Optimize sum computation with a lookup table

__compute_runnable_contrib() uses a loop to compute sum, whereas a
table lookup can do it faster in a constant amount of time.

The program to generate the constants is located at:

  Documentation/scheduler/sched-avg.txt

Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Morten Rasmussen <morten.rasmussen@arm.com>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: juri.lelli@arm.com
Cc: pjt@google.com
Link: http://lkml.kernel.org/r/1462226078-31904-2-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e1485710d1ec..8c381a6d56e3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2602,6 +2602,16 @@ static const u32 runnable_avg_yN_sum[] = {
 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
 };
 
+/*
+ * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
+ * lower integers. See Documentation/scheduler/sched-avg.txt how these
+ * were generated:
+ */
+static const u32 __accumulated_sum_N32[] = {
+	    0, 23371, 35056, 40899, 43820, 45281,
+	46011, 46376, 46559, 46650, 46696, 46719,
+};
+
 /*
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
@@ -2650,14 +2660,9 @@ static u32 __compute_runnable_contrib(u64 n)
 	else if (unlikely(n >= LOAD_AVG_MAX_N))
 		return LOAD_AVG_MAX;
 
-	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
-	do {
-		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
-		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
-
-		n -= LOAD_AVG_PERIOD;
-	} while (n > LOAD_AVG_PERIOD);
-
+	/* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
+	contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
+	n %= LOAD_AVG_PERIOD;
 	contrib = decay_load(contrib, n);
 	return contrib + runnable_avg_yN_sum[n];
 }
-- 
cgit v1.2.3


From db6ea2fb094fb3a6afc36d3e4229bc162638ad24 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Tue, 3 May 2016 12:38:25 +0800
Subject: sched/debug: Print out idle balance values even on !CONFIG_SCHEDSTATS
 kernels

The max_idle_balance_cost and avg_idle values which are tracked and ar used to
capture short idle incidents, are not associated with schedstats, however the
information of these two values isn't printed out on !CONFIG_SCHEDSTATS kernels.

Fix this by moving the value printout out of the CONFIG_SCHEDSTATS section.

Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1462250305-4523-1-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/debug.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4fbc3bd5ff60..cf905f655ba1 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -626,15 +626,16 @@ do {									\
 #undef P
 #undef PN
 
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
-#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
-
 #ifdef CONFIG_SMP
+#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
 	P64(avg_idle);
 	P64(max_idle_balance_cost);
+#undef P64
 #endif
 
+#ifdef CONFIG_SCHEDSTATS
+#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
+
 	if (schedstat_enabled()) {
 		P(yld_count);
 		P(sched_count);
@@ -644,7 +645,6 @@ do {									\
 	}
 
 #undef P
-#undef P64
 #endif
 	spin_lock_irqsave(&sched_debug_lock, flags);
 	print_cfs_stats(m, cpu);
-- 
cgit v1.2.3


From b52fad2db5d792d89975cebf2fe1646a7af28ed0 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Tue, 3 May 2016 20:46:54 +0100
Subject: sched/fair: Update rq clock before updating nohz CPU load

If we're accessing rq_clock() (e.g. in sched_avg_update()) we should
update the rq clock before calling cpu_load_update(), otherwise any
time calculations will be stale.

All other paths currently call update_rq_clock().

Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Wanpeng Li <wanpeng.li@hotmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1462304814-11715-1-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8c381a6d56e3..7a00c7c2dad0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4724,6 +4724,7 @@ void cpu_load_update_nohz_stop(void)
 
 	load = weighted_cpuload(cpu_of(this_rq));
 	raw_spin_lock(&this_rq->lock);
+	update_rq_clock(this_rq);
 	cpu_load_update_nohz(this_rq, curr_jiffies, load);
 	raw_spin_unlock(&this_rq->lock);
 }
-- 
cgit v1.2.3


From 0a9b23ce46cd5d3a360fbefca8ffce441c55046e Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Fri, 29 Apr 2016 20:32:38 +0100
Subject: sched/fair: Remove stale power aware scheduling comments

Commit 8e7fbcbc22c1 ("sched: Remove stale power aware scheduling remnants
and dysfunctional knobs") deleted the power aware scheduling support.

This patch gets rid of the remaining power aware scheduling related
comments in the code as well.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1461958364-675-2-git-send-email-dietmar.eggemann@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a00c7c2dad0..537d71ec18ab 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7027,9 +7027,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * We're trying to get all the cpus to the average_load, so we don't
 	 * want to push ourselves above the average load, nor do we wish to
 	 * reduce the max loaded cpu below the average load. At the same time,
-	 * we also don't want to reduce the group load below the group capacity
-	 * (so that we can implement power-savings policies etc). Thus we look
-	 * for the minimum possible imbalance.
+	 * we also don't want to reduce the group load below the group
+	 * capacity. Thus we look for the minimum possible imbalance.
 	 */
 	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
 
@@ -7053,10 +7052,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 
 /**
  * find_busiest_group - Returns the busiest group within the sched_domain
- * if there is an imbalance. If there isn't an imbalance, and
- * the user has opted for power-savings, it returns a group whose
- * CPUs can be put to idle by rebalancing those tasks elsewhere, if
- * such a group exists.
+ * if there is an imbalance.
  *
  * Also calculates the amount of weighted load which should be moved
  * to restore balance.
@@ -7064,9 +7060,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
  * @env: The load balancing environment.
  *
  * Return:	- The busiest group if imbalance exists.
- *		- If no imbalance and user has opted for power-savings balance,
- *		   return the least loaded group whose CPUs can be
- *		   put to idle by rebalancing its tasks onto our group.
  */
 static struct sched_group *find_busiest_group(struct lb_env *env)
 {
-- 
cgit v1.2.3


From 885e542ce827f5f102fe9628d63c6430c8b7ab2c Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Fri, 29 Apr 2016 20:32:39 +0100
Subject: sched/fair: Fix comment in calculate_imbalance()

The comment in calculate_imbalance() was introduced in commit:

 2dd73a4f09be ("[PATCH] sched: implement smpnice")

which described the logic as it was then, but a later commit:

  b18855500fc4 ("sched/balancing: Fix 'local->avg_load > sds->avg_load' case in calculate_imbalance()")

.. complicated this logic some more so that the comment does not match anymore.

Update the comment to match the code.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1461958364-675-3-git-send-email-dietmar.eggemann@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 537d71ec18ab..51f7a4b62985 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7000,9 +7000,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	}
 
 	/*
-	 * In the presence of smp nice balancing, certain scenarios can have
-	 * max load less than avg load(as we skip the groups at or below
-	 * its cpu_capacity, while calculating max_load..)
+	 * Avg load of busiest sg can be less and avg load of local sg can
+	 * be greater than avg load across all sgs of sd because avg load
+	 * factors in sg capacity and sgs with smaller group_type are
+	 * skipped when updating the busiest sg:
 	 */
 	if (busiest->avg_load <= sds->avg_load ||
 	    local->avg_load >= sds->avg_load) {
-- 
cgit v1.2.3


From dc209a3fd73ec96d4491bcc128c3b50b0a8e8017 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Sun, 17 Apr 2016 23:31:42 -0700
Subject: locking/pvqspinlock: Avoid double resetting of stats

... remove the redundant second iteration, this is most
likely a copy/past buglet.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Cc: waiman.long@hpe.com
Link: http://lkml.kernel.org/r/1460961103-24953-2-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock_stat.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index d734b7502001..72722334237a 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -191,8 +191,6 @@ static ssize_t qstat_write(struct file *file, const char __user *user_buf,
 
 		for (i = 0 ; i < qstat_num; i++)
 			WRITE_ONCE(ptr[i], 0);
-		for (i = 0 ; i < qstat_num; i++)
-			WRITE_ONCE(ptr[i], 0);
 	}
 	return count;
 }
-- 
cgit v1.2.3


From b96bbdde19cc56f288372d25fd5ea7af04fc1271 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 19 Apr 2016 21:17:25 -0700
Subject: locking/pvqspinlock: Robustify init_qspinlock_stat()

Specifically around the debugfs file creation calls,
I have no idea if they could ever possibly fail, but
this is core code (debug aside) so lets at least
check the return value and inform anything fishy.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Waiman Long <Waiman.Long@hpe.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160420041725.GC3472@linux-uzut.site
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock_stat.h | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 72722334237a..22e025309845 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -212,10 +212,8 @@ static int __init init_qspinlock_stat(void)
 	struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
 	int i;
 
-	if (!d_qstat) {
-		pr_warn("Could not create 'qlockstat' debugfs directory\n");
-		return 0;
-	}
+	if (!d_qstat)
+		goto out;
 
 	/*
 	 * Create the debugfs files
@@ -225,12 +223,20 @@ static int __init init_qspinlock_stat(void)
 	 * performance.
 	 */
 	for (i = 0; i < qstat_num; i++)
-		debugfs_create_file(qstat_names[i], 0400, d_qstat,
-				   (void *)(long)i, &fops_qstat);
+		if (!debugfs_create_file(qstat_names[i], 0400, d_qstat,
+					 (void *)(long)i, &fops_qstat))
+			goto fail_undo;
+
+	if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
+				 (void *)(long)qstat_reset_cnts, &fops_qstat))
+		goto fail_undo;
 
-	debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
-			   (void *)(long)qstat_reset_cnts, &fops_qstat);
 	return 0;
+fail_undo:
+	debugfs_remove_recursive(d_qstat);
+out:
+	pr_warn("Could not create 'qlockstat' debugfs entries\n");
+	return -ENOMEM;
 }
 fs_initcall(init_qspinlock_stat);
 
-- 
cgit v1.2.3


From a1cc5bcfcfca0b99f009b117785142dbdc3b87a3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 21 Apr 2016 20:35:25 +0200
Subject: locking/atomics: Flip atomic_fetch_or() arguments

All the atomic operations have their arguments the wrong way around;
make atomic_fetch_or() consistent and flip them.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/atomic.h   | 4 ++--
 kernel/time/tick-sched.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 506c3531832e..e451534fe54d 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -560,11 +560,11 @@ static inline int atomic_dec_if_positive(atomic_t *v)
 
 /**
  * atomic_fetch_or - perform *p |= mask and return old value of *p
- * @p: pointer to atomic_t
  * @mask: mask to OR on the atomic_t
+ * @p: pointer to atomic_t
  */
 #ifndef atomic_fetch_or
-static inline int atomic_fetch_or(atomic_t *p, int mask)
+static inline int atomic_fetch_or(int mask, atomic_t *p)
 {
 	int old, val = atomic_read(p);
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 58e3310c9b21..3daa49ff0719 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -262,7 +262,7 @@ static void tick_nohz_dep_set_all(atomic_t *dep,
 {
 	int prev;
 
-	prev = atomic_fetch_or(dep, BIT(bit));
+	prev = atomic_fetch_or(BIT(bit), dep);
 	if (!prev)
 		tick_nohz_full_kick_all();
 }
@@ -292,7 +292,7 @@ void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
 
 	ts = per_cpu_ptr(&tick_cpu_sched, cpu);
 
-	prev = atomic_fetch_or(&ts->tick_dep_mask, BIT(bit));
+	prev = atomic_fetch_or(BIT(bit), &ts->tick_dep_mask);
 	if (!prev) {
 		preempt_disable();
 		/* Perf needs local kick that is NMI safe */
-- 
cgit v1.2.3


From c796bbbe8dccd9c91ebbb99ffef33e0f73ced7bf Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 27 Apr 2016 18:44:42 +0300
Subject: perf/core: Move set_filter() out of CONFIG_EVENT_TRACING

For instruction trace filtering, namely, for communicating filter
definitions from userspace, I'd like to re-use the SET_FILTER code
that the tracepoints are using currently.

To that end, move the relevant code out from behind the
CONFIG_EVENT_TRACING dependency.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/1461771888-10409-2-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 45 ++++++++++++++++++++++-----------------------
 1 file changed, 22 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9de459a4dac7..13956ee883fd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7235,24 +7235,6 @@ static inline void perf_tp_register(void)
 	perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
 }
 
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
-	char *filter_str;
-	int ret;
-
-	if (event->attr.type != PERF_TYPE_TRACEPOINT)
-		return -EINVAL;
-
-	filter_str = strndup_user(arg, PAGE_SIZE);
-	if (IS_ERR(filter_str))
-		return PTR_ERR(filter_str);
-
-	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
-
-	kfree(filter_str);
-	return ret;
-}
-
 static void perf_event_free_filter(struct perf_event *event)
 {
 	ftrace_profile_free_filter(event);
@@ -7307,11 +7289,6 @@ static inline void perf_tp_register(void)
 {
 }
 
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
-	return -ENOENT;
-}
-
 static void perf_event_free_filter(struct perf_event *event)
 {
 }
@@ -7339,6 +7316,28 @@ void perf_bp_event(struct perf_event *bp, void *data)
 }
 #endif
 
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+	char *filter_str;
+	int ret = -EINVAL;
+
+	if (event->attr.type != PERF_TYPE_TRACEPOINT ||
+	    !IS_ENABLED(CONFIG_EVENT_TRACING))
+		return -EINVAL;
+
+	filter_str = strndup_user(arg, PAGE_SIZE);
+	if (IS_ERR(filter_str))
+		return PTR_ERR(filter_str);
+
+	if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
+	    event->attr.type == PERF_TYPE_TRACEPOINT)
+		ret = ftrace_profile_set_filter(event, event->attr.config,
+						filter_str);
+
+	kfree(filter_str);
+	return ret;
+}
+
 /*
  * hrtimer based swevent callback
  */
-- 
cgit v1.2.3


From b73e4fefc18adbe4d12ffb746fb16306674c1ac6 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 27 Apr 2016 18:44:45 +0300
Subject: perf/core: Extend perf_event_aux_ctx() to optionally iterate through
 more events

Trace filtering code needs an iterator that can go through all events in
a context, including inactive and filtered, to be able to update their
filters' address ranges based on mmap or exec events.

This patch changes perf_event_aux_ctx() to optionally do this.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/1461771888-10409-5-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 13956ee883fd..2bb7c47f18e3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5781,15 +5781,18 @@ typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
 static void
 perf_event_aux_ctx(struct perf_event_context *ctx,
 		   perf_event_aux_output_cb output,
-		   void *data)
+		   void *data, bool all)
 {
 	struct perf_event *event;
 
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
-		if (event->state < PERF_EVENT_STATE_INACTIVE)
-			continue;
-		if (!event_filter_match(event))
-			continue;
+		if (!all) {
+			if (event->state < PERF_EVENT_STATE_INACTIVE)
+				continue;
+			if (!event_filter_match(event))
+				continue;
+		}
+
 		output(event, data);
 	}
 }
@@ -5800,7 +5803,7 @@ perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
 {
 	rcu_read_lock();
 	preempt_disable();
-	perf_event_aux_ctx(task_ctx, output, data);
+	perf_event_aux_ctx(task_ctx, output, data, false);
 	preempt_enable();
 	rcu_read_unlock();
 }
@@ -5830,13 +5833,13 @@ perf_event_aux(perf_event_aux_output_cb output, void *data,
 		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
 		if (cpuctx->unique_pmu != pmu)
 			goto next;
-		perf_event_aux_ctx(&cpuctx->ctx, output, data);
+		perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
 		ctxn = pmu->task_ctx_nr;
 		if (ctxn < 0)
 			goto next;
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
 		if (ctx)
-			perf_event_aux_ctx(ctx, output, data);
+			perf_event_aux_ctx(ctx, output, data, false);
 next:
 		put_cpu_ptr(pmu->pmu_cpu_context);
 	}
@@ -5878,10 +5881,10 @@ static int __perf_pmu_output_stop(void *info)
 	};
 
 	rcu_read_lock();
-	perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro);
+	perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
 	if (cpuctx->task_ctx)
 		perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
-				   &ro);
+				   &ro, false);
 	rcu_read_unlock();
 
 	return ro.err;
-- 
cgit v1.2.3


From 375637bc524952f1122ea22caf5a8f1fecad8228 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 27 Apr 2016 18:44:46 +0300
Subject: perf/core: Introduce address range filtering

Many instruction tracing PMUs out there support address range-based
filtering, which would, for example, generate trace data only for a
given range of instruction addresses, which is useful for tracing
individual functions, modules or libraries. Other PMUs may also
utilize this functionality to allow filtering to or filtering out
code at certain address ranges.

This patch introduces the interface for userspace to specify these
filters and for the PMU drivers to apply these filters to hardware
configuration.

The user interface is an ASCII string that is passed via an ioctl()
and specifies (in the form of an ASCII string) address ranges within
certain object files or within kernel. There is no special treatment
for kernel modules yet, but it might be a worthy pursuit.

The PMU driver interface basically adds two extra callbacks to the
PMU driver structure, one of which validates the filter configuration
proposed by the user against what the hardware is actually capable of
doing and the other one translates hardware-independent filter
configuration into something that can be programmed into the
hardware.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/1461771888-10409-6-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  98 +++++++
 kernel/events/core.c       | 623 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 705 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a090700cccca..c77e4a159fa2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -151,6 +151,15 @@ struct hw_perf_event {
 	 */
 	struct task_struct		*target;
 
+	/*
+	 * PMU would store hardware filter configuration
+	 * here.
+	 */
+	void				*addr_filters;
+
+	/* Last sync'ed generation of filters */
+	unsigned long			addr_filters_gen;
+
 /*
  * hw_perf_event::state flags; used to track the PERF_EF_* state.
  */
@@ -240,6 +249,9 @@ struct pmu {
 	int				task_ctx_nr;
 	int				hrtimer_interval_ms;
 
+	/* number of address filters this PMU can do */
+	unsigned int			nr_addr_filters;
+
 	/*
 	 * Fully disable/enable this PMU, can be used to protect from the PMI
 	 * as well as for lazy/batch writing of the MSRs.
@@ -392,12 +404,71 @@ struct pmu {
 	 */
 	void (*free_aux)		(void *aux); /* optional */
 
+	/*
+	 * Validate address range filters: make sure the HW supports the
+	 * requested configuration and number of filters; return 0 if the
+	 * supplied filters are valid, -errno otherwise.
+	 *
+	 * Runs in the context of the ioctl()ing process and is not serialized
+	 * with the rest of the PMU callbacks.
+	 */
+	int (*addr_filters_validate)	(struct list_head *filters);
+					/* optional */
+
+	/*
+	 * Synchronize address range filter configuration:
+	 * translate hw-agnostic filters into hardware configuration in
+	 * event::hw::addr_filters.
+	 *
+	 * Runs as a part of filter sync sequence that is done in ->start()
+	 * callback by calling perf_event_addr_filters_sync().
+	 *
+	 * May (and should) traverse event::addr_filters::list, for which its
+	 * caller provides necessary serialization.
+	 */
+	void (*addr_filters_sync)	(struct perf_event *event);
+					/* optional */
+
 	/*
 	 * Filter events for PMU-specific reasons.
 	 */
 	int (*filter_match)		(struct perf_event *event); /* optional */
 };
 
+/**
+ * struct perf_addr_filter - address range filter definition
+ * @entry:	event's filter list linkage
+ * @inode:	object file's inode for file-based filters
+ * @offset:	filter range offset
+ * @size:	filter range size
+ * @range:	1: range, 0: address
+ * @filter:	1: filter/start, 0: stop
+ *
+ * This is a hardware-agnostic filter configuration as specified by the user.
+ */
+struct perf_addr_filter {
+	struct list_head	entry;
+	struct inode		*inode;
+	unsigned long		offset;
+	unsigned long		size;
+	unsigned int		range	: 1,
+				filter	: 1;
+};
+
+/**
+ * struct perf_addr_filters_head - container for address range filters
+ * @list:	list of filters for this event
+ * @lock:	spinlock that serializes accesses to the @list and event's
+ *		(and its children's) filter generations.
+ *
+ * A child event will use parent's @list (and therefore @lock), so they are
+ * bundled together; see perf_event_addr_filters().
+ */
+struct perf_addr_filters_head {
+	struct list_head	list;
+	raw_spinlock_t		lock;
+};
+
 /**
  * enum perf_event_active_state - the states of a event
  */
@@ -566,6 +637,12 @@ struct perf_event {
 
 	atomic_t			event_limit;
 
+	/* address range filters */
+	struct perf_addr_filters_head	addr_filters;
+	/* vma address array for file-based filders */
+	unsigned long			*addr_filters_offs;
+	unsigned long			addr_filters_gen;
+
 	void (*destroy)(struct perf_event *);
 	struct rcu_head			rcu_head;
 
@@ -1070,6 +1147,27 @@ static inline bool is_write_backward(struct perf_event *event)
 	return !!event->attr.write_backward;
 }
 
+static inline bool has_addr_filter(struct perf_event *event)
+{
+	return event->pmu->nr_addr_filters;
+}
+
+/*
+ * An inherited event uses parent's filters
+ */
+static inline struct perf_addr_filters_head *
+perf_event_addr_filters(struct perf_event *event)
+{
+	struct perf_addr_filters_head *ifh = &event->addr_filters;
+
+	if (event->parent)
+		ifh = &event->parent->addr_filters;
+
+	return ifh;
+}
+
+extern void perf_event_addr_filters_sync(struct perf_event *event);
+
 extern int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_event *event, unsigned int size);
 extern int perf_output_begin_forward(struct perf_output_handle *handle,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2bb7c47f18e3..ffdc096a4f4e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -44,6 +44,8 @@
 #include <linux/compat.h>
 #include <linux/bpf.h>
 #include <linux/filter.h>
+#include <linux/namei.h>
+#include <linux/parser.h>
 
 #include "internal.h"
 
@@ -2365,11 +2367,17 @@ void perf_event_enable(struct perf_event *event)
 }
 EXPORT_SYMBOL_GPL(perf_event_enable);
 
+struct stop_event_data {
+	struct perf_event	*event;
+	unsigned int		restart;
+};
+
 static int __perf_event_stop(void *info)
 {
-	struct perf_event *event = info;
+	struct stop_event_data *sd = info;
+	struct perf_event *event = sd->event;
 
-	/* for AUX events, our job is done if the event is already inactive */
+	/* if it's already INACTIVE, do nothing */
 	if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
 		return 0;
 
@@ -2385,9 +2393,86 @@ static int __perf_event_stop(void *info)
 
 	event->pmu->stop(event, PERF_EF_UPDATE);
 
+	/*
+	 * May race with the actual stop (through perf_pmu_output_stop()),
+	 * but it is only used for events with AUX ring buffer, and such
+	 * events will refuse to restart because of rb::aux_mmap_count==0,
+	 * see comments in perf_aux_output_begin().
+	 *
+	 * Since this is happening on a event-local CPU, no trace is lost
+	 * while restarting.
+	 */
+	if (sd->restart)
+		event->pmu->start(event, PERF_EF_START);
+
 	return 0;
 }
 
+static int perf_event_restart(struct perf_event *event)
+{
+	struct stop_event_data sd = {
+		.event		= event,
+		.restart	= 1,
+	};
+	int ret = 0;
+
+	do {
+		if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+			return 0;
+
+		/* matches smp_wmb() in event_sched_in() */
+		smp_rmb();
+
+		/*
+		 * We only want to restart ACTIVE events, so if the event goes
+		 * inactive here (event->oncpu==-1), there's nothing more to do;
+		 * fall through with ret==-ENXIO.
+		 */
+		ret = cpu_function_call(READ_ONCE(event->oncpu),
+					__perf_event_stop, &sd);
+	} while (ret == -EAGAIN);
+
+	return ret;
+}
+
+/*
+ * In order to contain the amount of racy and tricky in the address filter
+ * configuration management, it is a two part process:
+ *
+ * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
+ *      we update the addresses of corresponding vmas in
+ *	event::addr_filters_offs array and bump the event::addr_filters_gen;
+ * (p2) when an event is scheduled in (pmu::add), it calls
+ *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
+ *      if the generation has changed since the previous call.
+ *
+ * If (p1) happens while the event is active, we restart it to force (p2).
+ *
+ * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
+ *     pre-existing mappings, called once when new filters arrive via SET_FILTER
+ *     ioctl;
+ * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
+ *     registered mapping, called for every new mmap(), with mm::mmap_sem down
+ *     for reading;
+ * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
+ *     of exec.
+ */
+void perf_event_addr_filters_sync(struct perf_event *event)
+{
+	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+
+	if (!has_addr_filter(event))
+		return;
+
+	raw_spin_lock(&ifh->lock);
+	if (event->addr_filters_gen != event->hw.addr_filters_gen) {
+		event->pmu->addr_filters_sync(event);
+		event->hw.addr_filters_gen = event->addr_filters_gen;
+	}
+	raw_spin_unlock(&ifh->lock);
+}
+EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
+
 static int _perf_event_refresh(struct perf_event *event, int refresh)
 {
 	/*
@@ -3237,16 +3322,6 @@ out:
 		put_ctx(clone_ctx);
 }
 
-void perf_event_exec(void)
-{
-	int ctxn;
-
-	rcu_read_lock();
-	for_each_task_context_nr(ctxn)
-		perf_event_enable_on_exec(ctxn);
-	rcu_read_unlock();
-}
-
 struct perf_read_data {
 	struct perf_event *event;
 	bool group;
@@ -3748,6 +3823,9 @@ static bool exclusive_event_installable(struct perf_event *event,
 	return true;
 }
 
+static void perf_addr_filters_splice(struct perf_event *event,
+				       struct list_head *head);
+
 static void _free_event(struct perf_event *event)
 {
 	irq_work_sync(&event->pending);
@@ -3775,6 +3853,8 @@ static void _free_event(struct perf_event *event)
 	}
 
 	perf_event_free_bpf_prog(event);
+	perf_addr_filters_splice(event, NULL);
+	kfree(event->addr_filters_offs);
 
 	if (event->destroy)
 		event->destroy(event);
@@ -5846,6 +5926,57 @@ next:
 	rcu_read_unlock();
 }
 
+/*
+ * Clear all file-based filters at exec, they'll have to be
+ * re-instated when/if these objects are mmapped again.
+ */
+static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
+{
+	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+	struct perf_addr_filter *filter;
+	unsigned int restart = 0, count = 0;
+	unsigned long flags;
+
+	if (!has_addr_filter(event))
+		return;
+
+	raw_spin_lock_irqsave(&ifh->lock, flags);
+	list_for_each_entry(filter, &ifh->list, entry) {
+		if (filter->inode) {
+			event->addr_filters_offs[count] = 0;
+			restart++;
+		}
+
+		count++;
+	}
+
+	if (restart)
+		event->addr_filters_gen++;
+	raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+	if (restart)
+		perf_event_restart(event);
+}
+
+void perf_event_exec(void)
+{
+	struct perf_event_context *ctx;
+	int ctxn;
+
+	rcu_read_lock();
+	for_each_task_context_nr(ctxn) {
+		ctx = current->perf_event_ctxp[ctxn];
+		if (!ctx)
+			continue;
+
+		perf_event_enable_on_exec(ctxn);
+
+		perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL,
+				   true);
+	}
+	rcu_read_unlock();
+}
+
 struct remote_output {
 	struct ring_buffer	*rb;
 	int			err;
@@ -5856,6 +5987,9 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
 	struct perf_event *parent = event->parent;
 	struct remote_output *ro = data;
 	struct ring_buffer *rb = ro->rb;
+	struct stop_event_data sd = {
+		.event	= event,
+	};
 
 	if (!has_aux(event))
 		return;
@@ -5868,7 +6002,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
 	 * ring-buffer, but it will be the child that's actually using it:
 	 */
 	if (rcu_dereference(parent->rb) == rb)
-		ro->err = __perf_event_stop(event);
+		ro->err = __perf_event_stop(&sd);
 }
 
 static int __perf_pmu_output_stop(void *info)
@@ -6329,6 +6463,87 @@ got_name:
 	kfree(buf);
 }
 
+/*
+ * Whether this @filter depends on a dynamic object which is not loaded
+ * yet or its load addresses are not known.
+ */
+static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
+{
+	return filter->filter && filter->inode;
+}
+
+/*
+ * Check whether inode and address range match filter criteria.
+ */
+static bool perf_addr_filter_match(struct perf_addr_filter *filter,
+				     struct file *file, unsigned long offset,
+				     unsigned long size)
+{
+	if (filter->inode != file->f_inode)
+		return false;
+
+	if (filter->offset > offset + size)
+		return false;
+
+	if (filter->offset + filter->size < offset)
+		return false;
+
+	return true;
+}
+
+static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
+{
+	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+	struct vm_area_struct *vma = data;
+	unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
+	struct file *file = vma->vm_file;
+	struct perf_addr_filter *filter;
+	unsigned int restart = 0, count = 0;
+
+	if (!has_addr_filter(event))
+		return;
+
+	if (!file)
+		return;
+
+	raw_spin_lock_irqsave(&ifh->lock, flags);
+	list_for_each_entry(filter, &ifh->list, entry) {
+		if (perf_addr_filter_match(filter, file, off,
+					     vma->vm_end - vma->vm_start)) {
+			event->addr_filters_offs[count] = vma->vm_start;
+			restart++;
+		}
+
+		count++;
+	}
+
+	if (restart)
+		event->addr_filters_gen++;
+	raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+	if (restart)
+		perf_event_restart(event);
+}
+
+/*
+ * Adjust all task's events' filters to the new vma
+ */
+static void perf_addr_filters_adjust(struct vm_area_struct *vma)
+{
+	struct perf_event_context *ctx;
+	int ctxn;
+
+	rcu_read_lock();
+	for_each_task_context_nr(ctxn) {
+		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		if (!ctx)
+			continue;
+
+		perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true);
+	}
+	rcu_read_unlock();
+}
+
 void perf_event_mmap(struct vm_area_struct *vma)
 {
 	struct perf_mmap_event mmap_event;
@@ -6360,6 +6575,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
 		/* .flags (attr_mmap2 only) */
 	};
 
+	perf_addr_filters_adjust(vma);
 	perf_event_mmap_event(&mmap_event);
 }
 
@@ -7319,13 +7535,370 @@ void perf_bp_event(struct perf_event *bp, void *data)
 }
 #endif
 
+/*
+ * Allocate a new address filter
+ */
+static struct perf_addr_filter *
+perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
+{
+	int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
+	struct perf_addr_filter *filter;
+
+	filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
+	if (!filter)
+		return NULL;
+
+	INIT_LIST_HEAD(&filter->entry);
+	list_add_tail(&filter->entry, filters);
+
+	return filter;
+}
+
+static void free_filters_list(struct list_head *filters)
+{
+	struct perf_addr_filter *filter, *iter;
+
+	list_for_each_entry_safe(filter, iter, filters, entry) {
+		if (filter->inode)
+			iput(filter->inode);
+		list_del(&filter->entry);
+		kfree(filter);
+	}
+}
+
+/*
+ * Free existing address filters and optionally install new ones
+ */
+static void perf_addr_filters_splice(struct perf_event *event,
+				     struct list_head *head)
+{
+	unsigned long flags;
+	LIST_HEAD(list);
+
+	if (!has_addr_filter(event))
+		return;
+
+	/* don't bother with children, they don't have their own filters */
+	if (event->parent)
+		return;
+
+	raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
+
+	list_splice_init(&event->addr_filters.list, &list);
+	if (head)
+		list_splice(head, &event->addr_filters.list);
+
+	raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
+
+	free_filters_list(&list);
+}
+
+/*
+ * Scan through mm's vmas and see if one of them matches the
+ * @filter; if so, adjust filter's address range.
+ * Called with mm::mmap_sem down for reading.
+ */
+static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
+					    struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		struct file *file = vma->vm_file;
+		unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+		unsigned long vma_size = vma->vm_end - vma->vm_start;
+
+		if (!file)
+			continue;
+
+		if (!perf_addr_filter_match(filter, file, off, vma_size))
+			continue;
+
+		return vma->vm_start;
+	}
+
+	return 0;
+}
+
+/*
+ * Update event's address range filters based on the
+ * task's existing mappings, if any.
+ */
+static void perf_event_addr_filters_apply(struct perf_event *event)
+{
+	struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+	struct task_struct *task = READ_ONCE(event->ctx->task);
+	struct perf_addr_filter *filter;
+	struct mm_struct *mm = NULL;
+	unsigned int count = 0;
+	unsigned long flags;
+
+	/*
+	 * We may observe TASK_TOMBSTONE, which means that the event tear-down
+	 * will stop on the parent's child_mutex that our caller is also holding
+	 */
+	if (task == TASK_TOMBSTONE)
+		return;
+
+	mm = get_task_mm(event->ctx->task);
+	if (!mm)
+		goto restart;
+
+	down_read(&mm->mmap_sem);
+
+	raw_spin_lock_irqsave(&ifh->lock, flags);
+	list_for_each_entry(filter, &ifh->list, entry) {
+		event->addr_filters_offs[count] = 0;
+
+		if (perf_addr_filter_needs_mmap(filter))
+			event->addr_filters_offs[count] =
+				perf_addr_filter_apply(filter, mm);
+
+		count++;
+	}
+
+	event->addr_filters_gen++;
+	raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+	up_read(&mm->mmap_sem);
+
+	mmput(mm);
+
+restart:
+	perf_event_restart(event);
+}
+
+/*
+ * Address range filtering: limiting the data to certain
+ * instruction address ranges. Filters are ioctl()ed to us from
+ * userspace as ascii strings.
+ *
+ * Filter string format:
+ *
+ * ACTION RANGE_SPEC
+ * where ACTION is one of the
+ *  * "filter": limit the trace to this region
+ *  * "start": start tracing from this address
+ *  * "stop": stop tracing at this address/region;
+ * RANGE_SPEC is
+ *  * for kernel addresses: <start address>[/<size>]
+ *  * for object files:     <start address>[/<size>]@</path/to/object/file>
+ *
+ * if <size> is not specified, the range is treated as a single address.
+ */
+enum {
+	IF_ACT_FILTER,
+	IF_ACT_START,
+	IF_ACT_STOP,
+	IF_SRC_FILE,
+	IF_SRC_KERNEL,
+	IF_SRC_FILEADDR,
+	IF_SRC_KERNELADDR,
+};
+
+enum {
+	IF_STATE_ACTION = 0,
+	IF_STATE_SOURCE,
+	IF_STATE_END,
+};
+
+static const match_table_t if_tokens = {
+	{ IF_ACT_FILTER,	"filter" },
+	{ IF_ACT_START,		"start" },
+	{ IF_ACT_STOP,		"stop" },
+	{ IF_SRC_FILE,		"%u/%u@%s" },
+	{ IF_SRC_KERNEL,	"%u/%u" },
+	{ IF_SRC_FILEADDR,	"%u@%s" },
+	{ IF_SRC_KERNELADDR,	"%u" },
+};
+
+/*
+ * Address filter string parser
+ */
+static int
+perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
+			     struct list_head *filters)
+{
+	struct perf_addr_filter *filter = NULL;
+	char *start, *orig, *filename = NULL;
+	struct path path;
+	substring_t args[MAX_OPT_ARGS];
+	int state = IF_STATE_ACTION, token;
+	unsigned int kernel = 0;
+	int ret = -EINVAL;
+
+	orig = fstr = kstrdup(fstr, GFP_KERNEL);
+	if (!fstr)
+		return -ENOMEM;
+
+	while ((start = strsep(&fstr, " ,\n")) != NULL) {
+		ret = -EINVAL;
+
+		if (!*start)
+			continue;
+
+		/* filter definition begins */
+		if (state == IF_STATE_ACTION) {
+			filter = perf_addr_filter_new(event, filters);
+			if (!filter)
+				goto fail;
+		}
+
+		token = match_token(start, if_tokens, args);
+		switch (token) {
+		case IF_ACT_FILTER:
+		case IF_ACT_START:
+			filter->filter = 1;
+
+		case IF_ACT_STOP:
+			if (state != IF_STATE_ACTION)
+				goto fail;
+
+			state = IF_STATE_SOURCE;
+			break;
+
+		case IF_SRC_KERNELADDR:
+		case IF_SRC_KERNEL:
+			kernel = 1;
+
+		case IF_SRC_FILEADDR:
+		case IF_SRC_FILE:
+			if (state != IF_STATE_SOURCE)
+				goto fail;
+
+			if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
+				filter->range = 1;
+
+			*args[0].to = 0;
+			ret = kstrtoul(args[0].from, 0, &filter->offset);
+			if (ret)
+				goto fail;
+
+			if (filter->range) {
+				*args[1].to = 0;
+				ret = kstrtoul(args[1].from, 0, &filter->size);
+				if (ret)
+					goto fail;
+			}
+
+			if (token == IF_SRC_FILE) {
+				filename = match_strdup(&args[2]);
+				if (!filename) {
+					ret = -ENOMEM;
+					goto fail;
+				}
+			}
+
+			state = IF_STATE_END;
+			break;
+
+		default:
+			goto fail;
+		}
+
+		/*
+		 * Filter definition is fully parsed, validate and install it.
+		 * Make sure that it doesn't contradict itself or the event's
+		 * attribute.
+		 */
+		if (state == IF_STATE_END) {
+			if (kernel && event->attr.exclude_kernel)
+				goto fail;
+
+			if (!kernel) {
+				if (!filename)
+					goto fail;
+
+				/* look up the path and grab its inode */
+				ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+				if (ret)
+					goto fail_free_name;
+
+				filter->inode = igrab(d_inode(path.dentry));
+				path_put(&path);
+				kfree(filename);
+				filename = NULL;
+
+				ret = -EINVAL;
+				if (!filter->inode ||
+				    !S_ISREG(filter->inode->i_mode))
+					/* free_filters_list() will iput() */
+					goto fail;
+			}
+
+			/* ready to consume more filters */
+			state = IF_STATE_ACTION;
+			filter = NULL;
+		}
+	}
+
+	if (state != IF_STATE_ACTION)
+		goto fail;
+
+	kfree(orig);
+
+	return 0;
+
+fail_free_name:
+	kfree(filename);
+fail:
+	free_filters_list(filters);
+	kfree(orig);
+
+	return ret;
+}
+
+static int
+perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
+{
+	LIST_HEAD(filters);
+	int ret;
+
+	/*
+	 * Since this is called in perf_ioctl() path, we're already holding
+	 * ctx::mutex.
+	 */
+	lockdep_assert_held(&event->ctx->mutex);
+
+	if (WARN_ON_ONCE(event->parent))
+		return -EINVAL;
+
+	/*
+	 * For now, we only support filtering in per-task events; doing so
+	 * for CPU-wide events requires additional context switching trickery,
+	 * since same object code will be mapped at different virtual
+	 * addresses in different processes.
+	 */
+	if (!event->ctx->task)
+		return -EOPNOTSUPP;
+
+	ret = perf_event_parse_addr_filter(event, filter_str, &filters);
+	if (ret)
+		return ret;
+
+	ret = event->pmu->addr_filters_validate(&filters);
+	if (ret) {
+		free_filters_list(&filters);
+		return ret;
+	}
+
+	/* remove existing filters, if any */
+	perf_addr_filters_splice(event, &filters);
+
+	/* install new filters */
+	perf_event_for_each_child(event, perf_event_addr_filters_apply);
+
+	return ret;
+}
+
 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 {
 	char *filter_str;
 	int ret = -EINVAL;
 
-	if (event->attr.type != PERF_TYPE_TRACEPOINT ||
-	    !IS_ENABLED(CONFIG_EVENT_TRACING))
+	if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
+	    !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
+	    !has_addr_filter(event))
 		return -EINVAL;
 
 	filter_str = strndup_user(arg, PAGE_SIZE);
@@ -7336,6 +7909,8 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 	    event->attr.type == PERF_TYPE_TRACEPOINT)
 		ret = ftrace_profile_set_filter(event, event->attr.config,
 						filter_str);
+	else if (has_addr_filter(event))
+		ret = perf_event_set_addr_filter(event, filter_str);
 
 	kfree(filter_str);
 	return ret;
@@ -8130,6 +8705,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	INIT_LIST_HEAD(&event->sibling_list);
 	INIT_LIST_HEAD(&event->rb_entry);
 	INIT_LIST_HEAD(&event->active_entry);
+	INIT_LIST_HEAD(&event->addr_filters.list);
 	INIT_HLIST_NODE(&event->hlist_entry);
 
 
@@ -8137,6 +8713,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	init_irq_work(&event->pending, perf_pending_event);
 
 	mutex_init(&event->mmap_mutex);
+	raw_spin_lock_init(&event->addr_filters.lock);
 
 	atomic_long_set(&event->refcount, 1);
 	event->cpu		= cpu;
@@ -8221,11 +8798,22 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (err)
 		goto err_pmu;
 
+	if (has_addr_filter(event)) {
+		event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
+						   sizeof(unsigned long),
+						   GFP_KERNEL);
+		if (!event->addr_filters_offs)
+			goto err_per_task;
+
+		/* force hw sync on the address filters */
+		event->addr_filters_gen = 1;
+	}
+
 	if (!event->parent) {
 		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
 			err = get_callchain_buffers();
 			if (err)
-				goto err_per_task;
+				goto err_addr_filters;
 		}
 	}
 
@@ -8234,6 +8822,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	return event;
 
+err_addr_filters:
+	kfree(event->addr_filters_offs);
+
 err_per_task:
 	exclusive_event_destroy(event);
 
-- 
cgit v1.2.3


From 6e855cd4f4b5258016cf707f94f96bfa51c32f32 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Wed, 27 Apr 2016 18:44:48 +0300
Subject: perf/core: Let userspace know if the PMU supports address filters

Export an additional common attribute for PMUs that support address range
filtering to let the perf userspace identify such PMUs in a uniform way.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/1461771888-10409-8-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index ffdc096a4f4e..63be65437e9e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8273,6 +8273,20 @@ static void free_pmu_context(struct pmu *pmu)
 out:
 	mutex_unlock(&pmus_lock);
 }
+
+/*
+ * Let userspace know that this PMU supports address range filtering:
+ */
+static ssize_t nr_addr_filters_show(struct device *dev,
+				    struct device_attribute *attr,
+				    char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+
+	return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
+}
+DEVICE_ATTR_RO(nr_addr_filters);
+
 static struct idr pmu_idr;
 
 static ssize_t
@@ -8374,9 +8388,19 @@ static int pmu_dev_alloc(struct pmu *pmu)
 	if (ret)
 		goto free_dev;
 
+	/* For PMUs with address filters, throw in an extra attribute: */
+	if (pmu->nr_addr_filters)
+		ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
+
+	if (ret)
+		goto del_dev;
+
 out:
 	return ret;
 
+del_dev:
+	device_del(pmu->dev);
+
 free_dev:
 	put_device(pmu->dev);
 	goto out;
@@ -8512,6 +8536,8 @@ void perf_pmu_unregister(struct pmu *pmu)
 	free_percpu(pmu->pmu_disable_count);
 	if (pmu->type >= PERF_TYPE_MAX)
 		idr_remove(&pmu_idr, pmu->type);
+	if (pmu->nr_addr_filters)
+		device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
 	device_del(pmu->dev);
 	put_device(pmu->dev);
 	free_pmu_context(pmu);
-- 
cgit v1.2.3


From 5101ef20f0ef1de79091a1fdb6b1a7f07565545a Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 26 Apr 2016 11:33:46 +0100
Subject: perf/arm: Special-case hetereogeneous CPUs

Commit:

  26657848502b7847 ("perf/core: Verify we have a single perf_hw_context PMU")

forcefully prevents multiple PMUs from sharing perf_hw_context, as this
generally doesn't make sense. It is a common bug for uncore PMUs to
use perf_hw_context rather than perf_invalid_context, which this detects.

However, systems exist with heterogeneous CPUs (and hence heterogeneous
HW PMUs), for which sharing perf_hw_context is necessary, and possible
in some limited cases.

To make this work we have to perform some gymnastics, as we did in these
commits:

  66eb579e66ecfea5 ("perf: allow for PMU-specific event filtering")
  c904e32a69b7c779 ("arm: perf: filter unschedulable events")

To allow those systems to work, we must allow PMUs for heterogeneous
CPUs to share perf_hw_context, though we must still disallow sharing
otherwise to detect the common misuse of perf_hw_context.

This patch adds a new PERF_PMU_CAP_HETEROGENEOUS_CPUS for this, updates
the core logic to account for this, and makes use of it in the arm_pmu
code that is used for systems with heterogeneous CPUs. Comments are
added to make the rationale clear and hopefully avoid accidental abuse.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Will Deacon <will.deacon@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/20160426103346.GA20836@leverpostej
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/perf/arm_pmu.c     | 8 ++++++++
 include/linux/perf_event.h | 1 +
 kernel/events/core.c       | 8 +++++++-
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index f70090897fdf..f2d01d4d9364 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -847,6 +847,14 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu)
 	if (!platform_get_irq(cpu_pmu->plat_device, 0))
 		cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT;
 
+	/*
+	 * This is a CPU PMU potentially in a heterogeneous configuration (e.g.
+	 * big.LITTLE). This is not an uncore PMU, and we have taken ctx
+	 * sharing into account (e.g. with our pmu::filter_match callback and
+	 * pmu::event_init group validation).
+	 */
+	cpu_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS;
+
 	return 0;
 
 out_unregister:
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index c77e4a159fa2..9e1c3ada91c4 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -225,6 +225,7 @@ struct perf_event;
 #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF		0x08
 #define PERF_PMU_CAP_EXCLUSIVE			0x10
 #define PERF_PMU_CAP_ITRACE			0x20
+#define PERF_PMU_CAP_HETEROGENEOUS_CPUS		0x40
 
 /**
  * struct pmu - generic performance monitoring unit
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 63be65437e9e..fc0290f25482 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8443,7 +8443,13 @@ skip_type:
 	if (pmu->task_ctx_nr == perf_hw_context) {
 		static int hw_context_taken = 0;
 
-		if (WARN_ON_ONCE(hw_context_taken))
+		/*
+		 * Other than systems with heterogeneous CPUs, it never makes
+		 * sense for two PMUs to share perf_hw_context. PMUs which are
+		 * uncore must use perf_invalid_context.
+		 */
+		if (WARN_ON_ONCE(hw_context_taken &&
+		    !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
 			pmu->task_ctx_nr = perf_invalid_context;
 
 		hw_context_taken = 1;
-- 
cgit v1.2.3


From e9d867a67fd03ccc07248ca4e9c2f74fed494d5b Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Thu, 10 Mar 2016 12:54:08 +0100
Subject: sched: Allow per-cpu kernel threads to run on online && !active
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order to enable symmetric hotplug, we must mirror the online &&
!active state of cpu-down on the cpu-up side.

However, to retain sanity, limit this state to per-cpu kthreads.

Aside from the change to set_cpus_allowed_ptr(), which allow moving
the per-cpu kthreads on, the other critical piece is the cpu selection
for pinned tasks in select_task_rq(). This avoids dropping into
select_fallback_rq().

select_fallback_rq() cannot be allowed to select !active cpus because
its used to migrate user tasks away. And we do not want to move user
tasks onto cpus that are in transition.

Requested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Jan H. Schönherr <jschoenh@amazon.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160301152303.GV6356@twins.programming.kicks-ass.net
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/powerpc/kernel/smp.c |  2 +-
 arch/s390/kernel/smp.c    |  2 +-
 include/linux/cpumask.h   |  6 ++----
 kernel/sched/core.c       | 49 ++++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 46 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8cac1eb41466..55c924b65f71 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -565,7 +565,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 		smp_ops->give_timebase();
 
 	/* Wait until cpu puts itself in the online & active maps */
-	while (!cpu_online(cpu) || !cpu_active(cpu))
+	while (!cpu_online(cpu))
 		cpu_relax();
 
 	return 0;
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 40a6b4f9c36c..7b89a7572100 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -832,7 +832,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 	pcpu_attach_task(pcpu, tidle);
 	pcpu_start_fn(pcpu, smp_start_secondary, NULL);
 	/* Wait until cpu puts itself in the online & active maps */
-	while (!cpu_online(cpu) || !cpu_active(cpu))
+	while (!cpu_online(cpu))
 		cpu_relax();
 	return 0;
 }
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 40cee6b77a93..e828cf65d7df 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -743,12 +743,10 @@ set_cpu_present(unsigned int cpu, bool present)
 static inline void
 set_cpu_online(unsigned int cpu, bool online)
 {
-	if (online) {
+	if (online)
 		cpumask_set_cpu(cpu, &__cpu_online_mask);
-		cpumask_set_cpu(cpu, &__cpu_active_mask);
-	} else {
+	else
 		cpumask_clear_cpu(cpu, &__cpu_online_mask);
-	}
 }
 
 static inline void
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b489fcac37b..8bfd7d4f1c21 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1082,13 +1082,21 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 static int __set_cpus_allowed_ptr(struct task_struct *p,
 				  const struct cpumask *new_mask, bool check)
 {
+	const struct cpumask *cpu_valid_mask = cpu_active_mask;
+	unsigned int dest_cpu;
 	unsigned long flags;
 	struct rq *rq;
-	unsigned int dest_cpu;
 	int ret = 0;
 
 	rq = task_rq_lock(p, &flags);
 
+	if (p->flags & PF_KTHREAD) {
+		/*
+		 * Kernel threads are allowed on online && !active CPUs
+		 */
+		cpu_valid_mask = cpu_online_mask;
+	}
+
 	/*
 	 * Must re-check here, to close a race against __kthread_bind(),
 	 * sched_setaffinity() is not guaranteed to observe the flag.
@@ -1101,18 +1109,28 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 	if (cpumask_equal(&p->cpus_allowed, new_mask))
 		goto out;
 
-	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+	if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 	do_set_cpus_allowed(p, new_mask);
 
+	if (p->flags & PF_KTHREAD) {
+		/*
+		 * For kernel threads that do indeed end up on online &&
+		 * !active we want to ensure they are strict per-cpu threads.
+		 */
+		WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
+			!cpumask_intersects(new_mask, cpu_active_mask) &&
+			p->nr_cpus_allowed != 1);
+	}
+
 	/* Can the task run on the task's current CPU? If so, we're done */
 	if (cpumask_test_cpu(task_cpu(p), new_mask))
 		goto out;
 
-	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
@@ -1431,6 +1449,25 @@ EXPORT_SYMBOL_GPL(kick_process);
 
 /*
  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ *
+ * A few notes on cpu_active vs cpu_online:
+ *
+ *  - cpu_active must be a subset of cpu_online
+ *
+ *  - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
+ *    see __set_cpus_allowed_ptr(). At this point the newly online
+ *    cpu isn't yet part of the sched domains, and balancing will not
+ *    see it.
+ *
+ *  - on cpu-down we clear cpu_active() to mask the sched domains and
+ *    avoid the load balancer to place new tasks on the to be removed
+ *    cpu. Existing tasks will remain running there and will be taken
+ *    off.
+ *
+ * This means that fallback selection must not select !active CPUs.
+ * And can assume that any active CPU must be online. Conversely
+ * select_task_rq() below may allow selection of !active CPUs in order
+ * to satisfy the above rules.
  */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
@@ -1449,8 +1486,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 
 		/* Look for allowed, online CPU in same node. */
 		for_each_cpu(dest_cpu, nodemask) {
-			if (!cpu_online(dest_cpu))
-				continue;
 			if (!cpu_active(dest_cpu))
 				continue;
 			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
@@ -1461,8 +1496,6 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 	for (;;) {
 		/* Any allowed, online CPU? */
 		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
-			if (!cpu_online(dest_cpu))
-				continue;
 			if (!cpu_active(dest_cpu))
 				continue;
 			goto out;
@@ -1514,6 +1547,8 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 
 	if (p->nr_cpus_allowed > 1)
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+	else
+		cpu = cpumask_any(tsk_cpus_allowed(p));
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
-- 
cgit v1.2.3


From 9cf7243d5d83d27aca47f842107bfa02b5f11d16 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Mar 2016 12:54:09 +0100
Subject: sched: Make set_cpu_rq_start_time() a built in hotplug state

Start distangling the maze of hotplug notifiers in the scheduler.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpuhotplug.h |  1 +
 include/linux/sched.h      |  1 +
 kernel/cpu.c               |  6 ++++++
 kernel/sched/core.c        | 16 +++++++++-------
 4 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 5d68e15e46b7..99fd1d2f76fe 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -8,6 +8,7 @@ enum cpuhp_state {
 	CPUHP_BRINGUP_CPU,
 	CPUHP_AP_IDLE_DEAD,
 	CPUHP_AP_OFFLINE,
+	CPUHP_AP_SCHED_STARTING,
 	CPUHP_AP_NOTIFY_STARTING,
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 52c4847b05e2..39597d0a005e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -372,6 +372,7 @@ extern void cpu_init (void);
 extern void trap_init(void);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
+extern int sched_cpu_starting(unsigned int cpu);
 
 extern void sched_show_task(struct task_struct *p);
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3e3f6e49eabb..f46d02b966bf 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1236,6 +1236,12 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 		.name			= "ap:offline",
 		.cant_stop		= true,
 	},
+	/* First state is scheduler control. Interrupts are disabled */
+	[CPUHP_AP_SCHED_STARTING] = {
+		.name			= "sched:starting",
+		.startup		= sched_cpu_starting,
+		.teardown		= NULL,
+	},
 	/*
 	 * Low level startup/teardown notifiers. Run with interrupts
 	 * disabled. Will be removed once the notifiers are converted to
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8bfd7d4f1c21..4df9aaae27a2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5473,10 +5473,10 @@ static struct notifier_block migration_notifier = {
 	.priority = CPU_PRI_MIGRATION,
 };
 
-static void set_cpu_rq_start_time(void)
+static void set_cpu_rq_start_time(unsigned int cpu)
 {
-	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
+
 	rq->age_stamp = sched_clock_cpu(cpu);
 }
 
@@ -5486,10 +5486,6 @@ static int sched_cpu_active(struct notifier_block *nfb,
 	int cpu = (long)hcpu;
 
 	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_STARTING:
-		set_cpu_rq_start_time();
-		return NOTIFY_OK;
-
 	case CPU_DOWN_FAILED:
 		set_cpu_active(cpu, true);
 		return NOTIFY_OK;
@@ -5511,6 +5507,12 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
 	}
 }
 
+int sched_cpu_starting(unsigned int cpu)
+{
+	set_cpu_rq_start_time(cpu);
+	return 0;
+}
+
 static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
@@ -7426,7 +7428,7 @@ void __init sched_init(void)
 	if (cpu_isolated_map == NULL)
 		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 	idle_thread_set_boot_cpu();
-	set_cpu_rq_start_time();
+	set_cpu_rq_start_time(smp_processor_id());
 #endif
 	init_sched_fair_class();
 
-- 
cgit v1.2.3


From e26fbffd32c28107d9d268b432706ccf84fb6411 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Mar 2016 12:54:10 +0100
Subject: sched: Allow hotplug notifiers to be setup early

Prevent the SMP scheduler related notifiers to be executed before the smp
scheduler is initialized and install them early.

This is a preparatory change for further consolidation of the hotplug notifier
maze.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 59 ++++++++++++++++++++++++++++++++---------------------
 1 file changed, 36 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4df9aaae27a2..328502c9af00 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5195,6 +5195,8 @@ out:
 
 #ifdef CONFIG_SMP
 
+static bool sched_smp_initialized __read_mostly;
+
 #ifdef CONFIG_NUMA_BALANCING
 /* Migrate current task p to target_cpu */
 int migrate_task_to(struct task_struct *p, int target_cpu)
@@ -5513,25 +5515,6 @@ int sched_cpu_starting(unsigned int cpu)
 	return 0;
 }
 
-static int __init migration_init(void)
-{
-	void *cpu = (void *)(long)smp_processor_id();
-	int err;
-
-	/* Initialize migration for the boot CPU */
-	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
-	BUG_ON(err == NOTIFY_BAD);
-	migration_call(&migration_notifier, CPU_ONLINE, cpu);
-	register_cpu_notifier(&migration_notifier);
-
-	/* Register cpu active notifiers */
-	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
-	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
-
-	return 0;
-}
-early_initcall(migration_init);
-
 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 
 #ifdef CONFIG_SCHED_DEBUG
@@ -6711,6 +6694,9 @@ static int sched_domains_numa_masks_update(struct notifier_block *nfb,
 {
 	int cpu = (long)hcpu;
 
+	if (!sched_smp_initialized)
+		return NOTIFY_DONE;
+
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_ONLINE:
 		sched_domains_numa_masks_set(cpu);
@@ -7129,6 +7115,9 @@ static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
 			     void *hcpu)
 {
+	if (!sched_smp_initialized)
+		return NOTIFY_DONE;
+
 	switch (action) {
 	case CPU_ONLINE_FROZEN:
 	case CPU_DOWN_FAILED_FROZEN:
@@ -7169,6 +7158,9 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
 	bool overflow;
 	int cpus;
 
+	if (!sched_smp_initialized)
+		return NOTIFY_DONE;
+
 	switch (action) {
 	case CPU_DOWN_PREPARE:
 		rcu_read_lock_sched();
@@ -7216,10 +7208,6 @@ void __init sched_init_smp(void)
 		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
 	mutex_unlock(&sched_domains_mutex);
 
-	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
-	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
-	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
-
 	init_hrtick();
 
 	/* Move init over to a non-isolated CPU */
@@ -7230,7 +7218,32 @@ void __init sched_init_smp(void)
 
 	init_sched_rt_class();
 	init_sched_dl_class();
+	sched_smp_initialized = true;
 }
+
+static int __init migration_init(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	int err;
+
+	/* Initialize migration for the boot CPU */
+	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+	BUG_ON(err == NOTIFY_BAD);
+	migration_call(&migration_notifier, CPU_ONLINE, cpu);
+	register_cpu_notifier(&migration_notifier);
+
+	/* Register cpu active notifiers */
+	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
+	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
+	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
+
+	return 0;
+}
+early_initcall(migration_init);
+
 #else
 void __init sched_init_smp(void)
 {
-- 
cgit v1.2.3


From 135fb3e19773e66f56b60e3b9fdda6166e77c55d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Mar 2016 12:54:11 +0100
Subject: sched: Consolidate the notifier maze

We can maintain the ordering of the scheduler cpu hotplug functionality nicely
in one notifer. Get rid of the maze.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpu.h |  12 ++--
 kernel/sched/core.c | 174 +++++++++++++++++++++-------------------------------
 2 files changed, 73 insertions(+), 113 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index f9b1fab4388a..17017051bfb1 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -61,19 +61,15 @@ struct notifier_block;
 enum {
 	/*
 	 * SCHED_ACTIVE marks a cpu which is coming up active during
-	 * CPU_ONLINE and CPU_DOWN_FAILED and must be the first
-	 * notifier.  CPUSET_ACTIVE adjusts cpuset according to
-	 * cpu_active mask right after SCHED_ACTIVE.  During
-	 * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are
-	 * ordered in the similar way.
+	 * CPU_ONLINE and CPU_DOWN_FAILED and must be the first notifier.  Is
+	 * also cpuset according to cpu_active mask right after activating the
+	 * cpu. During CPU_DOWN_PREPARE, SCHED_INACTIVE reversed the operation.
 	 *
 	 * This ordering guarantees consistent cpu_active mask and
 	 * migration behavior to all cpu notifiers.
 	 */
 	CPU_PRI_SCHED_ACTIVE	= INT_MAX,
-	CPU_PRI_CPUSET_ACTIVE	= INT_MAX - 1,
-	CPU_PRI_SCHED_INACTIVE	= INT_MIN + 1,
-	CPU_PRI_CPUSET_INACTIVE	= INT_MIN,
+	CPU_PRI_SCHED_INACTIVE	= INT_MIN,
 
 	/* migration should happen before other stuff but after perf */
 	CPU_PRI_PERF		= 20,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 328502c9af00..8a03a04f5c3a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5482,39 +5482,6 @@ static void set_cpu_rq_start_time(unsigned int cpu)
 	rq->age_stamp = sched_clock_cpu(cpu);
 }
 
-static int sched_cpu_active(struct notifier_block *nfb,
-				      unsigned long action, void *hcpu)
-{
-	int cpu = (long)hcpu;
-
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_DOWN_FAILED:
-		set_cpu_active(cpu, true);
-		return NOTIFY_OK;
-
-	default:
-		return NOTIFY_DONE;
-	}
-}
-
-static int sched_cpu_inactive(struct notifier_block *nfb,
-					unsigned long action, void *hcpu)
-{
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_DOWN_PREPARE:
-		set_cpu_active((long)hcpu, false);
-		return NOTIFY_OK;
-	default:
-		return NOTIFY_DONE;
-	}
-}
-
-int sched_cpu_starting(unsigned int cpu)
-{
-	set_cpu_rq_start_time(cpu);
-	return 0;
-}
-
 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
 
 #ifdef CONFIG_SCHED_DEBUG
@@ -6662,10 +6629,13 @@ static void sched_init_numa(void)
 	init_numa_topology_type();
 }
 
-static void sched_domains_numa_masks_set(int cpu)
+static void sched_domains_numa_masks_set(unsigned int cpu)
 {
-	int i, j;
 	int node = cpu_to_node(cpu);
+	int i, j;
+
+	if (!sched_smp_initialized)
+		return;
 
 	for (i = 0; i < sched_domains_numa_levels; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
@@ -6675,54 +6645,23 @@ static void sched_domains_numa_masks_set(int cpu)
 	}
 }
 
-static void sched_domains_numa_masks_clear(int cpu)
+static void sched_domains_numa_masks_clear(unsigned int cpu)
 {
 	int i, j;
+
+	if (!sched_smp_initialized)
+		return;
+
 	for (i = 0; i < sched_domains_numa_levels; i++) {
 		for (j = 0; j < nr_node_ids; j++)
 			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
 	}
 }
 
-/*
- * Update sched_domains_numa_masks[level][node] array when new cpus
- * are onlined.
- */
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
-					   unsigned long action,
-					   void *hcpu)
-{
-	int cpu = (long)hcpu;
-
-	if (!sched_smp_initialized)
-		return NOTIFY_DONE;
-
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_ONLINE:
-		sched_domains_numa_masks_set(cpu);
-		break;
-
-	case CPU_DEAD:
-		sched_domains_numa_masks_clear(cpu);
-		break;
-
-	default:
-		return NOTIFY_DONE;
-	}
-
-	return NOTIFY_OK;
-}
 #else
-static inline void sched_init_numa(void)
-{
-}
-
-static int sched_domains_numa_masks_update(struct notifier_block *nfb,
-					   unsigned long action,
-					   void *hcpu)
-{
-	return 0;
-}
+static inline void sched_init_numa(void) { }
+static void sched_domains_numa_masks_set(unsigned int cpu) { }
+static void sched_domains_numa_masks_clear(unsigned int cpu) { }
 #endif /* CONFIG_NUMA */
 
 static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -7112,16 +7051,12 @@ static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
  * If we come here as part of a suspend/resume, don't touch cpusets because we
  * want to restore it back to its original state upon resume anyway.
  */
-static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
-			     void *hcpu)
+static void cpuset_cpu_active(bool frozen)
 {
 	if (!sched_smp_initialized)
-		return NOTIFY_DONE;
-
-	switch (action) {
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_FAILED_FROZEN:
+		return;
 
+	if (frozen) {
 		/*
 		 * num_cpus_frozen tracks how many CPUs are involved in suspend
 		 * resume sequence. As long as this is not the last online
@@ -7131,38 +7066,28 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
 		num_cpus_frozen--;
 		if (likely(num_cpus_frozen)) {
 			partition_sched_domains(1, NULL, NULL);
-			break;
+			return;
 		}
-
 		/*
 		 * This is the last CPU online operation. So fall through and
 		 * restore the original sched domains by considering the
 		 * cpuset configurations.
 		 */
-
-	case CPU_ONLINE:
-		cpuset_update_active_cpus(true);
-		break;
-	default:
-		return NOTIFY_DONE;
 	}
-	return NOTIFY_OK;
+	cpuset_update_active_cpus(true);
 }
 
-static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
-			       void *hcpu)
+static int cpuset_cpu_inactive(unsigned int cpu, bool frozen)
 {
 	unsigned long flags;
-	long cpu = (long)hcpu;
 	struct dl_bw *dl_b;
 	bool overflow;
 	int cpus;
 
 	if (!sched_smp_initialized)
-		return NOTIFY_DONE;
+		return 0;
 
-	switch (action) {
-	case CPU_DOWN_PREPARE:
+	if (!frozen) {
 		rcu_read_lock_sched();
 		dl_b = dl_bw_of(cpu);
 
@@ -7174,17 +7099,60 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
 		rcu_read_unlock_sched();
 
 		if (overflow)
-			return notifier_from_errno(-EBUSY);
+			return -EBUSY;
 		cpuset_update_active_cpus(false);
-		break;
-	case CPU_DOWN_PREPARE_FROZEN:
+	} else {
 		num_cpus_frozen++;
 		partition_sched_domains(1, NULL, NULL);
-		break;
+	}
+	return 0;
+}
+
+static int sched_cpu_active(struct notifier_block *nfb, unsigned long action,
+			    void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_FAILED:
+	case CPU_ONLINE:
+		set_cpu_active(cpu, true);
+		sched_domains_numa_masks_set(cpu);
+		cpuset_cpu_active(action & CPU_TASKS_FROZEN);
+		return NOTIFY_OK;
 	default:
 		return NOTIFY_DONE;
 	}
-	return NOTIFY_OK;
+}
+
+static int sched_cpu_inactive(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	int ret;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		set_cpu_active(cpu, false);
+		ret = cpuset_cpu_inactive(cpu, action & CPU_TASKS_FROZEN);
+		if (ret) {
+			set_cpu_active(cpu, true);
+			return notifier_from_errno(ret);
+		}
+		return NOTIFY_OK;
+
+	case CPU_DEAD:
+		sched_domains_numa_masks_clear(cpu);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+int sched_cpu_starting(unsigned int cpu)
+{
+	set_cpu_rq_start_time(cpu);
+	return 0;
 }
 
 void __init sched_init_smp(void)
@@ -7236,10 +7204,6 @@ static int __init migration_init(void)
 	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
 	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
 
-	hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
-	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
-	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
-
 	return 0;
 }
 early_initcall(migration_init);
-- 
cgit v1.2.3


From c6d2c7475ced868cfc636cd5a55a428da94537c3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Mar 2016 12:54:12 +0100
Subject: sched: Move sched_domains_numa_masks_clear() to DOWN_PREPARE

This is the last operation on the cpu before vanishing. No point in calling
that on CPU_DEAD.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a03a04f5c3a..541f9ab8ce4f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7139,9 +7139,6 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
 			set_cpu_active(cpu, true);
 			return notifier_from_errno(ret);
 		}
-		return NOTIFY_OK;
-
-	case CPU_DEAD:
 		sched_domains_numa_masks_clear(cpu);
 		return NOTIFY_OK;
 	default:
-- 
cgit v1.2.3


From 40190a78f85fec29f0fdd21f6b4415712085711e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Mar 2016 12:54:13 +0100
Subject: sched/hotplug: Convert cpu_[in]active notifiers to state machine

Now that we reduced everything into single notifiers, it's simple to move them
into the hotplug state machine space.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpu.h        | 12 ---------
 include/linux/cpuhotplug.h |  1 +
 include/linux/sched.h      |  2 ++
 kernel/cpu.c               |  8 ++++--
 kernel/sched/core.c        | 67 +++++++++++++++-------------------------------
 5 files changed, 30 insertions(+), 60 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 17017051bfb1..b22b000cf6ee 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -59,18 +59,6 @@ struct notifier_block;
  * CPU notifier priorities.
  */
 enum {
-	/*
-	 * SCHED_ACTIVE marks a cpu which is coming up active during
-	 * CPU_ONLINE and CPU_DOWN_FAILED and must be the first notifier.  Is
-	 * also cpuset according to cpu_active mask right after activating the
-	 * cpu. During CPU_DOWN_PREPARE, SCHED_INACTIVE reversed the operation.
-	 *
-	 * This ordering guarantees consistent cpu_active mask and
-	 * migration behavior to all cpu notifiers.
-	 */
-	CPU_PRI_SCHED_ACTIVE	= INT_MAX,
-	CPU_PRI_SCHED_INACTIVE	= INT_MIN,
-
 	/* migration should happen before other stuff but after perf */
 	CPU_PRI_PERF		= 20,
 	CPU_PRI_MIGRATION	= 10,
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 99fd1d2f76fe..9e07468bf1c5 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -13,6 +13,7 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
 	CPUHP_AP_ONLINE_IDLE,
+	CPUHP_AP_ACTIVE,
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_NOTIFY_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 39597d0a005e..1e5f961b1a74 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -373,6 +373,8 @@ extern void trap_init(void);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
 extern int sched_cpu_starting(unsigned int cpu);
+extern int sched_cpu_activate(unsigned int cpu);
+extern int sched_cpu_deactivate(unsigned int cpu);
 
 extern void sched_show_task(struct task_struct *p);
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f46d02b966bf..15402b72fbc7 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -923,8 +923,6 @@ void cpuhp_online_idle(enum cpuhp_state state)
 
 	st->state = CPUHP_AP_ONLINE_IDLE;
 
-	/* The cpu is marked online, set it active now */
-	set_cpu_active(cpu, true);
 	/* Unpark the stopper thread and the hotplug thread of this cpu */
 	stop_machine_unpark(cpu);
 	kthread_unpark(st->thread);
@@ -1259,6 +1257,12 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 	[CPUHP_AP_ONLINE] = {
 		.name			= "ap:online",
 	},
+	/* First state is scheduler control. Interrupts are enabled */
+	[CPUHP_AP_ACTIVE] = {
+		.name			= "sched:active",
+		.startup		= sched_cpu_activate,
+		.teardown		= sched_cpu_deactivate,
+	},
 	/* Handle smpboot threads park/unpark */
 	[CPUHP_AP_SMPBOOT_THREADS] = {
 		.name			= "smpboot:threads",
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 541f9ab8ce4f..73bcd937d436 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6634,9 +6634,6 @@ static void sched_domains_numa_masks_set(unsigned int cpu)
 	int node = cpu_to_node(cpu);
 	int i, j;
 
-	if (!sched_smp_initialized)
-		return;
-
 	for (i = 0; i < sched_domains_numa_levels; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
 			if (node_distance(j, node) <= sched_domains_numa_distance[i])
@@ -6649,9 +6646,6 @@ static void sched_domains_numa_masks_clear(unsigned int cpu)
 {
 	int i, j;
 
-	if (!sched_smp_initialized)
-		return;
-
 	for (i = 0; i < sched_domains_numa_levels; i++) {
 		for (j = 0; j < nr_node_ids; j++)
 			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
@@ -7051,12 +7045,9 @@ static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
  * If we come here as part of a suspend/resume, don't touch cpusets because we
  * want to restore it back to its original state upon resume anyway.
  */
-static void cpuset_cpu_active(bool frozen)
+static void cpuset_cpu_active(void)
 {
-	if (!sched_smp_initialized)
-		return;
-
-	if (frozen) {
+	if (cpuhp_tasks_frozen) {
 		/*
 		 * num_cpus_frozen tracks how many CPUs are involved in suspend
 		 * resume sequence. As long as this is not the last online
@@ -7077,17 +7068,14 @@ static void cpuset_cpu_active(bool frozen)
 	cpuset_update_active_cpus(true);
 }
 
-static int cpuset_cpu_inactive(unsigned int cpu, bool frozen)
+static int cpuset_cpu_inactive(unsigned int cpu)
 {
 	unsigned long flags;
 	struct dl_bw *dl_b;
 	bool overflow;
 	int cpus;
 
-	if (!sched_smp_initialized)
-		return 0;
-
-	if (!frozen) {
+	if (!cpuhp_tasks_frozen) {
 		rcu_read_lock_sched();
 		dl_b = dl_bw_of(cpu);
 
@@ -7108,42 +7096,33 @@ static int cpuset_cpu_inactive(unsigned int cpu, bool frozen)
 	return 0;
 }
 
-static int sched_cpu_active(struct notifier_block *nfb, unsigned long action,
-			    void *hcpu)
+int sched_cpu_activate(unsigned int cpu)
 {
-	unsigned int cpu = (unsigned long)hcpu;
+	set_cpu_active(cpu, true);
 
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_DOWN_FAILED:
-	case CPU_ONLINE:
-		set_cpu_active(cpu, true);
+	if (sched_smp_initialized) {
 		sched_domains_numa_masks_set(cpu);
-		cpuset_cpu_active(action & CPU_TASKS_FROZEN);
-		return NOTIFY_OK;
-	default:
-		return NOTIFY_DONE;
+		cpuset_cpu_active();
 	}
+	return 0;
 }
 
-static int sched_cpu_inactive(struct notifier_block *nfb,
-					unsigned long action, void *hcpu)
+int sched_cpu_deactivate(unsigned int cpu)
 {
-	unsigned int cpu = (unsigned long)hcpu;
 	int ret;
 
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_DOWN_PREPARE:
-		set_cpu_active(cpu, false);
-		ret = cpuset_cpu_inactive(cpu, action & CPU_TASKS_FROZEN);
-		if (ret) {
-			set_cpu_active(cpu, true);
-			return notifier_from_errno(ret);
-		}
-		sched_domains_numa_masks_clear(cpu);
-		return NOTIFY_OK;
-	default:
-		return NOTIFY_DONE;
+	set_cpu_active(cpu, false);
+
+	if (!sched_smp_initialized)
+		return 0;
+
+	ret = cpuset_cpu_inactive(cpu);
+	if (ret) {
+		set_cpu_active(cpu, true);
+		return ret;
 	}
+	sched_domains_numa_masks_clear(cpu);
+	return 0;
 }
 
 int sched_cpu_starting(unsigned int cpu)
@@ -7197,10 +7176,6 @@ static int __init migration_init(void)
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 
-	/* Register cpu active notifiers */
-	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
-	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
-
 	return 0;
 }
 early_initcall(migration_init);
-- 
cgit v1.2.3


From b2454caa8977ade27292a71f2def5e403e24b4d5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 10 Mar 2016 12:54:14 +0100
Subject: sched/hotplug: Move sync_rcu to be with set_cpu_active(false)

The sync_rcu stuff is specificically for clearing bits in the active
mask, such that everybody will observe the bit cleared and will not
consider the cleared CPU for load-balancing etc.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160310120025.169219710@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/cpu.c        | 15 ---------------
 kernel/sched/core.c | 14 ++++++++++++++
 2 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 15402b72fbc7..c134a35374a1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -703,21 +703,6 @@ static int takedown_cpu(unsigned int cpu)
 	struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
 	int err;
 
-	/*
-	 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
-	 * and RCU users of this state to go away such that all new such users
-	 * will observe it.
-	 *
-	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
-	 * not imply sync_sched(), so wait for both.
-	 *
-	 * Do sync before park smpboot threads to take care the rcu boost case.
-	 */
-	if (IS_ENABLED(CONFIG_PREEMPT))
-		synchronize_rcu_mult(call_rcu, call_rcu_sched);
-	else
-		synchronize_rcu();
-
 	/* Park the smpboot threads */
 	kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
 	smpboot_park_threads(cpu);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 73bcd937d436..0a3107809972 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7112,6 +7112,20 @@ int sched_cpu_deactivate(unsigned int cpu)
 	int ret;
 
 	set_cpu_active(cpu, false);
+	/*
+	 * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
+	 * users of this state to go away such that all new such users will
+	 * observe it.
+	 *
+	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+	 * not imply sync_sched(), so wait for both.
+	 *
+	 * Do sync before park smpboot threads to take care the rcu boost case.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT))
+		synchronize_rcu_mult(call_rcu, call_rcu_sched);
+	else
+		synchronize_rcu();
 
 	if (!sched_smp_initialized)
 		return 0;
-- 
cgit v1.2.3


From 94baf7a5d882cde0b4d591f4ab89cc32ee39ac6a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Mar 2016 12:54:15 +0100
Subject: sched/migration: Move prepare transition to SCHED_STARTING state

We can piggy pack that on the SCHED_STARTING state. It's not required before
the cpu actually comes online. Name the function proper as it has nothing to
do with migration.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160310120025.248226511@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0a3107809972..bafc308e6d45 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5424,11 +5424,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 	switch (action & ~CPU_TASKS_FROZEN) {
 
-	case CPU_UP_PREPARE:
-		rq->calc_load_update = calc_load_update;
-		account_reset_rq(rq);
-		break;
-
 	case CPU_ONLINE:
 		/* Update our root-domain */
 		raw_spin_lock_irqsave(&rq->lock, flags);
@@ -7139,9 +7134,19 @@ int sched_cpu_deactivate(unsigned int cpu)
 	return 0;
 }
 
+static void sched_rq_cpu_starting(unsigned int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	rq->calc_load_update = calc_load_update;
+	account_reset_rq(rq);
+	update_max_interval();
+}
+
 int sched_cpu_starting(unsigned int cpu)
 {
 	set_cpu_rq_start_time(cpu);
+	sched_rq_cpu_starting(cpu);
 	return 0;
 }
 
@@ -7182,11 +7187,8 @@ void __init sched_init_smp(void)
 static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
-	int err;
 
-	/* Initialize migration for the boot CPU */
-	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
-	BUG_ON(err == NOTIFY_BAD);
+	sched_rq_cpu_starting(smp_processor_id());
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 
-- 
cgit v1.2.3


From e9cd8fa4fcfd67c95db9b87c0fff88fa23cb00e5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Mar 2016 12:54:16 +0100
Subject: sched/migration: Move calc_load_migrate() into CPU_DYING

It really does not matter when we fold the load for the outgoing cpu. It's
almost dead anyway, so there is no harm if we fail to fold the few
microseconds which are required for going fully away.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160310120025.328739226@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bafc308e6d45..688e8a83208c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5447,9 +5447,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		migrate_tasks(rq);
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
-		break;
-
-	case CPU_DEAD:
 		calc_load_migrate(rq);
 		break;
 #endif
-- 
cgit v1.2.3


From 7d97669933eb94245ec9b715753753ec5ca8f646 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Mar 2016 12:54:17 +0100
Subject: sched/migration: Move CPU_ONLINE into scheduler state

The alleged requirement that the migration notifier has a lower priority than
perf is completely undocumented and there is no indication at all that this is
true. perf does not even handle the CPU_ONLINE notification and perf really
has nothing to do with migration.

Move the CPU_ONLINE code into the sched_activate_cpu() state callback.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160310120025.421743581@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 688e8a83208c..8d8d9034edff 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5424,17 +5424,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 	switch (action & ~CPU_TASKS_FROZEN) {
 
-	case CPU_ONLINE:
-		/* Update our root-domain */
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		if (rq->rd) {
-			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-
-			set_rq_online(rq);
-		}
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
-		break;
-
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_DYING:
 		sched_ttwu_pending();
@@ -7090,12 +7079,34 @@ static int cpuset_cpu_inactive(unsigned int cpu)
 
 int sched_cpu_activate(unsigned int cpu)
 {
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
 	set_cpu_active(cpu, true);
 
 	if (sched_smp_initialized) {
 		sched_domains_numa_masks_set(cpu);
 		cpuset_cpu_active();
 	}
+
+	/*
+	 * Put the rq online, if not already. This happens:
+	 *
+	 * 1) In the early boot process, because we build the real domains
+	 *    after all cpus have been brought up.
+	 *
+	 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
+	 *    domains.
+	 */
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	if (rq->rd) {
+		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+		set_rq_online(rq);
+	}
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	update_max_interval();
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From f2785ddb5367e217365099294b89d6a84668069e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Mar 2016 12:54:18 +0100
Subject: sched/hotplug: Move migration CPU_DYING to sched_cpu_dying()

Remove the hotplug notifier and make it an explicit state.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160310120025.502222097@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpu.h   |  2 --
 include/linux/sched.h |  6 +++++
 kernel/cpu.c          |  2 +-
 kernel/sched/core.c   | 72 ++++++++++++++++-----------------------------------
 4 files changed, 29 insertions(+), 53 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index b22b000cf6ee..21597dcac0e2 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -59,9 +59,7 @@ struct notifier_block;
  * CPU notifier priorities.
  */
 enum {
-	/* migration should happen before other stuff but after perf */
 	CPU_PRI_PERF		= 20,
-	CPU_PRI_MIGRATION	= 10,
 
 	/* bring up workqueues before normal notifiers and down after */
 	CPU_PRI_WORKQUEUE_UP	= 5,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e5f961b1a74..47835cf8aefa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -376,6 +376,12 @@ extern int sched_cpu_starting(unsigned int cpu);
 extern int sched_cpu_activate(unsigned int cpu);
 extern int sched_cpu_deactivate(unsigned int cpu);
 
+#ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_dying(unsigned int cpu);
+#else
+# define sched_cpu_dying	NULL
+#endif
+
 extern void sched_show_task(struct task_struct *p);
 
 #ifdef CONFIG_LOCKUP_DETECTOR
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c134a35374a1..d6eeb8c5ef88 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1223,7 +1223,7 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 	[CPUHP_AP_SCHED_STARTING] = {
 		.name			= "sched:starting",
 		.startup		= sched_cpu_starting,
-		.teardown		= NULL,
+		.teardown		= sched_cpu_dying,
 	},
 	/*
 	 * Low level startup/teardown notifiers. Run with interrupts
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8d8d9034edff..a9a65ed772e3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5411,51 +5411,6 @@ static void set_rq_offline(struct rq *rq)
 	}
 }
 
-/*
- * migration_call - callback that gets triggered when a CPU is added.
- * Here we can start up the necessary migration thread for the new CPU.
- */
-static int
-migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-	int cpu = (long)hcpu;
-	unsigned long flags;
-	struct rq *rq = cpu_rq(cpu);
-
-	switch (action & ~CPU_TASKS_FROZEN) {
-
-#ifdef CONFIG_HOTPLUG_CPU
-	case CPU_DYING:
-		sched_ttwu_pending();
-		/* Update our root-domain */
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		if (rq->rd) {
-			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
-			set_rq_offline(rq);
-		}
-		migrate_tasks(rq);
-		BUG_ON(rq->nr_running != 1); /* the migration thread */
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
-		calc_load_migrate(rq);
-		break;
-#endif
-	}
-
-	update_max_interval();
-
-	return NOTIFY_OK;
-}
-
-/*
- * Register at high priority so that task migration (migrate_all_tasks)
- * happens before everything else.  This has to be lower priority than
- * the notifier in the perf_event subsystem, though.
- */
-static struct notifier_block migration_notifier = {
-	.notifier_call = migration_call,
-	.priority = CPU_PRI_MIGRATION,
-};
-
 static void set_cpu_rq_start_time(unsigned int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -7158,6 +7113,28 @@ int sched_cpu_starting(unsigned int cpu)
 	return 0;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+int sched_cpu_dying(unsigned int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	/* Handle pending wakeups and then migrate everything off */
+	sched_ttwu_pending();
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	if (rq->rd) {
+		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+		set_rq_offline(rq);
+	}
+	migrate_tasks(rq);
+	BUG_ON(rq->nr_running != 1);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	calc_load_migrate(rq);
+	update_max_interval();
+	return 0;
+}
+#endif
+
 void __init sched_init_smp(void)
 {
 	cpumask_var_t non_isolated_cpus;
@@ -7194,12 +7171,7 @@ void __init sched_init_smp(void)
 
 static int __init migration_init(void)
 {
-	void *cpu = (void *)(long)smp_processor_id();
-
 	sched_rq_cpu_starting(smp_processor_id());
-	migration_call(&migration_notifier, CPU_ONLINE, cpu);
-	register_cpu_notifier(&migration_notifier);
-
 	return 0;
 }
 early_initcall(migration_init);
-- 
cgit v1.2.3


From aaddd7d1c740ab3c5efaad7a34650b6dc680c21c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Mar 2016 12:54:19 +0100
Subject: sched/hotplug: Make activate() the last hotplug step

The scheduler can handle per cpu threads before the cpu is set to active and
it does not allow user space threads on the cpu before active is
set. Attaching to the scheduling domains is also not required before user
space threads can be handled.

Move the activation to the end of the hotplug state space. That also means
that deactivation is the first action when a cpu is shut down.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160310120025.597477199@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/cpuhotplug.h |  2 +-
 kernel/cpu.c               | 15 +++++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 9e07468bf1c5..386374d19987 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -13,11 +13,11 @@ enum cpuhp_state {
 	CPUHP_AP_ONLINE,
 	CPUHP_TEARDOWN_CPU,
 	CPUHP_AP_ONLINE_IDLE,
-	CPUHP_AP_ACTIVE,
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_NOTIFY_ONLINE,
 	CPUHP_AP_ONLINE_DYN,
 	CPUHP_AP_ONLINE_DYN_END		= CPUHP_AP_ONLINE_DYN + 30,
+	CPUHP_AP_ACTIVE,
 	CPUHP_ONLINE,
 };
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d6eeb8c5ef88..d948e44c471e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1242,12 +1242,6 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 	[CPUHP_AP_ONLINE] = {
 		.name			= "ap:online",
 	},
-	/* First state is scheduler control. Interrupts are enabled */
-	[CPUHP_AP_ACTIVE] = {
-		.name			= "sched:active",
-		.startup		= sched_cpu_activate,
-		.teardown		= sched_cpu_deactivate,
-	},
 	/* Handle smpboot threads park/unpark */
 	[CPUHP_AP_SMPBOOT_THREADS] = {
 		.name			= "smpboot:threads",
@@ -1269,6 +1263,15 @@ static struct cpuhp_step cpuhp_ap_states[] = {
 	 * The dynamically registered state space is here
 	 */
 
+#ifdef CONFIG_SMP
+	/* Last state is scheduler control setting the cpu active */
+	[CPUHP_AP_ACTIVE] = {
+		.name			= "sched:active",
+		.startup		= sched_cpu_activate,
+		.teardown		= sched_cpu_deactivate,
+	},
+#endif
+
 	/* CPU is fully up and running. */
 	[CPUHP_ONLINE] = {
 		.name			= "online",
-- 
cgit v1.2.3


From 20a5c8cc74ade5027c2b0e2bc724278afd6054f3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Mar 2016 12:54:20 +0100
Subject: sched/fair: Make ilb_notifier an explicit call

No need for an extra notifier.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160310120025.693720241@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c  |  1 +
 kernel/sched/fair.c  | 15 +--------------
 kernel/sched/sched.h |  4 ++++
 3 files changed, 6 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a9a65ed772e3..28ffd6882ea6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7131,6 +7131,7 @@ int sched_cpu_dying(unsigned int cpu)
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 	calc_load_migrate(rq);
 	update_max_interval();
+	nohz_balance_exit_idle(cpu);
 	return 0;
 }
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0fe30e66aff1..8b6db3626566 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7711,7 +7711,7 @@ static void nohz_balancer_kick(void)
 	return;
 }
 
-static inline void nohz_balance_exit_idle(int cpu)
+void nohz_balance_exit_idle(unsigned int cpu)
 {
 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
 		/*
@@ -7784,18 +7784,6 @@ void nohz_balance_enter_idle(int cpu)
 	atomic_inc(&nohz.nr_cpus);
 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 }
-
-static int sched_ilb_notifier(struct notifier_block *nfb,
-					unsigned long action, void *hcpu)
-{
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_DYING:
-		nohz_balance_exit_idle(smp_processor_id());
-		return NOTIFY_OK;
-	default:
-		return NOTIFY_DONE;
-	}
-}
 #endif
 
 static DEFINE_SPINLOCK(balancing);
@@ -8600,7 +8588,6 @@ __init void init_sched_fair_class(void)
 #ifdef CONFIG_NO_HZ_COMMON
 	nohz.next_balance = jiffies;
 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
-	cpu_notifier(sched_ilb_notifier, 0);
 #endif
 #endif /* SMP */
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec2e8d23527e..16a27b624ee5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1743,6 +1743,10 @@ enum rq_nohz_flag_bits {
 };
 
 #define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)
+
+extern void nohz_balance_exit_idle(unsigned int cpu);
+#else
+static inline void nohz_balance_exit_idle(unsigned int cpu) { }
 #endif
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
-- 
cgit v1.2.3


From e5ef27d0f5acf9f1db2882d7546a41c021f66820 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 10 Mar 2016 12:54:21 +0100
Subject: sched: Make hrtick_notifier an explicit call

No need for an extra notifier. We don't need to handle all these states. It's
sufficient to kill the timer when the cpu dies.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: rt@linutronix.de
Link: http://lkml.kernel.org/r/20160310120025.770528462@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 34 +---------------------------------
 1 file changed, 1 insertion(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 28ffd6882ea6..9c710ad0ac22 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -249,29 +249,6 @@ void hrtick_start(struct rq *rq, u64 delay)
 	}
 }
 
-static int
-hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-	int cpu = (int)(long)hcpu;
-
-	switch (action) {
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		hrtick_clear(cpu_rq(cpu));
-		return NOTIFY_OK;
-	}
-
-	return NOTIFY_DONE;
-}
-
-static __init void init_hrtick(void)
-{
-	hotcpu_notifier(hotplug_hrtick, 0);
-}
 #else
 /*
  * Called to set the hrtick timer state.
@@ -288,10 +265,6 @@ void hrtick_start(struct rq *rq, u64 delay)
 	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
 		      HRTIMER_MODE_REL_PINNED);
 }
-
-static inline void init_hrtick(void)
-{
-}
 #endif /* CONFIG_SMP */
 
 static void init_rq_hrtick(struct rq *rq)
@@ -315,10 +288,6 @@ static inline void hrtick_clear(struct rq *rq)
 static inline void init_rq_hrtick(struct rq *rq)
 {
 }
-
-static inline void init_hrtick(void)
-{
-}
 #endif	/* CONFIG_SCHED_HRTICK */
 
 /*
@@ -7132,6 +7101,7 @@ int sched_cpu_dying(unsigned int cpu)
 	calc_load_migrate(rq);
 	update_max_interval();
 	nohz_balance_exit_idle(cpu);
+	hrtick_clear(rq);
 	return 0;
 }
 #endif
@@ -7157,8 +7127,6 @@ void __init sched_init_smp(void)
 		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
 	mutex_unlock(&sched_domains_mutex);
 
-	init_hrtick();
-
 	/* Move init over to a non-isolated CPU */
 	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
 		BUG();
-- 
cgit v1.2.3


From 1a0dc1ac1d2928e25739ee82d7e04423b01da563 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 5 May 2016 19:49:09 -0700
Subject: bpf: cleanup verifier code

cleanup verifier code and prepare it for addition of "pointer to packet" logic

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 100 ++++++++++++++++++++++++++------------------------
 1 file changed, 53 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 63554b6d4e25..afeb62808902 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -249,28 +249,30 @@ static const char * const reg_type_str[] = {
 	[CONST_IMM]		= "imm",
 };
 
-static void print_verifier_state(struct verifier_env *env)
+static void print_verifier_state(struct verifier_state *state)
 {
+	struct reg_state *reg;
 	enum bpf_reg_type t;
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++) {
-		t = env->cur_state.regs[i].type;
+		reg = &state->regs[i];
+		t = reg->type;
 		if (t == NOT_INIT)
 			continue;
 		verbose(" R%d=%s", i, reg_type_str[t]);
 		if (t == CONST_IMM || t == PTR_TO_STACK)
-			verbose("%ld", env->cur_state.regs[i].imm);
+			verbose("%ld", reg->imm);
 		else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
 			 t == PTR_TO_MAP_VALUE_OR_NULL)
 			verbose("(ks=%d,vs=%d)",
-				env->cur_state.regs[i].map_ptr->key_size,
-				env->cur_state.regs[i].map_ptr->value_size);
+				reg->map_ptr->key_size,
+				reg->map_ptr->value_size);
 	}
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
-		if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
+		if (state->stack_slot_type[i] == STACK_SPILL)
 			verbose(" fp%d=%s", -MAX_BPF_STACK + i,
-				reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
+				reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
 	}
 	verbose("\n");
 }
@@ -686,10 +688,11 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 			    int value_regno)
 {
 	struct verifier_state *state = &env->cur_state;
+	struct reg_state *reg = &state->regs[regno];
 	int size, err = 0;
 
-	if (state->regs[regno].type == PTR_TO_STACK)
-		off += state->regs[regno].imm;
+	if (reg->type == PTR_TO_STACK)
+		off += reg->imm;
 
 	size = bpf_size_to_bytes(bpf_size);
 	if (size < 0)
@@ -700,7 +703,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 		return -EACCES;
 	}
 
-	if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+	if (reg->type == PTR_TO_MAP_VALUE) {
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
 			verbose("R%d leaks addr into map\n", value_regno);
@@ -710,7 +713,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown_value(state->regs, value_regno);
 
-	} else if (state->regs[regno].type == PTR_TO_CTX) {
+	} else if (reg->type == PTR_TO_CTX) {
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
 			verbose("R%d leaks addr into ctx\n", value_regno);
@@ -720,8 +723,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown_value(state->regs, value_regno);
 
-	} else if (state->regs[regno].type == FRAME_PTR ||
-		   state->regs[regno].type == PTR_TO_STACK) {
+	} else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
 		if (off >= 0 || off < -MAX_BPF_STACK) {
 			verbose("invalid stack off=%d size=%d\n", off, size);
 			return -EACCES;
@@ -739,7 +741,7 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 		}
 	} else {
 		verbose("R%d invalid mem access '%s'\n",
-			regno, reg_type_str[state->regs[regno].type]);
+			regno, reg_type_str[reg->type]);
 		return -EACCES;
 	}
 	return err;
@@ -1104,7 +1106,7 @@ static int check_call(struct verifier_env *env, int func_id)
 /* check validity of 32-bit and 64-bit arithmetic operations */
 static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 {
-	struct reg_state *regs = env->cur_state.regs;
+	struct reg_state *regs = env->cur_state.regs, *dst_reg;
 	u8 opcode = BPF_OP(insn->code);
 	int err;
 
@@ -1193,8 +1195,6 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 
 	} else {	/* all other ALU ops: and, sub, xor, add, ... */
 
-		bool stack_relative = false;
-
 		if (BPF_SRC(insn->code) == BPF_X) {
 			if (insn->imm != 0 || insn->off != 0) {
 				verbose("BPF_ALU uses reserved fields\n");
@@ -1232,11 +1232,19 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 			}
 		}
 
+		/* check dest operand */
+		err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+		if (err)
+			return err;
+
+		dst_reg = &regs[insn->dst_reg];
+
 		/* pattern match 'bpf_add Rx, imm' instruction */
 		if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
-		    regs[insn->dst_reg].type == FRAME_PTR &&
-		    BPF_SRC(insn->code) == BPF_K) {
-			stack_relative = true;
+		    dst_reg->type == FRAME_PTR && BPF_SRC(insn->code) == BPF_K) {
+			dst_reg->type = PTR_TO_STACK;
+			dst_reg->imm = insn->imm;
+			return 0;
 		} else if (is_pointer_value(env, insn->dst_reg)) {
 			verbose("R%d pointer arithmetic prohibited\n",
 				insn->dst_reg);
@@ -1248,15 +1256,8 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 			return -EACCES;
 		}
 
-		/* check dest operand */
-		err = check_reg_arg(regs, insn->dst_reg, DST_OP);
-		if (err)
-			return err;
-
-		if (stack_relative) {
-			regs[insn->dst_reg].type = PTR_TO_STACK;
-			regs[insn->dst_reg].imm = insn->imm;
-		}
+		/* mark dest operand */
+		mark_reg_unknown_value(regs, insn->dst_reg);
 	}
 
 	return 0;
@@ -1265,7 +1266,7 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 static int check_cond_jmp_op(struct verifier_env *env,
 			     struct bpf_insn *insn, int *insn_idx)
 {
-	struct reg_state *regs = env->cur_state.regs;
+	struct reg_state *regs = env->cur_state.regs, *dst_reg;
 	struct verifier_state *other_branch;
 	u8 opcode = BPF_OP(insn->code);
 	int err;
@@ -1303,11 +1304,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
 	if (err)
 		return err;
 
+	dst_reg = &regs[insn->dst_reg];
+
 	/* detect if R == 0 where R was initialized to zero earlier */
 	if (BPF_SRC(insn->code) == BPF_K &&
 	    (opcode == BPF_JEQ || opcode == BPF_JNE) &&
-	    regs[insn->dst_reg].type == CONST_IMM &&
-	    regs[insn->dst_reg].imm == insn->imm) {
+	    dst_reg->type == CONST_IMM && dst_reg->imm == insn->imm) {
 		if (opcode == BPF_JEQ) {
 			/* if (imm == imm) goto pc+off;
 			 * only follow the goto, ignore fall-through
@@ -1329,9 +1331,8 @@ static int check_cond_jmp_op(struct verifier_env *env,
 
 	/* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
 	if (BPF_SRC(insn->code) == BPF_K &&
-	    insn->imm == 0 && (opcode == BPF_JEQ ||
-			       opcode == BPF_JNE) &&
-	    regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
+	    insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+	    dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
 		if (opcode == BPF_JEQ) {
 			/* next fallthrough insn can access memory via
 			 * this register
@@ -1366,7 +1367,7 @@ static int check_cond_jmp_op(struct verifier_env *env,
 		}
 	}
 	if (log_level)
-		print_verifier_state(env);
+		print_verifier_state(&env->cur_state);
 	return 0;
 }
 
@@ -1444,14 +1445,14 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
 	int i, err;
 
 	if (!may_access_skb(env->prog->type)) {
-		verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n");
+		verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
 		return -EINVAL;
 	}
 
 	if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
 	    BPF_SIZE(insn->code) == BPF_DW ||
 	    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
-		verbose("BPF_LD_ABS uses reserved fields\n");
+		verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
 		return -EINVAL;
 	}
 
@@ -1712,17 +1713,21 @@ err_free:
  */
 static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
 {
+	struct reg_state *rold, *rcur;
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++) {
-		if (memcmp(&old->regs[i], &cur->regs[i],
-			   sizeof(old->regs[0])) != 0) {
-			if (old->regs[i].type == NOT_INIT ||
-			    (old->regs[i].type == UNKNOWN_VALUE &&
-			     cur->regs[i].type != NOT_INIT))
-				continue;
-			return false;
-		}
+		rold = &old->regs[i];
+		rcur = &cur->regs[i];
+
+		if (memcmp(rold, rcur, sizeof(*rold)) == 0)
+			continue;
+
+		if (rold->type == NOT_INIT ||
+		    (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT))
+			continue;
+
+		return false;
 	}
 
 	for (i = 0; i < MAX_BPF_STACK; i++) {
@@ -1844,7 +1849,7 @@ static int do_check(struct verifier_env *env)
 
 		if (log_level && do_print_state) {
 			verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
-			print_verifier_state(env);
+			print_verifier_state(&env->cur_state);
 			do_print_state = false;
 		}
 
@@ -2056,6 +2061,7 @@ process_bpf_exit:
 		insn_idx++;
 	}
 
+	verbose("processed %d insns\n", insn_processed);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 969bf05eb3cedd5a8d4b7c346a85c2ede87a6d6d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 5 May 2016 19:49:10 -0700
Subject: bpf: direct packet access

Extended BPF carried over two instructions from classic to access
packet data: LD_ABS and LD_IND. They're highly optimized in JITs,
but due to their design they have to do length check for every access.
When BPF is processing 20M packets per second single LD_ABS after JIT
is consuming 3% cpu. Hence the need to optimize it further by amortizing
the cost of 'off < skb_headlen' over multiple packet accesses.
One option is to introduce two new eBPF instructions LD_ABS_DW and LD_IND_DW
with similar usage as skb_header_pointer().
The kernel part for interpreter and x64 JIT was implemented in [1], but such
new insns behave like old ld_abs and abort the program with 'return 0' if
access is beyond linear data. Such hidden control flow is hard to workaround
plus changing JITs and rolling out new llvm is incovenient.

Therefore allow cls_bpf/act_bpf program access skb->data directly:
int bpf_prog(struct __sk_buff *skb)
{
  struct iphdr *ip;

  if (skb->data + sizeof(struct iphdr) + ETH_HLEN > skb->data_end)
      /* packet too small */
      return 0;

  ip = skb->data + ETH_HLEN;

  /* access IP header fields with direct loads */
  if (ip->version != 4 || ip->saddr == 0x7f000001)
      return 1;
  [...]
}

This solution avoids introduction of new instructions. llvm stays
the same and all JITs stay the same, but verifier has to work extra hard
to prove safety of the above program.

For XDP the direct store instructions can be allowed as well.

The skb->data is NET_IP_ALIGNED, so for common cases the verifier can check
the alignment. The complex packet parsers where packet pointer is adjusted
incrementally cannot be tracked for alignment, so allow byte access in such cases
and misaligned access on architectures that define efficient_unaligned_access

[1] https://git.kernel.org/cgit/linux/kernel/git/ast/bpf.git/?h=ld_abs_dw

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |   2 +
 kernel/bpf/core.c        |   5 +
 kernel/bpf/verifier.c    | 441 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 440 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b7b0fb1292e7..406459b935a2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -370,6 +370,8 @@ struct __sk_buff {
 	__u32 cb[5];
 	__u32 hash;
 	__u32 tc_classid;
+	__u32 data;
+	__u32 data_end;
 };
 
 struct bpf_tunnel_key {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index e4248fe79513..d781b077431f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -794,6 +794,11 @@ void __weak bpf_int_jit_compile(struct bpf_prog *prog)
 {
 }
 
+bool __weak bpf_helper_changes_skb_data(void *func)
+{
+	return false;
+}
+
 /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
  * skb_copy_bits(), so provide a weak definition of it for NET-less config.
  */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index afeb62808902..6338c61fc2a1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1,4 +1,5 @@
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -136,13 +137,32 @@ enum bpf_reg_type {
 	FRAME_PTR,		 /* reg == frame_pointer */
 	PTR_TO_STACK,		 /* reg == frame_pointer + imm */
 	CONST_IMM,		 /* constant integer value */
+
+	/* PTR_TO_PACKET represents:
+	 * skb->data
+	 * skb->data + imm
+	 * skb->data + (u16) var
+	 * skb->data + (u16) var + imm
+	 * if (range > 0) then [ptr, ptr + range - off) is safe to access
+	 * if (id > 0) means that some 'var' was added
+	 * if (off > 0) menas that 'imm' was added
+	 */
+	PTR_TO_PACKET,
+	PTR_TO_PACKET_END,	 /* skb->data + headlen */
 };
 
 struct reg_state {
 	enum bpf_reg_type type;
 	union {
-		/* valid when type == CONST_IMM | PTR_TO_STACK */
-		long imm;
+		/* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */
+		s64 imm;
+
+		/* valid when type == PTR_TO_PACKET* */
+		struct {
+			u32 id;
+			u16 off;
+			u16 range;
+		};
 
 		/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
 		 *   PTR_TO_MAP_VALUE_OR_NULL
@@ -247,6 +267,8 @@ static const char * const reg_type_str[] = {
 	[FRAME_PTR]		= "fp",
 	[PTR_TO_STACK]		= "fp",
 	[CONST_IMM]		= "imm",
+	[PTR_TO_PACKET]		= "pkt",
+	[PTR_TO_PACKET_END]	= "pkt_end",
 };
 
 static void print_verifier_state(struct verifier_state *state)
@@ -262,7 +284,12 @@ static void print_verifier_state(struct verifier_state *state)
 			continue;
 		verbose(" R%d=%s", i, reg_type_str[t]);
 		if (t == CONST_IMM || t == PTR_TO_STACK)
-			verbose("%ld", reg->imm);
+			verbose("%lld", reg->imm);
+		else if (t == PTR_TO_PACKET)
+			verbose("(id=%d,off=%d,r=%d)",
+				reg->id, reg->off, reg->range);
+		else if (t == UNKNOWN_VALUE && reg->imm)
+			verbose("%lld", reg->imm);
 		else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
 			 t == PTR_TO_MAP_VALUE_OR_NULL)
 			verbose("(ks=%d,vs=%d)",
@@ -548,6 +575,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_MAP_VALUE_OR_NULL:
 	case PTR_TO_STACK:
 	case PTR_TO_CTX:
+	case PTR_TO_PACKET:
+	case PTR_TO_PACKET_END:
 	case FRAME_PTR:
 	case CONST_PTR_TO_MAP:
 		return true;
@@ -647,6 +676,27 @@ static int check_map_access(struct verifier_env *env, u32 regno, int off,
 	return 0;
 }
 
+#define MAX_PACKET_OFF 0xffff
+
+static int check_packet_access(struct verifier_env *env, u32 regno, int off,
+			       int size)
+{
+	struct reg_state *regs = env->cur_state.regs;
+	struct reg_state *reg = &regs[regno];
+	int linear_size = (int) reg->range - (int) reg->off;
+
+	if (linear_size < 0 || linear_size >= MAX_PACKET_OFF) {
+		verbose("verifier bug\n");
+		return -EFAULT;
+	}
+	if (off < 0 || off + size > linear_size) {
+		verbose("invalid access to packet, off=%d size=%d, allowed=%d\n",
+			off, size, linear_size);
+		return -EACCES;
+	}
+	return 0;
+}
+
 /* check access to 'struct bpf_context' fields */
 static int check_ctx_access(struct verifier_env *env, int off, int size,
 			    enum bpf_access_type t)
@@ -677,6 +727,45 @@ static bool is_pointer_value(struct verifier_env *env, int regno)
 	}
 }
 
+static int check_ptr_alignment(struct verifier_env *env, struct reg_state *reg,
+			       int off, int size)
+{
+	if (reg->type != PTR_TO_PACKET) {
+		if (off % size != 0) {
+			verbose("misaligned access off %d size %d\n", off, size);
+			return -EACCES;
+		} else {
+			return 0;
+		}
+	}
+
+	switch (env->prog->type) {
+	case BPF_PROG_TYPE_SCHED_CLS:
+	case BPF_PROG_TYPE_SCHED_ACT:
+		break;
+	default:
+		verbose("verifier is misconfigured\n");
+		return -EACCES;
+	}
+
+	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+		/* misaligned access to packet is ok on x86,arm,arm64 */
+		return 0;
+
+	if (reg->id && size != 1) {
+		verbose("Unknown packet alignment. Only byte-sized access allowed\n");
+		return -EACCES;
+	}
+
+	/* skb->data is NET_IP_ALIGN-ed */
+	if ((NET_IP_ALIGN + reg->off + off) % size != 0) {
+		verbose("misaligned packet access off %d+%d+%d size %d\n",
+			NET_IP_ALIGN, reg->off, off, size);
+		return -EACCES;
+	}
+	return 0;
+}
+
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
  * if t==read, value_regno is a register which will receive the value from memory
@@ -698,10 +787,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 	if (size < 0)
 		return size;
 
-	if (off % size != 0) {
-		verbose("misaligned access off %d size %d\n", off, size);
-		return -EACCES;
-	}
+	err = check_ptr_alignment(env, reg, off, size);
+	if (err)
+		return err;
 
 	if (reg->type == PTR_TO_MAP_VALUE) {
 		if (t == BPF_WRITE && value_regno >= 0 &&
@@ -720,8 +808,16 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 			return -EACCES;
 		}
 		err = check_ctx_access(env, off, size, t);
-		if (!err && t == BPF_READ && value_regno >= 0)
+		if (!err && t == BPF_READ && value_regno >= 0) {
 			mark_reg_unknown_value(state->regs, value_regno);
+			if (off == offsetof(struct __sk_buff, data) &&
+			    env->allow_ptr_leaks)
+				/* note that reg.[id|off|range] == 0 */
+				state->regs[value_regno].type = PTR_TO_PACKET;
+			else if (off == offsetof(struct __sk_buff, data_end) &&
+				 env->allow_ptr_leaks)
+				state->regs[value_regno].type = PTR_TO_PACKET_END;
+		}
 
 	} else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
 		if (off >= 0 || off < -MAX_BPF_STACK) {
@@ -739,11 +835,28 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 		} else {
 			err = check_stack_read(state, off, size, value_regno);
 		}
+	} else if (state->regs[regno].type == PTR_TO_PACKET) {
+		if (t == BPF_WRITE) {
+			verbose("cannot write into packet\n");
+			return -EACCES;
+		}
+		err = check_packet_access(env, regno, off, size);
+		if (!err && t == BPF_READ && value_regno >= 0)
+			mark_reg_unknown_value(state->regs, value_regno);
 	} else {
 		verbose("R%d invalid mem access '%s'\n",
 			regno, reg_type_str[reg->type]);
 		return -EACCES;
 	}
+
+	if (!err && size <= 2 && value_regno >= 0 && env->allow_ptr_leaks &&
+	    state->regs[value_regno].type == UNKNOWN_VALUE) {
+		/* 1 or 2 byte load zero-extends, determine the number of
+		 * zero upper bits. Not doing it fo 4 byte load, since
+		 * such values cannot be added to ptr_to_packet anyway.
+		 */
+		state->regs[value_regno].imm = 64 - size * 8;
+	}
 	return err;
 }
 
@@ -1001,6 +1114,29 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
 	return count > 1 ? -EINVAL : 0;
 }
 
+static void clear_all_pkt_pointers(struct verifier_env *env)
+{
+	struct verifier_state *state = &env->cur_state;
+	struct reg_state *regs = state->regs, *reg;
+	int i;
+
+	for (i = 0; i < MAX_BPF_REG; i++)
+		if (regs[i].type == PTR_TO_PACKET ||
+		    regs[i].type == PTR_TO_PACKET_END)
+			mark_reg_unknown_value(regs, i);
+
+	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+		if (state->stack_slot_type[i] != STACK_SPILL)
+			continue;
+		reg = &state->spilled_regs[i / BPF_REG_SIZE];
+		if (reg->type != PTR_TO_PACKET &&
+		    reg->type != PTR_TO_PACKET_END)
+			continue;
+		reg->type = UNKNOWN_VALUE;
+		reg->imm = 0;
+	}
+}
+
 static int check_call(struct verifier_env *env, int func_id)
 {
 	struct verifier_state *state = &env->cur_state;
@@ -1008,6 +1144,7 @@ static int check_call(struct verifier_env *env, int func_id)
 	struct reg_state *regs = state->regs;
 	struct reg_state *reg;
 	struct bpf_call_arg_meta meta;
+	bool changes_data;
 	int i, err;
 
 	/* find function prototype */
@@ -1030,6 +1167,8 @@ static int check_call(struct verifier_env *env, int func_id)
 		return -EINVAL;
 	}
 
+	changes_data = bpf_helper_changes_skb_data(fn->func);
+
 	memset(&meta, 0, sizeof(meta));
 
 	/* We only support one arg being in raw mode at the moment, which
@@ -1100,6 +1239,189 @@ static int check_call(struct verifier_env *env, int func_id)
 	if (err)
 		return err;
 
+	if (changes_data)
+		clear_all_pkt_pointers(env);
+	return 0;
+}
+
+static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn)
+{
+	struct reg_state *regs = env->cur_state.regs;
+	struct reg_state *dst_reg = &regs[insn->dst_reg];
+	struct reg_state *src_reg = &regs[insn->src_reg];
+	s32 imm;
+
+	if (BPF_SRC(insn->code) == BPF_K) {
+		/* pkt_ptr += imm */
+		imm = insn->imm;
+
+add_imm:
+		if (imm <= 0) {
+			verbose("addition of negative constant to packet pointer is not allowed\n");
+			return -EACCES;
+		}
+		if (imm >= MAX_PACKET_OFF ||
+		    imm + dst_reg->off >= MAX_PACKET_OFF) {
+			verbose("constant %d is too large to add to packet pointer\n",
+				imm);
+			return -EACCES;
+		}
+		/* a constant was added to pkt_ptr.
+		 * Remember it while keeping the same 'id'
+		 */
+		dst_reg->off += imm;
+	} else {
+		if (src_reg->type == CONST_IMM) {
+			/* pkt_ptr += reg where reg is known constant */
+			imm = src_reg->imm;
+			goto add_imm;
+		}
+		/* disallow pkt_ptr += reg
+		 * if reg is not uknown_value with guaranteed zero upper bits
+		 * otherwise pkt_ptr may overflow and addition will become
+		 * subtraction which is not allowed
+		 */
+		if (src_reg->type != UNKNOWN_VALUE) {
+			verbose("cannot add '%s' to ptr_to_packet\n",
+				reg_type_str[src_reg->type]);
+			return -EACCES;
+		}
+		if (src_reg->imm < 48) {
+			verbose("cannot add integer value with %lld upper zero bits to ptr_to_packet\n",
+				src_reg->imm);
+			return -EACCES;
+		}
+		/* dst_reg stays as pkt_ptr type and since some positive
+		 * integer value was added to the pointer, increment its 'id'
+		 */
+		dst_reg->id++;
+
+		/* something was added to pkt_ptr, set range and off to zero */
+		dst_reg->off = 0;
+		dst_reg->range = 0;
+	}
+	return 0;
+}
+
+static int evaluate_reg_alu(struct verifier_env *env, struct bpf_insn *insn)
+{
+	struct reg_state *regs = env->cur_state.regs;
+	struct reg_state *dst_reg = &regs[insn->dst_reg];
+	u8 opcode = BPF_OP(insn->code);
+	s64 imm_log2;
+
+	/* for type == UNKNOWN_VALUE:
+	 * imm > 0 -> number of zero upper bits
+	 * imm == 0 -> don't track which is the same as all bits can be non-zero
+	 */
+
+	if (BPF_SRC(insn->code) == BPF_X) {
+		struct reg_state *src_reg = &regs[insn->src_reg];
+
+		if (src_reg->type == UNKNOWN_VALUE && src_reg->imm > 0 &&
+		    dst_reg->imm && opcode == BPF_ADD) {
+			/* dreg += sreg
+			 * where both have zero upper bits. Adding them
+			 * can only result making one more bit non-zero
+			 * in the larger value.
+			 * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
+			 *     0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
+			 */
+			dst_reg->imm = min(dst_reg->imm, src_reg->imm);
+			dst_reg->imm--;
+			return 0;
+		}
+		if (src_reg->type == CONST_IMM && src_reg->imm > 0 &&
+		    dst_reg->imm && opcode == BPF_ADD) {
+			/* dreg += sreg
+			 * where dreg has zero upper bits and sreg is const.
+			 * Adding them can only result making one more bit
+			 * non-zero in the larger value.
+			 */
+			imm_log2 = __ilog2_u64((long long)src_reg->imm);
+			dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
+			dst_reg->imm--;
+			return 0;
+		}
+		/* all other cases non supported yet, just mark dst_reg */
+		dst_reg->imm = 0;
+		return 0;
+	}
+
+	/* sign extend 32-bit imm into 64-bit to make sure that
+	 * negative values occupy bit 63. Note ilog2() would have
+	 * been incorrect, since sizeof(insn->imm) == 4
+	 */
+	imm_log2 = __ilog2_u64((long long)insn->imm);
+
+	if (dst_reg->imm && opcode == BPF_LSH) {
+		/* reg <<= imm
+		 * if reg was a result of 2 byte load, then its imm == 48
+		 * which means that upper 48 bits are zero and shifting this reg
+		 * left by 4 would mean that upper 44 bits are still zero
+		 */
+		dst_reg->imm -= insn->imm;
+	} else if (dst_reg->imm && opcode == BPF_MUL) {
+		/* reg *= imm
+		 * if multiplying by 14 subtract 4
+		 * This is conservative calculation of upper zero bits.
+		 * It's not trying to special case insn->imm == 1 or 0 cases
+		 */
+		dst_reg->imm -= imm_log2 + 1;
+	} else if (opcode == BPF_AND) {
+		/* reg &= imm */
+		dst_reg->imm = 63 - imm_log2;
+	} else if (dst_reg->imm && opcode == BPF_ADD) {
+		/* reg += imm */
+		dst_reg->imm = min(dst_reg->imm, 63 - imm_log2);
+		dst_reg->imm--;
+	} else if (opcode == BPF_RSH) {
+		/* reg >>= imm
+		 * which means that after right shift, upper bits will be zero
+		 * note that verifier already checked that
+		 * 0 <= imm < 64 for shift insn
+		 */
+		dst_reg->imm += insn->imm;
+		if (unlikely(dst_reg->imm > 64))
+			/* some dumb code did:
+			 * r2 = *(u32 *)mem;
+			 * r2 >>= 32;
+			 * and all bits are zero now */
+			dst_reg->imm = 64;
+	} else {
+		/* all other alu ops, means that we don't know what will
+		 * happen to the value, mark it with unknown number of zero bits
+		 */
+		dst_reg->imm = 0;
+	}
+
+	if (dst_reg->imm < 0) {
+		/* all 64 bits of the register can contain non-zero bits
+		 * and such value cannot be added to ptr_to_packet, since it
+		 * may overflow, mark it as unknown to avoid further eval
+		 */
+		dst_reg->imm = 0;
+	}
+	return 0;
+}
+
+static int evaluate_reg_imm_alu(struct verifier_env *env, struct bpf_insn *insn)
+{
+	struct reg_state *regs = env->cur_state.regs;
+	struct reg_state *dst_reg = &regs[insn->dst_reg];
+	struct reg_state *src_reg = &regs[insn->src_reg];
+	u8 opcode = BPF_OP(insn->code);
+
+	/* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn.
+	 * Don't care about overflow or negative values, just add them
+	 */
+	if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K)
+		dst_reg->imm += insn->imm;
+	else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
+		 src_reg->type == CONST_IMM)
+		dst_reg->imm += src_reg->imm;
+	else
+		mark_reg_unknown_value(regs, insn->dst_reg);
 	return 0;
 }
 
@@ -1245,6 +1567,21 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 			dst_reg->type = PTR_TO_STACK;
 			dst_reg->imm = insn->imm;
 			return 0;
+		} else if (opcode == BPF_ADD &&
+			   BPF_CLASS(insn->code) == BPF_ALU64 &&
+			   dst_reg->type == PTR_TO_PACKET) {
+			/* ptr_to_packet += K|X */
+			return check_packet_ptr_add(env, insn);
+		} else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
+			   dst_reg->type == UNKNOWN_VALUE &&
+			   env->allow_ptr_leaks) {
+			/* unknown += K|X */
+			return evaluate_reg_alu(env, insn);
+		} else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
+			   dst_reg->type == CONST_IMM &&
+			   env->allow_ptr_leaks) {
+			/* reg_imm += K|X */
+			return evaluate_reg_imm_alu(env, insn);
 		} else if (is_pointer_value(env, insn->dst_reg)) {
 			verbose("R%d pointer arithmetic prohibited\n",
 				insn->dst_reg);
@@ -1263,6 +1600,34 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+static void find_good_pkt_pointers(struct verifier_env *env,
+				   struct reg_state *dst_reg)
+{
+	struct verifier_state *state = &env->cur_state;
+	struct reg_state *regs = state->regs, *reg;
+	int i;
+	/* r2 = r3;
+	 * r2 += 8
+	 * if (r2 > pkt_end) goto somewhere
+	 * r2 == dst_reg, pkt_end == src_reg,
+	 * r2=pkt(id=n,off=8,r=0)
+	 * r3=pkt(id=n,off=0,r=0)
+	 * find register r3 and mark its range as r3=pkt(id=n,off=0,r=8)
+	 * so that range of bytes [r3, r3 + 8) is safe to access
+	 */
+	for (i = 0; i < MAX_BPF_REG; i++)
+		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+			regs[i].range = dst_reg->off;
+
+	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+		if (state->stack_slot_type[i] != STACK_SPILL)
+			continue;
+		reg = &state->spilled_regs[i / BPF_REG_SIZE];
+		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
+			reg->range = dst_reg->off;
+	}
+}
+
 static int check_cond_jmp_op(struct verifier_env *env,
 			     struct bpf_insn *insn, int *insn_idx)
 {
@@ -1346,6 +1711,10 @@ static int check_cond_jmp_op(struct verifier_env *env,
 			regs[insn->dst_reg].type = CONST_IMM;
 			regs[insn->dst_reg].imm = 0;
 		}
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+		   dst_reg->type == PTR_TO_PACKET &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
+		find_good_pkt_pointers(env, dst_reg);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
@@ -1685,6 +2054,58 @@ err_free:
 	return ret;
 }
 
+/* the following conditions reduce the number of explored insns
+ * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
+ */
+static bool compare_ptrs_to_packet(struct reg_state *old, struct reg_state *cur)
+{
+	if (old->id != cur->id)
+		return false;
+
+	/* old ptr_to_packet is more conservative, since it allows smaller
+	 * range. Ex:
+	 * old(off=0,r=10) is equal to cur(off=0,r=20), because
+	 * old(off=0,r=10) means that with range=10 the verifier proceeded
+	 * further and found no issues with the program. Now we're in the same
+	 * spot with cur(off=0,r=20), so we're safe too, since anything further
+	 * will only be looking at most 10 bytes after this pointer.
+	 */
+	if (old->off == cur->off && old->range < cur->range)
+		return true;
+
+	/* old(off=20,r=10) is equal to cur(off=22,re=22 or 5 or 0)
+	 * since both cannot be used for packet access and safe(old)
+	 * pointer has smaller off that could be used for further
+	 * 'if (ptr > data_end)' check
+	 * Ex:
+	 * old(off=20,r=10) and cur(off=22,r=22) and cur(off=22,r=0) mean
+	 * that we cannot access the packet.
+	 * The safe range is:
+	 * [ptr, ptr + range - off)
+	 * so whenever off >=range, it means no safe bytes from this pointer.
+	 * When comparing old->off <= cur->off, it means that older code
+	 * went with smaller offset and that offset was later
+	 * used to figure out the safe range after 'if (ptr > data_end)' check
+	 * Say, 'old' state was explored like:
+	 * ... R3(off=0, r=0)
+	 * R4 = R3 + 20
+	 * ... now R4(off=20,r=0)  <-- here
+	 * if (R4 > data_end)
+	 * ... R4(off=20,r=20), R3(off=0,r=20) and R3 can be used to access.
+	 * ... the code further went all the way to bpf_exit.
+	 * Now the 'cur' state at the mark 'here' has R4(off=30,r=0).
+	 * old_R4(off=20,r=0) equal to cur_R4(off=30,r=0), since if the verifier
+	 * goes further, such cur_R4 will give larger safe packet range after
+	 * 'if (R4 > data_end)' and all further insn were already good with r=20,
+	 * so they will be good with r=30 and we can prune the search.
+	 */
+	if (old->off <= cur->off &&
+	    old->off >= old->range && cur->off >= cur->range)
+		return true;
+
+	return false;
+}
+
 /* compare two verifier states
  *
  * all states stored in state_list are known to be valid, since
@@ -1727,6 +2148,10 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
 		    (rold->type == UNKNOWN_VALUE && rcur->type != NOT_INIT))
 			continue;
 
+		if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
+		    compare_ptrs_to_packet(rold, rcur))
+			continue;
+
 		return false;
 	}
 
-- 
cgit v1.2.3


From 735b433397ea2f97d59240cbe4ea770aa7c88eef Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 5 May 2016 19:49:11 -0700
Subject: bpf: improve verifier state equivalence

since UNKNOWN_VALUE type is weaker than CONST_IMM we can un-teach
verifier its recognition of constants in conditional branches
without affecting safety.
Ex:
if (reg == 123) {
  .. here verifier was marking reg->type as CONST_IMM
     instead keep reg as UNKNOWN_VALUE
}

Two verifier states with UNKNOWN_VALUE are equivalent, whereas
CONST_IMM_X != CONST_IMM_Y, since CONST_IMM is used for stack range
verification and other cases.
So help search pruning by marking registers as UNKNOWN_VALUE
where possible instead of CONST_IMM.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 23 +++--------------------
 1 file changed, 3 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6338c61fc2a1..84bff68cf80e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1704,12 +1704,11 @@ static int check_cond_jmp_op(struct verifier_env *env,
 			 */
 			regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
 			/* branch targer cannot access it, since reg == 0 */
-			other_branch->regs[insn->dst_reg].type = CONST_IMM;
-			other_branch->regs[insn->dst_reg].imm = 0;
+			mark_reg_unknown_value(other_branch->regs,
+					       insn->dst_reg);
 		} else {
 			other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
-			regs[insn->dst_reg].type = CONST_IMM;
-			regs[insn->dst_reg].imm = 0;
+			mark_reg_unknown_value(regs, insn->dst_reg);
 		}
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET &&
@@ -1718,22 +1717,6 @@ static int check_cond_jmp_op(struct verifier_env *env,
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
-	} else if (BPF_SRC(insn->code) == BPF_K &&
-		   (opcode == BPF_JEQ || opcode == BPF_JNE)) {
-
-		if (opcode == BPF_JEQ) {
-			/* detect if (R == imm) goto
-			 * and in the target state recognize that R = imm
-			 */
-			other_branch->regs[insn->dst_reg].type = CONST_IMM;
-			other_branch->regs[insn->dst_reg].imm = insn->imm;
-		} else {
-			/* detect if (R != imm) goto
-			 * and in the fall-through state recognize that R = imm
-			 */
-			regs[insn->dst_reg].type = CONST_IMM;
-			regs[insn->dst_reg].imm = insn->imm;
-		}
 	}
 	if (log_level)
 		print_verifier_state(&env->cur_state);
-- 
cgit v1.2.3


From 536bd00cdbb7b908573e5a93bae67b64cbae60d8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 6 May 2016 14:58:43 +0200
Subject: sched/fair: Fix !CONFIG_SMP kernel cpufreq governor breakage

The following commit:

  34e2c555f3e1 ("cpufreq: Add mechanism for registering utilization update callbacks")

overlooked the fact that update_load_avg(), where CFS invokes cpufreq
utilization update callbacks, becomes an empty stub on UP kernels.

In consequence, if !CONFIG_SMP, cpufreq governors are never invoked
from CFS and they do not have a chance to evaluate CPU performace
levels and update them often enough.

Needless to say, things don't work as expected then.

Fix the problem by making the !CONFIG_SMP stub of update_load_avg()
invoke cpufreq update callbacks too.

Reported-by: Steve Muckle <steve.muckle@linaro.org>
Tested-by: Steve Muckle <steve.muckle@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Steve Muckle <steve.muckle@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Linux PM list <linux-pm@vger.kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Fixes: 34e2c555f3e1 (cpufreq: Add mechanism for registering utilization update callbacks)
Link: http://lkml.kernel.org/r/6282396.VVEdgVYxO3@vostro.rjw.lan
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0fe30e66aff1..40748dc8ea3e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3030,7 +3030,14 @@ static int idle_balance(struct rq *this_rq);
 
 #else /* CONFIG_SMP */
 
-static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline void update_load_avg(struct sched_entity *se, int not_used)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	struct rq *rq = rq_of(cfs_rq);
+
+	cpufreq_trigger_update(rq_clock(rq));
+}
+
 static inline void
 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void
-- 
cgit v1.2.3


From 8c5e95548d41a48b1eb2be741107a259251ebd86 Mon Sep 17 00:00:00 2001
From: Muhammad Falak R Wani <falakreyaz@gmail.com>
Date: Thu, 5 May 2016 15:21:19 +0530
Subject: sched/core: Remove unused variable

Remove unused variable 'ret', and directly return 0.

Signed-off-by: Muhammad Falak R Wani <falakreyaz@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1462441879-10092-1-git-send-email-falakreyaz@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 71c5a753e6e9..a6d3e7a9c0a9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8019,7 +8019,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 static int sched_rt_global_constraints(void)
 {
 	unsigned long flags;
-	int i, ret = 0;
+	int i;
 
 	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
@@ -8031,7 +8031,7 @@ static int sched_rt_global_constraints(void)
 	}
 	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
 
-	return ret;
+	return 0;
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-- 
cgit v1.2.3


From 58fe9c4621b7219e724c0b7af053112f974a08c3 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Sun, 8 May 2016 20:58:10 -0700
Subject: sched/core: Fix comment typo in wake_q_add()

... the comment clearly refers to wake_up_q(), and not
wake_up_list().

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: dave@stgolabs.net
Link: http://lkml.kernel.org/r/1462766290-28664-1-git-send-email-dave@stgolabs.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a6d3e7a9c0a9..e09f92c3a096 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -465,7 +465,7 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 	 * wakeup due to that.
 	 *
 	 * This cmpxchg() implies a full barrier, which pairs with the write
-	 * barrier implied by the wakeup in wake_up_list().
+	 * barrier implied by the wakeup in wake_up_q().
 	 */
 	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
 		return;
-- 
cgit v1.2.3


From 4f41fc59620fcedaa97cbdf3d7d2956d80fcd922 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serge.hallyn@ubuntu.com>
Date: Mon, 9 May 2016 09:59:55 -0500
Subject: cgroup, kernfs: make mountinfo show properly scoped path for cgroup
 namespaces

Patch summary:

When showing a cgroupfs entry in mountinfo, show the path of the mount
root dentry relative to the reader's cgroup namespace root.

Short explanation (courtesy of mkerrisk):

If we create a new cgroup namespace, then we want both /proc/self/cgroup
and /proc/self/mountinfo to show cgroup paths that are correctly
virtualized with respect to the cgroup mount point.  Previous to this
patch, /proc/self/cgroup shows the right info, but /proc/self/mountinfo
does not.

Long version:

When a uid 0 task which is in freezer cgroup /a/b, unshares a new cgroup
namespace, and then mounts a new instance of the freezer cgroup, the new
mount will be rooted at /a/b.  The root dentry field of the mountinfo
entry will show '/a/b'.

 cat > /tmp/do1 << EOF
 mount -t cgroup -o freezer freezer /mnt
 grep freezer /proc/self/mountinfo
 EOF

 unshare -Gm  bash /tmp/do1
 > 330 160 0:34 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer
 > 355 133 0:34 /a/b /mnt rw,relatime - cgroup freezer rw,freezer

The task's freezer cgroup entry in /proc/self/cgroup will simply show
'/':

 grep freezer /proc/self/cgroup
 9:freezer:/

If instead the same task simply bind mounts the /a/b cgroup directory,
the resulting mountinfo entry will again show /a/b for the dentry root.
However in this case the task will find its own cgroup at /mnt/a/b,
not at /mnt:

 mount --bind /sys/fs/cgroup/freezer/a/b /mnt
 130 25 0:34 /a/b /mnt rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,freezer

In other words, there is no way for the task to know, based on what is
in mountinfo, which cgroup directory is its own.

Example (by mkerrisk):

First, a little script to save some typing and verbiage:

echo -e "\t/proc/self/cgroup:\t$(cat /proc/self/cgroup | grep freezer)"
cat /proc/self/mountinfo | grep freezer |
        awk '{print "\tmountinfo:\t\t" $4 "\t" $5}'

Create cgroup, place this shell into the cgroup, and look at the state
of the /proc files:

2653
2653                         # Our shell
14254                        # cat(1)
        /proc/self/cgroup:      10:freezer:/a/b
        mountinfo:              /       /sys/fs/cgroup/freezer

Create a shell in new cgroup and mount namespaces. The act of creating
a new cgroup namespace causes the process's current cgroups directories
to become its cgroup root directories. (Here, I'm using my own version
of the "unshare" utility, which takes the same options as the util-linux
version):

Look at the state of the /proc files:

        /proc/self/cgroup:      10:freezer:/
        mountinfo:              /       /sys/fs/cgroup/freezer

The third entry in /proc/self/cgroup (the pathname of the cgroup inside
the hierarchy) is correctly virtualized w.r.t. the cgroup namespace, which
is rooted at /a/b in the outer namespace.

However, the info in /proc/self/mountinfo is not for this cgroup
namespace, since we are seeing a duplicate of the mount from the
old mount namespace, and the info there does not correspond to the
new cgroup namespace. However, trying to create a new mount still
doesn't show us the right information in mountinfo:

                                      # propagating to other mountns
        /proc/self/cgroup:      7:freezer:/
        mountinfo:              /a/b    /mnt/freezer

The act of creating a new cgroup namespace caused the process's
current freezer directory, "/a/b", to become its cgroup freezer root
directory. In other words, the pathname directory of the directory
within the newly mounted cgroup filesystem should be "/",
but mountinfo wrongly shows us "/a/b". The consequence of this is
that the process in the cgroup namespace cannot correctly construct
the pathname of its cgroup root directory from the information in
/proc/PID/mountinfo.

With this patch, the dentry root field in mountinfo is shown relative
to the reader's cgroup namespace.  So the same steps as above:

        /proc/self/cgroup:      10:freezer:/a/b
        mountinfo:              /       /sys/fs/cgroup/freezer
        /proc/self/cgroup:      10:freezer:/
        mountinfo:              /../..  /sys/fs/cgroup/freezer
        /proc/self/cgroup:      10:freezer:/
        mountinfo:              /       /mnt/freezer

cgroup.clone_children  freezer.parent_freezing  freezer.state      tasks
cgroup.procs           freezer.self_freezing    notify_on_release
3164
2653                   # First shell that placed in this cgroup
3164                   # Shell started by 'unshare'
14197                  # cat(1)

Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 fs/kernfs/mount.c      | 14 +++++++++++
 include/linux/kernfs.h |  2 ++
 kernel/cgroup.c        | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 79 insertions(+)

(limited to 'kernel')

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f73541fbe7af..3b78724c1979 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/namei.h>
+#include <linux/seq_file.h>
 
 #include "kernfs-internal.h"
 
@@ -40,6 +41,18 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
 	return 0;
 }
 
+static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry)
+{
+	struct kernfs_node *node = dentry->d_fsdata;
+	struct kernfs_root *root = kernfs_root(node);
+	struct kernfs_syscall_ops *scops = root->syscall_ops;
+
+	if (scops && scops->show_path)
+		return scops->show_path(sf, node, root);
+
+	return seq_dentry(sf, dentry, " \t\n\\");
+}
+
 const struct super_operations kernfs_sops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
@@ -47,6 +60,7 @@ const struct super_operations kernfs_sops = {
 
 	.remount_fs	= kernfs_sop_remount_fs,
 	.show_options	= kernfs_sop_show_options,
+	.show_path	= kernfs_sop_show_path,
 };
 
 /**
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index c06c44242f39..30f089ebe0a4 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -152,6 +152,8 @@ struct kernfs_syscall_ops {
 	int (*rmdir)(struct kernfs_node *kn);
 	int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent,
 		      const char *new_name);
+	int (*show_path)(struct seq_file *sf, struct kernfs_node *kn,
+			 struct kernfs_root *root);
 };
 
 struct kernfs_root {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 909a7d31ffd3..afea39eb7649 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1215,6 +1215,41 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 	cgroup_free_root(root);
 }
 
+/*
+ * look up cgroup associated with current task's cgroup namespace on the
+ * specified hierarchy
+ */
+static struct cgroup *
+current_cgns_cgroup_from_root(struct cgroup_root *root)
+{
+	struct cgroup *res = NULL;
+	struct css_set *cset;
+
+	lockdep_assert_held(&css_set_lock);
+
+	rcu_read_lock();
+
+	cset = current->nsproxy->cgroup_ns->root_cset;
+	if (cset == &init_css_set) {
+		res = &root->cgrp;
+	} else {
+		struct cgrp_cset_link *link;
+
+		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+			struct cgroup *c = link->cgrp;
+
+			if (c->root == root) {
+				res = c;
+				break;
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	BUG_ON(!res);
+	return res;
+}
+
 /* look up cgroup associated with given css_set on the specified hierarchy */
 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 					    struct cgroup_root *root)
@@ -1593,6 +1628,33 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 	return 0;
 }
 
+static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
+			    struct kernfs_root *kf_root)
+{
+	int len = 0, ret = 0;
+	char *buf = NULL;
+	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
+	struct cgroup *ns_cgroup;
+
+	buf = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	spin_lock_bh(&css_set_lock);
+	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
+	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
+	spin_unlock_bh(&css_set_lock);
+
+	if (len >= PATH_MAX)
+		len = -ERANGE;
+	else if (len > 0) {
+		seq_escape(sf, buf, " \t\n\\");
+		len = 0;
+	}
+	kfree(buf);
+	return len;
+}
+
 static int cgroup_show_options(struct seq_file *seq,
 			       struct kernfs_root *kf_root)
 {
@@ -5433,6 +5495,7 @@ static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
 	.mkdir			= cgroup_mkdir,
 	.rmdir			= cgroup_rmdir,
 	.rename			= cgroup_rename,
+	.show_path		= cgroup_show_path,
 };
 
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
-- 
cgit v1.2.3


From 0161028b7c8aebef64194d3d73e43bc3b53b5c66 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 9 May 2016 15:48:51 -0700
Subject: perf/core: Change the default paranoia level to 2

Allowing unprivileged kernel profiling lets any user dump follow kernel
control flow and dump kernel registers.  This most likely allows trivial
kASLR bypassing, and it may allow other mischief as well.  (Off the top
of my head, the PERF_SAMPLE_REGS_INTR output during /dev/urandom reads
could be quite interesting.)

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/kernel.txt | 2 +-
 kernel/events/core.c            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 57653a44b128..fcddfd5ded99 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -645,7 +645,7 @@ allowed to execute.
 perf_event_paranoid:
 
 Controls use of the performance events system by unprivileged
-users (without CAP_SYS_ADMIN).  The default value is 1.
+users (without CAP_SYS_ADMIN).  The default value is 2.
 
  -1: Allow use of (almost) all events by all users
 >=0: Disallow raw tracepoint access by users without CAP_IOC_LOCK
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4e2ebf6f2f1f..c0ded2416615 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -351,7 +351,7 @@ static struct srcu_struct pmus_srcu;
  *   1 - disallow cpu events for unpriv
  *   2 - disallow kernel profiling for unpriv
  */
-int sysctl_perf_event_paranoid __read_mostly = 1;
+int sysctl_perf_event_paranoid __read_mostly = 2;
 
 /* Minimum for 512 kiB + 1 user control page */
 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
-- 
cgit v1.2.3


From 13b5ab02ae118fc8dfdc2b8597688ec4a11d5b53 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Mon, 9 May 2016 12:11:31 +0800
Subject: sched/rt, sched/dl: Don't push if task's scheduling class was changed

We got this warning:

    WARNING: CPU: 1 PID: 2468 at kernel/sched/core.c:1161 set_task_cpu+0x1af/0x1c0
    [...]
    Call Trace:

    dump_stack+0x63/0x87
    __warn+0xd1/0xf0
    warn_slowpath_null+0x1d/0x20
    set_task_cpu+0x1af/0x1c0
    push_dl_task.part.34+0xea/0x180
    push_dl_tasks+0x17/0x30
    __balance_callback+0x45/0x5c
    __sched_setscheduler+0x906/0xb90
    SyS_sched_setattr+0x150/0x190
    do_syscall_64+0x62/0x110
    entry_SYSCALL64_slow_path+0x25/0x25

This corresponds to:

    WARN_ON_ONCE(p->state == TASK_RUNNING &&
             p->sched_class == &fair_sched_class &&
             (p->on_rq && !task_on_rq_migrating(p)))

It happens because in find_lock_later_rq(), the task whose scheduling
class was changed to fair class is still pushed away as if it were
a deadline task ...

So, check in find_lock_later_rq() after double_lock_balance(), if the
scheduling class of the deadline task was changed, break and retry.

Apply the same logic to RT tasks.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Link: http://lkml.kernel.org/r/1462767091-1215-1-git-send-email-xlpang@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 1 +
 kernel/sched/rt.c       | 1 +
 2 files changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index affd97ec9f65..686ec8adf952 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1394,6 +1394,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
 				     !cpumask_test_cpu(later_rq->cpu,
 				                       &task->cpus_allowed) ||
 				     task_running(rq, task) ||
+				     !dl_task(task) ||
 				     !task_on_rq_queued(task))) {
 				double_unlock_balance(rq, later_rq);
 				later_rq = NULL;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c41ea7ac1764..ec4f538d4396 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1729,6 +1729,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 				     !cpumask_test_cpu(lowest_rq->cpu,
 						       tsk_cpus_allowed(task)) ||
 				     task_running(rq, task) ||
+				     !rt_task(task) ||
 				     !task_on_rq_queued(task))) {
 
 				double_unlock_balance(rq, lowest_rq);
-- 
cgit v1.2.3


From b7d7641e2a8640a2f6ac7cdadd6c56eb12728b70 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 9 May 2016 17:22:13 -0700
Subject: blktrace: delete garbage for message trace

commit f4a1d08ce65 introduces a regression. Originally for
BLK_TN_MESSAGE, we add message in trace and return. The commit ignores
the early return and add garbage info.

Signed-off-by: Shaohua Li <shli@fb.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/trace/blktrace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index f94e7a21f52d..8ae9ea2cacaf 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1349,6 +1349,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
 	if (t->action == BLK_TN_MESSAGE) {
 		log_action(iter, long_act ? "message" : "m");
 		blk_log_msg(s, iter->ent);
+		return trace_handle_return(s);
 	}
 
 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
-- 
cgit v1.2.3


From 8d1547e08dda8848f01185b3fd6bb946b68de99f Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Mon, 9 May 2016 17:22:14 -0700
Subject: blktrace: add missed mask name

BLK_TC_NOTIFY is missed in mask_maps, so we can't print out notify or
set mask with 'notify' name.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/trace/blktrace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8ae9ea2cacaf..9aef8654e90d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1552,6 +1552,7 @@ static const struct {
 	{ BLK_TC_COMPLETE,	"complete"	},
 	{ BLK_TC_FS,		"fs"		},
 	{ BLK_TC_PC,		"pc"		},
+	{ BLK_TC_NOTIFY,	"notify"	},
 	{ BLK_TC_AHEAD,		"ahead"		},
 	{ BLK_TC_META,		"meta"		},
 	{ BLK_TC_DISCARD,	"discard"	},
-- 
cgit v1.2.3


From cc622420798c4bcf093785d872525087a7798db9 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 25 Apr 2016 17:35:29 +0200
Subject: gcov: disable for COMPILE_TEST

Enabling gcov is counterproductive to compile testing: it significantly
increases the kernel image size, compile time, and it produces lots
of false positive "may be used uninitialized" warnings as the result
of missed optimizations.

This is in line with how UBSAN_SANITIZE_ALL and PROFILE_ALL_BRANCHES
work, both of which have similar problems.

With an ARM allmodconfig kernel, I see the build time drop from
283 minutes CPU time to 225 minutes, and the vmlinux size drops
from 43MB to 26MB.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
Signed-off-by: Michal Marek <mmarek@suse.com>
---
 kernel/gcov/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index c92e44855ddd..1276aabaab55 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -37,6 +37,7 @@ config ARCH_HAS_GCOV_PROFILE_ALL
 
 config GCOV_PROFILE_ALL
 	bool "Profile entire Kernel"
+	depends on !COMPILE_TEST
 	depends on GCOV_KERNEL
 	depends on ARCH_HAS_GCOV_PROFILE_ALL
 	default n
-- 
cgit v1.2.3


From 53d3bc773eaa7ab1cf63585e76af7ee869d5e709 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Wed, 11 May 2016 08:25:53 +0200
Subject: Revert "sched/fair: Fix fairness issue on migration"

Mike reported that this recent commit:

  3a47d5124a95 ("sched/fair: Fix fairness issue on migration")

... broke interactivity and the signal starvation test.

We have a proper fix series in the works but ran out of time for
v4.6, so revert the commit.

Reported-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40748dc8ea3e..e7dd0ec169be 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3188,25 +3188,17 @@ static inline void check_schedstat_required(void)
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING);
-	bool curr = cfs_rq->curr == se;
-
 	/*
-	 * If we're the current task, we must renormalise before calling
-	 * update_curr().
+	 * Update the normalized vruntime before updating min_vruntime
+	 * through calling update_curr().
 	 */
-	if (renorm && curr)
+	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
 		se->vruntime += cfs_rq->min_vruntime;
 
-	update_curr(cfs_rq);
-
 	/*
-	 * Otherwise, renormalise after, such that we're placed at the current
-	 * moment in time, instead of some random moment in the past.
+	 * Update run-time statistics of the 'current'.
 	 */
-	if (renorm && !curr)
-		se->vruntime += cfs_rq->min_vruntime;
-
+	update_curr(cfs_rq);
 	enqueue_entity_load_avg(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
@@ -3222,7 +3214,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		update_stats_enqueue(cfs_rq, se);
 		check_spread(cfs_rq, se);
 	}
-	if (!curr)
+	if (se != cfs_rq->curr)
 		__enqueue_entity(cfs_rq, se);
 	se->on_rq = 1;
 
-- 
cgit v1.2.3


From 9b5d585d148a19bcadae81fa17ddbe3e22efb9e9 Mon Sep 17 00:00:00 2001
From: Jon Hunter <jonathanh@nvidia.com>
Date: Tue, 10 May 2016 16:14:35 +0100
Subject: genirq: Ensure IRQ descriptor is valid when setting-up the IRQ

In the function, setup_irq(), we don't check that the descriptor
returned from irq_to_desc() is valid before we start using it. For
example chip_bus_lock() called from setup_irq(), assumes that the
descriptor pointer is valid and doesn't check before dereferencing it.

In many other functions including setup/free_percpu_irq() we do check
that the descriptor returned is not NULL and therefore add the same test
to setup_irq() to ensure the descriptor returned is valid.

Signed-off-by: Jon Hunter <jonathanh@nvidia.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 kernel/irq/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index cc1cc641d653..ef0bc02c3a70 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1407,7 +1407,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
 	int retval;
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+	if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
 		return -EINVAL;
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, act);
-- 
cgit v1.2.3


From c58d25f371f5e4b2dfbec3a7bd6f3c24dd79095b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 12 May 2016 09:19:59 +0200
Subject: sched/fair: Move record_wakee()

Since I want to make ->task_woken() conditional on the task getting
migrated, we cannot use it to call record_wakee().

Move it to select_task_rq_fair(), which gets called in almost all the
same conditions. The only exception is if the woken task (@p) is
CPU-bound (as per the nr_cpus_allowed test in select_task_rq()).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Hunter <ahh@google.com>
Cc: Ben Segall <bsegall@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Pavan Kondeti <pkondeti@codeaurora.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: byungchul.park@lge.com
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 61 +++++++++++++++++++++++++++++------------------------
 1 file changed, 33 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 84e465ae7c63..48633a1c3b46 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4810,24 +4810,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	return 0;
 }
 
-static void record_wakee(struct task_struct *p)
-{
-	/*
-	 * Rough decay (wiping) for cost saving, don't worry
-	 * about the boundary, really active task won't care
-	 * about the loss.
-	 */
-	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
-		current->wakee_flips >>= 1;
-		current->wakee_flip_decay_ts = jiffies;
-	}
-
-	if (current->last_wakee != p) {
-		current->last_wakee = p;
-		current->wakee_flips++;
-	}
-}
-
 static void task_waking_fair(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
@@ -4847,7 +4829,6 @@ static void task_waking_fair(struct task_struct *p)
 #endif
 
 	se->vruntime -= min_vruntime;
-	record_wakee(p);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -4965,17 +4946,39 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 
 #endif
 
+static void record_wakee(struct task_struct *p)
+{
+	/*
+	 * Only decay a single time; tasks that have less then 1 wakeup per
+	 * jiffy will not have built up many flips.
+	 */
+	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
+		current->wakee_flips >>= 1;
+		current->wakee_flip_decay_ts = jiffies;
+	}
+
+	if (current->last_wakee != p) {
+		current->last_wakee = p;
+		current->wakee_flips++;
+	}
+}
+
 /*
  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ *
  * A waker of many should wake a different task than the one last awakened
- * at a frequency roughly N times higher than one of its wakees.  In order
- * to determine whether we should let the load spread vs consolodating to
- * shared cache, we look for a minimum 'flip' frequency of llc_size in one
- * partner, and a factor of lls_size higher frequency in the other.  With
- * both conditions met, we can be relatively sure that the relationship is
- * non-monogamous, with partner count exceeding socket size.  Waker/wakee
- * being client/server, worker/dispatcher, interrupt source or whatever is
- * irrelevant, spread criteria is apparent partner count exceeds socket size.
+ * at a frequency roughly N times higher than one of its wakees.
+ *
+ * In order to determine whether we should let the load spread vs consolidating
+ * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of lls_size higher frequency in the other.
+ *
+ * With both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size.
+ *
+ * Waker/wakee being client/server, worker/dispatcher, interrupt source or
+ * whatever is irrelevant, spread criteria is apparent partner count exceeds
+ * socket size.
  */
 static int wake_wide(struct task_struct *p)
 {
@@ -5280,8 +5283,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	int want_affine = 0;
 	int sync = wake_flags & WF_SYNC;
 
-	if (sd_flag & SD_BALANCE_WAKE)
+	if (sd_flag & SD_BALANCE_WAKE) {
+		record_wakee(p);
 		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+	}
 
 	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
-- 
cgit v1.2.3


From b5179ac70de85ef477cedf8b026a57913754cf1e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 May 2016 16:10:34 +0200
Subject: sched/fair: Prepare to fix fairness problems on migration

Mike reported that our recent attempt to fix migration problems:

  3a47d5124a95 ("sched/fair: Fix fairness issue on migration")

broke interactivity and the signal starve test. We reverted that
commit and now let's try it again more carefully, with some other
underlying problems fixed first.

One problem is that I assumed ENQUEUE_WAKING was only set when we do a
cross-cpu wakeup (migration), which isn't true. This means we now
destroy the vruntime history of tasks and wakeup-preemption suffers.

Cure this by making my assumption true, only call
sched_class::task_waking() when we do a cross-cpu wakeup. This avoids
the indirect call in the case we do a local wakeup.

Reported-by: Mike Galbraith <mgalbraith@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Hunter <ahh@google.com>
Cc: Ben Segall <bsegall@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Pavan Kondeti <pkondeti@codeaurora.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: byungchul.park@lge.com
Cc: linux-kernel@vger.kernel.org
Fixes: 3a47d5124a95 ("sched/fair: Fix fairness issue on migration")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 29 +++++++++++++++++++++--------
 kernel/sched/fair.c | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1e622f254df4..1f73e2554bc1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1709,14 +1709,22 @@ static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 		 struct pin_cookie cookie)
 {
+	int en_flags = ENQUEUE_WAKEUP;
+
 	lockdep_assert_held(&rq->lock);
 
 #ifdef CONFIG_SMP
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible--;
+
+	/*
+	 * If we migrated; we must have called sched_class::task_waking().
+	 */
+	if (wake_flags & WF_MIGRATED)
+		en_flags |= ENQUEUE_WAKING;
 #endif
 
-	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+	ttwu_activate(rq, p, en_flags);
 	ttwu_do_wakeup(rq, p, wake_flags, cookie);
 }
 
@@ -1762,7 +1770,11 @@ void sched_ttwu_pending(void)
 	while (llist) {
 		p = llist_entry(llist, struct task_struct, wake_entry);
 		llist = llist_next(llist);
-		ttwu_do_activate(rq, p, 0, cookie);
+		/*
+		 * See ttwu_queue(); we only call ttwu_queue_remote() when
+		 * its a x-cpu wakeup.
+		 */
+		ttwu_do_activate(rq, p, WF_MIGRATED, cookie);
 	}
 
 	lockdep_unpin_lock(&rq->lock, cookie);
@@ -1849,7 +1861,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 }
 #endif /* CONFIG_SMP */
 
-static void ttwu_queue(struct task_struct *p, int cpu)
+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct pin_cookie cookie;
@@ -1864,7 +1876,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 
 	raw_spin_lock(&rq->lock);
 	cookie = lockdep_pin_lock(&rq->lock);
-	ttwu_do_activate(rq, p, 0, cookie);
+	ttwu_do_activate(rq, p, wake_flags, cookie);
 	lockdep_unpin_lock(&rq->lock, cookie);
 	raw_spin_unlock(&rq->lock);
 }
@@ -2034,17 +2046,18 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
 	p->state = TASK_WAKING;
 
-	if (p->sched_class->task_waking)
-		p->sched_class->task_waking(p);
-
 	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
 	if (task_cpu(p) != cpu) {
 		wake_flags |= WF_MIGRATED;
+
+		if (p->sched_class->task_waking)
+			p->sched_class->task_waking(p);
+
 		set_task_cpu(p, cpu);
 	}
 #endif /* CONFIG_SMP */
 
-	ttwu_queue(p, cpu);
+	ttwu_queue(p, cpu, wake_flags);
 stat:
 	if (schedstat_enabled())
 		ttwu_stat(p, cpu, wake_flags);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 48633a1c3b46..445bcd2d7ee1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3254,6 +3254,37 @@ static inline void check_schedstat_required(void)
 #endif
 }
 
+
+/*
+ * MIGRATION
+ *
+ *	dequeue
+ *	  update_curr()
+ *	    update_min_vruntime()
+ *	  vruntime -= min_vruntime
+ *
+ *	enqueue
+ *	  update_curr()
+ *	    update_min_vruntime()
+ *	  vruntime += min_vruntime
+ *
+ * this way the vruntime transition between RQs is done when both
+ * min_vruntime are up-to-date.
+ *
+ * WAKEUP (remote)
+ *
+ *	->task_waking_fair()
+ *	  vruntime -= min_vruntime
+ *
+ *	enqueue
+ *	  update_curr()
+ *	    update_min_vruntime()
+ *	  vruntime += min_vruntime
+ *
+ * this way we don't have the most up-to-date min_vruntime on the originating
+ * CPU and an up-to-date min_vruntime on the destination CPU.
+ */
+
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -4810,6 +4841,12 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	return 0;
 }
 
+/*
+ * Called to migrate a waking task; as blocked tasks retain absolute vruntime
+ * the migration needs to deal with this by subtracting the old and adding the
+ * new min_vruntime -- the latter is done by enqueue_entity() when placing
+ * the task on the new runqueue.
+ */
 static void task_waking_fair(struct task_struct *p)
 {
 	struct sched_entity *se = &p->se;
-- 
cgit v1.2.3


From 59efa0bac9cf8b2ef8d08f7632826c6d90f6a9bb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 10 May 2016 18:24:37 +0200
Subject: sched/core: Kill sched_class::task_waking to clean up the migration
 logic

With sched_class::task_waking being called only when we do
set_task_cpu(), we can make sched_class::migrate_task_rq() do the work
and eliminate sched_class::task_waking entirely.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Hunter <ahh@google.com>
Cc: Ben Segall <bsegall@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Pavan Kondeti <pkondeti@codeaurora.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: byungchul.park@lge.com
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  |  9 +-------
 kernel/sched/fair.c  | 58 +++++++++++++++++++++++++---------------------------
 kernel/sched/sched.h |  7 +++----
 3 files changed, 32 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1f73e2554bc1..636c4b9cac38 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1717,11 +1717,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 	if (p->sched_contributes_to_load)
 		rq->nr_uninterruptible--;
 
-	/*
-	 * If we migrated; we must have called sched_class::task_waking().
-	 */
 	if (wake_flags & WF_MIGRATED)
-		en_flags |= ENQUEUE_WAKING;
+		en_flags |= ENQUEUE_MIGRATED;
 #endif
 
 	ttwu_activate(rq, p, en_flags);
@@ -2049,10 +2046,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
 	if (task_cpu(p) != cpu) {
 		wake_flags |= WF_MIGRATED;
-
-		if (p->sched_class->task_waking)
-			p->sched_class->task_waking(p);
-
 		set_task_cpu(p, cpu);
 	}
 #endif /* CONFIG_SMP */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 445bcd2d7ee1..24ce01b73906 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3273,7 +3273,7 @@ static inline void check_schedstat_required(void)
  *
  * WAKEUP (remote)
  *
- *	->task_waking_fair()
+ *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
  *	  vruntime -= min_vruntime
  *
  *	enqueue
@@ -3292,7 +3292,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update the normalized vruntime before updating min_vruntime
 	 * through calling update_curr().
 	 */
-	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
+	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED))
 		se->vruntime += cfs_rq->min_vruntime;
 
 	/*
@@ -4841,33 +4841,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	return 0;
 }
 
-/*
- * Called to migrate a waking task; as blocked tasks retain absolute vruntime
- * the migration needs to deal with this by subtracting the old and adding the
- * new min_vruntime -- the latter is done by enqueue_entity() when placing
- * the task on the new runqueue.
- */
-static void task_waking_fair(struct task_struct *p)
-{
-	struct sched_entity *se = &p->se;
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	u64 min_vruntime;
-
-#ifndef CONFIG_64BIT
-	u64 min_vruntime_copy;
-
-	do {
-		min_vruntime_copy = cfs_rq->min_vruntime_copy;
-		smp_rmb();
-		min_vruntime = cfs_rq->min_vruntime;
-	} while (min_vruntime != min_vruntime_copy);
-#else
-	min_vruntime = cfs_rq->min_vruntime;
-#endif
-
-	se->vruntime -= min_vruntime;
-}
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * effective_load() calculates the load change as seen from the root_task_group
@@ -5402,6 +5375,32 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
  */
 static void migrate_task_rq_fair(struct task_struct *p)
 {
+	/*
+	 * As blocked tasks retain absolute vruntime the migration needs to
+	 * deal with this by subtracting the old and adding the new
+	 * min_vruntime -- the latter is done by enqueue_entity() when placing
+	 * the task on the new runqueue.
+	 */
+	if (p->state == TASK_WAKING) {
+		struct sched_entity *se = &p->se;
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		u64 min_vruntime;
+
+#ifndef CONFIG_64BIT
+		u64 min_vruntime_copy;
+
+		do {
+			min_vruntime_copy = cfs_rq->min_vruntime_copy;
+			smp_rmb();
+			min_vruntime = cfs_rq->min_vruntime;
+		} while (min_vruntime != min_vruntime_copy);
+#else
+		min_vruntime = cfs_rq->min_vruntime;
+#endif
+
+		se->vruntime -= min_vruntime;
+	}
+
 	/*
 	 * We are supposed to update the task to "current" time, then its up to date
 	 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
@@ -8672,7 +8671,6 @@ const struct sched_class fair_sched_class = {
 	.rq_online		= rq_online_fair,
 	.rq_offline		= rq_offline_fair,
 
-	.task_waking		= task_waking_fair,
 	.task_dead		= task_dead_fair,
 	.set_cpus_allowed	= set_cpus_allowed_common,
 #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ab6adb159e23..e51145e76807 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1168,7 +1168,7 @@ extern const u32 sched_prio_to_wmult[40];
  *
  * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
  * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
- * ENQUEUE_WAKING    - sched_class::task_waking was called
+ * ENQUEUE_MIGRATED  - the task was migrated during wakeup
  *
  */
 
@@ -1183,9 +1183,9 @@ extern const u32 sched_prio_to_wmult[40];
 #define ENQUEUE_HEAD		0x08
 #define ENQUEUE_REPLENISH	0x10
 #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING		0x20
+#define ENQUEUE_MIGRATED	0x20
 #else
-#define ENQUEUE_WAKING		0x00
+#define ENQUEUE_MIGRATED	0x00
 #endif
 
 #define RETRY_TASK		((void *)-1UL)
@@ -1217,7 +1217,6 @@ struct sched_class {
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
 	void (*migrate_task_rq)(struct task_struct *p);
 
-	void (*task_waking) (struct task_struct *task);
 	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
 
 	void (*set_cpus_allowed)(struct task_struct *p,
-- 
cgit v1.2.3


From 2f950354e6d535b892f133d20bd6a8b09430424c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 May 2016 19:27:56 +0200
Subject: sched/fair: Fix fairness issue on migration

Pavan reported that in the presence of very light tasks (or cgroups)
the placement of migrated tasks can cause severe fairness issues.

The problem is that enqueue_entity() places the task before it updates
time, thereby it can place the task far in the past (remember that
light tasks will shoot virtual time forward at a high speed, so in
relation to the pre-existing light task, we can land far in the past).

This is done because update_curr() needs the current task, and we
might be placing the current task.

The obvious solution is to differentiate between the current and any
other task; placing the current before we update time, and placing any
other task after, such that !curr tasks end up at the current moment
in time, and not in the past.

This commit re-introduces the previously reverted commit:

  3a47d5124a95 ("sched/fair: Fix fairness issue on migration")

... which is now safe to do, after we've also fixed another
underlying bug first, in:

  sched/fair: Prepare to fix fairness problems on migration

and cleaned up other details in the migration code:

  sched/core: Kill sched_class::task_waking

Reported-by: Pavan Kondeti <pkondeti@codeaurora.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 24ce01b73906..d28d89d774aa 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3288,17 +3288,27 @@ static inline void check_schedstat_required(void)
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
+	bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
+	bool curr = cfs_rq->curr == se;
+
 	/*
-	 * Update the normalized vruntime before updating min_vruntime
-	 * through calling update_curr().
+	 * If we're the current task, we must renormalise before calling
+	 * update_curr().
 	 */
-	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED))
+	if (renorm && curr)
 		se->vruntime += cfs_rq->min_vruntime;
 
+	update_curr(cfs_rq);
+
 	/*
-	 * Update run-time statistics of the 'current'.
+	 * Otherwise, renormalise after, such that we're placed at the current
+	 * moment in time, instead of some random moment in the past. Being
+	 * placed in the past could significantly boost this task to the
+	 * fairness detriment of existing tasks.
 	 */
-	update_curr(cfs_rq);
+	if (renorm && !curr)
+		se->vruntime += cfs_rq->min_vruntime;
+
 	enqueue_entity_load_avg(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
@@ -3314,7 +3324,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		update_stats_enqueue(cfs_rq, se);
 		check_spread(cfs_rq, se);
 	}
-	if (se != cfs_rq->curr)
+	if (!curr)
 		__enqueue_entity(cfs_rq, se);
 	se->on_rq = 1;
 
-- 
cgit v1.2.3


From 444969223c81c7d0a95136b7b4cfdcfbc96ac5bd Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Wed, 4 May 2016 14:45:34 +0800
Subject: sched/nohz: Fix affine unpinned timers mess

The following commit:

  9642d18eee2c ("nohz: Affine unpinned timers to housekeepers")'

intended to affine unpinned timers to housekeepers:

  unpinned timers(full dynaticks, idle)   =>   nearest busy housekeepers(otherwise, fallback to any housekeepers)
  unpinned timers(full dynaticks, busy)   =>   nearest busy housekeepers(otherwise, fallback to any housekeepers)
  unpinned timers(houserkeepers, idle)    =>   nearest busy housekeepers(otherwise, fallback to itself)

However, the !idle_cpu(i) && is_housekeeping_cpu(cpu) check modified the
intention to:

  unpinned timers(full dynaticks, idle)   =>   any housekeepers(no mattter cpu topology)
  unpinned timers(full dynaticks, busy)   =>   any housekeepers(no mattter cpu topology)
  unpinned timers(housekeepers, idle)     =>   any busy cpus(otherwise, fallback to any housekeepers)

This patch fixes it by checking if there are busy housekeepers nearby,
otherwise falls to any housekeepers/itself. After the patch:

  unpinned timers(full dynaticks, idle)   =>   nearest busy housekeepers(otherwise, fallback to any housekeepers)
  unpinned timers(full dynaticks, busy)   =>   nearest busy housekeepers(otherwise, fallback to any housekeepers)
  unpinned timers(housekeepers, idle)     =>   nearest busy housekeepers(otherwise, fallback to itself)

Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
[ Fixed the changelog. ]
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Fixes: 'commit 9642d18eee2c ("nohz: Affine unpinned timers to housekeepers")'
Link: http://lkml.kernel.org/r/1462344334-8303-1-git-send-email-wanpeng.li@hotmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 636c4b9cac38..6f6962a356b4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -533,7 +533,10 @@ int get_nohz_timer_target(void)
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		for_each_cpu(i, sched_domain_span(sd)) {
-			if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
+			if (cpu == i)
+				continue;
+
+			if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
 				cpu = i;
 				goto unlock;
 			}
-- 
cgit v1.2.3


From 1be0eb2a97d756fb7dd8c9baf372d81fa9699c09 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 6 May 2016 12:21:23 +0200
Subject: sched/fair: Clean up scale confusion

Wanpeng noted that the scale_load_down() in calculate_imbalance() was
weird. I agree, it should be SCHED_CAPACITY_SCALE, since we're going
to compare against busiest->group_capacity, which is in [capacity]
units.

Reported-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yuyang Du <yuyang.du@intel.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d28d89d774aa..23381056b4b5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7066,8 +7066,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 */
 	if (busiest->group_type == group_overloaded &&
 	    local->group_type   == group_overloaded) {
-		load_above_capacity = busiest->sum_nr_running *
-				      scale_load_down(NICE_0_LOAD);
+		load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
 		if (load_above_capacity > busiest->group_capacity)
 			load_above_capacity -= busiest->group_capacity;
 		else
-- 
cgit v1.2.3


From cfa10334318d8212d007da8c771187643c9cef35 Mon Sep 17 00:00:00 2001
From: Morten Rasmussen <morten.rasmussen@arm.com>
Date: Fri, 29 Apr 2016 20:32:40 +0100
Subject: sched/fair: Correct unit of load_above_capacity

In calculate_imbalance() load_above_capacity currently has the unit
[capacity] while it is used as being [load/capacity]. Not only is it
wrong it also makes it unlikely that load_above_capacity is ever used
as the subsequent code picks the smaller of load_above_capacity and
the avg_load

This patch ensures that load_above_capacity has the right unit
[load/capacity].

Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>
[ Changed changelog to note it was in capacity unit; +rebase. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1461958364-675-4-git-send-email-dietmar.eggemann@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 23381056b4b5..218f8e83db73 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7067,9 +7067,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	if (busiest->group_type == group_overloaded &&
 	    local->group_type   == group_overloaded) {
 		load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
-		if (load_above_capacity > busiest->group_capacity)
+		if (load_above_capacity > busiest->group_capacity) {
 			load_above_capacity -= busiest->group_capacity;
-		else
+			load_above_capacity *= NICE_0_LOAD;
+			load_above_capacity /= busiest->group_capacity;
+		} else
 			load_above_capacity = ~0UL;
 	}
 
-- 
cgit v1.2.3


From 20878232c52329f92423d27a60e48b6a6389e0dd Mon Sep 17 00:00:00 2001
From: Vik Heyndrickx <vik.heyndrickx@veribox.net>
Date: Thu, 28 Apr 2016 20:46:28 +0200
Subject: sched/loadavg: Fix loadavg artifacts on fully idle and on fully
 loaded systems

Systems show a minimal load average of 0.00, 0.01, 0.05 even when they
have no load at all.

Uptime and /proc/loadavg on all systems with kernels released during the
last five years up until kernel version 4.6-rc5, show a 5- and 15-minute
minimum loadavg of 0.01 and 0.05 respectively. This should be 0.00 on
idle systems, but the way the kernel calculates this value prevents it
from getting lower than the mentioned values.

Likewise but not as obviously noticeable, a fully loaded system with no
processes waiting, shows a maximum 1/5/15 loadavg of 1.00, 0.99, 0.95
(multiplied by number of cores).

Once the (old) load becomes 93 or higher, it mathematically can never
get lower than 93, even when the active (load) remains 0 forever.
This results in the strange 0.00, 0.01, 0.05 uptime values on idle
systems.  Note: 93/2048 = 0.0454..., which rounds up to 0.05.

It is not correct to add a 0.5 rounding (=1024/2048) here, since the
result from this function is fed back into the next iteration again,
so the result of that +0.5 rounding value then gets multiplied by
(2048-2037), and then rounded again, so there is a virtual "ghost"
load created, next to the old and active load terms.

By changing the way the internally kept value is rounded, that internal
value equivalent now can reach 0.00 on idle, and 1.00 on full load. Upon
increasing load, the internally kept load value is rounded up, when the
load is decreasing, the load value is rounded down.

The modified code was tested on nohz=off and nohz kernels. It was tested
on vanilla kernel 4.6-rc5 and on centos 7.1 kernel 3.10.0-327. It was
tested on single, dual, and octal cores system. It was tested on virtual
hosts and bare hardware. No unwanted effects have been observed, and the
problems that the patch intended to fix were indeed gone.

Tested-by: Damien Wyart <damien.wyart@free.fr>
Signed-off-by: Vik Heyndrickx <vik.heyndrickx@veribox.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Doug Smythies <dsmythies@telus.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 0f004f5a696a ("sched: Cure more NO_HZ load average woes")
Link: http://lkml.kernel.org/r/e8d32bff-d544-7748-72b5-3c86cc71f09f@veribox.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/loadavg.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index ef7159012cf3..b0b93fd33af9 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -99,10 +99,13 @@ long calc_load_fold_active(struct rq *this_rq)
 static unsigned long
 calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
-	load *= exp;
-	load += active * (FIXED_1 - exp);
-	load += 1UL << (FSHIFT - 1);
-	return load >> FSHIFT;
+	unsigned long newload;
+
+	newload = load * exp + active * (FIXED_1 - exp);
+	if (active >= load)
+		newload += FIXED_1-1;
+
+	return newload / FIXED_1;
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
-- 
cgit v1.2.3


From ade42e092b5d1fb9a77b026f019b9953d66f1573 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 11 May 2016 14:23:30 +0200
Subject: sched/core: Use tsk_cpus_allowed() instead of accessing
 ->cpus_allowed

Use the future-safe accessor for struct task_struct's.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1462969411-17735-1-git-send-email-bigeasy@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/cpudeadline.c | 4 ++--
 kernel/sched/cpupri.c      | 4 ++--
 kernel/sched/deadline.c    | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5a75b08cfd85..5be58820465c 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -103,10 +103,10 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
 	const struct sched_dl_entity *dl_se = &p->dl;
 
 	if (later_mask &&
-	    cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
+	    cpumask_and(later_mask, cp->free_cpus, tsk_cpus_allowed(p))) {
 		best_cpu = cpumask_any(later_mask);
 		goto out;
-	} else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
+	} else if (cpumask_test_cpu(cpudl_maximum(cp), tsk_cpus_allowed(p)) &&
 			dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
 		best_cpu = cpudl_maximum(cp);
 		if (later_mask)
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7dc394..11e9705bf937 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
 		if (skip)
 			continue;
 
-		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
+		if (cpumask_any_and(tsk_cpus_allowed(p), vec->mask) >= nr_cpu_ids)
 			continue;
 
 		if (lowest_mask) {
-			cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+			cpumask_and(lowest_mask, tsk_cpus_allowed(p), vec->mask);
 
 			/*
 			 * We have to ensure that we have at least one bit
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0ac6c84f3371..4c9b4eee3f1c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1393,7 +1393,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
 		if (double_lock_balance(rq, later_rq)) {
 			if (unlikely(task_rq(task) != rq ||
 				     !cpumask_test_cpu(later_rq->cpu,
-				                       &task->cpus_allowed) ||
+						       tsk_cpus_allowed(task)) ||
 				     task_running(rq, task) ||
 				     !dl_task(task) ||
 				     !task_on_rq_queued(task))) {
-- 
cgit v1.2.3


From 50605ffbdaf6d7ccab70d4631fd8347fc78af14f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 11 May 2016 14:23:31 +0200
Subject: sched/core: Provide a tsk_nr_cpus_allowed() helper

tsk_nr_cpus_allowed() is an accessor for task->nr_cpus_allowed which allows
us to change the representation of ->nr_cpus_allowed if required.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1462969411-17735-2-git-send-email-bigeasy@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h   |  5 +++++
 kernel/sched/core.c     |  2 +-
 kernel/sched/deadline.c | 28 ++++++++++++++--------------
 kernel/sched/rt.c       | 24 ++++++++++++------------
 4 files changed, 32 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f8fea8a5b5ab..38526b67e787 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1930,6 +1930,11 @@ extern int arch_task_struct_size __read_mostly;
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
+static inline int tsk_nr_cpus_allowed(struct task_struct *p)
+{
+	return p->nr_cpus_allowed;
+}
+
 #define TNF_MIGRATED	0x01
 #define TNF_NO_GROUP	0x02
 #define TNF_SHARED	0x04
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6f6962a356b4..404c0784b1fc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1585,7 +1585,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
 	lockdep_assert_held(&p->pi_lock);
 
-	if (p->nr_cpus_allowed > 1)
+	if (tsk_nr_cpus_allowed(p) > 1)
 		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 	else
 		cpu = cpumask_any(tsk_cpus_allowed(p));
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 4c9b4eee3f1c..fcb7f0217ff4 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -134,7 +134,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
 	struct task_struct *p = dl_task_of(dl_se);
 
-	if (p->nr_cpus_allowed > 1)
+	if (tsk_nr_cpus_allowed(p) > 1)
 		dl_rq->dl_nr_migratory++;
 
 	update_dl_migration(dl_rq);
@@ -144,7 +144,7 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
 {
 	struct task_struct *p = dl_task_of(dl_se);
 
-	if (p->nr_cpus_allowed > 1)
+	if (tsk_nr_cpus_allowed(p) > 1)
 		dl_rq->dl_nr_migratory--;
 
 	update_dl_migration(dl_rq);
@@ -966,7 +966,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 
 	enqueue_dl_entity(&p->dl, pi_se, flags);
 
-	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+	if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
 		enqueue_pushable_dl_task(rq, p);
 }
 
@@ -1040,9 +1040,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
 	 * try to make it stay here, it might be important.
 	 */
 	if (unlikely(dl_task(curr)) &&
-	    (curr->nr_cpus_allowed < 2 ||
+	    (tsk_nr_cpus_allowed(curr) < 2 ||
 	     !dl_entity_preempt(&p->dl, &curr->dl)) &&
-	    (p->nr_cpus_allowed > 1)) {
+	    (tsk_nr_cpus_allowed(p) > 1)) {
 		int target = find_later_rq(p);
 
 		if (target != -1 &&
@@ -1063,7 +1063,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	 * Current can't be migrated, useless to reschedule,
 	 * let's hope p can move out.
 	 */
-	if (rq->curr->nr_cpus_allowed == 1 ||
+	if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
 	    cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
 		return;
 
@@ -1071,7 +1071,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 	 * p is migratable, so let's not schedule it and
 	 * see if it is pushed or pulled somewhere else.
 	 */
-	if (p->nr_cpus_allowed != 1 &&
+	if (tsk_nr_cpus_allowed(p) != 1 &&
 	    cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
 		return;
 
@@ -1186,7 +1186,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
 {
 	update_curr_dl(rq);
 
-	if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
+	if (on_dl_rq(&p->dl) && tsk_nr_cpus_allowed(p) > 1)
 		enqueue_pushable_dl_task(rq, p);
 }
 
@@ -1287,7 +1287,7 @@ static int find_later_rq(struct task_struct *task)
 	if (unlikely(!later_mask))
 		return -1;
 
-	if (task->nr_cpus_allowed == 1)
+	if (tsk_nr_cpus_allowed(task) == 1)
 		return -1;
 
 	/*
@@ -1433,7 +1433,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
 
 	BUG_ON(rq->cpu != task_cpu(p));
 	BUG_ON(task_current(rq, p));
-	BUG_ON(p->nr_cpus_allowed <= 1);
+	BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
 
 	BUG_ON(!task_on_rq_queued(p));
 	BUG_ON(!dl_task(p));
@@ -1472,7 +1472,7 @@ retry:
 	 */
 	if (dl_task(rq->curr) &&
 	    dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
-	    rq->curr->nr_cpus_allowed > 1) {
+	    tsk_nr_cpus_allowed(rq->curr) > 1) {
 		resched_curr(rq);
 		return 0;
 	}
@@ -1619,9 +1619,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
 {
 	if (!task_running(rq, p) &&
 	    !test_tsk_need_resched(rq->curr) &&
-	    p->nr_cpus_allowed > 1 &&
+	    tsk_nr_cpus_allowed(p) > 1 &&
 	    dl_task(rq->curr) &&
-	    (rq->curr->nr_cpus_allowed < 2 ||
+	    (tsk_nr_cpus_allowed(rq->curr) < 2 ||
 	     !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
 		push_dl_tasks(rq);
 	}
@@ -1725,7 +1725,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
+		if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
 			queue_push_tasks(rq);
 #else
 		if (dl_task(rq->curr))
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 67afa06cc8bc..d5690b722691 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -334,7 +334,7 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 
 	rt_rq->rt_nr_total++;
-	if (p->nr_cpus_allowed > 1)
+	if (tsk_nr_cpus_allowed(p) > 1)
 		rt_rq->rt_nr_migratory++;
 
 	update_rt_migration(rt_rq);
@@ -351,7 +351,7 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 
 	rt_rq->rt_nr_total--;
-	if (p->nr_cpus_allowed > 1)
+	if (tsk_nr_cpus_allowed(p) > 1)
 		rt_rq->rt_nr_migratory--;
 
 	update_rt_migration(rt_rq);
@@ -1324,7 +1324,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 
 	enqueue_rt_entity(rt_se, flags);
 
-	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+	if (!task_current(rq, p) && tsk_nr_cpus_allowed(p) > 1)
 		enqueue_pushable_task(rq, p);
 }
 
@@ -1413,7 +1413,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
 	 * will have to sort it out.
 	 */
 	if (curr && unlikely(rt_task(curr)) &&
-	    (curr->nr_cpus_allowed < 2 ||
+	    (tsk_nr_cpus_allowed(curr) < 2 ||
 	     curr->prio <= p->prio)) {
 		int target = find_lowest_rq(p);
 
@@ -1437,7 +1437,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 	 * Current can't be migrated, useless to reschedule,
 	 * let's hope p can move out.
 	 */
-	if (rq->curr->nr_cpus_allowed == 1 ||
+	if (tsk_nr_cpus_allowed(rq->curr) == 1 ||
 	    !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
 		return;
 
@@ -1445,7 +1445,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 	 * p is migratable, so let's not schedule it and
 	 * see if it is pushed or pulled somewhere else.
 	 */
-	if (p->nr_cpus_allowed != 1
+	if (tsk_nr_cpus_allowed(p) != 1
 	    && cpupri_find(&rq->rd->cpupri, p, NULL))
 		return;
 
@@ -1579,7 +1579,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 	 * The previous task needs to be made eligible for pushing
 	 * if it is still active
 	 */
-	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
+	if (on_rt_rq(&p->rt) && tsk_nr_cpus_allowed(p) > 1)
 		enqueue_pushable_task(rq, p);
 }
 
@@ -1629,7 +1629,7 @@ static int find_lowest_rq(struct task_struct *task)
 	if (unlikely(!lowest_mask))
 		return -1;
 
-	if (task->nr_cpus_allowed == 1)
+	if (tsk_nr_cpus_allowed(task) == 1)
 		return -1; /* No other targets possible */
 
 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1762,7 +1762,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
 
 	BUG_ON(rq->cpu != task_cpu(p));
 	BUG_ON(task_current(rq, p));
-	BUG_ON(p->nr_cpus_allowed <= 1);
+	BUG_ON(tsk_nr_cpus_allowed(p) <= 1);
 
 	BUG_ON(!task_on_rq_queued(p));
 	BUG_ON(!rt_task(p));
@@ -2122,9 +2122,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
 {
 	if (!task_running(rq, p) &&
 	    !test_tsk_need_resched(rq->curr) &&
-	    p->nr_cpus_allowed > 1 &&
+	    tsk_nr_cpus_allowed(p) > 1 &&
 	    (dl_task(rq->curr) || rt_task(rq->curr)) &&
-	    (rq->curr->nr_cpus_allowed < 2 ||
+	    (tsk_nr_cpus_allowed(rq->curr) < 2 ||
 	     rq->curr->prio <= p->prio))
 		push_rt_tasks(rq);
 }
@@ -2197,7 +2197,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
 	 */
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+		if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
 			queue_push_tasks(rq);
 #else
 		if (p->prio < rq->curr->prio)
-- 
cgit v1.2.3


From 3f56e687a138481894a1088d5aa7d41951bdb020 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Tue, 10 May 2016 16:18:33 +0300
Subject: perf/core: Disable the event on a truncated AUX record

When the PMU driver reports a truncated AUX record, it effectively means
that there is no more usable room in the event's AUX buffer (even though
there may still be some room, so that perf_aux_output_begin() doesn't take
action). At this point the consumer still has to be woken up and the event
has to be disabled, otherwise the event will just keep spinning between
perf_aux_output_begin() and perf_aux_output_end() until its context gets
unscheduled.

Again, for cpu-wide events this means never, so once in this condition,
they will be forever losing data.

Fix this by disabling the event and waking up the consumer in case of a
truncated AUX record.

Reported-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/1462886313-13660-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/ring_buffer.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index c49bab42dc57..ae9b90dc9a5a 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -405,6 +405,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
 			 bool truncated)
 {
 	struct ring_buffer *rb = handle->rb;
+	bool wakeup = truncated;
 	unsigned long aux_head;
 	u64 flags = 0;
 
@@ -433,9 +434,16 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
 	aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
 
 	if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
-		perf_output_wakeup(handle);
+		wakeup = true;
 		local_add(rb->aux_watermark, &rb->aux_wakeup);
 	}
+
+	if (wakeup) {
+		if (truncated)
+			handle->event->pending_disable = 1;
+		perf_output_wakeup(handle);
+	}
+
 	handle->event = NULL;
 
 	local_set(&rb->aux_nest, 0);
-- 
cgit v1.2.3


From 9f448cd3cbcec8995935e60b27802ae56aac8cc0 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Tue, 10 May 2016 16:18:33 +0300
Subject: perf/core: Disable the event on a truncated AUX record

When the PMU driver reports a truncated AUX record, it effectively means
that there is no more usable room in the event's AUX buffer (even though
there may still be some room, so that perf_aux_output_begin() doesn't take
action). At this point the consumer still has to be woken up and the event
has to be disabled, otherwise the event will just keep spinning between
perf_aux_output_begin() and perf_aux_output_end() until its context gets
unscheduled.

Again, for cpu-wide events this means never, so once in this condition,
they will be forever losing data.

Fix this by disabling the event and waking up the consumer in case of a
truncated AUX record.

Reported-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/1462886313-13660-3-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/ring_buffer.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index c61f0cbd308b..7611d0f66cf8 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -347,6 +347,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
 			 bool truncated)
 {
 	struct ring_buffer *rb = handle->rb;
+	bool wakeup = truncated;
 	unsigned long aux_head;
 	u64 flags = 0;
 
@@ -375,9 +376,16 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
 	aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
 
 	if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
-		perf_output_wakeup(handle);
+		wakeup = true;
 		local_add(rb->aux_watermark, &rb->aux_wakeup);
 	}
+
+	if (wakeup) {
+		if (truncated)
+			handle->event->pending_disable = 1;
+		perf_output_wakeup(handle);
+	}
+
 	handle->event = NULL;
 
 	local_set(&rb->aux_nest, 0);
-- 
cgit v1.2.3


From 09be4c824ebdbf3c043a07d2d9537a0164a1ecfe Mon Sep 17 00:00:00 2001
From: Felipe Balbi <felipe.balbi@linux.intel.com>
Date: Thu, 12 May 2016 12:34:38 +0300
Subject: cgroup: fix compile warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 4f41fc59620f ("cgroup, kernfs: make mountinfo
 show properly scoped path for cgroup namespaces")
 added the following compile warning:

kernel/cgroup.c: In function ‘cgroup_show_path’:
kernel/cgroup.c:1634:15: warning: unused variable ‘ret’ [-Wunused-variable]
  int len = 0, ret = 0;
               ^
fix it.

Fixes: 4f41fc59620f ("cgroup, kernfs: make mountinfo show properly scoped path for cgroup namespaces")
Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index afea39eb7649..86cb5c6e8932 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1631,7 +1631,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 			    struct kernfs_root *kf_root)
 {
-	int len = 0, ret = 0;
+	int len = 0;
 	char *buf = NULL;
 	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
 	struct cgroup *ns_cgroup;
-- 
cgit v1.2.3


From f7c17d26f43d5cc1b7a6b896cd2fa24a079739b9 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Wed, 11 May 2016 17:55:18 +0800
Subject: workqueue: fix rebind bound workers warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

------------[ cut here ]------------
WARNING: CPU: 0 PID: 16 at kernel/workqueue.c:4559 rebind_workers+0x1c0/0x1d0
Modules linked in:
CPU: 0 PID: 16 Comm: cpuhp/0 Not tainted 4.6.0-rc4+ #31
Hardware name: IBM IBM System x3550 M4 Server -[7914IUW]-/00Y8603, BIOS -[D7E128FUS-1.40]- 07/23/2013
 0000000000000000 ffff881037babb58 ffffffff8139d885 0000000000000010
 0000000000000000 0000000000000000 0000000000000000 ffff881037babba8
 ffffffff8108505d ffff881037ba0000 000011cf3e7d6e60 0000000000000046
Call Trace:
 dump_stack+0x89/0xd4
 __warn+0xfd/0x120
 warn_slowpath_null+0x1d/0x20
 rebind_workers+0x1c0/0x1d0
 workqueue_cpu_up_callback+0xf5/0x1d0
 notifier_call_chain+0x64/0x90
 ? trace_hardirqs_on_caller+0xf2/0x220
 ? notify_prepare+0x80/0x80
 __raw_notifier_call_chain+0xe/0x10
 __cpu_notify+0x35/0x50
 notify_down_prepare+0x5e/0x80
 ? notify_prepare+0x80/0x80
 cpuhp_invoke_callback+0x73/0x330
 ? __schedule+0x33e/0x8a0
 cpuhp_down_callbacks+0x51/0xc0
 cpuhp_thread_fun+0xc1/0xf0
 smpboot_thread_fn+0x159/0x2a0
 ? smpboot_create_threads+0x80/0x80
 kthread+0xef/0x110
 ? wait_for_completion+0xf0/0x120
 ? schedule_tail+0x35/0xf0
 ret_from_fork+0x22/0x50
 ? __init_kthread_worker+0x70/0x70
---[ end trace eb12ae47d2382d8f ]---
notify_down_prepare: attempt to take down CPU 0 failed

This bug can be reproduced by below config w/ nohz_full= all cpus:

CONFIG_BOOTPARAM_HOTPLUG_CPU0=y
CONFIG_DEBUG_HOTPLUG_CPU0=y
CONFIG_NO_HZ_FULL=y

As Thomas pointed out:

| If a down prepare callback fails, then DOWN_FAILED is invoked for all
| callbacks which have successfully executed DOWN_PREPARE.
|
| But, workqueue has actually two notifiers. One which handles
| UP/DOWN_FAILED/ONLINE and one which handles DOWN_PREPARE.
|
| Now look at the priorities of those callbacks:
|
| CPU_PRI_WORKQUEUE_UP        = 5
| CPU_PRI_WORKQUEUE_DOWN      = -5
|
| So the call order on DOWN_PREPARE is:
|
| CB 1
| CB ...
| CB workqueue_up() -> Ignores DOWN_PREPARE
| CB ...
| CB X ---> Fails
|
| So we call up to CB X with DOWN_FAILED
|
| CB 1
| CB ...
| CB workqueue_up() -> Handles DOWN_FAILED
| CB ...
| CB X-1
|
| So the problem is that the workqueue stuff handles DOWN_FAILED in the up
| callback, while it should do it in the down callback. Which is not a good idea
| either because it wants to be called early on rollback...
|
| Brilliant stuff, isn't it? The hotplug rework will solve this problem because
| the callbacks become symetric, but for the existing mess, we need some
| workaround in the workqueue code.

The boot CPU handles housekeeping duty(unbound timers, workqueues,
timekeeping, ...) on behalf of full dynticks CPUs. It must remain
online when nohz full is enabled. There is a priority set to every
notifier_blocks:

workqueue_cpu_up > tick_nohz_cpu_down > workqueue_cpu_down

So tick_nohz_cpu_down callback failed when down prepare cpu 0, and
notifier_blocks behind tick_nohz_cpu_down will not be called any
more, which leads to workers are actually not unbound. Then hotplug
state machine will fallback to undo and online cpu 0 again. Workers
will be rebound unconditionally even if they are not unbound and
trigger the warning in this progress.

This patch fix it by catching !DISASSOCIATED to avoid rebind bound
workers.

Cc: Tejun Heo <tj@kernel.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: stable@vger.kernel.org
Suggested-by: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
---
 kernel/workqueue.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 801a698564d4..1b2e36b2f31f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4555,6 +4555,17 @@ static void rebind_workers(struct worker_pool *pool)
 						  pool->attrs->cpumask) < 0);
 
 	spin_lock_irq(&pool->lock);
+
+	/*
+	 * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
+	 * w/o preceding DOWN_PREPARE.  Work around it.  CPU hotplug is
+	 * being reworked and this can go away in time.
+	 */
+	if (!(pool->flags & POOL_DISASSOCIATED)) {
+		spin_unlock_irq(&pool->lock);
+		return;
+	}
+
 	pool->flags &= ~POOL_DISASSOCIATED;
 
 	for_each_pool_worker(worker, pool) {
-- 
cgit v1.2.3


From c983f0e8677d5d686ea81de13f9c9ac34f21a2a7 Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@imgtec.com>
Date: Tue, 29 Mar 2016 09:35:32 +0100
Subject: seccomp: Get compat syscalls from asm-generic header

Move retrieval of compat syscall numbers into inline function defined in
asm-generic header so that arches may override it.

[ralf@linux-mips.org: Resolve merge conflict.]

Suggested-by: Paul Burton <paul.burton@imgtec.com>
Signed-off-by: Matt Redfearn <matt.redfearn@imgtec.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: IMG-MIPSLinuxKerneldevelopers@imgtec.com
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will Drewry <wad@chromium.org>
Cc: linux-arch@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Patchwork: https://patchwork.linux-mips.org/patch/12978/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 include/asm-generic/seccomp.h | 14 ++++++++++++++
 kernel/seccomp.c              |  9 +--------
 2 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/seccomp.h b/include/asm-generic/seccomp.h
index c9ccafa0d99a..e74072d23e69 100644
--- a/include/asm-generic/seccomp.h
+++ b/include/asm-generic/seccomp.h
@@ -29,4 +29,18 @@
 #define __NR_seccomp_sigreturn		__NR_rt_sigreturn
 #endif
 
+#ifdef CONFIG_COMPAT
+#ifndef get_compat_mode1_syscalls
+static inline const int *get_compat_mode1_syscalls(void)
+{
+	static const int mode1_syscalls_32[] = {
+		__NR_seccomp_read_32, __NR_seccomp_write_32,
+		__NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
+		0, /* null terminated */
+	};
+	return mode1_syscalls_32;
+}
+#endif
+#endif /* CONFIG_COMPAT */
+
 #endif /* _ASM_GENERIC_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index e1e5a354854e..737436ebb4fe 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -518,19 +518,12 @@ static int mode1_syscalls[] = {
 	0, /* null terminated */
 };
 
-#ifdef CONFIG_COMPAT
-static int mode1_syscalls_32[] = {
-	__NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
-	0, /* null terminated */
-};
-#endif
-
 static void __secure_computing_strict(int this_syscall)
 {
 	int *syscall_whitelist = mode1_syscalls;
 #ifdef CONFIG_COMPAT
 	if (in_compat_syscall())
-		syscall_whitelist = mode1_syscalls_32;
+		syscall_whitelist = get_compat_mode1_syscalls();
 #endif
 	do {
 		if (*syscall_whitelist == this_syscall)
-- 
cgit v1.2.3


From cb4253aa0f77f20be018970dbe5d01d78b930ef9 Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@imgtec.com>
Date: Tue, 29 Mar 2016 09:35:34 +0100
Subject: secomp: Constify mode1 syscall whitelist

These values are constant and should be marked as such.

Signed-off-by: Matt Redfearn <matt.redfearn@imgtec.com>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Will Drewry <wad@chromium.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: IMG-MIPSLinuxKerneldevelopers@imgtec.com
Cc: linux-kernel@vger.kernel.org
Patchwork: https://patchwork.linux-mips.org/patch/12979/
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
---
 kernel/seccomp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 737436ebb4fe..a0ffcb1a2bee 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -513,14 +513,14 @@ static void seccomp_send_sigsys(int syscall, int reason)
  * To be fully secure this must be combined with rlimit
  * to limit the stack allocations too.
  */
-static int mode1_syscalls[] = {
+static const int mode1_syscalls[] = {
 	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
 	0, /* null terminated */
 };
 
 static void __secure_computing_strict(int this_syscall)
 {
-	int *syscall_whitelist = mode1_syscalls;
+	const int *syscall_whitelist = mode1_syscalls;
 #ifdef CONFIG_COMPAT
 	if (in_compat_syscall())
 		syscall_whitelist = get_compat_mode1_syscalls();
-- 
cgit v1.2.3


From 9b94a8fba501f38368aef6ac1b30e7335252a220 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Thu, 12 May 2016 11:01:24 -0400
Subject: ring-buffer: Use long for nr_pages to avoid overflow failures

The size variable to change the ring buffer in ftrace is a long. The
nr_pages used to update the ring buffer based on the size is int. On 64 bit
machines this can cause an overflow problem.

For example, the following will cause the ring buffer to crash:

 # cd /sys/kernel/debug/tracing
 # echo 10 > buffer_size_kb
 # echo 8556384240 > buffer_size_kb

Then you get the warning of:

 WARNING: CPU: 1 PID: 318 at kernel/trace/ring_buffer.c:1527 rb_update_pages+0x22f/0x260

Which is:

  RB_WARN_ON(cpu_buffer, nr_removed);

Note each ring buffer page holds 4080 bytes.

This is because:

 1) 10 causes the ring buffer to have 3 pages.
    (10kb requires 3 * 4080 pages to hold)

 2) (2^31 / 2^10  + 1) * 4080 = 8556384240
    The value written into buffer_size_kb is shifted by 10 and then passed
    to ring_buffer_resize(). 8556384240 * 2^10 = 8761737461760

 3) The size passed to ring_buffer_resize() is then divided by BUF_PAGE_SIZE
    which is 4080. 8761737461760 / 4080 = 2147484672

 4) nr_pages is subtracted from the current nr_pages (3) and we get:
    2147484669. This value is saved in a signed integer nr_pages_to_update

 5) 2147484669 is greater than 2^31 but smaller than 2^32, a signed int
    turns into the value of -2147482627

 6) As the value is a negative number, in update_pages_handler() it is
    negated and passed to rb_remove_pages() and 2147482627 pages will
    be removed, which is much larger than 3 and it causes the warning
    because not all the pages asked to be removed were removed.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=118001

Cc: stable@vger.kernel.org # 2.6.28+
Fixes: 7a8e76a3829f1 ("tracing: unified trace buffer")
Reported-by: Hao Qin <QEver.cn@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 95181e36891a..99d64cd58c52 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -437,7 +437,7 @@ struct ring_buffer_per_cpu {
 	raw_spinlock_t			reader_lock;	/* serialize readers */
 	arch_spinlock_t			lock;
 	struct lock_class_key		lock_key;
-	unsigned int			nr_pages;
+	unsigned long			nr_pages;
 	unsigned int			current_context;
 	struct list_head		*pages;
 	struct buffer_page		*head_page;	/* read from head */
@@ -458,7 +458,7 @@ struct ring_buffer_per_cpu {
 	u64				write_stamp;
 	u64				read_stamp;
 	/* ring buffer pages to update, > 0 to add, < 0 to remove */
-	int				nr_pages_to_update;
+	long				nr_pages_to_update;
 	struct list_head		new_pages; /* new pages to add */
 	struct work_struct		update_pages_work;
 	struct completion		update_done;
@@ -1128,10 +1128,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
 	return 0;
 }
 
-static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
+static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
 {
-	int i;
 	struct buffer_page *bpage, *tmp;
+	long i;
 
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page;
@@ -1168,7 +1168,7 @@ free_pages:
 }
 
 static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
-			     unsigned nr_pages)
+			     unsigned long nr_pages)
 {
 	LIST_HEAD(pages);
 
@@ -1193,7 +1193,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 }
 
 static struct ring_buffer_per_cpu *
-rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, long nr_pages, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct buffer_page *bpage;
@@ -1293,8 +1293,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
 					struct lock_class_key *key)
 {
 	struct ring_buffer *buffer;
+	long nr_pages;
 	int bsize;
-	int cpu, nr_pages;
+	int cpu;
 
 	/* keep it in its own cache line */
 	buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1420,12 +1421,12 @@ static inline unsigned long rb_page_write(struct buffer_page *bpage)
 }
 
 static int
-rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
 {
 	struct list_head *tail_page, *to_remove, *next_page;
 	struct buffer_page *to_remove_page, *tmp_iter_page;
 	struct buffer_page *last_page, *first_page;
-	unsigned int nr_removed;
+	unsigned long nr_removed;
 	unsigned long head_bit;
 	int page_entries;
 
@@ -1642,7 +1643,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
 			int cpu_id)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned nr_pages;
+	unsigned long nr_pages;
 	int cpu, err = 0;
 
 	/*
@@ -4640,8 +4641,9 @@ static int rb_cpu_notify(struct notifier_block *self,
 	struct ring_buffer *buffer =
 		container_of(self, struct ring_buffer, cpu_notify);
 	long cpu = (long)hcpu;
-	int cpu_i, nr_pages_same;
-	unsigned int nr_pages;
+	long nr_pages_same;
+	int cpu_i;
+	unsigned long nr_pages;
 
 	switch (action) {
 	case CPU_UP_PREPARE:
-- 
cgit v1.2.3


From 59643d1535eb220668692a5359de22545af579f6 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 13 May 2016 09:34:12 -0400
Subject: ring-buffer: Prevent overflow of size in ring_buffer_resize()

If the size passed to ring_buffer_resize() is greater than MAX_LONG - BUF_PAGE_SIZE
then the DIV_ROUND_UP() will return zero.

Here's the details:

  # echo 18014398509481980 > /sys/kernel/debug/tracing/buffer_size_kb

tracing_entries_write() processes this and converts kb to bytes.

 18014398509481980 << 10 = 18446744073709547520

and this is passed to ring_buffer_resize() as unsigned long size.

 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);

Where DIV_ROUND_UP(a, b) is (a + b - 1)/b

BUF_PAGE_SIZE is 4080 and here

 18446744073709547520 + 4080 - 1 = 18446744073709551599

where 18446744073709551599 is still smaller than 2^64

 2^64 - 18446744073709551599 = 17

But now 18446744073709551599 / 4080 = 4521260802379792

and size = size * 4080 = 18446744073709551360

This is checked to make sure its still greater than 2 * 4080,
which it is.

Then we convert to the number of buffer pages needed.

 nr_page = DIV_ROUND_UP(size, BUF_PAGE_SIZE)

but this time size is 18446744073709551360 and

 2^64 - (18446744073709551360 + 4080 - 1) = -3823

Thus it overflows and the resulting number is less than 4080, which makes

  3823 / 4080 = 0

an nr_pages is set to this. As we already checked against the minimum that
nr_pages may be, this causes the logic to fail as well, and we crash the
kernel.

There's no reason to have the two DIV_ROUND_UP() (that's just result of
historical code changes), clean up the code and fix this bug.

Cc: stable@vger.kernel.org # 3.5+
Fixes: 83f40318dab00 ("ring-buffer: Make removal of ring buffer pages atomic")
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 99d64cd58c52..9c143739b8d7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1657,14 +1657,13 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
 	    !cpumask_test_cpu(cpu_id, buffer->cpumask))
 		return size;
 
-	size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
-	size *= BUF_PAGE_SIZE;
+	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
 
 	/* we need a minimum of two pages */
-	if (size < BUF_PAGE_SIZE * 2)
-		size = BUF_PAGE_SIZE * 2;
+	if (nr_pages < 2)
+		nr_pages = 2;
 
-	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+	size = nr_pages * BUF_PAGE_SIZE;
 
 	/*
 	 * Don't succeed if resizing is disabled, as a reader might be
-- 
cgit v1.2.3


From 04cafed7fc19a8010771c788708ac97c405fc3de Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 12 May 2016 13:57:45 +0200
Subject: locking/rwsem: Fix down_write_killable()

The new signal_pending exit path in __rwsem_down_write_failed_common()
was fingered as breaking his kernel by Tetsuo Handa.

Upon inspection it was found that there are two things wrong with it;

 - it forgets to remove WAITING_BIAS if it leaves the list empty, or
 - it forgets to wake further waiters that were blocked on the now
   removed waiter.

Especially the first issue causes new lock attempts to block and stall
indefinitely, as the code assumes that pending waiters mean there is
an owner that will wake when it releases the lock.

Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Tested-by: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Waiman Long <Waiman.Long@hpe.com>
Link: http://lkml.kernel.org/r/20160512115745.GP3192@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rwsem-xadd.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index df4dcb883b50..09e30c6225e5 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -487,23 +487,32 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
 
 		/* Block until there are no active lockers. */
 		do {
-			if (signal_pending_state(state, current)) {
-				raw_spin_lock_irq(&sem->wait_lock);
-				ret = ERR_PTR(-EINTR);
-				goto out;
-			}
+			if (signal_pending_state(state, current))
+				goto out_nolock;
+
 			schedule();
 			set_current_state(state);
 		} while ((count = sem->count) & RWSEM_ACTIVE_MASK);
 
 		raw_spin_lock_irq(&sem->wait_lock);
 	}
-out:
 	__set_current_state(TASK_RUNNING);
 	list_del(&waiter.list);
 	raw_spin_unlock_irq(&sem->wait_lock);
 
 	return ret;
+
+out_nolock:
+	__set_current_state(TASK_RUNNING);
+	raw_spin_lock_irq(&sem->wait_lock);
+	list_del(&waiter.list);
+	if (list_empty(&sem->wait_list))
+		rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem);
+	else
+		__rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+	raw_spin_unlock_irq(&sem->wait_lock);
+
+	return ERR_PTR(-EINTR);
 }
 
 __visible struct rw_semaphore * __sched
-- 
cgit v1.2.3


From 4936e3528e3e272c567fe4ff0abb7ce3e1500575 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:26 +0200
Subject: bpf: minor cleanups in ebpf code

Besides others, remove redundant comments where the code is self
documenting enough, and properly indent various bpf_verifier_ops
and bpf_prog_type_list declarations. Moreover, remove two exports
that actually have no module user.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c |  2 --
 net/core/filter.c | 34 +++++++++++++++-------------------
 2 files changed, 15 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d781b077431f..5313d09d4b62 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -129,14 +129,12 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 
 	return fp;
 }
-EXPORT_SYMBOL_GPL(bpf_prog_realloc);
 
 void __bpf_prog_free(struct bpf_prog *fp)
 {
 	kfree(fp->aux);
 	vfree(fp);
 }
-EXPORT_SYMBOL_GPL(__bpf_prog_free);
 
 #ifdef CONFIG_BPF_JIT
 struct bpf_binary_header *
diff --git a/net/core/filter.c b/net/core/filter.c
index 71c2a1f473ad..ea51b479cf02 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2069,16 +2069,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
-	/* check bounds */
 	if (off < 0 || off >= sizeof(struct __sk_buff))
 		return false;
-
-	/* disallow misaligned access */
+	/* The verifier guarantees that size > 0. */
 	if (off % size != 0)
 		return false;
-
-	/* all __sk_buff fields are __u32 */
-	if (size != 4)
+	if (size != sizeof(__u32))
 		return false;
 
 	return true;
@@ -2097,7 +2093,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case offsetof(struct __sk_buff, cb[0]) ...
-			offsetof(struct __sk_buff, cb[4]):
+		     offsetof(struct __sk_buff, cb[4]):
 			break;
 		default:
 			return false;
@@ -2278,30 +2274,30 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 }
 
 static const struct bpf_verifier_ops sk_filter_ops = {
-	.get_func_proto = sk_filter_func_proto,
-	.is_valid_access = sk_filter_is_valid_access,
-	.convert_ctx_access = bpf_net_convert_ctx_access,
+	.get_func_proto		= sk_filter_func_proto,
+	.is_valid_access	= sk_filter_is_valid_access,
+	.convert_ctx_access	= bpf_net_convert_ctx_access,
 };
 
 static const struct bpf_verifier_ops tc_cls_act_ops = {
-	.get_func_proto = tc_cls_act_func_proto,
-	.is_valid_access = tc_cls_act_is_valid_access,
-	.convert_ctx_access = bpf_net_convert_ctx_access,
+	.get_func_proto		= tc_cls_act_func_proto,
+	.is_valid_access	= tc_cls_act_is_valid_access,
+	.convert_ctx_access	= bpf_net_convert_ctx_access,
 };
 
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
-	.ops = &sk_filter_ops,
-	.type = BPF_PROG_TYPE_SOCKET_FILTER,
+	.ops	= &sk_filter_ops,
+	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
 };
 
 static struct bpf_prog_type_list sched_cls_type __read_mostly = {
-	.ops = &tc_cls_act_ops,
-	.type = BPF_PROG_TYPE_SCHED_CLS,
+	.ops	= &tc_cls_act_ops,
+	.type	= BPF_PROG_TYPE_SCHED_CLS,
 };
 
 static struct bpf_prog_type_list sched_act_type __read_mostly = {
-	.ops = &tc_cls_act_ops,
-	.type = BPF_PROG_TYPE_SCHED_ACT,
+	.ops	= &tc_cls_act_ops,
+	.type	= BPF_PROG_TYPE_SCHED_ACT,
 };
 
 static int __init register_sk_filter_ops(void)
-- 
cgit v1.2.3


From c237ee5eb33bf19fe0591c04ff8db19da7323a83 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:30 +0200
Subject: bpf: add bpf_patch_insn_single helper

Move the functionality to patch instructions out of the verifier
code and into the core as the new bpf_patch_insn_single() helper
will be needed later on for blinding as well. No changes in
functionality.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h |  3 +++
 kernel/bpf/core.c      | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c  | 53 +++++++------------------------------
 3 files changed, 83 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4ff0e647598f..c4aae496f376 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -495,6 +495,9 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 void bpf_int_jit_compile(struct bpf_prog *fp);
 bool bpf_helper_changes_skb_data(void *func);
 
+struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
+				       const struct bpf_insn *patch, u32 len);
+
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 5313d09d4b62..49b5538a5301 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -136,6 +136,77 @@ void __bpf_prog_free(struct bpf_prog *fp)
 	vfree(fp);
 }
 
+static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
+{
+	return BPF_CLASS(insn->code) == BPF_JMP  &&
+	       /* Call and Exit are both special jumps with no
+		* target inside the BPF instruction image.
+		*/
+	       BPF_OP(insn->code) != BPF_CALL &&
+	       BPF_OP(insn->code) != BPF_EXIT;
+}
+
+static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta)
+{
+	struct bpf_insn *insn = prog->insnsi;
+	u32 i, insn_cnt = prog->len;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (!bpf_is_jmp_and_has_target(insn))
+			continue;
+
+		/* Adjust offset of jmps if we cross boundaries. */
+		if (i < pos && i + insn->off + 1 > pos)
+			insn->off += delta;
+		else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
+			insn->off -= delta;
+	}
+}
+
+struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
+				       const struct bpf_insn *patch, u32 len)
+{
+	u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
+	struct bpf_prog *prog_adj;
+
+	/* Since our patchlet doesn't expand the image, we're done. */
+	if (insn_delta == 0) {
+		memcpy(prog->insnsi + off, patch, sizeof(*patch));
+		return prog;
+	}
+
+	insn_adj_cnt = prog->len + insn_delta;
+
+	/* Several new instructions need to be inserted. Make room
+	 * for them. Likely, there's no need for a new allocation as
+	 * last page could have large enough tailroom.
+	 */
+	prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
+				    GFP_USER);
+	if (!prog_adj)
+		return NULL;
+
+	prog_adj->len = insn_adj_cnt;
+
+	/* Patching happens in 3 steps:
+	 *
+	 * 1) Move over tail of insnsi from next instruction onwards,
+	 *    so we can patch the single target insn with one or more
+	 *    new ones (patching is always from 1 to n insns, n > 0).
+	 * 2) Inject new instructions at the target location.
+	 * 3) Adjust branch offsets if necessary.
+	 */
+	insn_rest = insn_adj_cnt - off - len;
+
+	memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1,
+		sizeof(*patch) * insn_rest);
+	memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len);
+
+	bpf_adj_branches(prog_adj, off, insn_delta);
+
+	return prog_adj;
+}
+
 #ifdef CONFIG_BPF_JIT
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 84bff68cf80e..a08d66215245 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2587,26 +2587,6 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
 			insn->src_reg = 0;
 }
 
-static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
-{
-	struct bpf_insn *insn = prog->insnsi;
-	int insn_cnt = prog->len;
-	int i;
-
-	for (i = 0; i < insn_cnt; i++, insn++) {
-		if (BPF_CLASS(insn->code) != BPF_JMP ||
-		    BPF_OP(insn->code) == BPF_CALL ||
-		    BPF_OP(insn->code) == BPF_EXIT)
-			continue;
-
-		/* adjust offset of jmps if necessary */
-		if (i < pos && i + insn->off + 1 > pos)
-			insn->off += delta;
-		else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
-			insn->off -= delta;
-	}
-}
-
 /* convert load instructions that access fields of 'struct __sk_buff'
  * into sequence of instructions that access fields of 'struct sk_buff'
  */
@@ -2616,14 +2596,15 @@ static int convert_ctx_accesses(struct verifier_env *env)
 	int insn_cnt = env->prog->len;
 	struct bpf_insn insn_buf[16];
 	struct bpf_prog *new_prog;
-	u32 cnt;
-	int i;
 	enum bpf_access_type type;
+	int i;
 
 	if (!env->prog->aux->ops->convert_ctx_access)
 		return 0;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
+		u32 insn_delta, cnt;
+
 		if (insn->code == (BPF_LDX | BPF_MEM | BPF_W))
 			type = BPF_READ;
 		else if (insn->code == (BPF_STX | BPF_MEM | BPF_W))
@@ -2645,34 +2626,18 @@ static int convert_ctx_accesses(struct verifier_env *env)
 			return -EINVAL;
 		}
 
-		if (cnt == 1) {
-			memcpy(insn, insn_buf, sizeof(*insn));
-			continue;
-		}
-
-		/* several new insns need to be inserted. Make room for them */
-		insn_cnt += cnt - 1;
-		new_prog = bpf_prog_realloc(env->prog,
-					    bpf_prog_size(insn_cnt),
-					    GFP_USER);
+		new_prog = bpf_patch_insn_single(env->prog, i, insn_buf, cnt);
 		if (!new_prog)
 			return -ENOMEM;
 
-		new_prog->len = insn_cnt;
-
-		memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1,
-			sizeof(*insn) * (insn_cnt - i - cnt));
-
-		/* copy substitute insns in place of load instruction */
-		memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt);
-
-		/* adjust branches in the whole program */
-		adjust_branches(new_prog, i, cnt - 1);
+		insn_delta = cnt - 1;
 
 		/* keep walking new program and skip insns we just inserted */
 		env->prog = new_prog;
-		insn = new_prog->insnsi + i + cnt - 1;
-		i += cnt - 1;
+		insn      = new_prog->insnsi + i + insn_delta;
+
+		insn_cnt += insn_delta;
+		i        += insn_delta;
 	}
 
 	return 0;
-- 
cgit v1.2.3


From d1c55ab5e41fcd72cb0a8bef86d3f652ad9ad9f5 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:31 +0200
Subject: bpf: prepare bpf_int_jit_compile/bpf_prog_select_runtime apis

Since the blinding is strictly only called from inside eBPF JITs,
we need to change signatures for bpf_int_jit_compile() and
bpf_prog_select_runtime() first in order to prepare that the
eBPF program we're dealing with can change underneath. Hence,
for call sites, we need to return the latest prog. No functional
change in this patch.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm64/net/bpf_jit_comp.c |  7 ++++---
 arch/s390/net/bpf_jit_comp.c  |  8 +++++---
 arch/x86/net/bpf_jit_comp.c   |  7 ++++---
 include/linux/filter.h        |  5 +++--
 kernel/bpf/core.c             | 18 ++++++++++++++----
 kernel/bpf/syscall.c          |  2 +-
 lib/test_bpf.c                |  5 ++++-
 net/core/filter.c             |  6 +++++-
 8 files changed, 40 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index ef35e866caf7..dd428807cb30 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -762,7 +762,7 @@ void bpf_jit_compile(struct bpf_prog *prog)
 	/* Nothing to do here. We support Internal BPF. */
 }
 
-void bpf_int_jit_compile(struct bpf_prog *prog)
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
 	struct bpf_binary_header *header;
 	struct jit_ctx ctx;
@@ -770,14 +770,14 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
 	u8 *image_ptr;
 
 	if (!bpf_jit_enable)
-		return;
+		return prog;
 
 	memset(&ctx, 0, sizeof(ctx));
 	ctx.prog = prog;
 
 	ctx.offset = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
 	if (ctx.offset == NULL)
-		return;
+		return prog;
 
 	/* 1. Initial fake pass to compute ctx->idx. */
 
@@ -828,6 +828,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
 	prog->jited = 1;
 out:
 	kfree(ctx.offset);
+	return prog;
 }
 
 void bpf_jit_free(struct bpf_prog *prog)
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 3c0bfc1f2694..fcf301a889e7 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1262,18 +1262,19 @@ void bpf_jit_compile(struct bpf_prog *fp)
 /*
  * Compile eBPF program "fp"
  */
-void bpf_int_jit_compile(struct bpf_prog *fp)
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 {
 	struct bpf_binary_header *header;
 	struct bpf_jit jit;
 	int pass;
 
 	if (!bpf_jit_enable)
-		return;
+		return fp;
+
 	memset(&jit, 0, sizeof(jit));
 	jit.addrs = kcalloc(fp->len + 1, sizeof(*jit.addrs), GFP_KERNEL);
 	if (jit.addrs == NULL)
-		return;
+		return fp;
 	/*
 	 * Three initial passes:
 	 *   - 1/2: Determine clobbered registers
@@ -1305,6 +1306,7 @@ void bpf_int_jit_compile(struct bpf_prog *fp)
 	}
 free_addrs:
 	kfree(jit.addrs);
+	return fp;
 }
 
 /*
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index f5bfd4fd28dd..6b2d23ea3590 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1073,7 +1073,7 @@ void bpf_jit_compile(struct bpf_prog *prog)
 {
 }
 
-void bpf_int_jit_compile(struct bpf_prog *prog)
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
 	struct bpf_binary_header *header = NULL;
 	int proglen, oldproglen = 0;
@@ -1084,11 +1084,11 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
 	int i;
 
 	if (!bpf_jit_enable)
-		return;
+		return prog;
 
 	addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
 	if (!addrs)
-		return;
+		return prog;
 
 	/* Before first pass, make a rough estimation of addrs[]
 	 * each bpf instruction is translated to less than 64 bytes
@@ -1140,6 +1140,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
 	}
 out:
 	kfree(addrs);
+	return prog;
 }
 
 void bpf_jit_free(struct bpf_prog *fp)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index c4aae496f376..891852cf7716 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -458,7 +458,7 @@ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp)
 
 int sk_filter(struct sock *sk, struct sk_buff *skb);
 
-int bpf_prog_select_runtime(struct bpf_prog *fp);
+struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);
 void bpf_prog_free(struct bpf_prog *fp);
 
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
@@ -492,7 +492,8 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 
 u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
-void bpf_int_jit_compile(struct bpf_prog *fp);
+
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 bool bpf_helper_changes_skb_data(void *func);
 
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 49b5538a5301..70f0821aca47 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -761,15 +761,22 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
 /**
  *	bpf_prog_select_runtime - select exec runtime for BPF program
  *	@fp: bpf_prog populated with internal BPF program
+ *	@err: pointer to error variable
  *
  * Try to JIT eBPF program, if JIT is not available, use interpreter.
  * The BPF program will be executed via BPF_PROG_RUN() macro.
  */
-int bpf_prog_select_runtime(struct bpf_prog *fp)
+struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 {
 	fp->bpf_func = (void *) __bpf_prog_run;
 
-	bpf_int_jit_compile(fp);
+	/* eBPF JITs can rewrite the program in case constant
+	 * blinding is active. However, in case of error during
+	 * blinding, bpf_int_jit_compile() must always return a
+	 * valid program, which in this case would simply not
+	 * be JITed, but falls back to the interpreter.
+	 */
+	fp = bpf_int_jit_compile(fp);
 	bpf_prog_lock_ro(fp);
 
 	/* The tail call compatibility check can only be done at
@@ -777,7 +784,9 @@ int bpf_prog_select_runtime(struct bpf_prog *fp)
 	 * with JITed or non JITed program concatenations and not
 	 * all eBPF JITs might immediately support all features.
 	 */
-	return bpf_check_tail_call(fp);
+	*err = bpf_check_tail_call(fp);
+
+	return fp;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
@@ -859,8 +868,9 @@ const struct bpf_func_proto bpf_tail_call_proto = {
 };
 
 /* For classic BPF JITs that don't implement bpf_int_jit_compile(). */
-void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog)
 {
+	return prog;
 }
 
 bool __weak bpf_helper_changes_skb_data(void *func)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cf5e9f7ad13a..46ecce4b79ed 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -762,7 +762,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 	fixup_bpf_calls(prog);
 
 	/* eBPF program is ready to be JITed */
-	err = bpf_prog_select_runtime(prog);
+	prog = bpf_prog_select_runtime(prog, &err);
 	if (err < 0)
 		goto free_used_maps;
 
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index 8f22fbedc3a6..93f45011a59d 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -5621,7 +5621,10 @@ static struct bpf_prog *generate_filter(int which, int *err)
 		fp->type = BPF_PROG_TYPE_SOCKET_FILTER;
 		memcpy(fp->insnsi, fptr, fp->len * sizeof(struct bpf_insn));
 
-		bpf_prog_select_runtime(fp);
+		/* We cannot error here as we don't need type compatibility
+		 * checks.
+		 */
+		fp = bpf_prog_select_runtime(fp, err);
 		break;
 	}
 
diff --git a/net/core/filter.c b/net/core/filter.c
index ea51b479cf02..68adb5f52110 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -994,7 +994,11 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
 		 */
 		goto out_err_free;
 
-	bpf_prog_select_runtime(fp);
+	/* We are guaranteed to never error here with cBPF to eBPF
+	 * transitions, since there's no issue with type compatibility
+	 * checks on program arrays.
+	 */
+	fp = bpf_prog_select_runtime(fp, &err);
 
 	kfree(old_prog);
 	return fp;
-- 
cgit v1.2.3


From 4f3446bb809f20ad56cadf712e6006815ae7a8f9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 13 May 2016 19:08:32 +0200
Subject: bpf: add generic constant blinding for use in jits

This work adds a generic facility for use from eBPF JIT compilers
that allows for further hardening of JIT generated images through
blinding constants. In response to the original work on BPF JIT
spraying published by Keegan McAllister [1], most BPF JITs were
changed to make images read-only and start at a randomized offset
in the page, where the rest was filled with trap instructions. We
have this nowadays in x86, arm, arm64 and s390 JIT compilers.
Additionally, later work also made eBPF interpreter images read
only for kernels supporting DEBUG_SET_MODULE_RONX, that is, x86,
arm, arm64 and s390 archs as well currently. This is done by
default for mentioned JITs when JITing is enabled. Furthermore,
we had a generic and configurable constant blinding facility on our
todo for quite some time now to further make spraying harder, and
first implementation since around netconf 2016.

We found that for systems where untrusted users can load cBPF/eBPF
code where JIT is enabled, start offset randomization helps a bit
to make jumps into crafted payload harder, but in case where larger
programs that cross page boundary are injected, we again have some
part of the program opcodes at a page start offset. With improved
guessing and more reliable payload injection, chances can increase
to jump into such payload. Elena Reshetova recently wrote a test
case for it [2, 3]. Moreover, eBPF comes with 64 bit constants, which
can leave some more room for payloads. Note that for all this,
additional bugs in the kernel are still required to make the jump
(and of course to guess right, to not jump into a trap) and naturally
the JIT must be enabled, which is disabled by default.

For helping mitigation, the general idea is to provide an option
bpf_jit_harden that admins can tweak along with bpf_jit_enable, so
that for cases where JIT should be enabled for performance reasons,
the generated image can be further hardened with blinding constants
for unpriviledged users (bpf_jit_harden == 1), with trading off
performance for these, but not for privileged ones. We also added
the option of blinding for all users (bpf_jit_harden == 2), which
is quite helpful for testing f.e. with test_bpf.ko. There are no
further e.g. hardening levels of bpf_jit_harden switch intended,
rationale is to have it dead simple to use as on/off. Since this
functionality would need to be duplicated over and over for JIT
compilers to use, which are already complex enough, we provide a
generic eBPF byte-code level based blinding implementation, which is
then just transparently JITed. JIT compilers need to make only a few
changes to integrate this facility and can be migrated one by one.

This option is for eBPF JITs and will be used in x86, arm64, s390
without too much effort, and soon ppc64 JITs, thus that native eBPF
can be blinded as well as cBPF to eBPF migrations, so that both can
be covered with a single implementation. The rule for JITs is that
bpf_jit_blind_constants() must be called from bpf_int_jit_compile(),
and in case blinding is disabled, we follow normally with JITing the
passed program. In case blinding is enabled and we fail during the
process of blinding itself, we must return with the interpreter.
Similarly, in case the JITing process after the blinding failed, we
return normally to the interpreter with the non-blinded code. Meaning,
interpreter doesn't change in any way and operates on eBPF code as
usual. For doing this pre-JIT blinding step, we need to make use of
a helper/auxiliary register, here BPF_REG_AX. This is strictly internal
to the JIT and not in any way part of the eBPF architecture. Just like
in the same way as JITs internally make use of some helper registers
when emitting code, only that here the helper register is one
abstraction level higher in eBPF bytecode, but nevertheless in JIT
phase. That helper register is needed since f.e. manually written
program can issue loads to all registers of eBPF architecture.

The core concept with the additional register is: blind out all 32
and 64 bit constants by converting BPF_K based instructions into a
small sequence from K_VAL into ((RND ^ K_VAL) ^ RND). Therefore, this
is transformed into: BPF_REG_AX := (RND ^ K_VAL), BPF_REG_AX ^= RND,
and REG <OP> BPF_REG_AX, so actual operation on the target register
is translated from BPF_K into BPF_X one that is operating on
BPF_REG_AX's content. During rewriting phase when blinding, RND is
newly generated via prandom_u32() for each processed instruction.
64 bit loads are split into two 32 bit loads to make translation and
patching not too complex. Only basic thing required by JITs is to
call the helper bpf_jit_blind_constants()/bpf_jit_prog_release_other()
pair, and to map BPF_REG_AX into an unused register.

Small bpf_jit_disasm extract from [2] when applied to x86 JIT:

echo 0 > /proc/sys/net/core/bpf_jit_harden

  ffffffffa034f5e9 + <x>:
  [...]
  39:   mov    $0xa8909090,%eax
  3e:   mov    $0xa8909090,%eax
  43:   mov    $0xa8ff3148,%eax
  48:   mov    $0xa89081b4,%eax
  4d:   mov    $0xa8900bb0,%eax
  52:   mov    $0xa810e0c1,%eax
  57:   mov    $0xa8908eb4,%eax
  5c:   mov    $0xa89020b0,%eax
  [...]

echo 1 > /proc/sys/net/core/bpf_jit_harden

  ffffffffa034f1e5 + <x>:
  [...]
  39:   mov    $0xe1192563,%r10d
  3f:   xor    $0x4989b5f3,%r10d
  46:   mov    %r10d,%eax
  49:   mov    $0xb8296d93,%r10d
  4f:   xor    $0x10b9fd03,%r10d
  56:   mov    %r10d,%eax
  59:   mov    $0x8c381146,%r10d
  5f:   xor    $0x24c7200e,%r10d
  66:   mov    %r10d,%eax
  69:   mov    $0xeb2a830e,%r10d
  6f:   xor    $0x43ba02ba,%r10d
  76:   mov    %r10d,%eax
  79:   mov    $0xd9730af,%r10d
  7f:   xor    $0xa5073b1f,%r10d
  86:   mov    %r10d,%eax
  89:   mov    $0x9a45662b,%r10d
  8f:   xor    $0x325586ea,%r10d
  96:   mov    %r10d,%eax
  [...]

As can be seen, original constants that carry payload are hidden
when enabled, actual operations are transformed from constant-based
to register-based ones, making jumps into constants ineffective.
Above extract/example uses single BPF load instruction over and
over, but of course all instructions with constants are blinded.

Performance wise, JIT with blinding performs a bit slower than just
JIT and faster than interpreter case. This is expected, since we
still get all the performance benefits from JITing and in normal
use-cases not every single instruction needs to be blinded. Summing
up all 296 test cases averaged over multiple runs from test_bpf.ko
suite, interpreter was 55% slower than JIT only and JIT with blinding
was 8% slower than JIT only. Since there are also some extremes in
the test suite, I expect for ordinary workloads that the performance
for the JIT with blinding case is even closer to JIT only case,
f.e. nmap test case from suite has averaged timings in ns 29 (JIT),
35 (+ blinding), and 151 (interpreter).

BPF test suite, seccomp test suite, eBPF sample code and various
bigger networking eBPF programs have been tested with this and were
running fine. For testing purposes, I also adapted interpreter and
redirected blinded eBPF image to interpreter and also here all tests
pass.

  [1] http://mainisusuallyafunction.blogspot.com/2012/11/attacking-hardened-linux-systems-with.html
  [2] https://github.com/01org/jit-spray-poc-for-ksp/
  [3] http://www.openwall.com/lists/kernel-hardening/2016/05/03/5

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Elena Reshetova <elena.reshetova@intel.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt |  11 +++
 include/linux/filter.h       |  42 +++++++++
 kernel/bpf/core.c            | 203 +++++++++++++++++++++++++++++++++++++++++++
 net/Kconfig                  |   7 +-
 net/core/sysctl_net_core.c   |   9 ++
 5 files changed, 270 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index 809ab6efcc74..f0480f7ea740 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -43,6 +43,17 @@ Values :
 	1 - enable the JIT
 	2 - enable the JIT and ask the compiler to emit traces on kernel log.
 
+bpf_jit_harden
+--------------
+
+This enables hardening for the Berkeley Packet Filter Just in Time compiler.
+Supported are eBPF JIT backends. Enabling hardening trades off performance,
+but can mitigate JIT spraying.
+Values :
+	0 - disable JIT hardening (default value)
+	1 - enable JIT hardening for unprivileged users only
+	2 - enable JIT hardening for all users
+
 dev_weight
 --------------
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 891852cf7716..6fc31ef1da2d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -13,6 +13,8 @@
 #include <linux/printk.h>
 #include <linux/workqueue.h>
 #include <linux/sched.h>
+#include <linux/capability.h>
+
 #include <net/sch_generic.h>
 
 #include <asm/cacheflush.h>
@@ -42,6 +44,15 @@ struct bpf_prog_aux;
 #define BPF_REG_X	BPF_REG_7
 #define BPF_REG_TMP	BPF_REG_8
 
+/* Kernel hidden auxiliary/helper register for hardening step.
+ * Only used by eBPF JITs. It's nothing more than a temporary
+ * register that JITs use internally, only that here it's part
+ * of eBPF instructions that have been rewritten for blinding
+ * constants. See JIT pre-step in bpf_jit_blind_constants().
+ */
+#define BPF_REG_AX		MAX_BPF_REG
+#define MAX_BPF_JIT_REG		(MAX_BPF_REG + 1)
+
 /* BPF program can access up to 512 bytes of stack space. */
 #define MAX_BPF_STACK	512
 
@@ -501,6 +512,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 
 #ifdef CONFIG_BPF_JIT
 extern int bpf_jit_enable;
+extern int bpf_jit_harden;
 
 typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
 
@@ -513,6 +525,9 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr);
 void bpf_jit_compile(struct bpf_prog *fp);
 void bpf_jit_free(struct bpf_prog *fp);
 
+struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp);
+void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other);
+
 static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
 				u32 pass, void *image)
 {
@@ -523,6 +538,33 @@ static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
 		print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
 			       16, 1, image, proglen, false);
 }
+
+static inline bool bpf_jit_is_ebpf(void)
+{
+# ifdef CONFIG_HAVE_EBPF_JIT
+	return true;
+# else
+	return false;
+# endif
+}
+
+static inline bool bpf_jit_blinding_enabled(void)
+{
+	/* These are the prerequisites, should someone ever have the
+	 * idea to call blinding outside of them, we make sure to
+	 * bail out.
+	 */
+	if (!bpf_jit_is_ebpf())
+		return false;
+	if (!bpf_jit_enable)
+		return false;
+	if (!bpf_jit_harden)
+		return false;
+	if (bpf_jit_harden == 1 && capable(CAP_SYS_ADMIN))
+		return false;
+
+	return true;
+}
 #else
 static inline void bpf_jit_compile(struct bpf_prog *fp)
 {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 70f0821aca47..f1e8a0def99b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -243,6 +243,209 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 {
 	module_memfree(hdr);
 }
+
+int bpf_jit_harden __read_mostly;
+
+static int bpf_jit_blind_insn(const struct bpf_insn *from,
+			      const struct bpf_insn *aux,
+			      struct bpf_insn *to_buff)
+{
+	struct bpf_insn *to = to_buff;
+	u32 imm_rnd = prandom_u32();
+	s16 off;
+
+	BUILD_BUG_ON(BPF_REG_AX  + 1 != MAX_BPF_JIT_REG);
+	BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
+
+	if (from->imm == 0 &&
+	    (from->code == (BPF_ALU   | BPF_MOV | BPF_K) ||
+	     from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
+		*to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg);
+		goto out;
+	}
+
+	switch (from->code) {
+	case BPF_ALU | BPF_ADD | BPF_K:
+	case BPF_ALU | BPF_SUB | BPF_K:
+	case BPF_ALU | BPF_AND | BPF_K:
+	case BPF_ALU | BPF_OR  | BPF_K:
+	case BPF_ALU | BPF_XOR | BPF_K:
+	case BPF_ALU | BPF_MUL | BPF_K:
+	case BPF_ALU | BPF_MOV | BPF_K:
+	case BPF_ALU | BPF_DIV | BPF_K:
+	case BPF_ALU | BPF_MOD | BPF_K:
+		*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX);
+		break;
+
+	case BPF_ALU64 | BPF_ADD | BPF_K:
+	case BPF_ALU64 | BPF_SUB | BPF_K:
+	case BPF_ALU64 | BPF_AND | BPF_K:
+	case BPF_ALU64 | BPF_OR  | BPF_K:
+	case BPF_ALU64 | BPF_XOR | BPF_K:
+	case BPF_ALU64 | BPF_MUL | BPF_K:
+	case BPF_ALU64 | BPF_MOV | BPF_K:
+	case BPF_ALU64 | BPF_DIV | BPF_K:
+	case BPF_ALU64 | BPF_MOD | BPF_K:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX);
+		break;
+
+	case BPF_JMP | BPF_JEQ  | BPF_K:
+	case BPF_JMP | BPF_JNE  | BPF_K:
+	case BPF_JMP | BPF_JGT  | BPF_K:
+	case BPF_JMP | BPF_JGE  | BPF_K:
+	case BPF_JMP | BPF_JSGT | BPF_K:
+	case BPF_JMP | BPF_JSGE | BPF_K:
+	case BPF_JMP | BPF_JSET | BPF_K:
+		/* Accommodate for extra offset in case of a backjump. */
+		off = from->off;
+		if (off < 0)
+			off -= 2;
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off);
+		break;
+
+	case BPF_LD | BPF_ABS | BPF_W:
+	case BPF_LD | BPF_ABS | BPF_H:
+	case BPF_LD | BPF_ABS | BPF_B:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
+		break;
+
+	case BPF_LD | BPF_IND | BPF_W:
+	case BPF_LD | BPF_IND | BPF_H:
+	case BPF_LD | BPF_IND | BPF_B:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU32_REG(BPF_ADD, BPF_REG_AX, from->src_reg);
+		*to++ = BPF_LD_IND(from->code, BPF_REG_AX, 0);
+		break;
+
+	case BPF_LD | BPF_IMM | BPF_DW:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
+		*to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX);
+		break;
+	case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
+		*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
+		*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_ALU64_REG(BPF_OR,  aux[0].dst_reg, BPF_REG_AX);
+		break;
+
+	case BPF_ST | BPF_MEM | BPF_DW:
+	case BPF_ST | BPF_MEM | BPF_W:
+	case BPF_ST | BPF_MEM | BPF_H:
+	case BPF_ST | BPF_MEM | BPF_B:
+		*to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
+		*to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+		*to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off);
+		break;
+	}
+out:
+	return to - to_buff;
+}
+
+static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
+					      gfp_t gfp_extra_flags)
+{
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+			  gfp_extra_flags;
+	struct bpf_prog *fp;
+
+	fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
+	if (fp != NULL) {
+		kmemcheck_annotate_bitfield(fp, meta);
+
+		/* aux->prog still points to the fp_other one, so
+		 * when promoting the clone to the real program,
+		 * this still needs to be adapted.
+		 */
+		memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE);
+	}
+
+	return fp;
+}
+
+static void bpf_prog_clone_free(struct bpf_prog *fp)
+{
+	/* aux was stolen by the other clone, so we cannot free
+	 * it from this path! It will be freed eventually by the
+	 * other program on release.
+	 *
+	 * At this point, we don't need a deferred release since
+	 * clone is guaranteed to not be locked.
+	 */
+	fp->aux = NULL;
+	__bpf_prog_free(fp);
+}
+
+void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other)
+{
+	/* We have to repoint aux->prog to self, as we don't
+	 * know whether fp here is the clone or the original.
+	 */
+	fp->aux->prog = fp;
+	bpf_prog_clone_free(fp_other);
+}
+
+struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
+{
+	struct bpf_insn insn_buff[16], aux[2];
+	struct bpf_prog *clone, *tmp;
+	int insn_delta, insn_cnt;
+	struct bpf_insn *insn;
+	int i, rewritten;
+
+	if (!bpf_jit_blinding_enabled())
+		return prog;
+
+	clone = bpf_prog_clone_create(prog, GFP_USER);
+	if (!clone)
+		return ERR_PTR(-ENOMEM);
+
+	insn_cnt = clone->len;
+	insn = clone->insnsi;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		/* We temporarily need to hold the original ld64 insn
+		 * so that we can still access the first part in the
+		 * second blinding run.
+		 */
+		if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) &&
+		    insn[1].code == 0)
+			memcpy(aux, insn, sizeof(aux));
+
+		rewritten = bpf_jit_blind_insn(insn, aux, insn_buff);
+		if (!rewritten)
+			continue;
+
+		tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
+		if (!tmp) {
+			/* Patching may have repointed aux->prog during
+			 * realloc from the original one, so we need to
+			 * fix it up here on error.
+			 */
+			bpf_jit_prog_release_other(prog, clone);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		clone = tmp;
+		insn_delta = rewritten - 1;
+
+		/* Walk new program and skip insns we just inserted. */
+		insn = clone->insnsi + i + insn_delta;
+		insn_cnt += insn_delta;
+		i        += insn_delta;
+	}
+
+	return clone;
+}
 #endif /* CONFIG_BPF_JIT */
 
 /* Base function for offset calculation. Needs to go into .text section,
diff --git a/net/Kconfig b/net/Kconfig
index f7148f24f114..ff40562a782c 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -295,8 +295,11 @@ config BPF_JIT
 	  Berkeley Packet Filter filtering capabilities are normally handled
 	  by an interpreter. This option allows kernel to generate a native
 	  code when filter is loaded in memory. This should speedup
-	  packet sniffing (libpcap/tcpdump). Note : Admin should enable
-	  this feature changing /proc/sys/net/core/bpf_jit_enable
+	  packet sniffing (libpcap/tcpdump).
+
+	  Note, admin should enable this feature changing:
+	  /proc/sys/net/core/bpf_jit_enable
+	  /proc/sys/net/core/bpf_jit_harden (optional)
 
 config NET_FLOW_LIMIT
 	bool
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index a6beb7b6ae55..0df2aa652530 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -294,6 +294,15 @@ static struct ctl_table net_core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec
 	},
+# ifdef CONFIG_HAVE_EBPF_JIT
+	{
+		.procname	= "bpf_jit_harden",
+		.data		= &bpf_jit_harden,
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec,
+	},
+# endif
 #endif
 	{
 		.procname	= "netdev_tstamp_prequeue",
-- 
cgit v1.2.3


From a831100aeefbe6d9f3e47a3e2712f82c042f1f5c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 10 May 2016 16:34:53 -0300
Subject: perf core: Generalize max_stack sysctl handler

So that it can be used for other stack related knobs, such as the
upcoming one to tweak the max number of of contexts per stack sample.

In all those cases we can only change the value if there are no perf
sessions collecting stacks, so they need to grab that mutex, etc.

Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-8t3fk94wuzp8m2z1n4gc0s17@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 kernel/events/callchain.c | 5 +++--
 kernel/sysctl.c           | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index b9325e7dcba1..7fc89939ede9 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -228,7 +228,8 @@ exit_put:
 int perf_event_max_stack_handler(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-	int new_value = sysctl_perf_event_max_stack, ret;
+	int *value = table->data;
+	int new_value = *value, ret;
 	struct ctl_table new_table = *table;
 
 	new_table.data = &new_value;
@@ -240,7 +241,7 @@ int perf_event_max_stack_handler(struct ctl_table *table, int write,
 	if (atomic_read(&nr_callchain_events))
 		ret = -EBUSY;
 	else
-		sysctl_perf_event_max_stack = new_value;
+		*value = new_value;
 
 	mutex_unlock(&callchain_mutex);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8b318663525..0ec6907a16b3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1149,7 +1149,7 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.procname	= "perf_event_max_stack",
-		.data		= NULL, /* filled in by handler */
+		.data		= &sysctl_perf_event_max_stack,
 		.maxlen		= sizeof(sysctl_perf_event_max_stack),
 		.mode		= 0644,
 		.proc_handler	= perf_event_max_stack_handler,
-- 
cgit v1.2.3


From cfbcf468454ab4b20f0b4b62da51920b99fdb19e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 28 Apr 2016 12:30:53 -0300
Subject: perf core: Pass max stack as a perf_callchain_entry context

This makes perf_callchain_{user,kernel}() receive the max stack
as context for the perf_callchain_entry, instead of accessing
the global sysctl_perf_event_max_stack.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/n/tip-kolmn1yo40p7jhswxwrc7rrd@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 arch/arc/kernel/perf_event.c       |  6 +++---
 arch/arm/kernel/perf_callchain.c   | 10 +++++-----
 arch/arm64/kernel/perf_callchain.c | 14 +++++++-------
 arch/metag/kernel/perf_callchain.c | 10 +++++-----
 arch/mips/kernel/perf_event.c      | 12 ++++++------
 arch/powerpc/perf/callchain.c      | 14 +++++++-------
 arch/s390/kernel/perf_event.c      |  4 ++--
 arch/sh/kernel/perf_callchain.c    |  4 ++--
 arch/sparc/kernel/perf_event.c     | 14 +++++++-------
 arch/tile/kernel/perf_event.c      |  6 +++---
 arch/x86/events/core.c             | 14 +++++++-------
 arch/xtensa/kernel/perf_event.c    | 10 +++++-----
 include/linux/perf_event.h         | 16 +++++++++++-----
 kernel/bpf/stackmap.c              |  3 ++-
 kernel/events/callchain.c          | 20 ++++++++++++--------
 15 files changed, 84 insertions(+), 73 deletions(-)

(limited to 'kernel')

diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c
index 8b134cfe5e1f..6fd48021324b 100644
--- a/arch/arc/kernel/perf_event.c
+++ b/arch/arc/kernel/perf_event.c
@@ -48,7 +48,7 @@ struct arc_callchain_trace {
 static int callchain_trace(unsigned int addr, void *data)
 {
 	struct arc_callchain_trace *ctrl = data;
-	struct perf_callchain_entry *entry = ctrl->perf_stuff;
+	struct perf_callchain_entry_ctx *entry = ctrl->perf_stuff;
 	perf_callchain_store(entry, addr);
 
 	if (ctrl->depth++ < 3)
@@ -58,7 +58,7 @@ static int callchain_trace(unsigned int addr, void *data)
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	struct arc_callchain_trace ctrl = {
 		.depth = 0,
@@ -69,7 +69,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	/*
 	 * User stack can't be unwound trivially with kernel dwarf unwinder
diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index 27563befa8a2..bc552e813e7b 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -31,7 +31,7 @@ struct frame_tail {
  */
 static struct frame_tail __user *
 user_backtrace(struct frame_tail __user *tail,
-	       struct perf_callchain_entry *entry)
+	       struct perf_callchain_entry_ctx *entry)
 {
 	struct frame_tail buftail;
 	unsigned long err;
@@ -59,7 +59,7 @@ user_backtrace(struct frame_tail __user *tail,
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	struct frame_tail __user *tail;
 
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-	while ((entry->nr < sysctl_perf_event_max_stack) &&
+	while ((entry->entry->nr < entry->max_stack) &&
 	       tail && !((unsigned long)tail & 0x3))
 		tail = user_backtrace(tail, entry);
 }
@@ -89,13 +89,13 @@ static int
 callchain_trace(struct stackframe *fr,
 		void *data)
 {
-	struct perf_callchain_entry *entry = data;
+	struct perf_callchain_entry_ctx *entry = data;
 	perf_callchain_store(entry, fr->pc);
 	return 0;
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	struct stackframe fr;
 
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index 32c3c6e70119..0d60150057cf 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -31,7 +31,7 @@ struct frame_tail {
  */
 static struct frame_tail __user *
 user_backtrace(struct frame_tail __user *tail,
-	       struct perf_callchain_entry *entry)
+	       struct perf_callchain_entry_ctx *entry)
 {
 	struct frame_tail buftail;
 	unsigned long err;
@@ -76,7 +76,7 @@ struct compat_frame_tail {
 
 static struct compat_frame_tail __user *
 compat_user_backtrace(struct compat_frame_tail __user *tail,
-		      struct perf_callchain_entry *entry)
+		      struct perf_callchain_entry_ctx *entry)
 {
 	struct compat_frame_tail buftail;
 	unsigned long err;
@@ -106,7 +106,7 @@ compat_user_backtrace(struct compat_frame_tail __user *tail,
 }
 #endif /* CONFIG_COMPAT */
 
-void perf_callchain_user(struct perf_callchain_entry *entry,
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 			 struct pt_regs *regs)
 {
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct frame_tail __user *)regs->regs[29];
 
-		while (entry->nr < sysctl_perf_event_max_stack &&
+		while (entry->entry->nr < entry->max_stack &&
 		       tail && !((unsigned long)tail & 0xf))
 			tail = user_backtrace(tail, entry);
 	} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
 
 		tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-		while ((entry->nr < sysctl_perf_event_max_stack) &&
+		while ((entry->entry->nr < entry->max_stack) &&
 			tail && !((unsigned long)tail & 0x3))
 			tail = compat_user_backtrace(tail, entry);
 #endif
@@ -146,12 +146,12 @@ void perf_callchain_user(struct perf_callchain_entry *entry,
  */
 static int callchain_trace(struct stackframe *frame, void *data)
 {
-	struct perf_callchain_entry *entry = data;
+	struct perf_callchain_entry_ctx *entry = data;
 	perf_callchain_store(entry, frame->pc);
 	return 0;
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			   struct pt_regs *regs)
 {
 	struct stackframe frame;
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index 252abc12a5a3..b3261a98b15b 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -29,7 +29,7 @@ static bool is_valid_call(unsigned long calladdr)
 
 static struct metag_frame __user *
 user_backtrace(struct metag_frame __user *user_frame,
-	       struct perf_callchain_entry *entry)
+	       struct perf_callchain_entry_ctx *entry)
 {
 	struct metag_frame frame;
 	unsigned long calladdr;
@@ -56,7 +56,7 @@ user_backtrace(struct metag_frame __user *user_frame,
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	unsigned long sp = regs->ctx.AX[0].U0;
 	struct metag_frame __user *frame;
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 
 	--frame;
 
-	while ((entry->nr < sysctl_perf_event_max_stack) && frame)
+	while ((entry->entry->nr < entry->max_stack) && frame)
 		frame = user_backtrace(frame, entry);
 }
 
@@ -78,13 +78,13 @@ static int
 callchain_trace(struct stackframe *fr,
 		void *data)
 {
-	struct perf_callchain_entry *entry = data;
+	struct perf_callchain_entry_ctx *entry = data;
 	perf_callchain_store(entry, fr->pc);
 	return 0;
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	struct stackframe fr;
 
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index 5021c546ad07..22395c7d7030 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -25,8 +25,8 @@
  * the user stack callchains, we will add it here.
  */
 
-static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
-	unsigned long reg29)
+static void save_raw_perf_callchain(struct perf_callchain_entry_ctx *entry,
+				    unsigned long reg29)
 {
 	unsigned long *sp = (unsigned long *)reg29;
 	unsigned long addr;
@@ -35,14 +35,14 @@ static void save_raw_perf_callchain(struct perf_callchain_entry *entry,
 		addr = *sp++;
 		if (__kernel_text_address(addr)) {
 			perf_callchain_store(entry, addr);
-			if (entry->nr >= sysctl_perf_event_max_stack)
+			if (entry->entry->nr >= entry->max_stack)
 				break;
 		}
 	}
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
-		      struct pt_regs *regs)
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
+			   struct pt_regs *regs)
 {
 	unsigned long sp = regs->regs[29];
 #ifdef CONFIG_KALLSYMS
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 	}
 	do {
 		perf_callchain_store(entry, pc);
-		if (entry->nr >= sysctl_perf_event_max_stack)
+		if (entry->entry->nr >= entry->max_stack)
 			break;
 		pc = unwind_stack(current, &sp, pc, &ra);
 	} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index 22d9015c1acc..c9260c1dfdbc 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -47,7 +47,7 @@ static int valid_next_sp(unsigned long sp, unsigned long prev_sp)
 }
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	unsigned long sp, next_sp;
 	unsigned long next_ip;
@@ -232,7 +232,7 @@ static int sane_signal_64_frame(unsigned long sp)
 		puc == (unsigned long) &sf->uc;
 }
 
-static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
 				   struct pt_regs *regs)
 {
 	unsigned long sp, next_sp;
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < sysctl_perf_event_max_stack) {
+	while (entry->entry->nr < entry->max_stack) {
 		fp = (unsigned long __user *) sp;
 		if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
 			return;
@@ -319,7 +319,7 @@ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret)
 	return rc;
 }
 
-static inline void perf_callchain_user_64(struct perf_callchain_entry *entry,
+static inline void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
 					  struct pt_regs *regs)
 {
 }
@@ -439,7 +439,7 @@ static unsigned int __user *signal_frame_32_regs(unsigned int sp,
 	return mctx->mc_gregs;
 }
 
-static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
 				   struct pt_regs *regs)
 {
 	unsigned int sp, next_sp;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->nr < sysctl_perf_event_max_stack) {
+	while (entry->entry->nr < entry->max_stack) {
 		fp = (unsigned int __user *) (unsigned long) sp;
 		if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
 			return;
@@ -487,7 +487,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	if (current_is_64bit())
 		perf_callchain_user_64(entry, regs);
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index c3e4099b60a5..87035fa58bbe 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -224,13 +224,13 @@ arch_initcall(service_level_perf_register);
 
 static int __perf_callchain_kernel(void *data, unsigned long address)
 {
-	struct perf_callchain_entry *entry = data;
+	struct perf_callchain_entry_ctx *entry = data;
 
 	perf_callchain_store(entry, address);
 	return 0;
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			   struct pt_regs *regs)
 {
 	if (user_mode(regs))
diff --git a/arch/sh/kernel/perf_callchain.c b/arch/sh/kernel/perf_callchain.c
index cc80b614b5fa..fa2c0cd23eaa 100644
--- a/arch/sh/kernel/perf_callchain.c
+++ b/arch/sh/kernel/perf_callchain.c
@@ -21,7 +21,7 @@ static int callchain_stack(void *data, char *name)
 
 static void callchain_address(void *data, unsigned long addr, int reliable)
 {
-	struct perf_callchain_entry *entry = data;
+	struct perf_callchain_entry_ctx *entry = data;
 
 	if (reliable)
 		perf_callchain_store(entry, addr);
@@ -33,7 +33,7 @@ static const struct stacktrace_ops callchain_ops = {
 };
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	perf_callchain_store(entry, regs->pc);
 
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index a4b8b5aed21c..bcc5376db74b 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1711,7 +1711,7 @@ static int __init init_hw_perf_events(void)
 }
 pure_initcall(init_hw_perf_events);
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			   struct pt_regs *regs)
 {
 	unsigned long ksp, fp;
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry,
 			}
 		}
 #endif
-	} while (entry->nr < sysctl_perf_event_max_stack);
+	} while (entry->entry->nr < entry->max_stack);
 }
 
 static inline int
@@ -1769,7 +1769,7 @@ valid_user_frame(const void __user *fp, unsigned long size)
 	return (__range_not_ok(fp, size, TASK_SIZE) == 0);
 }
 
-static void perf_callchain_user_64(struct perf_callchain_entry *entry,
+static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
 				   struct pt_regs *regs)
 {
 	unsigned long ufp;
@@ -1790,10 +1790,10 @@ static void perf_callchain_user_64(struct perf_callchain_entry *entry,
 		pc = sf.callers_pc;
 		ufp = (unsigned long)sf.fp + STACK_BIAS;
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < sysctl_perf_event_max_stack);
+	} while (entry->entry->nr < entry->max_stack);
 }
 
-static void perf_callchain_user_32(struct perf_callchain_entry *entry,
+static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
 				   struct pt_regs *regs)
 {
 	unsigned long ufp;
@@ -1822,11 +1822,11 @@ static void perf_callchain_user_32(struct perf_callchain_entry *entry,
 			ufp = (unsigned long)sf.fp;
 		}
 		perf_callchain_store(entry, pc);
-	} while (entry->nr < sysctl_perf_event_max_stack);
+	} while (entry->entry->nr < entry->max_stack);
 }
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	u64 saved_fault_address = current_thread_info()->fault_address;
 	u8 saved_fault_code = get_thread_fault_code();
diff --git a/arch/tile/kernel/perf_event.c b/arch/tile/kernel/perf_event.c
index 8767060d70fb..6394c1ccb68e 100644
--- a/arch/tile/kernel/perf_event.c
+++ b/arch/tile/kernel/perf_event.c
@@ -941,7 +941,7 @@ arch_initcall(init_hw_perf_events);
 /*
  * Tile specific backtracing code for perf_events.
  */
-static inline void perf_callchain(struct perf_callchain_entry *entry,
+static inline void perf_callchain(struct perf_callchain_entry_ctx *entry,
 		    struct pt_regs *regs)
 {
 	struct KBacktraceIterator kbt;
@@ -992,13 +992,13 @@ static inline void perf_callchain(struct perf_callchain_entry *entry,
 	}
 }
 
-void perf_callchain_user(struct perf_callchain_entry *entry,
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 		    struct pt_regs *regs)
 {
 	perf_callchain(entry, regs);
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 		      struct pt_regs *regs)
 {
 	perf_callchain(entry, regs);
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5e5e76a52f58..07f2b01cfb72 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2202,7 +2202,7 @@ static int backtrace_stack(void *data, char *name)
 
 static int backtrace_address(void *data, unsigned long addr, int reliable)
 {
-	struct perf_callchain_entry *entry = data;
+	struct perf_callchain_entry_ctx *entry = data;
 
 	return perf_callchain_store(entry, addr);
 }
@@ -2214,7 +2214,7 @@ static const struct stacktrace_ops backtrace_ops = {
 };
 
 void
-perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
 		/* TODO: We don't support guest os callchain now */
@@ -2268,7 +2268,7 @@ static unsigned long get_segment_base(unsigned int segment)
 #include <asm/compat.h>
 
 static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
 {
 	/* 32-bit process in 64-bit kernel. */
 	unsigned long ss_base, cs_base;
@@ -2283,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 
 	fp = compat_ptr(ss_base + regs->bp);
 	pagefault_disable();
-	while (entry->nr < sysctl_perf_event_max_stack) {
+	while (entry->entry->nr < entry->max_stack) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
 		frame.return_address = 0;
@@ -2309,14 +2309,14 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
 }
 #else
 static inline int
-perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
+perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *entry)
 {
     return 0;
 }
 #endif
 
 void
-perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
+perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs)
 {
 	struct stack_frame frame;
 	const void __user *fp;
@@ -2343,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 
 	pagefault_disable();
-	while (entry->nr < sysctl_perf_event_max_stack) {
+	while (entry->entry->nr < entry->max_stack) {
 		unsigned long bytes;
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;
diff --git a/arch/xtensa/kernel/perf_event.c b/arch/xtensa/kernel/perf_event.c
index a6b00b3af429..ef90479e0397 100644
--- a/arch/xtensa/kernel/perf_event.c
+++ b/arch/xtensa/kernel/perf_event.c
@@ -323,23 +323,23 @@ static void xtensa_pmu_read(struct perf_event *event)
 
 static int callchain_trace(struct stackframe *frame, void *data)
 {
-	struct perf_callchain_entry *entry = data;
+	struct perf_callchain_entry_ctx *entry = data;
 
 	perf_callchain_store(entry, frame->pc);
 	return 0;
 }
 
-void perf_callchain_kernel(struct perf_callchain_entry *entry,
+void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			   struct pt_regs *regs)
 {
-	xtensa_backtrace_kernel(regs, sysctl_perf_event_max_stack,
+	xtensa_backtrace_kernel(regs, entry->max_stack,
 				callchain_trace, NULL, entry);
 }
 
-void perf_callchain_user(struct perf_callchain_entry *entry,
+void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 			 struct pt_regs *regs)
 {
-	xtensa_backtrace_user(regs, sysctl_perf_event_max_stack,
+	xtensa_backtrace_user(regs, entry->max_stack,
 			      callchain_trace, entry);
 }
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9e1c3ada91c4..dbd18246b36e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -61,6 +61,11 @@ struct perf_callchain_entry {
 	__u64				ip[0]; /* /proc/sys/kernel/perf_event_max_stack */
 };
 
+struct perf_callchain_entry_ctx {
+	struct perf_callchain_entry *entry;
+	u32			    max_stack;
+};
+
 struct perf_raw_record {
 	u32				size;
 	void				*data;
@@ -1063,19 +1068,20 @@ extern void perf_event_fork(struct task_struct *tsk);
 /* Callchains */
 DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
 
-extern void perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs);
-extern void perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs);
+extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
+extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
-		   bool crosstask, bool add_mark);
+		   u32 max_stack, bool crosstask, bool add_mark);
 extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
 extern int sysctl_perf_event_max_stack;
 
-static inline int perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
+static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
 {
-	if (entry->nr < sysctl_perf_event_max_stack) {
+	struct perf_callchain_entry *entry = ctx->entry;
+	if (entry->nr < ctx->max_stack) {
 		entry->ip[entry->nr++] = ip;
 		return 0;
 	} else {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index f5a19548be12..a82d7605db3f 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -136,7 +136,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
 			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
 		return -EINVAL;
 
-	trace = get_perf_callchain(regs, init_nr, kernel, user, false, false);
+	trace = get_perf_callchain(regs, init_nr, kernel, user,
+				   sysctl_perf_event_max_stack, false, false);
 
 	if (unlikely(!trace))
 		/* couldn't fetch the stack trace */
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 7fc89939ede9..af95ad92893a 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -32,12 +32,12 @@ static DEFINE_MUTEX(callchain_mutex);
 static struct callchain_cpus_entries *callchain_cpus_entries;
 
 
-__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 				  struct pt_regs *regs)
 {
 }
 
-__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 				struct pt_regs *regs)
 {
 }
@@ -176,14 +176,15 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
 	if (!kernel && !user)
 		return NULL;
 
-	return get_perf_callchain(regs, 0, kernel, user, crosstask, true);
+	return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true);
 }
 
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
-		   bool crosstask, bool add_mark)
+		   u32 max_stack, bool crosstask, bool add_mark)
 {
 	struct perf_callchain_entry *entry;
+	struct perf_callchain_entry_ctx ctx;
 	int rctx;
 
 	entry = get_callchain_entry(&rctx);
@@ -193,12 +194,15 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 	if (!entry)
 		goto exit_put;
 
+	ctx.entry     = entry;
+	ctx.max_stack = max_stack;
+
 	entry->nr = init_nr;
 
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
-			perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
-		perf_callchain_kernel(entry, regs);
+			perf_callchain_store(&ctx, PERF_CONTEXT_KERNEL);
+		perf_callchain_kernel(&ctx, regs);
 	}
 
 	if (user) {
@@ -214,8 +218,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 				goto exit_put;
 
 			if (add_mark)
-				perf_callchain_store(entry, PERF_CONTEXT_USER);
-			perf_callchain_user(entry, regs);
+				perf_callchain_store(&ctx, PERF_CONTEXT_USER);
+			perf_callchain_user(&ctx, regs);
 		}
 	}
 
-- 
cgit v1.2.3


From 3b1fff08038bd0792b1aa1e9703b2dd0512a3fd0 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 10 May 2016 18:08:32 -0300
Subject: perf core: Add a 'nr' field to perf_event_callchain_context

We will use it to count how many addresses are in the entry->ip[] array,
excluding PERF_CONTEXT_{KERNEL,USER,etc} entries, so that we can really
return the number of entries specified by the user via the relevant
sysctl, kernel.perf_event_max_contexts, or via the per event
perf_event_attr.sample_max_stack knob.

This way we keep the perf_sample->ip_callchain->nr meaning, that is the
number of entries, be it real addresses or PERF_CONTEXT_ entries, while
honouring the max_stack knobs, i.e. the end result will be max_stack
entries if we have at least that many entries in a given stack trace.

Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-s8teto51tdqvlfhefndtat9r@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 arch/arm/kernel/perf_callchain.c   | 2 +-
 arch/arm64/kernel/perf_callchain.c | 4 ++--
 arch/metag/kernel/perf_callchain.c | 2 +-
 arch/mips/kernel/perf_event.c      | 4 ++--
 arch/powerpc/perf/callchain.c      | 4 ++--
 arch/sparc/kernel/perf_event.c     | 6 +++---
 arch/x86/events/core.c             | 4 ++--
 include/linux/perf_event.h         | 6 ++++--
 kernel/events/callchain.c          | 3 +--
 9 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/kernel/perf_callchain.c b/arch/arm/kernel/perf_callchain.c
index bc552e813e7b..22bf1f64d99a 100644
--- a/arch/arm/kernel/perf_callchain.c
+++ b/arch/arm/kernel/perf_callchain.c
@@ -75,7 +75,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
-	while ((entry->entry->nr < entry->max_stack) &&
+	while ((entry->nr < entry->max_stack) &&
 	       tail && !((unsigned long)tail & 0x3))
 		tail = user_backtrace(tail, entry);
 }
diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index 0d60150057cf..713ca824f266 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -122,7 +122,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 
 		tail = (struct frame_tail __user *)regs->regs[29];
 
-		while (entry->entry->nr < entry->max_stack &&
+		while (entry->nr < entry->max_stack &&
 		       tail && !((unsigned long)tail & 0xf))
 			tail = user_backtrace(tail, entry);
 	} else {
@@ -132,7 +132,7 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 
 		tail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
 
-		while ((entry->entry->nr < entry->max_stack) &&
+		while ((entry->nr < entry->max_stack) &&
 			tail && !((unsigned long)tail & 0x3))
 			tail = compat_user_backtrace(tail, entry);
 #endif
diff --git a/arch/metag/kernel/perf_callchain.c b/arch/metag/kernel/perf_callchain.c
index b3261a98b15b..3e8e048040df 100644
--- a/arch/metag/kernel/perf_callchain.c
+++ b/arch/metag/kernel/perf_callchain.c
@@ -65,7 +65,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 
 	--frame;
 
-	while ((entry->entry->nr < entry->max_stack) && frame)
+	while ((entry->nr < entry->max_stack) && frame)
 		frame = user_backtrace(frame, entry);
 }
 
diff --git a/arch/mips/kernel/perf_event.c b/arch/mips/kernel/perf_event.c
index 22395c7d7030..d64056e0bb56 100644
--- a/arch/mips/kernel/perf_event.c
+++ b/arch/mips/kernel/perf_event.c
@@ -35,7 +35,7 @@ static void save_raw_perf_callchain(struct perf_callchain_entry_ctx *entry,
 		addr = *sp++;
 		if (__kernel_text_address(addr)) {
 			perf_callchain_store(entry, addr);
-			if (entry->entry->nr >= entry->max_stack)
+			if (entry->nr >= entry->max_stack)
 				break;
 		}
 	}
@@ -59,7 +59,7 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 	}
 	do {
 		perf_callchain_store(entry, pc);
-		if (entry->entry->nr >= entry->max_stack)
+		if (entry->nr >= entry->max_stack)
 			break;
 		pc = unwind_stack(current, &sp, pc, &ra);
 	} while (pc);
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index c9260c1dfdbc..f68f213dc36c 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -247,7 +247,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->entry->nr < entry->max_stack) {
+	while (entry->nr < entry->max_stack) {
 		fp = (unsigned long __user *) sp;
 		if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp))
 			return;
@@ -453,7 +453,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
 	sp = regs->gpr[1];
 	perf_callchain_store(entry, next_ip);
 
-	while (entry->entry->nr < entry->max_stack) {
+	while (entry->nr < entry->max_stack) {
 		fp = (unsigned int __user *) (unsigned long) sp;
 		if (!valid_user_sp(sp, 0) || read_user_stack_32(fp, &next_sp))
 			return;
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index bcc5376db74b..710f3278d448 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -1756,7 +1756,7 @@ void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			}
 		}
 #endif
-	} while (entry->entry->nr < entry->max_stack);
+	} while (entry->nr < entry->max_stack);
 }
 
 static inline int
@@ -1790,7 +1790,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
 		pc = sf.callers_pc;
 		ufp = (unsigned long)sf.fp + STACK_BIAS;
 		perf_callchain_store(entry, pc);
-	} while (entry->entry->nr < entry->max_stack);
+	} while (entry->nr < entry->max_stack);
 }
 
 static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
@@ -1822,7 +1822,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
 			ufp = (unsigned long)sf.fp;
 		}
 		perf_callchain_store(entry, pc);
-	} while (entry->entry->nr < entry->max_stack);
+	} while (entry->nr < entry->max_stack);
 }
 
 void
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 07f2b01cfb72..5de96a18cd9c 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2283,7 +2283,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent
 
 	fp = compat_ptr(ss_base + regs->bp);
 	pagefault_disable();
-	while (entry->entry->nr < entry->max_stack) {
+	while (entry->nr < entry->max_stack) {
 		unsigned long bytes;
 		frame.next_frame     = 0;
 		frame.return_address = 0;
@@ -2343,7 +2343,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
 		return;
 
 	pagefault_disable();
-	while (entry->entry->nr < entry->max_stack) {
+	while (entry->nr < entry->max_stack) {
 		unsigned long bytes;
 		frame.next_frame	     = NULL;
 		frame.return_address = 0;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dbd18246b36e..3803bb1a862b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -64,6 +64,7 @@ struct perf_callchain_entry {
 struct perf_callchain_entry_ctx {
 	struct perf_callchain_entry *entry;
 	u32			    max_stack;
+	u32			    nr;
 };
 
 struct perf_raw_record {
@@ -1080,9 +1081,10 @@ extern int sysctl_perf_event_max_stack;
 
 static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
 {
-	struct perf_callchain_entry *entry = ctx->entry;
-	if (entry->nr < ctx->max_stack) {
+	if (ctx->nr < ctx->max_stack) {
+		struct perf_callchain_entry *entry = ctx->entry;
 		entry->ip[entry->nr++] = ip;
+		++ctx->nr;
 		return 0;
 	} else {
 		return -1; /* no more room, stop walking the stack */
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index af95ad92893a..8774ff86debb 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -196,8 +196,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 
 	ctx.entry     = entry;
 	ctx.max_stack = max_stack;
-
-	entry->nr = init_nr;
+	ctx.nr	      = entry->nr = init_nr;
 
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
-- 
cgit v1.2.3


From 3e4de4ec4cfea40994b47a79767610153edbf45b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 12 May 2016 13:01:50 -0300
Subject: perf core: Add perf_callchain_store_context() helper

We need have different helpers to account how many contexts we have in
the sample and for real addresses, so do it now as a prep patch, to
ease review.

Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-q964tnyuqrxw5gld18vizs3c@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 arch/powerpc/perf/callchain.c | 6 +++---
 include/linux/perf_event.h    | 2 ++
 kernel/events/callchain.c     | 4 ++--
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index f68f213dc36c..f62597dbd757 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -76,7 +76,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re
 			next_ip = regs->nip;
 			lr = regs->link;
 			level = 0;
-			perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+			perf_callchain_store_context(entry, PERF_CONTEXT_KERNEL);
 
 		} else {
 			if (level == 0)
@@ -274,7 +274,7 @@ static void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
 			    read_user_stack_64(&uregs[PT_R1], &sp))
 				return;
 			level = 0;
-			perf_callchain_store(entry, PERF_CONTEXT_USER);
+			perf_callchain_store_context(entry, PERF_CONTEXT_USER);
 			perf_callchain_store(entry, next_ip);
 			continue;
 		}
@@ -473,7 +473,7 @@ static void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
 			    read_user_stack_32(&uregs[PT_R1], &sp))
 				return;
 			level = 0;
-			perf_callchain_store(entry, PERF_CONTEXT_USER);
+			perf_callchain_store_context(entry, PERF_CONTEXT_USER);
 			perf_callchain_store(entry, next_ip);
 			continue;
 		}
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3803bb1a862b..2024b14cc2b1 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1079,6 +1079,8 @@ extern void put_callchain_buffers(void);
 
 extern int sysctl_perf_event_max_stack;
 
+#define perf_callchain_store_context(ctx, context) perf_callchain_store(ctx, context)
+
 static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
 {
 	if (ctx->nr < ctx->max_stack) {
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 8774ff86debb..ca645736a983 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -200,7 +200,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
-			perf_callchain_store(&ctx, PERF_CONTEXT_KERNEL);
+			perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL);
 		perf_callchain_kernel(&ctx, regs);
 	}
 
@@ -217,7 +217,7 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 				goto exit_put;
 
 			if (add_mark)
-				perf_callchain_store(&ctx, PERF_CONTEXT_USER);
+				perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
 			perf_callchain_user(&ctx, regs);
 		}
 	}
-- 
cgit v1.2.3


From c85b03349640b34f3545503c8429fc43005e9a92 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 12 May 2016 13:06:21 -0300
Subject: perf core: Separate accounting of contexts and real addresses in a
 stack trace

The perf_sample->ip_callchain->nr value includes all the entries in the
ip_callchain->ip[] array, real addresses and PERF_CONTEXT_{KERNEL,USER,etc},
while what the user expects is that what is in the kernel.perf_event_max_stack
sysctl or in the upcoming per event perf_event_attr.sample_max_stack knob be
honoured in terms of IP addresses in the stack trace.

So allocate a bunch of extra entries for contexts, and do the accounting
via perf_callchain_entry_ctx struct members.

A new sysctl, kernel.perf_event_max_contexts_per_stack is also
introduced for investigating possible bugs in the callchain
implementation by some arch.

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Brendan Gregg <brendan.d.gregg@gmail.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: He Kuang <hekuang@huawei.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Milian Wolff <milian.wolff@kdab.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Wang Nan <wangnan0@huawei.com>
Cc: Zefan Li <lizefan@huawei.com>
Link: http://lkml.kernel.org/n/tip-3b4wnqk340c4sg4gwkfdi9yk@git.kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 Documentation/sysctl/kernel.txt | 14 ++++++++++++++
 include/linux/perf_event.h      | 18 ++++++++++++++++--
 include/uapi/linux/perf_event.h |  1 +
 kernel/events/callchain.c       | 10 +++++++++-
 kernel/sysctl.c                 |  9 +++++++++
 5 files changed, 49 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index daabdd7ee543..a3683ce2a2f3 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -61,6 +61,7 @@ show up in /proc/sys/kernel:
 - perf_cpu_time_max_percent
 - perf_event_paranoid
 - perf_event_max_stack
+- perf_event_max_contexts_per_stack
 - pid_max
 - powersave-nap               [ PPC only ]
 - printk
@@ -668,6 +669,19 @@ The default value is 127.
 
 ==============================================================
 
+perf_event_max_contexts_per_stack:
+
+Controls maximum number of stack frame context entries for
+(attr.sample_type & PERF_SAMPLE_CALLCHAIN) configured events, for
+instance, when using 'perf record -g' or 'perf trace --call-graph fp'.
+
+This can only be done when no events are in use that have callchains
+enabled, otherwise writing to this file will return -EBUSY.
+
+The default value is 8.
+
+==============================================================
+
 pid_max:
 
 PID allocation wrap value.  When the kernel's next PID value
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2024b14cc2b1..6b87be908790 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -65,6 +65,8 @@ struct perf_callchain_entry_ctx {
 	struct perf_callchain_entry *entry;
 	u32			    max_stack;
 	u32			    nr;
+	short			    contexts;
+	bool			    contexts_maxed;
 };
 
 struct perf_raw_record {
@@ -1078,12 +1080,24 @@ extern int get_callchain_buffers(void);
 extern void put_callchain_buffers(void);
 
 extern int sysctl_perf_event_max_stack;
+extern int sysctl_perf_event_max_contexts_per_stack;
 
-#define perf_callchain_store_context(ctx, context) perf_callchain_store(ctx, context)
+static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip)
+{
+	if (ctx->contexts < sysctl_perf_event_max_contexts_per_stack) {
+		struct perf_callchain_entry *entry = ctx->entry;
+		entry->ip[entry->nr++] = ip;
+		++ctx->contexts;
+		return 0;
+	} else {
+		ctx->contexts_maxed = true;
+		return -1; /* no more room, stop walking the stack */
+	}
+}
 
 static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
 {
-	if (ctx->nr < ctx->max_stack) {
+	if (ctx->nr < ctx->max_stack && !ctx->contexts_maxed) {
 		struct perf_callchain_entry *entry = ctx->entry;
 		entry->ip[entry->nr++] = ip;
 		++ctx->nr;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 43fc8d213472..36ce552cf6a9 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -862,6 +862,7 @@ enum perf_event_type {
 };
 
 #define PERF_MAX_STACK_DEPTH		127
+#define PERF_MAX_CONTEXTS_PER_STACK	  8
 
 enum perf_callchain_context {
 	PERF_CONTEXT_HV			= (__u64)-32,
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index ca645736a983..179ef4640964 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -19,11 +19,13 @@ struct callchain_cpus_entries {
 };
 
 int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK;
 
 static inline size_t perf_callchain_entry__sizeof(void)
 {
 	return (sizeof(struct perf_callchain_entry) +
-		sizeof(__u64) * sysctl_perf_event_max_stack);
+		sizeof(__u64) * (sysctl_perf_event_max_stack +
+				 sysctl_perf_event_max_contexts_per_stack));
 }
 
 static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
@@ -197,6 +199,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 	ctx.entry     = entry;
 	ctx.max_stack = max_stack;
 	ctx.nr	      = entry->nr = init_nr;
+	ctx.contexts       = 0;
+	ctx.contexts_maxed = false;
 
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
@@ -228,6 +232,10 @@ exit_put:
 	return entry;
 }
 
+/*
+ * Used for sysctl_perf_event_max_stack and
+ * sysctl_perf_event_max_contexts_per_stack.
+ */
 int perf_event_max_stack_handler(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp, loff_t *ppos)
 {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0ec6907a16b3..bec4c11c47d6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1156,6 +1156,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &six_hundred_forty_kb,
 	},
+	{
+		.procname	= "perf_event_max_contexts_per_stack",
+		.data		= &sysctl_perf_event_max_contexts_per_stack,
+		.maxlen		= sizeof(sysctl_perf_event_max_contexts_per_stack),
+		.mode		= 0644,
+		.proc_handler	= perf_event_max_stack_handler,
+		.extra1		= &zero,
+		.extra2		= &one_thousand,
+	},
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{
-- 
cgit v1.2.3


From 60f05e86cf3e8c5f379fe5ba94634fcec17dd67e Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 18 May 2016 17:55:28 +0530
Subject: cpufreq: schedutil: Improve prints messages with pr_fmt

Prefix print messages with KBUILD_MODNAME, i.e 'cpufreq_schedutil: '.
This helps to keep similar formatting for all the print messages
particular to a file and identify those easily in kernel logs.

Its already done this way for rest of the governors.

Along with that, remove the (now) redundant bits from a print message.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 154ae3a51e86..14c4aa25cc45 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -9,6 +9,8 @@
  * published by the Free Software Foundation.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/cpufreq.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -388,7 +390,7 @@ static int sugov_init(struct cpufreq_policy *policy)
 	mutex_unlock(&global_tunables_lock);
 
 	sugov_policy_free(sg_policy);
-	pr_err("cpufreq: schedutil governor initialization failed (error %d)\n", ret);
+	pr_err("initialization failed (error %d)\n", ret);
 	return ret;
 }
 
-- 
cgit v1.2.3


From bc2c53e5f1a2bae69ae50ce3a592633da7fcf6d9 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Thu, 19 May 2016 17:09:02 -0700
Subject: time: add missing implementation for timespec64_add_safe()

timespec64_add_safe() has been defined in time64.h for 64 bit systems.
But, 32 bit systems only have an extern function prototype defined.
Provide a definition for the above function.

The function will be necessary as part of y2038 changes.  struct
timespec is not y2038 safe.  All references to timespec will be replaced
by struct timespec64.  The function is meant to be a replacement for
timespec_add_safe().

The implementation is similar to timespec_add_safe().

Link: http://lkml.kernel.org/r/1461947989-21926-2-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: John Stultz <john.stultz@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/time64.h |  4 +---
 kernel/time/time.c     | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/time64.h b/include/linux/time64.h
index 367d5af899e8..1778937221bf 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -136,13 +136,11 @@ extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 n
 
 /*
  * timespec64_add_safe assumes both values are positive and checks for
- * overflow. It will return TIME_T_MAX if the returned value would be
- * smaller then either of the arguments.
+ * overflow. It will return TIME64_MAX in case of overflow.
  */
 extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
 					 const struct timespec64 rhs);
 
-
 static inline struct timespec64 timespec64_add(struct timespec64 lhs,
 						struct timespec64 rhs)
 {
diff --git a/kernel/time/time.c b/kernel/time/time.c
index a4064b612066..cb1f83eb5599 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -769,3 +769,28 @@ struct timespec timespec_add_safe(const struct timespec lhs,
 
 	return res;
 }
+
+#if __BITS_PER_LONG != 64
+
+/*
+ * Add two timespec64 values and do a safety check for overflow.
+ * It's assumed that both values are valid (>= 0).
+ * And, each timespec64 is in normalized form.
+ */
+struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
+				const struct timespec64 rhs)
+{
+	struct timespec64 res;
+
+	set_normalized_timespec64(&res, lhs.tv_sec + rhs.tv_sec,
+			lhs.tv_nsec + rhs.tv_nsec);
+
+	if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
+		res.tv_sec = TIME64_MAX;
+		res.tv_nsec = 0;
+	}
+
+	return res;
+}
+
+#endif
-- 
cgit v1.2.3


From 8e4f70e21877297577dce13cca97599a5864a91f Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Thu, 19 May 2016 17:09:08 -0700
Subject: time: remove timespec_add_safe()

All references to timespec_add_safe() now use timespec64_add_safe().

The plan is to replace struct timespec references with struct timespec64
throughout the kernel as timespec is not y2038 safe.

Drop timespec_add_safe() and use timespec64_add_safe() for all
architectures.

Link: http://lkml.kernel.org/r/1461947989-21926-4-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: John Stultz <john.stultz@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/time64.h | 15 +++++++--------
 kernel/time/time.c     |  4 ----
 2 files changed, 7 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/time64.h b/include/linux/time64.h
index 1778937221bf..7e5d2fa9ac46 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -65,7 +65,6 @@ static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *
 # define timespec64_equal		timespec_equal
 # define timespec64_compare		timespec_compare
 # define set_normalized_timespec64	set_normalized_timespec
-# define timespec64_add_safe		timespec_add_safe
 # define timespec64_add			timespec_add
 # define timespec64_sub			timespec_sub
 # define timespec64_valid		timespec_valid
@@ -134,13 +133,6 @@ static inline int timespec64_compare(const struct timespec64 *lhs, const struct
 
 extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec);
 
-/*
- * timespec64_add_safe assumes both values are positive and checks for
- * overflow. It will return TIME64_MAX in case of overflow.
- */
-extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
-					 const struct timespec64 rhs);
-
 static inline struct timespec64 timespec64_add(struct timespec64 lhs,
 						struct timespec64 rhs)
 {
@@ -222,4 +214,11 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
 
 #endif
 
+/*
+ * timespec64_add_safe assumes both values are positive and checks for
+ * overflow. It will return TIME64_MAX in case of overflow.
+ */
+extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
+					 const struct timespec64 rhs);
+
 #endif /* _LINUX_TIME64_H */
diff --git a/kernel/time/time.c b/kernel/time/time.c
index cb1f83eb5599..667b9335f5d6 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -770,8 +770,6 @@ struct timespec timespec_add_safe(const struct timespec lhs,
 	return res;
 }
 
-#if __BITS_PER_LONG != 64
-
 /*
  * Add two timespec64 values and do a safety check for overflow.
  * It's assumed that both values are valid (>= 0).
@@ -792,5 +790,3 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
 
 	return res;
 }
-
-#endif
-- 
cgit v1.2.3


From 02a982a6ec631d871571f940ca13817551759884 Mon Sep 17 00:00:00 2001
From: "Du, Changbin" <changbin.du@intel.com>
Date: Thu, 19 May 2016 17:09:26 -0700
Subject: workqueue: update debugobjects fixup callbacks return type

Update the return type to use bool instead of int, corresponding to
change (debugobjects: make fixup functions return bool instead of int)

Signed-off-by: Du, Changbin <changbin.du@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Josh Triplett <josh@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5f5068e94003..6751b18fd9ac 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -437,7 +437,7 @@ static void *work_debug_hint(void *addr)
  * fixup_init is called when:
  * - an active object is initialized
  */
-static int work_fixup_init(void *addr, enum debug_obj_state state)
+static bool work_fixup_init(void *addr, enum debug_obj_state state)
 {
 	struct work_struct *work = addr;
 
@@ -445,9 +445,9 @@ static int work_fixup_init(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		cancel_work_sync(work);
 		debug_object_init(work, &work_debug_descr);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -456,7 +456,7 @@ static int work_fixup_init(void *addr, enum debug_obj_state state)
  * - an active object is activated
  * - an unknown object is activated (might be a statically initialized object)
  */
-static int work_fixup_activate(void *addr, enum debug_obj_state state)
+static bool work_fixup_activate(void *addr, enum debug_obj_state state)
 {
 	struct work_struct *work = addr;
 
@@ -471,16 +471,16 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
 		if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
 			debug_object_init(work, &work_debug_descr);
 			debug_object_activate(work, &work_debug_descr);
-			return 0;
+			return false;
 		}
 		WARN_ON_ONCE(1);
-		return 0;
+		return false;
 
 	case ODEBUG_STATE_ACTIVE:
 		WARN_ON(1);
 
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -488,7 +488,7 @@ static int work_fixup_activate(void *addr, enum debug_obj_state state)
  * fixup_free is called when:
  * - an active object is freed
  */
-static int work_fixup_free(void *addr, enum debug_obj_state state)
+static bool work_fixup_free(void *addr, enum debug_obj_state state)
 {
 	struct work_struct *work = addr;
 
@@ -496,9 +496,9 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		cancel_work_sync(work);
 		debug_object_free(work, &work_debug_descr);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
-- 
cgit v1.2.3


From e3252464da222ef2c2ca52dff383a824080ea3d5 Mon Sep 17 00:00:00 2001
From: "Du, Changbin" <changbin.du@intel.com>
Date: Thu, 19 May 2016 17:09:29 -0700
Subject: timer: update debugobjects fixup callbacks return type

Update the return type to use bool instead of int, corresponding to
cheange (debugobjects: make fixup functions return bool instead of int).

Signed-off-by: Du, Changbin <changbin.du@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Josh Triplett <josh@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/time/hrtimer.c | 18 +++++++++---------
 kernel/time/timer.c   | 30 +++++++++++++++---------------
 2 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index fa0b983290cf..f962a58c0957 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -334,7 +334,7 @@ static void *hrtimer_debug_hint(void *addr)
  * fixup_init is called when:
  * - an active object is initialized
  */
-static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
 {
 	struct hrtimer *timer = addr;
 
@@ -342,9 +342,9 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		hrtimer_cancel(timer);
 		debug_object_init(timer, &hrtimer_debug_descr);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -353,19 +353,19 @@ static int hrtimer_fixup_init(void *addr, enum debug_obj_state state)
  * - an active object is activated
  * - an unknown object is activated (might be a statically initialized object)
  */
-static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
 {
 	switch (state) {
 
 	case ODEBUG_STATE_NOTAVAILABLE:
 		WARN_ON_ONCE(1);
-		return 0;
+		return false;
 
 	case ODEBUG_STATE_ACTIVE:
 		WARN_ON(1);
 
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -373,7 +373,7 @@ static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
  * fixup_free is called when:
  * - an active object is freed
  */
-static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
+static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
 {
 	struct hrtimer *timer = addr;
 
@@ -381,9 +381,9 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		hrtimer_cancel(timer);
 		debug_object_free(timer, &hrtimer_debug_descr);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 73164c3aa56b..be33481a4da1 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -493,7 +493,7 @@ static void *timer_debug_hint(void *addr)
  * fixup_init is called when:
  * - an active object is initialized
  */
-static int timer_fixup_init(void *addr, enum debug_obj_state state)
+static bool timer_fixup_init(void *addr, enum debug_obj_state state)
 {
 	struct timer_list *timer = addr;
 
@@ -501,9 +501,9 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		del_timer_sync(timer);
 		debug_object_init(timer, &timer_debug_descr);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -518,7 +518,7 @@ static void stub_timer(unsigned long data)
  * - an active object is activated
  * - an unknown object is activated (might be a statically initialized object)
  */
-static int timer_fixup_activate(void *addr, enum debug_obj_state state)
+static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
 {
 	struct timer_list *timer = addr;
 
@@ -534,18 +534,18 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
 		    timer->entry.next == TIMER_ENTRY_STATIC) {
 			debug_object_init(timer, &timer_debug_descr);
 			debug_object_activate(timer, &timer_debug_descr);
-			return 0;
+			return false;
 		} else {
 			setup_timer(timer, stub_timer, 0);
-			return 1;
+			return true;
 		}
-		return 0;
+		return false;
 
 	case ODEBUG_STATE_ACTIVE:
 		WARN_ON(1);
 
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -553,7 +553,7 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
  * fixup_free is called when:
  * - an active object is freed
  */
-static int timer_fixup_free(void *addr, enum debug_obj_state state)
+static bool timer_fixup_free(void *addr, enum debug_obj_state state)
 {
 	struct timer_list *timer = addr;
 
@@ -561,9 +561,9 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
 	case ODEBUG_STATE_ACTIVE:
 		del_timer_sync(timer);
 		debug_object_free(timer, &timer_debug_descr);
-		return 1;
+		return true;
 	default:
-		return 0;
+		return false;
 	}
 }
 
@@ -571,7 +571,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
  * fixup_assert_init is called when:
  * - an untracked/uninit-ed object is found
  */
-static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
+static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 {
 	struct timer_list *timer = addr;
 
@@ -584,13 +584,13 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 			 * is tracked in the object tracker.
 			 */
 			debug_object_init(timer, &timer_debug_descr);
-			return 0;
+			return false;
 		} else {
 			setup_timer(timer, stub_timer, 0);
-			return 1;
+			return true;
 		}
 	default:
-		return 0;
+		return false;
 	}
 }
 
-- 
cgit v1.2.3


From 3263d28eb5b93b3c1b024366df87b1d0c774228f Mon Sep 17 00:00:00 2001
From: "Du, Changbin" <changbin.du@intel.com>
Date: Thu, 19 May 2016 17:09:32 -0700
Subject: rcu: update debugobjects fixup callbacks return type

Update the return type to use bool instead of int, corresponding to
cheange (debugobjects: make fixup functions return bool instead of int).

Signed-off-by: Du, Changbin <changbin.du@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Josh Triplett <josh@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/rcu/update.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 3ccdc8eebc5a..a9df198eb22d 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -386,7 +386,7 @@ void destroy_rcu_head(struct rcu_head *head)
  * - an unknown object is activated (might be a statically initialized object)
  * Activation is performed internally by call_rcu().
  */
-static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
+static bool rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
 {
 	struct rcu_head *head = addr;
 
@@ -399,9 +399,9 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
 		 */
 		debug_object_init(head, &rcuhead_debug_descr);
 		debug_object_activate(head, &rcuhead_debug_descr);
-		return 0;
+		return false;
 	default:
-		return 1;
+		return true;
 	}
 }
 
-- 
cgit v1.2.3


From b9fdac7f660609abb157500e468d2165b3c9cf08 Mon Sep 17 00:00:00 2001
From: "Du, Changbin" <changbin.du@intel.com>
Date: Thu, 19 May 2016 17:09:41 -0700
Subject: debugobjects: insulate non-fixup logic related to static obj from
 fixup callbacks

When activating a static object we need make sure that the object is
tracked in the object tracker.  If it is a non-static object then the
activation is illegal.

In previous implementation, each subsystem need take care of this in
their fixup callbacks.  Actually we can put it into debugobjects core.
Thus we can save duplicated code, and have *pure* fixup callbacks.

To achieve this, a new callback "is_static_object" is introduced to let
the type specific code decide whether a object is static or not.  If
yes, we take it into object tracker, otherwise give warning and invoke
fixup callback.

This change has paassed debugobjects selftest, and I also do some test
with all debugobjects supports enabled.

At last, I have a concern about the fixups that can it change the object
which is in incorrect state on fixup? Because the 'addr' may not point
to any valid object if a non-static object is not tracked.  Then Change
such object can overwrite someone's memory and cause unexpected
behaviour.  For example, the timer_fixup_activate bind timer to function
stub_timer.

Link: http://lkml.kernel.org/r/1462576157-14539-1-git-send-email-changbin.du@intel.com
[changbin.du@intel.com: improve code comments where invoke the new is_static_object callback]
  Link: http://lkml.kernel.org/r/1462777431-8171-1-git-send-email-changbin.du@intel.com
Signed-off-by: Du, Changbin <changbin.du@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Josh Triplett <josh@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/debugobjects.h |  2 ++
 kernel/rcu/update.c          | 26 +++--------------------
 kernel/time/hrtimer.c        |  7 +------
 kernel/time/timer.c          | 43 +++++++++++++-------------------------
 kernel/workqueue.c           | 42 ++++++++-----------------------------
 lib/debugobjects.c           | 49 +++++++++++++++++++++++++++++---------------
 6 files changed, 60 insertions(+), 109 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h
index a899f10c9365..46056cb161fc 100644
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -38,6 +38,7 @@ struct debug_obj {
  * @name:		name of the object typee
  * @debug_hint:		function returning address, which have associated
  *			kernel symbol, to allow identify the object
+ * @is_static_object	return true if the obj is static, otherwise return false
  * @fixup_init:		fixup function, which is called when the init check
  *			fails. All fixup functions must return true if fixup
  *			was successful, otherwise return false
@@ -53,6 +54,7 @@ struct debug_obj {
 struct debug_obj_descr {
 	const char		*name;
 	void *(*debug_hint)(void *addr);
+	bool (*is_static_object)(void *addr);
 	bool (*fixup_init)(void *addr, enum debug_obj_state state);
 	bool (*fixup_activate)(void *addr, enum debug_obj_state state);
 	bool (*fixup_destroy)(void *addr, enum debug_obj_state state);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index a9df198eb22d..3e888cd5a594 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -380,29 +380,9 @@ void destroy_rcu_head(struct rcu_head *head)
 	debug_object_free(head, &rcuhead_debug_descr);
 }
 
-/*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
- * Activation is performed internally by call_rcu().
- */
-static bool rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
+static bool rcuhead_is_static_object(void *addr)
 {
-	struct rcu_head *head = addr;
-
-	switch (state) {
-
-	case ODEBUG_STATE_NOTAVAILABLE:
-		/*
-		 * This is not really a fixup. We just make sure that it is
-		 * tracked in the object tracker.
-		 */
-		debug_object_init(head, &rcuhead_debug_descr);
-		debug_object_activate(head, &rcuhead_debug_descr);
-		return false;
-	default:
-		return true;
-	}
+	return true;
 }
 
 /**
@@ -440,7 +420,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
 
 struct debug_obj_descr rcuhead_debug_descr = {
 	.name = "rcu_head",
-	.fixup_activate = rcuhead_fixup_activate,
+	.is_static_object = rcuhead_is_static_object,
 };
 EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
 #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index f962a58c0957..8c7392c4fdbd 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -351,16 +351,11 @@ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
 /*
  * fixup_activate is called when:
  * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
  */
 static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
 {
 	switch (state) {
-
-	case ODEBUG_STATE_NOTAVAILABLE:
-		WARN_ON_ONCE(1);
-		return false;
-
 	case ODEBUG_STATE_ACTIVE:
 		WARN_ON(1);
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index be33481a4da1..3a95f9728778 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -489,6 +489,14 @@ static void *timer_debug_hint(void *addr)
 	return ((struct timer_list *) addr)->function;
 }
 
+static bool timer_is_static_object(void *addr)
+{
+	struct timer_list *timer = addr;
+
+	return (timer->entry.pprev == NULL &&
+		timer->entry.next == TIMER_ENTRY_STATIC);
+}
+
 /*
  * fixup_init is called when:
  * - an active object is initialized
@@ -516,30 +524,16 @@ static void stub_timer(unsigned long data)
 /*
  * fixup_activate is called when:
  * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
  */
 static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
 {
 	struct timer_list *timer = addr;
 
 	switch (state) {
-
 	case ODEBUG_STATE_NOTAVAILABLE:
-		/*
-		 * This is not really a fixup. The timer was
-		 * statically initialized. We just make sure that it
-		 * is tracked in the object tracker.
-		 */
-		if (timer->entry.pprev == NULL &&
-		    timer->entry.next == TIMER_ENTRY_STATIC) {
-			debug_object_init(timer, &timer_debug_descr);
-			debug_object_activate(timer, &timer_debug_descr);
-			return false;
-		} else {
-			setup_timer(timer, stub_timer, 0);
-			return true;
-		}
-		return false;
+		setup_timer(timer, stub_timer, 0);
+		return true;
 
 	case ODEBUG_STATE_ACTIVE:
 		WARN_ON(1);
@@ -577,18 +571,8 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 
 	switch (state) {
 	case ODEBUG_STATE_NOTAVAILABLE:
-		if (timer->entry.next == TIMER_ENTRY_STATIC) {
-			/*
-			 * This is not really a fixup. The timer was
-			 * statically initialized. We just make sure that it
-			 * is tracked in the object tracker.
-			 */
-			debug_object_init(timer, &timer_debug_descr);
-			return false;
-		} else {
-			setup_timer(timer, stub_timer, 0);
-			return true;
-		}
+		setup_timer(timer, stub_timer, 0);
+		return true;
 	default:
 		return false;
 	}
@@ -597,6 +581,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 static struct debug_obj_descr timer_debug_descr = {
 	.name			= "timer_list",
 	.debug_hint		= timer_debug_hint,
+	.is_static_object	= timer_is_static_object,
 	.fixup_init		= timer_fixup_init,
 	.fixup_activate		= timer_fixup_activate,
 	.fixup_free		= timer_fixup_free,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6751b18fd9ac..e1c0e996b5ae 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -433,6 +433,13 @@ static void *work_debug_hint(void *addr)
 	return ((struct work_struct *) addr)->func;
 }
 
+static bool work_is_static_object(void *addr)
+{
+	struct work_struct *work = addr;
+
+	return test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work));
+}
+
 /*
  * fixup_init is called when:
  * - an active object is initialized
@@ -451,39 +458,6 @@ static bool work_fixup_init(void *addr, enum debug_obj_state state)
 	}
 }
 
-/*
- * fixup_activate is called when:
- * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
- */
-static bool work_fixup_activate(void *addr, enum debug_obj_state state)
-{
-	struct work_struct *work = addr;
-
-	switch (state) {
-
-	case ODEBUG_STATE_NOTAVAILABLE:
-		/*
-		 * This is not really a fixup. The work struct was
-		 * statically initialized. We just make sure that it
-		 * is tracked in the object tracker.
-		 */
-		if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) {
-			debug_object_init(work, &work_debug_descr);
-			debug_object_activate(work, &work_debug_descr);
-			return false;
-		}
-		WARN_ON_ONCE(1);
-		return false;
-
-	case ODEBUG_STATE_ACTIVE:
-		WARN_ON(1);
-
-	default:
-		return false;
-	}
-}
-
 /*
  * fixup_free is called when:
  * - an active object is freed
@@ -505,8 +479,8 @@ static bool work_fixup_free(void *addr, enum debug_obj_state state)
 static struct debug_obj_descr work_debug_descr = {
 	.name		= "work_struct",
 	.debug_hint	= work_debug_hint,
+	.is_static_object = work_is_static_object,
 	.fixup_init	= work_fixup_init,
-	.fixup_activate	= work_fixup_activate,
 	.fixup_free	= work_fixup_free,
 };
 
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 2f07c8c697b8..a8e12601eb37 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -431,14 +431,21 @@ int debug_object_activate(void *addr, struct debug_obj_descr *descr)
 
 	raw_spin_unlock_irqrestore(&db->lock, flags);
 	/*
-	 * This happens when a static object is activated. We
-	 * let the type specific code decide whether this is
-	 * true or not.
+	 * We are here when a static object is activated. We
+	 * let the type specific code confirm whether this is
+	 * true or not. if true, we just make sure that the
+	 * static object is tracked in the object tracker. If
+	 * not, this must be a bug, so we try to fix it up.
 	 */
-	if (debug_object_fixup(descr->fixup_activate, addr,
-			   ODEBUG_STATE_NOTAVAILABLE)) {
+	if (descr->is_static_object && descr->is_static_object(addr)) {
+		/* track this static object */
+		debug_object_init(addr, descr);
+		debug_object_activate(addr, descr);
+	} else {
 		debug_print_object(&o, "activate");
-		return -EINVAL;
+		ret = debug_object_fixup(descr->fixup_activate, addr,
+					ODEBUG_STATE_NOTAVAILABLE);
+		return ret ? 0 : -EINVAL;
 	}
 	return 0;
 }
@@ -602,12 +609,18 @@ void debug_object_assert_init(void *addr, struct debug_obj_descr *descr)
 
 		raw_spin_unlock_irqrestore(&db->lock, flags);
 		/*
-		 * Maybe the object is static.  Let the type specific
-		 * code decide what to do.
+		 * Maybe the object is static, and we let the type specific
+		 * code confirm. Track this static object if true, else invoke
+		 * fixup.
 		 */
-		if (debug_object_fixup(descr->fixup_assert_init, addr,
-				       ODEBUG_STATE_NOTAVAILABLE))
+		if (descr->is_static_object && descr->is_static_object(addr)) {
+			/* Track this static object */
+			debug_object_init(addr, descr);
+		} else {
 			debug_print_object(&o, "assert_init");
+			debug_object_fixup(descr->fixup_assert_init, addr,
+					   ODEBUG_STATE_NOTAVAILABLE);
+		}
 		return;
 	}
 
@@ -792,6 +805,13 @@ struct self_test {
 
 static __initdata struct debug_obj_descr descr_type_test;
 
+static bool __init is_static_object(void *addr)
+{
+	struct self_test *obj = addr;
+
+	return obj->static_init;
+}
+
 /*
  * fixup_init is called when:
  * - an active object is initialized
@@ -813,7 +833,7 @@ static bool __init fixup_init(void *addr, enum debug_obj_state state)
 /*
  * fixup_activate is called when:
  * - an active object is activated
- * - an unknown object is activated (might be a statically initialized object)
+ * - an unknown non-static object is activated
  */
 static bool __init fixup_activate(void *addr, enum debug_obj_state state)
 {
@@ -821,13 +841,7 @@ static bool __init fixup_activate(void *addr, enum debug_obj_state state)
 
 	switch (state) {
 	case ODEBUG_STATE_NOTAVAILABLE:
-		if (obj->static_init == 1) {
-			debug_object_init(obj, &descr_type_test);
-			debug_object_activate(obj, &descr_type_test);
-			return false;
-		}
 		return true;
-
 	case ODEBUG_STATE_ACTIVE:
 		debug_object_deactivate(obj, &descr_type_test);
 		debug_object_activate(obj, &descr_type_test);
@@ -916,6 +930,7 @@ out:
 
 static __initdata struct debug_obj_descr descr_type_test = {
 	.name			= "selftest",
+	.is_static_object	= is_static_object,
 	.fixup_init		= fixup_init,
 	.fixup_activate		= fixup_activate,
 	.fixup_destroy		= fixup_destroy,
-- 
cgit v1.2.3


From 815613da6a67c196d7458d0e6c278ea88e21933f Mon Sep 17 00:00:00 2001
From: Richard Cochran <rcochran@linutronix.de>
Date: Thu, 19 May 2016 17:09:56 -0700
Subject: kernel/padata.c: removed unused code

By accident I stumbled across code that has never been used.  This
driver has EXPORT_SYMBOL functions, and the only user of the code is
pcrypt.c, but this only uses a subset of the exported symbols.

According to 'git log -G', the functions, padata_set_cpumasks,
padata_add_cpu, and padata_remove_cpu have never been used since they
were first introduced.  This patch removes the unused code.

On one 64 bit build, with CRYPTO_PCRYPT built in, the text is more than
4k smaller.

  kbuild_hp> size $KBUILD_OUTPUT/vmlinux
      text    data     bss      dec hex    filename
  10566658 4678360 1122304 16367322 f9beda vmlinux
  10561984 4678360 1122304 16362648 f9ac98 vmlinux

On another config, 32 bit, the saving is about 0.5k bytes.

  kbuild_hp-x86> size $KBUILD_OUTPUT/vmlinux
  6012005 2409513 2785280 11206798 ab008e vmlinux
  6011491 2409513 2785280 11206284 aafe8c vmlinux

Signed-off-by: Richard Cochran <rcochran@linutronix.de>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/padata.h |  5 ----
 kernel/padata.c        | 64 --------------------------------------------------
 2 files changed, 69 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 438694650471..113ee626a4dc 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -175,11 +175,6 @@ extern int padata_do_parallel(struct padata_instance *pinst,
 extern void padata_do_serial(struct padata_priv *padata);
 extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
 			      cpumask_var_t cpumask);
-extern int padata_set_cpumasks(struct padata_instance *pinst,
-			       cpumask_var_t pcpumask,
-			       cpumask_var_t cbcpumask);
-extern int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask);
-extern int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask);
 extern int padata_start(struct padata_instance *pinst);
 extern void padata_stop(struct padata_instance *pinst);
 extern int padata_register_cpumask_notifier(struct padata_instance *pinst,
diff --git a/kernel/padata.c b/kernel/padata.c
index b38bea9c466a..67ddd4acde9d 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -606,33 +606,6 @@ out_replace:
 	return 0;
 }
 
-/**
- * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
- *                       one is used by parallel workers and the second one
- *                       by the wokers doing serialization.
- *
- * @pinst: padata instance
- * @pcpumask: the cpumask to use for parallel workers
- * @cbcpumask: the cpumsak to use for serial workers
- */
-int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
-			cpumask_var_t cbcpumask)
-{
-	int err;
-
-	mutex_lock(&pinst->lock);
-	get_online_cpus();
-
-	err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
-
-	put_online_cpus();
-	mutex_unlock(&pinst->lock);
-
-	return err;
-
-}
-EXPORT_SYMBOL(padata_set_cpumasks);
-
 /**
  * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value
  *                     equivalent to @cpumask.
@@ -694,42 +667,6 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
 	return 0;
 }
 
- /**
- * padata_add_cpu - add a cpu to one or both(parallel and serial)
- *                  padata cpumasks.
- *
- * @pinst: padata instance
- * @cpu: cpu to add
- * @mask: bitmask of flags specifying to which cpumask @cpu shuld be added.
- *        The @mask may be any combination of the following flags:
- *          PADATA_CPU_SERIAL   - serial cpumask
- *          PADATA_CPU_PARALLEL - parallel cpumask
- */
-
-int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
-{
-	int err;
-
-	if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
-		return -EINVAL;
-
-	mutex_lock(&pinst->lock);
-
-	get_online_cpus();
-	if (mask & PADATA_CPU_SERIAL)
-		cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
-	if (mask & PADATA_CPU_PARALLEL)
-		cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
-
-	err = __padata_add_cpu(pinst, cpu);
-	put_online_cpus();
-
-	mutex_unlock(&pinst->lock);
-
-	return err;
-}
-EXPORT_SYMBOL(padata_add_cpu);
-
 static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
 {
 	struct parallel_data *pd = NULL;
@@ -1091,7 +1028,6 @@ err_free_inst:
 err:
 	return NULL;
 }
-EXPORT_SYMBOL(padata_alloc);
 
 /**
  * padata_free - free a padata instance
-- 
cgit v1.2.3


From 19d795b677bda354644cfb87a196b087fdc2a965 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 19 May 2016 17:09:59 -0700
Subject: kernel/padata.c: hide unused functions

A recent cleanup removed some exported functions that were not used
anywhere, which in turn exposed the fact that some other functions in
the same file are only used in some configurations.

We now get a warning about them when CONFIG_HOTPLUG_CPU is disabled:

  kernel/padata.c:670:12: error: '__padata_remove_cpu' defined but not used [-Werror=unused-function]
   static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
              ^~~~~~~~~~~~~~~~~~~
  kernel/padata.c:650:12: error: '__padata_add_cpu' defined but not used [-Werror=unused-function]
   static int __padata_add_cpu(struct padata_instance *pinst, int cpu)

This rearranges the code so the __padata_remove_cpu/__padata_add_cpu
functions are within the #ifdef that protects the code that calls them.

[akpm@linux-foundation.org: coding-style fixes]
Fixes: 4ba6d78c671e ("kernel/padata.c: removed unused code")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Richard Cochran <rcochran@linutronix.de>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/padata.c | 74 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 37 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/padata.c b/kernel/padata.c
index 67ddd4acde9d..993278895ccc 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -647,6 +647,43 @@ out:
 }
 EXPORT_SYMBOL(padata_set_cpumask);
 
+/**
+ * padata_start - start the parallel processing
+ *
+ * @pinst: padata instance to start
+ */
+int padata_start(struct padata_instance *pinst)
+{
+	int err = 0;
+
+	mutex_lock(&pinst->lock);
+
+	if (pinst->flags & PADATA_INVALID)
+		err = -EINVAL;
+
+	 __padata_start(pinst);
+
+	mutex_unlock(&pinst->lock);
+
+	return err;
+}
+EXPORT_SYMBOL(padata_start);
+
+/**
+ * padata_stop - stop the parallel processing
+ *
+ * @pinst: padata instance to stop
+ */
+void padata_stop(struct padata_instance *pinst)
+{
+	mutex_lock(&pinst->lock);
+	__padata_stop(pinst);
+	mutex_unlock(&pinst->lock);
+}
+EXPORT_SYMBOL(padata_stop);
+
+#ifdef CONFIG_HOTPLUG_CPU
+
 static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
 {
 	struct parallel_data *pd;
@@ -726,43 +763,6 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
 }
 EXPORT_SYMBOL(padata_remove_cpu);
 
-/**
- * padata_start - start the parallel processing
- *
- * @pinst: padata instance to start
- */
-int padata_start(struct padata_instance *pinst)
-{
-	int err = 0;
-
-	mutex_lock(&pinst->lock);
-
-	if (pinst->flags & PADATA_INVALID)
-		err =-EINVAL;
-
-	 __padata_start(pinst);
-
-	mutex_unlock(&pinst->lock);
-
-	return err;
-}
-EXPORT_SYMBOL(padata_start);
-
-/**
- * padata_stop - stop the parallel processing
- *
- * @pinst: padata instance to stop
- */
-void padata_stop(struct padata_instance *pinst)
-{
-	mutex_lock(&pinst->lock);
-	__padata_stop(pinst);
-	mutex_unlock(&pinst->lock);
-}
-EXPORT_SYMBOL(padata_stop);
-
-#ifdef CONFIG_HOTPLUG_CPU
-
 static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
 {
 	return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
-- 
cgit v1.2.3


From 0139aa7b7fa12ceef095d99dc36606a5b10ab83a Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:49 -0700
Subject: mm: rename _count, field of the struct page, to _refcount

Many developers already know that field for reference count of the
struct page is _count and atomic type.  They would try to handle it
directly and this could break the purpose of page reference count
tracepoint.  To prevent direct _count modification, this patch rename it
to _refcount and add warning message on the code.  After that, developer
who need to handle reference count will find that field should not be
accessed directly.

[akpm@linux-foundation.org: fix comments, per Vlastimil]
[akpm@linux-foundation.org: Documentation/vm/transhuge.txt too]
[sfr@canb.auug.org.au: sync ethernet driver changes]
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Sunil Goutham <sgoutham@cavium.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Manish Chopra <manish.chopra@qlogic.com>
Cc: Yuval Mintz <yuval.mintz@qlogic.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/vm/transhuge.txt                  | 10 +++++-----
 arch/tile/mm/init.c                             |  2 +-
 drivers/block/aoe/aoecmd.c                      |  2 +-
 drivers/hwtracing/intel_th/msu.c                |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 20 +++++++++----------
 drivers/net/ethernet/qlogic/qede/qede_main.c    |  4 ++--
 fs/proc/page.c                                  |  2 +-
 include/linux/mm.h                              |  2 +-
 include/linux/mm_types.h                        | 14 ++++++++-----
 include/linux/page_ref.h                        | 26 ++++++++++++-------------
 include/linux/pagemap.h                         |  8 ++++----
 kernel/kexec_core.c                             |  2 +-
 mm/huge_memory.c                                |  4 ++--
 mm/internal.h                                   |  2 +-
 mm/page_alloc.c                                 |  4 ++--
 mm/slub.c                                       |  4 ++--
 mm/vmscan.c                                     |  4 ++--
 17 files changed, 58 insertions(+), 54 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index d9cb65cf5cfd..fb0e1f2a19cc 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -394,9 +394,9 @@ hugepage natively. Once finished you can drop the page table lock.
 Refcounting on THP is mostly consistent with refcounting on other compound
 pages:
 
-  - get_page()/put_page() and GUP operate in head page's ->_count.
+  - get_page()/put_page() and GUP operate in head page's ->_refcount.
 
-  - ->_count in tail pages is always zero: get_page_unless_zero() never
+  - ->_refcount in tail pages is always zero: get_page_unless_zero() never
     succeed on tail pages.
 
   - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
@@ -426,15 +426,15 @@ requests to split pinned huge page: it expects page count to be equal to
 sum of mapcount of all sub-pages plus one (split_huge_page caller must
 have reference for head page).
 
-split_huge_page uses migration entries to stabilize page->_count and
+split_huge_page uses migration entries to stabilize page->_refcount and
 page->_mapcount.
 
 We safe against physical memory scanners too: the only legitimate way
 scanner can get reference to a page is get_page_unless_zero().
 
-All tail pages has zero ->_count until atomic_add(). It prevent scanner
+All tail pages has zero ->_refcount until atomic_add(). It prevent scanner
 from geting reference to tail page up to the point. After the atomic_add()
-we don't care about ->_count value.  We already known how many references
+we don't care about ->_refcount value.  We already known how many references
 with should uncharge from head page.
 
 For head page get_page_unless_zero() will succeed and we don't mind. It's
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index a0582b7f41d3..adce25462b0d 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -679,7 +679,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end)
 			 * Hacky direct set to avoid unnecessary
 			 * lock take/release for EVERY page here.
 			 */
-			p->_count.counter = 0;
+			p->_refcount.counter = 0;
 			p->_mapcount.counter = -1;
 		}
 		init_page_count(page);
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 437b3a822f44..d597e432e195 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -861,7 +861,7 @@ rqbiocnt(struct request *r)
  * discussion.
  *
  * We cannot use get_page in the workaround, because it insists on a
- * positive page count as a precondition.  So we use _count directly.
+ * positive page count as a precondition.  So we use _refcount directly.
  */
 static void
 bio_pageinc(struct bio *bio)
diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c
index d9d6022c5aca..d2209147dc89 100644
--- a/drivers/hwtracing/intel_th/msu.c
+++ b/drivers/hwtracing/intel_th/msu.c
@@ -1164,7 +1164,7 @@ static void msc_mmap_close(struct vm_area_struct *vma)
 	if (!atomic_dec_and_mutex_lock(&msc->mmap_count, &msc->buf_mutex))
 		return;
 
-	/* drop page _counts */
+	/* drop page _refcounts */
 	for (pg = 0; pg < msc->nr_pages; pg++) {
 		struct page *page = msc_buffer_get_page(msc, pg);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index f3456798c596..bd947704b59c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -433,8 +433,8 @@ static int mlx5e_alloc_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
 	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
 		if (unlikely(mlx5e_alloc_and_map_page(rq, wi, i)))
 			goto err_unmap;
-		atomic_add(mlx5e_mpwqe_strides_per_page(rq),
-			   &wi->umr.dma_info[i].page->_count);
+		page_ref_add(wi->umr.dma_info[i].page,
+			     mlx5e_mpwqe_strides_per_page(rq));
 		wi->skbs_frags[i] = 0;
 	}
 
@@ -452,8 +452,8 @@ err_unmap:
 	while (--i >= 0) {
 		dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE,
 			       PCI_DMA_FROMDEVICE);
-		atomic_sub(mlx5e_mpwqe_strides_per_page(rq),
-			   &wi->umr.dma_info[i].page->_count);
+		page_ref_sub(wi->umr.dma_info[i].page,
+			     mlx5e_mpwqe_strides_per_page(rq));
 		put_page(wi->umr.dma_info[i].page);
 	}
 	dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE);
@@ -477,8 +477,8 @@ void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
 	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
 		dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE,
 			       PCI_DMA_FROMDEVICE);
-		atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i],
-			   &wi->umr.dma_info[i].page->_count);
+		page_ref_sub(wi->umr.dma_info[i].page,
+			mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]);
 		put_page(wi->umr.dma_info[i].page);
 	}
 	dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE);
@@ -527,8 +527,8 @@ static int mlx5e_alloc_rx_linear_mpwqe(struct mlx5e_rq *rq,
 	 */
 	split_page(wi->dma_info.page, MLX5_MPWRQ_WQE_PAGE_ORDER);
 	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
-		atomic_add(mlx5e_mpwqe_strides_per_page(rq),
-			   &wi->dma_info.page[i]._count);
+		page_ref_add(&wi->dma_info.page[i],
+			     mlx5e_mpwqe_strides_per_page(rq));
 		wi->skbs_frags[i] = 0;
 	}
 
@@ -551,8 +551,8 @@ void mlx5e_free_rx_linear_mpwqe(struct mlx5e_rq *rq,
 	dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz,
 		       PCI_DMA_FROMDEVICE);
 	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
-		atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i],
-			   &wi->dma_info.page[i]._count);
+		page_ref_sub(&wi->dma_info.page[i],
+			mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]);
 		put_page(&wi->dma_info.page[i]);
 	}
 }
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 3aabfc0adefe..73dd525fbf08 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1036,7 +1036,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 		/* Incr page ref count to reuse on allocation failure
 		 * so that it doesn't get freed while freeing SKB.
 		 */
-		atomic_inc(&current_bd->data->_count);
+		page_ref_inc(current_bd->data);
 		goto out;
 	}
 
@@ -1487,7 +1487,7 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget)
 				 * freeing SKB.
 				 */
 
-				atomic_inc(&sw_rx_data->data->_count);
+				page_ref_inc(sw_rx_data->data);
 				rxq->rx_alloc_errors++;
 				qede_recycle_rx_bd_ring(rxq, edev,
 							fp_cqe->bd_num);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 712f1b9992cc..3ecd445e830d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page)
 
 
 	/*
-	 * Caveats on high order pages: page->_count will only be set
+	 * Caveats on high order pages: page->_refcount will only be set
 	 * -1 on the head page; SLUB/SLQB do the same for PG_slab;
 	 * SLOB won't set PG_slab at all on compound pages.
 	 */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 727f799757ab..1193a54ea2b3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -734,7 +734,7 @@ static inline void get_page(struct page *page)
 	page = compound_head(page);
 	/*
 	 * Getting a normal page or the head of a compound page
-	 * requires to already have an elevated page->_count.
+	 * requires to already have an elevated page->_refcount.
 	 */
 	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
 	page_ref_inc(page);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c2d75b4fa86c..1fda9c99ef95 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -73,9 +73,9 @@ struct page {
 			unsigned long counters;
 #else
 			/*
-			 * Keep _count separate from slub cmpxchg_double data.
-			 * As the rest of the double word is protected by
-			 * slab_lock but _count is not.
+			 * Keep _refcount separate from slub cmpxchg_double
+			 * data.  As the rest of the double word is protected by
+			 * slab_lock but _refcount is not.
 			 */
 			unsigned counters;
 #endif
@@ -97,7 +97,11 @@ struct page {
 					};
 					int units;	/* SLOB */
 				};
-				atomic_t _count;		/* Usage count, see below. */
+				/*
+				 * Usage count, *USE WRAPPER FUNCTION*
+				 * when manual accounting. See page_ref.h
+				 */
+				atomic_t _refcount;
 			};
 			unsigned int active;	/* SLAB */
 		};
@@ -248,7 +252,7 @@ struct page_frag_cache {
 	__u32 offset;
 #endif
 	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_count every time we allocate a fragment.
+	 * containing page->_refcount every time we allocate a fragment.
 	 */
 	unsigned int		pagecnt_bias;
 	bool pfmemalloc;
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index e596d5d9540e..8b5e0a9f2431 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -63,17 +63,17 @@ static inline void __page_ref_unfreeze(struct page *page, int v)
 
 static inline int page_ref_count(struct page *page)
 {
-	return atomic_read(&page->_count);
+	return atomic_read(&page->_refcount);
 }
 
 static inline int page_count(struct page *page)
 {
-	return atomic_read(&compound_head(page)->_count);
+	return atomic_read(&compound_head(page)->_refcount);
 }
 
 static inline void set_page_count(struct page *page, int v)
 {
-	atomic_set(&page->_count, v);
+	atomic_set(&page->_refcount, v);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_set))
 		__page_ref_set(page, v);
 }
@@ -89,35 +89,35 @@ static inline void init_page_count(struct page *page)
 
 static inline void page_ref_add(struct page *page, int nr)
 {
-	atomic_add(nr, &page->_count);
+	atomic_add(nr, &page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, nr);
 }
 
 static inline void page_ref_sub(struct page *page, int nr)
 {
-	atomic_sub(nr, &page->_count);
+	atomic_sub(nr, &page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, -nr);
 }
 
 static inline void page_ref_inc(struct page *page)
 {
-	atomic_inc(&page->_count);
+	atomic_inc(&page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, 1);
 }
 
 static inline void page_ref_dec(struct page *page)
 {
-	atomic_dec(&page->_count);
+	atomic_dec(&page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, -1);
 }
 
 static inline int page_ref_sub_and_test(struct page *page, int nr)
 {
-	int ret = atomic_sub_and_test(nr, &page->_count);
+	int ret = atomic_sub_and_test(nr, &page->_refcount);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
 		__page_ref_mod_and_test(page, -nr, ret);
@@ -126,7 +126,7 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
 
 static inline int page_ref_dec_and_test(struct page *page)
 {
-	int ret = atomic_dec_and_test(&page->_count);
+	int ret = atomic_dec_and_test(&page->_refcount);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
 		__page_ref_mod_and_test(page, -1, ret);
@@ -135,7 +135,7 @@ static inline int page_ref_dec_and_test(struct page *page)
 
 static inline int page_ref_dec_return(struct page *page)
 {
-	int ret = atomic_dec_return(&page->_count);
+	int ret = atomic_dec_return(&page->_refcount);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return))
 		__page_ref_mod_and_return(page, -1, ret);
@@ -144,7 +144,7 @@ static inline int page_ref_dec_return(struct page *page)
 
 static inline int page_ref_add_unless(struct page *page, int nr, int u)
 {
-	int ret = atomic_add_unless(&page->_count, nr, u);
+	int ret = atomic_add_unless(&page->_refcount, nr, u);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless))
 		__page_ref_mod_unless(page, nr, ret);
@@ -153,7 +153,7 @@ static inline int page_ref_add_unless(struct page *page, int nr, int u)
 
 static inline int page_ref_freeze(struct page *page, int count)
 {
-	int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count);
+	int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze))
 		__page_ref_freeze(page, count, ret);
@@ -165,7 +165,7 @@ static inline void page_ref_unfreeze(struct page *page, int count)
 	VM_BUG_ON_PAGE(page_count(page) != 0, page);
 	VM_BUG_ON(count == 0);
 
-	atomic_set(&page->_count, count);
+	atomic_set(&page->_refcount, count);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze))
 		__page_ref_unfreeze(page, count);
 }
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7e1ab155c67c..fe1513ffb7bf 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -90,12 +90,12 @@ void release_pages(struct page **pages, int nr, bool cold);
 
 /*
  * speculatively take a reference to a page.
- * If the page is free (_count == 0), then _count is untouched, and 0
- * is returned. Otherwise, _count is incremented by 1 and 1 is returned.
+ * If the page is free (_refcount == 0), then _refcount is untouched, and 0
+ * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned.
  *
  * This function must be called inside the same rcu_read_lock() section as has
  * been used to lookup the page in the pagecache radix-tree (or page table):
- * this allows allocators to use a synchronize_rcu() to stabilize _count.
+ * this allows allocators to use a synchronize_rcu() to stabilize _refcount.
  *
  * Unless an RCU grace period has passed, the count of all pages coming out
  * of the allocator must be considered unstable. page_count may return higher
@@ -111,7 +111,7 @@ void release_pages(struct page **pages, int nr, bool cold);
  * 2. conditionally increment refcount
  * 3. check the page is still in pagecache (if no, goto 1)
  *
- * Remove-side that cares about stability of _count (eg. reclaim) has the
+ * Remove-side that cares about stability of _refcount (eg. reclaim) has the
  * following (with tree_lock held for write):
  * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
  * B. remove page from pagecache
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 1391d3ee3b86..1c03dfb4abfd 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1410,7 +1410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_STRUCT_SIZE(list_head);
 	VMCOREINFO_SIZE(nodemask_t);
 	VMCOREINFO_OFFSET(page, flags);
-	VMCOREINFO_OFFSET(page, _count);
+	VMCOREINFO_OFFSET(page, _refcount);
 	VMCOREINFO_OFFSET(page, mapping);
 	VMCOREINFO_OFFSET(page, lru);
 	VMCOREINFO_OFFSET(page, _mapcount);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b49ee126d4d1..f8ac8f582fd8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3113,7 +3113,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
 	VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
 
 	/*
-	 * tail_page->_count is zero and not changing from under us. But
+	 * tail_page->_refcount is zero and not changing from under us. But
 	 * get_page_unless_zero() may be running from under us on the
 	 * tail_page. If we used atomic_set() below instead of atomic_inc(), we
 	 * would then run atomic_set() concurrently with
@@ -3340,7 +3340,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	if (mlocked)
 		lru_add_drain();
 
-	/* Prevent deferred_split_scan() touching ->_count */
+	/* Prevent deferred_split_scan() touching ->_refcount */
 	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 	count = page_count(head);
 	mapcount = total_mapcount(head);
diff --git a/mm/internal.h b/mm/internal.h
index b79abb6721cf..098a89e3b97c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -58,7 +58,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra,
 }
 
 /*
- * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * Turn a non-refcounted page (->_refcount == 0) into refcounted with
  * a count of one.
  */
 static inline void set_page_refcounted(struct page *page)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c1069efcc4d7..4ce57f938b7f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -794,7 +794,7 @@ static inline int free_pages_check(struct page *page)
 	if (unlikely(page->mapping != NULL))
 		bad_reason = "non-NULL mapping";
 	if (unlikely(page_ref_count(page) != 0))
-		bad_reason = "nonzero _count";
+		bad_reason = "nonzero _refcount";
 	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
 		bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
 		bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
@@ -6864,7 +6864,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 		 * We can't use page_count without pin a page
 		 * because another CPU can free compound page.
 		 * This check already skips compound tails of THP
-		 * because their page->_count is zero at all time.
+		 * because their page->_refcount is zero at all time.
 		 */
 		if (!page_ref_count(page)) {
 			if (PageBuddy(page))
diff --git a/mm/slub.c b/mm/slub.c
index 8671de2e5b12..cf1faa4d3992 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -329,8 +329,8 @@ static inline void set_page_slub_counters(struct page *page, unsigned long count
 	tmp.counters = counters_new;
 	/*
 	 * page->counters can cover frozen/inuse/objects as well
-	 * as page->_count.  If we assign to ->counters directly
-	 * we run the risk of losing updates to page->_count, so
+	 * as page->_refcount.  If we assign to ->counters directly
+	 * we run the risk of losing updates to page->_refcount, so
 	 * be careful and only assign to the fields we need.
 	 */
 	page->frozen  = tmp.frozen;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 142cb61f4822..d3a02ac3eed7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -633,7 +633,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	 *
 	 * Reversing the order of the tests ensures such a situation cannot
 	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
-	 * load is not satisfied before that of page->_count.
+	 * load is not satisfied before that of page->_refcount.
 	 *
 	 * Note that if SetPageDirty is always performed via set_page_dirty,
 	 * and thus under tree_lock, then this ordering is not required.
@@ -1720,7 +1720,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
  * It is safe to rely on PG_active against the non-LRU pages in here because
  * nobody will play with that bit on a non-LRU page.
  *
- * The downside is that we have to touch page->_count against each page.
+ * The downside is that we have to touch page->_refcount against each page.
  * But we had to alter page->flags anyway.
  */
 
-- 
cgit v1.2.3


From 0edaf86cf1a6a97d811fc34765ddbcbc310de564 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 19 May 2016 17:10:58 -0700
Subject: include/linux/nodemask.h: create next_node_in() helper

Lots of code does

	node = next_node(node, XXX);
	if (node == MAX_NUMNODES)
		node = first_node(XXX);

so create next_node_in() to do this and use it in various places.

[mhocko@suse.com: use next_node_in() helper]
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Hui Zhu <zhuhui@xiaomi.com>
Cc: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/tile/kernel/setup.c |  4 +---
 arch/x86/mm/numa.c       |  4 +---
 include/linux/nodemask.h | 11 ++++++++++-
 kernel/cpuset.c          |  8 +-------
 lib/Makefile             |  2 +-
 lib/nodemask.c           | 30 ++++++++++++++++++++++++++++++
 mm/hugetlb.c             |  4 +---
 mm/memcontrol.c          |  4 +---
 mm/mempolicy.c           | 24 ++----------------------
 mm/page_isolation.c      |  9 +++------
 mm/slab.c                | 13 +++----------
 11 files changed, 54 insertions(+), 59 deletions(-)
 create mode 100644 lib/nodemask.c

(limited to 'kernel')

diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index a992238e9b58..153020abd2f5 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -962,9 +962,7 @@ static void __init setup_numa_mapping(void)
 		cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]);
 		cpu_2_node[best_cpu] = node;
 		cpumask_clear_cpu(best_cpu, &unbound_cpus);
-		node = next_node(node, default_nodes);
-		if (node == MAX_NUMNODES)
-			node = first_node(default_nodes);
+		node = next_node_in(node, default_nodes);
 	}
 
 	/* Print out node assignments and set defaults for disabled cpus */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f70c1ff46125..9c086c57105c 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -617,9 +617,7 @@ static void __init numa_init_array(void)
 		if (early_cpu_to_node(i) != NUMA_NO_NODE)
 			continue;
 		numa_set_node(i, rr);
-		rr = next_node(rr, node_online_map);
-		if (rr == MAX_NUMNODES)
-			rr = first_node(node_online_map);
+		rr = next_node_in(rr, node_online_map);
 	}
 }
 
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 6e85889cf9ab..f746e44d4046 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -43,8 +43,10 @@
  *
  * int first_node(mask)			Number lowest set bit, or MAX_NUMNODES
  * int next_node(node, mask)		Next node past 'node', or MAX_NUMNODES
+ * int next_node_in(node, mask)		Next node past 'node', or wrap to first,
+ *					or MAX_NUMNODES
  * int first_unset_node(mask)		First node not set in mask, or 
- *					MAX_NUMNODES.
+ *					MAX_NUMNODES
  *
  * nodemask_t nodemask_of_node(node)	Return nodemask with bit 'node' set
  * NODE_MASK_ALL			Initializer - all bits set
@@ -259,6 +261,13 @@ static inline int __next_node(int n, const nodemask_t *srcp)
 	return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
 }
 
+/*
+ * Find the next present node in src, starting after node n, wrapping around to
+ * the first node in src if needed.  Returns MAX_NUMNODES if src is empty.
+ */
+#define next_node_in(n, src) __next_node_in((n), &(src))
+int __next_node_in(int node, const nodemask_t *srcp);
+
 static inline void init_nodemask_of_node(nodemask_t *mask, int node)
 {
 	nodes_clear(*mask);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1902956baba1..611cc69af8f0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
 
 static int cpuset_spread_node(int *rotor)
 {
-	int node;
-
-	node = next_node(*rotor, current->mems_allowed);
-	if (node == MAX_NUMNODES)
-		node = first_node(current->mems_allowed);
-	*rotor = node;
-	return node;
+	return *rotor = next_node_in(*rotor, current->mems_allowed);
 }
 
 int cpuset_mem_spread_node(void)
diff --git a/lib/Makefile b/lib/Makefile
index 931396ada5eb..42b69185f963 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,7 +25,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o md5.o irq_regs.o argv_split.o \
 	 flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-	 earlycpio.o seq_buf.o nmi_backtrace.o
+	 earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o
 
 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
 lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/nodemask.c b/lib/nodemask.c
new file mode 100644
index 000000000000..e42a5bf44d33
--- /dev/null
+++ b/lib/nodemask.c
@@ -0,0 +1,30 @@
+#include <linux/nodemask.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+int __next_node_in(int node, const nodemask_t *srcp)
+{
+	int ret = __next_node(node, srcp);
+
+	if (ret == MAX_NUMNODES)
+		ret = __first_node(srcp);
+	return ret;
+}
+EXPORT_SYMBOL(__next_node_in);
+
+#ifdef CONFIG_NUMA
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ * (returns NUMA_NO_NODE if nodemask is empty)
+ */
+int node_random(const nodemask_t *maskp)
+{
+	int w, bit = NUMA_NO_NODE;
+
+	w = nodes_weight(*maskp);
+	if (w)
+		bit = bitmap_ord_to_pos(maskp->bits,
+			get_random_int() % w, MAX_NUMNODES);
+	return bit;
+}
+#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 19d0d08b396f..5856093f9062 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -937,9 +937,7 @@ err:
  */
 static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
 {
-	nid = next_node(nid, *nodes_allowed);
-	if (nid == MAX_NUMNODES)
-		nid = first_node(*nodes_allowed);
+	nid = next_node_in(nid, *nodes_allowed);
 	VM_BUG_ON(nid >= MAX_NUMNODES);
 
 	return nid;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fe787f5c41bd..6740c4c2b550 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1389,9 +1389,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	mem_cgroup_may_update_nodemask(memcg);
 	node = memcg->last_scanned_node;
 
-	node = next_node(node, memcg->scan_nodes);
-	if (node == MAX_NUMNODES)
-		node = first_node(memcg->scan_nodes);
+	node = next_node_in(node, memcg->scan_nodes);
 	/*
 	 * We call this when we hit limit, not when pages are added to LRU.
 	 * No LRU may hold pages because all pages are UNEVICTABLE or
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 36cc01bc950a..8d369cee0cd6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -97,7 +97,6 @@
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
-#include <linux/random.h>
 
 #include "internal.h"
 
@@ -347,9 +346,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 		BUG();
 
 	if (!node_isset(current->il_next, tmp)) {
-		current->il_next = next_node(current->il_next, tmp);
-		if (current->il_next >= MAX_NUMNODES)
-			current->il_next = first_node(tmp);
+		current->il_next = next_node_in(current->il_next, tmp);
 		if (current->il_next >= MAX_NUMNODES)
 			current->il_next = numa_node_id();
 	}
@@ -1709,9 +1706,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	struct task_struct *me = current;
 
 	nid = me->il_next;
-	next = next_node(nid, policy->v.nodes);
-	if (next >= MAX_NUMNODES)
-		next = first_node(policy->v.nodes);
+	next = next_node_in(nid, policy->v.nodes);
 	if (next < MAX_NUMNODES)
 		me->il_next = next;
 	return nid;
@@ -1805,21 +1800,6 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 		return interleave_nodes(pol);
 }
 
-/*
- * Return the bit number of a random bit set in the nodemask.
- * (returns NUMA_NO_NODE if nodemask is empty)
- */
-int node_random(const nodemask_t *maskp)
-{
-	int w, bit = NUMA_NO_NODE;
-
-	w = nodes_weight(*maskp);
-	if (w)
-		bit = bitmap_ord_to_pos(maskp->bits,
-			get_random_int() % w, MAX_NUMNODES);
-	return bit;
-}
-
 #ifdef CONFIG_HUGETLBFS
 /*
  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c4f568206544..67bedd18429c 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -288,13 +288,10 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
 	 * accordance with memory policy of the user process if possible. For
 	 * now as a simple work-around, we use the next node for destination.
 	 */
-	if (PageHuge(page)) {
-		int node = next_online_node(page_to_nid(page));
-		if (node == MAX_NUMNODES)
-			node = first_online_node;
+	if (PageHuge(page))
 		return alloc_huge_page_node(page_hstate(compound_head(page)),
-					    node);
-	}
+					    next_node_in(page_to_nid(page),
+							 node_online_map));
 
 	if (PageHighMem(page))
 		gfp_mask |= __GFP_HIGHMEM;
diff --git a/mm/slab.c b/mm/slab.c
index d81565a92864..c11bf5007952 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -522,22 +522,15 @@ static DEFINE_PER_CPU(unsigned long, slab_reap_node);
 
 static void init_reap_node(int cpu)
 {
-	int node;
-
-	node = next_node(cpu_to_mem(cpu), node_online_map);
-	if (node == MAX_NUMNODES)
-		node = first_node(node_online_map);
-
-	per_cpu(slab_reap_node, cpu) = node;
+	per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu),
+						    node_online_map);
 }
 
 static void next_reap_node(void)
 {
 	int node = __this_cpu_read(slab_reap_node);
 
-	node = next_node(node, node_online_map);
-	if (unlikely(node >= MAX_NUMNODES))
-		node = first_node(node_online_map);
+	node = next_node_in(node, node_online_map);
 	__this_cpu_write(slab_reap_node, node);
 }
 
-- 
cgit v1.2.3


From 52b6f46bc163eef17ecba4cd552beeafe2b24453 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:50 -0700
Subject: mm: /proc/sys/vm/stat_refresh to force vmstat update

Provide /proc/sys/vm/stat_refresh to force an immediate update of
per-cpu into global vmstats: useful to avoid a sleep(2) or whatever
before checking counts when testing.  Originally added to work around a
bug which left counts stranded indefinitely on a cpu going idle (an
inaccuracy magnified when small below-batch numbers represent "huge"
amounts of memory), but I believe that bug is now fixed: nonetheless,
this is still a useful knob.

Its schedule_on_each_cpu() is probably too expensive just to fold into
reading /proc/meminfo itself: give this mode 0600 to prevent abuse.
Allow a write or a read to do the same: nothing to read, but "grep -h
Shmem /proc/sys/vm/stat_refresh /proc/meminfo" is convenient.  Oh, and
since global_page_state() itself is careful to disguise any underflow as
0, hack in an "Invalid argument" and pr_warn() if a counter is negative
after the refresh - this helped to fix a misaccounting of
NR_ISOLATED_FILE in my migration code.

But on recent kernels, I find that NR_ALLOC_BATCH and NR_PAGES_SCANNED
often go negative some of the time.  I have not yet worked out why, but
have no evidence that it's actually harmful.  Punt for the moment by
just ignoring the anomaly on those.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/vm.txt | 14 +++++++++++
 include/linux/vmstat.h      |  4 +++
 kernel/sysctl.c             |  7 ++++++
 mm/vmstat.c                 | 60 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 85 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 34a5fece3121..720355cbdf45 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm:
 - panic_on_oom
 - percpu_pagelist_fraction
 - stat_interval
+- stat_refresh
 - swappiness
 - user_reserve_kbytes
 - vfs_cache_pressure
@@ -755,6 +756,19 @@ is 1 second.
 
 ==============================================================
 
+stat_refresh
+
+Any read or write (by root only) flushes all the per-cpu vm statistics
+into their global totals, for more accurate reports when testing
+e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo
+
+As a side-effect, it also checks for negative totals (elsewhere reported
+as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
+(At time of writing, a few stats are known sometimes to be found negative,
+with no ill effects: errors and warnings on these stats are suppressed.)
+
+==============================================================
+
 swappiness
 
 This control is used to define how aggressive the kernel will swap
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 73fae8c4a5fb..02fce415b3d9 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -193,6 +193,10 @@ void quiet_vmstat(void);
 void cpu_vm_stats_fold(int cpu);
 void refresh_zone_stat_thresholds(void);
 
+struct ctl_table;
+int vmstat_refresh(struct ctl_table *, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos);
+
 void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
 
 int calculate_pressure_threshold(struct zone *zone);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8b318663525..2effd84d83e3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "stat_refresh",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0600,
+		.proc_handler	= vmstat_refresh,
+	},
 #endif
 #ifdef CONFIG_MMU
 	{
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a7de9adacbd9..c831be32a1a3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1379,6 +1379,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 int sysctl_stat_interval __read_mostly = HZ;
 static cpumask_var_t cpu_stat_off;
 
+#ifdef CONFIG_PROC_FS
+static void refresh_vm_stats(struct work_struct *work)
+{
+	refresh_cpu_vm_stats(true);
+}
+
+int vmstat_refresh(struct ctl_table *table, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	long val;
+	int err;
+	int i;
+
+	/*
+	 * The regular update, every sysctl_stat_interval, may come later
+	 * than expected: leaving a significant amount in per_cpu buckets.
+	 * This is particularly misleading when checking a quantity of HUGE
+	 * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
+	 * which can equally be echo'ed to or cat'ted from (by root),
+	 * can be used to update the stats just before reading them.
+	 *
+	 * Oh, and since global_page_state() etc. are so careful to hide
+	 * transiently negative values, report an error here if any of
+	 * the stats is negative, so we know to go looking for imbalance.
+	 */
+	err = schedule_on_each_cpu(refresh_vm_stats);
+	if (err)
+		return err;
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+		val = atomic_long_read(&vm_stat[i]);
+		if (val < 0) {
+			switch (i) {
+			case NR_ALLOC_BATCH:
+			case NR_PAGES_SCANNED:
+				/*
+				 * These are often seen to go negative in
+				 * recent kernels, but not to go permanently
+				 * negative.  Whilst it would be nicer not to
+				 * have exceptions, rooting them out would be
+				 * another task, of rather low priority.
+				 */
+				break;
+			default:
+				pr_warn("%s: %s %ld\n",
+					__func__, vmstat_text[i], val);
+				err = -EINVAL;
+				break;
+			}
+		}
+	}
+	if (err)
+		return err;
+	if (write)
+		*ppos += *lenp;
+	else
+		*lenp = 0;
+	return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
 static void vmstat_update(struct work_struct *w)
 {
 	if (refresh_cpu_vm_stats(true)) {
-- 
cgit v1.2.3


From 002f290627c27068087f6204baec7a334e5a3b48 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 19 May 2016 17:14:30 -0700
Subject: cpuset: use static key better and convert to new API

An important function for cpusets is cpuset_node_allowed(), which
optimizes on the fact if there's a single root CPU set, it must be
trivially allowed.  But the check "nr_cpusets() <= 1" doesn't use the
cpusets_enabled_key static key the right way where static keys eliminate
branching overhead with jump labels.

This patch converts it so that static key is used properly.  It's also
switched to the new static key API and the checking functions are
converted to return bool instead of int.  We also provide a new variant
__cpuset_zone_allowed() which expects that the static key check was
already done and they key was enabled.  This is needed for
get_page_from_freelist() where we want to also avoid the relatively
slower check when ALLOC_CPUSET is not set in alloc_flags.

The impact on the page allocator microbenchmark is less than expected
but the cleanup in itself is worthwhile.

                                             4.6.0-rc2                  4.6.0-rc2
                                       multcheck-v1r20               cpuset-v1r20
  Min      alloc-odr0-1               348.00 (  0.00%)           348.00 (  0.00%)
  Min      alloc-odr0-2               254.00 (  0.00%)           254.00 (  0.00%)
  Min      alloc-odr0-4               213.00 (  0.00%)           213.00 (  0.00%)
  Min      alloc-odr0-8               186.00 (  0.00%)           183.00 (  1.61%)
  Min      alloc-odr0-16              173.00 (  0.00%)           171.00 (  1.16%)
  Min      alloc-odr0-32              166.00 (  0.00%)           163.00 (  1.81%)
  Min      alloc-odr0-64              162.00 (  0.00%)           159.00 (  1.85%)
  Min      alloc-odr0-128             160.00 (  0.00%)           157.00 (  1.88%)
  Min      alloc-odr0-256             169.00 (  0.00%)           166.00 (  1.78%)
  Min      alloc-odr0-512             180.00 (  0.00%)           180.00 (  0.00%)
  Min      alloc-odr0-1024            188.00 (  0.00%)           187.00 (  0.53%)
  Min      alloc-odr0-2048            194.00 (  0.00%)           193.00 (  0.52%)
  Min      alloc-odr0-4096            199.00 (  0.00%)           198.00 (  0.50%)
  Min      alloc-odr0-8192            202.00 (  0.00%)           201.00 (  0.50%)
  Min      alloc-odr0-16384           203.00 (  0.00%)           202.00 (  0.49%)

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Zefan Li <lizefan@huawei.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpuset.h | 42 ++++++++++++++++++++++++++++--------------
 kernel/cpuset.c        | 14 +++++++-------
 mm/page_alloc.c        |  2 +-
 3 files changed, 36 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 85a868ccb493..bfc204e70338 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -16,26 +16,26 @@
 
 #ifdef CONFIG_CPUSETS
 
-extern struct static_key cpusets_enabled_key;
+extern struct static_key_false cpusets_enabled_key;
 static inline bool cpusets_enabled(void)
 {
-	return static_key_false(&cpusets_enabled_key);
+	return static_branch_unlikely(&cpusets_enabled_key);
 }
 
 static inline int nr_cpusets(void)
 {
 	/* jump label reference count + the top-level cpuset */
-	return static_key_count(&cpusets_enabled_key) + 1;
+	return static_key_count(&cpusets_enabled_key.key) + 1;
 }
 
 static inline void cpuset_inc(void)
 {
-	static_key_slow_inc(&cpusets_enabled_key);
+	static_branch_inc(&cpusets_enabled_key);
 }
 
 static inline void cpuset_dec(void)
 {
-	static_key_slow_dec(&cpusets_enabled_key);
+	static_branch_dec(&cpusets_enabled_key);
 }
 
 extern int cpuset_init(void);
@@ -48,16 +48,25 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
-extern int __cpuset_node_allowed(int node, gfp_t gfp_mask);
+extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask);
 
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
-	return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask);
+	if (cpusets_enabled())
+		return __cpuset_node_allowed(node, gfp_mask);
+	return true;
 }
 
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
-	return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+	return __cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+	if (cpusets_enabled())
+		return __cpuset_zone_allowed(z, gfp_mask);
+	return true;
 }
 
 extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
@@ -172,14 +181,19 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 	return 1;
 }
 
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
-	return 1;
+	return true;
 }
 
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
-	return 1;
+	return true;
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+	return true;
 }
 
 static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 611cc69af8f0..73e93e53884d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,7 +61,7 @@
 #include <linux/cgroup.h>
 #include <linux/wait.h>
 
-struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
+DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
 
 /* See "Frequency meter" comments, below. */
 
@@ -2528,27 +2528,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  */
-int __cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
 	struct cpuset *cs;		/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
 	unsigned long flags;
 
 	if (in_interrupt())
-		return 1;
+		return true;
 	if (node_isset(node, current->mems_allowed))
-		return 1;
+		return true;
 	/*
 	 * Allow tasks that have access to memory reserves because they have
 	 * been OOM killed to get memory anywhere.
 	 */
 	if (unlikely(test_thread_flag(TIF_MEMDIE)))
-		return 1;
+		return true;
 	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
-		return 0;
+		return false;
 
 	if (current->flags & PF_EXITING) /* Let dying task have memory */
-		return 1;
+		return true;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
 	spin_lock_irqsave(&callback_lock, flags);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bdf7a13311b5..39c441bb8d61 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2859,7 +2859,7 @@ zonelist_scan:
 
 		if (cpusets_enabled() &&
 			(alloc_flags & ALLOC_CPUSET) &&
-			!cpuset_zone_allowed(zone, gfp_mask))
+			!__cpuset_zone_allowed(zone, gfp_mask))
 				continue;
 		/*
 		 * Distribute pages in proportion to the individual
-- 
cgit v1.2.3


From 6112a300c9e41993cc0dc56ac393743d28381284 Mon Sep 17 00:00:00 2001
From: Soumya PN <soumya.p.n@hpe.com>
Date: Tue, 17 May 2016 21:31:14 +0530
Subject: ftrace: Don't disable irqs when taking the tasklist_lock read_lock

In ftrace.c inside the function alloc_retstack_tasklist() (which will be
invoked when function_graph tracing is on) the tasklist_lock is being
held as reader while iterating through a list of threads. Here the lock
is being held as reader with irqs disabled. The tasklist_lock is never
write_locked in interrupt context so it is safe to not disable interrupts
for the duration of read_lock in this block which, can be significant,
given the block of code iterates through all threads. Hence changing the
code to call read_lock() and read_unlock() instead of read_lock_irqsave()
and read_unlock_irqrestore().

A similar change was made in commits: 8063e41d2ffc ("tracing: Change
syscall_*regfunc() to check PF_KTHREAD and use for_each_process_thread()")'
and 3472eaa1f12e ("sched: normalize_rt_tasks(): Don't use _irqsave for
tasklist_lock, use task_rq_lock()")'

Link: http://lkml.kernel.org/r/1463500874-77480-1-git-send-email-soumya.p.n@hpe.com

Signed-off-by: Soumya PN <soumya.p.n@hpe.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b1870fbd2b67..a6804823a058 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5713,7 +5713,6 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 {
 	int i;
 	int ret = 0;
-	unsigned long flags;
 	int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
 	struct task_struct *g, *t;
 
@@ -5729,7 +5728,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 		}
 	}
 
-	read_lock_irqsave(&tasklist_lock, flags);
+	read_lock(&tasklist_lock);
 	do_each_thread(g, t) {
 		if (start == end) {
 			ret = -EAGAIN;
@@ -5747,7 +5746,7 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 	} while_each_thread(g, t);
 
 unlock:
-	read_unlock_irqrestore(&tasklist_lock, flags);
+	read_unlock(&tasklist_lock);
 free:
 	for (i = start; i < end; i++)
 		kfree(ret_stack_list[i]);
-- 
cgit v1.2.3


From b7552e1bccbe3da9c8e7386c6188e8ea4667c8e7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 18 May 2016 14:14:28 +0200
Subject: bpf: rather use get_random_int for randomizations

Start address randomization and blinding in BPF currently use
prandom_u32(). prandom_u32() values are not exposed to unpriviledged
user space to my knowledge, but given other kernel facilities such as
ASLR, stack canaries, etc make use of stronger get_random_int(), we
better make use of it here as well given blinding requests successively
new random values. get_random_int() has minimal entropy pool depletion,
is not cryptographically secure, but doesn't need to be for our use
cases here.

Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f1e8a0def99b..b94a36550591 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -231,7 +231,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 	hdr->pages = size / PAGE_SIZE;
 	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
 		     PAGE_SIZE - sizeof(*hdr));
-	start = (prandom_u32() % hole) & ~(alignment - 1);
+	start = (get_random_int() % hole) & ~(alignment - 1);
 
 	/* Leave a random number of instructions before BPF code. */
 	*image_ptr = &hdr->image[start];
@@ -251,7 +251,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
 			      struct bpf_insn *to_buff)
 {
 	struct bpf_insn *to = to_buff;
-	u32 imm_rnd = prandom_u32();
+	u32 imm_rnd = get_random_int();
 	s16 off;
 
 	BUILD_BUG_ON(BPF_REG_AX  + 1 != MAX_BPF_JIT_REG);
-- 
cgit v1.2.3


From e27f4a942a0ee4b84567a3c6cfa84f273e55cbb7 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 20 May 2016 17:22:48 -0500
Subject: bpf: Use mount_nodev not mount_ns to mount the bpf filesystem

While reviewing the filesystems that set FS_USERNS_MOUNT I spotted the
bpf filesystem.  Looking at the code I saw a broken usage of mount_ns
with current->nsproxy->mnt_ns. As the code does not acquire a
reference to the mount namespace it can not possibly be correct to
store the mount namespace on the superblock as it does.

Replace mount_ns with mount_nodev so that each mount of the bpf
filesystem returns a distinct instance, and the code is not buggy.

In discussion with Hannes Frederic Sowa it was reported that the use
of mount_ns was an attempt to have one bpf instance per mount
namespace, in an attempt to keep resources that pin resources from
hiding.  That intent simply does not work, the vfs is not built to
allow that kind of behavior.  Which means that the bpf filesystem
really is buggy both semantically and in it's implemenation as it does
not nor can it implement the original intent.

This change is userspace visible, but my experience with similar
filesystems leads me to believe nothing will break with a model of each
mount of the bpf filesystem is distinct from all others.

Fixes: b2197755b263 ("bpf: add support for persistent maps/progs")
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 71b75d9c81da..04be7021f848 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -357,7 +357,7 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
 static struct dentry *bpf_mount(struct file_system_type *type, int flags,
 				const char *dev_name, void *data)
 {
-	return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super);
+	return mount_nodev(type, flags, data, bpf_fill_super);
 }
 
 static struct file_system_type bpf_fs_type = {
-- 
cgit v1.2.3


From d91b28ed42de99217efb2e8cb0357263d6fb737c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 19 May 2016 18:17:13 -0700
Subject: bpf: support decreasing order in direct packet access

when packet headers are accessed in 'decreasing' order (like TCP port
may be fetched before the program reads IP src) the llvm may generate
the following code:
[...]                // R7=pkt(id=0,off=22,r=70)
r2 = *(u32 *)(r7 +0) // good access
[...]
r7 += 40             // R7=pkt(id=0,off=62,r=70)
r8 = *(u32 *)(r7 +0) // good access
[...]
r1 = *(u32 *)(r7 -20) // this one will fail though it's within a safe range
                      // it's doing *(u32*)(skb->data + 42)
Fix verifier to recognize such code pattern

Alos turned out that 'off > range' condition is not a verifier bug.
It's a buggy program that may do something like:
if (ptr + 50 > data_end)
  return 0;
ptr += 60;
*(u32*)ptr;
in such case emit
"invalid access to packet, off=0 size=4, R1(id=0,off=60,r=50)" error message,
so all information is available for the program author to fix the program.

Fixes: 969bf05eb3ce ("bpf: direct packet access")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a08d66215245..d54e34874579 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -683,15 +683,11 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off,
 {
 	struct reg_state *regs = env->cur_state.regs;
 	struct reg_state *reg = &regs[regno];
-	int linear_size = (int) reg->range - (int) reg->off;
 
-	if (linear_size < 0 || linear_size >= MAX_PACKET_OFF) {
-		verbose("verifier bug\n");
-		return -EFAULT;
-	}
-	if (off < 0 || off + size > linear_size) {
-		verbose("invalid access to packet, off=%d size=%d, allowed=%d\n",
-			off, size, linear_size);
+	off += reg->off;
+	if (off < 0 || off + size > reg->range) {
+		verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+			off, size, regno, reg->id, reg->off, reg->range);
 		return -EACCES;
 	}
 	return 0;
-- 
cgit v1.2.3


From 1b9b69ecb3a5236d4d3da0f0fa11af916371841e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Thu, 19 May 2016 18:17:14 -0700
Subject: bpf: teach verifier to recognize imm += ptr pattern

Humans don't write C code like:
  u8 *ptr = skb->data;
  int imm = 4;
  imm += ptr;
but from llvm backend point of view 'imm' and 'ptr' are registers and
imm += ptr may be preferred vs ptr += imm depending which register value
will be used further in the code, while verifier can only recognize ptr += imm.
That caused small unrelated changes in the C code of the bpf program to
trigger rejection by the verifier. Therefore teach the verifier to recognize
both ptr += imm and imm += ptr.
For example:
when R6=pkt(id=0,off=0,r=62) R7=imm22
after r7 += r6 instruction
will be R6=pkt(id=0,off=0,r=62) R7=pkt(id=0,off=22,r=62)

Fixes: 969bf05eb3ce ("bpf: direct packet access")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d54e34874579..668e07903c8f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1245,6 +1245,7 @@ static int check_packet_ptr_add(struct verifier_env *env, struct bpf_insn *insn)
 	struct reg_state *regs = env->cur_state.regs;
 	struct reg_state *dst_reg = &regs[insn->dst_reg];
 	struct reg_state *src_reg = &regs[insn->src_reg];
+	struct reg_state tmp_reg;
 	s32 imm;
 
 	if (BPF_SRC(insn->code) == BPF_K) {
@@ -1267,6 +1268,19 @@ add_imm:
 		 */
 		dst_reg->off += imm;
 	} else {
+		if (src_reg->type == PTR_TO_PACKET) {
+			/* R6=pkt(id=0,off=0,r=62) R7=imm22; r7 += r6 */
+			tmp_reg = *dst_reg;  /* save r7 state */
+			*dst_reg = *src_reg; /* copy pkt_ptr state r6 into r7 */
+			src_reg = &tmp_reg;  /* pretend it's src_reg state */
+			/* if the checks below reject it, the copy won't matter,
+			 * since we're rejecting the whole program. If all ok,
+			 * then imm22 state will be added to r7
+			 * and r7 will be pkt(id=0,off=22,r=62) while
+			 * r6 will stay as pkt(id=0,off=0,r=62)
+			 */
+		}
+
 		if (src_reg->type == CONST_IMM) {
 			/* pkt_ptr += reg where reg is known constant */
 			imm = src_reg->imm;
@@ -1565,7 +1579,9 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 			return 0;
 		} else if (opcode == BPF_ADD &&
 			   BPF_CLASS(insn->code) == BPF_ALU64 &&
-			   dst_reg->type == PTR_TO_PACKET) {
+			   (dst_reg->type == PTR_TO_PACKET ||
+			    (BPF_SRC(insn->code) == BPF_X &&
+			     regs[insn->src_reg].type == PTR_TO_PACKET))) {
 			/* ptr_to_packet += K|X */
 			return check_packet_ptr_add(env, insn);
 		} else if (BPF_CLASS(insn->code) == BPF_ALU64 &&
-- 
cgit v1.2.3


From ec8d7c14ea14922fe21945b458a75e39f11dd832 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:21 -0700
Subject: mm, oom_reaper: do not mmput synchronously from the oom reaper
 context

Tetsuo has properly noted that mmput slow path might get blocked waiting
for another party (e.g.  exit_aio waits for an IO).  If that happens the
oom_reaper would be put out of the way and will not be able to process
next oom victim.  We should strive for making this context as reliable
and independent on other subsystems as much as possible.

Introduce mmput_async which will perform the slow path from an async
(WQ) context.  This will delay the operation but that shouldn't be a
problem because the oom_reaper has reclaimed the victim's address space
for most cases as much as possible and the remaining context shouldn't
bind too much memory anymore.  The only exception is when mmap_sem
trylock has failed which shouldn't happen too often.

The issue is only theoretical but not impossible.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h |  2 ++
 include/linux/sched.h    |  5 +++++
 kernel/fork.c            | 50 +++++++++++++++++++++++++++++++++---------------
 mm/oom_kill.c            |  8 ++++++--
 4 files changed, 48 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1fda9c99ef95..d553855503e6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,7 @@
 #include <linux/cpumask.h>
 #include <linux/uprobes.h>
 #include <linux/page-flags-layout.h>
+#include <linux/workqueue.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -513,6 +514,7 @@ struct mm_struct {
 #ifdef CONFIG_HUGETLB_PAGE
 	atomic_long_t hugetlb_usage;
 #endif
+	struct work_struct async_put_work;
 };
 
 static inline void mm_init_cpumask(struct mm_struct *mm)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 40eabf176ce2..479e3cade7e9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2730,6 +2730,11 @@ static inline void mmdrop(struct mm_struct * mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
+/* same as above but performs the slow path from the async kontext. Can
+ * be called from the atomic context as well
+ */
+extern void mmput_async(struct mm_struct *);
+
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 3e8451527cbe..8fbed7194af1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -699,6 +699,26 @@ void __mmdrop(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
 
+static inline void __mmput(struct mm_struct *mm)
+{
+	VM_BUG_ON(atomic_read(&mm->mm_users));
+
+	uprobe_clear_state(mm);
+	exit_aio(mm);
+	ksm_exit(mm);
+	khugepaged_exit(mm); /* must run before exit_mmap */
+	exit_mmap(mm);
+	set_mm_exe_file(mm, NULL);
+	if (!list_empty(&mm->mmlist)) {
+		spin_lock(&mmlist_lock);
+		list_del(&mm->mmlist);
+		spin_unlock(&mmlist_lock);
+	}
+	if (mm->binfmt)
+		module_put(mm->binfmt->module);
+	mmdrop(mm);
+}
+
 /*
  * Decrement the use count and release all resources for an mm.
  */
@@ -706,24 +726,24 @@ void mmput(struct mm_struct *mm)
 {
 	might_sleep();
 
+	if (atomic_dec_and_test(&mm->mm_users))
+		__mmput(mm);
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+static void mmput_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+	__mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
 	if (atomic_dec_and_test(&mm->mm_users)) {
-		uprobe_clear_state(mm);
-		exit_aio(mm);
-		ksm_exit(mm);
-		khugepaged_exit(mm); /* must run before exit_mmap */
-		exit_mmap(mm);
-		set_mm_exe_file(mm, NULL);
-		if (!list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			list_del(&mm->mmlist);
-			spin_unlock(&mmlist_lock);
-		}
-		if (mm->binfmt)
-			module_put(mm->binfmt->module);
-		mmdrop(mm);
+		INIT_WORK(&mm->async_put_work, mmput_async_fn);
+		schedule_work(&mm->async_put_work);
 	}
 }
-EXPORT_SYMBOL_GPL(mmput);
 
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c0376efa79ec..c0e37dd1422f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -446,7 +446,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
-
 static bool __oom_reap_task(struct task_struct *tsk)
 {
 	struct mmu_gather tlb;
@@ -520,7 +519,12 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 */
 	set_bit(MMF_OOM_REAPED, &mm->flags);
 out:
-	mmput(mm);
+	/*
+	 * Drop our reference but make sure the mmput slow path is called from a
+	 * different context because we shouldn't risk we get stuck there and
+	 * put the oom_reaper out of the way.
+	 */
+	mmput_async(mm);
 	return ret;
 }
 
-- 
cgit v1.2.3


From e64646946ed32902fd597fa6e514b1da84642de3 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 20 May 2016 17:00:20 -0700
Subject: exit_thread: accept a task parameter to be exited

We need to call exit_thread from copy_process in a fail path.  So make it
accept task_struct as a parameter.

[v2]
* s390: exit_thread_runtime_instr doesn't make sense to be called for
  non-current tasks.
* arm: fix the comment in vfp_thread_copy
* change 'me' to 'tsk' for task_struct
* now we can change only archs that actually have exit_thread

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Liqin <liqin.linux@gmail.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: David Howells <dhowells@redhat.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com>
Cc: Lennox Wu <lennox.wu@gmail.com>
Cc: Ley Foon Tan <lftan@altera.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Steven Miao <realmz6@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/process.c           |  4 ++--
 arch/arm/vfp/vfpmodule.c            |  4 ----
 arch/avr32/kernel/process.c         |  4 ++--
 arch/cris/arch-v32/kernel/process.c |  4 ++--
 arch/ia64/kernel/perfmon.c          |  4 ++--
 arch/ia64/kernel/process.c          | 14 +++++++-------
 arch/metag/kernel/process.c         |  6 +++---
 arch/mn10300/kernel/process.c       |  4 ++--
 arch/s390/kernel/process.c          |  5 +++--
 arch/sh/kernel/process_64.c         |  5 ++---
 arch/sparc/kernel/process_32.c      | 12 ++++++------
 arch/sparc/kernel/process_64.c      |  4 ++--
 arch/tile/kernel/process.c          |  4 ++--
 arch/x86/kernel/process.c           |  5 ++---
 arch/xtensa/kernel/process.c        |  4 ++--
 include/linux/sched.h               |  4 ++--
 kernel/exit.c                       |  2 +-
 17 files changed, 42 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 4adfb46e3ee9..a647d6642f3e 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -193,9 +193,9 @@ EXPORT_SYMBOL_GPL(thread_notify_head);
 /*
  * Free current thread data structures etc..
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	thread_notify(THREAD_NOTIFY_EXIT, current_thread_info());
+	thread_notify(THREAD_NOTIFY_EXIT, task_thread_info(tsk));
 }
 
 void flush_thread(void)
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index 2a61e4b04600..73085d3482ed 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -156,10 +156,6 @@ static void vfp_thread_copy(struct thread_info *thread)
  *   - we could be preempted if tree preempt rcu is enabled, so
  *	it is unsafe to use thread->cpu.
  *  THREAD_NOTIFY_EXIT
- *   - the thread (v) will be running on the local CPU, so
- *	v === current_thread_info()
- *   - thread->cpu is the local CPU number at the time it is accessed,
- *	but may change at any time.
  *   - we could be preempted if tree preempt rcu is enabled, so
  *	it is unsafe to use thread->cpu.
  */
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index 42a53e740a7e..68e5b9dac059 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -62,9 +62,9 @@ void machine_restart(char *cmd)
 /*
  * Free current thread data structures etc
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	ocd_disable(current);
+	ocd_disable(tsk);
 }
 
 void flush_thread(void)
diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c
index c7ce784a393c..4d1afa9f9fd3 100644
--- a/arch/cris/arch-v32/kernel/process.c
+++ b/arch/cris/arch-v32/kernel/process.c
@@ -33,9 +33,9 @@ void default_idle(void)
  */
 
 extern void deconfigure_bp(long pid);
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	deconfigure_bp(current->pid);
+	deconfigure_bp(tsk->pid);
 }
 
 /*
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 9cd607b06964..2436ad5f92c1 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -4542,8 +4542,8 @@ pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *reg
 
 
 /*
- * called only from exit_thread(): task == current
- * we come here only if current has a context attached (loaded or masked)
+ * called only from exit_thread()
+ * we come here only if the task has a context attached (loaded or masked)
  */
 void
 pfm_exit_thread(struct task_struct *task)
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index b51514957620..aae6c4dc7ae7 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -570,22 +570,22 @@ flush_thread (void)
 }
 
 /*
- * Clean up state associated with current thread.  This is called when
+ * Clean up state associated with a thread.  This is called when
  * the thread calls exit().
  */
 void
-exit_thread (void)
+exit_thread (struct task_struct *tsk)
 {
 
-	ia64_drop_fpu(current);
+	ia64_drop_fpu(tsk);
 #ifdef CONFIG_PERFMON
        /* if needed, stop monitoring and flush state to perfmon context */
-	if (current->thread.pfm_context)
-		pfm_exit_thread(current);
+	if (tsk->thread.pfm_context)
+		pfm_exit_thread(tsk);
 
 	/* free debug register resources */
-	if (current->thread.flags & IA64_THREAD_DBG_VALID)
-		pfm_release_debug_registers(current);
+	if (tsk->thread.flags & IA64_THREAD_DBG_VALID)
+		pfm_release_debug_registers(tsk);
 #endif
 }
 
diff --git a/arch/metag/kernel/process.c b/arch/metag/kernel/process.c
index 7f546183a0f0..35062796edf2 100644
--- a/arch/metag/kernel/process.c
+++ b/arch/metag/kernel/process.c
@@ -345,10 +345,10 @@ void flush_thread(void)
 /*
  * Free current thread data structures etc.
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	clear_fpu(&current->thread);
-	clear_dsp(&current->thread);
+	clear_fpu(&tsk->thread);
+	clear_dsp(&tsk->thread);
 }
 
 /* TODO: figure out how to unwind the kernel stack here to figure out
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index 74a96ccf7451..cbede4e88dee 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -103,9 +103,9 @@ void show_regs(struct pt_regs *regs)
 /*
  * free current thread data structures etc..
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	exit_fpu(current);
+	exit_fpu(tsk);
 }
 
 void flush_thread(void)
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 481d7a83efc6..bba4fa74b321 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -68,9 +68,10 @@ extern void kernel_thread_starter(void);
 /*
  * Free current thread data structures etc..
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	exit_thread_runtime_instr();
+	if (tsk == current)
+		exit_thread_runtime_instr();
 }
 
 void flush_thread(void)
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index e2062e643341..9d3e9916555d 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -288,7 +288,7 @@ void show_regs(struct pt_regs *regs)
 /*
  * Free current thread data structures etc..
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
 	/*
 	 * See arch/sparc/kernel/process.c for the precedent for doing
@@ -307,9 +307,8 @@ void exit_thread(void)
 	 * which it would get safely nulled.
 	 */
 #ifdef CONFIG_SH_FPU
-	if (last_task_used_math == current) {
+	if (last_task_used_math == tsk)
 		last_task_used_math = NULL;
-	}
 #endif
 }
 
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index c5113c7ce2fd..b7780a5bef11 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -184,21 +184,21 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 /*
  * Free current thread data structures etc..
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
 #ifndef CONFIG_SMP
-	if(last_task_used_math == current) {
+	if (last_task_used_math == tsk) {
 #else
-	if (test_thread_flag(TIF_USEDFPU)) {
+	if (test_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU)) {
 #endif
 		/* Keep process from leaving FPU in a bogon state. */
 		put_psr(get_psr() | PSR_EF);
-		fpsave(&current->thread.float_regs[0], &current->thread.fsr,
-		       &current->thread.fpqueue[0], &current->thread.fpqdepth);
+		fpsave(&tsk->thread.float_regs[0], &tsk->thread.fsr,
+		       &tsk->thread.fpqueue[0], &tsk->thread.fpqdepth);
 #ifndef CONFIG_SMP
 		last_task_used_math = NULL;
 #else
-		clear_thread_flag(TIF_USEDFPU);
+		clear_ti_thread_flag(task_thread_info(tsk), TIF_USEDFPU);
 #endif
 	}
 }
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index c16ef1af1843..fa14402b33f9 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -417,9 +417,9 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 }
 
 /* Free current thread data structures etc.. */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	struct thread_info *t = current_thread_info();
+	struct thread_info *t = task_thread_info(tsk);
 
 	if (t->utraps) {
 		if (t->utraps[0] < 2)
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index b5f30d376ce1..6b705ccc9cc1 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -541,7 +541,7 @@ void flush_thread(void)
 /*
  * Free current thread data structures etc..
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
 #ifdef CONFIG_HARDWALL
 	/*
@@ -550,7 +550,7 @@ void exit_thread(void)
 	 * the last reference to a hardwall fd, it would already have
 	 * been released and deactivated at this point.)
 	 */
-	hardwall_deactivate_all(current);
+	hardwall_deactivate_all(tsk);
 #endif
 }
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 2915d54e9dd5..96becbbb52e0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -97,10 +97,9 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 /*
  * Free current thread data structures etc..
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
-	struct task_struct *me = current;
-	struct thread_struct *t = &me->thread;
+	struct thread_struct *t = &tsk->thread;
 	unsigned long *bp = t->io_bitmap_ptr;
 	struct fpu *fpu = &t->fpu;
 
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 5bbfed81c97b..e0ded48561db 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -115,10 +115,10 @@ void arch_cpu_idle(void)
 /*
  * This is called when the thread calls exit().
  */
-void exit_thread(void)
+void exit_thread(struct task_struct *tsk)
 {
 #if XTENSA_HAVE_COPROCESSORS
-	coprocessor_release_all(current_thread_info());
+	coprocessor_release_all(task_thread_info(tsk));
 #endif
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 167c0d4bf3fa..02bdab4d6db7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2771,9 +2771,9 @@ static inline int copy_thread_tls(
 extern void flush_thread(void);
 
 #ifdef CONFIG_HAVE_EXIT_THREAD
-extern void exit_thread(void);
+extern void exit_thread(struct task_struct *tsk);
 #else
-static inline void exit_thread(void)
+static inline void exit_thread(struct task_struct *tsk)
 {
 }
 #endif
diff --git a/kernel/exit.c b/kernel/exit.c
index fd90195667e1..75b34fe835b2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -746,7 +746,7 @@ void do_exit(long code)
 		disassociate_ctty(1);
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
-	exit_thread();
+	exit_thread(tsk);
 
 	/*
 	 * Flush inherited counters to the parent - before the parent
-- 
cgit v1.2.3


From 0740aa5f6375681c57488c4ea55d05a0341cfc9c Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 20 May 2016 17:00:25 -0700
Subject: fork: free thread in copy_process on failure

When using this program (as root):

	#include <err.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	#include <sys/io.h>
	#include <sys/types.h>
	#include <sys/wait.h>

	#define ITER 1000
	#define FORKERS 15
	#define THREADS (6000/FORKERS) // 1850 is proc max

	static void fork_100_wait()
	{
		unsigned a, to_wait = 0;

		printf("\t%d forking %d\n", THREADS, getpid());

		for (a = 0; a < THREADS; a++) {
			switch (fork()) {
			case 0:
				usleep(1000);
				exit(0);
				break;
			case -1:
				break;
			default:
				to_wait++;
				break;
			}
		}

		printf("\t%d forked from %d, waiting for %d\n", THREADS, getpid(),
				to_wait);

		for (a = 0; a < to_wait; a++)
			wait(NULL);

		printf("\t%d waited from %d\n", THREADS, getpid());
	}

	static void run_forkers()
	{
		pid_t forkers[FORKERS];
		unsigned a;

		for (a = 0; a < FORKERS; a++) {
			switch ((forkers[a] = fork())) {
			case 0:
				fork_100_wait();
				exit(0);
				break;
			case -1:
				err(1, "DIE fork of %d'th forker", a);
				break;
			default:
				break;
			}
		}

		for (a = 0; a < FORKERS; a++)
			waitpid(forkers[a], NULL, 0);
	}

	int main()
	{
		unsigned a;
		int ret;

		ret = ioperm(10, 20, 0);
		if (ret < 0)
			err(1, "ioperm");

		for (a = 0; a < ITER; a++)
			run_forkers();

		return 0;
	}

kmemleak reports many occurences of this leak:
unreferenced object 0xffff8805917c8000 (size 8192):
  comm "fork-leak", pid 2932, jiffies 4295354292 (age 1871.028s)
  hex dump (first 32 bytes):
    ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff  ................
    ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff  ................
  backtrace:
    [<ffffffff814cfbf5>] kmemdup+0x25/0x50
    [<ffffffff8103ab43>] copy_thread_tls+0x6c3/0x9a0
    [<ffffffff81150174>] copy_process+0x1a84/0x5790
    [<ffffffff811dc375>] wake_up_new_task+0x2d5/0x6f0
    [<ffffffff8115411d>] _do_fork+0x12d/0x820
...

Due to the leakage of the memory items which should have been freed in
arch/x86/kernel/process.c:exit_thread().

Make sure the memory is freed when fork fails later in copy_process.
This is done by calling exit_thread with the thread to kill.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chen Liqin <liqin.linux@gmail.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: David Howells <dhowells@redhat.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guan Xuetao <gxt@mprc.pku.edu.cn>
Cc: Haavard Skinnemoen <hskinnemoen@gmail.com>
Cc: Hans-Christian Egtvedt <egtvedt@samfundet.no>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Jesper Nilsson <jesper.nilsson@axis.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Jonas Bonn <jonas@southpole.se>
Cc: Koichi Yasutake <yasutake.koichi@jp.panasonic.com>
Cc: Lennox Wu <lennox.wu@gmail.com>
Cc: Ley Foon Tan <lftan@altera.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Mikael Starvik <starvik@axis.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Richard Kuo <rkuo@codeaurora.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Steven Miao <realmz6@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Vineet Gupta <vgupta@synopsys.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8fbed7194af1..103d78fd8f75 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1490,7 +1490,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
 		if (IS_ERR(pid)) {
 			retval = PTR_ERR(pid);
-			goto bad_fork_cleanup_io;
+			goto bad_fork_cleanup_thread;
 		}
 	}
 
@@ -1652,6 +1652,8 @@ bad_fork_cancel_cgroup:
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
+bad_fork_cleanup_thread:
+	exit_thread(p);
 bad_fork_cleanup_io:
 	if (p->io_context)
 		exit_io_context(p);
-- 
cgit v1.2.3


From 42a0bb3f71383b457a7db362f1c69e7afb96732b Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 20 May 2016 17:00:33 -0700
Subject: printk/nmi: generic solution for safe printk in NMI

printk() takes some locks and could not be used a safe way in NMI
context.

The chance of a deadlock is real especially when printing stacks from
all CPUs.  This particular problem has been addressed on x86 by the
commit a9edc8809328 ("x86/nmi: Perform a safe NMI stack trace on all
CPUs").

The patchset brings two big advantages.  First, it makes the NMI
backtraces safe on all architectures for free.  Second, it makes all NMI
messages almost safe on all architectures (the temporary buffer is
limited.  We still should keep the number of messages in NMI context at
minimum).

Note that there already are several messages printed in NMI context:
WARN_ON(in_nmi()), BUG_ON(in_nmi()), anything being printed out from MCE
handlers.  These are not easy to avoid.

This patch reuses most of the code and makes it generic.  It is useful
for all messages and architectures that support NMI.

The alternative printk_func is set when entering and is reseted when
leaving NMI context.  It queues IRQ work to copy the messages into the
main ring buffer in a safe context.

__printk_nmi_flush() copies all available messages and reset the buffer.
Then we could use a simple cmpxchg operations to get synchronized with
writers.  There is also used a spinlock to get synchronized with other
flushers.

We do not longer use seq_buf because it depends on external lock.  It
would be hard to make all supported operations safe for a lockless use.
It would be confusing and error prone to make only some operations safe.

The code is put into separate printk/nmi.c as suggested by Steven
Rostedt.  It needs a per-CPU buffer and is compiled only on
architectures that call nmi_enter().  This is achieved by the new
HAVE_NMI Kconfig flag.

The are MN10300 and Xtensa architectures.  We need to clean up NMI
handling there first.  Let's do it separately.

The patch is heavily based on the draft from Peter Zijlstra, see

  https://lkml.org/lkml/2015/6/10/327

[arnd@arndb.de: printk-nmi: use %zu format string for size_t]
[akpm@linux-foundation.org: min_t->min - all types are size_t here]
Signed-off-by: Petr Mladek <pmladek@suse.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Jan Kara <jack@suse.cz>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>	[arm part]
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Jiri Kosina <jkosina@suse.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: David Miller <davem@davemloft.net>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig                  |   4 +
 arch/arm/Kconfig              |   1 +
 arch/arm/kernel/smp.c         |   2 +
 arch/avr32/Kconfig            |   1 +
 arch/blackfin/Kconfig         |   1 +
 arch/cris/Kconfig             |   1 +
 arch/mips/Kconfig             |   1 +
 arch/powerpc/Kconfig          |   1 +
 arch/s390/Kconfig             |   1 +
 arch/sh/Kconfig               |   1 +
 arch/sparc/Kconfig            |   1 +
 arch/tile/Kconfig             |   1 +
 arch/x86/Kconfig              |   1 +
 arch/x86/kernel/apic/hw_nmi.c |   1 -
 include/linux/hardirq.h       |   2 +
 include/linux/percpu.h        |   3 -
 include/linux/printk.h        |  12 ++-
 init/Kconfig                  |   5 +
 init/main.c                   |   1 +
 kernel/printk/Makefile        |   1 +
 kernel/printk/internal.h      |  44 +++++++++
 kernel/printk/nmi.c           | 219 ++++++++++++++++++++++++++++++++++++++++++
 kernel/printk/printk.c        |  19 +---
 lib/nmi_backtrace.c           |  89 +----------------
 24 files changed, 306 insertions(+), 107 deletions(-)
 create mode 100644 kernel/printk/internal.h
 create mode 100644 kernel/printk/nmi.c

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index 0f298f9123dc..8f84fd268dee 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -187,7 +187,11 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
 	bool
 
+config HAVE_NMI
+	bool
+
 config HAVE_NMI_WATCHDOG
+	depends on HAVE_NMI
 	bool
 #
 # An arch should select this if it provides all these things:
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 956d3575426c..90542db1220d 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -67,6 +67,7 @@ config ARM
 	select HAVE_KRETPROBES if (HAVE_KPROBES)
 	select HAVE_MEMBLOCK
 	select HAVE_MOD_ARCH_SPECIFIC
+	select HAVE_NMI
 	select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
 	select HAVE_OPTPROBES if !THUMB2_KERNEL
 	select HAVE_PERF_EVENTS
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index baee70267f29..df90bc59bfce 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -644,9 +644,11 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
 		break;
 
 	case IPI_CPU_BACKTRACE:
+		printk_nmi_enter();
 		irq_enter();
 		nmi_cpu_backtrace(regs);
 		irq_exit();
+		printk_nmi_exit();
 		break;
 
 	default:
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index e43519a2ca89..7e75d45e20cd 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -18,6 +18,7 @@ config AVR32
 	select GENERIC_CLOCKEVENTS
 	select HAVE_MOD_ARCH_SPECIFIC
 	select MODULES_USE_ELF_RELA
+	select HAVE_NMI
 	help
 	  AVR32 is a high-performance 32-bit RISC microprocessor core,
 	  designed for cost-sensitive embedded applications, with particular
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index a63c12259e77..28c63fea786d 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -40,6 +40,7 @@ config BLACKFIN
 	select HAVE_MOD_ARCH_SPECIFIC
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
+	select HAVE_NMI
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 5c0ca8ae9293..deba2662b9f3 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -70,6 +70,7 @@ config CRIS
 	select GENERIC_CLOCKEVENTS if ETRAX_ARCH_V32
 	select GENERIC_SCHED_CLOCK if ETRAX_ARCH_V32
 	select HAVE_DEBUG_BUGVERBOSE if ETRAX_ARCH_V32
+	select HAVE_NMI
 
 config HZ
 	int
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 5663f411c225..8040fb1845b4 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -48,6 +48,7 @@ config MIPS
 	select GENERIC_SCHED_CLOCK if !CAVIUM_OCTEON_SOC
 	select GENERIC_CMOS_UPDATE
 	select HAVE_MOD_ARCH_SPECIFIC
+	select HAVE_NMI
 	select VIRT_TO_BUS
 	select MODULES_USE_ELF_REL if MODULES
 	select MODULES_USE_ELF_RELA if MODULES && 64BIT
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f0403b58ae8b..01f7464d9fea 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -155,6 +155,7 @@ config PPC
 	select NO_BOOTMEM
 	select HAVE_GENERIC_RCU_GUP
 	select HAVE_PERF_EVENTS_NMI if PPC64
+	select HAVE_NMI if PERF_EVENTS
 	select EDAC_SUPPORT
 	select EDAC_ATOMIC_SCRUB
 	select ARCH_HAS_DMA_SET_COHERENT_MASK
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index e2c9aaaf64b2..1c3c43d9d1b5 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -166,6 +166,7 @@ config S390
 	select TTY
 	select VIRT_CPU_ACCOUNTING
 	select VIRT_TO_BUS
+	select HAVE_NMI
 
 
 config SCHED_OMIT_FRAME_POINTER
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index cb93af8f8017..f6254341c065 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
 	select OLD_SIGSUSPEND
 	select OLD_SIGACTION
 	select HAVE_ARCH_AUDITSYSCALL
+	select HAVE_NMI
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 27b3a0ad40a0..1012f7ffcdf5 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -79,6 +79,7 @@ config SPARC64
 	select NO_BOOTMEM
 	select HAVE_ARCH_AUDITSYSCALL
 	select ARCH_SUPPORTS_ATOMIC_RMW
+	select HAVE_NMI
 
 config ARCH_DEFCONFIG
 	string
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 174746225577..76989b878f3c 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -30,6 +30,7 @@ config TILE
 	select HAVE_DEBUG_STACKOVERFLOW
 	select ARCH_WANT_FRAME_POINTERS
 	select HAVE_CONTEXT_TRACKING
+	select HAVE_NMI if USE_PMC
 	select EDAC_SUPPORT
 	select GENERIC_STRNCPY_FROM_USER
 	select GENERIC_STRNLEN_USER
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8ff5b3be95d4..0a7b885964ba 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -131,6 +131,7 @@ config X86
 	select HAVE_MEMBLOCK
 	select HAVE_MEMBLOCK_NODE_MAP
 	select HAVE_MIXED_BREAKPOINTS_REGS
+	select HAVE_NMI
 	select HAVE_OPROFILE
 	select HAVE_OPTPROBES
 	select HAVE_PCSPKR_PLATFORM
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 045e424fb368..7788ce643bf4 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -18,7 +18,6 @@
 #include <linux/nmi.h>
 #include <linux/module.h>
 #include <linux/delay.h>
-#include <linux/seq_buf.h>
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 u64 hw_nmi_get_sample_period(int watchdog_thresh)
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index dfd59d6bc6f0..c683996110b1 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -61,6 +61,7 @@ extern void irq_exit(void);
 
 #define nmi_enter()						\
 	do {							\
+		printk_nmi_enter();				\
 		lockdep_off();					\
 		ftrace_nmi_enter();				\
 		BUG_ON(in_nmi());				\
@@ -77,6 +78,7 @@ extern void irq_exit(void);
 		preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);	\
 		ftrace_nmi_exit();				\
 		lockdep_on();					\
+		printk_nmi_exit();				\
 	} while (0)
 
 #endif /* LINUX_HARDIRQ_H */
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 4bc6dafb703e..56939d3f6e53 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -129,7 +129,4 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
 	(typeof(type) __percpu *)__alloc_percpu(sizeof(type),		\
 						__alignof__(type))
 
-/* To avoid include hell, as printk can not declare this, we declare it here */
-DECLARE_PER_CPU(printk_func_t, printk_func);
-
 #endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 9ccbdf2c1453..51dd6b824fe2 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -122,7 +122,17 @@ static inline __printf(1, 2) __cold
 void early_printk(const char *s, ...) { }
 #endif
 
-typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
+#ifdef CONFIG_PRINTK_NMI
+extern void printk_nmi_init(void);
+extern void printk_nmi_enter(void);
+extern void printk_nmi_exit(void);
+extern void printk_nmi_flush(void);
+#else
+static inline void printk_nmi_init(void) { }
+static inline void printk_nmi_enter(void) { }
+static inline void printk_nmi_exit(void) { }
+static inline void printk_nmi_flush(void) { }
+#endif /* PRINTK_NMI */
 
 #ifdef CONFIG_PRINTK
 asmlinkage __printf(5, 0)
diff --git a/init/Kconfig b/init/Kconfig
index 79a91a2c0444..bccc1d607be5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1454,6 +1454,11 @@ config PRINTK
 	  very difficult to diagnose system problems, saying N here is
 	  strongly discouraged.
 
+config PRINTK_NMI
+	def_bool y
+	depends on PRINTK
+	depends on HAVE_NMI
+
 config BUG
 	bool "BUG() support" if EXPERT
 	default y
diff --git a/init/main.c b/init/main.c
index 2075fafaad59..fa9b2bdde183 100644
--- a/init/main.c
+++ b/init/main.c
@@ -569,6 +569,7 @@ asmlinkage __visible void __init start_kernel(void)
 	timekeeping_init();
 	time_init();
 	sched_clock_postinit();
+	printk_nmi_init();
 	perf_event_init();
 	profile_init();
 	call_function_init();
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 85405bdcf2b3..abb0042a427b 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,2 +1,3 @@
 obj-y	= printk.o
+obj-$(CONFIG_PRINTK_NMI)		+= nmi.o
 obj-$(CONFIG_A11Y_BRAILLE_CONSOLE)	+= braille.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
new file mode 100644
index 000000000000..2de99faedfc1
--- /dev/null
+++ b/kernel/printk/internal.h
@@ -0,0 +1,44 @@
+/*
+ * internal.h - printk internal definitions
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/percpu.h>
+
+typedef __printf(1, 0) int (*printk_func_t)(const char *fmt, va_list args);
+
+int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
+
+#ifdef CONFIG_PRINTK_NMI
+
+/*
+ * printk() could not take logbuf_lock in NMI context. Instead,
+ * it temporary stores the strings into a per-CPU buffer.
+ * The alternative implementation is chosen transparently
+ * via per-CPU variable.
+ */
+DECLARE_PER_CPU(printk_func_t, printk_func);
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+	return this_cpu_read(printk_func)(fmt, args);
+}
+
+#else /* CONFIG_PRINTK_NMI */
+
+static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
+{
+	return vprintk_default(fmt, args);
+}
+
+#endif /* CONFIG_PRINTK_NMI */
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
new file mode 100644
index 000000000000..303cf0d15e57
--- /dev/null
+++ b/kernel/printk/nmi.c
@@ -0,0 +1,219 @@
+/*
+ * nmi.c - Safe printk in NMI context
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/preempt.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/cpumask.h>
+#include <linux/irq_work.h>
+#include <linux/printk.h>
+
+#include "internal.h"
+
+/*
+ * printk() could not take logbuf_lock in NMI context. Instead,
+ * it uses an alternative implementation that temporary stores
+ * the strings into a per-CPU buffer. The content of the buffer
+ * is later flushed into the main ring buffer via IRQ work.
+ *
+ * The alternative implementation is chosen transparently
+ * via @printk_func per-CPU variable.
+ *
+ * The implementation allows to flush the strings also from another CPU.
+ * There are situations when we want to make sure that all buffers
+ * were handled or when IRQs are blocked.
+ */
+DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
+static int printk_nmi_irq_ready;
+
+#define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work))
+
+struct nmi_seq_buf {
+	atomic_t		len;	/* length of written data */
+	struct irq_work		work;	/* IRQ work that flushes the buffer */
+	unsigned char		buffer[NMI_LOG_BUF_LEN];
+};
+static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
+
+/*
+ * Safe printk() for NMI context. It uses a per-CPU buffer to
+ * store the message. NMIs are not nested, so there is always only
+ * one writer running. But the buffer might get flushed from another
+ * CPU, so we need to be careful.
+ */
+static int vprintk_nmi(const char *fmt, va_list args)
+{
+	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
+	int add = 0;
+	size_t len;
+
+again:
+	len = atomic_read(&s->len);
+
+	if (len >= sizeof(s->buffer))
+		return 0;
+
+	/*
+	 * Make sure that all old data have been read before the buffer was
+	 * reseted. This is not needed when we just append data.
+	 */
+	if (!len)
+		smp_rmb();
+
+	add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
+
+	/*
+	 * Do it once again if the buffer has been flushed in the meantime.
+	 * Note that atomic_cmpxchg() is an implicit memory barrier that
+	 * makes sure that the data were written before updating s->len.
+	 */
+	if (atomic_cmpxchg(&s->len, len, len + add) != len)
+		goto again;
+
+	/* Get flushed in a more safe context. */
+	if (add && printk_nmi_irq_ready) {
+		/* Make sure that IRQ work is really initialized. */
+		smp_rmb();
+		irq_work_queue(&s->work);
+	}
+
+	return add;
+}
+
+/*
+ * printk one line from the temporary buffer from @start index until
+ * and including the @end index.
+ */
+static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end)
+{
+	const char *buf = s->buffer + start;
+
+	printk("%.*s", (end - start) + 1, buf);
+}
+
+/*
+ * Flush data from the associated per_CPU buffer. The function
+ * can be called either via IRQ work or independently.
+ */
+static void __printk_nmi_flush(struct irq_work *work)
+{
+	static raw_spinlock_t read_lock =
+		__RAW_SPIN_LOCK_INITIALIZER(read_lock);
+	struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
+	unsigned long flags;
+	size_t len, size;
+	int i, last_i;
+
+	/*
+	 * The lock has two functions. First, one reader has to flush all
+	 * available message to make the lockless synchronization with
+	 * writers easier. Second, we do not want to mix messages from
+	 * different CPUs. This is especially important when printing
+	 * a backtrace.
+	 */
+	raw_spin_lock_irqsave(&read_lock, flags);
+
+	i = 0;
+more:
+	len = atomic_read(&s->len);
+
+	/*
+	 * This is just a paranoid check that nobody has manipulated
+	 * the buffer an unexpected way. If we printed something then
+	 * @len must only increase.
+	 */
+	if (i && i >= len)
+		pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n",
+		       i, len);
+
+	if (!len)
+		goto out; /* Someone else has already flushed the buffer. */
+
+	/* Make sure that data has been written up to the @len */
+	smp_rmb();
+
+	size = min(len, sizeof(s->buffer));
+	last_i = i;
+
+	/* Print line by line. */
+	for (; i < size; i++) {
+		if (s->buffer[i] == '\n') {
+			print_nmi_seq_line(s, last_i, i);
+			last_i = i + 1;
+		}
+	}
+	/* Check if there was a partial line. */
+	if (last_i < size) {
+		print_nmi_seq_line(s, last_i, size - 1);
+		pr_cont("\n");
+	}
+
+	/*
+	 * Check that nothing has got added in the meantime and truncate
+	 * the buffer. Note that atomic_cmpxchg() is an implicit memory
+	 * barrier that makes sure that the data were copied before
+	 * updating s->len.
+	 */
+	if (atomic_cmpxchg(&s->len, len, 0) != len)
+		goto more;
+
+out:
+	raw_spin_unlock_irqrestore(&read_lock, flags);
+}
+
+/**
+ * printk_nmi_flush - flush all per-cpu nmi buffers.
+ *
+ * The buffers are flushed automatically via IRQ work. This function
+ * is useful only when someone wants to be sure that all buffers have
+ * been flushed at some point.
+ */
+void printk_nmi_flush(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		__printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
+}
+
+void __init printk_nmi_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct nmi_seq_buf *s = &per_cpu(nmi_print_seq, cpu);
+
+		init_irq_work(&s->work, __printk_nmi_flush);
+	}
+
+	/* Make sure that IRQ works are initialized before enabling. */
+	smp_wmb();
+	printk_nmi_irq_ready = 1;
+
+	/* Flush pending messages that did not have scheduled IRQ works. */
+	printk_nmi_flush();
+}
+
+void printk_nmi_enter(void)
+{
+	this_cpu_write(printk_func, vprintk_nmi);
+}
+
+void printk_nmi_exit(void)
+{
+	this_cpu_write(printk_func, vprintk_default);
+}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index bfbf284e4218..71eba0607034 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -55,6 +55,7 @@
 
 #include "console_cmdline.h"
 #include "braille.h"
+#include "internal.h"
 
 int console_printk[4] = {
 	CONSOLE_LOGLEVEL_DEFAULT,	/* console_loglevel */
@@ -1807,14 +1808,6 @@ int vprintk_default(const char *fmt, va_list args)
 }
 EXPORT_SYMBOL_GPL(vprintk_default);
 
-/*
- * This allows printk to be diverted to another function per cpu.
- * This is useful for calling printk functions from within NMI
- * without worrying about race conditions that can lock up the
- * box.
- */
-DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
-
 /**
  * printk - print a kernel message
  * @fmt: format string
@@ -1838,21 +1831,11 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
  */
 asmlinkage __visible int printk(const char *fmt, ...)
 {
-	printk_func_t vprintk_func;
 	va_list args;
 	int r;
 
 	va_start(args, fmt);
-
-	/*
-	 * If a caller overrides the per_cpu printk_func, then it needs
-	 * to disable preemption when calling printk(). Otherwise
-	 * the printk_func should be set to the default. No need to
-	 * disable preemption here.
-	 */
-	vprintk_func = this_cpu_read(printk_func);
 	r = vprintk_func(fmt, args);
-
 	va_end(args);
 
 	return r;
diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c
index 6019c53c669e..26caf51cc238 100644
--- a/lib/nmi_backtrace.c
+++ b/lib/nmi_backtrace.c
@@ -16,33 +16,14 @@
 #include <linux/delay.h>
 #include <linux/kprobes.h>
 #include <linux/nmi.h>
-#include <linux/seq_buf.h>
 
 #ifdef arch_trigger_all_cpu_backtrace
 /* For reliability, we're prepared to waste bits here. */
 static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-static cpumask_t printtrace_mask;
-
-#define NMI_BUF_SIZE		4096
-
-struct nmi_seq_buf {
-	unsigned char		buffer[NMI_BUF_SIZE];
-	struct seq_buf		seq;
-};
-
-/* Safe printing in NMI context */
-static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq);
 
 /* "in progress" flag of arch_trigger_all_cpu_backtrace */
 static unsigned long backtrace_flag;
 
-static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
-{
-	const char *buf = s->buffer + start;
-
-	printk("%.*s", (end - start) + 1, buf);
-}
-
 /*
  * When raise() is called it will be is passed a pointer to the
  * backtrace_mask. Architectures that call nmi_cpu_backtrace()
@@ -52,8 +33,7 @@ static void print_seq_line(struct nmi_seq_buf *s, int start, int end)
 void nmi_trigger_all_cpu_backtrace(bool include_self,
 				   void (*raise)(cpumask_t *mask))
 {
-	struct nmi_seq_buf *s;
-	int i, cpu, this_cpu = get_cpu();
+	int i, this_cpu = get_cpu();
 
 	if (test_and_set_bit(0, &backtrace_flag)) {
 		/*
@@ -68,17 +48,6 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
 	if (!include_self)
 		cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask));
 
-	cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask));
-
-	/*
-	 * Set up per_cpu seq_buf buffers that the NMIs running on the other
-	 * CPUs will write to.
-	 */
-	for_each_cpu(cpu, to_cpumask(backtrace_mask)) {
-		s = &per_cpu(nmi_print_seq, cpu);
-		seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE);
-	}
-
 	if (!cpumask_empty(to_cpumask(backtrace_mask))) {
 		pr_info("Sending NMI to %s CPUs:\n",
 			(include_self ? "all" : "other"));
@@ -94,73 +63,25 @@ void nmi_trigger_all_cpu_backtrace(bool include_self,
 	}
 
 	/*
-	 * Now that all the NMIs have triggered, we can dump out their
-	 * back traces safely to the console.
+	 * Force flush any remote buffers that might be stuck in IRQ context
+	 * and therefore could not run their irq_work.
 	 */
-	for_each_cpu(cpu, &printtrace_mask) {
-		int len, last_i = 0;
+	printk_nmi_flush();
 
-		s = &per_cpu(nmi_print_seq, cpu);
-		len = seq_buf_used(&s->seq);
-		if (!len)
-			continue;
-
-		/* Print line by line. */
-		for (i = 0; i < len; i++) {
-			if (s->buffer[i] == '\n') {
-				print_seq_line(s, last_i, i);
-				last_i = i + 1;
-			}
-		}
-		/* Check if there was a partial line. */
-		if (last_i < len) {
-			print_seq_line(s, last_i, len - 1);
-			pr_cont("\n");
-		}
-	}
-
-	clear_bit(0, &backtrace_flag);
-	smp_mb__after_atomic();
+	clear_bit_unlock(0, &backtrace_flag);
 	put_cpu();
 }
 
-/*
- * It is not safe to call printk() directly from NMI handlers.
- * It may be fine if the NMI detected a lock up and we have no choice
- * but to do so, but doing a NMI on all other CPUs to get a back trace
- * can be done with a sysrq-l. We don't want that to lock up, which
- * can happen if the NMI interrupts a printk in progress.
- *
- * Instead, we redirect the vprintk() to this nmi_vprintk() that writes
- * the content into a per cpu seq_buf buffer. Then when the NMIs are
- * all done, we can safely dump the contents of the seq_buf to a printk()
- * from a non NMI context.
- */
-static int nmi_vprintk(const char *fmt, va_list args)
-{
-	struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
-	unsigned int len = seq_buf_used(&s->seq);
-
-	seq_buf_vprintf(&s->seq, fmt, args);
-	return seq_buf_used(&s->seq) - len;
-}
-
 bool nmi_cpu_backtrace(struct pt_regs *regs)
 {
 	int cpu = smp_processor_id();
 
 	if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
-		printk_func_t printk_func_save = this_cpu_read(printk_func);
-
-		/* Replace printk to write into the NMI seq */
-		this_cpu_write(printk_func, nmi_vprintk);
 		pr_warn("NMI backtrace for cpu %d\n", cpu);
 		if (regs)
 			show_regs(regs);
 		else
 			dump_stack();
-		this_cpu_write(printk_func, printk_func_save);
-
 		cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
 		return true;
 	}
-- 
cgit v1.2.3


From b522deabc6f18e4f938d93a84f345f2cbf3347d1 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 20 May 2016 17:00:36 -0700
Subject: printk/nmi: warn when some message has been lost in NMI context

We could not resize the temporary buffer in NMI context.  Let's warn if
a message is lost.

This is rather theoretical.  printk() should not be used in NMI.  The
only sensible use is when we want to print backtrace from all CPUs.  The
current buffer should be enough for this purpose.

[akpm@linux-foundation.org: whitespace fixlet]
Signed-off-by: Petr Mladek <pmladek@suse.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Jiri Kosina <jkosina@suse.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: David Miller <davem@davemloft.net>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk/internal.h | 11 +++++++++++
 kernel/printk/nmi.c      |  5 ++++-
 kernel/printk/printk.c   | 10 ++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 2de99faedfc1..341bedccc065 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -34,6 +34,12 @@ static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 	return this_cpu_read(printk_func)(fmt, args);
 }
 
+extern atomic_t nmi_message_lost;
+static inline int get_nmi_message_lost(void)
+{
+	return atomic_xchg(&nmi_message_lost, 0);
+}
+
 #else /* CONFIG_PRINTK_NMI */
 
 static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
@@ -41,4 +47,9 @@ static inline __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
 	return vprintk_default(fmt, args);
 }
 
+static inline int get_nmi_message_lost(void)
+{
+	return 0;
+}
+
 #endif /* CONFIG_PRINTK_NMI */
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index 303cf0d15e57..572f94922230 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -39,6 +39,7 @@
  */
 DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
 static int printk_nmi_irq_ready;
+atomic_t nmi_message_lost;
 
 #define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work))
 
@@ -64,8 +65,10 @@ static int vprintk_nmi(const char *fmt, va_list args)
 again:
 	len = atomic_read(&s->len);
 
-	if (len >= sizeof(s->buffer))
+	if (len >= sizeof(s->buffer)) {
+		atomic_inc(&nmi_message_lost);
 		return 0;
+	}
 
 	/*
 	 * Make sure that all old data have been read before the buffer was
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 71eba0607034..e38579d730f4 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1617,6 +1617,7 @@ asmlinkage int vprintk_emit(int facility, int level,
 	unsigned long flags;
 	int this_cpu;
 	int printed_len = 0;
+	int nmi_message_lost;
 	bool in_sched = false;
 	/* cpu currently holding logbuf_lock in this function */
 	static unsigned int logbuf_cpu = UINT_MAX;
@@ -1667,6 +1668,15 @@ asmlinkage int vprintk_emit(int facility, int level,
 					 strlen(recursion_msg));
 	}
 
+	nmi_message_lost = get_nmi_message_lost();
+	if (unlikely(nmi_message_lost)) {
+		text_len = scnprintf(textbuf, sizeof(textbuf),
+				     "BAD LUCK: lost %d message(s) from NMI context!",
+				     nmi_message_lost);
+		printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
+					 NULL, 0, textbuf, text_len);
+	}
+
 	/*
 	 * The printf needs to come first; we need the syslog
 	 * prefix which might be passed-in as a parameter.
-- 
cgit v1.2.3


From 427934b8714ec130b068d1c9d88f25b24aaede32 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 20 May 2016 17:00:39 -0700
Subject: printk/nmi: increase the size of NMI buffer and make it configurable

Testing has shown that the backtrace sometimes does not fit into the 4kB
temporary buffer that is used in NMI context.  The warnings are gone
when I double the temporary buffer size.

This patch doubles the buffer size and makes it configurable.

Note that this problem existed even in the x86-specific implementation
that was added by the commit a9edc8809328 ("x86/nmi: Perform a safe NMI
stack trace on all CPUs").  Nobody noticed it because it did not print
any warnings.

Signed-off-by: Petr Mladek <pmladek@suse.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Jiri Kosina <jkosina@suse.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: David Miller <davem@davemloft.net>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/Kconfig        | 22 ++++++++++++++++++++++
 kernel/printk/nmi.c |  3 ++-
 2 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/init/Kconfig b/init/Kconfig
index bccc1d607be5..a9c4aefd5436 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -862,6 +862,28 @@ config LOG_CPU_MAX_BUF_SHIFT
 		     13 =>   8 KB for each CPU
 		     12 =>   4 KB for each CPU
 
+config NMI_LOG_BUF_SHIFT
+	int "Temporary per-CPU NMI log buffer size (12 => 4KB, 13 => 8KB)"
+	range 10 21
+	default 13
+	depends on PRINTK_NMI
+	help
+	  Select the size of a per-CPU buffer where NMI messages are temporary
+	  stored. They are copied to the main log buffer in a safe context
+	  to avoid a deadlock. The value defines the size as a power of 2.
+
+	  NMI messages are rare and limited. The largest one is when
+	  a backtrace is printed. It usually fits into 4KB. Select
+	  8KB if you want to be on the safe side.
+
+	  Examples:
+		     17 => 128 KB for each CPU
+		     16 =>  64 KB for each CPU
+		     15 =>  32 KB for each CPU
+		     14 =>  16 KB for each CPU
+		     13 =>   8 KB for each CPU
+		     12 =>   4 KB for each CPU
+
 #
 # Architectures with an unreliable sched_clock() should select this:
 #
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index 572f94922230..bf08557d7e3d 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -41,7 +41,8 @@ DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
 static int printk_nmi_irq_ready;
 atomic_t nmi_message_lost;
 
-#define NMI_LOG_BUF_LEN (4096 - sizeof(atomic_t) - sizeof(struct irq_work))
+#define NMI_LOG_BUF_LEN ((1 << CONFIG_NMI_LOG_BUF_SHIFT) -		\
+			 sizeof(atomic_t) - sizeof(struct irq_work))
 
 struct nmi_seq_buf {
 	atomic_t		len;	/* length of written data */
-- 
cgit v1.2.3


From cf9b1106c81c45cde02208fca49d3f3e4ab6ee74 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 20 May 2016 17:00:42 -0700
Subject: printk/nmi: flush NMI messages on the system panic

In NMI context, printk() messages are stored into per-CPU buffers to
avoid a possible deadlock.  They are normally flushed to the main ring
buffer via an IRQ work.  But the work is never called when the system
calls panic() in the very same NMI handler.

This patch tries to flush NMI buffers before the crash dump is
generated.  In this case it does not risk a double release and bails out
when the logbuf_lock is already taken.  The aim is to get the messages
into the main ring buffer when possible.  It makes them better
accessible in the vmcore.

Then the patch tries to flush the buffers second time when other CPUs
are down.  It might be more aggressive and reset logbuf_lock.  The aim
is to get the messages available for the consequent kmsg_dump() and
console_flush_on_panic() calls.

The patch causes vprintk_emit() to be called even in NMI context again.
But it is done via printk_deferred() so that the console handling is
skipped.  Consoles use internal locks and we could not prevent a
deadlock easily.  They are explicitly called later when the crash dump
is not generated, see console_flush_on_panic().

Signed-off-by: Petr Mladek <pmladek@suse.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: David Miller <davem@davemloft.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Kosina <jkosina@suse.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/printk.h   |  2 ++
 kernel/kexec_core.c      |  1 +
 kernel/panic.c           |  6 +++++-
 kernel/printk/internal.h |  2 ++
 kernel/printk/nmi.c      | 39 ++++++++++++++++++++++++++++++++++++++-
 kernel/printk/printk.c   |  2 +-
 6 files changed, 49 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 51dd6b824fe2..f4da695fd615 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -127,11 +127,13 @@ extern void printk_nmi_init(void);
 extern void printk_nmi_enter(void);
 extern void printk_nmi_exit(void);
 extern void printk_nmi_flush(void);
+extern void printk_nmi_flush_on_panic(void);
 #else
 static inline void printk_nmi_init(void) { }
 static inline void printk_nmi_enter(void) { }
 static inline void printk_nmi_exit(void) { }
 static inline void printk_nmi_flush(void) { }
+static inline void printk_nmi_flush_on_panic(void) { }
 #endif /* PRINTK_NMI */
 
 #ifdef CONFIG_PRINTK
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 1c03dfb4abfd..d5d408252992 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -893,6 +893,7 @@ void crash_kexec(struct pt_regs *regs)
 	old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
 	if (old_cpu == PANIC_CPU_INVALID) {
 		/* This is the 1st CPU which comes here, so go ahead. */
+		printk_nmi_flush_on_panic();
 		__crash_kexec(regs);
 
 		/*
diff --git a/kernel/panic.c b/kernel/panic.c
index 535c96510a44..8aa74497cc5a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -160,8 +160,10 @@ void panic(const char *fmt, ...)
 	 *
 	 * Bypass the panic_cpu check and call __crash_kexec directly.
 	 */
-	if (!crash_kexec_post_notifiers)
+	if (!crash_kexec_post_notifiers) {
+		printk_nmi_flush_on_panic();
 		__crash_kexec(NULL);
+	}
 
 	/*
 	 * Note smp_send_stop is the usual smp shutdown function, which
@@ -176,6 +178,8 @@ void panic(const char *fmt, ...)
 	 */
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
+	/* Call flush even twice. It tries harder with a single online CPU */
+	printk_nmi_flush_on_panic();
 	kmsg_dump(KMSG_DUMP_PANIC);
 
 	/*
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 341bedccc065..7fd2838fa417 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -22,6 +22,8 @@ int __printf(1, 0) vprintk_default(const char *fmt, va_list args);
 
 #ifdef CONFIG_PRINTK_NMI
 
+extern raw_spinlock_t logbuf_lock;
+
 /*
  * printk() could not take logbuf_lock in NMI context. Instead,
  * it temporary stores the strings into a per-CPU buffer.
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index bf08557d7e3d..b69eb8a2876f 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -17,6 +17,7 @@
 
 #include <linux/preempt.h>
 #include <linux/spinlock.h>
+#include <linux/debug_locks.h>
 #include <linux/smp.h>
 #include <linux/cpumask.h>
 #include <linux/irq_work.h>
@@ -106,7 +107,16 @@ static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end)
 {
 	const char *buf = s->buffer + start;
 
-	printk("%.*s", (end - start) + 1, buf);
+	/*
+	 * The buffers are flushed in NMI only on panic.  The messages must
+	 * go only into the ring buffer at this stage.  Consoles will get
+	 * explicitly called later when a crashdump is not generated.
+	 */
+	if (in_nmi())
+		printk_deferred("%.*s", (end - start) + 1, buf);
+	else
+		printk("%.*s", (end - start) + 1, buf);
+
 }
 
 /*
@@ -194,6 +204,33 @@ void printk_nmi_flush(void)
 		__printk_nmi_flush(&per_cpu(nmi_print_seq, cpu).work);
 }
 
+/**
+ * printk_nmi_flush_on_panic - flush all per-cpu nmi buffers when the system
+ *	goes down.
+ *
+ * Similar to printk_nmi_flush() but it can be called even in NMI context when
+ * the system goes down. It does the best effort to get NMI messages into
+ * the main ring buffer.
+ *
+ * Note that it could try harder when there is only one CPU online.
+ */
+void printk_nmi_flush_on_panic(void)
+{
+	/*
+	 * Make sure that we could access the main ring buffer.
+	 * Do not risk a double release when more CPUs are up.
+	 */
+	if (in_nmi() && raw_spin_is_locked(&logbuf_lock)) {
+		if (num_online_cpus() > 1)
+			return;
+
+		debug_locks_off();
+		raw_spin_lock_init(&logbuf_lock);
+	}
+
+	printk_nmi_flush();
+}
+
 void __init printk_nmi_init(void)
 {
 	int cpu;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index e38579d730f4..60cdf6386763 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -245,7 +245,7 @@ __packed __aligned(4)
  * within the scheduler's rq lock. It must be released before calling
  * console_unlock() or anything else that might wake up a process.
  */
-static DEFINE_RAW_SPINLOCK(logbuf_lock);
+DEFINE_RAW_SPINLOCK(logbuf_lock);
 
 #ifdef CONFIG_PRINTK
 DECLARE_WAIT_QUEUE_HEAD(log_wait);
-- 
cgit v1.2.3


From ede9c27749b9b35efdffa4f63a39f819d7913752 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 20 May 2016 17:01:10 -0700
Subject: kernel/sysctl_binary.c: use generic UUID library

UUID library provides uuid_be type and uuid_be_to_bin() function.  This
substitutes open coded variant by generic library calls.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Dmitry Kasatkin <dmitry.kasatkin@gmail.com>
Cc: Mimi Zohar <zohar@linux.vnet.ibm.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl_binary.c | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 10a1d7dc9313..6eb99c17dbd8 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,7 @@
 #include <linux/ctype.h>
 #include <linux/netdevice.h>
 #include <linux/kernel.h>
+#include <linux/uuid.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
 
@@ -1117,9 +1118,8 @@ static ssize_t bin_uuid(struct file *file,
 
 	/* Only supports reads */
 	if (oldval && oldlen) {
-		char buf[40], *str = buf;
-		unsigned char uuid[16];
-		int i;
+		char buf[UUID_STRING_LEN + 1];
+		uuid_be uuid;
 
 		result = kernel_read(file, 0, buf, sizeof(buf) - 1);
 		if (result < 0)
@@ -1127,24 +1127,15 @@ static ssize_t bin_uuid(struct file *file,
 
 		buf[result] = '\0';
 
-		/* Convert the uuid to from a string to binary */
-		for (i = 0; i < 16; i++) {
-			result = -EIO;
-			if (!isxdigit(str[0]) || !isxdigit(str[1]))
-				goto out;
-
-			uuid[i] = (hex_to_bin(str[0]) << 4) |
-					hex_to_bin(str[1]);
-			str += 2;
-			if (*str == '-')
-				str++;
-		}
+		result = -EIO;
+		if (uuid_be_to_bin(buf, &uuid))
+			goto out;
 
 		if (oldlen > 16)
 			oldlen = 16;
 
 		result = -EFAULT;
-		if (copy_to_user(oldval, uuid, oldlen))
+		if (copy_to_user(oldval, &uuid, oldlen))
 			goto out;
 
 		copied = oldlen;
-- 
cgit v1.2.3


From e9256efcc8e390fa4fcf796a0c0b47d642d77d32 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Fri, 20 May 2016 17:01:33 -0700
Subject: radix-tree: introduce radix_tree_empty

Commit e61452365372 ("radix_tree: add support for multi-order entries")
left the impression that the support for multiorder radix tree entries
was functional.  As soon as Ross tried to use it, it became apparent
that my testing was completely inadequate, and it didn't even work a
little bit for orders that were not a multiple of shift.

This series of patches is the result of about 6 weeks of redesign,
reimplementation, testing, arguing and hair-pulling.  The great news is
that the test-suite is now far better than it was.  That's reflected in
the diffstat for the test-suite alone:

 12 files changed, 436 insertions(+), 28 deletions(-)

The highlight for users of the tree is that the restriction on the order
of inserted entries being >= RADIX_TREE_MAP_SHIFT is now gone; the radix
tree now supports any order between 0 and 64.

For those who are interested in how the tree works, patch 9 is probably
the most interesting one as it introduces the new machinery for handling
sibling entries.

I've tried to be fair in attributing authorship to the person who
contributed the majority of the code in each patch; Ross has been an
invaluable partner in the development of this support and it's fair to
say that each of us has code in every commit.

I should also express my appreciation of the 0day testing.  It prompted
me that I was bloating the tinyconfig in an unacceptable way, and it
bisected to a commit which contained a rather nasty memory-corruption
bug.

This patch (of 29):

The irqdomain code was checking for 0 or 1 entries, not 0 entries like
the comment said they were.  Introduce a new helper that will actually
check for an empty tree.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h | 5 +++++
 kernel/irq/irqdomain.c     | 7 +------
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 51a97ac8bfbf..83f708e5db59 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -136,6 +136,11 @@ do {									\
 	(root)->rnode = NULL;						\
 } while (0)
 
+static inline bool radix_tree_empty(struct radix_tree_root *root)
+{
+	return root->rnode == NULL;
+}
+
 /**
  * Radix-tree synchronization
  *
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d65f6f31a5b3..8798b6c9e945 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -139,12 +139,7 @@ void irq_domain_remove(struct irq_domain *domain)
 {
 	mutex_lock(&irq_domain_mutex);
 
-	/*
-	 * radix_tree_delete() takes care of destroying the root
-	 * node when all entries are removed. Shout if there are
-	 * any mappings left.
-	 */
-	WARN_ON(domain->revmap_tree.height);
+	WARN_ON(!radix_tree_empty(&domain->revmap_tree));
 
 	list_del(&domain->link);
 
-- 
cgit v1.2.3


From bd28b14591b98f696bc9f94c5ba2e598ca487dfd Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 22 May 2016 17:21:27 -0700
Subject: x86: remove more uaccess_32.h complexity

I'm looking at trying to possibly merge the 32-bit and 64-bit versions
of the x86 uaccess.h implementation, but first this needs to be cleaned
up.

For example, the 32-bit version of "__copy_from_user_inatomic()" is
mostly the special cases for the constant size, and it's actually almost
never relevant.  Most users aren't actually using a constant size
anyway, and the few cases that do small constant copies are better off
just using __get_user() instead.

So get rid of the unnecessary complexity.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/uaccess_32.h | 26 --------------------------
 kernel/events/uprobes.c           |  3 +--
 kernel/futex.c                    |  2 +-
 mm/maccess.c                      |  3 +--
 4 files changed, 3 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 537cc883ea29..4b32da24faaf 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -65,32 +65,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
 static __always_inline unsigned long
 __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
 {
-	/* Avoid zeroing the tail if the copy fails..
-	 * If 'n' is constant and 1, 2, or 4, we do still zero on a failure,
-	 * but as the zeroing behaviour is only significant when n is not
-	 * constant, that shouldn't be a problem.
-	 */
-	if (__builtin_constant_p(n)) {
-		unsigned long ret;
-
-		switch (n) {
-		case 1:
-			__uaccess_begin();
-			__get_user_size(*(u8 *)to, from, 1, ret, 1);
-			__uaccess_end();
-			return ret;
-		case 2:
-			__uaccess_begin();
-			__get_user_size(*(u16 *)to, from, 2, ret, 2);
-			__uaccess_end();
-			return ret;
-		case 4:
-			__uaccess_begin();
-			__get_user_size(*(u32 *)to, from, 4, ret, 4);
-			__uaccess_end();
-			return ret;
-		}
-	}
 	return __copy_from_user_ll_nozero(to, from, n);
 }
 
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7edc95edfaee..c01f733ff2e1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1694,8 +1694,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	int result;
 
 	pagefault_disable();
-	result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
-							sizeof(opcode));
+	result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
 	pagefault_enable();
 
 	if (likely(result == 0))
diff --git a/kernel/futex.c b/kernel/futex.c
index c20f06f38ef3..ee25f5ba4aca 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -729,7 +729,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
 	int ret;
 
 	pagefault_disable();
-	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
+	ret = __get_user(*dest, from);
 	pagefault_enable();
 
 	return ret ? -EFAULT : 0;
diff --git a/mm/maccess.c b/mm/maccess.c
index d159b1c96e48..78f9274dd49d 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -96,8 +96,7 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
 	pagefault_disable();
 
 	do {
-		ret = __copy_from_user_inatomic(dst++,
-						(const void __user __force *)src++, 1);
+		ret = __get_user(*dst++, (const char __user __force *)src++);
 	} while (dst[-1] && ret == 0 && src - unsafe_addr < count);
 
 	dst[-1] = '\0';
-- 
cgit v1.2.3


From 612bacad78ba6d0a91166fc4487af114bac172a8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 22 May 2016 23:16:18 +0200
Subject: bpf, inode: disallow userns mounts

Follow-up to commit e27f4a942a0e ("bpf: Use mount_nodev not mount_ns
to mount the bpf filesystem"), which removes the FS_USERNS_MOUNT flag.

The original idea was to have a per mountns instance instead of a
single global fs instance, but that didn't work out and we had to
switch to mount_nodev() model. The intent of that middle ground was
that we avoid users who don't play nice to create endless instances
of bpf fs which are difficult to control and discover from an admin
point of view, but at the same time it would have allowed us to be
more flexible with regard to namespaces.

Therefore, since we now did the switch to mount_nodev() as a fix
where individual instances are created, we also need to remove userns
mount flag along with it to avoid running into mentioned situation.
I don't expect any breakage at this early point in time with removing
the flag and we can revisit this later should the requirement for
this come up with future users. This and commit e27f4a942a0e have
been split to facilitate tracking should any of them run into the
unlikely case of causing a regression.

Fixes: b2197755b263 ("bpf: add support for persistent maps/progs")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/inode.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 04be7021f848..318858edb1cd 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -365,7 +365,6 @@ static struct file_system_type bpf_fs_type = {
 	.name		= "bpf",
 	.mount		= bpf_mount,
 	.kill_sb	= kill_litter_super,
-	.fs_flags	= FS_USERNS_MOUNT,
 };
 
 MODULE_ALIAS_FS("bpf");
-- 
cgit v1.2.3


From f43edca7ed08fc02279f2a62015da5cb6aa0ad61 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Mon, 23 May 2016 16:22:26 -0700
Subject: ELF/MIPS build fix

CONFIG_MIPS32_N32=y but CONFIG_BINFMT_ELF disabled results in the
following linker errors:

  arch/mips/built-in.o: In function `elf_core_dump':
  binfmt_elfn32.c:(.text+0x23dbc): undefined reference to `elf_core_extra_phdrs'
  binfmt_elfn32.c:(.text+0x246e4): undefined reference to `elf_core_extra_data_size'
  binfmt_elfn32.c:(.text+0x248d0): undefined reference to `elf_core_write_extra_phdrs'
  binfmt_elfn32.c:(.text+0x24ac4): undefined reference to `elf_core_write_extra_data'

CONFIG_MIPS32_O32=y but CONFIG_BINFMT_ELF disabled results in the following
linker errors:

  arch/mips/built-in.o: In function `elf_core_dump':
  binfmt_elfo32.c:(.text+0x28a04): undefined reference to `elf_core_extra_phdrs'
  binfmt_elfo32.c:(.text+0x29330): undefined reference to `elf_core_extra_data_size'
  binfmt_elfo32.c:(.text+0x2951c): undefined reference to `elf_core_write_extra_phdrs'
  binfmt_elfo32.c:(.text+0x29710): undefined reference to `elf_core_write_extra_data'

This is because binfmt_elfn32 and binfmt_elfo32 are using symbols from
elfcore but for these configurations elfcore will not be built.

Fixed by making elfcore selectable by a separate config symbol which
unlike the current mechanism can also be used from other directories
than kernel/, then having each flavor of ELF that relies on elfcore.o,
select it in Kconfig, including CONFIG_MIPS32_N32 and CONFIG_MIPS32_O32
which fixes this issue.

Link: http://lkml.kernel.org/r/20160520141705.GA1913@linux-mips.org
Signed-off-by: Ralf Baechle <ralf@linux-mips.org>
Reviewed-by: James Hogan <james.hogan@imgtec.com>
Cc: "Maciej W. Rozycki" <macro@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/Kconfig | 1 +
 fs/Kconfig.binfmt | 8 ++++++++
 kernel/Makefile   | 4 +---
 3 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 8040fb1845b4..46938847e794 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -3117,6 +3117,7 @@ config MIPS32_N32
 config BINFMT_ELF32
 	bool
 	default y if MIPS32_O32 || MIPS32_N32
+	select ELFCORE
 
 endmenu
 
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 2d0cbbd14cfc..72c03354c14b 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -1,6 +1,7 @@
 config BINFMT_ELF
 	bool "Kernel support for ELF binaries"
 	depends on MMU && (BROKEN || !FRV)
+	select ELFCORE
 	default y
 	---help---
 	  ELF (Executable and Linkable Format) is a format for libraries and
@@ -26,6 +27,7 @@ config BINFMT_ELF
 config COMPAT_BINFMT_ELF
 	bool
 	depends on COMPAT && BINFMT_ELF
+	select ELFCORE
 
 config ARCH_BINFMT_ELF_STATE
 	bool
@@ -34,6 +36,7 @@ config BINFMT_ELF_FDPIC
 	bool "Kernel support for FDPIC ELF binaries"
 	default y
 	depends on (FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X)
+	select ELFCORE
 	help
 	  ELF FDPIC binaries are based on ELF, but allow the individual load
 	  segments of a binary to be located in memory independently of each
@@ -43,6 +46,11 @@ config BINFMT_ELF_FDPIC
 
 	  It is also possible to run FDPIC ELF binaries on MMU linux also.
 
+config ELFCORE
+	bool
+	help
+	  This option enables kernel/elfcore.o.
+
 config CORE_DUMP_DEFAULT_ELF_HEADERS
 	bool "Write ELF core dumps with partial segments"
 	default y
diff --git a/kernel/Makefile b/kernel/Makefile
index f0c40bf49d9f..e2ec54e2b952 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -91,9 +91,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
-obj-$(CONFIG_BINFMT_ELF) += elfcore.o
-obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
-obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
+obj-$(CONFIG_ELFCORE) += elfcore.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_TRACE_CLOCK) += trace/
-- 
cgit v1.2.3


From bf959931ddb88c4e4366e96dd22e68fa0db9527c Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 23 May 2016 16:23:50 -0700
Subject: wait/ptrace: assume __WALL if the child is traced

The following program (simplified version of generated by syzkaller)

	#include <pthread.h>
	#include <unistd.h>
	#include <sys/ptrace.h>
	#include <stdio.h>
	#include <signal.h>

	void *thread_func(void *arg)
	{
		ptrace(PTRACE_TRACEME, 0,0,0);
		return 0;
	}

	int main(void)
	{
		pthread_t thread;

		if (fork())
			return 0;

		while (getppid() != 1)
			;

		pthread_create(&thread, NULL, thread_func, NULL);
		pthread_join(thread, NULL);
		return 0;
	}

creates an unreapable zombie if /sbin/init doesn't use __WALL.

This is not a kernel bug, at least in a sense that everything works as
expected: debugger should reap a traced sub-thread before it can reap the
leader, but without __WALL/__WCLONE do_wait() ignores sub-threads.

Unfortunately, it seems that /sbin/init in most (all?) distributions
doesn't use it and we have to change the kernel to avoid the problem.
Note also that most init's use sys_waitid() which doesn't allow __WALL, so
the necessary user-space fix is not that trivial.

This patch just adds the "ptrace" check into eligible_child().  To some
degree this matches the "tsk->ptrace" in exit_notify(), ->exit_signal is
mostly ignored when the tracee reports to debugger.  Or WSTOPPED, the
tracer doesn't need to set this flag to wait for the stopped tracee.

This obviously means the user-visible change: __WCLONE and __WALL no
longer have any meaning for debugger.  And I can only hope that this won't
break something, but at least strace/gdb won't suffer.

We could make a more conservative change.  Say, we can take __WCLONE into
account, or !thread_group_leader().  But it would be nice to not
complicate these historical/confusing checks.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: "Michael Kerrisk (man-pages)" <mtk.manpages@gmail.com>
Cc: Pedro Alves <palves@redhat.com>
Cc: Roland McGrath <roland@hack.frob.com>
Cc: <syzkaller@googlegroups.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 75b34fe835b2..44fbe6edd7fe 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -918,17 +918,28 @@ static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
 		task_pid_type(p, wo->wo_type) == wo->wo_pid;
 }
 
-static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+static int
+eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
 {
 	if (!eligible_pid(wo, p))
 		return 0;
-	/* Wait for all children (clone and not) if __WALL is set;
-	 * otherwise, wait for clone children *only* if __WCLONE is
-	 * set; otherwise, wait for non-clone children *only*.  (Note:
-	 * A "clone" child here is one that reports to its parent
-	 * using a signal other than SIGCHLD.) */
-	if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
-	    && !(wo->wo_flags & __WALL))
+
+	/*
+	 * Wait for all children (clone and not) if __WALL is set or
+	 * if it is traced by us.
+	 */
+	if (ptrace || (wo->wo_flags & __WALL))
+		return 1;
+
+	/*
+	 * Otherwise, wait for clone children *only* if __WCLONE is set;
+	 * otherwise, wait for non-clone children *only*.
+	 *
+	 * Note: a "clone" child here is one that reports to its parent
+	 * using a signal other than SIGCHLD, or a non-leader thread which
+	 * we can only see if it is traced by us.
+	 */
+	if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
 		return 0;
 
 	return 1;
@@ -1300,7 +1311,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	if (unlikely(exit_state == EXIT_DEAD))
 		return 0;
 
-	ret = eligible_child(wo, p);
+	ret = eligible_child(wo, ptrace, p);
 	if (!ret)
 		return ret;
 
-- 
cgit v1.2.3


From 91c4e8ea8f05916df0c8a6f383508ac7c9e10dba Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 23 May 2016 16:23:53 -0700
Subject: wait: allow sys_waitid() to accept __WNOTHREAD/__WCLONE/__WALL

I see no reason why waitid() can't support other linux-specific flags
allowed in sys_wait4().

In particular this change can help if we reconsider the previous change
("wait/ptrace: assume __WALL if the child is traced") which adds the
"automagical" __WALL for debugger.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: "Michael Kerrisk (man-pages)" <mtk.manpages@gmail.com>
Cc: Pedro Alves <palves@redhat.com>
Cc: Roland McGrath <roland@hack.frob.com>
Cc: <syzkaller@googlegroups.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 44fbe6edd7fe..9e6e1356e6bb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1535,7 +1535,8 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 	enum pid_type type;
 	long ret;
 
-	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
+	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED|
+			__WNOTHREAD|__WCLONE|__WALL))
 		return -EINVAL;
 	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
 		return -EINVAL;
-- 
cgit v1.2.3


From 747800efbe8b98459f48d1d9d742298f8283f8fa Mon Sep 17 00:00:00 2001
From: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
Date: Mon, 23 May 2016 16:23:59 -0700
Subject: kernel/signal.c: convert printk(KERN_<LEVEL> ...) to pr_<level>(...)

Use pr_<level> instead of printk(KERN_<LEVEL> ).

Signed-off-by: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index ab122a2cee41..96e9bc40667f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -224,7 +224,7 @@ static inline void print_dropped_signal(int sig)
 	if (!__ratelimit(&ratelimit_state))
 		return;
 
-	printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
+	pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
 				current->comm, current->pid, sig);
 }
 
@@ -1089,10 +1089,10 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 static void print_fatal_signal(int signr)
 {
 	struct pt_regs *regs = signal_pt_regs();
-	printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr);
+	pr_info("potentially unexpected fatal signal %d.\n", signr);
 
 #if defined(__i386__) && !defined(__arch_um__)
-	printk(KERN_INFO "code at %08lx: ", regs->ip);
+	pr_info("code at %08lx: ", regs->ip);
 	{
 		int i;
 		for (i = 0; i < 16; i++) {
@@ -1100,10 +1100,10 @@ static void print_fatal_signal(int signr)
 
 			if (get_user(insn, (unsigned char *)(regs->ip + i)))
 				break;
-			printk(KERN_CONT "%02x ", insn);
+			pr_cont("%02x ", insn);
 		}
 	}
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 #endif
 	preempt_disable();
 	show_regs(regs);
-- 
cgit v1.2.3


From 725fc629ff2545b061407305ae51016c9f928fce Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Mon, 23 May 2016 16:24:05 -0700
Subject: kernek/fork.c: allocate idle task for a CPU always on its local node

Linux preallocates the task structs of the idle tasks for all possible
CPUs.  This currently means they all end up on node 0.  This also
implies that the cache line of MWAIT, which is around the flags field in
the task struct, are all located in node 0.

We see a noticeable performance improvement on Knights Landing CPUs when
the cache lines used for MWAIT are located in the local nodes of the
CPUs using them.  I would expect this to give a (likely slight)
improvement on other systems too.

The patch implements placing the idle task in the node of its CPUs, by
passing the right target node to copy_process()

[akpm@linux-foundation.org: use NUMA_NO_NODE, not a bare -1]
Link: http://lkml.kernel.org/r/1463492694-15833-1-git-send-email-andi@firstfloor.org
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 103d78fd8f75..e67d7b773348 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -340,13 +340,14 @@ void set_task_stack_end_magic(struct task_struct *tsk)
 	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 }
 
-static struct task_struct *dup_task_struct(struct task_struct *orig)
+static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
 	struct thread_info *ti;
-	int node = tsk_fork_get_node(orig);
 	int err;
 
+	if (node == NUMA_NO_NODE)
+		node = tsk_fork_get_node(orig);
 	tsk = alloc_task_struct_node(node);
 	if (!tsk)
 		return NULL;
@@ -1276,7 +1277,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 					int __user *child_tidptr,
 					struct pid *pid,
 					int trace,
-					unsigned long tls)
+					unsigned long tls,
+					int node)
 {
 	int retval;
 	struct task_struct *p;
@@ -1328,7 +1330,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto fork_out;
 
 	retval = -ENOMEM;
-	p = dup_task_struct(current);
+	p = dup_task_struct(current, node);
 	if (!p)
 		goto fork_out;
 
@@ -1706,7 +1708,8 @@ static inline void init_idle_pids(struct pid_link *links)
 struct task_struct *fork_idle(int cpu)
 {
 	struct task_struct *task;
-	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
+	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
+			    cpu_to_node(cpu));
 	if (!IS_ERR(task)) {
 		init_idle_pids(task->pids);
 		init_idle(task, cpu);
@@ -1751,7 +1754,7 @@ long _do_fork(unsigned long clone_flags,
 	}
 
 	p = copy_process(clone_flags, stack_start, stack_size,
-			 child_tidptr, NULL, trace, tls);
+			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
 	/*
 	 * Do this prior waking up the new thread - the thread pointer
 	 * might get invalid after that point, if the thread exits quickly.
-- 
cgit v1.2.3


From 9b492cf58077a0254eb4b9574029ac6e79add9f9 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Mon, 23 May 2016 16:24:10 -0700
Subject: kexec: introduce a protection mechanism for the crashkernel reserved
 memory

For the cases that some kernel (module) path stamps the crash reserved
memory(already mapped by the kernel) where has been loaded the second
kernel data, the kdump kernel will probably fail to boot when panic
happens (or even not happens) leaving the culprit at large, this is
unacceptable.

The patch introduces a mechanism for detecting such cases:

1) After each crash kexec loading, it simply marks the reserved memory
   regions readonly since we no longer access it after that.  When someone
   stamps the region, the first kernel will panic and trigger the kdump.
   The weak arch_kexec_protect_crashkres() is introduced to do the actual
   protection.

2) To allow multiple loading, once 1) was done we also need to remark
   the reserved memory to readwrite each time a system call related to
   kdump is made.  The weak arch_kexec_unprotect_crashkres() is introduced
   to do the actual protection.

The architecture can make its specific implementation by overriding
arch_kexec_protect_crashkres() and arch_kexec_unprotect_crashkres().

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Minfei Huang <mhuang@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kexec.h | 2 ++
 kernel/kexec.c        | 9 ++++++++-
 kernel/kexec_core.c   | 6 ++++++
 kernel/kexec_file.c   | 8 +++++++-
 4 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 2cc643c6e870..643ff4a3fbf6 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -317,6 +317,8 @@ int __weak arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr,
 					Elf_Shdr *sechdrs, unsigned int relsec);
 int __weak arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 					unsigned int relsec);
+void arch_kexec_protect_crashkres(void);
+void arch_kexec_unprotect_crashkres(void);
 
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ee70aef5cd81..b44cb3f5a15c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -167,8 +167,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 		return -EBUSY;
 
 	dest_image = &kexec_image;
-	if (flags & KEXEC_ON_CRASH)
+	if (flags & KEXEC_ON_CRASH) {
 		dest_image = &kexec_crash_image;
+		if (kexec_crash_image)
+			arch_kexec_unprotect_crashkres();
+	}
+
 	if (nr_segments > 0) {
 		unsigned long i;
 
@@ -211,6 +215,9 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 	image = xchg(dest_image, image);
 
 out:
+	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
+		arch_kexec_protect_crashkres();
+
 	mutex_unlock(&kexec_mutex);
 	kimage_free(image);
 
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index d5d408252992..48b73cc8e425 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1563,3 +1563,9 @@ void __weak crash_map_reserved_pages(void)
 
 void __weak crash_unmap_reserved_pages(void)
 {}
+
+void __weak arch_kexec_protect_crashkres(void)
+{}
+
+void __weak arch_kexec_unprotect_crashkres(void)
+{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index c72d2ff5896e..503bc2d348e5 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -274,8 +274,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 		return -EBUSY;
 
 	dest_image = &kexec_image;
-	if (flags & KEXEC_FILE_ON_CRASH)
+	if (flags & KEXEC_FILE_ON_CRASH) {
 		dest_image = &kexec_crash_image;
+		if (kexec_crash_image)
+			arch_kexec_unprotect_crashkres();
+	}
 
 	if (flags & KEXEC_FILE_UNLOAD)
 		goto exchange;
@@ -324,6 +327,9 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 exchange:
 	image = xchg(dest_image, image);
 out:
+	if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image)
+		arch_kexec_protect_crashkres();
+
 	mutex_unlock(&kexec_mutex);
 	kimage_free(image);
 	return ret;
-- 
cgit v1.2.3


From 917a35605f09c0d16aeb2e92c7fbff562e19a116 Mon Sep 17 00:00:00 2001
From: Minfei Huang <mnfhuang@gmail.com>
Date: Mon, 23 May 2016 16:24:16 -0700
Subject: kexec: make a pair of map/unmap reserved pages in error path

For some arch, kexec shall map the reserved pages, then use them, when
we try to start the kdump service.

kexec may return directly, without unmaping the reserved pages, if it
fails during starting service.  To fix it, we make a pair of map/unmap
reserved pages both in generic path and error path.

This patch only affects s390.  Other architecturess don't implement the
interface of crash_unmap_reserved_pages and crash_map_reserved_pages.

It isn't a urgent patch.  Kernel can work well without any risk,
although the reserved pages are not unmapped before returning in error
path.

Signed-off-by: Minfei Huang <mnfhuang@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Xunlei Pang <xlpang@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index b44cb3f5a15c..4b49aa71304f 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -194,22 +194,25 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 						   segments, flags);
 		}
 		if (result)
-			goto out;
+			goto unmap_page;
 
 		if (flags & KEXEC_PRESERVE_CONTEXT)
 			image->preserve_context = 1;
 		result = machine_kexec_prepare(image);
 		if (result)
-			goto out;
+			goto unmap_page;
 
 		for (i = 0; i < nr_segments; i++) {
 			result = kimage_load_segment(image, &image->segment[i]);
 			if (result)
-				goto out;
+				goto unmap_page;
 		}
 		kimage_terminate(image);
+unmap_page:
 		if (flags & KEXEC_ON_CRASH)
 			crash_unmap_reserved_pages();
+		if (result)
+			goto out;
 	}
 	/* Install the new kernel, and  Uninstall the old */
 	image = xchg(dest_image, image);
-- 
cgit v1.2.3


From 0eea08678ebe9f7d8ef98fed974a5bf1a0dd2dd2 Mon Sep 17 00:00:00 2001
From: Minfei Huang <mnfhuang@gmail.com>
Date: Mon, 23 May 2016 16:24:19 -0700
Subject: kexec: do a cleanup for function kexec_load

There are a lof of work to be done in function kexec_load, not only for
allocating structs and loading initram, but also for some misc.

To make it more clear, wrap a new function do_kexec_load which is used
to allocate structs and load initram.  And the pre-work will be done in
kexec_load.

Signed-off-by: Minfei Huang <mnfhuang@gmail.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Xunlei Pang <xlpang@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 125 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 69 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4b49aa71304f..b73dc211fcfd 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -103,6 +103,74 @@ out_free_image:
 	return ret;
 }
 
+static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
+		struct kexec_segment __user *segments, unsigned long flags)
+{
+	struct kimage **dest_image, *image;
+	unsigned long i;
+	int ret;
+
+	if (flags & KEXEC_ON_CRASH) {
+		dest_image = &kexec_crash_image;
+		if (kexec_crash_image)
+			arch_kexec_unprotect_crashkres();
+	} else {
+		dest_image = &kexec_image;
+	}
+
+	if (nr_segments == 0) {
+		/* Uninstall image */
+		kimage_free(xchg(dest_image, NULL));
+		return 0;
+	}
+	if (flags & KEXEC_ON_CRASH) {
+		/*
+		 * Loading another kernel to switch to if this one
+		 * crashes.  Free any current crash dump kernel before
+		 * we corrupt it.
+		 */
+		kimage_free(xchg(&kexec_crash_image, NULL));
+	}
+
+	ret = kimage_alloc_init(&image, entry, nr_segments, segments, flags);
+	if (ret)
+		return ret;
+
+	if (flags & KEXEC_ON_CRASH)
+		crash_map_reserved_pages();
+
+	if (flags & KEXEC_PRESERVE_CONTEXT)
+		image->preserve_context = 1;
+
+	ret = machine_kexec_prepare(image);
+	if (ret)
+		goto out;
+
+	for (i = 0; i < nr_segments; i++) {
+		ret = kimage_load_segment(image, &image->segment[i]);
+		if (ret)
+			goto out;
+	}
+
+	kimage_terminate(image);
+
+	/* Install the new kernel and uninstall the old */
+	image = xchg(dest_image, image);
+
+out:
+	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
+		arch_kexec_protect_crashkres();
+
+	/*
+	 * Once the reserved memory is mapped, we should unmap this memory
+	 * before returning
+	 */
+	if (flags & KEXEC_ON_CRASH)
+		crash_unmap_reserved_pages();
+	kimage_free(image);
+	return ret;
+}
+
 /*
  * Exec Kernel system call: for obvious reasons only root may call it.
  *
@@ -127,7 +195,6 @@ out_free_image:
 SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 		struct kexec_segment __user *, segments, unsigned long, flags)
 {
-	struct kimage **dest_image, *image;
 	int result;
 
 	/* We only trust the superuser with rebooting the system. */
@@ -152,9 +219,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 	if (nr_segments > KEXEC_SEGMENT_MAX)
 		return -EINVAL;
 
-	image = NULL;
-	result = 0;
-
 	/* Because we write directly to the reserved memory
 	 * region when loading crash kernels we need a mutex here to
 	 * prevent multiple crash  kernels from attempting to load
@@ -166,63 +230,12 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 	if (!mutex_trylock(&kexec_mutex))
 		return -EBUSY;
 
-	dest_image = &kexec_image;
-	if (flags & KEXEC_ON_CRASH) {
-		dest_image = &kexec_crash_image;
-		if (kexec_crash_image)
-			arch_kexec_unprotect_crashkres();
-	}
-
-	if (nr_segments > 0) {
-		unsigned long i;
-
-		if (flags & KEXEC_ON_CRASH) {
-			/*
-			 * Loading another kernel to switch to if this one
-			 * crashes.  Free any current crash dump kernel before
-			 * we corrupt it.
-			 */
-
-			kimage_free(xchg(&kexec_crash_image, NULL));
-			result = kimage_alloc_init(&image, entry, nr_segments,
-						   segments, flags);
-			crash_map_reserved_pages();
-		} else {
-			/* Loading another kernel to reboot into. */
-
-			result = kimage_alloc_init(&image, entry, nr_segments,
-						   segments, flags);
-		}
-		if (result)
-			goto unmap_page;
-
-		if (flags & KEXEC_PRESERVE_CONTEXT)
-			image->preserve_context = 1;
-		result = machine_kexec_prepare(image);
-		if (result)
-			goto unmap_page;
-
-		for (i = 0; i < nr_segments; i++) {
-			result = kimage_load_segment(image, &image->segment[i]);
-			if (result)
-				goto unmap_page;
-		}
-		kimage_terminate(image);
-unmap_page:
-		if (flags & KEXEC_ON_CRASH)
-			crash_unmap_reserved_pages();
-		if (result)
-			goto out;
-	}
-	/* Install the new kernel, and  Uninstall the old */
-	image = xchg(dest_image, image);
+	result = do_kexec_load(entry, nr_segments, segments, flags);
 
-out:
 	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
 		arch_kexec_protect_crashkres();
 
 	mutex_unlock(&kexec_mutex);
-	kimage_free(image);
 
 	return result;
 }
-- 
cgit v1.2.3


From 7a0058ec78602da02b34fa2ae3afc523e90d1ab2 Mon Sep 17 00:00:00 2001
From: Xunlei Pang <xlpang@redhat.com>
Date: Mon, 23 May 2016 16:24:22 -0700
Subject: s390/kexec: consolidate crash_map/unmap_reserved_pages() and
 arch_kexec_protect(unprotect)_crashkres()

Commit 3f625002581b ("kexec: introduce a protection mechanism for the
crashkernel reserved memory") is a similar mechanism for protecting the
crash kernel reserved memory to previous crash_map/unmap_reserved_pages()
implementation, the new one is more generic in name and cleaner in code
(besides, some arch may not be allowed to unmap the pgtable).

Therefore, this patch consolidates them, and uses the new
arch_kexec_protect(unprotect)_crashkres() to replace former
crash_map/unmap_reserved_pages() which by now has been only used by
S390.

The consolidation work needs the crash memory to be mapped initially,
this is done in machine_kdump_pm_init() which is after
reserve_crashkernel().  Once kdump kernel is loaded, the new
arch_kexec_protect_crashkres() implemented for S390 will actually
unmap the pgtable like before.

Signed-off-by: Xunlei Pang <xlpang@redhat.com>
Signed-off-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Acked-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Minfei Huang <mhuang@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/s390/kernel/machine_kexec.c | 28 ++++++++++++++++++----------
 include/linux/kexec.h            |  2 --
 kernel/kexec.c                   | 12 ------------
 kernel/kexec_core.c              | 11 ++---------
 4 files changed, 20 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index 2f1b7217c25c..0e64f08d3d69 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -43,13 +43,13 @@ static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action,
 	switch (action) {
 	case PM_SUSPEND_PREPARE:
 	case PM_HIBERNATION_PREPARE:
-		if (crashk_res.start)
-			crash_map_reserved_pages();
+		if (kexec_crash_image)
+			arch_kexec_unprotect_crashkres();
 		break;
 	case PM_POST_SUSPEND:
 	case PM_POST_HIBERNATION:
-		if (crashk_res.start)
-			crash_unmap_reserved_pages();
+		if (kexec_crash_image)
+			arch_kexec_protect_crashkres();
 		break;
 	default:
 		return NOTIFY_DONE;
@@ -60,6 +60,8 @@ static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action,
 static int __init machine_kdump_pm_init(void)
 {
 	pm_notifier(machine_kdump_pm_cb, 0);
+	/* Create initial mapping for crashkernel memory */
+	arch_kexec_unprotect_crashkres();
 	return 0;
 }
 arch_initcall(machine_kdump_pm_init);
@@ -146,6 +148,8 @@ static int kdump_csum_valid(struct kimage *image)
 #endif
 }
 
+#ifdef CONFIG_CRASH_DUMP
+
 /*
  * Map or unmap crashkernel memory
  */
@@ -167,21 +171,25 @@ static void crash_map_pages(int enable)
 }
 
 /*
- * Map crashkernel memory
+ * Unmap crashkernel memory
  */
-void crash_map_reserved_pages(void)
+void arch_kexec_protect_crashkres(void)
 {
-	crash_map_pages(1);
+	if (crashk_res.end)
+		crash_map_pages(0);
 }
 
 /*
- * Unmap crashkernel memory
+ * Map crashkernel memory
  */
-void crash_unmap_reserved_pages(void)
+void arch_kexec_unprotect_crashkres(void)
 {
-	crash_map_pages(0);
+	if (crashk_res.end)
+		crash_map_pages(1);
 }
 
+#endif
+
 /*
  * Give back memory to hypervisor before new kdump is loaded
  */
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 643ff4a3fbf6..e8acb2b43dd9 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -230,8 +230,6 @@ extern void crash_kexec(struct pt_regs *);
 int kexec_should_crash(struct task_struct *);
 void crash_save_cpu(struct pt_regs *regs, int cpu);
 void crash_save_vmcoreinfo(void);
-void crash_map_reserved_pages(void);
-void crash_unmap_reserved_pages(void);
 void arch_crash_save_vmcoreinfo(void);
 __printf(1, 2)
 void vmcoreinfo_append_str(const char *fmt, ...);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b73dc211fcfd..4384672d3245 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -136,9 +136,6 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
 	if (ret)
 		return ret;
 
-	if (flags & KEXEC_ON_CRASH)
-		crash_map_reserved_pages();
-
 	if (flags & KEXEC_PRESERVE_CONTEXT)
 		image->preserve_context = 1;
 
@@ -161,12 +158,6 @@ out:
 	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
 		arch_kexec_protect_crashkres();
 
-	/*
-	 * Once the reserved memory is mapped, we should unmap this memory
-	 * before returning
-	 */
-	if (flags & KEXEC_ON_CRASH)
-		crash_unmap_reserved_pages();
 	kimage_free(image);
 	return ret;
 }
@@ -232,9 +223,6 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 
 	result = do_kexec_load(entry, nr_segments, segments, flags);
 
-	if ((flags & KEXEC_ON_CRASH) && kexec_crash_image)
-		arch_kexec_protect_crashkres();
-
 	mutex_unlock(&kexec_mutex);
 
 	return result;
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 48b73cc8e425..56b3ed0927b0 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -954,7 +954,6 @@ int crash_shrink_memory(unsigned long new_size)
 	start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
 	end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
 
-	crash_map_reserved_pages();
 	crash_free_reserved_phys_range(end, crashk_res.end);
 
 	if ((start == end) && (crashk_res.parent != NULL))
@@ -968,7 +967,6 @@ int crash_shrink_memory(unsigned long new_size)
 	crashk_res.end = end - 1;
 
 	insert_resource(&iomem_resource, ram_res);
-	crash_unmap_reserved_pages();
 
 unlock:
 	mutex_unlock(&kexec_mutex);
@@ -1553,17 +1551,12 @@ int kernel_kexec(void)
 }
 
 /*
- * Add and remove page tables for crashkernel memory
+ * Protection mechanism for crashkernel reserved memory after
+ * the kdump kernel is loaded.
  *
  * Provide an empty default implementation here -- architecture
  * code may override this
  */
-void __weak crash_map_reserved_pages(void)
-{}
-
-void __weak crash_unmap_reserved_pages(void)
-{}
-
 void __weak arch_kexec_protect_crashkres(void)
 {}
 
-- 
cgit v1.2.3


From 7c051267931a9be9c6620cc17b362bc6ee6dedc8 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 23 May 2016 16:25:48 -0700
Subject: mm, fork: make dup_mmap wait for mmap_sem for write killable

dup_mmap needs to lock current's mm mmap_sem for write.  If the waiting
task gets killed by the oom killer it would block oom_reaper from
asynchronous address space reclaim and reduce the chances of timely OOM
resolving.  Wait for the lock in the killable mode and return with EINTR
if the task got killed while waiting.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index e67d7b773348..47887bba944f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -414,7 +414,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	unsigned long charge;
 
 	uprobe_start_dup_mmap();
-	down_write(&oldmm->mmap_sem);
+	if (down_write_killable(&oldmm->mmap_sem)) {
+		retval = -EINTR;
+		goto fail_uprobe_end;
+	}
 	flush_cache_dup_mm(oldmm);
 	uprobe_dup_mmap(oldmm, mm);
 	/*
@@ -526,6 +529,7 @@ out:
 	up_write(&mm->mmap_sem);
 	flush_tlb_mm(oldmm);
 	up_write(&oldmm->mmap_sem);
+fail_uprobe_end:
 	uprobe_end_dup_mmap();
 	return retval;
 fail_nomem_anon_vma_fork:
-- 
cgit v1.2.3


From 17b0573d77fbd547cbf1d711b90d269bdcc1efd3 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 23 May 2016 16:26:05 -0700
Subject: prctl: make PR_SET_THP_DISABLE wait for mmap_sem killable

PR_SET_THP_DISABLE requires mmap_sem for write.  If the waiting task
gets killed by the oom killer it would block oom_reaper from
asynchronous address space reclaim and reduce the chances of timely OOM
resolving.  Wait for the lock in the killable mode and return with EINTR
if the task got killed while waiting.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Alex Thorlton <athorlton@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index cf8ba545c7d3..89d5be418157 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2246,7 +2246,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_SET_THP_DISABLE:
 		if (arg3 || arg4 || arg5)
 			return -EINVAL;
-		down_write(&me->mm->mmap_sem);
+		if (down_write_killable(&me->mm->mmap_sem))
+			return -EINTR;
 		if (arg2)
 			me->mm->def_flags |= VM_NOHUGEPAGE;
 		else
-- 
cgit v1.2.3


From 598fdc1d66674264e122ca9d007ad822e98d8b8d Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 23 May 2016 16:26:08 -0700
Subject: uprobes: wait for mmap_sem for write killable

xol_add_vma needs mmap_sem for write.  If the waiting task gets killed
by the oom killer it would block oom_reaper from asynchronous address
space reclaim and reduce the chances of timely OOM resolving.  Wait for
the lock in the killable mode and return with EINTR if the task got
killed while waiting.

Do not warn in dup_xol_work if __create_xol_area failed due to fatal
signal pending because this is usually considered a kernel issue.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/events/uprobes.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c01f733ff2e1..b7a525ab2083 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1130,7 +1130,9 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
 	struct vm_area_struct *vma;
 	int ret;
 
-	down_write(&mm->mmap_sem);
+	if (down_write_killable(&mm->mmap_sem))
+		return -EINTR;
+
 	if (mm->uprobes_state.xol_area) {
 		ret = -EALREADY;
 		goto fail;
@@ -1469,7 +1471,8 @@ static void dup_xol_work(struct callback_head *work)
 	if (current->flags & PF_EXITING)
 		return;
 
-	if (!__create_xol_area(current->utask->dup_xol_addr))
+	if (!__create_xol_area(current->utask->dup_xol_addr) &&
+			!fatal_signal_pending(current))
 		uprobe_warn(current, "dup xol area");
 }
 
-- 
cgit v1.2.3


From 59fa5860204ffc95128d60cba9f54f9740a42c7d Mon Sep 17 00:00:00 2001
From: Matt Redfearn <matt.redfearn@imgtec.com>
Date: Tue, 24 May 2016 11:42:30 +0100
Subject: genirq: Fix missing return value in irq_destroy_ipi()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 7cec18a3906b changed the return type of irq_destroy_ipi to int, but
missed adding a value to one return statement. Fix this to silence the
resulting compiler warning:

kernel/irq/ipi.c In function ‘irq_destroy_ipi’:
kernel/irq/ipi.c:128:3: warning: ‘return’ with no value, in function returning non-void [-Wreturn-type]

Fixes: 7cec18a3906b "genirq: Add error code reporting to irq_{reserve,destroy}_ipi"
Signed-off-by: Matt Redfearn <matt.redfearn@imgtec.com>
Cc: linux-mips@linux-mips.org
Link: http://lkml.kernel.org/r/1464086550-24734-1-git-send-email-matt.redfearn@imgtec.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/ipi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index c42742208e5e..89b49f6773f0 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -125,7 +125,7 @@ int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
 
 	domain = data->domain;
 	if (WARN_ON(domain == NULL))
-		return;
+		return -EINVAL;
 
 	if (!irq_domain_is_ipi(domain)) {
 		pr_warn("Trying to destroy a non IPI domain!\n");
-- 
cgit v1.2.3


From b7e7ade34e6188bee2e3b0d42b51d25137d9e2a5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 23 May 2016 11:19:07 +0200
Subject: sched/core: Fix remote wakeups

Commit:

  b5179ac70de8 ("sched/fair: Prepare to fix fairness problems on migration")

... introduced a bug: Mike Galbraith found that it introduced a
performance regression, while Paul E. McKenney reported lost
wakeups and bisected it to this commit.

The reason is that I mis-read ttwu_queue() such that I assumed any
wakeup that got a remote queue must have had the task migrated.

Since this is not so; we need to transfer this information between
queueing the wakeup and actually doing the wakeup. Use a new
task_struct::sched_flag for this, we already write to
sched_contributes_to_load in the wakeup path so this is a hot and
modified cacheline.

Reported-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Hunter <ahh@google.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Ben Segall <bsegall@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul Turner <pjt@google.com>
Cc: Pavan Kondeti <pkondeti@codeaurora.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: byungchul.park@lge.com
Fixes: b5179ac70de8 ("sched/fair: Prepare to fix fairness problems on migration")
Link: http://lkml.kernel.org/r/20160523091907.GD15728@worktop.ger.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  1 +
 kernel/sched/core.c   | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6cc0df970f1a..e053517a88b6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1533,6 +1533,7 @@ struct task_struct {
 	unsigned sched_reset_on_fork:1;
 	unsigned sched_contributes_to_load:1;
 	unsigned sched_migrated:1;
+	unsigned sched_remote_wakeup:1;
 	unsigned :0; /* force alignment to the next boundary */
 
 	/* unserialized, strictly 'current' */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 404c0784b1fc..7f2cae4620c7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1768,13 +1768,15 @@ void sched_ttwu_pending(void)
 	cookie = lockdep_pin_lock(&rq->lock);
 
 	while (llist) {
+		int wake_flags = 0;
+
 		p = llist_entry(llist, struct task_struct, wake_entry);
 		llist = llist_next(llist);
-		/*
-		 * See ttwu_queue(); we only call ttwu_queue_remote() when
-		 * its a x-cpu wakeup.
-		 */
-		ttwu_do_activate(rq, p, WF_MIGRATED, cookie);
+
+		if (p->sched_remote_wakeup)
+			wake_flags = WF_MIGRATED;
+
+		ttwu_do_activate(rq, p, wake_flags, cookie);
 	}
 
 	lockdep_unpin_lock(&rq->lock, cookie);
@@ -1819,10 +1821,12 @@ void scheduler_ipi(void)
 	irq_exit();
 }
 
-static void ttwu_queue_remote(struct task_struct *p, int cpu)
+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
 {
 	struct rq *rq = cpu_rq(cpu);
 
+	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
+
 	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
 		if (!set_nr_if_polling(rq->idle))
 			smp_send_reschedule(cpu);
@@ -1869,7 +1873,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
 #if defined(CONFIG_SMP)
 	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
 		sched_clock_cpu(cpu); /* sync clocks x-cpu */
-		ttwu_queue_remote(p, cpu);
+		ttwu_queue_remote(p, cpu, wake_flags);
 		return;
 	}
 #endif
-- 
cgit v1.2.3


From 887bddfa90c79957d61067cd54a10087be0c8b23 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 26 May 2016 00:04:58 -0400
Subject: add down_write_killable_nested()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/rwsem.h  |  2 ++
 kernel/locking/rwsem.c | 16 ++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index d1c12d160ace..d37fbb34d06f 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -156,6 +156,7 @@ extern void downgrade_write(struct rw_semaphore *sem);
  */
 extern void down_read_nested(struct rw_semaphore *sem, int subclass);
 extern void down_write_nested(struct rw_semaphore *sem, int subclass);
+extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass);
 extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock);
 
 # define down_write_nest_lock(sem, nest_lock)			\
@@ -176,6 +177,7 @@ extern void up_read_non_owner(struct rw_semaphore *sem);
 # define down_read_nested(sem, subclass)		down_read(sem)
 # define down_write_nest_lock(sem, nest_lock)	down_write(sem)
 # define down_write_nested(sem, subclass)	down_write(sem)
+# define down_write_killable_nested(sem, subclass)	down_write_killable(sem)
 # define down_read_non_owner(sem)		down_read(sem)
 # define up_read_non_owner(sem)			up_read(sem)
 #endif
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index c817216c1615..2e853ad93a3a 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -173,6 +173,22 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
 
 EXPORT_SYMBOL(down_write_nested);
 
+int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
+{
+	might_sleep();
+	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
+
+	if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) {
+		rwsem_release(&sem->dep_map, 1, _RET_IP_);
+		return -EINTR;
+	}
+
+	rwsem_set_owner(sem);
+	return 0;
+}
+
+EXPORT_SYMBOL(down_write_killable_nested);
+
 void up_read_non_owner(struct rw_semaphore *sem)
 {
 	__up_read(sem);
-- 
cgit v1.2.3


From b00c52dae6d9ee8d0f2407118ef6544ae5524781 Mon Sep 17 00:00:00 2001
From: Wenwei Tao <ww.tao0320@gmail.com>
Date: Fri, 13 May 2016 22:59:20 +0800
Subject: cgroup: remove redundant cleanup in css_create

When create css failed, before call css_free_rcu_fn, we remove the css
id and exit the percpu_ref, but we will do these again in
css_free_work_fn, so they are redundant.  Especially the css id, that
would cause problem if we remove it twice, since it may be assigned to
another css after the first remove.

tj: This was broken by two commits updating the free path without
    synchronizing the creation failure path.  This can be easily
    triggered by trying to create more than 64k memory cgroups.

Signed-off-by: Wenwei Tao <ww.tao0320@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Fixes: 9a1049da9bd2 ("percpu-refcount: require percpu_ref to be exited explicitly")
Fixes: 01e586598b22 ("cgroup: release css->id after css_free")
Cc: stable@vger.kernel.org # v3.17+
---
 kernel/cgroup.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 86cb5c6e8932..789b84f973c9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5150,7 +5150,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 
 	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
 	if (err < 0)
-		goto err_free_percpu_ref;
+		goto err_free_css;
 	css->id = err;
 
 	/* @css is ready to be brought online now, make it visible */
@@ -5174,9 +5174,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 
 err_list_del:
 	list_del_rcu(&css->sibling);
-	cgroup_idr_remove(&ss->css_idr, css->id);
-err_free_percpu_ref:
-	percpu_ref_exit(&css->refcnt);
 err_free_css:
 	call_rcu(&css->rcu_head, css_free_rcu_fn);
 	return ERR_PTR(err);
-- 
cgit v1.2.3


From 7ef949d77f95f0d129f0d404b336459a34a00101 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 26 May 2016 15:16:22 -0700
Subject: mm: oom_reaper: remove some bloat

mmput_async is currently used only from the oom_reaper which is defined
only for CONFIG_MMU.  We can save work_struct in mm_struct for
!CONFIG_MMU.

[akpm@linux-foundation.org: fix typo, per Minchan]
Link: http://lkml.kernel.org/r/20160520061658.GB19172@dhcp22.suse.cz
Reported-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 2 ++
 include/linux/sched.h    | 4 +++-
 kernel/fork.c            | 2 ++
 3 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d553855503e6..ca3e517980a0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -514,7 +514,9 @@ struct mm_struct {
 #ifdef CONFIG_HUGETLB_PAGE
 	atomic_long_t hugetlb_usage;
 #endif
+#ifdef CONFIG_MMU
 	struct work_struct async_put_work;
+#endif
 };
 
 static inline void mm_init_cpumask(struct mm_struct *mm)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 23e075dcdfe4..6e42ada26345 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2745,10 +2745,12 @@ static inline bool mmget_not_zero(struct mm_struct *mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
-/* same as above but performs the slow path from the async kontext. Can
+#ifdef CONFIG_MMU
+/* same as above but performs the slow path from the async context. Can
  * be called from the atomic context as well
  */
 extern void mmput_async(struct mm_struct *);
+#endif
 
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);
diff --git a/kernel/fork.c b/kernel/fork.c
index 47887bba944f..5c2c355aa97f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -736,6 +736,7 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+#ifdef CONFIG_MMU
 static void mmput_async_fn(struct work_struct *work)
 {
 	struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
@@ -749,6 +750,7 @@ void mmput_async(struct mm_struct *mm)
 		schedule_work(&mm->async_put_work);
 	}
 }
+#endif
 
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
-- 
cgit v1.2.3


From 287980e49ffc0f6d911601e7e352a812ed27768e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 27 May 2016 23:23:25 +0200
Subject: remove lots of IS_ERR_VALUE abuses

Most users of IS_ERR_VALUE() in the kernel are wrong, as they
pass an 'int' into a function that takes an 'unsigned long'
argument. This happens to work because the type is sign-extended
on 64-bit architectures before it gets converted into an
unsigned type.

However, anything that passes an 'unsigned short' or 'unsigned int'
argument into IS_ERR_VALUE() is guaranteed to be broken, as are
8-bit integers and types that are wider than 'unsigned long'.

Andrzej Hajda has already fixed a lot of the worst abusers that
were causing actual bugs, but it would be nice to prevent any
users that are not passing 'unsigned long' arguments.

This patch changes all users of IS_ERR_VALUE() that I could find
on 32-bit ARM randconfig builds and x86 allmodconfig. For the
moment, this doesn't change the definition of IS_ERR_VALUE()
because there are probably still architecture specific users
elsewhere.

Almost all the warnings I got are for files that are better off
using 'if (err)' or 'if (err < 0)'.
The only legitimate user I could find that we get a warning for
is the (32-bit only) freescale fman driver, so I did not remove
the IS_ERR_VALUE() there but changed the type to 'unsigned long'.
For 9pfs, I just worked around one user whose calling conventions
are so obscure that I did not dare change the behavior.

I was using this definition for testing:

 #define IS_ERR_VALUE(x) ((unsigned long*)NULL == (typeof (x)*)NULL && \
       unlikely((unsigned long long)(x) >= (unsigned long long)(typeof(x))-MAX_ERRNO))

which ends up making all 16-bit or wider types work correctly with
the most plausible interpretation of what IS_ERR_VALUE() was supposed
to return according to its users, but also causes a compile-time
warning for any users that do not pass an 'unsigned long' argument.

I suggested this approach earlier this year, but back then we ended
up deciding to just fix the users that are obviously broken. After
the initial warning that caused me to get involved in the discussion
(fs/gfs2/dir.c) showed up again in the mainline kernel, Linus
asked me to send the whole thing again.

[ Updated the 9p parts as per Al Viro  - Linus ]

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Andrzej Hajda <a.hajda@samsung.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://lkml.org/lkml/2016/1/7/363
Link: https://lkml.org/lkml/2016/5/27/486
Acked-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org> # For nvmem part
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/acpi/acpi_dbg.c                          | 22 +++++++++++-----------
 drivers/ata/sata_highbank.c                      |  2 +-
 drivers/clk/tegra/clk-tegra210.c                 |  2 +-
 drivers/cpufreq/omap-cpufreq.c                   |  2 +-
 drivers/crypto/caam/ctrl.c                       |  2 +-
 drivers/dma/sun4i-dma.c                          | 16 ++++++++--------
 drivers/gpio/gpio-xlp.c                          |  2 +-
 drivers/gpu/drm/sti/sti_vtg.c                    |  4 ++--
 drivers/gpu/drm/tilcdc/tilcdc_tfp410.c           |  2 +-
 drivers/gpu/host1x/hw/intr_hw.c                  |  2 +-
 drivers/iommu/arm-smmu-v3.c                      | 18 +++++++++---------
 drivers/iommu/arm-smmu.c                         |  8 ++++----
 drivers/irqchip/irq-clps711x.c                   |  2 +-
 drivers/irqchip/irq-gic.c                        |  2 +-
 drivers/irqchip/irq-hip04.c                      |  2 +-
 drivers/irqchip/spear-shirq.c                    |  2 +-
 drivers/media/i2c/adp1653.c                      | 10 +++++-----
 drivers/media/platform/s5p-tv/mixer_drv.c        |  2 +-
 drivers/mfd/twl4030-irq.c                        |  2 +-
 drivers/mmc/core/mmc.c                           |  4 ++--
 drivers/mmc/host/dw_mmc.c                        |  6 +++---
 drivers/mmc/host/sdhci-esdhc-imx.c               |  2 +-
 drivers/mmc/host/sdhci-of-at91.c                 |  2 +-
 drivers/mmc/host/sdhci.c                         |  4 ++--
 drivers/net/ethernet/freescale/fman/fman.c       |  2 +-
 drivers/net/ethernet/freescale/fman/fman_muram.c |  4 ++--
 drivers/net/ethernet/freescale/fman/fman_muram.h |  4 ++--
 drivers/net/wireless/ti/wlcore/spi.c             |  4 ++--
 drivers/nvmem/core.c                             | 22 +++++++++++-----------
 drivers/tty/serial/amba-pl011.c                  |  2 +-
 drivers/tty/serial/sprd_serial.c                 |  2 +-
 drivers/video/fbdev/da8xx-fb.c                   |  4 ++--
 fs/afs/write.c                                   |  4 ----
 fs/binfmt_flat.c                                 |  6 +++---
 fs/gfs2/dir.c                                    | 15 +++++++++------
 kernel/pid.c                                     |  2 +-
 net/9p/client.c                                  |  8 ++++----
 sound/soc/qcom/lpass-platform.c                  |  4 ++--
 38 files changed, 102 insertions(+), 103 deletions(-)

(limited to 'kernel')

diff --git a/drivers/acpi/acpi_dbg.c b/drivers/acpi/acpi_dbg.c
index 15e4604efba7..1f4128487dd4 100644
--- a/drivers/acpi/acpi_dbg.c
+++ b/drivers/acpi/acpi_dbg.c
@@ -265,7 +265,7 @@ static int acpi_aml_write_kern(const char *buf, int len)
 	char *p;
 
 	ret = acpi_aml_lock_write(crc, ACPI_AML_OUT_KERN);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		return ret;
 	/* sync tail before inserting logs */
 	smp_mb();
@@ -286,7 +286,7 @@ static int acpi_aml_readb_kern(void)
 	char *p;
 
 	ret = acpi_aml_lock_read(crc, ACPI_AML_IN_KERN);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		return ret;
 	/* sync head before removing cmds */
 	smp_rmb();
@@ -330,7 +330,7 @@ again:
 				goto again;
 			break;
 		}
-		if (IS_ERR_VALUE(ret))
+		if (ret < 0)
 			break;
 		size += ret;
 		count -= ret;
@@ -373,7 +373,7 @@ again:
 			if (ret == 0)
 				goto again;
 		}
-		if (IS_ERR_VALUE(ret))
+		if (ret < 0)
 			break;
 		*(msg + size) = (char)ret;
 		size++;
@@ -526,7 +526,7 @@ static int acpi_aml_open(struct inode *inode, struct file *file)
 	}
 	acpi_aml_io.users++;
 err_lock:
-	if (IS_ERR_VALUE(ret)) {
+	if (ret < 0) {
 		if (acpi_aml_active_reader == file)
 			acpi_aml_active_reader = NULL;
 	}
@@ -587,7 +587,7 @@ static int acpi_aml_read_user(char __user *buf, int len)
 	char *p;
 
 	ret = acpi_aml_lock_read(crc, ACPI_AML_OUT_USER);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		return ret;
 	/* sync head before removing logs */
 	smp_rmb();
@@ -602,7 +602,7 @@ static int acpi_aml_read_user(char __user *buf, int len)
 	crc->tail = (crc->tail + n) & (ACPI_AML_BUF_SIZE - 1);
 	ret = n;
 out:
-	acpi_aml_unlock_fifo(ACPI_AML_OUT_USER, !IS_ERR_VALUE(ret));
+	acpi_aml_unlock_fifo(ACPI_AML_OUT_USER, !ret);
 	return ret;
 }
 
@@ -634,7 +634,7 @@ again:
 					goto again;
 			}
 		}
-		if (IS_ERR_VALUE(ret)) {
+		if (ret < 0) {
 			if (!acpi_aml_running())
 				ret = 0;
 			break;
@@ -657,7 +657,7 @@ static int acpi_aml_write_user(const char __user *buf, int len)
 	char *p;
 
 	ret = acpi_aml_lock_write(crc, ACPI_AML_IN_USER);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		return ret;
 	/* sync tail before inserting cmds */
 	smp_mb();
@@ -672,7 +672,7 @@ static int acpi_aml_write_user(const char __user *buf, int len)
 	crc->head = (crc->head + n) & (ACPI_AML_BUF_SIZE - 1);
 	ret = n;
 out:
-	acpi_aml_unlock_fifo(ACPI_AML_IN_USER, !IS_ERR_VALUE(ret));
+	acpi_aml_unlock_fifo(ACPI_AML_IN_USER, !ret);
 	return n;
 }
 
@@ -704,7 +704,7 @@ again:
 					goto again;
 			}
 		}
-		if (IS_ERR_VALUE(ret)) {
+		if (ret < 0) {
 			if (!acpi_aml_running())
 				ret = 0;
 			break;
diff --git a/drivers/ata/sata_highbank.c b/drivers/ata/sata_highbank.c
index 8638d575b2b9..aafb8cc03523 100644
--- a/drivers/ata/sata_highbank.c
+++ b/drivers/ata/sata_highbank.c
@@ -197,7 +197,7 @@ static void highbank_set_em_messages(struct device *dev,
 
 	for (i = 0; i < SGPIO_PINS; i++) {
 		err = of_get_named_gpio(np, "calxeda,sgpio-gpio", i);
-		if (IS_ERR_VALUE(err))
+		if (err < 0)
 			return;
 
 		pdata->sgpio_gpio[i] = err;
diff --git a/drivers/clk/tegra/clk-tegra210.c b/drivers/clk/tegra/clk-tegra210.c
index b8551813ec43..456cf586d2c2 100644
--- a/drivers/clk/tegra/clk-tegra210.c
+++ b/drivers/clk/tegra/clk-tegra210.c
@@ -1221,7 +1221,7 @@ static int tegra210_pll_fixed_mdiv_cfg(struct clk_hw *hw,
 		p = rate >= params->vco_min ? 1 : -EINVAL;
 	}
 
-	if (IS_ERR_VALUE(p))
+	if (p < 0)
 		return -EINVAL;
 
 	cfg->m = tegra_pll_get_fixed_mdiv(hw, input_rate);
diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c
index cead9bec4843..376e63ca94e8 100644
--- a/drivers/cpufreq/omap-cpufreq.c
+++ b/drivers/cpufreq/omap-cpufreq.c
@@ -54,7 +54,7 @@ static int omap_target(struct cpufreq_policy *policy, unsigned int index)
 
 	freq = new_freq * 1000;
 	ret = clk_round_rate(policy->clk, freq);
-	if (IS_ERR_VALUE(ret)) {
+	if (ret < 0) {
 		dev_warn(mpu_dev,
 			 "CPUfreq: Cannot find matching frequency for %lu\n",
 			 freq);
diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c
index 44d30b45f3cc..5ad5f3009ae0 100644
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
@@ -402,7 +402,7 @@ int caam_get_era(void)
 	ret = of_property_read_u32(caam_node, "fsl,sec-era", &prop);
 	of_node_put(caam_node);
 
-	return IS_ERR_VALUE(ret) ? -ENOTSUPP : prop;
+	return ret ? -ENOTSUPP : prop;
 }
 EXPORT_SYMBOL(caam_get_era);
 
diff --git a/drivers/dma/sun4i-dma.c b/drivers/dma/sun4i-dma.c
index e0df233dde92..57aa227bfadb 100644
--- a/drivers/dma/sun4i-dma.c
+++ b/drivers/dma/sun4i-dma.c
@@ -461,25 +461,25 @@ generate_ndma_promise(struct dma_chan *chan, dma_addr_t src, dma_addr_t dest,
 
 	/* Source burst */
 	ret = convert_burst(sconfig->src_maxburst);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		goto fail;
 	promise->cfg |= SUN4I_DMA_CFG_SRC_BURST_LENGTH(ret);
 
 	/* Destination burst */
 	ret = convert_burst(sconfig->dst_maxburst);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		goto fail;
 	promise->cfg |= SUN4I_DMA_CFG_DST_BURST_LENGTH(ret);
 
 	/* Source bus width */
 	ret = convert_buswidth(sconfig->src_addr_width);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		goto fail;
 	promise->cfg |= SUN4I_DMA_CFG_SRC_DATA_WIDTH(ret);
 
 	/* Destination bus width */
 	ret = convert_buswidth(sconfig->dst_addr_width);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		goto fail;
 	promise->cfg |= SUN4I_DMA_CFG_DST_DATA_WIDTH(ret);
 
@@ -518,25 +518,25 @@ generate_ddma_promise(struct dma_chan *chan, dma_addr_t src, dma_addr_t dest,
 
 	/* Source burst */
 	ret = convert_burst(sconfig->src_maxburst);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		goto fail;
 	promise->cfg |= SUN4I_DMA_CFG_SRC_BURST_LENGTH(ret);
 
 	/* Destination burst */
 	ret = convert_burst(sconfig->dst_maxburst);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		goto fail;
 	promise->cfg |= SUN4I_DMA_CFG_DST_BURST_LENGTH(ret);
 
 	/* Source bus width */
 	ret = convert_buswidth(sconfig->src_addr_width);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		goto fail;
 	promise->cfg |= SUN4I_DMA_CFG_SRC_DATA_WIDTH(ret);
 
 	/* Destination bus width */
 	ret = convert_buswidth(sconfig->dst_addr_width);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		goto fail;
 	promise->cfg |= SUN4I_DMA_CFG_DST_DATA_WIDTH(ret);
 
diff --git a/drivers/gpio/gpio-xlp.c b/drivers/gpio/gpio-xlp.c
index 08897dc11915..1a33a19d95b9 100644
--- a/drivers/gpio/gpio-xlp.c
+++ b/drivers/gpio/gpio-xlp.c
@@ -393,7 +393,7 @@ static int xlp_gpio_probe(struct platform_device *pdev)
 		irq_base = irq_alloc_descs(-1, 0, gc->ngpio, 0);
 	else
 		irq_base = irq_alloc_descs(-1, XLP_GPIO_IRQ_BASE, gc->ngpio, 0);
-	if (IS_ERR_VALUE(irq_base)) {
+	if (irq_base < 0) {
 		dev_err(&pdev->dev, "Failed to allocate IRQ numbers\n");
 		return irq_base;
 	}
diff --git a/drivers/gpu/drm/sti/sti_vtg.c b/drivers/gpu/drm/sti/sti_vtg.c
index 32c7986b63ab..6bf4ce466d20 100644
--- a/drivers/gpu/drm/sti/sti_vtg.c
+++ b/drivers/gpu/drm/sti/sti_vtg.c
@@ -437,7 +437,7 @@ static int vtg_probe(struct platform_device *pdev)
 			return -EPROBE_DEFER;
 	} else {
 		vtg->irq = platform_get_irq(pdev, 0);
-		if (IS_ERR_VALUE(vtg->irq)) {
+		if (vtg->irq < 0) {
 			DRM_ERROR("Failed to get VTG interrupt\n");
 			return vtg->irq;
 		}
@@ -447,7 +447,7 @@ static int vtg_probe(struct platform_device *pdev)
 		ret = devm_request_threaded_irq(dev, vtg->irq, vtg_irq,
 				vtg_irq_thread, IRQF_ONESHOT,
 				dev_name(dev), vtg);
-		if (IS_ERR_VALUE(ret)) {
+		if (ret < 0) {
 			DRM_ERROR("Failed to register VTG interrupt\n");
 			return ret;
 		}
diff --git a/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c b/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c
index 7716f42f8aab..6b8c5b3bf588 100644
--- a/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c
+++ b/drivers/gpu/drm/tilcdc/tilcdc_tfp410.c
@@ -342,7 +342,7 @@ static int tfp410_probe(struct platform_device *pdev)
 
 	tfp410_mod->gpio = of_get_named_gpio_flags(node, "powerdn-gpio",
 			0, NULL);
-	if (IS_ERR_VALUE(tfp410_mod->gpio)) {
+	if (tfp410_mod->gpio < 0) {
 		dev_warn(&pdev->dev, "No power down GPIO\n");
 	} else {
 		ret = gpio_request(tfp410_mod->gpio, "DVI_PDn");
diff --git a/drivers/gpu/host1x/hw/intr_hw.c b/drivers/gpu/host1x/hw/intr_hw.c
index 498b37e39058..e1e31e9e67cd 100644
--- a/drivers/gpu/host1x/hw/intr_hw.c
+++ b/drivers/gpu/host1x/hw/intr_hw.c
@@ -85,7 +85,7 @@ static int _host1x_intr_init_host_sync(struct host1x *host, u32 cpm,
 	err = devm_request_irq(host->dev, host->intr_syncpt_irq,
 			       syncpt_thresh_isr, IRQF_SHARED,
 			       "host1x_syncpt", host);
-	if (IS_ERR_VALUE(err)) {
+	if (err < 0) {
 		WARN_ON(1);
 		return err;
 	}
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index ebab33e77d67..94b68213c50d 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -1477,7 +1477,7 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain,
 	struct arm_smmu_s1_cfg *cfg = &smmu_domain->s1_cfg;
 
 	asid = arm_smmu_bitmap_alloc(smmu->asid_map, smmu->asid_bits);
-	if (IS_ERR_VALUE(asid))
+	if (asid < 0)
 		return asid;
 
 	cfg->cdptr = dmam_alloc_coherent(smmu->dev, CTXDESC_CD_DWORDS << 3,
@@ -1508,7 +1508,7 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain,
 	struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg;
 
 	vmid = arm_smmu_bitmap_alloc(smmu->vmid_map, smmu->vmid_bits);
-	if (IS_ERR_VALUE(vmid))
+	if (vmid < 0)
 		return vmid;
 
 	cfg->vmid	= (u16)vmid;
@@ -1569,7 +1569,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain)
 	smmu_domain->pgtbl_ops = pgtbl_ops;
 
 	ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		free_io_pgtable_ops(pgtbl_ops);
 
 	return ret;
@@ -1642,7 +1642,7 @@ static void arm_smmu_detach_dev(struct device *dev)
 	struct arm_smmu_group *smmu_group = arm_smmu_group_get(dev);
 
 	smmu_group->ste.bypass = true;
-	if (IS_ERR_VALUE(arm_smmu_install_ste_for_group(smmu_group)))
+	if (arm_smmu_install_ste_for_group(smmu_group) < 0)
 		dev_warn(dev, "failed to install bypass STE\n");
 
 	smmu_group->domain = NULL;
@@ -1694,7 +1694,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 	smmu_group->ste.bypass	= domain->type == IOMMU_DOMAIN_DMA;
 
 	ret = arm_smmu_install_ste_for_group(smmu_group);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		smmu_group->domain = NULL;
 
 out_unlock:
@@ -2235,7 +2235,7 @@ static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu)
 						arm_smmu_evtq_handler,
 						arm_smmu_evtq_thread,
 						0, "arm-smmu-v3-evtq", smmu);
-		if (IS_ERR_VALUE(ret))
+		if (ret < 0)
 			dev_warn(smmu->dev, "failed to enable evtq irq\n");
 	}
 
@@ -2244,7 +2244,7 @@ static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu)
 		ret = devm_request_irq(smmu->dev, irq,
 				       arm_smmu_cmdq_sync_handler, 0,
 				       "arm-smmu-v3-cmdq-sync", smmu);
-		if (IS_ERR_VALUE(ret))
+		if (ret < 0)
 			dev_warn(smmu->dev, "failed to enable cmdq-sync irq\n");
 	}
 
@@ -2252,7 +2252,7 @@ static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu)
 	if (irq) {
 		ret = devm_request_irq(smmu->dev, irq, arm_smmu_gerror_handler,
 				       0, "arm-smmu-v3-gerror", smmu);
-		if (IS_ERR_VALUE(ret))
+		if (ret < 0)
 			dev_warn(smmu->dev, "failed to enable gerror irq\n");
 	}
 
@@ -2264,7 +2264,7 @@ static int arm_smmu_setup_irqs(struct arm_smmu_device *smmu)
 							arm_smmu_priq_thread,
 							0, "arm-smmu-v3-priq",
 							smmu);
-			if (IS_ERR_VALUE(ret))
+			if (ret < 0)
 				dev_warn(smmu->dev,
 					 "failed to enable priq irq\n");
 			else
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index e206ce7a4e4b..9345a3fcb706 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -950,7 +950,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 
 	ret = __arm_smmu_alloc_bitmap(smmu->context_map, start,
 				      smmu->num_context_banks);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		goto out_unlock;
 
 	cfg->cbndx = ret;
@@ -989,7 +989,7 @@ static int arm_smmu_init_domain_context(struct iommu_domain *domain,
 	irq = smmu->irqs[smmu->num_global_irqs + cfg->irptndx];
 	ret = request_irq(irq, arm_smmu_context_fault, IRQF_SHARED,
 			  "arm-smmu-context-fault", domain);
-	if (IS_ERR_VALUE(ret)) {
+	if (ret < 0) {
 		dev_err(smmu->dev, "failed to request context IRQ %d (%u)\n",
 			cfg->irptndx, irq);
 		cfg->irptndx = INVALID_IRPTNDX;
@@ -1099,7 +1099,7 @@ static int arm_smmu_master_configure_smrs(struct arm_smmu_device *smmu,
 	for (i = 0; i < cfg->num_streamids; ++i) {
 		int idx = __arm_smmu_alloc_bitmap(smmu->smr_map, 0,
 						  smmu->num_mapping_groups);
-		if (IS_ERR_VALUE(idx)) {
+		if (idx < 0) {
 			dev_err(smmu->dev, "failed to allocate free SMR\n");
 			goto err_free_smrs;
 		}
@@ -1233,7 +1233,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev)
 
 	/* Ensure that the domain is finalised */
 	ret = arm_smmu_init_domain_context(domain, smmu);
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		return ret;
 
 	/*
diff --git a/drivers/irqchip/irq-clps711x.c b/drivers/irqchip/irq-clps711x.c
index eb5eb0cd414d..2223b3f15d68 100644
--- a/drivers/irqchip/irq-clps711x.c
+++ b/drivers/irqchip/irq-clps711x.c
@@ -182,7 +182,7 @@ static int __init _clps711x_intc_init(struct device_node *np,
 	writel_relaxed(0, clps711x_intc->intmr[2]);
 
 	err = irq_alloc_descs(-1, 0, ARRAY_SIZE(clps711x_irqs), numa_node_id());
-	if (IS_ERR_VALUE(err))
+	if (err < 0)
 		goto out_iounmap;
 
 	clps711x_intc->ops.map = clps711x_intc_irq_map;
diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c
index b4e647179346..fbc4ae2afd29 100644
--- a/drivers/irqchip/irq-gic.c
+++ b/drivers/irqchip/irq-gic.c
@@ -1123,7 +1123,7 @@ static int __init __gic_init_bases(struct gic_chip_data *gic, int irq_start,
 
 		irq_base = irq_alloc_descs(irq_start, 16, gic_irqs,
 					   numa_node_id());
-		if (IS_ERR_VALUE(irq_base)) {
+		if (irq_base < 0) {
 			WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n",
 			     irq_start);
 			irq_base = irq_start;
diff --git a/drivers/irqchip/irq-hip04.c b/drivers/irqchip/irq-hip04.c
index 9688d2e2a636..9e25d8ce08e5 100644
--- a/drivers/irqchip/irq-hip04.c
+++ b/drivers/irqchip/irq-hip04.c
@@ -402,7 +402,7 @@ hip04_of_init(struct device_node *node, struct device_node *parent)
 	nr_irqs -= hwirq_base; /* calculate # of irqs to allocate */
 
 	irq_base = irq_alloc_descs(-1, hwirq_base, nr_irqs, numa_node_id());
-	if (IS_ERR_VALUE(irq_base)) {
+	if (irq_base < 0) {
 		pr_err("failed to allocate IRQ numbers\n");
 		return -EINVAL;
 	}
diff --git a/drivers/irqchip/spear-shirq.c b/drivers/irqchip/spear-shirq.c
index 1ccd2abed65f..1518ba31a80c 100644
--- a/drivers/irqchip/spear-shirq.c
+++ b/drivers/irqchip/spear-shirq.c
@@ -232,7 +232,7 @@ static int __init shirq_init(struct spear_shirq **shirq_blocks, int block_nr,
 		nr_irqs += shirq_blocks[i]->nr_irqs;
 
 	virq_base = irq_alloc_descs(-1, 0, nr_irqs, 0);
-	if (IS_ERR_VALUE(virq_base)) {
+	if (virq_base < 0) {
 		pr_err("%s: irq desc alloc failed\n", __func__);
 		goto err_unmap;
 	}
diff --git a/drivers/media/i2c/adp1653.c b/drivers/media/i2c/adp1653.c
index 9e1731c565e7..e191e295c951 100644
--- a/drivers/media/i2c/adp1653.c
+++ b/drivers/media/i2c/adp1653.c
@@ -95,7 +95,7 @@ static int adp1653_get_fault(struct adp1653_flash *flash)
 	int rval;
 
 	fault = i2c_smbus_read_byte_data(client, ADP1653_REG_FAULT);
-	if (IS_ERR_VALUE(fault))
+	if (fault < 0)
 		return fault;
 
 	flash->fault |= fault;
@@ -105,13 +105,13 @@ static int adp1653_get_fault(struct adp1653_flash *flash)
 
 	/* Clear faults. */
 	rval = i2c_smbus_write_byte_data(client, ADP1653_REG_OUT_SEL, 0);
-	if (IS_ERR_VALUE(rval))
+	if (rval < 0)
 		return rval;
 
 	flash->led_mode->val = V4L2_FLASH_LED_MODE_NONE;
 
 	rval = adp1653_update_hw(flash);
-	if (IS_ERR_VALUE(rval))
+	if (rval)
 		return rval;
 
 	return flash->fault;
@@ -158,7 +158,7 @@ static int adp1653_get_ctrl(struct v4l2_ctrl *ctrl)
 	int rval;
 
 	rval = adp1653_get_fault(flash);
-	if (IS_ERR_VALUE(rval))
+	if (rval)
 		return rval;
 
 	ctrl->cur.val = 0;
@@ -184,7 +184,7 @@ static int adp1653_set_ctrl(struct v4l2_ctrl *ctrl)
 	int rval;
 
 	rval = adp1653_get_fault(flash);
-	if (IS_ERR_VALUE(rval))
+	if (rval)
 		return rval;
 	if ((rval & (ADP1653_REG_FAULT_FLT_SCP |
 		     ADP1653_REG_FAULT_FLT_OT |
diff --git a/drivers/media/platform/s5p-tv/mixer_drv.c b/drivers/media/platform/s5p-tv/mixer_drv.c
index 5ef67774971d..8a5d19469ddc 100644
--- a/drivers/media/platform/s5p-tv/mixer_drv.c
+++ b/drivers/media/platform/s5p-tv/mixer_drv.c
@@ -146,7 +146,7 @@ int mxr_power_get(struct mxr_device *mdev)
 
 	/* returning 1 means that power is already enabled,
 	 * so zero success be returned */
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		return ret;
 	return 0;
 }
diff --git a/drivers/mfd/twl4030-irq.c b/drivers/mfd/twl4030-irq.c
index 40e51b0baa46..b46c0cfc27d9 100644
--- a/drivers/mfd/twl4030-irq.c
+++ b/drivers/mfd/twl4030-irq.c
@@ -696,7 +696,7 @@ int twl4030_init_irq(struct device *dev, int irq_num)
 	nr_irqs = TWL4030_PWR_NR_IRQS + TWL4030_CORE_NR_IRQS;
 
 	irq_base = irq_alloc_descs(-1, 0, nr_irqs, 0);
-	if (IS_ERR_VALUE(irq_base)) {
+	if (irq_base < 0) {
 		dev_err(dev, "Fail to allocate IRQ descs\n");
 		return irq_base;
 	}
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index b81b08f81325..c984321d1881 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -1276,7 +1276,7 @@ static int mmc_select_hs200(struct mmc_card *card)
 	 * switch to HS200 mode if bus width is set successfully.
 	 */
 	err = mmc_select_bus_width(card);
-	if (!IS_ERR_VALUE(err)) {
+	if (!err) {
 		val = EXT_CSD_TIMING_HS200 |
 		      card->drive_strength << EXT_CSD_DRV_STR_SHIFT;
 		err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
@@ -1583,7 +1583,7 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr,
 	} else if (mmc_card_hs(card)) {
 		/* Select the desired bus width optionally */
 		err = mmc_select_bus_width(card);
-		if (!IS_ERR_VALUE(err)) {
+		if (!err) {
 			err = mmc_select_hs_ddr(card);
 			if (err)
 				goto free_card;
diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c
index 829a6eebcdce..2cc6123b1df9 100644
--- a/drivers/mmc/host/dw_mmc.c
+++ b/drivers/mmc/host/dw_mmc.c
@@ -1431,7 +1431,7 @@ static int dw_mci_get_ro(struct mmc_host *mmc)
 	int gpio_ro = mmc_gpio_get_ro(mmc);
 
 	/* Use platform get_ro function, else try on board write protect */
-	if (!IS_ERR_VALUE(gpio_ro))
+	if (gpio_ro >= 0)
 		read_only = gpio_ro;
 	else
 		read_only =
@@ -1454,7 +1454,7 @@ static int dw_mci_get_cd(struct mmc_host *mmc)
 	if ((mmc->caps & MMC_CAP_NEEDS_POLL) ||
 	    (mmc->caps & MMC_CAP_NONREMOVABLE))
 		present = 1;
-	else if (!IS_ERR_VALUE(gpio_cd))
+	else if (gpio_cd >= 0)
 		present = gpio_cd;
 	else
 		present = (mci_readl(slot->host, CDETECT) & (1 << slot->id))
@@ -2927,7 +2927,7 @@ static void dw_mci_enable_cd(struct dw_mci *host)
 		if (slot->mmc->caps & MMC_CAP_NEEDS_POLL)
 			return;
 
-		if (IS_ERR_VALUE(mmc_gpio_get_cd(slot->mmc)))
+		if (mmc_gpio_get_cd(slot->mmc) < 0)
 			break;
 	}
 	if (i == host->num_slots)
diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c
index 2d300d87cda8..9d3ae1f4bd3c 100644
--- a/drivers/mmc/host/sdhci-esdhc-imx.c
+++ b/drivers/mmc/host/sdhci-esdhc-imx.c
@@ -1011,7 +1011,7 @@ sdhci_esdhc_imx_probe_dt(struct platform_device *pdev,
 	if (ret)
 		return ret;
 
-	if (!IS_ERR_VALUE(mmc_gpio_get_cd(host->mmc)))
+	if (mmc_gpio_get_cd(host->mmc) >= 0)
 		host->quirks &= ~SDHCI_QUIRK_BROKEN_CARD_DETECTION;
 
 	return 0;
diff --git a/drivers/mmc/host/sdhci-of-at91.c b/drivers/mmc/host/sdhci-of-at91.c
index 25f779e09d8e..d4cef713d246 100644
--- a/drivers/mmc/host/sdhci-of-at91.c
+++ b/drivers/mmc/host/sdhci-of-at91.c
@@ -289,7 +289,7 @@ static int sdhci_at91_probe(struct platform_device *pdev)
 	 * to enable polling via device tree with broken-cd property.
 	 */
 	if (!(host->mmc->caps & MMC_CAP_NONREMOVABLE) &&
-	    IS_ERR_VALUE(mmc_gpio_get_cd(host->mmc))) {
+	    mmc_gpio_get_cd(host->mmc) < 0) {
 		host->mmc->caps |= MMC_CAP_NEEDS_POLL;
 		host->quirks &= ~SDHCI_QUIRK_BROKEN_CARD_DETECTION;
 	}
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index e010ea4eb6f5..0e3d7c056cb1 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -1624,7 +1624,7 @@ static int sdhci_get_cd(struct mmc_host *mmc)
 	 * Try slot gpio detect, if defined it take precedence
 	 * over build in controller functionality
 	 */
-	if (!IS_ERR_VALUE(gpio_cd))
+	if (gpio_cd >= 0)
 		return !!gpio_cd;
 
 	/* If polling, assume that the card is always present. */
@@ -3077,7 +3077,7 @@ int sdhci_add_host(struct sdhci_host *host)
 
 	if ((host->quirks & SDHCI_QUIRK_BROKEN_CARD_DETECTION) &&
 	    !(mmc->caps & MMC_CAP_NONREMOVABLE) &&
-	    IS_ERR_VALUE(mmc_gpio_get_cd(host->mmc)))
+	    mmc_gpio_get_cd(host->mmc) < 0)
 		mmc->caps |= MMC_CAP_NEEDS_POLL;
 
 	/* If there are external regulators, get them */
diff --git a/drivers/net/ethernet/freescale/fman/fman.c b/drivers/net/ethernet/freescale/fman/fman.c
index bcb9dccada4d..1de2e1e51c2b 100644
--- a/drivers/net/ethernet/freescale/fman/fman.c
+++ b/drivers/net/ethernet/freescale/fman/fman.c
@@ -615,7 +615,7 @@ struct fman {
 	struct fman_cfg *cfg;
 	struct muram_info *muram;
 	/* cam section in muram */
-	int cam_offset;
+	unsigned long cam_offset;
 	size_t cam_size;
 	/* Fifo in MURAM */
 	int fifo_offset;
diff --git a/drivers/net/ethernet/freescale/fman/fman_muram.c b/drivers/net/ethernet/freescale/fman/fman_muram.c
index 4eb0e9ac7182..47394c45b6e8 100644
--- a/drivers/net/ethernet/freescale/fman/fman_muram.c
+++ b/drivers/net/ethernet/freescale/fman/fman_muram.c
@@ -129,7 +129,7 @@ unsigned long fman_muram_offset_to_vbase(struct muram_info *muram,
  *
  * Return: address of the allocated memory; NULL otherwise.
  */
-int fman_muram_alloc(struct muram_info *muram, size_t size)
+unsigned long fman_muram_alloc(struct muram_info *muram, size_t size)
 {
 	unsigned long vaddr;
 
@@ -150,7 +150,7 @@ int fman_muram_alloc(struct muram_info *muram, size_t size)
  *
  * Free an allocated memory from FM-MURAM partition.
  */
-void fman_muram_free_mem(struct muram_info *muram, u32 offset, size_t size)
+void fman_muram_free_mem(struct muram_info *muram, unsigned long offset, size_t size)
 {
 	unsigned long addr = fman_muram_offset_to_vbase(muram, offset);
 
diff --git a/drivers/net/ethernet/freescale/fman/fman_muram.h b/drivers/net/ethernet/freescale/fman/fman_muram.h
index dbf0af9e5bb5..889649ad8931 100644
--- a/drivers/net/ethernet/freescale/fman/fman_muram.h
+++ b/drivers/net/ethernet/freescale/fman/fman_muram.h
@@ -44,8 +44,8 @@ struct muram_info *fman_muram_init(phys_addr_t base, size_t size);
 unsigned long fman_muram_offset_to_vbase(struct muram_info *muram,
 					 unsigned long offset);
 
-int fman_muram_alloc(struct muram_info *muram, size_t size);
+unsigned long fman_muram_alloc(struct muram_info *muram, size_t size);
 
-void fman_muram_free_mem(struct muram_info *muram, u32 offset, size_t size);
+void fman_muram_free_mem(struct muram_info *muram, unsigned long offset, size_t size);
 
 #endif /* __FM_MURAM_EXT */
diff --git a/drivers/net/wireless/ti/wlcore/spi.c b/drivers/net/wireless/ti/wlcore/spi.c
index 020ac1a4b408..cea9443c22a6 100644
--- a/drivers/net/wireless/ti/wlcore/spi.c
+++ b/drivers/net/wireless/ti/wlcore/spi.c
@@ -382,7 +382,7 @@ static int wlcore_probe_of(struct spi_device *spi, struct wl12xx_spi_glue *glue,
 
 	ret = of_property_read_u32(dt_node, "ref-clock-frequency",
 				   &pdev_data->ref_clock_freq);
-	if (IS_ERR_VALUE(ret)) {
+	if (ret) {
 		dev_err(glue->dev,
 			"can't get reference clock frequency (%d)\n", ret);
 		return ret;
@@ -425,7 +425,7 @@ static int wl1271_probe(struct spi_device *spi)
 	}
 
 	ret = wlcore_probe_of(spi, glue, &pdev_data);
-	if (IS_ERR_VALUE(ret)) {
+	if (ret) {
 		dev_err(glue->dev,
 			"can't get device tree parameters (%d)\n", ret);
 		return ret;
diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c
index bb4ea123547f..965911d9b36a 100644
--- a/drivers/nvmem/core.c
+++ b/drivers/nvmem/core.c
@@ -113,7 +113,7 @@ static ssize_t bin_attr_nvmem_read(struct file *filp, struct kobject *kobj,
 
 	rc = nvmem_reg_read(nvmem, pos, buf, count);
 
-	if (IS_ERR_VALUE(rc))
+	if (rc)
 		return rc;
 
 	return count;
@@ -147,7 +147,7 @@ static ssize_t bin_attr_nvmem_write(struct file *filp, struct kobject *kobj,
 
 	rc = nvmem_reg_write(nvmem, pos, buf, count);
 
-	if (IS_ERR_VALUE(rc))
+	if (rc)
 		return rc;
 
 	return count;
@@ -366,7 +366,7 @@ static int nvmem_add_cells(struct nvmem_device *nvmem,
 		}
 
 		rval = nvmem_cell_info_to_nvmem_cell(nvmem, &info[i], cells[i]);
-		if (IS_ERR_VALUE(rval)) {
+		if (rval) {
 			kfree(cells[i]);
 			goto err;
 		}
@@ -963,7 +963,7 @@ static int __nvmem_cell_read(struct nvmem_device *nvmem,
 
 	rc = nvmem_reg_read(nvmem, cell->offset, buf, cell->bytes);
 
-	if (IS_ERR_VALUE(rc))
+	if (rc)
 		return rc;
 
 	/* shift bits in-place */
@@ -998,7 +998,7 @@ void *nvmem_cell_read(struct nvmem_cell *cell, size_t *len)
 		return ERR_PTR(-ENOMEM);
 
 	rc = __nvmem_cell_read(nvmem, cell, buf, len);
-	if (IS_ERR_VALUE(rc)) {
+	if (rc) {
 		kfree(buf);
 		return ERR_PTR(rc);
 	}
@@ -1083,7 +1083,7 @@ int nvmem_cell_write(struct nvmem_cell *cell, void *buf, size_t len)
 	if (cell->bit_offset || cell->nbits)
 		kfree(buf);
 
-	if (IS_ERR_VALUE(rc))
+	if (rc)
 		return rc;
 
 	return len;
@@ -1111,11 +1111,11 @@ ssize_t nvmem_device_cell_read(struct nvmem_device *nvmem,
 		return -EINVAL;
 
 	rc = nvmem_cell_info_to_nvmem_cell(nvmem, info, &cell);
-	if (IS_ERR_VALUE(rc))
+	if (rc)
 		return rc;
 
 	rc = __nvmem_cell_read(nvmem, &cell, buf, &len);
-	if (IS_ERR_VALUE(rc))
+	if (rc)
 		return rc;
 
 	return len;
@@ -1141,7 +1141,7 @@ int nvmem_device_cell_write(struct nvmem_device *nvmem,
 		return -EINVAL;
 
 	rc = nvmem_cell_info_to_nvmem_cell(nvmem, info, &cell);
-	if (IS_ERR_VALUE(rc))
+	if (rc)
 		return rc;
 
 	return nvmem_cell_write(&cell, buf, cell.bytes);
@@ -1170,7 +1170,7 @@ int nvmem_device_read(struct nvmem_device *nvmem,
 
 	rc = nvmem_reg_read(nvmem, offset, buf, bytes);
 
-	if (IS_ERR_VALUE(rc))
+	if (rc)
 		return rc;
 
 	return bytes;
@@ -1198,7 +1198,7 @@ int nvmem_device_write(struct nvmem_device *nvmem,
 
 	rc = nvmem_reg_write(nvmem, offset, buf, bytes);
 
-	if (IS_ERR_VALUE(rc))
+	if (rc)
 		return rc;
 
 
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index a2aa655f56c4..1b7331e40d79 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -2360,7 +2360,7 @@ static int pl011_probe_dt_alias(int index, struct device *dev)
 		return ret;
 
 	ret = of_alias_get_id(np, "serial");
-	if (IS_ERR_VALUE(ret)) {
+	if (ret < 0) {
 		seen_dev_without_alias = true;
 		ret = index;
 	} else {
diff --git a/drivers/tty/serial/sprd_serial.c b/drivers/tty/serial/sprd_serial.c
index 18971063f95f..699447aa8b43 100644
--- a/drivers/tty/serial/sprd_serial.c
+++ b/drivers/tty/serial/sprd_serial.c
@@ -654,7 +654,7 @@ static int sprd_probe_dt_alias(int index, struct device *dev)
 		return ret;
 
 	ret = of_alias_get_id(np, "serial");
-	if (IS_ERR_VALUE(ret))
+	if (ret < 0)
 		ret = index;
 	else if (ret >= ARRAY_SIZE(sprd_port) || sprd_port[ret] != NULL) {
 		dev_warn(dev, "requested serial port %d not available.\n", ret);
diff --git a/drivers/video/fbdev/da8xx-fb.c b/drivers/video/fbdev/da8xx-fb.c
index d8d583d32a37..c229b1a0d13b 100644
--- a/drivers/video/fbdev/da8xx-fb.c
+++ b/drivers/video/fbdev/da8xx-fb.c
@@ -713,7 +713,7 @@ static int da8xx_fb_config_clk_divider(struct da8xx_fb_par *par,
 
 	if (par->lcdc_clk_rate != lcdc_clk_rate) {
 		ret = clk_set_rate(par->lcdc_clk, lcdc_clk_rate);
-		if (IS_ERR_VALUE(ret)) {
+		if (ret) {
 			dev_err(par->dev,
 				"unable to set clock rate at %u\n",
 				lcdc_clk_rate);
@@ -784,7 +784,7 @@ static int lcd_init(struct da8xx_fb_par *par, const struct lcd_ctrl_config *cfg,
 	int ret = 0;
 
 	ret = da8xx_fb_calc_config_clk_divider(par, panel);
-	if (IS_ERR_VALUE(ret)) {
+	if (ret) {
 		dev_err(par->dev, "unable to configure clock\n");
 		return ret;
 	}
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 65de439bdc4f..14d506efd1aa 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -643,10 +643,6 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
 		return 0;
 
 	result = generic_file_write_iter(iocb, from);
-	if (IS_ERR_VALUE(result)) {
-		_leave(" = %zd", result);
-		return result;
-	}
 
 	_leave(" = %zd", result);
 	return result;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index f723cd3a455c..caf9e39bb82b 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -337,7 +337,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
 					"(%d != %d)", (unsigned) r, curid, id);
 			goto failed;
 		} else if ( ! p->lib_list[id].loaded &&
-				IS_ERR_VALUE(load_flat_shared_library(id, p))) {
+				load_flat_shared_library(id, p) < 0) {
 			printk("BINFMT_FLAT: failed to load library %d", id);
 			goto failed;
 		}
@@ -837,7 +837,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
 
 	res = prepare_binprm(&bprm);
 
-	if (!IS_ERR_VALUE(res))
+	if (!res)
 		res = load_flat_file(&bprm, libs, id, NULL);
 
 	abort_creds(bprm.cred);
@@ -883,7 +883,7 @@ static int load_flat_binary(struct linux_binprm * bprm)
 	stack_len += FLAT_STACK_ALIGN - 1;  /* reserve for upcoming alignment */
 	
 	res = load_flat_file(bprm, &libinfo, 0, &stack_len);
-	if (IS_ERR_VALUE(res))
+	if (res < 0)
 		return res;
 	
 	/* Update data segment pointers for all libraries */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 4a01f30e9995..271d93905bac 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -783,12 +783,15 @@ static int get_leaf_nr(struct gfs2_inode *dip, u32 index,
 		       u64 *leaf_out)
 {
 	__be64 *hash;
+	int error;
 
 	hash = gfs2_dir_get_hash_table(dip);
-	if (IS_ERR(hash))
-		return PTR_ERR(hash);
-	*leaf_out = be64_to_cpu(*(hash + index));
-	return 0;
+	error = PTR_ERR_OR_ZERO(hash);
+
+	if (!error)
+		*leaf_out = be64_to_cpu(*(hash + index));
+
+	return error;
 }
 
 static int get_first_leaf(struct gfs2_inode *dip, u32 index,
@@ -798,7 +801,7 @@ static int get_first_leaf(struct gfs2_inode *dip, u32 index,
 	int error;
 
 	error = get_leaf_nr(dip, index, &leaf_no);
-	if (!IS_ERR_VALUE(error))
+	if (!error)
 		error = get_leaf(dip, leaf_no, bh_out);
 
 	return error;
@@ -1014,7 +1017,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 
 	index = name->hash >> (32 - dip->i_depth);
 	error = get_leaf_nr(dip, index, &leaf_no);
-	if (IS_ERR_VALUE(error))
+	if (error)
 		return error;
 
 	/*  Get the old leaf block  */
diff --git a/kernel/pid.c b/kernel/pid.c
index 4d73a834c7e6..f66162f2359b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -311,7 +311,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	pid->level = ns->level;
 	for (i = ns->level; i >= 0; i--) {
 		nr = alloc_pidmap(tmp);
-		if (IS_ERR_VALUE(nr)) {
+		if (nr < 0) {
 			retval = nr;
 			goto out_free;
 		}
diff --git a/net/9p/client.c b/net/9p/client.c
index ea79ee9a7348..3fc94a49ccd5 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -518,10 +518,10 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
 		if (err)
 			goto out_err;
 
-		if (p9_is_proto_dotu(c))
+		if (p9_is_proto_dotu(c) && ecode < 512)
 			err = -ecode;
 
-		if (!err || !IS_ERR_VALUE(err)) {
+		if (!err) {
 			err = p9_errstr2errno(ename, strlen(ename));
 
 			p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
@@ -605,10 +605,10 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req,
 		if (err)
 			goto out_err;
 
-		if (p9_is_proto_dotu(c))
+		if (p9_is_proto_dotu(c) && ecode < 512)
 			err = -ecode;
 
-		if (!err || !IS_ERR_VALUE(err)) {
+		if (!err) {
 			err = p9_errstr2errno(ename, strlen(ename));
 
 			p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
diff --git a/sound/soc/qcom/lpass-platform.c b/sound/soc/qcom/lpass-platform.c
index 6e8665430bd5..ddfe34434765 100644
--- a/sound/soc/qcom/lpass-platform.c
+++ b/sound/soc/qcom/lpass-platform.c
@@ -491,7 +491,7 @@ static int lpass_platform_pcm_new(struct snd_soc_pcm_runtime *soc_runtime)
 			data->rdma_ch = v->alloc_dma_channel(drvdata,
 						SNDRV_PCM_STREAM_PLAYBACK);
 
-		if (IS_ERR_VALUE(data->rdma_ch))
+		if (data->rdma_ch < 0)
 			return data->rdma_ch;
 
 		drvdata->substream[data->rdma_ch] = psubstream;
@@ -518,7 +518,7 @@ static int lpass_platform_pcm_new(struct snd_soc_pcm_runtime *soc_runtime)
 			data->wrdma_ch = v->alloc_dma_channel(drvdata,
 						SNDRV_PCM_STREAM_CAPTURE);
 
-		if (IS_ERR_VALUE(data->wrdma_ch))
+		if (data->wrdma_ch < 0)
 			goto capture_alloc_err;
 
 		drvdata->substream[data->wrdma_ch] = csubstream;
-- 
cgit v1.2.3


From c08376ac97cb202ec65320f3d90d5c4c5e2adb0b Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Thu, 26 May 2016 17:21:05 -0700
Subject: timer: Export destroy_hrtimer_on_stack()

hrtimer_init_on_stack() needs a matching call to
destroy_hrtimer_on_stack(), so both need to be exported.

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/time/hrtimer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 8c7392c4fdbd..e99df0ff1d42 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -425,6 +425,7 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer)
 {
 	debug_object_free(timer, &hrtimer_debug_descr);
 }
+EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
 
 #else
 static inline void debug_hrtimer_init(struct hrtimer *timer) { }
-- 
cgit v1.2.3


From 9bd616e3dbedfc103f158197c8ad93678849b1ed Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Wed, 1 Jun 2016 18:52:16 +0100
Subject: cpuidle: Do not access cpuidle_devices when !CONFIG_CPU_IDLE

The cpuidle_devices per-CPU variable is only defined when CPU_IDLE is
enabled. Commit c8cc7d4de7a4 ("sched/idle: Reorganize the idle loop")
removed the #ifdef CONFIG_CPU_IDLE around cpuidle_idle_call() with the
compiler optimising away __this_cpu_read(cpuidle_devices). However, with
CONFIG_UBSAN && !CONFIG_CPU_IDLE, this optimisation no longer happens
and the kernel fails to link since cpuidle_devices is not defined.

This patch introduces an accessor function for the current CPU cpuidle
device (returning NULL when !CONFIG_CPU_IDLE) and uses it in
cpuidle_idle_call().

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: 4.5+ <stable@vger.kernel.org> # 4.5+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpuidle.h | 3 +++
 kernel/sched/idle.c     | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 786ad32631a6..07b83d32f66c 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -152,6 +152,8 @@ extern void cpuidle_disable_device(struct cpuidle_device *dev);
 extern int cpuidle_play_dead(void);
 
 extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
+static inline struct cpuidle_device *cpuidle_get_device(void)
+{return __this_cpu_read(cpuidle_devices); }
 #else
 static inline void disable_cpuidle(void) { }
 static inline bool cpuidle_not_available(struct cpuidle_driver *drv,
@@ -187,6 +189,7 @@ static inline void cpuidle_disable_device(struct cpuidle_device *dev) { }
 static inline int cpuidle_play_dead(void) {return -ENODEV; }
 static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
 	struct cpuidle_device *dev) {return NULL; }
+static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; }
 #endif
 
 #if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index bd12c6c714ec..c5aeedf4e93a 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -127,7 +127,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
  */
 static void cpuidle_idle_call(void)
 {
-	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+	struct cpuidle_device *dev = cpuidle_get_device();
 	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
 	int next_state, entered_state;
 
-- 
cgit v1.2.3


From 0422e83d84ae24b933e4b0d4c1e0f0b4ae8a0a3b Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 26 May 2016 21:08:17 +0100
Subject: locking/ww_mutex: Report recursive ww_mutex locking early

Recursive locking for ww_mutexes was originally conceived as an
exception. However, it is heavily used by the DRM atomic modesetting
code. Currently, the recursive deadlock is checked after we have queued
up for a busy-spin and as we never release the lock, we spin until
kicked, whereupon the deadlock is discovered and reported.

A simple solution for the now common problem is to move the recursive
deadlock discovery to the first action when taking the ww_mutex.

Suggested-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1464293297-19777-1-git-send-email-chris@chris-wilson.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/mutex.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index e364b424b019..79d2d765a75f 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
 	if (!hold_ctx)
 		return 0;
 
-	if (unlikely(ctx == hold_ctx))
-		return -EALREADY;
-
 	if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
 	    (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
 #ifdef CONFIG_DEBUG_MUTEXES
@@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	unsigned long flags;
 	int ret;
 
+	if (use_ww_ctx) {
+		struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+		if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
+			return -EALREADY;
+	}
+
 	preempt_disable();
 	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
 
-- 
cgit v1.2.3


From 5b6c1b4d46b0dae4edea636a776d09f2064f4cd7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sat, 4 Jun 2016 20:50:59 +0200
Subject: bpf, trace: use READ_ONCE for retrieving file ptr

In bpf_perf_event_read() and bpf_perf_event_output(), we must use
READ_ONCE() for fetching the struct file pointer, which could get
updated concurrently, so we must prevent the compiler from potential
refetching.

We already do this with tail calls for fetching the related bpf_prog,
but not so on stored perf events. Semantics for both are the same
with regards to updates.

Fixes: a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
Fixes: 35578d798400 ("bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 780bcbe1d4de..720b7bb01d43 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -198,7 +198,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
-	file = (struct file *)array->ptrs[index];
+	file = READ_ONCE(array->ptrs[index]);
 	if (unlikely(!file))
 		return -ENOENT;
 
@@ -247,7 +247,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)
 	if (unlikely(index >= array->map.max_entries))
 		return -E2BIG;
 
-	file = (struct file *)array->ptrs[index];
+	file = READ_ONCE(array->ptrs[index]);
 	if (unlikely(!file))
 		return -ENOENT;
 
-- 
cgit v1.2.3


From 2c610022711675ee908b903d242f0b90e1db661f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 8 Jun 2016 10:19:51 +0200
Subject: locking/qspinlock: Fix spin_unlock_wait() some more

While this prior commit:

  54cf809b9512 ("locking,qspinlock: Fix spin_is_locked() and spin_unlock_wait()")

... fixes spin_is_locked() and spin_unlock_wait() for the usage
in ipc/sem and netfilter, it does not in fact work right for the
usage in task_work and futex.

So while the 2 locks crossed problem:

	spin_lock(A)		spin_lock(B)
	if (!spin_is_locked(B)) spin_unlock_wait(A)
	  foo()			foo();

... works with the smp_mb() injected by both spin_is_locked() and
spin_unlock_wait(), this is not sufficient for:

	flag = 1;
	smp_mb();		spin_lock()
	spin_unlock_wait()	if (!flag)
				  // add to lockless list
	// iterate lockless list

... because in this scenario, the store from spin_lock() can be delayed
past the load of flag, uncrossing the variables and loosing the
guarantee.

This patch reworks spin_is_locked() and spin_unlock_wait() to work in
both cases by exploiting the observation that while the lock byte
store can be delayed, the contender must have registered itself
visibly in other state contained in the word.

It also allows for architectures to override both functions, as PPC
and ARM64 have an additional issue for which we currently have no
generic solution.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Giovanni Gherdovich <ggherdovich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <waiman.long@hpe.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: stable@vger.kernel.org # v4.2 and later
Fixes: 54cf809b9512 ("locking,qspinlock: Fix spin_is_locked() and spin_unlock_wait()")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/qspinlock.h | 53 ++++++++++++------------------------
 kernel/locking/qspinlock.c      | 60 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index 6bd05700d8c9..05f05f17a7c2 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -21,38 +21,34 @@
 
 #include <asm-generic/qspinlock_types.h>
 
+/**
+ * queued_spin_unlock_wait - wait until the _current_ lock holder releases the lock
+ * @lock : Pointer to queued spinlock structure
+ *
+ * There is a very slight possibility of live-lock if the lockers keep coming
+ * and the waiter is just unfortunate enough to not see any unlock state.
+ */
+#ifndef queued_spin_unlock_wait
+extern void queued_spin_unlock_wait(struct qspinlock *lock);
+#endif
+
 /**
  * queued_spin_is_locked - is the spinlock locked?
  * @lock: Pointer to queued spinlock structure
  * Return: 1 if it is locked, 0 otherwise
  */
+#ifndef queued_spin_is_locked
 static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
 {
 	/*
-	 * queued_spin_lock_slowpath() can ACQUIRE the lock before
-	 * issuing the unordered store that sets _Q_LOCKED_VAL.
-	 *
-	 * See both smp_cond_acquire() sites for more detail.
-	 *
-	 * This however means that in code like:
-	 *
-	 *   spin_lock(A)		spin_lock(B)
-	 *   spin_unlock_wait(B)	spin_is_locked(A)
-	 *   do_something()		do_something()
-	 *
-	 * Both CPUs can end up running do_something() because the store
-	 * setting _Q_LOCKED_VAL will pass through the loads in
-	 * spin_unlock_wait() and/or spin_is_locked().
+	 * See queued_spin_unlock_wait().
 	 *
-	 * Avoid this by issuing a full memory barrier between the spin_lock()
-	 * and the loads in spin_unlock_wait() and spin_is_locked().
-	 *
-	 * Note that regular mutual exclusion doesn't care about this
-	 * delayed store.
+	 * Any !0 state indicates it is locked, even if _Q_LOCKED_VAL
+	 * isn't immediately observable.
 	 */
-	smp_mb();
-	return atomic_read(&lock->val) & _Q_LOCKED_MASK;
+	return atomic_read(&lock->val);
 }
+#endif
 
 /**
  * queued_spin_value_unlocked - is the spinlock structure unlocked?
@@ -122,21 +118,6 @@ static __always_inline void queued_spin_unlock(struct qspinlock *lock)
 }
 #endif
 
-/**
- * queued_spin_unlock_wait - wait until current lock holder releases the lock
- * @lock : Pointer to queued spinlock structure
- *
- * There is a very slight possibility of live-lock if the lockers keep coming
- * and the waiter is just unfortunate enough to not see any unlock state.
- */
-static inline void queued_spin_unlock_wait(struct qspinlock *lock)
-{
-	/* See queued_spin_is_locked() */
-	smp_mb();
-	while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
-		cpu_relax();
-}
-
 #ifndef virt_spin_lock
 static __always_inline bool virt_spin_lock(struct qspinlock *lock)
 {
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ce2f75e32ae1..5fc8c311b8fe 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -267,6 +267,66 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
 #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
 #endif
 
+/*
+ * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
+ * issuing an _unordered_ store to set _Q_LOCKED_VAL.
+ *
+ * This means that the store can be delayed, but no later than the
+ * store-release from the unlock. This means that simply observing
+ * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
+ *
+ * There are two paths that can issue the unordered store:
+ *
+ *  (1) clear_pending_set_locked():	*,1,0 -> *,0,1
+ *
+ *  (2) set_locked():			t,0,0 -> t,0,1 ; t != 0
+ *      atomic_cmpxchg_relaxed():	t,0,0 -> 0,0,1
+ *
+ * However, in both cases we have other !0 state we've set before to queue
+ * ourseves:
+ *
+ * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
+ * load is constrained by that ACQUIRE to not pass before that, and thus must
+ * observe the store.
+ *
+ * For (2) we have a more intersting scenario. We enqueue ourselves using
+ * xchg_tail(), which ends up being a RELEASE. This in itself is not
+ * sufficient, however that is followed by an smp_cond_acquire() on the same
+ * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
+ * guarantees we must observe that store.
+ *
+ * Therefore both cases have other !0 state that is observable before the
+ * unordered locked byte store comes through. This means we can use that to
+ * wait for the lock store, and then wait for an unlock.
+ */
+#ifndef queued_spin_unlock_wait
+void queued_spin_unlock_wait(struct qspinlock *lock)
+{
+	u32 val;
+
+	for (;;) {
+		val = atomic_read(&lock->val);
+
+		if (!val) /* not locked, we're done */
+			goto done;
+
+		if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
+			break;
+
+		/* not locked, but pending, wait until we observe the lock */
+		cpu_relax();
+	}
+
+	/* any unlock is good */
+	while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
+		cpu_relax();
+
+done:
+	smp_rmb(); /* CTRL + RMB -> ACQUIRE */
+}
+EXPORT_SYMBOL(queued_spin_unlock_wait);
+#endif
+
 #endif /* _GEN_PV_LOCK_SLOWPATH */
 
 /**
-- 
cgit v1.2.3


From 62a92c8f553e49270a0ee391b8733da71ab0aebc Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Tue, 7 Jun 2016 15:44:15 +0300
Subject: perf/core: Remove a redundant check

There is no way to end up in _free_event() with event::pmu being NULL.
The latter is initialized in event allocation path and remains set
forever. In case of allocation failure, the error path doesn't use
_free_event().

Having the check, however, suggests that it is possible to have a
event::pmu==NULL situation in _free_event() and confuses the robots.

This patch gets rid of the check.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: eranian@google.com
Cc: vince@deater.net
Link: http://lkml.kernel.org/r/1465303455-26032-1-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 050a290c72c7..87e945d6ebb8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3862,10 +3862,8 @@ static void _free_event(struct perf_event *event)
 	if (event->ctx)
 		put_ctx(event->ctx);
 
-	if (event->pmu) {
-		exclusive_event_destroy(event);
-		module_put(event->pmu->module);
-	}
+	exclusive_event_destroy(event);
+	module_put(event->pmu->module);
 
 	call_rcu(&event->rcu_head, free_event_rcu);
 }
-- 
cgit v1.2.3


From 9c57259117b9c25472a3fa6d5a14d6bb3b647e87 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Fri, 3 Jun 2016 17:58:40 -0500
Subject: sched/debug: Fix /proc/sched_debug regression

Commit:

  cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default")

... introduced a bug when CONFIG_SCHEDSTATS is enabled and the
runtime tunable is disabled (which is the default).

The wait-time, sum-exec, and sum-sleep fields are missing from the
/proc/sched_debug file in the runnable_tasks section.

Fix it with a new schedstat_val() macro which returns the field value
when schedstats is enabled and zero otherwise.  The macro works with
both SCHEDSTATS and !SCHEDSTATS.  I put the macro in stats.h since it
might end up being useful in other places.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default")
Link: http://lkml.kernel.org/r/bcda7c2790cf2ccbe586a28c02dd7b6fe7749a2b.1464994423.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/debug.c | 15 ++++-----------
 kernel/sched/stats.h |  3 +++
 2 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index cf905f655ba1..0368c393a336 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -427,19 +427,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 		SPLIT_NS(p->se.vruntime),
 		(long long)(p->nvcsw + p->nivcsw),
 		p->prio);
-#ifdef CONFIG_SCHEDSTATS
-	if (schedstat_enabled()) {
-		SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-			SPLIT_NS(p->se.statistics.wait_sum),
-			SPLIT_NS(p->se.sum_exec_runtime),
-			SPLIT_NS(p->se.statistics.sum_sleep_runtime));
-	}
-#else
+
 	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
-		0LL, 0L,
+		SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)),
 		SPLIT_NS(p->se.sum_exec_runtime),
-		0LL, 0L);
-#endif
+		SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime)));
+
 #ifdef CONFIG_NUMA_BALANCING
 	SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
 #endif
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 70b3b6a20fb0..78955cbea31c 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -33,6 +33,8 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 # define schedstat_inc(rq, field)	do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
 # define schedstat_add(rq, field, amt)	do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
 # define schedstat_set(var, val)	do { if (schedstat_enabled()) { var = (val); } } while (0)
+# define schedstat_val(rq, field)	((schedstat_enabled()) ? (rq)->field : 0)
+
 #else /* !CONFIG_SCHEDSTATS */
 static inline void
 rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -47,6 +49,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 # define schedstat_inc(rq, field)	do { } while (0)
 # define schedstat_add(rq, field, amt)	do { } while (0)
 # define schedstat_set(var, val)	do { } while (0)
+# define schedstat_val(rq, field)	0
 #endif
 
 #ifdef CONFIG_SCHED_INFO
-- 
cgit v1.2.3


From 4698f88c06b893f2acc0b443004a53bf490fde7c Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Tue, 7 Jun 2016 14:43:16 -0500
Subject: sched/debug: Fix 'schedstats=enable' cmdline option

The 'schedstats=enable' option doesn't work, and also produces the
following warning during boot:

  WARNING: CPU: 0 PID: 0 at /home/jpoimboe/git/linux/kernel/jump_label.c:61 static_key_slow_inc+0x8c/0xa0
  static_key_slow_inc used before call to jump_label_init
  Modules linked in:
  CPU: 0 PID: 0 Comm: swapper Not tainted 4.7.0-rc1+ #25
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014
   0000000000000086 3ae3475a4bea95d4 ffffffff81e03da8 ffffffff8143fc83
   ffffffff81e03df8 0000000000000000 ffffffff81e03de8 ffffffff810b1ffb
   0000003d00000096 ffffffff823514d0 ffff88007ff197c8 0000000000000000
  Call Trace:
   [<ffffffff8143fc83>] dump_stack+0x85/0xc2
   [<ffffffff810b1ffb>] __warn+0xcb/0xf0
   [<ffffffff810b207f>] warn_slowpath_fmt+0x5f/0x80
   [<ffffffff811e9c0c>] static_key_slow_inc+0x8c/0xa0
   [<ffffffff810e07c6>] static_key_enable+0x16/0x40
   [<ffffffff8216d633>] setup_schedstats+0x29/0x94
   [<ffffffff82148a05>] unknown_bootoption+0x89/0x191
   [<ffffffff810d8617>] parse_args+0x297/0x4b0
   [<ffffffff82148d61>] start_kernel+0x1d8/0x4a9
   [<ffffffff8214897c>] ? set_init_arg+0x55/0x55
   [<ffffffff82148120>] ? early_idt_handler_array+0x120/0x120
   [<ffffffff821482db>] x86_64_start_reservations+0x2f/0x31
   [<ffffffff82148427>] x86_64_start_kernel+0x14a/0x16d

The problem is that it tries to update the 'sched_schedstats' static key
before jump labels have been initialized.

Changing jump_label_init() to be called earlier before
parse_early_param() wouldn't fix it: it would still fail trying to
poke_text() because mm isn't yet initialized.

Instead, just create a temporary '__sched_schedstats' variable which can
be copied to the static key later during sched_init() after jump labels
have been initialized.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default")
Link: http://lkml.kernel.org/r/453775fe3433bed65731a583e228ccea806d18cd.1465322027.git.jpoimboe@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f2cae4620c7..385c947482e1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2253,9 +2253,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 #endif
 #endif
 
+#ifdef CONFIG_SCHEDSTATS
+
 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+static bool __initdata __sched_schedstats = false;
 
-#ifdef CONFIG_SCHEDSTATS
 static void set_schedstats(bool enabled)
 {
 	if (enabled)
@@ -2278,11 +2280,16 @@ static int __init setup_schedstats(char *str)
 	if (!str)
 		goto out;
 
+	/*
+	 * This code is called before jump labels have been set up, so we can't
+	 * change the static branch directly just yet.  Instead set a temporary
+	 * variable so init_schedstats() can do it later.
+	 */
 	if (!strcmp(str, "enable")) {
-		set_schedstats(true);
+		__sched_schedstats = true;
 		ret = 1;
 	} else if (!strcmp(str, "disable")) {
-		set_schedstats(false);
+		__sched_schedstats = false;
 		ret = 1;
 	}
 out:
@@ -2293,6 +2300,11 @@ out:
 }
 __setup("schedstats=", setup_schedstats);
 
+static void __init init_schedstats(void)
+{
+	set_schedstats(__sched_schedstats);
+}
+
 #ifdef CONFIG_PROC_SYSCTL
 int sysctl_schedstats(struct ctl_table *table, int write,
 			 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2313,8 +2325,10 @@ int sysctl_schedstats(struct ctl_table *table, int write,
 		set_schedstats(state);
 	return err;
 }
-#endif
-#endif
+#endif /* CONFIG_PROC_SYSCTL */
+#else  /* !CONFIG_SCHEDSTATS */
+static inline void init_schedstats(void) {}
+#endif /* CONFIG_SCHEDSTATS */
 
 /*
  * fork()/clone()-time setup:
@@ -7487,6 +7501,8 @@ void __init sched_init(void)
 #endif
 	init_sched_fair_class();
 
+	init_schedstats();
+
 	scheduler_running = 1;
 }
 
-- 
cgit v1.2.3


From 077fa7aed17de5022e44bf07dbaf732078b7b5b2 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 8 Jun 2016 14:25:22 +0100
Subject: futex: Calculate the futex key based on a tail page for file-based
 futexes

Mike Galbraith reported that the LTP test case futex_wake04 was broken
by commit 65d8fc777f6d ("futex: Remove requirement for lock_page()
in get_futex_key()").

This test case uses futexes backed by hugetlbfs pages and so there is an
associated inode with a futex stored on such pages. The problem is that
the key is being calculated based on the head page index of the hugetlbfs
page and not the tail page.

Prior to the optimisation, the page lock was used to stabilise mappings and
pin the inode is file-backed which is overkill. If the page was a compound
page, the head page was automatically looked up as part of the page lock
operation but the tail page index was used to calculate the futex key.

After the optimisation, the compound head is looked up early and the page
lock is only relied upon to identify truncated pages, special pages or a
shmem page moving to swapcache. The head page is looked up because without
the page lock, special care has to be taken to pin the inode correctly.
However, the tail page is still required to calculate the futex key so
this patch records the tail page.

On vanilla 4.6, the output of the test case is;

futex_wake04    0  TINFO  :  Hugepagesize 2097152
futex_wake04    1  TFAIL  :  futex_wake04.c:126: Bug: wait_thread2 did not wake after 30 secs.

With the patch applied

futex_wake04    0  TINFO  :  Hugepagesize 2097152
futex_wake04    1  TPASS  :  Hi hydra, thread2 awake!

Fixes: 65d8fc777f6d "futex: Remove requirement for lock_page() in get_futex_key()"
Reported-and-tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/20160608132522.GM2469@suse.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index ee25f5ba4aca..33664f70e2d2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -469,7 +469,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
-	struct page *page;
+	struct page *page, *tail;
 	struct address_space *mapping;
 	int err, ro = 0;
 
@@ -530,7 +530,15 @@ again:
 	 * considered here and page lock forces unnecessarily serialization
 	 * From this point on, mapping will be re-verified if necessary and
 	 * page lock will be acquired only if it is unavoidable
-	 */
+	 *
+	 * Mapping checks require the head page for any compound page so the
+	 * head page and mapping is looked up now. For anonymous pages, it
+	 * does not matter if the page splits in the future as the key is
+	 * based on the address. For filesystem-backed pages, the tail is
+	 * required as the index of the page determines the key. For
+	 * base pages, there is no tail page and tail == page.
+	 */
+	tail = page;
 	page = compound_head(page);
 	mapping = READ_ONCE(page->mapping);
 
@@ -654,7 +662,7 @@ again:
 
 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 		key->shared.inode = inode;
-		key->shared.pgoff = basepage_index(page);
+		key->shared.pgoff = basepage_index(tail);
 		rcu_read_unlock();
 	}
 
-- 
cgit v1.2.3


From ba62bafe942b159a6109cbec780d36496e06b6c5 Mon Sep 17 00:00:00 2001
From: Zhouyi Zhou <yizhouzhou@ict.ac.cn>
Date: Wed, 8 Jun 2016 15:33:53 -0700
Subject: kernel/relay.c: fix potential memory leak

When relay_open_buf() fails in relay_open(), code will goto free_bufs,
but chan is nowhere freed.

Link: http://lkml.kernel.org/r/1464777927-19675-1-git-send-email-yizhouzhou@ict.ac.cn
Signed-off-by: Zhouyi Zhou <zhouzhouyi@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/relay.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 074994bcfa9b..04d7cf3ef8cf 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -614,6 +614,7 @@ free_bufs:
 
 	kref_put(&chan->kref, relay_destroy_channel);
 	mutex_unlock(&relay_channels_mutex);
+	kfree(chan);
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(relay_open);
-- 
cgit v1.2.3


From 29d6455178a09e1dc340380c582b13356227e8df Mon Sep 17 00:00:00 2001
From: Jann Horn <jannh@google.com>
Date: Wed, 1 Jun 2016 11:55:07 +0200
Subject: sched: panic on corrupted stack end

Until now, hitting this BUG_ON caused a recursive oops (because oops
handling involves do_exit(), which calls into the scheduler, which in
turn raises an oops), which caused stuff below the stack to be
overwritten until a panic happened (e.g.  via an oops in interrupt
context, caused by the overwritten CPU index in the thread_info).

Just panic directly.

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d1f7149f8704..11546a6ed5df 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3047,7 +3047,8 @@ static noinline void __schedule_bug(struct task_struct *prev)
 static inline void schedule_debug(struct task_struct *prev)
 {
 #ifdef CONFIG_SCHED_STACK_END_CHECK
-	BUG_ON(task_stack_end_corrupted(prev));
+	if (task_stack_end_corrupted(prev))
+		panic("corrupted stack end detected inside scheduler\n");
 #endif
 
 	if (unlikely(in_atomic_preempt_off())) {
-- 
cgit v1.2.3


From b7fa30c9cc48c4f55663420472505d3b4f6e1705 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 9 Jun 2016 15:07:50 +0200
Subject: sched/fair: Fix post_init_entity_util_avg() serialization

Chris Wilson reported a divide by 0 at:

 post_init_entity_util_avg():

 >    725	if (cfs_rq->avg.util_avg != 0) {
 >    726		sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
 > -> 727		sa->util_avg /= (cfs_rq->avg.load_avg + 1);
 >    728
 >    729		if (sa->util_avg > cap)
 >    730			sa->util_avg = cap;
 >    731	} else {

Which given the lack of serialization, and the code generated from
update_cfs_rq_load_avg() is entirely possible:

	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
		sa->load_avg = max_t(long, sa->load_avg - r, 0);
		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
		removed_load = 1;
	}

turns into:

  ffffffff81087064:       49 8b 85 98 00 00 00    mov    0x98(%r13),%rax
  ffffffff8108706b:       48 85 c0                test   %rax,%rax
  ffffffff8108706e:       74 40                   je     ffffffff810870b0
  ffffffff81087070:       4c 89 f8                mov    %r15,%rax
  ffffffff81087073:       49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
  ffffffff8108707a:       49 29 45 70             sub    %rax,0x70(%r13)
  ffffffff8108707e:       4c 89 f9                mov    %r15,%rcx
  ffffffff81087081:       bb 01 00 00 00          mov    $0x1,%ebx
  ffffffff81087086:       49 83 7d 70 00          cmpq   $0x0,0x70(%r13)
  ffffffff8108708b:       49 0f 49 4d 70          cmovns 0x70(%r13),%rcx

Which you'll note ends up with 'sa->load_avg - r' in memory at
ffffffff8108707a.

By calling post_init_entity_util_avg() under rq->lock we're sure to be
fully serialized against PELT updates and cannot observe intermediate
state like this.

Reported-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yuyang Du <yuyang.du@intel.com>
Cc: bsegall@google.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: steve.muckle@linaro.org
Fixes: 2b8c41daba32 ("sched/fair: Initiate a new task's util avg to a bounded value")
Link: http://lkml.kernel.org/r/20160609130750.GQ30909@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 3 +--
 kernel/sched/fair.c | 8 +++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 017d5394f5dc..13d0896aff87 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2535,10 +2535,9 @@ void wake_up_new_task(struct task_struct *p)
 	 */
 	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
-	/* Post initialize new task's util average when its cfs_rq is set */
+	rq = __task_rq_lock(p, &rf);
 	post_init_entity_util_avg(&p->se);
 
-	rq = __task_rq_lock(p, &rf);
 	activate_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
 	trace_sched_wakeup_new(p);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 218f8e83db73..4e33ad12bb68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8496,8 +8496,9 @@ void free_fair_sched_group(struct task_group *tg)
 
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
-	struct cfs_rq *cfs_rq;
 	struct sched_entity *se;
+	struct cfs_rq *cfs_rq;
+	struct rq *rq;
 	int i;
 
 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8512,6 +8513,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
 
 	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
@@ -8525,7 +8528,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 		init_entity_runnable_average(se);
+
+		raw_spin_lock_irq(&rq->lock);
 		post_init_entity_util_avg(se);
+		raw_spin_unlock_irq(&rq->lock);
 	}
 
 	return 1;
-- 
cgit v1.2.3


From eda8dca519269c92a0771668b3d5678792de7b78 Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Mon, 13 Jun 2016 02:32:09 -0500
Subject: sched/debug: Fix deadlock when enabling sched events

I see a hang when enabling sched events:

  echo 1 > /sys/kernel/debug/tracing/events/sched/enable

The printk buffer shows:

  BUG: spinlock recursion on CPU#1, swapper/1/0
   lock: 0xffff88007d5d8c00, .magic: dead4ead, .owner: swapper/1/0, .owner_cpu: 1
  CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.7.0-rc2+ #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.1-20150318_183358- 04/01/2014
  ...
  Call Trace:
   <IRQ>  [<ffffffff8143d663>] dump_stack+0x85/0xc2
   [<ffffffff81115948>] spin_dump+0x78/0xc0
   [<ffffffff81115aea>] do_raw_spin_lock+0x11a/0x150
   [<ffffffff81891471>] _raw_spin_lock+0x61/0x80
   [<ffffffff810e5466>] ? try_to_wake_up+0x256/0x4e0
   [<ffffffff810e5466>] try_to_wake_up+0x256/0x4e0
   [<ffffffff81891a0a>] ? _raw_spin_unlock_irqrestore+0x4a/0x80
   [<ffffffff810e5705>] wake_up_process+0x15/0x20
   [<ffffffff810cebb4>] insert_work+0x84/0xc0
   [<ffffffff810ced7f>] __queue_work+0x18f/0x660
   [<ffffffff810cf9a6>] queue_work_on+0x46/0x90
   [<ffffffffa00cd95b>] drm_fb_helper_dirty.isra.11+0xcb/0xe0 [drm_kms_helper]
   [<ffffffffa00cdac0>] drm_fb_helper_sys_imageblit+0x30/0x40 [drm_kms_helper]
   [<ffffffff814babcd>] soft_cursor+0x1ad/0x230
   [<ffffffff814ba379>] bit_cursor+0x649/0x680
   [<ffffffff814b9d30>] ? update_attr.isra.2+0x90/0x90
   [<ffffffff814b5e6a>] fbcon_cursor+0x14a/0x1c0
   [<ffffffff81555ef8>] hide_cursor+0x28/0x90
   [<ffffffff81558b6f>] vt_console_print+0x3bf/0x3f0
   [<ffffffff81122c63>] call_console_drivers.constprop.24+0x183/0x200
   [<ffffffff811241f4>] console_unlock+0x3d4/0x610
   [<ffffffff811247f5>] vprintk_emit+0x3c5/0x610
   [<ffffffff81124bc9>] vprintk_default+0x29/0x40
   [<ffffffff811e965b>] printk+0x57/0x73
   [<ffffffff810f7a9e>] enqueue_entity+0xc2e/0xc70
   [<ffffffff810f7b39>] enqueue_task_fair+0x59/0xab0
   [<ffffffff8106dcd9>] ? kvm_sched_clock_read+0x9/0x20
   [<ffffffff8103fb39>] ? sched_clock+0x9/0x10
   [<ffffffff810e3fcc>] activate_task+0x5c/0xa0
   [<ffffffff810e4514>] ttwu_do_activate+0x54/0xb0
   [<ffffffff810e5cea>] sched_ttwu_pending+0x7a/0xb0
   [<ffffffff810e5e51>] scheduler_ipi+0x61/0x170
   [<ffffffff81059e7f>] smp_trace_reschedule_interrupt+0x4f/0x2a0
   [<ffffffff81893ba6>] trace_reschedule_interrupt+0x96/0xa0
   <EOI>  [<ffffffff8106e0d6>] ? native_safe_halt+0x6/0x10
   [<ffffffff8110fb1d>] ? trace_hardirqs_on+0xd/0x10
   [<ffffffff81040ac0>] default_idle+0x20/0x1a0
   [<ffffffff8104147f>] arch_cpu_idle+0xf/0x20
   [<ffffffff81102f8f>] default_idle_call+0x2f/0x50
   [<ffffffff8110332e>] cpu_startup_entry+0x37e/0x450
   [<ffffffff8105af70>] start_secondary+0x160/0x1a0

Note the hang only occurs when echoing the above from a physical serial
console, not from an ssh session.

The bug is caused by a deadlock where the task is trying to grab the rq
lock twice because printk()'s aren't safe in sched code.

Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Fixes: cb2517653fcc ("sched/debug: Make schedstats a runtime tunable that is disabled by default")
Link: http://lkml.kernel.org/r/20160613073209.gdvdybiruljbkn3p@treble
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4e33ad12bb68..a2348deab7a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3246,7 +3246,7 @@ static inline void check_schedstat_required(void)
 			trace_sched_stat_iowait_enabled()  ||
 			trace_sched_stat_blocked_enabled() ||
 			trace_sched_stat_runtime_enabled())  {
-		pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
+		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
 			     "stat_blocked and stat_runtime require the "
 			     "kernel parameter schedstats=enabled or "
 			     "kernel.sched_schedstats=1\n");
-- 
cgit v1.2.3


From 57675cb976eff977aefb428e68e4e0236d48a9ff Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Thu, 9 Jun 2016 15:20:05 +0300
Subject: kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while
 processing sysrq-w

Lengthy output of sysrq-w may take a lot of time on slow serial console.

Currently we reset NMI-watchdog on the current CPU to avoid spurious
lockup messages. Sometimes this doesn't work since softlockup watchdog
might trigger on another CPU which is waiting for an IPI to proceed.
We reset softlockup watchdogs on all CPUs, but we do this only after
listing all tasks, and this may be too late on a busy system.

So, reset watchdogs CPUs earlier, in for_each_process_thread() loop.

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/1465474805-14641-1-git-send-email-aryabinin@virtuozzo.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 13d0896aff87..4135ac83cb65 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5147,14 +5147,16 @@ void show_state_filter(unsigned long state_filter)
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
 		 * console might take a lot of time:
+		 * Also, reset softlockup watchdogs on all CPUs, because
+		 * another CPU might be blocked waiting for us to process
+		 * an IPI.
 		 */
 		touch_nmi_watchdog();
+		touch_all_softlockup_watchdogs();
 		if (!state_filter || (p->state & state_filter))
 			sched_show_task(p);
 	}
 
-	touch_all_softlockup_watchdogs();
-
 #ifdef CONFIG_SCHED_DEBUG
 	if (!state_filter)
 		sysrq_sched_debug_show();
-- 
cgit v1.2.3


From df4565f9ebdc4d6dc50edc6e8fed08004e328332 Mon Sep 17 00:00:00 2001
From: Nicolai Stange <nicstange@gmail.com>
Date: Tue, 24 May 2016 14:05:05 +0200
Subject: kernel/kcov: unproxify debugfs file's fops

Since commit 49d200deaa68 ("debugfs: prevent access to removed files'
private data"), a debugfs file's file_operations methods get proxied
through lifetime aware wrappers.

However, only a certain subset of the file_operations members is supported
by debugfs and ->mmap isn't among them -- it appears to be NULL from the
VFS layer's perspective.

This behaviour breaks the /sys/kernel/debug/kcov file introduced
concurrently with commit 5c9a8750a640 ("kernel: add kcov code coverage").

Since that file never gets removed, there is no file removal race and thus,
a lifetime checking proxy isn't needed.

Avoid the proxying for /sys/kernel/debug/kcov by creating it via
debugfs_create_file_unsafe() rather than debugfs_create_file().

Fixes: 49d200deaa68 ("debugfs: prevent access to removed files' private data")
Fixes: 5c9a8750a640 ("kernel: add kcov code coverage")
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Nicolai Stange <nicstange@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 kernel/kcov.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index a02f2dddd1d7..8d44b3fea9d0 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -264,7 +264,12 @@ static const struct file_operations kcov_fops = {
 
 static int __init kcov_init(void)
 {
-	if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) {
+	/*
+	 * The kcov debugfs file won't ever get removed and thus,
+	 * there is no need to protect it against removal races. The
+	 * use of debugfs_create_file_unsafe() is actually safe here.
+	 */
+	if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) {
 		pr_err("failed to create kcov in debugfs\n");
 		return -ENOMEM;
 	}
-- 
cgit v1.2.3


From 19de99f70b87fcc3338da52a89c439b088cbff71 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 15 Jun 2016 18:25:38 -0700
Subject: bpf: fix matching of data/data_end in verifier

The ctx structure passed into bpf programs is different depending on bpf
program type. The verifier incorrectly marked ctx->data and ctx->data_end
access based on ctx offset only. That caused loads in tracing programs
int bpf_prog(struct pt_regs *ctx) { .. ctx->ax .. }
to be incorrectly marked as PTR_TO_PACKET which later caused verifier
to reject the program that was actually valid in tracing context.
Fix this by doing program type specific matching of ctx offsets.

Fixes: 969bf05eb3ce ("bpf: direct packet access")
Reported-by: Sasha Goldshtein <goldshtn@gmail.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      | 28 +++++++++++++++++++++++++++-
 kernel/bpf/verifier.c    | 41 +++++++----------------------------------
 kernel/trace/bpf_trace.c |  6 ++++--
 net/core/filter.c        | 16 ++++++++++++++--
 4 files changed, 52 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8ee27b8afe81..8269cafc6eb1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -111,6 +111,31 @@ enum bpf_access_type {
 	BPF_WRITE = 2
 };
 
+/* types of values stored in eBPF registers */
+enum bpf_reg_type {
+	NOT_INIT = 0,		 /* nothing was written into register */
+	UNKNOWN_VALUE,		 /* reg doesn't contain a valid pointer */
+	PTR_TO_CTX,		 /* reg points to bpf_context */
+	CONST_PTR_TO_MAP,	 /* reg points to struct bpf_map */
+	PTR_TO_MAP_VALUE,	 /* reg points to map element value */
+	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
+	FRAME_PTR,		 /* reg == frame_pointer */
+	PTR_TO_STACK,		 /* reg == frame_pointer + imm */
+	CONST_IMM,		 /* constant integer value */
+
+	/* PTR_TO_PACKET represents:
+	 * skb->data
+	 * skb->data + imm
+	 * skb->data + (u16) var
+	 * skb->data + (u16) var + imm
+	 * if (range > 0) then [ptr, ptr + range - off) is safe to access
+	 * if (id > 0) means that some 'var' was added
+	 * if (off > 0) menas that 'imm' was added
+	 */
+	PTR_TO_PACKET,
+	PTR_TO_PACKET_END,	 /* skb->data + headlen */
+};
+
 struct bpf_prog;
 
 struct bpf_verifier_ops {
@@ -120,7 +145,8 @@ struct bpf_verifier_ops {
 	/* return true if 'size' wide access at offset 'off' within bpf_context
 	 * with 'type' (read or write) is allowed
 	 */
-	bool (*is_valid_access)(int off, int size, enum bpf_access_type type);
+	bool (*is_valid_access)(int off, int size, enum bpf_access_type type,
+				enum bpf_reg_type *reg_type);
 
 	u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg,
 				  int src_reg, int ctx_off,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 668e07903c8f..eec9f90ba030 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -126,31 +126,6 @@
  * are set to NOT_INIT to indicate that they are no longer readable.
  */
 
-/* types of values stored in eBPF registers */
-enum bpf_reg_type {
-	NOT_INIT = 0,		 /* nothing was written into register */
-	UNKNOWN_VALUE,		 /* reg doesn't contain a valid pointer */
-	PTR_TO_CTX,		 /* reg points to bpf_context */
-	CONST_PTR_TO_MAP,	 /* reg points to struct bpf_map */
-	PTR_TO_MAP_VALUE,	 /* reg points to map element value */
-	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
-	FRAME_PTR,		 /* reg == frame_pointer */
-	PTR_TO_STACK,		 /* reg == frame_pointer + imm */
-	CONST_IMM,		 /* constant integer value */
-
-	/* PTR_TO_PACKET represents:
-	 * skb->data
-	 * skb->data + imm
-	 * skb->data + (u16) var
-	 * skb->data + (u16) var + imm
-	 * if (range > 0) then [ptr, ptr + range - off) is safe to access
-	 * if (id > 0) means that some 'var' was added
-	 * if (off > 0) menas that 'imm' was added
-	 */
-	PTR_TO_PACKET,
-	PTR_TO_PACKET_END,	 /* skb->data + headlen */
-};
-
 struct reg_state {
 	enum bpf_reg_type type;
 	union {
@@ -695,10 +670,10 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off,
 
 /* check access to 'struct bpf_context' fields */
 static int check_ctx_access(struct verifier_env *env, int off, int size,
-			    enum bpf_access_type t)
+			    enum bpf_access_type t, enum bpf_reg_type *reg_type)
 {
 	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t)) {
+	    env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {
 		/* remember the offset of last byte accessed in ctx */
 		if (env->prog->aux->max_ctx_offset < off + size)
 			env->prog->aux->max_ctx_offset = off + size;
@@ -798,21 +773,19 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 			mark_reg_unknown_value(state->regs, value_regno);
 
 	} else if (reg->type == PTR_TO_CTX) {
+		enum bpf_reg_type reg_type = UNKNOWN_VALUE;
+
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
 			verbose("R%d leaks addr into ctx\n", value_regno);
 			return -EACCES;
 		}
-		err = check_ctx_access(env, off, size, t);
+		err = check_ctx_access(env, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			mark_reg_unknown_value(state->regs, value_regno);
-			if (off == offsetof(struct __sk_buff, data) &&
-			    env->allow_ptr_leaks)
+			if (env->allow_ptr_leaks)
 				/* note that reg.[id|off|range] == 0 */
-				state->regs[value_regno].type = PTR_TO_PACKET;
-			else if (off == offsetof(struct __sk_buff, data_end) &&
-				 env->allow_ptr_leaks)
-				state->regs[value_regno].type = PTR_TO_PACKET_END;
+				state->regs[value_regno].type = reg_type;
 		}
 
 	} else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 720b7bb01d43..e7af6cb9d5cf 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -349,7 +349,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 }
 
 /* bpf+kprobe programs can access fields of 'struct pt_regs' */
-static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+					enum bpf_reg_type *reg_type)
 {
 	/* check bounds */
 	if (off < 0 || off >= sizeof(struct pt_regs))
@@ -427,7 +428,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 	}
 }
 
-static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type)
+static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
+				    enum bpf_reg_type *reg_type)
 {
 	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
 		return false;
diff --git a/net/core/filter.c b/net/core/filter.c
index 68adb5f52110..c4b330c85c02 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2085,7 +2085,8 @@ static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 }
 
 static bool sk_filter_is_valid_access(int off, int size,
-				      enum bpf_access_type type)
+				      enum bpf_access_type type,
+				      enum bpf_reg_type *reg_type)
 {
 	switch (off) {
 	case offsetof(struct __sk_buff, tc_classid):
@@ -2108,7 +2109,8 @@ static bool sk_filter_is_valid_access(int off, int size,
 }
 
 static bool tc_cls_act_is_valid_access(int off, int size,
-				       enum bpf_access_type type)
+				       enum bpf_access_type type,
+				       enum bpf_reg_type *reg_type)
 {
 	if (type == BPF_WRITE) {
 		switch (off) {
@@ -2123,6 +2125,16 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 			return false;
 		}
 	}
+
+	switch (off) {
+	case offsetof(struct __sk_buff, data):
+		*reg_type = PTR_TO_PACKET;
+		break;
+	case offsetof(struct __sk_buff, data_end):
+		*reg_type = PTR_TO_PACKET_END;
+		break;
+	}
+
 	return __is_valid_access(off, size, type);
 }
 
-- 
cgit v1.2.3


From ad572d174787daa59e24b8b5c83028c09cdb5ddb Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 15 Jun 2016 18:25:39 -0700
Subject: bpf, trace: check event type in bpf_perf_event_read

similar to bpf_perf_event_output() the bpf_perf_event_read() helper
needs to check the type of the perf_event before reading the counter.

Fixes: a43eec304259 ("bpf: introduce bpf_perf_event_output() helper")
Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e7af6cb9d5cf..26f603da7e26 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -209,6 +209,10 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
 	    event->pmu->count)
 		return -EINVAL;
 
+	if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
+		     event->attr.type != PERF_TYPE_RAW))
+		return -EINVAL;
+
 	/*
 	 * we don't know if the function is run successfully by the
 	 * return value. It can be judged in other places, such as
-- 
cgit v1.2.3


From 8fa3b8d689a54d6d04ff7803c724fb7aca6ce98e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 26 May 2016 15:42:13 -0400
Subject: cgroup: set css->id to -1 during init

If percpu_ref initialization fails during css_create(), the free path
can end up trying to free css->id of zero.  As ID 0 is unused, it
doesn't cause a critical breakage but it does trigger a warning
message.  Fix it by setting css->id to -1 from init_and_link_css().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Wenwei Tao <ww.tao0320@gmail.com>
Fixes: 01e586598b22 ("cgroup: release css->id after css_free")
Cc: stable@vger.kernel.org # v4.0+
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 789b84f973c9..688eb0cd1851 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5063,6 +5063,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 	memset(css, 0, sizeof(*css));
 	css->cgroup = cgrp;
 	css->ss = ss;
+	css->id = -1;
 	INIT_LIST_HEAD(&css->sibling);
 	INIT_LIST_HEAD(&css->children);
 	css->serial_nr = css_serial_nr_next++;
-- 
cgit v1.2.3


From 8974189222159154c55f24ddad33e3613960521a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 16 Jun 2016 10:50:40 +0200
Subject: sched/fair: Fix cfs_rq avg tracking underflow

As per commit:

  b7fa30c9cc48 ("sched/fair: Fix post_init_entity_util_avg() serialization")

> the code generated from update_cfs_rq_load_avg():
>
> 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> 		sa->load_avg = max_t(long, sa->load_avg - r, 0);
> 		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
> 		removed_load = 1;
> 	}
>
> turns into:
>
> ffffffff81087064:       49 8b 85 98 00 00 00    mov    0x98(%r13),%rax
> ffffffff8108706b:       48 85 c0                test   %rax,%rax
> ffffffff8108706e:       74 40                   je     ffffffff810870b0 <update_blocked_averages+0xc0>
> ffffffff81087070:       4c 89 f8                mov    %r15,%rax
> ffffffff81087073:       49 87 85 98 00 00 00    xchg   %rax,0x98(%r13)
> ffffffff8108707a:       49 29 45 70             sub    %rax,0x70(%r13)
> ffffffff8108707e:       4c 89 f9                mov    %r15,%rcx
> ffffffff81087081:       bb 01 00 00 00          mov    $0x1,%ebx
> ffffffff81087086:       49 83 7d 70 00          cmpq   $0x0,0x70(%r13)
> ffffffff8108708b:       49 0f 49 4d 70          cmovns 0x70(%r13),%rcx
>
> Which you'll note ends up with sa->load_avg -= r in memory at
> ffffffff8108707a.

So I _should_ have looked at other unserialized users of ->load_avg,
but alas. Luckily nikbor reported a similar /0 from task_h_load() which
instantly triggered recollection of this here problem.

Aside from the intermediate value hitting memory and causing problems,
there's another problem: the underflow detection relies on the signed
bit. This reduces the effective width of the variables, IOW its
effectively the same as having these variables be of signed type.

This patch changes to a different means of unsigned underflow
detection to not rely on the signed bit. This allows the variables to
use the 'full' unsigned range. And it does so with explicit LOAD -
STORE to ensure any intermediate value will never be visible in
memory, allowing these unserialized loads.

Note: GCC generates crap code for this, might warrant a look later.

Note2: I say 'full' above, if we end up at U*_MAX we'll still explode;
       maybe we should do clamping on add too.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yuyang Du <yuyang.du@intel.com>
Cc: bsegall@google.com
Cc: kernel@kyup.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: steve.muckle@linaro.org
Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")
Link: http://lkml.kernel.org/r/20160617091948.GJ30927@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a2348deab7a3..2ae68f0e3bf5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2904,6 +2904,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 	}
 }
 
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(*ptr) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+	res = var - val;					\
+	if (res > var)						\
+		res = 0;					\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -2913,15 +2930,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 
 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-		sa->load_avg = max_t(long, sa->load_avg - r, 0);
-		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(&sa->load_avg, r);
+		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
 		removed_load = 1;
 	}
 
 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-		sa->util_avg = max_t(long, sa->util_avg - r, 0);
-		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+		sub_positive(&sa->util_avg, r);
+		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
 		removed_util = 1;
 	}
 
@@ -2994,10 +3011,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
 			  cfs_rq->curr == se, NULL);
 
-	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-	cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-	cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 
 	cfs_rq_util_change(cfs_rq);
 }
-- 
cgit v1.2.3


From 70c8217acd4383e069fe1898bbad36ea4fcdbdcc Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 17 Jun 2016 16:10:42 -0400
Subject: tracing: Handle NULL formats in hold_module_trace_bprintk_format()

If a task uses a non constant string for the format parameter in
trace_printk(), then the trace_printk_fmt variable is set to NULL. This
variable is then saved in the __trace_printk_fmt section.

The function hold_module_trace_bprintk_format() checks to see if duplicate
formats are used by modules, and reuses them if so (saves them to the list
if it is new). But this function calls lookup_format() that does a strcmp()
to the value (which is now NULL) and can cause a kernel oops.

This wasn't an issue till 3debb0a9ddb ("tracing: Fix trace_printk() to print
when not using bprintk()") which added "__used" to the trace_printk_fmt
variable, and before that, the kernel simply optimized it out (no NULL value
was saved).

The fix is simply to handle the NULL pointer in lookup_format() and have the
caller ignore the value if it was NULL.

Link: http://lkml.kernel.org/r/1464769870-18344-1-git-send-email-zhengjun.xing@intel.com

Reported-by: xingzhen <zhengjun.xing@intel.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Fixes: 3debb0a9ddb ("tracing: Fix trace_printk() to print when not using bprintk()")
Cc: stable@vger.kernel.org # v3.5+
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_printk.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index f96f0383f6c6..ad1d6164e946 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -36,6 +36,10 @@ struct trace_bprintk_fmt {
 static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
 {
 	struct trace_bprintk_fmt *pos;
+
+	if (!fmt)
+		return ERR_PTR(-EINVAL);
+
 	list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
 		if (!strcmp(pos->fmt, fmt))
 			return pos;
@@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
 	for (iter = start; iter < end; iter++) {
 		struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
 		if (tb_fmt) {
-			*iter = tb_fmt->fmt;
+			if (!IS_ERR(tb_fmt))
+				*iter = tb_fmt->fmt;
 			continue;
 		}
 
-- 
cgit v1.2.3


From 6720a305df74ca30bcc10fc316881641b6ff0c80 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 23 Jun 2016 12:11:17 -0700
Subject: locking: avoid passing around 'thread_info' in mutex debugging code

None of the code actually wants a thread_info, it all wants a
task_struct, and it's just converting back and forth between the two
("ti->task" to get the task_struct from the thread_info, and
"task_thread_info(task)" to go the other way).

No semantic change.

Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/locking/mutex-debug.c | 12 ++++++------
 kernel/locking/mutex-debug.h |  4 ++--
 kernel/locking/mutex.c       |  6 +++---
 kernel/locking/mutex.h       |  2 +-
 4 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 3ef3736002d8..9c951fade415 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter)
 }
 
 void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
-			    struct thread_info *ti)
+			    struct task_struct *task)
 {
 	SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
 
 	/* Mark the current thread as blocked on the lock: */
-	ti->task->blocked_on = waiter;
+	task->blocked_on = waiter;
 }
 
 void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
-			 struct thread_info *ti)
+			 struct task_struct *task)
 {
 	DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
-	DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
-	DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
-	ti->task->blocked_on = NULL;
+	DEBUG_LOCKS_WARN_ON(waiter->task != task);
+	DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
+	task->blocked_on = NULL;
 
 	list_del_init(&waiter->list);
 	waiter->task = NULL;
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
index 0799fd3e4cfa..d06ae3bb46c5 100644
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
@@ -20,9 +20,9 @@ extern void debug_mutex_wake_waiter(struct mutex *lock,
 extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
 extern void debug_mutex_add_waiter(struct mutex *lock,
 				   struct mutex_waiter *waiter,
-				   struct thread_info *ti);
+				   struct task_struct *task);
 extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
-				struct thread_info *ti);
+				struct task_struct *task);
 extern void debug_mutex_unlock(struct mutex *lock);
 extern void debug_mutex_init(struct mutex *lock, const char *name,
 			     struct lock_class_key *key);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 79d2d765a75f..a70b90db3909 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -537,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		goto skip_wait;
 
 	debug_mutex_lock_common(lock, &waiter);
-	debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
+	debug_mutex_add_waiter(lock, &waiter, task);
 
 	/* add waiting tasks to the end of the waitqueue (FIFO): */
 	list_add_tail(&waiter.list, &lock->wait_list);
@@ -584,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	}
 	__set_task_state(task, TASK_RUNNING);
 
-	mutex_remove_waiter(lock, &waiter, current_thread_info());
+	mutex_remove_waiter(lock, &waiter, task);
 	/* set it to 0 if there are no waiters left: */
 	if (likely(list_empty(&lock->wait_list)))
 		atomic_set(&lock->count, 0);
@@ -605,7 +605,7 @@ skip_wait:
 	return 0;
 
 err:
-	mutex_remove_waiter(lock, &waiter, task_thread_info(task));
+	mutex_remove_waiter(lock, &waiter, task);
 	spin_unlock_mutex(&lock->wait_lock, flags);
 	debug_mutex_free_waiter(&waiter);
 	mutex_release(&lock->dep_map, 1, ip);
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 5cda397607f2..a68bae5e852a 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -13,7 +13,7 @@
 		do { spin_lock(lock); (void)(flags); } while (0)
 #define spin_unlock_mutex(lock, flags) \
 		do { spin_unlock(lock); (void)(flags); } while (0)
-#define mutex_remove_waiter(lock, waiter, ti) \
+#define mutex_remove_waiter(lock, waiter, task) \
 		__list_del((waiter)->list.prev, (waiter)->list.next)
 
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-- 
cgit v1.2.3


From 82d6489d0fed2ec8a8c48c19e8d8a04ac8e5bb26 Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@redhat.com>
Date: Wed, 22 Jun 2016 17:28:41 -0300
Subject: cgroup: Disable IRQs while holding css_set_lock

While testing the deadline scheduler + cgroup setup I hit this
warning.

[  132.612935] ------------[ cut here ]------------
[  132.612951] WARNING: CPU: 5 PID: 0 at kernel/softirq.c:150 __local_bh_enable_ip+0x6b/0x80
[  132.612952] Modules linked in: (a ton of modules...)
[  132.612981] CPU: 5 PID: 0 Comm: swapper/5 Not tainted 4.7.0-rc2 #2
[  132.612981] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.8.2-20150714_191134- 04/01/2014
[  132.612982]  0000000000000086 45c8bb5effdd088b ffff88013fd43da0 ffffffff813d229e
[  132.612984]  0000000000000000 0000000000000000 ffff88013fd43de0 ffffffff810a652b
[  132.612985]  00000096811387b5 0000000000000200 ffff8800bab29d80 ffff880034c54c00
[  132.612986] Call Trace:
[  132.612987]  <IRQ>  [<ffffffff813d229e>] dump_stack+0x63/0x85
[  132.612994]  [<ffffffff810a652b>] __warn+0xcb/0xf0
[  132.612997]  [<ffffffff810e76a0>] ? push_dl_task.part.32+0x170/0x170
[  132.612999]  [<ffffffff810a665d>] warn_slowpath_null+0x1d/0x20
[  132.613000]  [<ffffffff810aba5b>] __local_bh_enable_ip+0x6b/0x80
[  132.613008]  [<ffffffff817d6c8a>] _raw_write_unlock_bh+0x1a/0x20
[  132.613010]  [<ffffffff817d6c9e>] _raw_spin_unlock_bh+0xe/0x10
[  132.613015]  [<ffffffff811388ac>] put_css_set+0x5c/0x60
[  132.613016]  [<ffffffff8113dc7f>] cgroup_free+0x7f/0xa0
[  132.613017]  [<ffffffff810a3912>] __put_task_struct+0x42/0x140
[  132.613018]  [<ffffffff810e776a>] dl_task_timer+0xca/0x250
[  132.613027]  [<ffffffff810e76a0>] ? push_dl_task.part.32+0x170/0x170
[  132.613030]  [<ffffffff8111371e>] __hrtimer_run_queues+0xee/0x270
[  132.613031]  [<ffffffff81113ec8>] hrtimer_interrupt+0xa8/0x190
[  132.613034]  [<ffffffff81051a58>] local_apic_timer_interrupt+0x38/0x60
[  132.613035]  [<ffffffff817d9b0d>] smp_apic_timer_interrupt+0x3d/0x50
[  132.613037]  [<ffffffff817d7c5c>] apic_timer_interrupt+0x8c/0xa0
[  132.613038]  <EOI>  [<ffffffff81063466>] ? native_safe_halt+0x6/0x10
[  132.613043]  [<ffffffff81037a4e>] default_idle+0x1e/0xd0
[  132.613044]  [<ffffffff810381cf>] arch_cpu_idle+0xf/0x20
[  132.613046]  [<ffffffff810e8fda>] default_idle_call+0x2a/0x40
[  132.613047]  [<ffffffff810e92d7>] cpu_startup_entry+0x2e7/0x340
[  132.613048]  [<ffffffff81050235>] start_secondary+0x155/0x190
[  132.613049] ---[ end trace f91934d162ce9977 ]---

The warn is the spin_(lock|unlock)_bh(&css_set_lock) in the interrupt
context. Converting the spin_lock_bh to spin_lock_irq(save) to avoid
this problem - and other problems of sharing a spinlock with an
interrupt.

Cc: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: cgroups@vger.kernel.org
Cc: stable@vger.kernel.org # 4.5+
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Rik van Riel <riel@redhat.com>
Reviewed-by: "Luis Claudio R. Goncalves" <lgoncalv@redhat.com>
Signed-off-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Acked-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 142 +++++++++++++++++++++++++++++---------------------------
 1 file changed, 74 insertions(+), 68 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 688eb0cd1851..75c0ff00aca6 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -837,6 +837,8 @@ static void put_css_set_locked(struct css_set *cset)
 
 static void put_css_set(struct css_set *cset)
 {
+	unsigned long flags;
+
 	/*
 	 * Ensure that the refcount doesn't hit zero while any readers
 	 * can see it. Similar to atomic_dec_and_lock(), but for an
@@ -845,9 +847,9 @@ static void put_css_set(struct css_set *cset)
 	if (atomic_add_unless(&cset->refcount, -1, 1))
 		return;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irqsave(&css_set_lock, flags);
 	put_css_set_locked(cset);
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irqrestore(&css_set_lock, flags);
 }
 
 /*
@@ -1070,11 +1072,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	cset = find_existing_css_set(old_cset, cgrp, template);
 	if (cset)
 		get_css_set(cset);
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	if (cset)
 		return cset;
@@ -1102,7 +1104,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	 * find_existing_css_set() */
 	memcpy(cset->subsys, template, sizeof(cset->subsys));
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	/* Add reference counts and links from the new css_set. */
 	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
@@ -1128,7 +1130,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 		css_get(css);
 	}
 
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	return cset;
 }
@@ -1192,7 +1194,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 	 * Release all the links from cset_links to this hierarchy's
 	 * root cgroup
 	 */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
 		list_del(&link->cset_link);
@@ -1200,7 +1202,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 		kfree(link);
 	}
 
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	if (!list_empty(&root->root_list)) {
 		list_del(&root->root_list);
@@ -1600,11 +1602,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
 		ss->root = dst_root;
 		css->cgroup = dcgrp;
 
-		spin_lock_bh(&css_set_lock);
+		spin_lock_irq(&css_set_lock);
 		hash_for_each(css_set_table, i, cset, hlist)
 			list_move_tail(&cset->e_cset_node[ss->id],
 				       &dcgrp->e_csets[ss->id]);
-		spin_unlock_bh(&css_set_lock);
+		spin_unlock_irq(&css_set_lock);
 
 		/* default hierarchy doesn't enable controllers by default */
 		dst_root->subsys_mask |= 1 << ssid;
@@ -1640,10 +1642,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 	if (!buf)
 		return -ENOMEM;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
 	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	if (len >= PATH_MAX)
 		len = -ERANGE;
@@ -1897,7 +1899,7 @@ static void cgroup_enable_task_cg_lists(void)
 {
 	struct task_struct *p, *g;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	if (use_task_css_set_links)
 		goto out_unlock;
@@ -1922,8 +1924,12 @@ static void cgroup_enable_task_cg_lists(void)
 		 * entry won't be deleted though the process has exited.
 		 * Do it while holding siglock so that we don't end up
 		 * racing against cgroup_exit().
+		 *
+		 * Interrupts were already disabled while acquiring
+		 * the css_set_lock, so we do not need to disable it
+		 * again when acquiring the sighand->siglock here.
 		 */
-		spin_lock_irq(&p->sighand->siglock);
+		spin_lock(&p->sighand->siglock);
 		if (!(p->flags & PF_EXITING)) {
 			struct css_set *cset = task_css_set(p);
 
@@ -1932,11 +1938,11 @@ static void cgroup_enable_task_cg_lists(void)
 			list_add_tail(&p->cg_list, &cset->tasks);
 			get_css_set(cset);
 		}
-		spin_unlock_irq(&p->sighand->siglock);
+		spin_unlock(&p->sighand->siglock);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
 out_unlock:
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 }
 
 static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -2043,13 +2049,13 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	 * Link the root cgroup in this hierarchy into all the css_set
 	 * objects.
 	 */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	hash_for_each(css_set_table, i, cset, hlist) {
 		link_css_set(&tmp_links, cset, root_cgrp);
 		if (css_set_populated(cset))
 			cgroup_update_populated(root_cgrp, true);
 	}
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	BUG_ON(!list_empty(&root_cgrp->self.children));
 	BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -2256,11 +2262,11 @@ out_mount:
 		struct cgroup *cgrp;
 
 		mutex_lock(&cgroup_mutex);
-		spin_lock_bh(&css_set_lock);
+		spin_lock_irq(&css_set_lock);
 
 		cgrp = cset_cgroup_from_root(ns->root_cset, root);
 
-		spin_unlock_bh(&css_set_lock);
+		spin_unlock_irq(&css_set_lock);
 		mutex_unlock(&cgroup_mutex);
 
 		nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
@@ -2337,11 +2343,11 @@ char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
 	char *ret;
 
 	mutex_lock(&cgroup_mutex);
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
 
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	mutex_unlock(&cgroup_mutex);
 
 	return ret;
@@ -2369,7 +2375,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 	char *path = NULL;
 
 	mutex_lock(&cgroup_mutex);
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
 
@@ -2382,7 +2388,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 			path = buf;
 	}
 
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	mutex_unlock(&cgroup_mutex);
 	return path;
 }
@@ -2557,7 +2563,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 	 * the new cgroup.  There are no failure cases after here, so this
 	 * is the commit point.
 	 */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(cset, &tset->src_csets, mg_node) {
 		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
 			struct css_set *from_cset = task_css_set(task);
@@ -2568,7 +2574,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 			put_css_set_locked(from_cset);
 		}
 	}
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	/*
 	 * Migration is committed, all target tasks are now on dst_csets.
@@ -2597,13 +2603,13 @@ out_cancel_attach:
 		}
 	} while_each_subsys_mask();
 out_release_tset:
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_splice_init(&tset->dst_csets, &tset->src_csets);
 	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
 		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
 		list_del_init(&cset->mg_node);
 	}
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	return ret;
 }
 
@@ -2634,7 +2640,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
 		cset->mg_src_cgrp = NULL;
 		cset->mg_dst_cgrp = NULL;
@@ -2642,7 +2648,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
 		list_del_init(&cset->mg_preload_node);
 		put_css_set_locked(cset);
 	}
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 }
 
 /**
@@ -2783,7 +2789,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 	 * already PF_EXITING could be freed from underneath us unless we
 	 * take an rcu_read_lock.
 	 */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	rcu_read_lock();
 	task = leader;
 	do {
@@ -2792,7 +2798,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	return cgroup_taskset_migrate(&tset, root);
 }
@@ -2816,7 +2822,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 		return -EBUSY;
 
 	/* look up all src csets */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	rcu_read_lock();
 	task = leader;
 	do {
@@ -2826,7 +2832,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	/* prepare dst csets and commit */
 	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
@@ -2859,9 +2865,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 		struct cgroup *cgrp;
 		struct inode *inode;
 
-		spin_lock_bh(&css_set_lock);
+		spin_lock_irq(&css_set_lock);
 		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-		spin_unlock_bh(&css_set_lock);
+		spin_unlock_irq(&css_set_lock);
 
 		while (!cgroup_is_descendant(dst_cgrp, cgrp))
 			cgrp = cgroup_parent(cgrp);
@@ -2962,9 +2968,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 		if (root == &cgrp_dfl_root)
 			continue;
 
-		spin_lock_bh(&css_set_lock);
+		spin_lock_irq(&css_set_lock);
 		from_cgrp = task_cgroup_from_root(from, root);
-		spin_unlock_bh(&css_set_lock);
+		spin_unlock_irq(&css_set_lock);
 
 		retval = cgroup_attach_task(from_cgrp, tsk, false);
 		if (retval)
@@ -3080,7 +3086,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	percpu_down_write(&cgroup_threadgroup_rwsem);
 
 	/* look up all csses currently attached to @cgrp's subtree */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
 		struct cgrp_cset_link *link;
 
@@ -3088,14 +3094,14 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 			cgroup_migrate_add_src(link->cset, dsct,
 					       &preloaded_csets);
 	}
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	/* NULL dst indicates self on default hierarchy */
 	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
 	if (ret)
 		goto out_finish;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
 		struct task_struct *task, *ntask;
 
@@ -3107,7 +3113,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
 			cgroup_taskset_add(task, &tset);
 	}
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	ret = cgroup_taskset_migrate(&tset, cgrp->root);
 out_finish:
@@ -3908,10 +3914,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
 	int count = 0;
 	struct cgrp_cset_link *link;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
 		count += atomic_read(&link->cset->refcount);
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	return count;
 }
 
@@ -4249,7 +4255,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 
 	memset(it, 0, sizeof(*it));
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	it->ss = css->ss;
 
@@ -4262,7 +4268,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 
 	css_task_iter_advance_css_set(it);
 
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 }
 
 /**
@@ -4280,7 +4286,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
 		it->cur_task = NULL;
 	}
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	if (it->task_pos) {
 		it->cur_task = list_entry(it->task_pos, struct task_struct,
@@ -4289,7 +4295,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
 		css_task_iter_advance(it);
 	}
 
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	return it->cur_task;
 }
@@ -4303,10 +4309,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
 void css_task_iter_end(struct css_task_iter *it)
 {
 	if (it->cur_cset) {
-		spin_lock_bh(&css_set_lock);
+		spin_lock_irq(&css_set_lock);
 		list_del(&it->iters_node);
 		put_css_set_locked(it->cur_cset);
-		spin_unlock_bh(&css_set_lock);
+		spin_unlock_irq(&css_set_lock);
 	}
 
 	if (it->cur_task)
@@ -4338,10 +4344,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	mutex_lock(&cgroup_mutex);
 
 	/* all tasks in @from are being moved, all csets are source */
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &from->cset_links, cset_link)
 		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	ret = cgroup_migrate_prepare_dst(&preloaded_csets);
 	if (ret)
@@ -5449,10 +5455,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	 */
 	cgrp->self.flags &= ~CSS_ONLINE;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
 		link->cset->dead = true;
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 
 	/* initiate massacre of all css's */
 	for_each_css(css, ssid, cgrp)
@@ -5723,7 +5729,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		goto out;
 
 	mutex_lock(&cgroup_mutex);
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	for_each_root(root) {
 		struct cgroup_subsys *ss;
@@ -5776,7 +5782,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 
 	retval = 0;
 out_unlock:
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	mutex_unlock(&cgroup_mutex);
 	kfree(buf);
 out:
@@ -5921,13 +5927,13 @@ void cgroup_post_fork(struct task_struct *child)
 	if (use_task_css_set_links) {
 		struct css_set *cset;
 
-		spin_lock_bh(&css_set_lock);
+		spin_lock_irq(&css_set_lock);
 		cset = task_css_set(current);
 		if (list_empty(&child->cg_list)) {
 			get_css_set(cset);
 			css_set_move_task(child, NULL, cset, false);
 		}
-		spin_unlock_bh(&css_set_lock);
+		spin_unlock_irq(&css_set_lock);
 	}
 
 	/*
@@ -5972,9 +5978,9 @@ void cgroup_exit(struct task_struct *tsk)
 	cset = task_css_set(tsk);
 
 	if (!list_empty(&tsk->cg_list)) {
-		spin_lock_bh(&css_set_lock);
+		spin_lock_irq(&css_set_lock);
 		css_set_move_task(tsk, cset, NULL, false);
-		spin_unlock_bh(&css_set_lock);
+		spin_unlock_irq(&css_set_lock);
 	} else {
 		get_css_set(cset);
 	}
@@ -6042,9 +6048,9 @@ static void cgroup_release_agent(struct work_struct *work)
 	if (!pathbuf || !agentbuf)
 		goto out;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	if (!path)
 		goto out;
 
@@ -6304,12 +6310,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
 		return ERR_PTR(-EPERM);
 
 	mutex_lock(&cgroup_mutex);
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 
 	cset = task_css_set(current);
 	get_css_set(cset);
 
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	mutex_unlock(&cgroup_mutex);
 
 	new_ns = alloc_cgroup_ns();
@@ -6433,7 +6439,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 	if (!name_buf)
 		return -ENOMEM;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	rcu_read_lock();
 	cset = rcu_dereference(current->cgroups);
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -6444,7 +6450,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 			   c->root->hierarchy_id, name_buf);
 	}
 	rcu_read_unlock();
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	kfree(name_buf);
 	return 0;
 }
@@ -6455,7 +6461,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 	struct cgroup_subsys_state *css = seq_css(seq);
 	struct cgrp_cset_link *link;
 
-	spin_lock_bh(&css_set_lock);
+	spin_lock_irq(&css_set_lock);
 	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
 		struct css_set *cset = link->cset;
 		struct task_struct *task;
@@ -6478,7 +6484,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 	overflow:
 		seq_puts(seq, "  ...\n");
 	}
-	spin_unlock_bh(&css_set_lock);
+	spin_unlock_irq(&css_set_lock);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 4c5ea0a9cd02d6aa8adc86e100b2a4cff8d614ff Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 21 Jun 2016 18:52:17 +0200
Subject: locking/static_key: Fix concurrent static_key_slow_inc()

The following scenario is possible:

    CPU 1                                   CPU 2
    static_key_slow_inc()
     atomic_inc_not_zero()
      -> key.enabled == 0, no increment
     jump_label_lock()
     atomic_inc_return()
      -> key.enabled == 1 now
                                            static_key_slow_inc()
                                             atomic_inc_not_zero()
                                              -> key.enabled == 1, inc to 2
                                             return
                                            ** static key is wrong!
     jump_label_update()
     jump_label_unlock()

Testing the static key at the point marked by (**) will follow the
wrong path for jumps that have not been patched yet.  This can
actually happen when creating many KVM virtual machines with userspace
LAPIC emulation; just run several copies of the following program:

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
        for (;;) {
            int kvmfd = open("/dev/kvm", O_RDONLY);
            int vmfd = ioctl(kvmfd, KVM_CREATE_VM, 0);
            close(ioctl(vmfd, KVM_CREATE_VCPU, 1));
            close(vmfd);
            close(kvmfd);
        }
        return 0;
    }

Every KVM_CREATE_VCPU ioctl will attempt a static_key_slow_inc() call.
The static key's purpose is to skip NULL pointer checks and indeed one
of the processes eventually dereferences NULL.

As explained in the commit that introduced the bug:

  706249c222f6 ("locking/static_keys: Rework update logic")

jump_label_update() needs key.enabled to be true.  The solution adopted
here is to temporarily make key.enabled == -1, and use go down the
slow path when key.enabled <= 0.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org> # v4.3+
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 706249c222f6 ("locking/static_keys: Rework update logic")
Link: http://lkml.kernel.org/r/1466527937-69798-1-git-send-email-pbonzini@redhat.com
[ Small stylistic edits to the changelog and the code. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/jump_label.h | 16 +++++++++++++---
 kernel/jump_label.c        | 36 +++++++++++++++++++++++++++++++++---
 2 files changed, 46 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 0536524bb9eb..68904469fba1 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -117,13 +117,18 @@ struct module;
 
 #include <linux/atomic.h>
 
+#ifdef HAVE_JUMP_LABEL
+
 static inline int static_key_count(struct static_key *key)
 {
-	return atomic_read(&key->enabled);
+	/*
+	 * -1 means the first static_key_slow_inc() is in progress.
+	 *  static_key_enabled() must return true, so return 1 here.
+	 */
+	int n = atomic_read(&key->enabled);
+	return n >= 0 ? n : 1;
 }
 
-#ifdef HAVE_JUMP_LABEL
-
 #define JUMP_TYPE_FALSE	0UL
 #define JUMP_TYPE_TRUE	1UL
 #define JUMP_TYPE_MASK	1UL
@@ -162,6 +167,11 @@ extern void jump_label_apply_nops(struct module *mod);
 
 #else  /* !HAVE_JUMP_LABEL */
 
+static inline int static_key_count(struct static_key *key)
+{
+	return atomic_read(&key->enabled);
+}
+
 static __always_inline void jump_label_init(void)
 {
 	static_key_initialized = true;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 05254eeb4b4e..4b353e0be121 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key);
 
 void static_key_slow_inc(struct static_key *key)
 {
+	int v, v1;
+
 	STATIC_KEY_CHECK_USE();
-	if (atomic_inc_not_zero(&key->enabled))
-		return;
+
+	/*
+	 * Careful if we get concurrent static_key_slow_inc() calls;
+	 * later calls must wait for the first one to _finish_ the
+	 * jump_label_update() process.  At the same time, however,
+	 * the jump_label_update() call below wants to see
+	 * static_key_enabled(&key) for jumps to be updated properly.
+	 *
+	 * So give a special meaning to negative key->enabled: it sends
+	 * static_key_slow_inc() down the slow path, and it is non-zero
+	 * so it counts as "enabled" in jump_label_update().  Note that
+	 * atomic_inc_unless_negative() checks >= 0, so roll our own.
+	 */
+	for (v = atomic_read(&key->enabled); v > 0; v = v1) {
+		v1 = atomic_cmpxchg(&key->enabled, v, v + 1);
+		if (likely(v1 == v))
+			return;
+	}
 
 	jump_label_lock();
-	if (atomic_inc_return(&key->enabled) == 1)
+	if (atomic_read(&key->enabled) == 0) {
+		atomic_set(&key->enabled, -1);
 		jump_label_update(key);
+		atomic_set(&key->enabled, 1);
+	} else {
+		atomic_inc(&key->enabled);
+	}
 	jump_label_unlock();
 }
 EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
 static void __static_key_slow_dec(struct static_key *key,
 		unsigned long rate_limit, struct delayed_work *work)
 {
+	/*
+	 * The negative count check is valid even when a negative
+	 * key->enabled is in use by static_key_slow_inc(); a
+	 * __static_key_slow_dec() before the first static_key_slow_inc()
+	 * returns is unbalanced, because all other static_key_slow_inc()
+	 * instances block while the update is in progress.
+	 */
 	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
 		WARN(atomic_read(&key->enabled) < 0,
 		     "jump label: negative count!\n");
-- 
cgit v1.2.3


From 094f469172e00d6ab0a3130b0e01c83b3cf3a98d Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Thu, 16 Jun 2016 15:57:01 +0300
Subject: sched/fair: Initialize throttle_count for new task-groups lazily

Cgroup created inside throttled group must inherit current throttle_count.
Broken throttle_count allows to nominate throttled entries as a next buddy,
later this leads to null pointer dereference in pick_next_task_fair().

This patch initialize cfs_rq->throttle_count at first enqueue: laziness
allows to skip locking all rq at group creation. Lazy approach also allows
to skip full sub-tree scan at throttling hierarchy (not in this patch).

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Link: http://lkml.kernel.org/r/146608182119.21870.8439834428248129633.stgit@buzz
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c  | 20 ++++++++++++++++++++
 kernel/sched/sched.h |  2 +-
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2ae68f0e3bf5..8c5d8c0c8827 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4202,6 +4202,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 	if (!cfs_bandwidth_used())
 		return;
 
+	/* Synchronize hierarchical throttle counter: */
+	if (unlikely(!cfs_rq->throttle_uptodate)) {
+		struct rq *rq = rq_of(cfs_rq);
+		struct cfs_rq *pcfs_rq;
+		struct task_group *tg;
+
+		cfs_rq->throttle_uptodate = 1;
+
+		/* Get closest up-to-date node, because leaves go first: */
+		for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
+			pcfs_rq = tg->cfs_rq[cpu_of(rq)];
+			if (pcfs_rq->throttle_uptodate)
+				break;
+		}
+		if (tg) {
+			cfs_rq->throttle_count = pcfs_rq->throttle_count;
+			cfs_rq->throttled_clock_task = rq_clock_task(rq);
+		}
+	}
+
 	/* an active group must be handled by the update_curr()->put() path */
 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
 		return;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 72f1f3087b04..7cbeb92a1cb9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -437,7 +437,7 @@ struct cfs_rq {
 
 	u64 throttled_clock, throttled_clock_task;
 	u64 throttled_clock_task_time;
-	int throttled, throttle_count;
+	int throttled, throttle_count, throttle_uptodate;
 	struct list_head throttled_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
-- 
cgit v1.2.3


From 754bd598be9bbc953bc709a9e8ed7f3188bfb9d7 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Thu, 16 Jun 2016 15:57:15 +0300
Subject: sched/fair: Do not announce throttled next buddy in
 dequeue_task_fair()

Hierarchy could be already throttled at this point. Throttled next
buddy could trigger a NULL pointer dereference in pick_next_task_fair().

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ben Segall <bsegall@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/146608183552.21905.15924473394414832071.stgit@buzz
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8c5d8c0c8827..bdcbeea90c95 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4537,15 +4537,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
+			/* Avoid re-evaluating load for this entity: */
+			se = parent_entity(se);
 			/*
 			 * Bias pick_next to pick a task from this cfs_rq, as
 			 * p is sleeping when it is within its sched_slice.
 			 */
-			if (task_sleep && parent_entity(se))
-				set_next_buddy(parent_entity(se));
-
-			/* avoid re-evaluating load for this entity */
-			se = parent_entity(se);
+			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
+				set_next_buddy(se);
 			break;
 		}
 		flags |= DEQUEUE_SLEEP;
-- 
cgit v1.2.3


From feb245e304f343cf5e4f9123db36354144dce8a4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 16 Jun 2016 15:35:04 -0400
Subject: sched/core: Allow kthreads to fall back to online && !active cpus

During CPU hotplug, CPU_ONLINE callbacks are run while the CPU is
online but not active.  A CPU_ONLINE callback may create or bind a
kthread so that its cpus_allowed mask only allows the CPU which is
being brought online.  The kthread may start executing before the CPU
is made active and can end up in select_fallback_rq().

In such cases, the expected behavior is selecting the CPU which is
coming online; however, because select_fallback_rq() only chooses from
active CPUs, it determines that the task doesn't have any viable CPU
in its allowed mask and ends up overriding it to cpu_possible_mask.

CPU_ONLINE callbacks should be able to put kthreads on the CPU which
is coming online.  Update select_fallback_rq() so that it follows
cpu_online() rather than cpu_active() for kthreads.

Reported-by: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Tested-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Abdul Haleem <abdhalee@linux.vnet.ibm.com>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team@fb.com
Cc: linuxppc-dev@lists.ozlabs.org
Link: http://lkml.kernel.org/r/20160616193504.GB3262@mtj.duckdns.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4135ac83cb65..51d7105f529a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 	for (;;) {
 		/* Any allowed, online CPU? */
 		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
-			if (!cpu_active(dest_cpu))
+			if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
+				continue;
+			if (!cpu_online(dest_cpu))
 				continue;
 			goto out;
 		}
-- 
cgit v1.2.3


From b235beea9e996a4d36fed6cfef4801a3e7d7a9a5 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 24 Jun 2016 15:09:37 -0700
Subject: Clarify naming of thread info/stack allocators

We've had the thread info allocated together with the thread stack for
most architectures for a long time (since the thread_info was split off
from the task struct), but that is about to change.

But the patches that move the thread info to be off-stack (and a part of
the task struct instead) made it clear how confused the allocator and
freeing functions are.

Because the common case was that we share an allocation with the thread
stack and the thread_info, the two pointers were identical.  That
identity then meant that we would have things like

	ti = alloc_thread_info_node(tsk, node);
	...
	tsk->stack = ti;

which certainly _worked_ (since stack and thread_info have the same
value), but is rather confusing: why are we assigning a thread_info to
the stack? And if we move the thread_info away, the "confusing" code
just gets to be entirely bogus.

So remove all this confusion, and make it clear that we are doing the
stack allocation by renaming and clarifying the function names to be
about the stack.  The fact that the thread_info then shares the
allocation is an implementation detail, and not really about the
allocation itself.

This is a pure renaming and type fix: we pass in the same pointer, it's
just that we clarify what the pointer means.

The ia64 code that actually only has one single allocation (for all of
task_struct, thread_info and kernel thread stack) now looks a bit odd,
but since "tsk->stack" is actually not even used there, that oddity
doesn't matter.  It would be a separate thing to clean that up, I
intentionally left the ia64 changes as a pure brute-force renaming and
type change.

Acked-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig                           |  4 +--
 arch/ia64/Kconfig                      |  2 +-
 arch/ia64/include/asm/thread_info.h    |  8 +++---
 arch/mn10300/include/asm/thread_info.h |  2 +-
 arch/mn10300/kernel/kgdb.c             |  3 +-
 arch/tile/include/asm/thread_info.h    |  2 +-
 arch/tile/kernel/process.c             |  3 +-
 include/linux/sched.h                  |  2 +-
 init/main.c                            |  4 +--
 kernel/fork.c                          | 50 +++++++++++++++++-----------------
 10 files changed, 41 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index e9734796531f..15996290fed4 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -226,8 +226,8 @@ config ARCH_INIT_TASK
 config ARCH_TASK_STRUCT_ALLOCATOR
 	bool
 
-# Select if arch has its private alloc_thread_info() function
-config ARCH_THREAD_INFO_ALLOCATOR
+# Select if arch has its private alloc_thread_stack() function
+config ARCH_THREAD_STACK_ALLOCATOR
 	bool
 
 # Select if arch wants to size task_struct dynamically via arch_task_struct_size:
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index f80758cb7157..e109ee95e919 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -45,7 +45,7 @@ config IA64
 	select GENERIC_SMP_IDLE_THREAD
 	select ARCH_INIT_TASK
 	select ARCH_TASK_STRUCT_ALLOCATOR
-	select ARCH_THREAD_INFO_ALLOCATOR
+	select ARCH_THREAD_STACK_ALLOCATOR
 	select ARCH_CLOCKSOURCE_DATA
 	select GENERIC_TIME_VSYSCALL_OLD
 	select SYSCTL_ARCH_UNALIGN_NO_WARN
diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h
index aa995b67c3f5..d1212b84fb83 100644
--- a/arch/ia64/include/asm/thread_info.h
+++ b/arch/ia64/include/asm/thread_info.h
@@ -48,15 +48,15 @@ struct thread_info {
 #ifndef ASM_OFFSETS_C
 /* how to get the thread information struct from C */
 #define current_thread_info()	((struct thread_info *) ((char *) current + IA64_TASK_SIZE))
-#define alloc_thread_info_node(tsk, node)	\
-		((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE))
+#define alloc_thread_stack_node(tsk, node)	\
+		((unsigned long *) ((char *) (tsk) + IA64_TASK_SIZE))
 #define task_thread_info(tsk)	((struct thread_info *) ((char *) (tsk) + IA64_TASK_SIZE))
 #else
 #define current_thread_info()	((struct thread_info *) 0)
-#define alloc_thread_info_node(tsk, node)	((struct thread_info *) 0)
+#define alloc_thread_stack_node(tsk, node)	((unsigned long *) 0)
 #define task_thread_info(tsk)	((struct thread_info *) 0)
 #endif
-#define free_thread_info(ti)	/* nothing */
+#define free_thread_stack(ti)	/* nothing */
 #define task_stack_page(tsk)	((void *)(tsk))
 
 #define __HAVE_THREAD_FUNCTIONS
diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h
index 4861a78c7160..f5f90bbf019d 100644
--- a/arch/mn10300/include/asm/thread_info.h
+++ b/arch/mn10300/include/asm/thread_info.h
@@ -115,7 +115,7 @@ static inline unsigned long current_stack_pointer(void)
 }
 
 #ifndef CONFIG_KGDB
-void arch_release_thread_info(struct thread_info *ti);
+void arch_release_thread_stack(unsigned long *stack);
 #endif
 #define get_thread_info(ti)	get_task_struct((ti)->task)
 #define put_thread_info(ti)	put_task_struct((ti)->task)
diff --git a/arch/mn10300/kernel/kgdb.c b/arch/mn10300/kernel/kgdb.c
index 99770823451a..2d7986c386fe 100644
--- a/arch/mn10300/kernel/kgdb.c
+++ b/arch/mn10300/kernel/kgdb.c
@@ -397,8 +397,9 @@ static bool kgdb_arch_undo_singlestep(struct pt_regs *regs)
  * single-step state is cleared.  At this point the breakpoints should have
  * been removed by __switch_to().
  */
-void arch_release_thread_info(struct thread_info *ti)
+void arch_release_thread_stack(unsigned long *stack)
 {
+	struct thread_info *ti = (void *)stack;
 	if (kgdb_sstep_thread == ti) {
 		kgdb_sstep_thread = NULL;
 
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index 4b7cef9e94e0..c1467ac59ce6 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -78,7 +78,7 @@ struct thread_info {
 
 #ifndef __ASSEMBLY__
 
-void arch_release_thread_info(struct thread_info *info);
+void arch_release_thread_stack(unsigned long *stack);
 
 /* How to get the thread information struct from C. */
 register unsigned long stack_pointer __asm__("sp");
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 6b705ccc9cc1..a465d8372edd 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -73,8 +73,9 @@ void arch_cpu_idle(void)
 /*
  * Release a thread_info structure
  */
-void arch_release_thread_info(struct thread_info *info)
+void arch_release_thread_stack(unsigned long *stack)
 {
+	struct thread_info *info = (void *)stack;
 	struct single_step_state *step_state = info->step_state;
 
 	if (step_state) {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e42ada26345..253538f29ade 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3007,7 +3007,7 @@ static inline int object_is_on_stack(void *obj)
 	return (obj >= stack) && (obj < (stack + THREAD_SIZE));
 }
 
-extern void thread_info_cache_init(void);
+extern void thread_stack_cache_init(void);
 
 #ifdef CONFIG_DEBUG_STACK_USAGE
 static inline unsigned long stack_not_used(struct task_struct *p)
diff --git a/init/main.c b/init/main.c
index 4c17fda5c2ff..826fd57fa3aa 100644
--- a/init/main.c
+++ b/init/main.c
@@ -453,7 +453,7 @@ void __init __weak smp_setup_processor_id(void)
 }
 
 # if THREAD_SIZE >= PAGE_SIZE
-void __init __weak thread_info_cache_init(void)
+void __init __weak thread_stack_cache_init(void)
 {
 }
 #endif
@@ -627,7 +627,7 @@ asmlinkage __visible void __init start_kernel(void)
 	/* Should be run before the first non-init thread is created */
 	init_espfix_bsp();
 #endif
-	thread_info_cache_init();
+	thread_stack_cache_init();
 	cred_init();
 	fork_init();
 	proc_caches_init();
diff --git a/kernel/fork.c b/kernel/fork.c
index 5c2c355aa97f..37b9439b8c07 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -148,18 +148,18 @@ static inline void free_task_struct(struct task_struct *tsk)
 }
 #endif
 
-void __weak arch_release_thread_info(struct thread_info *ti)
+void __weak arch_release_thread_stack(unsigned long *stack)
 {
 }
 
-#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
 
 /*
  * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
  * kmemcache based allocator.
  */
 # if THREAD_SIZE >= PAGE_SIZE
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 						  int node)
 {
 	struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
@@ -172,33 +172,33 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 	return page ? page_address(page) : NULL;
 }
 
-static inline void free_thread_info(struct thread_info *ti)
+static inline void free_thread_stack(unsigned long *stack)
 {
-	struct page *page = virt_to_page(ti);
+	struct page *page = virt_to_page(stack);
 
 	memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
 				    -(1 << THREAD_SIZE_ORDER));
 	__free_kmem_pages(page, THREAD_SIZE_ORDER);
 }
 # else
-static struct kmem_cache *thread_info_cache;
+static struct kmem_cache *thread_stack_cache;
 
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk,
 						  int node)
 {
-	return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }
 
-static void free_thread_info(struct thread_info *ti)
+static void free_stack(unsigned long *stack)
 {
-	kmem_cache_free(thread_info_cache, ti);
+	kmem_cache_free(thread_stack_cache, stack);
 }
 
-void thread_info_cache_init(void)
+void thread_stack_cache_init(void)
 {
-	thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+	thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
 					      THREAD_SIZE, 0, NULL);
-	BUG_ON(thread_info_cache == NULL);
+	BUG_ON(thread_stack_cache == NULL);
 }
 # endif
 #endif
@@ -221,9 +221,9 @@ struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-static void account_kernel_stack(struct thread_info *ti, int account)
+static void account_kernel_stack(unsigned long *stack, int account)
 {
-	struct zone *zone = page_zone(virt_to_page(ti));
+	struct zone *zone = page_zone(virt_to_page(stack));
 
 	mod_zone_page_state(zone, NR_KERNEL_STACK, account);
 }
@@ -231,8 +231,8 @@ static void account_kernel_stack(struct thread_info *ti, int account)
 void free_task(struct task_struct *tsk)
 {
 	account_kernel_stack(tsk->stack, -1);
-	arch_release_thread_info(tsk->stack);
-	free_thread_info(tsk->stack);
+	arch_release_thread_stack(tsk->stack);
+	free_thread_stack(tsk->stack);
 	rt_mutex_debug_task_free(tsk);
 	ftrace_graph_exit_task(tsk);
 	put_seccomp_filter(tsk);
@@ -343,7 +343,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)
 static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
-	struct thread_info *ti;
+	unsigned long *stack;
 	int err;
 
 	if (node == NUMA_NO_NODE)
@@ -352,15 +352,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	if (!tsk)
 		return NULL;
 
-	ti = alloc_thread_info_node(tsk, node);
-	if (!ti)
+	stack = alloc_thread_stack_node(tsk, node);
+	if (!stack)
 		goto free_tsk;
 
 	err = arch_dup_task_struct(tsk, orig);
 	if (err)
-		goto free_ti;
+		goto free_stack;
 
-	tsk->stack = ti;
+	tsk->stack = stack;
 #ifdef CONFIG_SECCOMP
 	/*
 	 * We must handle setting up seccomp filters once we're under
@@ -392,14 +392,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk->task_frag.page = NULL;
 	tsk->wake_q.next = NULL;
 
-	account_kernel_stack(ti, 1);
+	account_kernel_stack(stack, 1);
 
 	kcov_task_init(tsk);
 
 	return tsk;
 
-free_ti:
-	free_thread_info(ti);
+free_stack:
+	free_thread_stack(stack);
 free_tsk:
 	free_task_struct(tsk);
 	return NULL;
-- 
cgit v1.2.3


From 74070542099c66d87aebeacd7b54dc0e8b6a73f9 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 24 Jun 2016 14:50:16 -0700
Subject: oom, suspend: fix oom_reaper vs. oom_killer_disable race

Tetsuo has reported the following potential oom_killer_disable vs.
oom_reaper race:

 (1) freeze_processes() starts freezing user space threads.
 (2) Somebody (maybe a kenrel thread) calls out_of_memory().
 (3) The OOM killer calls mark_oom_victim() on a user space thread
     P1 which is already in __refrigerator().
 (4) oom_killer_disable() sets oom_killer_disabled = true.
 (5) P1 leaves __refrigerator() and enters do_exit().
 (6) The OOM reaper calls exit_oom_victim(P1) before P1 can call
     exit_oom_victim(P1).
 (7) oom_killer_disable() returns while P1 not yet finished
 (8) P1 perform IO/interfere with the freezer.

This situation is unfortunate.  We cannot move oom_killer_disable after
all the freezable kernel threads are frozen because the oom victim might
depend on some of those kthreads to make a forward progress to exit so
we could deadlock.  It is also far from trivial to teach the oom_reaper
to not call exit_oom_victim() because then we would lose a guarantee of
the OOM killer and oom_killer_disable forward progress because
exit_mm->mmput might block and never call exit_oom_victim.

It seems the easiest way forward is to workaround this race by calling
try_to_freeze_tasks again after oom_killer_disable.  This will make sure
that all the tasks are frozen or it bails out.

Fixes: 449d777d7ad6 ("mm, oom_reaper: clear TIF_MEMDIE for all tasks queued for oom_reaper")
Link: http://lkml.kernel.org/r/1466597634-16199-1-git-send-email-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/process.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index df058bed53ce..0c2ee9761d57 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -146,6 +146,18 @@ int freeze_processes(void)
 	if (!error && !oom_killer_disable())
 		error = -EBUSY;
 
+	/*
+	 * There is a hard to fix race between oom_reaper kernel thread
+	 * and oom_killer_disable. oom_reaper calls exit_oom_victim
+	 * before the victim reaches exit_mm so try to freeze all the tasks
+	 * again and catch such a left over task.
+	 */
+	if (!error) {
+		pr_info("Double checking all user space processes after OOM killer disable... ");
+		error = try_to_freeze_tasks(true);
+		pr_cont("\n");
+	}
+
 	if (error)
 		thaw_processes();
 	return error;
-- 
cgit v1.2.3


From 9521d39976db20f8ef9b56af66661482a17d5364 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Sat, 25 Jun 2016 21:53:30 +1000
Subject: Fix build break in fork.c when THREAD_SIZE < PAGE_SIZE

Commit b235beea9e99 ("Clarify naming of thread info/stack allocators")
breaks the build on some powerpc configs, where THREAD_SIZE < PAGE_SIZE:

  kernel/fork.c:235:2: error: implicit declaration of function 'free_thread_stack'
  kernel/fork.c:355:8: error: assignment from incompatible pointer type
    stack = alloc_thread_stack_node(tsk, node);
    ^

Fix it by renaming free_stack() to free_thread_stack(), and updating the
return type of alloc_thread_stack_node().

Fixes: b235beea9e99 ("Clarify naming of thread info/stack allocators")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 37b9439b8c07..4a7ec0c6c88c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -183,13 +183,13 @@ static inline void free_thread_stack(unsigned long *stack)
 # else
 static struct kmem_cache *thread_stack_cache;
 
-static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 						  int node)
 {
 	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
 }
 
-static void free_stack(unsigned long *stack)
+static void free_thread_stack(unsigned long *stack)
 {
 	kmem_cache_free(thread_stack_cache, stack);
 }
-- 
cgit v1.2.3


From 76a658c20efd541a62838d9ff68ce94170d7a549 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Tue, 28 Jun 2016 12:06:58 -0400
Subject: audit: move calcs after alloc and check when logging set loginuid

Move the calculations of values after the allocation in case the
allocation fails.  This avoids wasting effort in the rare case that it
fails, but more importantly saves us extra logic to release the tty
ref.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/auditsc.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 71e14d836e69..33dafa70229d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1985,14 +1985,15 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
 	if (!audit_enabled)
 		return;
 
+	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+	if (!ab)
+		return;
+
 	uid = from_kuid(&init_user_ns, task_uid(current));
 	oldloginuid = from_kuid(&init_user_ns, koldloginuid);
 	loginuid = from_kuid(&init_user_ns, kloginuid),
 	tty = audit_get_tty(current);
 
-	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
-	if (!ab)
-		return;
 	audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
 	audit_log_task_context(ab);
 	audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
-- 
cgit v1.2.3


From 3f5be2da8565c1cce5655bb0948fcc957c6eb6c6 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Tue, 28 Jun 2016 12:07:50 -0400
Subject: audit: move audit_get_tty to reduce scope and kabi changes

The only users of audit_get_tty and audit_put_tty are internal to
audit, so move it out of include/linux/audit.h to kernel.h and create
a proper function rather than inlining it.  This also reduces kABI
changes.

Suggested-by: Paul Moore <pmoore@redhat.com>
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
[PM: line wrapped description]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h | 24 ------------------------
 kernel/audit.c        | 17 +++++++++++++++++
 kernel/audit.h        |  4 ++++
 kernel/auditsc.c      |  1 -
 4 files changed, 21 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 32cdafb312d8..b40ed5df5542 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -26,7 +26,6 @@
 #include <linux/sched.h>
 #include <linux/ptrace.h>
 #include <uapi/linux/audit.h>
-#include <linux/tty.h>
 
 #define AUDIT_INO_UNSET ((unsigned long)-1)
 #define AUDIT_DEV_UNSET ((dev_t)-1)
@@ -344,23 +343,6 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 	return tsk->sessionid;
 }
 
-static inline struct tty_struct *audit_get_tty(struct task_struct *tsk)
-{
-	struct tty_struct *tty = NULL;
-	unsigned long flags;
-
-	spin_lock_irqsave(&tsk->sighand->siglock, flags);
-	if (tsk->signal)
-		tty = tty_kref_get(tsk->signal->tty);
-	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
-	return tty;
-}
-
-static inline void audit_put_tty(struct tty_struct *tty)
-{
-	tty_kref_put(tty);
-}
-
 extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
 extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
 extern void __audit_bprm(struct linux_binprm *bprm);
@@ -518,12 +500,6 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
 {
 	return -1;
 }
-static inline struct tty_struct *audit_get_tty(struct task_struct *tsk)
-{
-	return NULL;
-}
-static inline void audit_put_tty(struct tty_struct *tty)
-{ }
 static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 { }
 static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
diff --git a/kernel/audit.c b/kernel/audit.c
index 384374a1d232..d5971010e44a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1866,6 +1866,23 @@ out_null:
 	audit_log_format(ab, " exe=(null)");
 }
 
+struct tty_struct *audit_get_tty(struct task_struct *tsk)
+{
+	struct tty_struct *tty = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&tsk->sighand->siglock, flags);
+	if (tsk->signal)
+		tty = tty_kref_get(tsk->signal->tty);
+	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+	return tty;
+}
+
+void audit_put_tty(struct tty_struct *tty)
+{
+	tty_kref_put(tty);
+}
+
 void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
 {
 	const struct cred *cred;
diff --git a/kernel/audit.h b/kernel/audit.h
index cbbe6bb6496e..a492f4c4e710 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -23,6 +23,7 @@
 #include <linux/audit.h>
 #include <linux/skbuff.h>
 #include <uapi/linux/mqueue.h>
+#include <linux/tty.h>
 
 /* AUDIT_NAMES is the number of slots we reserve in the audit_context
  * for saving names from getname().  If we get more names we will allocate
@@ -262,6 +263,9 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
 extern void audit_log_d_path_exe(struct audit_buffer *ab,
 				 struct mm_struct *mm);
 
+extern struct tty_struct *audit_get_tty(struct task_struct *tsk);
+extern void audit_put_tty(struct tty_struct *tty);
+
 /* audit watch functions */
 #ifdef CONFIG_AUDIT_WATCH
 extern void audit_put_watch(struct audit_watch *watch);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 33dafa70229d..60a354eed2fa 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -63,7 +63,6 @@
 #include <asm/unistd.h>
 #include <linux/security.h>
 #include <linux/list.h>
-#include <linux/tty.h>
 #include <linux/binfmts.h>
 #include <linux/highmem.h>
 #include <linux/syscalls.h>
-- 
cgit v1.2.3


From ceb56070359b7329b5678b5d95a376fcb24767be Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 27 Jun 2016 21:38:11 +0200
Subject: bpf, perf: delay release of BPF prog after grace period

Commit dead9f29ddcc ("perf: Fix race in BPF program unregister") moved
destruction of BPF program from free_event_rcu() callback to __free_event(),
which is problematic if used with tail calls: if prog A is attached as
trace event directly, but at the same time present in a tail call map used
by another trace event program elsewhere, then we need to delay destruction
via RCU grace period since it can still be in use by the program doing the
tail call (the prog first needs to be dropped from the tail call map, then
trace event with prog A attached destroyed, so we get immediate destruction).

Fixes: dead9f29ddcc ("perf: Fix race in BPF program unregister")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Jann Horn <jann@thejh.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h  | 4 ++++
 kernel/events/core.c | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8269cafc6eb1..0de4de6dd43e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -264,6 +264,10 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 static inline void bpf_prog_put(struct bpf_prog *prog)
 {
 }
+
+static inline void bpf_prog_put_rcu(struct bpf_prog *prog)
+{
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 /* verifier prototypes for helper functions called from eBPF programs */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 274450efea90..d00c47b67a97 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7531,7 +7531,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
 	prog = event->tp_event->prog;
 	if (prog) {
 		event->tp_event->prog = NULL;
-		bpf_prog_put(prog);
+		bpf_prog_put_rcu(prog);
 	}
 }
 
-- 
cgit v1.2.3