From 0461a4149836c792d186027c8c859637a4cfb11a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 9 Dec 2014 21:38:05 +0000 Subject: spi: Pump transfers inside calling context for spi_sync() If we are using the standard SPI message pump (which all drivers should be transitioning over to) then special-case the message enqueue and, instead of starting the worker thread to push messages to the hardware, do so in the context of the caller if the controller is idle. This avoids a context switch in the common case where the controller has a single user in a single thread; for short PIO transfers there may be no need to context switch away from the calling context to complete the transfer. The code is a bit more complex than is desirable, in part due to the need to handle drivers not using the standard queue and in part due to handling the various combinations of bus locking and asynchronous submission in interrupt context. It is still suboptimal: it will still wake the message pump for each transfer in order to schedule idling of the hardware; if multiple contexts are using the controller simultaneously, a caller may end up pumping a message for some other thread rather than for itself; and if the thread ends up deferring because another context is idling the hardware, it will just busy-wait. It can, however, have the benefit of aggregating power up and down of the hardware when a caller performs a series of transfers back to back without any need for the use of spi_async(). Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index a6ef2a8e6de4..4e6db75e9469 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -260,6 +260,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * @pump_messages: work struct for scheduling work to the message pump * @queue_lock: spinlock to syncronise access to message queue * @queue: message queue + * @idling: the device is entering idle state * @cur_msg: the currently in-flight message * @cur_msg_prepared: spi_prepare_message was called for the currently * in-flight message @@ -425,6 +426,7 @@ struct spi_master { spinlock_t queue_lock; struct list_head queue; struct spi_message *cur_msg; + bool idling; bool busy; bool running; bool rt; -- cgit v1.2.3 From e39f2d5956999c05c85814787a113ffadbcd4b26 Mon Sep 17 00:00:00 2001 From: Andrew Duggan Date: Fri, 12 Dec 2014 10:17:26 -0800 Subject: HID: rmi: Scan the report descriptor to determine if the device is suitable for the hid-rmi driver On composite HID devices there may be multiple HID devices on separate interfaces, but hid-rmi should only bind to the touchpad. The previous version simply checked that the interface protocol was set to mouse. Unfortunately, it is not always the case that the touchpad has the mouse interface protocol set. This patch takes a different approach and scans the report descriptor looking for the Generic Desktop Pointer usage and the Vendor Specific Top Level Collection needed by the hid-rmi driver to interface with the device.
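The binding test this introduces reduces to a conjunction of two scan flags gathered while walking the report descriptor. A minimal standalone sketch of that decision (the helper name is hypothetical; the flag values mirror the BIT(1)/BIT(2) definitions added in the diff below):

#include <stdbool.h>
#include <stdint.h>

#define HID_SCAN_FLAG_VENDOR_SPECIFIC (1u << 1)	/* BIT(1) in the patch */
#define HID_SCAN_FLAG_GD_POINTER      (1u << 2)	/* BIT(2) in the patch */

/*
 * Hypothetical helper: hid-rmi should bind only to the interface that
 * exposes both a Generic Desktop Pointer usage and a vendor-specific
 * top level collection; either flag alone is not sufficient.
 */
static bool hid_rmi_suitable(uint32_t scan_flags)
{
	return (scan_flags & HID_SCAN_FLAG_VENDOR_SPECIFIC) &&
	       (scan_flags & HID_SCAN_FLAG_GD_POINTER);
}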
Signed-off-by: Andrew Duggan Reviewed-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 22 +++++++++++++++++----- include/linux/hid.h | 4 +++- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index c3d0ac1a0988..81665b4f2258 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -698,6 +698,7 @@ static void hid_scan_feature_usage(struct hid_parser *parser, u32 usage) static void hid_scan_collection(struct hid_parser *parser, unsigned type) { struct hid_device *hid = parser->device; + int i; if (((parser->global.usage_page << 16) == HID_UP_SENSOR) && type == HID_COLLECTION_PHYSICAL) @@ -707,6 +708,14 @@ static void hid_scan_collection(struct hid_parser *parser, unsigned type) hid->product == USB_DEVICE_ID_MS_TYPE_COVER_3 && hid->group == HID_GROUP_MULTITOUCH) hid->group = HID_GROUP_GENERIC; + + if ((parser->global.usage_page << 16) == HID_UP_GENDESK) + for (i = 0; i < parser->local.usage_index; i++) + if (parser->local.usage[i] == HID_GD_POINTER) + parser->scan_flags |= HID_SCAN_FLAG_GD_POINTER; + + if ((parser->global.usage_page << 16) >= HID_UP_MSVENDOR) + parser->scan_flags |= HID_SCAN_FLAG_VENDOR_SPECIFIC; } static int hid_scan_main(struct hid_parser *parser, struct hid_item *item) @@ -792,11 +801,14 @@ static int hid_scan_report(struct hid_device *hid) hid->group = HID_GROUP_WACOM; break; case USB_VENDOR_ID_SYNAPTICS: - if ((hid->group == HID_GROUP_GENERIC) && - (hid->bus != BUS_USB || hid->type == HID_TYPE_USBMOUSE)) - /* hid-rmi should only bind to the mouse interface of - * composite USB devices */ - hid->group = HID_GROUP_RMI; + if (hid->group == HID_GROUP_GENERIC) + if ((parser->scan_flags & HID_SCAN_FLAG_VENDOR_SPECIFIC) + && (parser->scan_flags & HID_SCAN_FLAG_GD_POINTER)) + /* + * hid-rmi should take care of them, + * not hid-generic + */ + hid->group = HID_GROUP_RMI; break; } diff --git a/include/linux/hid.h b/include/linux/hid.h index 06c4607744f6..efc7787a41a8 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -574,7 +574,9 @@ static inline void hid_set_drvdata(struct hid_device *hdev, void *data) #define HID_GLOBAL_STACK_SIZE 4 #define HID_COLLECTION_STACK_SIZE 4 -#define HID_SCAN_FLAG_MT_WIN_8 0x00000001 +#define HID_SCAN_FLAG_MT_WIN_8 BIT(0) +#define HID_SCAN_FLAG_VENDOR_SPECIFIC BIT(1) +#define HID_SCAN_FLAG_GD_POINTER BIT(2) struct hid_parser { struct hid_global global; -- cgit v1.2.3 From d64cb71bede87dbca60d586a7bb4cef87fbe2731 Mon Sep 17 00:00:00 2001 From: Jaewon Kim Date: Wed, 17 Dec 2014 10:31:08 -0800 Subject: Input: add regulator haptic driver This change adds support for a haptic device driver controlled by the voltage of a regulator. Userspace can control the device via the force feedback interface of the input framework.
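For context, this is roughly how userspace drives such a device through the memless force feedback interface; the event node path and magnitude are assumptions for illustration:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/input.h>

int main(void)
{
	struct ff_effect effect;
	struct input_event play;
	int fd;

	fd = open("/dev/input/event0", O_RDWR);	/* assumed device node */
	if (fd < 0)
		return 1;

	/* upload a rumble effect; the driver maps its magnitude to voltage */
	memset(&effect, 0, sizeof(effect));
	effect.type = FF_RUMBLE;
	effect.id = -1;				/* let the kernel pick an id */
	effect.u.rumble.strong_magnitude = 0x8000;
	effect.replay.length = 1000;		/* milliseconds */
	if (ioctl(fd, EVIOCSFF, &effect) < 0)
		return 1;

	/* start playback by writing an EV_FF event for the uploaded id */
	memset(&play, 0, sizeof(play));
	play.type = EV_FF;
	play.code = effect.id;
	play.value = 1;				/* 1 = start, 0 = stop */
	if (write(fd, &play, sizeof(play)) != sizeof(play))
		return 1;

	sleep(1);
	close(fd);
	return 0;
}

The driver scales the magnitude into the configured voltage range as (max_volt - min_volt) * magnitude >> 16 (see MAX_MAGNITUDE_SHIFT below), so the 0x8000 used here lands roughly halfway between min-microvolt and max-microvolt.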
Signed-off-by: Jaewon Kim Signed-off-by: Hyunhee Kim Acked-by: Kyungmin Park Tested-by: Chanwoo Choi Reviewed-by: Chanwoo Choi Reviewed-by: Pankaj Dubey Signed-off-by: Dmitry Torokhov --- .../devicetree/bindings/input/regulator-haptic.txt | 21 ++ drivers/input/misc/Kconfig | 12 + drivers/input/misc/Makefile | 1 + drivers/input/misc/regulator-haptic.c | 279 +++++++++++++++++++++ include/linux/platform_data/regulator-haptic.h | 29 +++ 5 files changed, 342 insertions(+) create mode 100644 Documentation/devicetree/bindings/input/regulator-haptic.txt create mode 100644 drivers/input/misc/regulator-haptic.c create mode 100644 include/linux/platform_data/regulator-haptic.h (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/input/regulator-haptic.txt b/Documentation/devicetree/bindings/input/regulator-haptic.txt new file mode 100644 index 000000000000..3ed1c7eb2f97 --- /dev/null +++ b/Documentation/devicetree/bindings/input/regulator-haptic.txt @@ -0,0 +1,21 @@ +* Regulator Haptic Device Tree Bindings + +Required Properties: + - compatible : Should be "regulator-haptic" + - haptic-supply : Power supply to the haptic motor. + [*] refer Documentation/devicetree/bindings/regulator/regulator.txt + + - max-microvolt : The maximum voltage value supplied to the haptic motor. + [The unit of the voltage is a micro] + + - min-microvolt : The minimum voltage value supplied to the haptic motor. + [The unit of the voltage is a micro] + +Example: + + haptics { + compatible = "regulator-haptic"; + haptic-supply = <&motor_regulator>; + max-microvolt = <2700000>; + min-microvolt = <1100000>; + }; diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig index 23297ab6163f..1da0a20c42ea 100644 --- a/drivers/input/misc/Kconfig +++ b/drivers/input/misc/Kconfig @@ -394,6 +394,18 @@ config INPUT_CM109 To compile this driver as a module, choose M here: the module will be called cm109. +config INPUT_REGULATOR_HAPTIC + tristate "Regulator haptics support" + depends on REGULATOR + select INPUT_FF_MEMLESS + help + This option enables device driver support for the haptic controlled + by a regulator. This driver supports ff-memless interface + from input framework. + + To compile this driver as a module, choose M here: the + module will be called regulator-haptic. + config INPUT_RETU_PWRBUTTON tristate "Retu Power button Driver" depends on MFD_RETU diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile index 19c760361f80..1f135af4af01 100644 --- a/drivers/input/misc/Makefile +++ b/drivers/input/misc/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_INPUT_PMIC8XXX_PWRKEY) += pmic8xxx-pwrkey.o obj-$(CONFIG_INPUT_POWERMATE) += powermate.o obj-$(CONFIG_INPUT_PWM_BEEPER) += pwm-beeper.o obj-$(CONFIG_INPUT_RB532_BUTTON) += rb532_button.o +obj-$(CONFIG_INPUT_REGULATOR_HAPTIC) += regulator-haptic.o obj-$(CONFIG_INPUT_RETU_PWRBUTTON) += retu-pwrbutton.o obj-$(CONFIG_INPUT_GPIO_ROTARY_ENCODER) += rotary_encoder.o obj-$(CONFIG_INPUT_SGI_BTNS) += sgi_btns.o diff --git a/drivers/input/misc/regulator-haptic.c b/drivers/input/misc/regulator-haptic.c new file mode 100644 index 000000000000..942622189bee --- /dev/null +++ b/drivers/input/misc/regulator-haptic.c @@ -0,0 +1,279 @@ +/* + * Regulator haptic driver + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
+ * Author: Jaewon Kim + * Author: Hyunhee Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define MAX_MAGNITUDE_SHIFT 16 + +struct regulator_haptic { + struct device *dev; + struct input_dev *input_dev; + struct regulator *regulator; + + struct work_struct work; + struct mutex mutex; + + bool active; + bool suspended; + + unsigned int max_volt; + unsigned int min_volt; + unsigned int magnitude; +}; + +static int regulator_haptic_toggle(struct regulator_haptic *haptic, bool on) +{ + int error; + + if (haptic->active != on) { + + error = on ? regulator_enable(haptic->regulator) : + regulator_disable(haptic->regulator); + if (error) { + dev_err(haptic->dev, + "failed to switch regulator %s: %d\n", + on ? "on" : "off", error); + return error; + } + + haptic->active = on; + } + + return 0; +} + +static int regulator_haptic_set_voltage(struct regulator_haptic *haptic, + unsigned int magnitude) +{ + u64 volt_mag_multi; + unsigned int intensity; + int error; + + volt_mag_multi = (u64)(haptic->max_volt - haptic->min_volt) * magnitude; + intensity = (unsigned int)(volt_mag_multi >> MAX_MAGNITUDE_SHIFT); + + error = regulator_set_voltage(haptic->regulator, + intensity + haptic->min_volt, + haptic->max_volt); + if (error) { + dev_err(haptic->dev, "cannot set regulator voltage to %d: %d\n", + intensity + haptic->min_volt, error); + return error; + } + + return 0; +} + +static void regulator_haptic_work(struct work_struct *work) +{ + struct regulator_haptic *haptic = container_of(work, + struct regulator_haptic, work); + unsigned int magnitude; + int error; + + mutex_lock(&haptic->mutex); + + if (haptic->suspended) + goto out; + + magnitude = ACCESS_ONCE(haptic->magnitude); + + error = regulator_haptic_set_voltage(haptic, magnitude); + if (error) + goto out; + + regulator_haptic_toggle(haptic, magnitude != 0); + +out: + mutex_unlock(&haptic->mutex); +} + +static int regulator_haptic_play_effect(struct input_dev *input, void *data, + struct ff_effect *effect) +{ + struct regulator_haptic *haptic = input_get_drvdata(input); + + haptic->magnitude = effect->u.rumble.strong_magnitude; + if (!haptic->magnitude) + haptic->magnitude = effect->u.rumble.weak_magnitude; + + schedule_work(&haptic->work); + + return 0; +} + +static void regulator_haptic_close(struct input_dev *input) +{ + struct regulator_haptic *haptic = input_get_drvdata(input); + + cancel_work_sync(&haptic->work); + regulator_haptic_set_voltage(haptic, 0); + regulator_haptic_toggle(haptic, false); +} + +static int __maybe_unused +regulator_haptic_parse_dt(struct device *dev, struct regulator_haptic *haptic) +{ + struct device_node *node; + int error; + + node = dev->of_node; + if(!node) { + dev_err(dev, "Missing dveice tree data\n"); + return -EINVAL; + } + + error = of_property_read_u32(node, "max-microvolt", &haptic->max_volt); + if (error) { + dev_err(dev, "cannot parse max-microvolt\n"); + return error; + } + + error = of_property_read_u32(node, "min-microvolt", &haptic->min_volt); + if (error) { + dev_err(dev, "cannot parse min-microvolt\n"); + return error; + } + + return 0; +} + +static int regulator_haptic_probe(struct platform_device *pdev) +{ + const struct regulator_haptic_data *pdata = dev_get_platdata(&pdev->dev); + struct regulator_haptic *haptic; + struct input_dev *input_dev; + int error; + + haptic = 
devm_kzalloc(&pdev->dev, sizeof(*haptic), GFP_KERNEL); + if (!haptic) + return -ENOMEM; + + platform_set_drvdata(pdev, haptic); + haptic->dev = &pdev->dev; + mutex_init(&haptic->mutex); + INIT_WORK(&haptic->work, regulator_haptic_work); + + if (pdata) { + haptic->max_volt = pdata->max_volt; + haptic->min_volt = pdata->min_volt; + } else if (IS_ENABLED(CONFIG_OF)) { + error = regulator_haptic_parse_dt(&pdev->dev, haptic); + if (error) + return error; + } else { + dev_err(&pdev->dev, "Missing platform data\n"); + return -EINVAL; + } + + haptic->regulator = devm_regulator_get_exclusive(&pdev->dev, "haptic"); + if (IS_ERR(haptic->regulator)) { + dev_err(&pdev->dev, "failed to get regulator\n"); + return PTR_ERR(haptic->regulator); + } + + input_dev = devm_input_allocate_device(&pdev->dev); + if (!input_dev) + return -ENOMEM; + + haptic->input_dev = input_dev; + haptic->input_dev->name = "regulator-haptic"; + haptic->input_dev->dev.parent = &pdev->dev; + haptic->input_dev->close = regulator_haptic_close; + input_set_drvdata(haptic->input_dev, haptic); + input_set_capability(haptic->input_dev, EV_FF, FF_RUMBLE); + + error = input_ff_create_memless(input_dev, NULL, + regulator_haptic_play_effect); + if (error) { + dev_err(&pdev->dev, "failed to create force-feedback\n"); + return error; + } + + error = input_register_device(haptic->input_dev); + if (error) { + dev_err(&pdev->dev, "failed to register input device\n"); + return error; + } + + return 0; +} + +static int __maybe_unused regulator_haptic_suspend(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct regulator_haptic *haptic = platform_get_drvdata(pdev); + int error; + + error = mutex_lock_interruptible(&haptic->mutex); + if (error) + return error; + + regulator_haptic_set_voltage(haptic, 0); + regulator_haptic_toggle(haptic, false); + + haptic->suspended = true; + + mutex_unlock(&haptic->mutex); + + return 0; +} + +static int __maybe_unused regulator_haptic_resume(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct regulator_haptic *haptic = platform_get_drvdata(pdev); + unsigned int magnitude; + + mutex_lock(&haptic->mutex); + + haptic->suspended = false; + + magnitude = ACCESS_ONCE(haptic->magnitude); + if (magnitude) { + regulator_haptic_set_voltage(haptic, magnitude); + regulator_haptic_toggle(haptic, true); + } + + mutex_unlock(&haptic->mutex); + + return 0; +} + +static SIMPLE_DEV_PM_OPS(regulator_haptic_pm_ops, + regulator_haptic_suspend, regulator_haptic_resume); + +static struct of_device_id regulator_haptic_dt_match[] = { + { .compatible = "regulator-haptic" }, + { /* sentinel */ }, +}; + +static struct platform_driver regulator_haptic_driver = { + .probe = regulator_haptic_probe, + .driver = { + .name = "regulator-haptic", + .of_match_table = regulator_haptic_dt_match, + .pm = ®ulator_haptic_pm_ops, + }, +}; +module_platform_driver(regulator_haptic_driver); + +MODULE_AUTHOR("Jaewon Kim "); +MODULE_AUTHOR("Hyunhee Kim "); +MODULE_DESCRIPTION("Regulator haptic driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/platform_data/regulator-haptic.h b/include/linux/platform_data/regulator-haptic.h new file mode 100644 index 000000000000..5658e58e0738 --- /dev/null +++ b/include/linux/platform_data/regulator-haptic.h @@ -0,0 +1,29 @@ +/* + * Regulator Haptic Platform Data + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
+ * Author: Jaewon Kim + * Author: Hyunhee Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _REGULATOR_HAPTIC_H +#define _REGULATOR_HAPTIC_H + +/* + * struct regulator_haptic_data - Platform device data + * + * @max_volt: maximum voltage value supplied to the haptic motor. + * + * @min_volt: minimum voltage value supplied to the haptic motor. + * + */ +struct regulator_haptic_data { + unsigned int max_volt; + unsigned int min_volt; +}; + +#endif /* _REGULATOR_HAPTIC_H */ -- cgit v1.2.3 From c5f4546593e9911800f0926c1090959b58bc5c93 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Tue, 16 Dec 2014 11:58:18 -0600 Subject: livepatch: kernel: add TAINT_LIVEPATCH This adds a new taint flag to indicate when the kernel or a kernel module has been live patched. This will provide a clean indication in bug reports that live patching was used. Additionally, if the crash occurs in a live patched function, the live patch module will appear beside the patched function in the backtrace. Signed-off-by: Seth Jennings Acked-by: Josh Poimboeuf Reviewed-by: Miroslav Benes Reviewed-by: Petr Mladek Reviewed-by: Masami Hiramatsu Signed-off-by: Jiri Kosina --- Documentation/oops-tracing.txt | 2 ++ Documentation/sysctl/kernel.txt | 1 + include/linux/kernel.h | 1 + kernel/panic.c | 2 ++ 4 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index beefb9f82902..f3ac05cc23e4 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt @@ -270,6 +270,8 @@ characters, each representing a particular tainted value. 15: 'L' if a soft lockup has previously occurred on the system. + 16: 'K' if the kernel has been live patched. + The primary reason for the 'Tainted: ' string is to tell kernel debuggers if this is a clean kernel or if anything unusual has occurred. Tainting is permanent: even if an offending module is diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 75511efefc64..83ab25660fc9 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -843,6 +843,7 @@ can be ORed together: 8192 - An unsigned module has been loaded in a kernel supporting module signature. 16384 - A soft lockup has previously occurred on the system. +32768 - The kernel has been live patched. ============================================================== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5449d2f4a1ef..d03e3deee091 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -471,6 +471,7 @@ extern enum system_states { #define TAINT_OOT_MODULE 12 #define TAINT_UNSIGNED_MODULE 13 #define TAINT_SOFTLOCKUP 14 +#define TAINT_LIVEPATCH 15 extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] diff --git a/kernel/panic.c b/kernel/panic.c index 4d8d6f906dec..8136ad76e5fd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -226,6 +226,7 @@ static const struct tnt tnts[] = { { TAINT_OOT_MODULE, 'O', ' ' }, { TAINT_UNSIGNED_MODULE, 'E', ' ' }, { TAINT_SOFTLOCKUP, 'L', ' ' }, + { TAINT_LIVEPATCH, 'K', ' ' }, }; /** @@ -246,6 +247,7 @@ static const struct tnt tnts[] = { * 'O' - Out-of-tree module has been loaded. * 'E' - Unsigned module has been loaded. * 'L' - A soft lockup has previously occurred. + * 'K' - Kernel has been live patched. 
* * The string is overwritten by the next call to print_tainted(). */ -- cgit v1.2.3 From b700e7f03df5d92f85fa5247fe1f557528d3363d Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Tue, 16 Dec 2014 11:58:19 -0600 Subject: livepatch: kernel: add support for live patching This commit introduces code for the live patching core. It implements an ftrace-based mechanism and kernel interface for doing live patching of kernel and kernel module functions. It represents the greatest common functionality set between kpatch and kgraft and can accept patches built using either method. This first version does not implement any consistency mechanism that ensures that old and new code do not run together. In practice, ~90% of CVEs are safe to apply in this way, since they simply add a conditional check. However, any function change that can not execute safely with the old version of the function can _not_ be safely applied in this version. [ jkosina@suse.cz: due to the number of contributions that got folded into this original patch from Seth Jennings, add SUSE's copyright as well, as discussed via e-mail ] Signed-off-by: Seth Jennings Signed-off-by: Josh Poimboeuf Reviewed-by: Miroslav Benes Reviewed-by: Petr Mladek Reviewed-by: Masami Hiramatsu Signed-off-by: Miroslav Benes Signed-off-by: Petr Mladek Signed-off-by: Jiri Kosina --- Documentation/ABI/testing/sysfs-kernel-livepatch | 44 ++ MAINTAINERS | 13 + arch/x86/Kconfig | 3 + arch/x86/include/asm/livepatch.h | 37 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/livepatch.c | 90 +++ include/linux/livepatch.h | 133 ++++ kernel/Makefile | 1 + kernel/livepatch/Kconfig | 18 + kernel/livepatch/Makefile | 3 + kernel/livepatch/core.c | 930 +++++++++++++++++++++++ 11 files changed, 1273 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-livepatch create mode 100644 arch/x86/include/asm/livepatch.h create mode 100644 arch/x86/kernel/livepatch.c create mode 100644 include/linux/livepatch.h create mode 100644 kernel/livepatch/Kconfig create mode 100644 kernel/livepatch/Makefile create mode 100644 kernel/livepatch/core.c (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-kernel-livepatch b/Documentation/ABI/testing/sysfs-kernel-livepatch new file mode 100644 index 000000000000..5bf42a840b22 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-livepatch @@ -0,0 +1,44 @@ +What: /sys/kernel/livepatch +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + Interface for kernel live patching + + The /sys/kernel/livepatch directory contains subdirectories for + each loaded live patch module. + +What: /sys/kernel/livepatch/ +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + The patch directory contains subdirectories for each kernel + object (vmlinux or a module) in which it patched functions. + +What: /sys/kernel/livepatch//enabled +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + A writable attribute that indicates whether the patched + code is currently applied. Writing 0 will disable the patch + while writing 1 will re-enable the patch. + +What: /sys/kernel/livepatch// +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + The object directory contains subdirectories for each function + that is patched within the object. 
+ +What: /sys/kernel/livepatch/// +Date: Nov 2014 +KernelVersion: 3.19.0 +Contact: live-patching@vger.kernel.org +Description: + The function directory contains attributes regarding the + properties and state of the patched function. + + There are currently no such attributes. diff --git a/MAINTAINERS b/MAINTAINERS index ddb9ac8d32b3..df6a0784b466 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5784,6 +5784,19 @@ F: Documentation/misc-devices/lis3lv02d F: drivers/misc/lis3lv02d/ F: drivers/platform/x86/hp_accel.c +LIVE PATCHING +M: Josh Poimboeuf +M: Seth Jennings +M: Jiri Kosina +M: Vojtech Pavlik +S: Maintained +F: kernel/livepatch/ +F: include/linux/livepatch.h +F: arch/x86/include/asm/livepatch.h +F: arch/x86/kernel/livepatch.c +F: Documentation/ABI/testing/sysfs-kernel-livepatch +L: live-patching@vger.kernel.org + LLC (802.2) M: Arnaldo Carvalho de Melo S: Maintained diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba397bde7948..460b31b79938 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,6 +17,7 @@ config X86_64 depends on 64BIT select X86_DEV_DMA_OPS select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_HAVE_LIVE_PATCHING ### Arch settings config X86 @@ -2008,6 +2009,8 @@ config CMDLINE_OVERRIDE This is used to work around broken boot loaders. This should be set to 'N' under normal conditions. +source "kernel/livepatch/Kconfig" + endmenu config ARCH_ENABLE_MEMORY_HOTPLUG diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h new file mode 100644 index 000000000000..d529db1b1edf --- /dev/null +++ b/arch/x86/include/asm/livepatch.h @@ -0,0 +1,37 @@ +/* + * livepatch.h - x86-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . 
+ */ + +#ifndef _ASM_X86_LIVEPATCH_H +#define _ASM_X86_LIVEPATCH_H + +#include + +#ifdef CONFIG_LIVE_PATCHING +#ifndef CC_USING_FENTRY +#error Your compiler must support -mfentry for live patching to work +#endif +extern int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value); + +#else +#error Live patching support is disabled; check CONFIG_LIVE_PATCHING +#endif + +#endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5d4502c8b983..316b34e74c15 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_LIVE_PATCHING) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/arch/x86/kernel/livepatch.c b/arch/x86/kernel/livepatch.c new file mode 100644 index 000000000000..ff3c3101d003 --- /dev/null +++ b/arch/x86/kernel/livepatch.c @@ -0,0 +1,90 @@ +/* + * livepatch.c - x86-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include + +/** + * klp_write_module_reloc() - write a relocation in a module + * @mod: module in which the section to be modified is found + * @type: ELF relocation type (see asm/elf.h) + * @loc: address that the relocation should be written to + * @value: relocation value (sym address + addend) + * + * This function writes a relocation to the specified location for + * a particular module. + */ +int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value) +{ + int ret, numpages, size = 4; + bool readonly; + unsigned long val; + unsigned long core = (unsigned long)mod->module_core; + unsigned long core_ro_size = mod->core_ro_size; + unsigned long core_size = mod->core_size; + + switch (type) { + case R_X86_64_NONE: + return 0; + case R_X86_64_64: + val = value; + size = 8; + break; + case R_X86_64_32: + val = (u32)value; + break; + case R_X86_64_32S: + val = (s32)value; + break; + case R_X86_64_PC32: + val = (u32)(value - loc); + break; + default: + /* unsupported relocation type */ + return -EINVAL; + } + + if (loc < core || loc >= core + core_size) + /* loc does not point to any symbol inside the module */ + return -EINVAL; + + if (loc < core + core_ro_size) + readonly = true; + else + readonly = false; + + /* determine if the relocation spans a page boundary */ + numpages = ((loc & PAGE_MASK) == ((loc + size) & PAGE_MASK)) ? 
1 : 2; + + if (readonly) + set_memory_rw(loc & PAGE_MASK, numpages); + + ret = probe_kernel_write((void *)loc, &val, size); + + if (readonly) + set_memory_ro(loc & PAGE_MASK, numpages); + + return ret; +} diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h new file mode 100644 index 000000000000..950bc615842f --- /dev/null +++ b/include/linux/livepatch.h @@ -0,0 +1,133 @@ +/* + * livepatch.h - Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _LINUX_LIVEPATCH_H_ +#define _LINUX_LIVEPATCH_H_ + +#include +#include + +#if IS_ENABLED(CONFIG_LIVE_PATCHING) + +#include + +enum klp_state { + KLP_DISABLED, + KLP_ENABLED +}; + +/** + * struct klp_func - function structure for live patching + * @old_name: name of the function to be patched + * @new_func: pointer to the patched function code + * @old_addr: a hint conveying at what address the old function + * can be found (optional, vmlinux patches only) + * @kobj: kobject for sysfs resources + * @fops: ftrace operations structure + * @state: tracks function-level patch application state + */ +struct klp_func { + /* external */ + const char *old_name; + void *new_func; + /* + * The old_addr field is optional and can be used to resolve + * duplicate symbol names in the vmlinux object. If this + * information is not present, the symbol is located by name + * with kallsyms. If the name is not unique and old_addr is + * not provided, the patch application fails as there is no + * way to resolve the ambiguity. 
+ */ + unsigned long old_addr; + + /* internal */ + struct kobject kobj; + struct ftrace_ops *fops; + enum klp_state state; +}; + +/** + * struct klp_reloc - relocation structure for live patching + * @loc: address where the relocation will be written + * @val: address of the referenced symbol (optional, + * vmlinux patches only) + * @type: ELF relocation type + * @name: name of the referenced symbol (for lookup/verification) + * @addend: offset from the referenced symbol + * @external: symbol is either exported or within the live patch module itself + */ +struct klp_reloc { + unsigned long loc; + unsigned long val; + unsigned long type; + const char *name; + int addend; + int external; +}; + +/** + * struct klp_object - kernel object structure for live patching + * @name: module name (or NULL for vmlinux) + * @relocs: relocation entries to be applied at load time + * @funcs: function entries for functions to be patched in the object + * @kobj: kobject for sysfs resources + * @mod: kernel module associated with the patched object + * (NULL for vmlinux) + * @state: tracks object-level patch application state + */ +struct klp_object { + /* external */ + const char *name; + struct klp_reloc *relocs; + struct klp_func *funcs; + + /* internal */ + struct kobject *kobj; + struct module *mod; + enum klp_state state; +}; + +/** + * struct klp_patch - patch structure for live patching + * @mod: reference to the live patch module + * @objs: object entries for kernel objects to be patched + * @list: list node for global list of registered patches + * @kobj: kobject for sysfs resources + * @state: tracks patch-level application state + */ +struct klp_patch { + /* external */ + struct module *mod; + struct klp_object *objs; + + /* internal */ + struct list_head list; + struct kobject kobj; + enum klp_state state; +}; + +extern int klp_register_patch(struct klp_patch *); +extern int klp_unregister_patch(struct klp_patch *); +extern int klp_enable_patch(struct klp_patch *); +extern int klp_disable_patch(struct klp_patch *); + +#endif /* CONFIG_LIVE_PATCHING */ + +#endif /* _LINUX_LIVEPATCH_H_ */ diff --git a/kernel/Makefile b/kernel/Makefile index a59481a3fa6c..616994f0a76f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -26,6 +26,7 @@ obj-y += power/ obj-y += printk/ obj-y += irq/ obj-y += rcu/ +obj-y += livepatch/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig new file mode 100644 index 000000000000..96da00fbc120 --- /dev/null +++ b/kernel/livepatch/Kconfig @@ -0,0 +1,18 @@ +config ARCH_HAVE_LIVE_PATCHING + boolean + help + Arch supports kernel live patching + +config LIVE_PATCHING + boolean "Kernel Live Patching" + depends on DYNAMIC_FTRACE_WITH_REGS + depends on MODULES + depends on SYSFS + depends on KALLSYMS_ALL + depends on ARCH_HAVE_LIVE_PATCHING + help + Say Y here if you want to support kernel live patching. + This option has no runtime impact until a kernel "patch" + module uses the interface provided by this option to register + a patch, causing calls to patched functions to be redirected + to new function code contained in the patch module. 
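To make the registration interface concrete, here is a minimal sketch of a patch module written against the API above (klp_register_patch()/klp_enable_patch()); the target function cmdline_proc_show and the replacement body are illustrative assumptions for a vmlinux patch:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>
#include <linux/livepatch.h>

/* assumed replacement for the vmlinux function cmdline_proc_show() */
static int livepatch_cmdline_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", "this has been live patched");
	return 0;
}

static struct klp_func funcs[] = {
	{
		.old_name = "cmdline_proc_show",
		.new_func = livepatch_cmdline_proc_show,
	}, { }	/* terminator; core.c iterates while old_name is set */
};

static struct klp_object objs[] = {
	{
		/* .name left NULL means the patched object is vmlinux */
		.funcs = funcs,
	}, { }
};

static struct klp_patch patch = {
	.mod = THIS_MODULE,
	.objs = objs,
};

static int livepatch_init(void)
{
	int ret;

	ret = klp_register_patch(&patch);
	if (ret)
		return ret;

	ret = klp_enable_patch(&patch);
	if (ret) {
		WARN_ON(klp_unregister_patch(&patch));
		return ret;
	}

	return 0;
}

static void livepatch_exit(void)
{
	WARN_ON(klp_disable_patch(&patch));
	WARN_ON(klp_unregister_patch(&patch));
}

module_init(livepatch_init);
module_exit(livepatch_exit);
MODULE_LICENSE("GPL");

Note that klp_register_patch() pins the patch module with try_module_get(), so it cannot be unloaded while registered; once registered, the enabled attribute under /sys/kernel/livepatch toggles the patch between KLP_ENABLED and KLP_DISABLED.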
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile new file mode 100644 index 000000000000..7c1f00861428 --- /dev/null +++ b/kernel/livepatch/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LIVE_PATCHING) += livepatch.o + +livepatch-objs := core.o diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c new file mode 100644 index 000000000000..f99fe189d596 --- /dev/null +++ b/kernel/livepatch/core.c @@ -0,0 +1,930 @@ +/* + * core.c - Kernel Live Patching Core + * + * Copyright (C) 2014 Seth Jennings + * Copyright (C) 2014 SUSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The klp_mutex protects the klp_patches list and state transitions of any + * structure reachable from the patches list. References to any structure must + * be obtained under mutex protection. + */ + +static DEFINE_MUTEX(klp_mutex); +static LIST_HEAD(klp_patches); + +static struct kobject *klp_root_kobj; + +static bool klp_is_module(struct klp_object *obj) +{ + return obj->name; +} + +static bool klp_is_object_loaded(struct klp_object *obj) +{ + return !obj->name || obj->mod; +} + +/* sets obj->mod if object is not vmlinux and module is found */ +static void klp_find_object_module(struct klp_object *obj) +{ + if (!klp_is_module(obj)) + return; + + mutex_lock(&module_mutex); + /* + * We don't need to take a reference on the module here because we have + * the klp_mutex, which is also taken by the module notifier. This + * prevents any module from unloading until we release the klp_mutex. + */ + obj->mod = find_module(obj->name); + mutex_unlock(&module_mutex); +} + +/* klp_mutex must be held by caller */ +static bool klp_is_patch_registered(struct klp_patch *patch) +{ + struct klp_patch *mypatch; + + list_for_each_entry(mypatch, &klp_patches, list) + if (mypatch == patch) + return true; + + return false; +} + +static bool klp_initialized(void) +{ + return klp_root_kobj; +} + +struct klp_find_arg { + const char *objname; + const char *name; + unsigned long addr; + /* + * If count == 0, the symbol was not found. If count == 1, a unique + * match was found and addr is set. If count > 1, there is + * unresolvable ambiguity among "count" number of symbols with the same + * name in the same object. + */ + unsigned long count; +}; + +static int klp_find_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct klp_find_arg *args = data; + + if ((mod && !args->objname) || (!mod && args->objname)) + return 0; + + if (strcmp(args->name, name)) + return 0; + + if (args->objname && strcmp(args->objname, mod->name)) + return 0; + + /* + * args->addr might be overwritten if another match is found + * but klp_find_object_symbol() handles this and only returns the + * addr if count == 1. 
+ */ + args->addr = addr; + args->count++; + + return 0; +} + +static int klp_find_object_symbol(const char *objname, const char *name, + unsigned long *addr) +{ + struct klp_find_arg args = { + .objname = objname, + .name = name, + .addr = 0, + .count = 0 + }; + + kallsyms_on_each_symbol(klp_find_callback, &args); + + if (args.count == 0) + pr_err("symbol '%s' not found in symbol table\n", name); + else if (args.count > 1) + pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", + args.count, name, objname); + else { + *addr = args.addr; + return 0; + } + + *addr = 0; + return -EINVAL; +} + +struct klp_verify_args { + const char *name; + const unsigned long addr; +}; + +static int klp_verify_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct klp_verify_args *args = data; + + if (!mod && + !strcmp(args->name, name) && + args->addr == addr) + return 1; + + return 0; +} + +static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) +{ + struct klp_verify_args args = { + .name = name, + .addr = addr, + }; + + if (kallsyms_on_each_symbol(klp_verify_callback, &args)) + return 0; + + pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?", + name, addr); + return -EINVAL; +} + +static int klp_find_verify_func_addr(struct klp_object *obj, + struct klp_func *func) +{ + int ret; + +#if defined(CONFIG_RANDOMIZE_BASE) + /* KASLR is enabled, disregard old_addr from user */ + func->old_addr = 0; +#endif + + if (!func->old_addr || klp_is_module(obj)) + ret = klp_find_object_symbol(obj->name, func->old_name, + &func->old_addr); + else + ret = klp_verify_vmlinux_symbol(func->old_name, + func->old_addr); + + return ret; +} + +/* + * external symbols are located outside the parent object (where the parent + * object is either vmlinux or the kmod being patched). 
+ */ +static int klp_find_external_symbol(struct module *pmod, const char *name, + unsigned long *addr) +{ + const struct kernel_symbol *sym; + + /* first, check if it's an exported symbol */ + preempt_disable(); + sym = find_symbol(name, NULL, NULL, true, true); + preempt_enable(); + if (sym) { + *addr = sym->value; + return 0; + } + + /* otherwise check if it's in another .o within the patch module */ + return klp_find_object_symbol(pmod->name, name, addr); +} + +static int klp_write_object_relocations(struct module *pmod, + struct klp_object *obj) +{ + int ret; + struct klp_reloc *reloc; + + if (WARN_ON(!klp_is_object_loaded(obj))) + return -EINVAL; + + if (WARN_ON(!obj->relocs)) + return -EINVAL; + + for (reloc = obj->relocs; reloc->name; reloc++) { + if (!klp_is_module(obj)) { + ret = klp_verify_vmlinux_symbol(reloc->name, + reloc->val); + if (ret) + return ret; + } else { + /* module, reloc->val needs to be discovered */ + if (reloc->external) + ret = klp_find_external_symbol(pmod, + reloc->name, + &reloc->val); + else + ret = klp_find_object_symbol(obj->mod->name, + reloc->name, + &reloc->val); + if (ret) + return ret; + } + ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, + reloc->val + reloc->addend); + if (ret) { + pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", + reloc->name, reloc->val, ret); + return ret; + } + } + + return 0; +} + +static void notrace klp_ftrace_handler(unsigned long ip, + unsigned long parent_ip, + struct ftrace_ops *ops, + struct pt_regs *regs) +{ + struct klp_func *func = ops->private; + + regs->ip = (unsigned long)func->new_func; +} + +static int klp_disable_func(struct klp_func *func) +{ + int ret; + + if (WARN_ON(func->state != KLP_ENABLED)) + return -EINVAL; + + if (WARN_ON(!func->old_addr)) + return -EINVAL; + + ret = unregister_ftrace_function(func->fops); + if (ret) { + pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + return ret; + } + + ret = ftrace_set_filter_ip(func->fops, func->old_addr, 1, 0); + if (ret) + pr_warn("function unregister succeeded but failed to clear the filter\n"); + + func->state = KLP_DISABLED; + + return 0; +} + +static int klp_enable_func(struct klp_func *func) +{ + int ret; + + if (WARN_ON(!func->old_addr)) + return -EINVAL; + + if (WARN_ON(func->state != KLP_DISABLED)) + return -EINVAL; + + ret = ftrace_set_filter_ip(func->fops, func->old_addr, 0, 0); + if (ret) { + pr_err("failed to set ftrace filter for function '%s' (%d)\n", + func->old_name, ret); + return ret; + } + + ret = register_ftrace_function(func->fops); + if (ret) { + pr_err("failed to register ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + ftrace_set_filter_ip(func->fops, func->old_addr, 1, 0); + } else { + func->state = KLP_ENABLED; + } + + return ret; +} + +static int klp_disable_object(struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + for (func = obj->funcs; func->old_name; func++) { + if (func->state != KLP_ENABLED) + continue; + + ret = klp_disable_func(func); + if (ret) + return ret; + } + + obj->state = KLP_DISABLED; + + return 0; +} + +static int klp_enable_object(struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + if (WARN_ON(obj->state != KLP_DISABLED)) + return -EINVAL; + + if (WARN_ON(!klp_is_object_loaded(obj))) + return -EINVAL; + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_enable_func(func); + if (ret) + goto unregister; + } + obj->state = KLP_ENABLED; + + return 0; + +unregister: + 
WARN_ON(klp_disable_object(obj)); + return ret; +} + +static int __klp_disable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + pr_notice("disabling patch '%s'\n", patch->mod->name); + + for (obj = patch->objs; obj->funcs; obj++) { + if (obj->state != KLP_ENABLED) + continue; + + ret = klp_disable_object(obj); + if (ret) + return ret; + } + + patch->state = KLP_DISABLED; + + return 0; +} + +/** + * klp_disable_patch() - disables a registered patch + * @patch: The registered, enabled patch to be disabled + * + * Unregisters the patched functions from ftrace. + * + * Return: 0 on success, otherwise error + */ +int klp_disable_patch(struct klp_patch *patch) +{ + int ret; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto err; + } + + if (patch->state == KLP_DISABLED) { + ret = -EINVAL; + goto err; + } + + ret = __klp_disable_patch(patch); + +err: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_disable_patch); + +static int __klp_enable_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + if (WARN_ON(patch->state != KLP_DISABLED)) + return -EINVAL; + + pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); + add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); + + pr_notice("enabling patch '%s'\n", patch->mod->name); + + for (obj = patch->objs; obj->funcs; obj++) { + klp_find_object_module(obj); + + if (!klp_is_object_loaded(obj)) + continue; + + ret = klp_enable_object(obj); + if (ret) + goto unregister; + } + + patch->state = KLP_ENABLED; + + return 0; + +unregister: + WARN_ON(__klp_disable_patch(patch)); + return ret; +} + +/** + * klp_enable_patch() - enables a registered patch + * @patch: The registered, disabled patch to be enabled + * + * Performs the needed symbol lookups and code relocations, + * then registers the patched functions with ftrace. 
+ * + * Return: 0 on success, otherwise error + */ +int klp_enable_patch(struct klp_patch *patch) +{ + int ret; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto err; + } + + ret = __klp_enable_patch(patch); + +err: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_enable_patch); + +/* + * Sysfs Interface + * + * /sys/kernel/livepatch + * /sys/kernel/livepatch/ + * /sys/kernel/livepatch//enabled + * /sys/kernel/livepatch// + * /sys/kernel/livepatch/// + */ + +static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct klp_patch *patch; + int ret; + unsigned long val; + + ret = kstrtoul(buf, 10, &val); + if (ret) + return -EINVAL; + + if (val != KLP_DISABLED && val != KLP_ENABLED) + return -EINVAL; + + patch = container_of(kobj, struct klp_patch, kobj); + + mutex_lock(&klp_mutex); + + if (val == patch->state) { + /* already in requested state */ + ret = -EINVAL; + goto err; + } + + if (val == KLP_ENABLED) { + ret = __klp_enable_patch(patch); + if (ret) + goto err; + } else { + ret = __klp_disable_patch(patch); + if (ret) + goto err; + } + + mutex_unlock(&klp_mutex); + + return count; + +err: + mutex_unlock(&klp_mutex); + return ret; +} + +static ssize_t enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct klp_patch *patch; + + patch = container_of(kobj, struct klp_patch, kobj); + return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state); +} + +static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); +static struct attribute *klp_patch_attrs[] = { + &enabled_kobj_attr.attr, + NULL +}; + +static void klp_kobj_release_patch(struct kobject *kobj) +{ + /* + * Once we have a consistency model we'll need to module_put() the + * patch module here. See klp_register_patch() for more details. + */ +} + +static struct kobj_type klp_ktype_patch = { + .release = klp_kobj_release_patch, + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = klp_patch_attrs, +}; + +static void klp_kobj_release_func(struct kobject *kobj) +{ + struct klp_func *func; + + func = container_of(kobj, struct klp_func, kobj); + kfree(func->fops); +} + +static struct kobj_type klp_ktype_func = { + .release = klp_kobj_release_func, + .sysfs_ops = &kobj_sysfs_ops, +}; + +/* + * Free all functions' kobjects in the array up to some limit. When limit is + * NULL, all kobjects are freed. + */ +static void klp_free_funcs_limited(struct klp_object *obj, + struct klp_func *limit) +{ + struct klp_func *func; + + for (func = obj->funcs; func->old_name && func != limit; func++) + kobject_put(&func->kobj); +} + +/* Clean up when a patched object is unloaded */ +static void klp_free_object_loaded(struct klp_object *obj) +{ + struct klp_func *func; + + obj->mod = NULL; + + for (func = obj->funcs; func->old_name; func++) + func->old_addr = 0; +} + +/* + * Free all objects' kobjects in the array up to some limit. When limit is + * NULL, all kobjects are freed. 
+ */ +static void klp_free_objects_limited(struct klp_patch *patch, + struct klp_object *limit) +{ + struct klp_object *obj; + + for (obj = patch->objs; obj->funcs && obj != limit; obj++) { + klp_free_funcs_limited(obj, NULL); + kobject_put(obj->kobj); + } +} + +static void klp_free_patch(struct klp_patch *patch) +{ + klp_free_objects_limited(patch, NULL); + if (!list_empty(&patch->list)) + list_del(&patch->list); + kobject_put(&patch->kobj); +} + +static int klp_init_func(struct klp_object *obj, struct klp_func *func) +{ + struct ftrace_ops *ops; + int ret; + + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + ops->private = func; + ops->func = klp_ftrace_handler; + ops->flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC; + func->fops = ops; + func->state = KLP_DISABLED; + + ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, + obj->kobj, func->old_name); + if (ret) { + kfree(func->fops); + return ret; + } + + return 0; +} + +/* parts of the initialization that is done only when the object is loaded */ +static int klp_init_object_loaded(struct klp_patch *patch, + struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + if (obj->relocs) { + ret = klp_write_object_relocations(patch->mod, obj); + if (ret) + return ret; + } + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_find_verify_func_addr(obj, func); + if (ret) + return ret; + } + + return 0; +} + +static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) +{ + struct klp_func *func; + int ret; + const char *name; + + if (!obj->funcs) + return -EINVAL; + + obj->state = KLP_DISABLED; + + klp_find_object_module(obj); + + name = klp_is_module(obj) ? obj->name : "vmlinux"; + obj->kobj = kobject_create_and_add(name, &patch->kobj); + if (!obj->kobj) + return -ENOMEM; + + for (func = obj->funcs; func->old_name; func++) { + ret = klp_init_func(obj, func); + if (ret) + goto free; + } + + if (klp_is_object_loaded(obj)) { + ret = klp_init_object_loaded(patch, obj); + if (ret) + goto free; + } + + return 0; + +free: + klp_free_funcs_limited(obj, func); + kobject_put(obj->kobj); + return ret; +} + +static int klp_init_patch(struct klp_patch *patch) +{ + struct klp_object *obj; + int ret; + + if (!patch->objs) + return -EINVAL; + + mutex_lock(&klp_mutex); + + patch->state = KLP_DISABLED; + + ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, + klp_root_kobj, patch->mod->name); + if (ret) + goto unlock; + + for (obj = patch->objs; obj->funcs; obj++) { + ret = klp_init_object(patch, obj); + if (ret) + goto free; + } + + list_add(&patch->list, &klp_patches); + + mutex_unlock(&klp_mutex); + + return 0; + +free: + klp_free_objects_limited(patch, obj); + kobject_put(&patch->kobj); +unlock: + mutex_unlock(&klp_mutex); + return ret; +} + +/** + * klp_unregister_patch() - unregisters a patch + * @patch: Disabled patch to be unregistered + * + * Frees the data structures and removes the sysfs interface. 
+ * + * Return: 0 on success, otherwise error + */ +int klp_unregister_patch(struct klp_patch *patch) +{ + int ret = 0; + + mutex_lock(&klp_mutex); + + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto out; + } + + if (patch->state == KLP_ENABLED) { + ret = -EBUSY; + goto out; + } + + klp_free_patch(patch); + +out: + mutex_unlock(&klp_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(klp_unregister_patch); + +/** + * klp_register_patch() - registers a patch + * @patch: Patch to be registered + * + * Initializes the data structure associated with the patch and + * creates the sysfs interface. + * + * Return: 0 on success, otherwise error + */ +int klp_register_patch(struct klp_patch *patch) +{ + int ret; + + if (!klp_initialized()) + return -ENODEV; + + if (!patch || !patch->mod) + return -EINVAL; + + /* + * A reference is taken on the patch module to prevent it from being + * unloaded. Right now, we don't allow patch modules to unload since + * there is currently no method to determine if a thread is still + * running in the patched code contained in the patch module once + * the ftrace registration is successful. + */ + if (!try_module_get(patch->mod)) + return -ENODEV; + + ret = klp_init_patch(patch); + if (ret) + module_put(patch->mod); + + return ret; +} +EXPORT_SYMBOL_GPL(klp_register_patch); + +static void klp_module_notify_coming(struct klp_patch *patch, + struct klp_object *obj) +{ + struct module *pmod = patch->mod; + struct module *mod = obj->mod; + int ret; + + ret = klp_init_object_loaded(patch, obj); + if (ret) + goto err; + + if (patch->state == KLP_DISABLED) + return; + + pr_notice("applying patch '%s' to loading module '%s'\n", + pmod->name, mod->name); + + ret = klp_enable_object(obj); + if (!ret) + return; + +err: + pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", + pmod->name, mod->name, ret); +} + +static void klp_module_notify_going(struct klp_patch *patch, + struct klp_object *obj) +{ + struct module *pmod = patch->mod; + struct module *mod = obj->mod; + int ret; + + if (patch->state == KLP_DISABLED) + goto disabled; + + pr_notice("reverting patch '%s' on unloading module '%s'\n", + pmod->name, mod->name); + + ret = klp_disable_object(obj); + if (ret) + pr_warn("failed to revert patch '%s' on module '%s' (%d)\n", + pmod->name, mod->name, ret); + +disabled: + klp_free_object_loaded(obj); +} + +static int klp_module_notify(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct module *mod = data; + struct klp_patch *patch; + struct klp_object *obj; + + if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING) + return 0; + + mutex_lock(&klp_mutex); + + list_for_each_entry(patch, &klp_patches, list) { + for (obj = patch->objs; obj->funcs; obj++) { + if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) + continue; + + if (action == MODULE_STATE_COMING) { + obj->mod = mod; + klp_module_notify_coming(patch, obj); + } else /* MODULE_STATE_GOING */ + klp_module_notify_going(patch, obj); + + break; + } + } + + mutex_unlock(&klp_mutex); + + return 0; +} + +static struct notifier_block klp_module_nb = { + .notifier_call = klp_module_notify, + .priority = INT_MIN+1, /* called late but before ftrace notifier */ +}; + +static int klp_init(void) +{ + int ret; + + ret = register_module_notifier(&klp_module_nb); + if (ret) + return ret; + + klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); + if (!klp_root_kobj) { + ret = -ENOMEM; + goto unregister; + } + + return 0; + +unregister: + 
unregister_module_notifier(&klp_module_nb); + return ret; +} + +module_init(klp_init); -- cgit v1.2.3 From 2c658e212ce7e40ace56d9441c8c5634d4d420e3 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Thu, 18 Dec 2014 16:12:08 +0200 Subject: spi: Remove FSF mailing addresses Signed-off-by: Jarkko Nikula Signed-off-by: Mark Brown --- drivers/spi/spi-au1550.c | 4 ---- drivers/spi/spi-bcm2835.c | 4 ---- drivers/spi/spi-bcm63xx.c | 4 ---- drivers/spi/spi-bitbang.c | 4 ---- drivers/spi/spi-butterfly.c | 4 ---- drivers/spi/spi-coldfire-qspi.c | 5 ----- drivers/spi/spi-davinci.c | 4 ---- drivers/spi/spi-gpio.c | 4 ---- drivers/spi/spi-lm70llp.c | 4 ---- drivers/spi/spi-omap-100k.c | 5 ----- drivers/spi/spi-omap-uwire.c | 4 ---- drivers/spi/spi-omap2-mcspi.c | 5 ----- drivers/spi/spi-pxa2xx-pxadma.c | 4 ---- drivers/spi/spi-pxa2xx.c | 4 ---- drivers/spi/spi-rspi.c | 5 ----- drivers/spi/spi-s3c64xx.c | 4 ---- drivers/spi/spi-sc18is602.c | 4 ---- drivers/spi/spi-sh-hspi.c | 5 ----- drivers/spi/spi-sh.c | 5 ----- drivers/spi/spi-topcliff-pch.c | 4 ---- drivers/spi/spi.c | 4 ---- drivers/spi/spidev.c | 4 ---- include/linux/spi/at86rf230.h | 4 ---- include/linux/spi/l4f00242t03.h | 4 ---- include/linux/spi/lms283gf05.h | 4 ---- include/linux/spi/mxs-spi.h | 4 ---- include/linux/spi/pxa2xx_spi.h | 4 ---- include/linux/spi/rspi.h | 5 ----- include/linux/spi/sh_hspi.h | 4 ---- include/linux/spi/spi.h | 4 ---- include/linux/spi/tle62x0.h | 4 ---- include/linux/spi/tsc2005.h | 5 ----- 32 files changed, 136 deletions(-) (limited to 'include/linux') diff --git a/drivers/spi/spi-au1550.c b/drivers/spi/spi-au1550.c index 326f47973684..f45e085c01a6 100644 --- a/drivers/spi/spi-au1550.c +++ b/drivers/spi/spi-au1550.c @@ -15,10 +15,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c index 98aab457b24d..419a782ab6d5 100644 --- a/drivers/spi/spi-bcm2835.c +++ b/drivers/spi/spi-bcm2835.c @@ -17,10 +17,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include diff --git a/drivers/spi/spi-bcm63xx.c b/drivers/spi/spi-bcm63xx.c index c20530982e26..e73e2b052c9c 100644 --- a/drivers/spi/spi-bcm63xx.c +++ b/drivers/spi/spi-bcm63xx.c @@ -13,10 +13,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the - * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, */ #include diff --git a/drivers/spi/spi-bitbang.c b/drivers/spi/spi-bitbang.c index dc7d2c2d643e..5ef6638d5e8a 100644 --- a/drivers/spi/spi-bitbang.c +++ b/drivers/spi/spi-bitbang.c @@ -10,10 +10,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include diff --git a/drivers/spi/spi-butterfly.c b/drivers/spi/spi-butterfly.c index ee4f91ccd8fd..9a95862986c8 100644 --- a/drivers/spi/spi-butterfly.c +++ b/drivers/spi/spi-butterfly.c @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include #include diff --git a/drivers/spi/spi-coldfire-qspi.c b/drivers/spi/spi-coldfire-qspi.c index 41b5dc4445f6..688956ff5095 100644 --- a/drivers/spi/spi-coldfire-qspi.c +++ b/drivers/spi/spi-coldfire-qspi.c @@ -12,11 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA - * */ #include diff --git a/drivers/spi/spi-davinci.c b/drivers/spi/spi-davinci.c index b3707badb1e5..5e991065f5b0 100644 --- a/drivers/spi/spi-davinci.c +++ b/drivers/spi/spi-davinci.c @@ -11,10 +11,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include diff --git a/drivers/spi/spi-gpio.c b/drivers/spi/spi-gpio.c index aee4e7589568..2b76492fe5c1 100644 --- a/drivers/spi/spi-gpio.c +++ b/drivers/spi/spi-gpio.c @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include #include diff --git a/drivers/spi/spi-lm70llp.c b/drivers/spi/spi-lm70llp.c index 41c5765be746..ba72347cb99d 100644 --- a/drivers/spi/spi-lm70llp.c +++ b/drivers/spi/spi-lm70llp.c @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include diff --git a/drivers/spi/spi-omap-100k.c b/drivers/spi/spi-omap-100k.c index 79399ae9c84c..d890d309dff9 100644 --- a/drivers/spi/spi-omap-100k.c +++ b/drivers/spi/spi-omap-100k.c @@ -16,11 +16,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #include #include diff --git a/drivers/spi/spi-omap-uwire.c b/drivers/spi/spi-omap-uwire.c index daf1ada5cd11..3c0844457c07 100644 --- a/drivers/spi/spi-omap-uwire.c +++ b/drivers/spi/spi-omap-uwire.c @@ -28,10 +28,6 @@ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 675 Mass Ave, Cambridge, MA 02139, USA. */ #include #include diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c index 3bc3cbabbbc0..4df8942058de 100644 --- a/drivers/spi/spi-omap2-mcspi.c +++ b/drivers/spi/spi-omap2-mcspi.c @@ -14,11 +14,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #include diff --git a/drivers/spi/spi-pxa2xx-pxadma.c b/drivers/spi/spi-pxa2xx-pxadma.c index e8a26f25d5c0..e51fcf9fd39b 100644 --- a/drivers/spi/spi-pxa2xx-pxadma.c +++ b/drivers/spi/spi-pxa2xx-pxadma.c @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c index 05c623cfb078..7a9a605d9bd2 100644 --- a/drivers/spi/spi-pxa2xx.c +++ b/drivers/spi/spi-pxa2xx.c @@ -11,10 +11,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include diff --git a/drivers/spi/spi-rspi.c b/drivers/spi/spi-rspi.c index 2071f788c6fb..46ce47076e63 100644 --- a/drivers/spi/spi-rspi.c +++ b/drivers/spi/spi-rspi.c @@ -15,11 +15,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * */ #include diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c index 37b19836f5cb..9231c34b5a5c 100644 --- a/drivers/spi/spi-s3c64xx.c +++ b/drivers/spi/spi-s3c64xx.c @@ -11,10 +11,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include diff --git a/drivers/spi/spi-sc18is602.c b/drivers/spi/spi-sc18is602.c index 237f2e7a7179..5a56acf8a43e 100644 --- a/drivers/spi/spi-sc18is602.c +++ b/drivers/spi/spi-sc18is602.c @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include diff --git a/drivers/spi/spi-sh-hspi.c b/drivers/spi/spi-sh-hspi.c index fc29233d0650..20e800e70442 100644 --- a/drivers/spi/spi-sh-hspi.c +++ b/drivers/spi/spi-sh-hspi.c @@ -16,11 +16,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * */ #include diff --git a/drivers/spi/spi-sh.c b/drivers/spi/spi-sh.c index 1cfc906dd174..502501187c9e 100644 --- a/drivers/spi/spi-sh.c +++ b/drivers/spi/spi-sh.c @@ -14,11 +14,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * */ #include diff --git a/drivers/spi/spi-topcliff-pch.c b/drivers/spi/spi-topcliff-pch.c index be692ad50442..93dfcee0f987 100644 --- a/drivers/spi/spi-topcliff-pch.c +++ b/drivers/spi/spi-topcliff-pch.c @@ -11,10 +11,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. */ #include diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 66a70e9bc743..c76cc7568639 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -13,10 +13,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c index 6941e04afb8c..6f97e5af282b 100644 --- a/drivers/spi/spidev.c +++ b/drivers/spi/spidev.c @@ -14,10 +14,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include diff --git a/include/linux/spi/at86rf230.h b/include/linux/spi/at86rf230.h index b2b1afbb3202..cd519a11c2c6 100644 --- a/include/linux/spi/at86rf230.h +++ b/include/linux/spi/at86rf230.h @@ -12,10 +12,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * * Written by: * Dmitry Eremin-Solenikov */ diff --git a/include/linux/spi/l4f00242t03.h b/include/linux/spi/l4f00242t03.h index bc8677c8eba9..e69e9b51b21a 100644 --- a/include/linux/spi/l4f00242t03.h +++ b/include/linux/spi/l4f00242t03.h @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _INCLUDE_LINUX_SPI_L4F00242T03_H_ diff --git a/include/linux/spi/lms283gf05.h b/include/linux/spi/lms283gf05.h index 555d254e6606..fdd1d1d51da5 100644 --- a/include/linux/spi/lms283gf05.h +++ b/include/linux/spi/lms283gf05.h @@ -11,10 +11,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _INCLUDE_LINUX_SPI_LMS283GF05_H_ diff --git a/include/linux/spi/mxs-spi.h b/include/linux/spi/mxs-spi.h index 4835486f58e5..381d368b91b4 100644 --- a/include/linux/spi/mxs-spi.h +++ b/include/linux/spi/mxs-spi.h @@ -15,10 +15,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
*/ #ifndef __LINUX_SPI_MXS_SPI_H__ diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h index d5a316550177..5eb56e35cf0d 100644 --- a/include/linux/spi/pxa2xx_spi.h +++ b/include/linux/spi/pxa2xx_spi.h @@ -10,10 +10,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef __linux_pxa2xx_spi_h #define __linux_pxa2xx_spi_h diff --git a/include/linux/spi/rspi.h b/include/linux/spi/rspi.h index e546b2ceb623..a693188cc08b 100644 --- a/include/linux/spi/rspi.h +++ b/include/linux/spi/rspi.h @@ -11,11 +11,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * */ #ifndef __LINUX_SPI_RENESAS_SPI_H__ diff --git a/include/linux/spi/sh_hspi.h b/include/linux/spi/sh_hspi.h index a1121f872ac1..aa0d440ab4f0 100644 --- a/include/linux/spi/sh_hspi.h +++ b/include/linux/spi/sh_hspi.h @@ -9,10 +9,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef SH_HSPI_H #define SH_HSPI_H diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index a6ef2a8e6de4..6e2664a17114 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -10,10 +10,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #ifndef __LINUX_SPI_H diff --git a/include/linux/spi/tle62x0.h b/include/linux/spi/tle62x0.h index 60b59187e590..414c6fddfcf0 100644 --- a/include/linux/spi/tle62x0.h +++ b/include/linux/spi/tle62x0.h @@ -12,10 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ struct tle62x0_pdata { diff --git a/include/linux/spi/tsc2005.h b/include/linux/spi/tsc2005.h index 8f721e465e05..563b3b1799a8 100644 --- a/include/linux/spi/tsc2005.h +++ b/include/linux/spi/tsc2005.h @@ -12,11 +12,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * */ #ifndef _LINUX_SPI_TSC2005_H -- cgit v1.2.3 
From 556d2f055bf6d79ce81587dfe774d4dd10da473f Mon Sep 17 00:00:00 2001 From: Yalin Wang Date: Mon, 3 Nov 2014 03:01:03 +0100 Subject: ARM: 8187/1: add CONFIG_HAVE_ARCH_BITREVERSE to support rbit instruction This change adds the CONFIG_HAVE_ARCH_BITREVERSE config option so that architectures with a hardware bit-reverse instruction can use it for the bitrev operations. Introduce __constant_bitrev* macros for compile-time constant bitrev operations. Change __bitrev16() and __bitrev32() into inline functions; there is no need to export symbols for these tiny functions.
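For illustration only (not part of this patch), a hand-checked sketch of how the reworked interface behaves for callers:

	/* bitrev8/16/32 now expand to __constant_bitrev* for constant
	 * arguments and otherwise fall back to __bitrev* (the lookup
	 * table, or the architecture's rbit-based helper when
	 * CONFIG_HAVE_ARCH_BITREVERSE is set). */
	u8  b = bitrev8(0x01);    /* 0b00000001 -> 0b10000000 == 0x80 */
	u16 h = bitrev16(0x1234); /* -> 0x2c48 */
	u32 w = bitrev32(0x1);    /* -> 0x80000000 */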
Signed-off-by: Yalin Wang Acked-by: Will Deacon Signed-off-by: Russell King --- include/linux/bitrev.h | 77 +++++++++++++++++++++++++++++++++++++++++++++++--- lib/Kconfig | 9 ++++++ lib/bitrev.c | 17 ++--------- 3 files changed, 84 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitrev.h b/include/linux/bitrev.h index 7ffe03f4693d..fb790b8449c1 100644 --- a/include/linux/bitrev.h +++ b/include/linux/bitrev.h @@ -3,14 +3,83 @@ #include -extern u8 const byte_rev_table[256]; +#ifdef CONFIG_HAVE_ARCH_BITREVERSE +#include + +#define __bitrev32 __arch_bitrev32 +#define __bitrev16 __arch_bitrev16 +#define __bitrev8 __arch_bitrev8 -static inline u8 bitrev8(u8 byte) +#else +extern u8 const byte_rev_table[256]; +static inline u8 __bitrev8(u8 byte) { return byte_rev_table[byte]; } -extern u16 bitrev16(u16 in); -extern u32 bitrev32(u32 in); +static inline u16 __bitrev16(u16 x) +{ + return (__bitrev8(x & 0xff) << 8) | __bitrev8(x >> 8); +} + +static inline u32 __bitrev32(u32 x) +{ + return (__bitrev16(x & 0xffff) << 16) | __bitrev16(x >> 16); +} + +#endif /* CONFIG_HAVE_ARCH_BITREVERSE */ + +#define __constant_bitrev32(x) \ +({ \ + u32 __x = x; \ + __x = (__x >> 16) | (__x << 16); \ + __x = ((__x & (u32)0xFF00FF00UL) >> 8) | ((__x & (u32)0x00FF00FFUL) << 8); \ + __x = ((__x & (u32)0xF0F0F0F0UL) >> 4) | ((__x & (u32)0x0F0F0F0FUL) << 4); \ + __x = ((__x & (u32)0xCCCCCCCCUL) >> 2) | ((__x & (u32)0x33333333UL) << 2); \ + __x = ((__x & (u32)0xAAAAAAAAUL) >> 1) | ((__x & (u32)0x55555555UL) << 1); \ + __x; \ +}) + +#define __constant_bitrev16(x) \ +({ \ + u16 __x = x; \ + __x = (__x >> 8) | (__x << 8); \ + __x = ((__x & (u16)0xF0F0U) >> 4) | ((__x & (u16)0x0F0FU) << 4); \ + __x = ((__x & (u16)0xCCCCU) >> 2) | ((__x & (u16)0x3333U) << 2); \ + __x = ((__x & (u16)0xAAAAU) >> 1) | ((__x & (u16)0x5555U) << 1); \ + __x; \ +}) + +#define __constant_bitrev8(x) \ +({ \ + u8 __x = x; \ + __x = (__x >> 4) | (__x << 4); \ + __x = ((__x & (u8)0xCCU) >> 2) | ((__x & (u8)0x33U) << 2); \ + __x = ((__x & (u8)0xAAU) >> 1) | ((__x & (u8)0x55U) << 1); \ + __x; \ +}) + +#define bitrev32(x) \ +({ \ + u32 __x = x; \ + __builtin_constant_p(__x) ? \ + __constant_bitrev32(__x) : \ + __bitrev32(__x); \ +}) + +#define bitrev16(x) \ +({ \ + u16 __x = x; \ + __builtin_constant_p(__x) ? \ + __constant_bitrev16(__x) : \ + __bitrev16(__x); \ + }) +#define bitrev8(x) \ +({ \ + u8 __x = x; \ + __builtin_constant_p(__x) ? \ + __constant_bitrev8(__x) : \ + __bitrev8(__x) ; \ + }) #endif /* _LINUX_BITREV_H */ diff --git a/lib/Kconfig b/lib/Kconfig index 54cf309a92a5..cd177caf3876 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -13,6 +13,15 @@ config RAID6_PQ config BITREVERSE tristate +config HAVE_ARCH_BITREVERSE + boolean + default n + depends on BITREVERSE + help + This option indicates that the architecture has a hardware + bit-reverse instruction; when it is selected, the bitrev helpers + use that instruction instead of the generic lookup table. + config RATIONAL boolean diff --git a/lib/bitrev.c b/lib/bitrev.c index 3956203456d4..40ffda94cc5d 100644 --- a/lib/bitrev.c +++ b/lib/bitrev.c @@ -1,3 +1,4 @@ +#ifndef CONFIG_HAVE_ARCH_BITREVERSE #include #include #include @@ -42,18 +43,4 @@ const u8 byte_rev_table[256] = { }; EXPORT_SYMBOL_GPL(byte_rev_table); -u16 bitrev16(u16 x) -{ - return (bitrev8(x & 0xff) << 8) | bitrev8(x >> 8); -} -EXPORT_SYMBOL(bitrev16); - -/** - * bitrev32 - reverse the order of bits in a u32 value - * @x: value to be bit-reversed - */ -u32 bitrev32(u32 x) -{ - return (bitrev16(x & 0xffff) << 16) | bitrev16(x >> 16); -} -EXPORT_SYMBOL(bitrev32); +#endif /* CONFIG_HAVE_ARCH_BITREVERSE */ -- cgit v1.2.3 
From c4827bb859cbe8afad9287c9dd4e7162119d3d59 Mon Sep 17 00:00:00 2001 From: Jarkko Nikula Date: Thu, 18 Dec 2014 15:04:21 +0200 Subject: spi: pxa2xx: Add definition for Intel Quark DDS_RATE register The Intel Quark DDS_RATE register is currently defined only inside a register access macro. Add a definition for it to the common SSP register definitions in preparation for cleaning up those macros. Signed-off-by: Jarkko Nikula Signed-off-by: Mark Brown --- include/linux/pxa2xx_ssp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h index 77aed9ea1d26..dab545bb66b3 100644 --- a/include/linux/pxa2xx_ssp.h +++ b/include/linux/pxa2xx_ssp.h @@ -37,6 +37,7 @@ #define SSDR (0x10) /* SSP Data Write/Data Read Register */ #define SSTO (0x28) /* SSP Time Out Register */ +#define DDS_RATE (0x28) /* SSP DDS Clock Rate Register (Intel Quark) */ #define SSPSP (0x2C) /* SSP Programmable Serial Protocol */ #define SSTSA (0x30) /* SSP Tx Timeslot Active */ #define SSRSA (0x34) /* SSP Rx Timeslot Active */ -- cgit v1.2.3 
From 32d17597d3e299ffe8b07e3afc12f8074e7ae483 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 9 Apr 2014 08:13:18 -0300 Subject: [media] v4l: vsp1: Remove support for platform data Now that all platforms instantiate the VSP1 through DT, platform data support isn't needed anymore.
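For reference, a sketch of the DT node the driver now expects in place of platform data; the unit address, reg value and channel counts below are illustrative assumptions, not values taken from this patch:

	vsp1@fe928000 {
		compatible = "renesas,vsp1";
		reg = <0 0xfe928000 0 0x8000>;
		/* interrupts, clocks, ... */
		renesas,has-lut;
		renesas,has-sru;
		renesas,#rpf = <5>;
		renesas,#uds = <1>;
		renesas,#wpf = <4>;
	};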
Signed-off-by: Laurent Pinchart Acked-by: Simon Horman Signed-off-by: Mauro Carvalho Chehab --- drivers/media/platform/Kconfig | 2 +- drivers/media/platform/vsp1/vsp1.h | 14 +++++- drivers/media/platform/vsp1/vsp1_drv.c | 81 ++++++++++++---------------------- drivers/media/platform/vsp1/vsp1_wpf.c | 2 +- include/linux/platform_data/vsp1.h | 27 ------------ 5 files changed, 43 insertions(+), 83 deletions(-) delete mode 100644 include/linux/platform_data/vsp1.h (limited to 'include/linux') diff --git a/drivers/media/platform/Kconfig b/drivers/media/platform/Kconfig index 765bffb49a72..480a174832a6 100644 --- a/drivers/media/platform/Kconfig +++ b/drivers/media/platform/Kconfig @@ -223,7 +223,7 @@ config VIDEO_SH_VEU config VIDEO_RENESAS_VSP1 tristate "Renesas VSP1 Video Processing Engine" depends on VIDEO_V4L2 && VIDEO_V4L2_SUBDEV_API && HAS_DMA - depends on ARCH_SHMOBILE || COMPILE_TEST + depends on (ARCH_SHMOBILE && OF) || COMPILE_TEST select VIDEOBUF2_DMA_CONTIG ---help--- This is a V4L2 driver for the Renesas VSP1 video processing engine. diff --git a/drivers/media/platform/vsp1/vsp1.h b/drivers/media/platform/vsp1/vsp1.h index 12467191dff4..989e96f7e360 100644 --- a/drivers/media/platform/vsp1/vsp1.h +++ b/drivers/media/platform/vsp1/vsp1.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -40,9 +39,20 @@ struct vsp1_uds; #define VSP1_MAX_UDS 3 #define VSP1_MAX_WPF 4 +#define VSP1_HAS_LIF (1 << 0) +#define VSP1_HAS_LUT (1 << 1) +#define VSP1_HAS_SRU (1 << 2) + +struct vsp1_platform_data { + unsigned int features; + unsigned int rpf_count; + unsigned int uds_count; + unsigned int wpf_count; +}; + struct vsp1_device { struct device *dev; - struct vsp1_platform_data *pdata; + struct vsp1_platform_data pdata; void __iomem *mmio; struct clk *clock; diff --git a/drivers/media/platform/vsp1/vsp1_drv.c b/drivers/media/platform/vsp1/vsp1_drv.c index 5eb16e87d53f..913485a90e97 100644 --- a/drivers/media/platform/vsp1/vsp1_drv.c +++ b/drivers/media/platform/vsp1/vsp1_drv.c @@ -40,7 +40,7 @@ static irqreturn_t vsp1_irq_handler(int irq, void *data) irqreturn_t ret = IRQ_NONE; unsigned int i; - for (i = 0; i < vsp1->pdata->wpf_count; ++i) { + for (i = 0; i < vsp1->pdata.wpf_count; ++i) { struct vsp1_rwpf *wpf = vsp1->wpf[i]; struct vsp1_pipeline *pipe; u32 status; @@ -181,7 +181,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1) list_add_tail(&vsp1->hst->entity.list_dev, &vsp1->entities); - if (vsp1->pdata->features & VSP1_HAS_LIF) { + if (vsp1->pdata.features & VSP1_HAS_LIF) { vsp1->lif = vsp1_lif_create(vsp1); if (IS_ERR(vsp1->lif)) { ret = PTR_ERR(vsp1->lif); @@ -191,7 +191,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1) list_add_tail(&vsp1->lif->entity.list_dev, &vsp1->entities); } - if (vsp1->pdata->features & VSP1_HAS_LUT) { + if (vsp1->pdata.features & VSP1_HAS_LUT) { vsp1->lut = vsp1_lut_create(vsp1); if (IS_ERR(vsp1->lut)) { ret = PTR_ERR(vsp1->lut); @@ -201,7 +201,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1) list_add_tail(&vsp1->lut->entity.list_dev, &vsp1->entities); } - for (i = 0; i < vsp1->pdata->rpf_count; ++i) { + for (i = 0; i < vsp1->pdata.rpf_count; ++i) { struct vsp1_rwpf *rpf; rpf = vsp1_rpf_create(vsp1, i); @@ -214,7 +214,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1) list_add_tail(&rpf->entity.list_dev, &vsp1->entities); } - if (vsp1->pdata->features & VSP1_HAS_SRU) { + if (vsp1->pdata.features & VSP1_HAS_SRU) { vsp1->sru = vsp1_sru_create(vsp1); if (IS_ERR(vsp1->sru)) { ret = 
PTR_ERR(vsp1->sru); @@ -224,7 +224,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1) list_add_tail(&vsp1->sru->entity.list_dev, &vsp1->entities); } - for (i = 0; i < vsp1->pdata->uds_count; ++i) { + for (i = 0; i < vsp1->pdata.uds_count; ++i) { struct vsp1_uds *uds; uds = vsp1_uds_create(vsp1, i); @@ -237,7 +237,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1) list_add_tail(&uds->entity.list_dev, &vsp1->entities); } - for (i = 0; i < vsp1->pdata->wpf_count; ++i) { + for (i = 0; i < vsp1->pdata.wpf_count; ++i) { struct vsp1_rwpf *wpf; wpf = vsp1_wpf_create(vsp1, i); @@ -261,7 +261,7 @@ static int vsp1_create_entities(struct vsp1_device *vsp1) goto done; } - if (vsp1->pdata->features & VSP1_HAS_LIF) { + if (vsp1->pdata.features & VSP1_HAS_LIF) { ret = media_entity_create_link( &vsp1->wpf[0]->entity.subdev.entity, RWPF_PAD_SOURCE, &vsp1->lif->entity.subdev.entity, LIF_PAD_SINK, 0); @@ -294,7 +294,7 @@ static int vsp1_device_init(struct vsp1_device *vsp1) /* Reset any channel that might be running. */ status = vsp1_read(vsp1, VI6_STATUS); - for (i = 0; i < vsp1->pdata->wpf_count; ++i) { + for (i = 0; i < vsp1->pdata.wpf_count; ++i) { unsigned int timeout; if (!(status & VI6_STATUS_SYS_ACT(i))) @@ -318,10 +318,10 @@ static int vsp1_device_init(struct vsp1_device *vsp1) vsp1_write(vsp1, VI6_CLK_DCSWT, (8 << VI6_CLK_DCSWT_CSTPW_SHIFT) | (8 << VI6_CLK_DCSWT_CSTRW_SHIFT)); - for (i = 0; i < vsp1->pdata->rpf_count; ++i) + for (i = 0; i < vsp1->pdata.rpf_count; ++i) vsp1_write(vsp1, VI6_DPR_RPF_ROUTE(i), VI6_DPR_NODE_UNUSED); - for (i = 0; i < vsp1->pdata->uds_count; ++i) + for (i = 0; i < vsp1->pdata.uds_count; ++i) vsp1_write(vsp1, VI6_DPR_UDS_ROUTE(i), VI6_DPR_NODE_UNUSED); vsp1_write(vsp1, VI6_DPR_SRU_ROUTE, VI6_DPR_NODE_UNUSED); @@ -428,28 +428,36 @@ static const struct dev_pm_ops vsp1_pm_ops = { * Platform Driver */ -static int vsp1_validate_platform_data(struct platform_device *pdev, - struct vsp1_platform_data *pdata) +static int vsp1_parse_dt(struct vsp1_device *vsp1) { - if (pdata == NULL) { - dev_err(&pdev->dev, "missing platform data\n"); - return -EINVAL; - } + struct device_node *np = vsp1->dev->of_node; + struct vsp1_platform_data *pdata = &vsp1->pdata; + + if (of_property_read_bool(np, "renesas,has-lif")) + pdata->features |= VSP1_HAS_LIF; + if (of_property_read_bool(np, "renesas,has-lut")) + pdata->features |= VSP1_HAS_LUT; + if (of_property_read_bool(np, "renesas,has-sru")) + pdata->features |= VSP1_HAS_SRU; + + of_property_read_u32(np, "renesas,#rpf", &pdata->rpf_count); + of_property_read_u32(np, "renesas,#uds", &pdata->uds_count); + of_property_read_u32(np, "renesas,#wpf", &pdata->wpf_count); if (pdata->rpf_count <= 0 || pdata->rpf_count > VSP1_MAX_RPF) { - dev_err(&pdev->dev, "invalid number of RPF (%u)\n", + dev_err(vsp1->dev, "invalid number of RPF (%u)\n", pdata->rpf_count); return -EINVAL; } if (pdata->uds_count <= 0 || pdata->uds_count > VSP1_MAX_UDS) { - dev_err(&pdev->dev, "invalid number of UDS (%u)\n", + dev_err(vsp1->dev, "invalid number of UDS (%u)\n", pdata->uds_count); return -EINVAL; } if (pdata->wpf_count <= 0 || pdata->wpf_count > VSP1_MAX_WPF) { - dev_err(&pdev->dev, "invalid number of WPF (%u)\n", + dev_err(vsp1->dev, "invalid number of WPF (%u)\n", pdata->wpf_count); return -EINVAL; } @@ -457,33 +465,6 @@ static int vsp1_validate_platform_data(struct platform_device *pdev, return 0; } -static struct vsp1_platform_data * -vsp1_get_platform_data(struct platform_device *pdev) -{ - struct device_node *np = pdev->dev.of_node; - struct 
vsp1_platform_data *pdata; - - if (!IS_ENABLED(CONFIG_OF) || np == NULL) - return pdev->dev.platform_data; - - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); - if (pdata == NULL) - return NULL; - - if (of_property_read_bool(np, "renesas,has-lif")) - pdata->features |= VSP1_HAS_LIF; - if (of_property_read_bool(np, "renesas,has-lut")) - pdata->features |= VSP1_HAS_LUT; - if (of_property_read_bool(np, "renesas,has-sru")) - pdata->features |= VSP1_HAS_SRU; - - of_property_read_u32(np, "renesas,#rpf", &pdata->rpf_count); - of_property_read_u32(np, "renesas,#uds", &pdata->uds_count); - of_property_read_u32(np, "renesas,#wpf", &pdata->wpf_count); - - return pdata; -} - static int vsp1_probe(struct platform_device *pdev) { struct vsp1_device *vsp1; @@ -499,11 +480,7 @@ static int vsp1_probe(struct platform_device *pdev) mutex_init(&vsp1->lock); INIT_LIST_HEAD(&vsp1->entities); - vsp1->pdata = vsp1_get_platform_data(pdev); - if (vsp1->pdata == NULL) - return -ENODEV; - - ret = vsp1_validate_platform_data(pdev, vsp1->pdata); + ret = vsp1_parse_dt(vsp1); if (ret < 0) return ret; diff --git a/drivers/media/platform/vsp1/vsp1_wpf.c b/drivers/media/platform/vsp1/vsp1_wpf.c index 6e057762c933..b1089d05583a 100644 --- a/drivers/media/platform/vsp1/vsp1_wpf.c +++ b/drivers/media/platform/vsp1/vsp1_wpf.c @@ -280,7 +280,7 @@ struct vsp1_rwpf *vsp1_wpf_create(struct vsp1_device *vsp1, unsigned int index) * except for the WPF0 source link if a LIF is present. */ flags = MEDIA_LNK_FL_ENABLED; - if (!(vsp1->pdata->features & VSP1_HAS_LIF) || index != 0) + if (!(vsp1->pdata.features & VSP1_HAS_LIF) || index != 0) flags |= MEDIA_LNK_FL_IMMUTABLE; ret = media_entity_create_link(&wpf->entity.subdev.entity, diff --git a/include/linux/platform_data/vsp1.h b/include/linux/platform_data/vsp1.h deleted file mode 100644 index 63170e2614b3..000000000000 --- a/include/linux/platform_data/vsp1.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * vsp1.h -- R-Car VSP1 Platform Data - * - * Copyright (C) 2013 Renesas Corporation - * - * Contact: Laurent Pinchart (laurent.pinchart@ideasonboard.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ -#ifndef __PLATFORM_VSP1_H__ -#define __PLATFORM_VSP1_H__ - -#define VSP1_HAS_LIF (1 << 0) -#define VSP1_HAS_LUT (1 << 1) -#define VSP1_HAS_SRU (1 << 2) - -struct vsp1_platform_data { - unsigned int features; - unsigned int rpf_count; - unsigned int uds_count; - unsigned int wpf_count; -}; - -#endif /* __PLATFORM_VSP1_H__ */ -- cgit v1.2.3 From 3110628d89f80fbafa085fd62e75afcb39fb764c Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Fri, 19 Dec 2014 17:15:53 +0900 Subject: spi: sh-msiof: Configure MSIOF sync signal timing in device tree The MSIOF controller has DTDL and SYNCDL in SITMDR1 register. 
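Both fields delay the transmit data pin and the sync signal relative to the transfer clock, and the bindings express them in hundredths of a clock cycle. A hand-worked sketch of the encoding; sh_msiof_get_delay_bit() and MDR1_DTDL_SHIFT are from this patch, while the standalone variables around them are hypothetical:

	/* DT value (1/100ths of a clock cycle) -> 3-bit register field:
	 * 0 -> b'000, 100 -> b'001, 200 -> b'010, 300 -> b'011 (SYNCDL only),
	 * 50 -> b'101, 150 -> b'110
	 */
	u32 dtdl_bits = sh_msiof_get_delay_bit(150); /* 150 % 100 != 0, so 150/100 + 5 == 6 */
	u32 tmdr1 = dtdl_bits << MDR1_DTDL_SHIFT;    /* merged into SITMDR1 by the driver */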
This patch therefore adds new DT properties for them, following the approach of commit d0fb47a5237d8b9576113568bacfd27892308b62 ("spi: fsl-espi: Configure FSL eSPI CSBEF and CSAFT"). Signed-off-by: Yoshihiro Shimoda Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/spi/sh-msiof.txt | 16 ++++++++ drivers/spi/spi-sh-msiof.c | 47 ++++++++++++++++++++++ include/linux/spi/sh_msiof.h | 2 + 3 files changed, 65 insertions(+) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/spi/sh-msiof.txt b/Documentation/devicetree/bindings/spi/sh-msiof.txt index d11c3721e7cd..4c388bb2f0a2 100644 --- a/Documentation/devicetree/bindings/spi/sh-msiof.txt +++ b/Documentation/devicetree/bindings/spi/sh-msiof.txt @@ -30,6 +30,22 @@ Optional properties: specifiers, one for transmission, and one for reception. - dma-names : Must contain a list of two DMA names, "tx" and "rx". +- renesas,dtdl : delay sync signal (setup) in transmit mode. + Must contain one of the following values: + 0 (no bit delay) + 50 (0.5-clock-cycle delay) + 100 (1-clock-cycle delay) + 150 (1.5-clock-cycle delay) + 200 (2-clock-cycle delay) + +- renesas,syncdl : delay sync signal (hold) in transmit mode. + Must contain one of the following values: + 0 (no bit delay) + 50 (0.5-clock-cycle delay) + 100 (1-clock-cycle delay) + 150 (1.5-clock-cycle delay) + 200 (2-clock-cycle delay) + 300 (3-clock-cycle delay) Optional properties, deprecated for soctype-specific bindings: - renesas,tx-fifo-size : Overrides the default tx fifo size given in words diff --git a/drivers/spi/spi-sh-msiof.c b/drivers/spi/spi-sh-msiof.c index 239be7cbe5a8..2a87cb939f56 100644 --- a/drivers/spi/spi-sh-msiof.c +++ b/drivers/spi/spi-sh-msiof.c @@ -82,6 +82,8 @@ struct sh_msiof_spi_priv { #define MDR1_SYNCMD_LR 0x30000000 /* L/R mode */ #define MDR1_SYNCAC_SHIFT 25 /* Sync Polarity (1 = Active-low) */ #define MDR1_BITLSB_SHIFT 24 /* MSB/LSB First (1 = LSB first) */ +#define MDR1_DTDL_SHIFT 20 /* Data Pin Bit Delay for MSIOF_SYNC */ +#define MDR1_SYNCDL_SHIFT 16 /* Frame Sync Signal Timing Delay */ #define MDR1_FLD_MASK 0x000000c0 /* Frame Sync Signal Interval (0-3) */ #define MDR1_FLD_SHIFT 2 #define MDR1_XXSTP 0x00000001 /* Transmission/Reception Stop on FIFO */ @@ -279,6 +281,48 @@ static void sh_msiof_spi_set_clk_regs(struct sh_msiof_spi_priv *p, sh_msiof_write(p, RSCR, sh_msiof_spi_clk_table[k].scr); } +static u32 sh_msiof_get_delay_bit(u32 dtdl_or_syncdl) +{ + /* + * DTDL/SYNCDL bit : p->info->dtdl or p->info->syncdl + * b'000 : 0 + * b'001 : 100 + * b'010 : 200 + * b'011 (SYNCDL only) : 300 + * b'101 : 50 + * b'110 : 150 + */ + if (dtdl_or_syncdl % 100) + return dtdl_or_syncdl / 100 + 5; + else + return dtdl_or_syncdl / 100; +} + +static u32 sh_msiof_spi_get_dtdl_and_syncdl(struct sh_msiof_spi_priv *p) +{ + u32 val; + + if (!p->info) + return 0; + + /* check that DTDL and SYNCDL hold allowed values */ + if (p->info->dtdl > 200 || p->info->syncdl > 300) { + dev_warn(&p->pdev->dev, "DTDL or SYNCDL is too large\n"); + return 0; + } + + /* check that DTDL + SYNCDL is a whole number of clock cycles (a multiple of 100) */ + if ((p->info->dtdl + p->info->syncdl) % 100) { + dev_warn(&p->pdev->dev, "the sum of DTDL/SYNCDL is not good\n"); + return 0; + } + + val = sh_msiof_get_delay_bit(p->info->dtdl) << MDR1_DTDL_SHIFT; + val |= sh_msiof_get_delay_bit(p->info->syncdl) << MDR1_SYNCDL_SHIFT; + + return val; +} + static void
sh_msiof_spi_set_pin_regs(struct sh_msiof_spi_priv *p, tmp = MDR1_SYNCMD_SPI | 1 << MDR1_FLD_SHIFT | MDR1_XXSTP; tmp |= !cs_high << MDR1_SYNCAC_SHIFT; tmp |= lsb_first << MDR1_BITLSB_SHIFT; + tmp |= sh_msiof_spi_get_dtdl_and_syncdl(p); sh_msiof_write(p, TMDR1, tmp | MDR1_TRMD | TMDR1_PCON); if (p->chipdata->master_flags & SPI_MASTER_MUST_TX) { /* These bits are reserved if RX needs TX */ @@ -952,6 +997,8 @@ static struct sh_msiof_spi_info *sh_msiof_spi_parse_dt(struct device *dev) &info->tx_fifo_override); of_property_read_u32(np, "renesas,rx-fifo-size", &info->rx_fifo_override); + of_property_read_u32(np, "renesas,dtdl", &info->dtdl); + of_property_read_u32(np, "renesas,syncdl", &info->syncdl); info->num_chipselect = num_cs; diff --git a/include/linux/spi/sh_msiof.h b/include/linux/spi/sh_msiof.h index 88a14d81c49e..b087a85f5f72 100644 --- a/include/linux/spi/sh_msiof.h +++ b/include/linux/spi/sh_msiof.h @@ -7,6 +7,8 @@ struct sh_msiof_spi_info { u16 num_chipselect; unsigned int dma_tx_id; unsigned int dma_rx_id; + u32 dtdl; + u32 syncdl; }; #endif /* __SPI_SH_MSIOF_H__ */ -- cgit v1.2.3 
From 0425e2420c0ab1b5da24f6d9fce39241ad85fc46 Mon Sep 17 00:00:00 2001 From: Flora Fu Date: Fri, 5 Dec 2014 12:07:54 +0800 Subject: regulator: mt6397: Add support for MT6397 regulator Add MT6397 regulator driver. Signed-off-by: Flora Fu Signed-off-by: Mark Brown --- drivers/regulator/Kconfig | 9 + drivers/regulator/Makefile | 1 + drivers/regulator/mt6397-regulator.c | 332 +++++++++++++++++++++++++++++ include/linux/regulator/mt6397-regulator.h | 49 +++++ 4 files changed, 391 insertions(+) create mode 100644 drivers/regulator/mt6397-regulator.c create mode 100644 include/linux/regulator/mt6397-regulator.h (limited to 'include/linux') diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig index c3a60b57a865..f622d0613d27 100644 --- a/drivers/regulator/Kconfig +++ b/drivers/regulator/Kconfig @@ -433,6 +433,15 @@ config REGULATOR_MC13892 Say y here to support the regulators found on the Freescale MC13892 PMIC. +config REGULATOR_MT6397 + tristate "MediaTek MT6397 PMIC" + depends on MFD_MT6397 + help + Say y here to enable support for the power regulators on the + MediaTek MT6397 PMIC. + This driver supports control of the device's different power rails + through the regulator interface. + config REGULATOR_PALMAS tristate "TI Palmas PMIC Regulators" depends on MFD_PALMAS diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 1f28ebfc6f3a..1a0fbc397de1 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_REGULATOR_MAX77802) += max77802.o obj-$(CONFIG_REGULATOR_MC13783) += mc13783-regulator.o obj-$(CONFIG_REGULATOR_MC13892) += mc13892-regulator.o obj-$(CONFIG_REGULATOR_MC13XXX_CORE) += mc13xxx-regulator-core.o +obj-$(CONFIG_REGULATOR_MT6397) += mt6397-regulator.o obj-$(CONFIG_REGULATOR_QCOM_RPM) += qcom_rpm-regulator.o obj-$(CONFIG_REGULATOR_PALMAS) += palmas-regulator.o obj-$(CONFIG_REGULATOR_PFUZE100) += pfuze100-regulator.o diff --git a/drivers/regulator/mt6397-regulator.c b/drivers/regulator/mt6397-regulator.c new file mode 100644 index 000000000000..a5b2f4762677 --- /dev/null +++ b/drivers/regulator/mt6397-regulator.c @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2014 MediaTek Inc. + * Author: Flora Fu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * MT6397 regulators' information + * + * @desc: standard fields of regulator description. + * @qi: Mask for query enable signal status of regulators + * @vselon_reg: Register sections for hardware control mode of bucks + * @vselctrl_reg: Register for controlling the buck control mode. + * @vselctrl_mask: Mask for query buck's voltage control mode. + */ +struct mt6397_regulator_info { + struct regulator_desc desc; + u32 qi; + u32 vselon_reg; + u32 vselctrl_reg; + u32 vselctrl_mask; +}; + +#define MT6397_BUCK(match, vreg, min, max, step, volt_ranges, enreg, \ + vosel, vosel_mask, voselon, vosel_ctrl) \ +[MT6397_ID_##vreg] = { \ + .desc = { \ + .name = #vreg, \ + .of_match = of_match_ptr(match), \ + .ops = &mt6397_volt_range_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = MT6397_ID_##vreg, \ + .owner = THIS_MODULE, \ + .n_voltages = (max - min)/step + 1, \ + .linear_ranges = volt_ranges, \ + .n_linear_ranges = ARRAY_SIZE(volt_ranges), \ + .vsel_reg = vosel, \ + .vsel_mask = vosel_mask, \ + .enable_reg = enreg, \ + .enable_mask = BIT(0), \ + }, \ + .qi = BIT(13), \ + .vselon_reg = voselon, \ + .vselctrl_reg = vosel_ctrl, \ + .vselctrl_mask = BIT(1), \ +} + +#define MT6397_LDO(match, vreg, ldo_volt_table, enreg, enbit, vosel, \ + vosel_mask) \ +[MT6397_ID_##vreg] = { \ + .desc = { \ + .name = #vreg, \ + .of_match = of_match_ptr(match), \ + .ops = &mt6397_volt_table_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = MT6397_ID_##vreg, \ + .owner = THIS_MODULE, \ + .n_voltages = ARRAY_SIZE(ldo_volt_table), \ + .volt_table = ldo_volt_table, \ + .vsel_reg = vosel, \ + .vsel_mask = vosel_mask, \ + .enable_reg = enreg, \ + .enable_mask = BIT(enbit), \ + }, \ + .qi = BIT(15), \ +} + +#define MT6397_REG_FIXED(match, vreg, enreg, enbit, volt) \ +[MT6397_ID_##vreg] = { \ + .desc = { \ + .name = #vreg, \ + .of_match = of_match_ptr(match), \ + .ops = &mt6397_volt_fixed_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = MT6397_ID_##vreg, \ + .owner = THIS_MODULE, \ + .n_voltages = 1, \ + .enable_reg = enreg, \ + .enable_mask = BIT(enbit), \ + .min_uV = volt, \ + }, \ + .qi = BIT(15), \ +} + +static const struct regulator_linear_range buck_volt_range1[] = { + REGULATOR_LINEAR_RANGE(700000, 0, 0x7f, 6250), +}; + +static const struct regulator_linear_range buck_volt_range2[] = { + REGULATOR_LINEAR_RANGE(800000, 0, 0x7f, 6250), +}; + +static const struct regulator_linear_range buck_volt_range3[] = { + REGULATOR_LINEAR_RANGE(1500000, 0, 0x1f, 20000), +}; + +static const u32 ldo_volt_table1[] = { + 1500000, 1800000, 2500000, 2800000, +}; + +static const u32 ldo_volt_table2[] = { + 1800000, 3300000, +}; + +static const u32 ldo_volt_table3[] = { + 3000000, 3300000, +}; + +static const u32 ldo_volt_table4[] = { + 1220000, 1300000, 1500000, 1800000, 2500000, 2800000, 3000000, 3300000, +}; + +static const u32 ldo_volt_table5[] = { + 1200000, 1300000, 1500000, 1800000, 2500000, 2800000, 3000000, 3300000, +}; + +static const u32 ldo_volt_table5_v2[] = { + 1200000, 1000000, 1500000, 1800000, 2500000, 2800000, 3000000, 3300000, +}; + +static const u32 ldo_volt_table6[] = { + 1200000, 1300000, 1500000, 1800000, 2500000, 2800000, 3000000, 2000000, +}; + +static const u32 
ldo_volt_table7[] = { + 1300000, 1500000, 1800000, 2000000, 2500000, 2800000, 3000000, 3300000, +}; + +static int mt6397_get_status(struct regulator_dev *rdev) +{ + int ret; + u32 regval; + struct mt6397_regulator_info *info = rdev_get_drvdata(rdev); + + ret = regmap_read(rdev->regmap, info->desc.enable_reg, ®val); + if (ret != 0) { + dev_err(&rdev->dev, "Failed to get enable reg: %d\n", ret); + return ret; + } + + return (regval & info->qi) ? REGULATOR_STATUS_ON : REGULATOR_STATUS_OFF; +} + +static struct regulator_ops mt6397_volt_range_ops = { + .list_voltage = regulator_list_voltage_linear_range, + .map_voltage = regulator_map_voltage_linear_range, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .get_status = mt6397_get_status, +}; + +static struct regulator_ops mt6397_volt_table_ops = { + .list_voltage = regulator_list_voltage_table, + .map_voltage = regulator_map_voltage_iterate, + .set_voltage_sel = regulator_set_voltage_sel_regmap, + .get_voltage_sel = regulator_get_voltage_sel_regmap, + .set_voltage_time_sel = regulator_set_voltage_time_sel, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .get_status = mt6397_get_status, +}; + +static struct regulator_ops mt6397_volt_fixed_ops = { + .list_voltage = regulator_list_voltage_linear, + .enable = regulator_enable_regmap, + .disable = regulator_disable_regmap, + .is_enabled = regulator_is_enabled_regmap, + .get_status = mt6397_get_status, +}; + +/* The array is indexed by id(MT6397_ID_XXX) */ +static struct mt6397_regulator_info mt6397_regulators[] = { + MT6397_BUCK("buck_vpca15", VPCA15, 700000, 1493750, 6250, + buck_volt_range1, MT6397_VCA15_CON7, MT6397_VCA15_CON9, 0x7f, + MT6397_VCA15_CON10, MT6397_VCA15_CON5), + MT6397_BUCK("buck_vpca7", VPCA7, 700000, 1493750, 6250, + buck_volt_range1, MT6397_VPCA7_CON7, MT6397_VPCA7_CON9, 0x7f, + MT6397_VPCA7_CON10, MT6397_VPCA7_CON5), + MT6397_BUCK("buck_vsramca15", VSRAMCA15, 700000, 1493750, 6250, + buck_volt_range1, MT6397_VSRMCA15_CON7, MT6397_VSRMCA15_CON9, + 0x7f, MT6397_VSRMCA15_CON10, MT6397_VSRMCA15_CON5), + MT6397_BUCK("buck_vsramca7", VSRAMCA7, 700000, 1493750, 6250, + buck_volt_range1, MT6397_VSRMCA7_CON7, MT6397_VSRMCA7_CON9, + 0x7f, MT6397_VSRMCA7_CON10, MT6397_VSRMCA7_CON5), + MT6397_BUCK("buck_vcore", VCORE, 700000, 1493750, 6250, + buck_volt_range1, MT6397_VCORE_CON7, MT6397_VCORE_CON9, 0x7f, + MT6397_VCORE_CON10, MT6397_VCORE_CON5), + MT6397_BUCK("buck_vgpu", VGPU, 700000, 1493750, 6250, buck_volt_range1, + MT6397_VGPU_CON7, MT6397_VGPU_CON9, 0x7f, + MT6397_VGPU_CON10, MT6397_VGPU_CON5), + MT6397_BUCK("buck_vdrm", VDRM, 800000, 1593750, 6250, buck_volt_range2, + MT6397_VDRM_CON7, MT6397_VDRM_CON9, 0x7f, + MT6397_VDRM_CON10, MT6397_VDRM_CON5), + MT6397_BUCK("buck_vio18", VIO18, 1500000, 2120000, 20000, + buck_volt_range3, MT6397_VIO18_CON7, MT6397_VIO18_CON9, 0x1f, + MT6397_VIO18_CON10, MT6397_VIO18_CON5), + MT6397_REG_FIXED("ldo_vtcxo", VTCXO, MT6397_ANALDO_CON0, 10, 2800000), + MT6397_REG_FIXED("ldo_va28", VA28, MT6397_ANALDO_CON1, 14, 2800000), + MT6397_LDO("ldo_vcama", VCAMA, ldo_volt_table1, + MT6397_ANALDO_CON2, 15, MT6397_ANALDO_CON6, 0xC0), + MT6397_REG_FIXED("ldo_vio28", VIO28, MT6397_DIGLDO_CON0, 14, 2800000), + MT6397_REG_FIXED("ldo_vusb", VUSB, 
MT6397_DIGLDO_CON1, 14, 3300000), + MT6397_LDO("ldo_vmc", VMC, ldo_volt_table2, + MT6397_DIGLDO_CON2, 12, MT6397_DIGLDO_CON29, 0x10), + MT6397_LDO("ldo_vmch", VMCH, ldo_volt_table3, + MT6397_DIGLDO_CON3, 14, MT6397_DIGLDO_CON17, 0x80), + MT6397_LDO("ldo_vemc3v3", VEMC3V3, ldo_volt_table3, + MT6397_DIGLDO_CON4, 14, MT6397_DIGLDO_CON18, 0x10), + MT6397_LDO("ldo_vgp1", VGP1, ldo_volt_table4, + MT6397_DIGLDO_CON5, 15, MT6397_DIGLDO_CON19, 0xE0), + MT6397_LDO("ldo_vgp2", VGP2, ldo_volt_table5, + MT6397_DIGLDO_CON6, 15, MT6397_DIGLDO_CON20, 0xE0), + MT6397_LDO("ldo_vgp3", VGP3, ldo_volt_table5, + MT6397_DIGLDO_CON7, 15, MT6397_DIGLDO_CON21, 0xE0), + MT6397_LDO("ldo_vgp4", VGP4, ldo_volt_table5, + MT6397_DIGLDO_CON8, 15, MT6397_DIGLDO_CON22, 0xE0), + MT6397_LDO("ldo_vgp5", VGP5, ldo_volt_table6, + MT6397_DIGLDO_CON9, 15, MT6397_DIGLDO_CON23, 0xE0), + MT6397_LDO("ldo_vgp6", VGP6, ldo_volt_table5, + MT6397_DIGLDO_CON10, 15, MT6397_DIGLDO_CON33, 0xE0), + MT6397_LDO("ldo_vibr", VIBR, ldo_volt_table7, + MT6397_DIGLDO_CON24, 15, MT6397_DIGLDO_CON25, 0xE00), +}; + +static int mt6397_set_buck_vosel_reg(struct platform_device *pdev) +{ + struct mt6397_chip *mt6397 = dev_get_drvdata(pdev->dev.parent); + int i; + u32 regval; + + for (i = 0; i < MT6397_MAX_REGULATOR; i++) { + if (mt6397_regulators[i].vselctrl_reg) { + if (regmap_read(mt6397->regmap, + mt6397_regulators[i].vselctrl_reg, + ®val) < 0) { + dev_err(&pdev->dev, + "Failed to read buck ctrl\n"); + return -EIO; + } + + if (regval & mt6397_regulators[i].vselctrl_mask) { + mt6397_regulators[i].desc.vsel_reg = + mt6397_regulators[i].vselon_reg; + } + } + } + + return 0; +} + +static int mt6397_regulator_probe(struct platform_device *pdev) +{ + struct mt6397_chip *mt6397 = dev_get_drvdata(pdev->dev.parent); + struct regulator_config config = {}; + struct regulator_dev *rdev; + int i; + u32 reg_value, version; + + /* Query buck controller to select activated voltage register part */ + if (mt6397_set_buck_vosel_reg(pdev)) + return -EIO; + + /* Read PMIC chip revision to update constraints and voltage table */ + if (regmap_read(mt6397->regmap, MT6397_CID, ®_value) < 0) { + dev_err(&pdev->dev, "Failed to read Chip ID\n"); + return -EIO; + } + dev_info(&pdev->dev, "Chip ID = 0x%x\n", reg_value); + + version = (reg_value & 0xFF); + switch (version) { + case MT6397_REGULATOR_ID91: + mt6397_regulators[MT6397_ID_VGP2].desc.volt_table = + ldo_volt_table5_v2; + break; + default: + break; + } + + for (i = 0; i < MT6397_MAX_REGULATOR; i++) { + config.dev = &pdev->dev; + config.driver_data = &mt6397_regulators[i]; + config.regmap = mt6397->regmap; + rdev = devm_regulator_register(&pdev->dev, + &mt6397_regulators[i].desc, &config); + if (IS_ERR(rdev)) { + dev_err(&pdev->dev, "failed to register %s\n", + mt6397_regulators[i].desc.name); + return PTR_ERR(rdev); + } + } + + return 0; +} + +static struct platform_driver mt6397_regulator_driver = { + .driver = { + .name = "mt6397-regulator", + }, + .probe = mt6397_regulator_probe, +}; + +module_platform_driver(mt6397_regulator_driver); + +MODULE_AUTHOR("Flora Fu "); +MODULE_DESCRIPTION("Regulator Driver for MediaTek MT6397 PMIC"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:mt6397-regulator"); diff --git a/include/linux/regulator/mt6397-regulator.h b/include/linux/regulator/mt6397-regulator.h new file mode 100644 index 000000000000..30cc5963e265 --- /dev/null +++ b/include/linux/regulator/mt6397-regulator.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2014 MediaTek Inc. 
+ * Author: Flora Fu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __LINUX_REGULATOR_MT6397_H +#define __LINUX_REGULATOR_MT6397_H + +enum { + MT6397_ID_VPCA15 = 0, + MT6397_ID_VPCA7, + MT6397_ID_VSRAMCA15, + MT6397_ID_VSRAMCA7, + MT6397_ID_VCORE, + MT6397_ID_VGPU, + MT6397_ID_VDRM, + MT6397_ID_VIO18 = 7, + MT6397_ID_VTCXO, + MT6397_ID_VA28, + MT6397_ID_VCAMA, + MT6397_ID_VIO28, + MT6397_ID_VUSB, + MT6397_ID_VMC, + MT6397_ID_VMCH, + MT6397_ID_VEMC3V3, + MT6397_ID_VGP1, + MT6397_ID_VGP2, + MT6397_ID_VGP3, + MT6397_ID_VGP4, + MT6397_ID_VGP5, + MT6397_ID_VGP6, + MT6397_ID_VIBR, + MT6397_ID_RG_MAX, +}; + +#define MT6397_MAX_REGULATOR MT6397_ID_RG_MAX +#define MT6397_REGULATOR_ID97 0x97 +#define MT6397_REGULATOR_ID91 0x91 + +#endif /* __LINUX_REGULATOR_MT6397_H */ -- cgit v1.2.3 
From 0628ee7c81a8ced9b10f9ee300707f7f79fdecf1 Mon Sep 17 00:00:00 2001 From: Nicholas Krause Date: Sun, 21 Dec 2014 22:36:37 -0500 Subject: libata: s/ata_id_removeable()/ata_id_removable()/ Fix the misspelling "removeable" by renaming ata_id_removeable() to ata_id_removable(), both where it is defined in ata.h and where it is called in libata-scsi.c. Signed-off-by: Nicholas Krause Signed-off-by: Tejun Heo --- drivers/ata/libata-scsi.c | 4 ++-- include/linux/ata.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index e364e86e84d7..7659d6468303 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1995,8 +1995,8 @@ static unsigned int ata_scsiop_inq_std(struct ata_scsi_args *args, u8 *rbuf) VPRINTK("ENTER\n"); - /* set scsi removeable (RMB) bit per ata bit */ - if (ata_id_removeable(args->id)) + /* set scsi removable (RMB) bit per ata bit */ + if (ata_id_removable(args->id)) hdr[1] |= (1 << 7); if (args->dev->class == ATA_DEV_ZAC) { diff --git a/include/linux/ata.h b/include/linux/ata.h index f2f4d8da97c0..1648026e06b4 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -503,7 +503,7 @@ struct ata_bmdma_prd { #define ata_id_has_dma(id) ((id)[ATA_ID_CAPABILITY] & (1 << 8)) #define ata_id_has_ncq(id) ((id)[ATA_ID_SATA_CAPABILITY] & (1 << 8)) #define ata_id_queue_depth(id) (((id)[ATA_ID_QUEUE_DEPTH] & 0x1f) + 1) -#define ata_id_removeable(id) ((id)[ATA_ID_CONFIG] & (1 << 7)) +#define ata_id_removable(id) ((id)[ATA_ID_CONFIG] & (1 << 7)) #define ata_id_has_atapi_AN(id) \ ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ -- cgit v1.2.3 
From 74d23cc704d19732e70ef1579a669f7d5f09dd9a Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Sun, 21 Dec 2014 19:46:56 +0100 Subject: time: move the timecounter/cyclecounter code into its own file. The timecounter code has almost nothing to do with the clocksource code. Let it live in its own file. This will help isolate the timecounter users from the clocksource users in the source tree.
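For illustration (not part of this patch), the usage pattern that now needs only the new header; the device, its read callback and the scaling factors are hypothetical, and note that CLOCKSOURCE_MASK() still lives in linux/clocksource.h:

	#include <linux/timecounter.h>

	static cycle_t foo_read(const struct cyclecounter *cc)
	{
		return foo_hw_read_counter();   /* hypothetical free-running HW counter */
	}

	static struct cyclecounter foo_cc = {
		.read  = foo_read,
		.mask  = CLOCKSOURCE_MASK(48),  /* 48-bit counter */
		.mult  = 8,                     /* e.g. 125 MHz: 8 ns per cycle */
		.shift = 0,
	};

	static struct timecounter foo_tc;

	/* once, at init time: */
	timecounter_init(&foo_tc, &foo_cc, ktime_to_ns(ktime_get_real()));

	/* later, e.g. when timestamping a packet: */
	u64 ns = timecounter_cyc2time(&foo_tc, hw_raw_timestamp);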
Signed-off-by: Richard Cochran Acked-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe.h | 2 +- drivers/net/ethernet/broadcom/bnx2x/bnx2x.h | 2 +- drivers/net/ethernet/freescale/fec.h | 1 + drivers/net/ethernet/intel/e1000e/e1000.h | 2 +- drivers/net/ethernet/intel/igb/igb.h | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe.h | 2 +- drivers/net/ethernet/ti/cpts.h | 1 + include/clocksource/arm_arch_timer.h | 2 +- include/linux/clocksource.h | 102 ----------------------- include/linux/mlx4/device.h | 2 +- include/linux/timecounter.h | 122 ++++++++++++++++++++++++++++ include/linux/types.h | 3 + kernel/time/Makefile | 2 +- kernel/time/clocksource.c | 76 ----------------- kernel/time/timecounter.c | 95 ++++++++++++++++++++++ sound/pci/hda/hda_priv.h | 2 +- 16 files changed, 231 insertions(+), 187 deletions(-) create mode 100644 include/linux/timecounter.h create mode 100644 kernel/time/timecounter.c (limited to 'include/linux') diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h index f9ec762ac3f0..2af6affc35a7 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe.h +++ b/drivers/net/ethernet/amd/xgbe/xgbe.h @@ -124,7 +124,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h index c3a6072134f5..792ba72fb5c8 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h @@ -22,7 +22,7 @@ #include #include -#include +#include /* compilation time flags */ diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index 469691ad4a1e..df8bbddaeb37 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -16,6 +16,7 @@ #include #include #include +#include #if defined(CONFIG_M523x) || defined(CONFIG_M527x) || defined(CONFIG_M528x) || \ defined(CONFIG_M520x) || defined(CONFIG_M532x) || \ diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index 7785240a0da1..9416e5a7e0c8 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 82d891e183b1..ee22da391474 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -29,7 +29,7 @@ #include "e1000_mac.h" #include "e1000_82575.h" -#include +#include #include #include #include diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index b6137be43920..38fc64cf5dca 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -38,7 +38,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/net/ethernet/ti/cpts.h b/drivers/net/ethernet/ti/cpts.h index 1a581ef7eee8..69a46b92c7d6 100644 --- a/drivers/net/ethernet/ti/cpts.h +++ b/drivers/net/ethernet/ti/cpts.h @@ -27,6 +27,7 @@ #include #include #include +#include struct cpsw_cpts { u32 idver; /* Identification and version */ diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index 6d26b40cbf5d..9916d0e4eff5 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -16,7 +16,7 @@ #ifndef __CLKSOURCE_ARM_ARCH_TIMER_H #define __CLKSOURCE_ARM_ARCH_TIMER_H -#include +#include #include #define
ARCH_TIMER_CTRL_ENABLE (1 << 0) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index abcafaa20b86..9c78d15d33e4 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -18,8 +18,6 @@ #include #include -/* clocksource cycle base type */ -typedef u64 cycle_t; struct clocksource; struct module; @@ -27,106 +25,6 @@ struct module; #include #endif -/** - * struct cyclecounter - hardware abstraction for a free running counter - * Provides completely state-free accessors to the underlying hardware. - * Depending on which hardware it reads, the cycle counter may wrap - * around quickly. Locking rules (if necessary) have to be defined - * by the implementor and user of specific instances of this API. - * - * @read: returns the current cycle value - * @mask: bitmask for two's complement - * subtraction of non 64 bit counters, - * see CLOCKSOURCE_MASK() helper macro - * @mult: cycle to nanosecond multiplier - * @shift: cycle to nanosecond divisor (power of two) - */ -struct cyclecounter { - cycle_t (*read)(const struct cyclecounter *cc); - cycle_t mask; - u32 mult; - u32 shift; -}; - -/** - * struct timecounter - layer above a %struct cyclecounter which counts nanoseconds - * Contains the state needed by timecounter_read() to detect - * cycle counter wrap around. Initialize with - * timecounter_init(). Also used to convert cycle counts into the - * corresponding nanosecond counts with timecounter_cyc2time(). Users - * of this code are responsible for initializing the underlying - * cycle counter hardware, locking issues and reading the time - * more often than the cycle counter wraps around. The nanosecond - * counter will only wrap around after ~585 years. - * - * @cc: the cycle counter used by this instance - * @cycle_last: most recent cycle counter value seen by - * timecounter_read() - * @nsec: continuously increasing count - */ -struct timecounter { - const struct cyclecounter *cc; - cycle_t cycle_last; - u64 nsec; -}; - -/** - * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds - * @cc: Pointer to cycle counter. - * @cycles: Cycles - * - * XXX - This could use some mult_lxl_ll() asm optimization. Same code - * as in cyc2ns, but with unsigned result. - */ -static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, - cycle_t cycles) -{ - u64 ret = (u64)cycles; - ret = (ret * cc->mult) >> cc->shift; - return ret; -} - -/** - * timecounter_init - initialize a time counter - * @tc: Pointer to time counter which is to be initialized/reset - * @cc: A cycle counter, ready to be used. - * @start_tstamp: Arbitrary initial time stamp. - * - * After this call the current cycle register (roughly) corresponds to - * the initial time stamp. Every call to timecounter_read() increments - * the time stamp counter by the number of elapsed nanoseconds. - */ -extern void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, - u64 start_tstamp); - -/** - * timecounter_read - return nanoseconds elapsed since timecounter_init() - * plus the initial time stamp - * @tc: Pointer to time counter. - * - * In other words, keeps track of time since the same epoch as - * the function which generated the initial time stamp. - */ -extern u64 timecounter_read(struct timecounter *tc); - -/** - * timecounter_cyc2time - convert a cycle counter to same - * time base as values returned by - * timecounter_read() - * @tc: Pointer to time counter. 
- * @cycle_tstamp: a value returned by tc->cc->read() - * - * Cycle counts that are converted correctly as long as they - * fall into the interval [-1/2 max cycle count, +1/2 max cycle count], - * with "max cycle count" == cs->mask+1. - * - * This allows conversion of cycle counter values which were generated - * in the past. - */ -extern u64 timecounter_cyc2time(struct timecounter *tc, - cycle_t cycle_tstamp); - /** * struct clocksource - hardware abstraction for a free running counter * Provides mostly state-free accessors to the underlying hardware. diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 25c791e295fd..f1e41b33462f 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -42,7 +42,7 @@ #include -#include +#include #define MAX_MSIX_P_PORT 17 #define MAX_MSIX 64 diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h new file mode 100644 index 000000000000..146f07a6651b --- /dev/null +++ b/include/linux/timecounter.h @@ -0,0 +1,122 @@ +/* + * linux/include/linux/timecounter.h + * + * based on code that migrated away from + * linux/include/linux/clocksource.h + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#ifndef _LINUX_TIMECOUNTER_H +#define _LINUX_TIMECOUNTER_H + +#include + +/** + * struct cyclecounter - hardware abstraction for a free running counter + * Provides completely state-free accessors to the underlying hardware. + * Depending on which hardware it reads, the cycle counter may wrap + * around quickly. Locking rules (if necessary) have to be defined + * by the implementor and user of specific instances of this API. + * + * @read: returns the current cycle value + * @mask: bitmask for two's complement + * subtraction of non 64 bit counters, + * see CLOCKSOURCE_MASK() helper macro + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) + */ +struct cyclecounter { + cycle_t (*read)(const struct cyclecounter *cc); + cycle_t mask; + u32 mult; + u32 shift; +}; + +/** + * struct timecounter - layer above a %struct cyclecounter which counts nanoseconds + * Contains the state needed by timecounter_read() to detect + * cycle counter wrap around. Initialize with + * timecounter_init(). Also used to convert cycle counts into the + * corresponding nanosecond counts with timecounter_cyc2time(). Users + * of this code are responsible for initializing the underlying + * cycle counter hardware, locking issues and reading the time + * more often than the cycle counter wraps around. The nanosecond + * counter will only wrap around after ~585 years. + * + * @cc: the cycle counter used by this instance + * @cycle_last: most recent cycle counter value seen by + * timecounter_read() + * @nsec: continuously increasing count + */ +struct timecounter { + const struct cyclecounter *cc; + cycle_t cycle_last; + u64 nsec; +}; + +/** + * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds + * @cc: Pointer to cycle counter. + * @cycles: Cycles + * + * XXX - This could use some mult_lxl_ll() asm optimization. 
Same code + * as in cyc2ns, but with unsigned result. + */ +static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, + cycle_t cycles) +{ + u64 ret = (u64)cycles; + ret = (ret * cc->mult) >> cc->shift; + return ret; +} + +/** + * timecounter_init - initialize a time counter + * @tc: Pointer to time counter which is to be initialized/reset + * @cc: A cycle counter, ready to be used. + * @start_tstamp: Arbitrary initial time stamp. + * + * After this call the current cycle register (roughly) corresponds to + * the initial time stamp. Every call to timecounter_read() increments + * the time stamp counter by the number of elapsed nanoseconds. + */ +extern void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp); + +/** + * timecounter_read - return nanoseconds elapsed since timecounter_init() + * plus the initial time stamp + * @tc: Pointer to time counter. + * + * In other words, keeps track of time since the same epoch as + * the function which generated the initial time stamp. + */ +extern u64 timecounter_read(struct timecounter *tc); + +/** + * timecounter_cyc2time - convert a cycle counter to same + * time base as values returned by + * timecounter_read() + * @tc: Pointer to time counter. + * @cycle_tstamp: a value returned by tc->cc->read() + * + * Cycle counts that are converted correctly as long as they + * fall into the interval [-1/2 max cycle count, +1/2 max cycle count], + * with "max cycle count" == cs->mask+1. + * + * This allows conversion of cycle counter values which were generated + * in the past. + */ +extern u64 timecounter_cyc2time(struct timecounter *tc, + cycle_t cycle_tstamp); + +#endif diff --git a/include/linux/types.h b/include/linux/types.h index a0bb7048687f..62323825cff9 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -213,5 +213,8 @@ struct callback_head { }; #define rcu_head callback_head +/* clocksource cycle base type */ +typedef u64 cycle_t; + #endif /* __ASSEMBLY__ */ #endif /* _LINUX_TYPES_H */ diff --git a/kernel/time/Makefile b/kernel/time/Makefile index f622cf28628a..c09c07817d7a 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,6 +1,6 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o -obj-y += timeconv.o posix-clock.o alarmtimer.o +obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index b79f39bda7e1..4892352f0e49 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -34,82 +34,6 @@ #include "tick-internal.h" #include "timekeeping_internal.h" -void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, - u64 start_tstamp) -{ - tc->cc = cc; - tc->cycle_last = cc->read(cc); - tc->nsec = start_tstamp; -} -EXPORT_SYMBOL_GPL(timecounter_init); - -/** - * timecounter_read_delta - get nanoseconds since last call of this function - * @tc: Pointer to time counter - * - * When the underlying cycle counter runs over, this will be handled - * correctly as long as it does not run over more than once between - * calls. - * - * The first call to this function for a new time counter initializes - * the time tracking and returns an undefined result. 
- */ -static u64 timecounter_read_delta(struct timecounter *tc) -{ - cycle_t cycle_now, cycle_delta; - u64 ns_offset; - - /* read cycle counter: */ - cycle_now = tc->cc->read(tc->cc); - - /* calculate the delta since the last timecounter_read_delta(): */ - cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; - - /* convert to nanoseconds: */ - ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); - - /* update time stamp of timecounter_read_delta() call: */ - tc->cycle_last = cycle_now; - - return ns_offset; -} - -u64 timecounter_read(struct timecounter *tc) -{ - u64 nsec; - - /* increment time by nanoseconds since last call */ - nsec = timecounter_read_delta(tc); - nsec += tc->nsec; - tc->nsec = nsec; - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_read); - -u64 timecounter_cyc2time(struct timecounter *tc, - cycle_t cycle_tstamp) -{ - u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; - u64 nsec; - - /* - * Instead of always treating cycle_tstamp as more recent - * than tc->cycle_last, detect when it is too far in the - * future and treat it as old time stamp instead. - */ - if (cycle_delta > tc->cc->mask / 2) { - cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; - nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); - } else { - nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; - } - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_cyc2time); - /** * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks * @mult: pointer to mult variable diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c new file mode 100644 index 000000000000..59a1ec3a57cb --- /dev/null +++ b/kernel/time/timecounter.c @@ -0,0 +1,95 @@ +/* + * linux/kernel/time/timecounter.c + * + * based on code that migrated away from + * linux/kernel/time/clocksource.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include + +void timecounter_init(struct timecounter *tc, + const struct cyclecounter *cc, + u64 start_tstamp) +{ + tc->cc = cc; + tc->cycle_last = cc->read(cc); + tc->nsec = start_tstamp; +} +EXPORT_SYMBOL_GPL(timecounter_init); + +/** + * timecounter_read_delta - get nanoseconds since last call of this function + * @tc: Pointer to time counter + * + * When the underlying cycle counter runs over, this will be handled + * correctly as long as it does not run over more than once between + * calls. + * + * The first call to this function for a new time counter initializes + * the time tracking and returns an undefined result. 
+ */ +static u64 timecounter_read_delta(struct timecounter *tc) +{ + cycle_t cycle_now, cycle_delta; + u64 ns_offset; + + /* read cycle counter: */ + cycle_now = tc->cc->read(tc->cc); + + /* calculate the delta since the last timecounter_read_delta(): */ + cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; + + /* convert to nanoseconds: */ + ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); + + /* update time stamp of timecounter_read_delta() call: */ + tc->cycle_last = cycle_now; + + return ns_offset; +} + +u64 timecounter_read(struct timecounter *tc) +{ + u64 nsec; + + /* increment time by nanoseconds since last call */ + nsec = timecounter_read_delta(tc); + nsec += tc->nsec; + tc->nsec = nsec; + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_read); + +u64 timecounter_cyc2time(struct timecounter *tc, + cycle_t cycle_tstamp) +{ + u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; + u64 nsec; + + /* + * Instead of always treating cycle_tstamp as more recent + * than tc->cycle_last, detect when it is too far in the + * future and treat it as old time stamp instead. + */ + if (cycle_delta > tc->cc->mask / 2) { + cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; + nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); + } else { + nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; + } + + return nsec; +} +EXPORT_SYMBOL_GPL(timecounter_cyc2time); diff --git a/sound/pci/hda/hda_priv.h b/sound/pci/hda/hda_priv.h index 166e3e84b963..daf458299753 100644 --- a/sound/pci/hda/hda_priv.h +++ b/sound/pci/hda/hda_priv.h @@ -15,7 +15,7 @@ #ifndef __SOUND_HDA_PRIV_H #define __SOUND_HDA_PRIV_H -#include +#include #include #include -- cgit v1.2.3 From 796c1efd6fa0ed696d550b68f4410ab1a1749d01 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Sun, 21 Dec 2014 19:46:57 +0100 Subject: timecounter: provide a helper function to shift the time. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some PTP Hardware Clock drivers use a struct timecounter to represent their clock. To adjust the time by a given offset, these drivers all perform a two step read/write of their timecounter. However, it is better and simpler just to adjust the offset in one step. This patch introduces a little routine to help drivers implement the adjtime method. Suggested-by: Janusz Użycki Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/timecounter.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h index 146f07a6651b..af3dfa4e90f0 100644 --- a/include/linux/timecounter.h +++ b/include/linux/timecounter.h @@ -78,6 +78,15 @@ static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, return ret; } +/** + * timecounter_adjtime - Shifts the time of the clock. + * @delta: Desired change in nanoseconds. 
+ */ +static inline void timecounter_adjtime(struct timecounter *tc, s64 delta) +{ + tc->nsec += delta; +} + /** * timecounter_init - initialize a time counter * @tc: Pointer to time counter which is to be initialized/reset -- cgit v1.2.3 From 2eebdde6528a722fbf8e2cffcf7aa52cbb4c2de0 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Sun, 21 Dec 2014 19:47:06 +0100 Subject: timecounter: keep track of accumulated fractional nanoseconds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current timecounter implementation will drop a variable amount of resolution, depending on the magnitude of the time delta. In other words, reading the clock too often or too close to a time stamp conversion will introduce errors into the time values. This patch fixes the issue by introducing a fractional nanosecond field that accumulates the low order bits. Reported-by: Janusz Użycki Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_clock.c | 4 ++-- include/linux/timecounter.h | 19 ++++++++++------ kernel/time/timecounter.c | 31 +++++++++++++++++++++------ virt/kvm/arm/arch_timer.c | 3 ++- 4 files changed, 40 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_clock.c b/drivers/net/ethernet/mellanox/mlx4/en_clock.c index df35d0e1b899..e9cce4f72b24 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_clock.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_clock.c @@ -240,7 +240,7 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev) { struct mlx4_dev *dev = mdev->dev; unsigned long flags; - u64 ns; + u64 ns, zero = 0; rwlock_init(&mdev->clock_lock); @@ -265,7 +265,7 @@ void mlx4_en_init_timestamp(struct mlx4_en_dev *mdev) /* Calculate period in seconds to call the overflow watchdog - to make * sure counter is checked at least once every wrap around. */ - ns = cyclecounter_cyc2ns(&mdev->cycles, mdev->cycles.mask); + ns = cyclecounter_cyc2ns(&mdev->cycles, mdev->cycles.mask, zero, &zero); do_div(ns, NSEC_PER_SEC / 2 / HZ); mdev->overflow_period = ns; diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h index af3dfa4e90f0..74f45496e6d1 100644 --- a/include/linux/timecounter.h +++ b/include/linux/timecounter.h @@ -55,27 +55,32 @@ struct cyclecounter { * @cycle_last: most recent cycle counter value seen by * timecounter_read() * @nsec: continuously increasing count + * @mask: bit mask for maintaining the 'frac' field + * @frac: accumulated fractional nanoseconds */ struct timecounter { const struct cyclecounter *cc; cycle_t cycle_last; u64 nsec; + u64 mask; + u64 frac; }; /** * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds * @cc: Pointer to cycle counter. * @cycles: Cycles - * - * XXX - This could use some mult_lxl_ll() asm optimization. Same code - * as in cyc2ns, but with unsigned result. + * @mask: bit mask for maintaining the 'frac' field + * @frac: pointer to storage for the fractional nanoseconds. 
*/ static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc, - cycle_t cycles) + cycle_t cycles, u64 mask, u64 *frac) { - u64 ret = (u64)cycles; - ret = (ret * cc->mult) >> cc->shift; - return ret; + u64 ns = (u64) cycles; + + ns = (ns * cc->mult) + *frac; + *frac = ns & mask; + return ns >> cc->shift; } /** diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index 59a1ec3a57cb..4687b3104bae 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -25,6 +25,8 @@ void timecounter_init(struct timecounter *tc, tc->cc = cc; tc->cycle_last = cc->read(cc); tc->nsec = start_tstamp; + tc->mask = (1ULL << cc->shift) - 1; + tc->frac = 0; } EXPORT_SYMBOL_GPL(timecounter_init); @@ -51,7 +53,8 @@ static u64 timecounter_read_delta(struct timecounter *tc) cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask; /* convert to nanoseconds: */ - ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta); + ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta, + tc->mask, &tc->frac); /* update time stamp of timecounter_read_delta() call: */ tc->cycle_last = cycle_now; @@ -72,22 +75,36 @@ u64 timecounter_read(struct timecounter *tc) } EXPORT_SYMBOL_GPL(timecounter_read); +/* + * This is like cyclecounter_cyc2ns(), but it is used for computing a + * time previous to the time stored in the cycle counter. + */ +static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, + cycle_t cycles, u64 mask, u64 frac) +{ + u64 ns = (u64) cycles; + + ns = ((ns * cc->mult) - frac) >> cc->shift; + + return ns; +} + u64 timecounter_cyc2time(struct timecounter *tc, cycle_t cycle_tstamp) { - u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; - u64 nsec; + u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; + u64 nsec = tc->nsec, frac = tc->frac; /* * Instead of always treating cycle_tstamp as more recent * than tc->cycle_last, detect when it is too far in the * future and treat it as old time stamp instead. */ - if (cycle_delta > tc->cc->mask / 2) { - cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; - nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta); + if (delta > tc->cc->mask / 2) { + delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; + nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac); } else { - nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec; + nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac); } return nsec; diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 1c0772b340d8..6e54f3542126 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -152,7 +152,8 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) return; } - ns = cyclecounter_cyc2ns(timecounter->cc, cval - now); + ns = cyclecounter_cyc2ns(timecounter->cc, cval - now, timecounter->mask, + &timecounter->frac); timer_arm(timer, ns); } -- cgit v1.2.3 From dd450777990baae668c1143064f2f234dbab1b9b Mon Sep 17 00:00:00 2001 From: Dmitry Eremin-Solenikov Date: Wed, 24 Dec 2014 01:14:14 +0300 Subject: arm: sa1100: move irda header to linux/platform_data In the end asm/mach/irda.h header is not used by anybody except sa1100. Move the header to the platform data includes dir and rename it to irda-sa11x0.h. Signed-off-by: Dmitry Eremin-Solenikov Signed-off-by: David S. 
Miller --- arch/arm/include/asm/mach/irda.h | 20 -------------------- arch/arm/mach-sa1100/assabet.c | 2 +- arch/arm/mach-sa1100/collie.c | 2 +- arch/arm/mach-sa1100/h3100.c | 2 +- arch/arm/mach-sa1100/h3600.c | 2 +- drivers/net/irda/sa1100_ir.c | 2 +- include/linux/platform_data/irda-sa11x0.h | 20 ++++++++++++++++++++ 7 files changed, 25 insertions(+), 25 deletions(-) delete mode 100644 arch/arm/include/asm/mach/irda.h create mode 100644 include/linux/platform_data/irda-sa11x0.h (limited to 'include/linux') diff --git a/arch/arm/include/asm/mach/irda.h b/arch/arm/include/asm/mach/irda.h deleted file mode 100644 index 38f77b5e56cf..000000000000 --- a/arch/arm/include/asm/mach/irda.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * arch/arm/include/asm/mach/irda.h - * - * Copyright (C) 2004 Russell King. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ -#ifndef __ASM_ARM_MACH_IRDA_H -#define __ASM_ARM_MACH_IRDA_H - -struct irda_platform_data { - int (*startup)(struct device *); - void (*shutdown)(struct device *); - int (*set_power)(struct device *, unsigned int state); - void (*set_speed)(struct device *, unsigned int speed); -}; - -#endif diff --git a/arch/arm/mach-sa1100/assabet.c b/arch/arm/mach-sa1100/assabet.c index 7dd894ece9ae..d28ecb9ef172 100644 --- a/arch/arm/mach-sa1100/assabet.c +++ b/arch/arm/mach-sa1100/assabet.c @@ -37,7 +37,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-sa1100/collie.c b/arch/arm/mach-sa1100/collie.c index b90c7d828391..7fcbe3d119c7 100644 --- a/arch/arm/mach-sa1100/collie.c +++ b/arch/arm/mach-sa1100/collie.c @@ -43,7 +43,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/mach-sa1100/h3100.c b/arch/arm/mach-sa1100/h3100.c index 3c43219bc881..c6b412054a3c 100644 --- a/arch/arm/mach-sa1100/h3100.c +++ b/arch/arm/mach-sa1100/h3100.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/arch/arm/mach-sa1100/h3600.c b/arch/arm/mach-sa1100/h3600.c index 5be54c214c7c..118338efd790 100644 --- a/arch/arm/mach-sa1100/h3600.c +++ b/arch/arm/mach-sa1100/h3600.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/drivers/net/irda/sa1100_ir.c b/drivers/net/irda/sa1100_ir.c index 7b17fa2114e1..b6e44ff4e373 100644 --- a/drivers/net/irda/sa1100_ir.c +++ b/drivers/net/irda/sa1100_ir.c @@ -38,7 +38,7 @@ #include #include -#include +#include static int power_level = 3; static int tx_lpm; diff --git a/include/linux/platform_data/irda-sa11x0.h b/include/linux/platform_data/irda-sa11x0.h new file mode 100644 index 000000000000..38f77b5e56cf --- /dev/null +++ b/include/linux/platform_data/irda-sa11x0.h @@ -0,0 +1,20 @@ +/* + * arch/arm/include/asm/mach/irda.h + * + * Copyright (C) 2004 Russell King. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#ifndef __ASM_ARM_MACH_IRDA_H +#define __ASM_ARM_MACH_IRDA_H + +struct irda_platform_data { + int (*startup)(struct device *); + void (*shutdown)(struct device *); + int (*set_power)(struct device *, unsigned int state); + void (*set_speed)(struct device *, unsigned int speed); +}; + +#endif -- cgit v1.2.3 From de40ed31b3c577cefd7b54972365a272ecbe9dd6 Mon Sep 17 00:00:00 2001 From: Nimrod Andy Date: Wed, 24 Dec 2014 17:30:39 +0800 Subject: net: fec: add Wake-on-LAN support Add support for Wake-on-LAN using Magic Packet. The ENET IP supports a low-power sleep mode; when the system enters suspend, a magic packet can wake it up even if all SoC clocks are gated. The patch does the following: - flag the device as a wakeup source for the system, as well as its Wake-on-LAN interrupt - prepare the hardware for entering WoL mode - add the standard ethtool WOL interface - enable the ENET interrupt to wake us Tested on i.MX6q/dl sabresd and sabreauto boards, and on i.MX6SX arm2 boards. Signed-off-by: Fugang Duan Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/fsl-fec.txt | 2 + drivers/net/ethernet/freescale/fec.h | 2 + drivers/net/ethernet/freescale/fec_main.c | 104 +++++++++++++++++++--- include/linux/fec.h | 1 + 4 files changed, 99 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/net/fsl-fec.txt b/Documentation/devicetree/bindings/net/fsl-fec.txt index 0c8775c45798..a9eb611bee68 100644 --- a/Documentation/devicetree/bindings/net/fsl-fec.txt +++ b/Documentation/devicetree/bindings/net/fsl-fec.txt @@ -22,6 +22,8 @@ Optional properties: - fsl,num-rx-queues : The property is valid for enet-avb IP, which supports hw multi queues. Should specify the rx queue number, otherwise set rx queue number to 1. +- fsl,magic-packet : If present, indicates that the hardware supports waking + up via magic packet.
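The driver-side plumbing for the points above follows the standard kernel wakeup-source pattern. A minimal sketch of an ethtool set_wol handler, assuming hypothetical sketch_priv fields (wol_enabled, wake_irq) rather than the actual fec_enet_private layout:

#include <linux/ethtool.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/pm_wakeup.h>

/* illustrative private state; not the real fec fields */
struct sketch_priv {
	bool wol_enabled;
	int wake_irq;
};

static int sketch_set_wol(struct net_device *ndev, struct ethtool_wolinfo *wol)
{
	struct sketch_priv *priv = netdev_priv(ndev);

	/* only magic-packet wakeup is assumed to be supported here */
	if (wol->wolopts & ~WAKE_MAGIC)
		return -EINVAL;

	/* flag the device as a wakeup source for the system */
	device_set_wakeup_enable(&ndev->dev, wol->wolopts & WAKE_MAGIC);

	/* arm or disarm the Wake-on-LAN interrupt accordingly */
	priv->wol_enabled = wol->wolopts & WAKE_MAGIC;
	if (priv->wol_enabled)
		enable_irq_wake(priv->wake_irq);
	else
		disable_irq_wake(priv->wake_irq);

	return 0;
}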
Optional subnodes: - mdio : specifies the mdio bus in the FEC, used as a container for phy nodes diff --git a/drivers/net/ethernet/freescale/fec.h b/drivers/net/ethernet/freescale/fec.h index df8bbddaeb37..d77a96fdf1dd 100644 --- a/drivers/net/ethernet/freescale/fec.h +++ b/drivers/net/ethernet/freescale/fec.h @@ -357,6 +357,7 @@ struct bufdesc_ex { #define FEC_ENET_RXB ((uint)0x01000000) /* A buffer was received */ #define FEC_ENET_MII ((uint)0x00800000) /* MII interrupt */ #define FEC_ENET_EBERR ((uint)0x00400000) /* SDMA bus error */ +#define FEC_ENET_WAKEUP ((uint)0x00020000) /* Wakeup request */ #define FEC_ENET_TXF (FEC_ENET_TXF_0 | FEC_ENET_TXF_1 | FEC_ENET_TXF_2) #define FEC_ENET_RXF (FEC_ENET_RXF_0 | FEC_ENET_RXF_1 | FEC_ENET_RXF_2) #define FEC_ENET_TS_AVAIL ((uint)0x00010000) @@ -512,6 +513,7 @@ struct fec_enet_private { int irq[FEC_IRQ_NUM]; bool bufdesc_ex; int pause_flag; + int wol_flag; u32 quirks; struct napi_struct napi; diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 5ebdf8dc8a31..49cd358c30fa 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -187,6 +187,9 @@ MODULE_PARM_DESC(macaddr, "FEC Ethernet MAC address"); #define FEC_MMFR_RA(v) ((v & 0x1f) << 18) #define FEC_MMFR_TA (2 << 16) #define FEC_MMFR_DATA(v) (v & 0xffff) +/* FEC ECR bits definition */ +#define FEC_ECR_MAGICEN (1 << 2) +#define FEC_ECR_SLEEP (1 << 3) #define FEC_MII_TIMEOUT 30000 /* us */ @@ -195,6 +198,9 @@ MODULE_PARM_DESC(macaddr, "FEC Ethernet MAC address"); #define FEC_PAUSE_FLAG_AUTONEG 0x1 #define FEC_PAUSE_FLAG_ENABLE 0x2 +#define FEC_WOL_HAS_MAGIC_PACKET (0x1 << 0) +#define FEC_WOL_FLAG_ENABLE (0x1 << 1) +#define FEC_WOL_FLAG_SLEEP_ON (0x1 << 2) #define COPYBREAK_DEFAULT 256 @@ -1089,7 +1095,9 @@ static void fec_stop(struct net_device *ndev) { struct fec_enet_private *fep = netdev_priv(ndev); + struct fec_platform_data *pdata = fep->pdev->dev.platform_data; u32 rmii_mode = readl(fep->hwp + FEC_R_CNTRL) & (1 << 8); + u32 val; /* We cannot expect a graceful transmit stop without link !!! */ if (fep->link) { @@ -1103,17 +1111,28 @@ fec_stop(struct net_device *ndev) * For i.MX6SX SOC, enet use AXI bus, we use disable MAC * instead of reset MAC itself. 
*/ - if (fep->quirks & FEC_QUIRK_HAS_AVB) { - writel(0, fep->hwp + FEC_ECNTRL); + if (!(fep->wol_flag & FEC_WOL_FLAG_SLEEP_ON)) { + if (fep->quirks & FEC_QUIRK_HAS_AVB) { + writel(0, fep->hwp + FEC_ECNTRL); + } else { + writel(1, fep->hwp + FEC_ECNTRL); + udelay(10); + } + writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK); } else { - writel(1, fep->hwp + FEC_ECNTRL); - udelay(10); + writel(FEC_DEFAULT_IMASK | FEC_ENET_WAKEUP, fep->hwp + FEC_IMASK); + val = readl(fep->hwp + FEC_ECNTRL); + val |= (FEC_ECR_MAGICEN | FEC_ECR_SLEEP); + writel(val, fep->hwp + FEC_ECNTRL); + + if (pdata && pdata->sleep_mode_enable) + pdata->sleep_mode_enable(true); } writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED); - writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK); /* We have to keep ENET enabled to have MII interrupt stay working */ - if (fep->quirks & FEC_QUIRK_ENET_MAC) { + if (fep->quirks & FEC_QUIRK_ENET_MAC && + !(fep->wol_flag & FEC_WOL_FLAG_SLEEP_ON)) { writel(2, fep->hwp + FEC_ECNTRL); writel(rmii_mode, fep->hwp + FEC_R_CNTRL); } @@ -2427,6 +2446,44 @@ static int fec_enet_set_tunable(struct net_device *netdev, return ret; } +static void +fec_enet_get_wol(struct net_device *ndev, struct ethtool_wolinfo *wol) +{ + struct fec_enet_private *fep = netdev_priv(ndev); + + if (fep->wol_flag & FEC_WOL_HAS_MAGIC_PACKET) { + wol->supported = WAKE_MAGIC; + wol->wolopts = fep->wol_flag & FEC_WOL_FLAG_ENABLE ? WAKE_MAGIC : 0; + } else { + wol->supported = wol->wolopts = 0; + } +} + +static int +fec_enet_set_wol(struct net_device *ndev, struct ethtool_wolinfo *wol) +{ + struct fec_enet_private *fep = netdev_priv(ndev); + + if (!(fep->wol_flag & FEC_WOL_HAS_MAGIC_PACKET)) + return -EINVAL; + + if (wol->wolopts & ~WAKE_MAGIC) + return -EINVAL; + + device_set_wakeup_enable(&ndev->dev, wol->wolopts & WAKE_MAGIC); + if (device_may_wakeup(&ndev->dev)) { + fep->wol_flag |= FEC_WOL_FLAG_ENABLE; + if (fep->irq[0] > 0) + enable_irq_wake(fep->irq[0]); + } else { + fep->wol_flag &= (~FEC_WOL_FLAG_ENABLE); + if (fep->irq[0] > 0) + disable_irq_wake(fep->irq[0]); + } + + return 0; +} + static const struct ethtool_ops fec_enet_ethtool_ops = { .get_settings = fec_enet_get_settings, .set_settings = fec_enet_set_settings, @@ -2445,6 +2502,8 @@ static const struct ethtool_ops fec_enet_ethtool_ops = { .get_ts_info = fec_enet_get_ts_info, .get_tunable = fec_enet_get_tunable, .set_tunable = fec_enet_set_tunable, + .get_wol = fec_enet_get_wol, + .set_wol = fec_enet_set_wol, }; static int fec_enet_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd) @@ -2705,6 +2764,9 @@ fec_enet_open(struct net_device *ndev) phy_start(fep->phy_dev); netif_tx_start_all_queues(ndev); + device_set_wakeup_enable(&ndev->dev, fep->wol_flag & + FEC_WOL_FLAG_ENABLE); + return 0; err_enet_mii_probe: @@ -3153,6 +3215,9 @@ fec_probe(struct platform_device *pdev) platform_set_drvdata(pdev, ndev); + if (of_get_property(np, "fsl,magic-packet", NULL)) + fep->wol_flag |= FEC_WOL_HAS_MAGIC_PACKET; + phy_node = of_parse_phandle(np, "phy-handle", 0); if (!phy_node && of_phy_is_fixed_link(np)) { ret = of_phy_register_fixed_link(np); @@ -3247,6 +3312,8 @@ fec_probe(struct platform_device *pdev) 0, pdev->name, ndev); if (ret) goto failed_irq; + + fep->irq[i] = irq; } init_completion(&fep->mdio_done); @@ -3263,6 +3330,9 @@ fec_probe(struct platform_device *pdev) if (ret) goto failed_register; + device_init_wakeup(&ndev->dev, fep->wol_flag & + FEC_WOL_HAS_MAGIC_PACKET); + if (fep->bufdesc_ex && fep->ptp_clock) netdev_info(ndev, "registered PHC device %d\n", fep->dev_id); @@ 
-3316,6 +3386,8 @@ static int __maybe_unused fec_suspend(struct device *dev) rtnl_lock(); if (netif_running(ndev)) { + if (fep->wol_flag & FEC_WOL_FLAG_ENABLE) + fep->wol_flag |= FEC_WOL_FLAG_SLEEP_ON; phy_stop(fep->phy_dev); napi_disable(&fep->napi); netif_tx_lock_bh(ndev); @@ -3323,11 +3395,12 @@ static int __maybe_unused fec_suspend(struct device *dev) netif_tx_unlock_bh(ndev); fec_stop(ndev); fec_enet_clk_enable(ndev, false); - pinctrl_pm_select_sleep_state(&fep->pdev->dev); + if (!(fep->wol_flag & FEC_WOL_FLAG_ENABLE)) + pinctrl_pm_select_sleep_state(&fep->pdev->dev); } rtnl_unlock(); - if (fep->reg_phy) + if (fep->reg_phy && !(fep->wol_flag & FEC_WOL_FLAG_ENABLE)) regulator_disable(fep->reg_phy); /* SOC supply clock to phy, when clock is disabled, phy link down @@ -3343,9 +3416,11 @@ static int __maybe_unused fec_resume(struct device *dev) { struct net_device *ndev = dev_get_drvdata(dev); struct fec_enet_private *fep = netdev_priv(ndev); + struct fec_platform_data *pdata = fep->pdev->dev.platform_data; int ret; + int val; - if (fep->reg_phy) { + if (fep->reg_phy && !(fep->wol_flag & FEC_WOL_FLAG_ENABLE)) { ret = regulator_enable(fep->reg_phy); if (ret) return ret; @@ -3353,12 +3428,21 @@ static int __maybe_unused fec_resume(struct device *dev) rtnl_lock(); if (netif_running(ndev)) { - pinctrl_pm_select_default_state(&fep->pdev->dev); ret = fec_enet_clk_enable(ndev, true); if (ret) { rtnl_unlock(); goto failed_clk; } + if (fep->wol_flag & FEC_WOL_FLAG_ENABLE) { + if (pdata && pdata->sleep_mode_enable) + pdata->sleep_mode_enable(false); + val = readl(fep->hwp + FEC_ECNTRL); + val &= ~(FEC_ECR_MAGICEN | FEC_ECR_SLEEP); + writel(val, fep->hwp + FEC_ECNTRL); + fep->wol_flag &= ~FEC_WOL_FLAG_SLEEP_ON; + } else { + pinctrl_pm_select_default_state(&fep->pdev->dev); + } fec_restart(ndev); netif_tx_lock_bh(ndev); netif_device_attach(ndev); diff --git a/include/linux/fec.h b/include/linux/fec.h index bcff455d1d53..1454a503622d 100644 --- a/include/linux/fec.h +++ b/include/linux/fec.h @@ -19,6 +19,7 @@ struct fec_platform_data { phy_interface_t phy; unsigned char mac[ETH_ALEN]; + void (*sleep_mode_enable)(int enabled); }; #endif -- cgit v1.2.3 From 9b174d88c257150562b0101fcc6cb6c3cb74275c Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 30 Dec 2014 19:10:15 -0800 Subject: net: Add Transparent Ethernet Bridging GRO support. Currently the only tunnel protocol that supports GRO with encapsulated Ethernet is VXLAN. This pulls out the Ethernet code into a proper layer so that it can be used by other tunnel protocols such as GRE and Geneve. Signed-off-by: Jesse Gross Signed-off-by: David S. 
Miller --- drivers/net/vxlan.c | 53 +++----------------------- include/linux/etherdevice.h | 4 ++ net/ethernet/eth.c | 92 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 47 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 7fbd89fbe107..2ab0922af0b4 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -549,10 +549,7 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff { struct sk_buff *p, **pp = NULL; struct vxlanhdr *vh, *vh2; - struct ethhdr *eh, *eh2; - unsigned int hlen, off_vx, off_eth; - const struct packet_offload *ptype; - __be16 type; + unsigned int hlen, off_vx; int flush = 1; off_vx = skb_gro_offset(skb); @@ -563,17 +560,6 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff if (unlikely(!vh)) goto out; } - skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ - skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); - - off_eth = skb_gro_offset(skb); - hlen = off_eth + sizeof(*eh); - eh = skb_gro_header_fast(skb, off_eth); - if (skb_gro_header_hard(skb, hlen)) { - eh = skb_gro_header_slow(skb, hlen, off_eth); - if (unlikely(!eh)) - goto out; - } flush = 0; @@ -582,28 +568,16 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff continue; vh2 = (struct vxlanhdr *)(p->data + off_vx); - eh2 = (struct ethhdr *)(p->data + off_eth); - if (vh->vx_vni != vh2->vx_vni || compare_ether_header(eh, eh2)) { + if (vh->vx_vni != vh2->vx_vni) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } - type = eh->h_proto; - - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (ptype == NULL) { - flush = 1; - goto out_unlock; - } - - skb_gro_pull(skb, sizeof(*eh)); /* pull inner eth header */ - skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); - pp = ptype->callbacks.gro_receive(head, skb); + skb_gro_pull(skb, sizeof(struct vxlanhdr)); + skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); + pp = eth_gro_receive(head, skb); -out_unlock: - rcu_read_unlock(); out: NAPI_GRO_CB(skb)->flush |= flush; @@ -612,24 +586,9 @@ out: static int vxlan_gro_complete(struct sk_buff *skb, int nhoff) { - struct ethhdr *eh; - struct packet_offload *ptype; - __be16 type; - int vxlan_len = sizeof(struct vxlanhdr) + sizeof(struct ethhdr); - int err = -ENOSYS; - udp_tunnel_gro_complete(skb, nhoff); - eh = (struct ethhdr *)(skb->data + nhoff + sizeof(struct vxlanhdr)); - type = eh->h_proto; - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype != NULL) - err = ptype->callbacks.gro_complete(skb, nhoff + vxlan_len); - - rcu_read_unlock(); - return err; + return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); } /* Notify netdevs that UDP port started listening */ diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 41c891d05f04..1d869d185a0d 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -52,6 +52,10 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, #define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1) #define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count) +struct sk_buff **eth_gro_receive(struct sk_buff **head, + struct sk_buff *skb); +int eth_gro_complete(struct sk_buff *skb, int nhoff); + /* Reserved Ethernet Addresses per IEEE 802.1Q */ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; diff --git 
a/net/ethernet/eth.c b/net/ethernet/eth.c index 33a140e15834..238f38d21641 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -424,3 +424,95 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr); } EXPORT_SYMBOL(sysfs_format_mac); + +struct sk_buff **eth_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + struct sk_buff *p, **pp = NULL; + struct ethhdr *eh, *eh2; + unsigned int hlen, off_eth; + const struct packet_offload *ptype; + __be16 type; + int flush = 1; + + off_eth = skb_gro_offset(skb); + hlen = off_eth + sizeof(*eh); + eh = skb_gro_header_fast(skb, off_eth); + if (skb_gro_header_hard(skb, hlen)) { + eh = skb_gro_header_slow(skb, hlen, off_eth); + if (unlikely(!eh)) + goto out; + } + + flush = 0; + + for (p = *head; p; p = p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + eh2 = (struct ethhdr *)(p->data + off_eth); + if (compare_ether_header(eh, eh2)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + type = eh->h_proto; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (ptype == NULL) { + flush = 1; + goto out_unlock; + } + + skb_gro_pull(skb, sizeof(*eh)); + skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} +EXPORT_SYMBOL(eth_gro_receive); + +int eth_gro_complete(struct sk_buff *skb, int nhoff) +{ + struct ethhdr *eh = (struct ethhdr *)(skb->data + nhoff); + __be16 type = eh->h_proto; + struct packet_offload *ptype; + int err = -ENOSYS; + + if (skb->encapsulation) + skb_set_inner_mac_header(skb, nhoff); + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype != NULL) + err = ptype->callbacks.gro_complete(skb, nhoff + + sizeof(struct ethhdr)); + + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(eth_gro_complete); + +static struct packet_offload eth_packet_offload __read_mostly = { + .type = cpu_to_be16(ETH_P_TEB), + .callbacks = { + .gro_receive = eth_gro_receive, + .gro_complete = eth_gro_complete, + }, +}; + +static int __init eth_offload_init(void) +{ + dev_add_offload(ð_packet_offload); + + return 0; +} + +fs_initcall(eth_offload_init); -- cgit v1.2.3 From 1891172aa5c32f08ad9931b794edd71e91a4a527 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 2 Jan 2015 20:22:03 +0100 Subject: timecounter: provide a macro to initialize the cyclecounter mask field. There is no need for users of the timecounter/cyclecounter code to include clocksource.h just for a single macro. Signed-off-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/timecounter.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/timecounter.h b/include/linux/timecounter.h index 74f45496e6d1..4382035a75bb 100644 --- a/include/linux/timecounter.h +++ b/include/linux/timecounter.h @@ -19,6 +19,9 @@ #include +/* simplify initialization of mask field */ +#define CYCLECOUNTER_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1) + /** * struct cyclecounter - hardware abstraction for a free running counter * Provides completely state-free accessors to the underlying hardware. 
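CYCLECOUNTER_MASK() is meant for filling in the mask field when a driver declares its counter. A minimal sketch for a hypothetical 48-bit counter running at 250 MHz; sketch_hw_read_counter() is an assumed hardware accessor, not a real API:

#include <linux/timecounter.h>

extern u64 sketch_hw_read_counter(void);	/* assumed hardware read */

static cycle_t sketch_cc_read(const struct cyclecounter *cc)
{
	return sketch_hw_read_counter();
}

static const struct cyclecounter sketch_cc = {
	.read	= sketch_cc_read,
	.mask	= CYCLECOUNTER_MASK(48),	/* counter wraps at 2^48 */
	.mult	= 8,	/* ns = (cycles * mult) >> shift = 4 ns per cycle */
	.shift	= 1,
};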
@@ -29,7 +32,7 @@ * @read: returns the current cycle value * @mask: bitmask for two's complement * subtraction of non 64 bit counters, - * see CLOCKSOURCE_MASK() helper macro + * see CYCLECOUNTER_MASK() helper macro * @mult: cycle to nanosecond multiplier * @shift: cycle to nanosecond divisor (power of two) */ -- cgit v1.2.3 From 8d24c0b43125ec26cc80e04588477a9a2afc025c Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:14 +0100 Subject: rhashtable: Do hashing inside of rhashtable_lookup_compare() Hash the key inside of rhashtable_lookup_compare(), as rhashtable_lookup() does. This makes it possible to simplify the hashing functions and keep them private. Signed-off-by: Thomas Graf Cc: netfilter-devel@vger.kernel.org Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 5 +-- lib/rhashtable.c | 91 +++++++++++++++------------------------------- net/netfilter/nft_hash.c | 46 ++++++++++++++--------- net/netlink/af_netlink.c | 5 +-- 4 files changed, 61 insertions(+), 86 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index b93fd89b2e5e..1b51221c6bbd 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -96,9 +96,6 @@ static inline int lockdep_rht_mutex_is_held(const struct rhashtable *ht) int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params); -u32 rhashtable_hashfn(const struct rhashtable *ht, const void *key, u32 len); -u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr); - void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node); bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node); void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj, @@ -111,7 +108,7 @@ int rhashtable_expand(struct rhashtable *ht); int rhashtable_shrink(struct rhashtable *ht); void *rhashtable_lookup(const struct rhashtable *ht, const void *key); -void *rhashtable_lookup_compare(const struct rhashtable *ht, u32 hash, +void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg); void rhashtable_destroy(const struct rhashtable *ht); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 6c3c723e902b..1ee0eb636ca3 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -42,69 +42,39 @@ static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) return (void *) he - ht->p.head_offset; } -static u32 __hashfn(const struct rhashtable *ht, const void *key, - u32 len, u32 hsize) +static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash) { - u32 h; - - h = ht->p.hashfn(key, len, ht->p.hash_rnd); - - return h & (hsize - 1); -} - -/** - * rhashtable_hashfn - compute hash for key of given length - * @ht: hash table to compute for - * @key: pointer to key - * @len: length of key - * - * Computes the hash value using the hash function provided in the 'hashfn' - * of struct rhashtable_params. The returned value is guaranteed to be - * smaller than the number of buckets in the hash table.
- */ -u32 rhashtable_hashfn(const struct rhashtable *ht, const void *key, u32 len) -{ - struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); - - return __hashfn(ht, key, len, tbl->size); + return hash & (tbl->size - 1); } -EXPORT_SYMBOL_GPL(rhashtable_hashfn); -static u32 obj_hashfn(const struct rhashtable *ht, const void *ptr, u32 hsize) +static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr) { - if (unlikely(!ht->p.key_len)) { - u32 h; - - h = ht->p.obj_hashfn(ptr, ht->p.hash_rnd); + u32 hash; - return h & (hsize - 1); - } + if (unlikely(!ht->p.key_len)) + hash = ht->p.obj_hashfn(ptr, ht->p.hash_rnd); + else + hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len, + ht->p.hash_rnd); - return __hashfn(ht, ptr + ht->p.key_offset, ht->p.key_len, hsize); + return hash; } -/** - * rhashtable_obj_hashfn - compute hash for hashed object - * @ht: hash table to compute for - * @ptr: pointer to hashed object - * - * Computes the hash value using the hash function `hashfn` respectively - * 'obj_hashfn' depending on whether the hash table is set up to work with - * a fixed length key. The returned value is guaranteed to be smaller than - * the number of buckets in the hash table. - */ -u32 rhashtable_obj_hashfn(const struct rhashtable *ht, void *ptr) +static u32 key_hashfn(const struct rhashtable *ht, const void *key, u32 len) { struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); + u32 hash; + + hash = ht->p.hashfn(key, len, ht->p.hash_rnd); - return obj_hashfn(ht, ptr, tbl->size); + return rht_bucket_index(tbl, hash); } -EXPORT_SYMBOL_GPL(rhashtable_obj_hashfn); static u32 head_hashfn(const struct rhashtable *ht, - const struct rhash_head *he, u32 hsize) + const struct bucket_table *tbl, + const struct rhash_head *he) { - return obj_hashfn(ht, rht_obj(ht, he), hsize); + return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he))); } static struct bucket_table *bucket_table_alloc(size_t nbuckets) @@ -170,9 +140,9 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, * reaches a node that doesn't hash to the same bucket as the * previous node p. Call the previous node p; */ - h = head_hashfn(ht, p, new_tbl->size); + h = head_hashfn(ht, new_tbl, p); rht_for_each(he, p->next, ht) { - if (head_hashfn(ht, he, new_tbl->size) != h) + if (head_hashfn(ht, new_tbl, he) != h) break; p = he; } @@ -184,7 +154,7 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, next = NULL; if (he) { rht_for_each(he, he->next, ht) { - if (head_hashfn(ht, he, new_tbl->size) == h) { + if (head_hashfn(ht, new_tbl, he) == h) { next = he; break; } @@ -237,9 +207,9 @@ int rhashtable_expand(struct rhashtable *ht) * single imprecise chain. 
*/ for (i = 0; i < new_tbl->size; i++) { - h = i & (old_tbl->size - 1); + h = rht_bucket_index(old_tbl, i); rht_for_each(he, old_tbl->buckets[h], ht) { - if (head_hashfn(ht, he, new_tbl->size) == i) { + if (head_hashfn(ht, new_tbl, he) == i) { RCU_INIT_POINTER(new_tbl->buckets[i], he); break; } @@ -353,7 +323,7 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) ASSERT_RHT_MUTEX(ht); - hash = head_hashfn(ht, obj, tbl->size); + hash = head_hashfn(ht, tbl, obj); RCU_INIT_POINTER(obj->next, tbl->buckets[hash]); rcu_assign_pointer(tbl->buckets[hash], obj); ht->nelems++; @@ -413,7 +383,7 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) ASSERT_RHT_MUTEX(ht); - h = head_hashfn(ht, obj, tbl->size); + h = head_hashfn(ht, tbl, obj); pprev = &tbl->buckets[h]; rht_for_each(he, tbl->buckets[h], ht) { @@ -452,7 +422,7 @@ void *rhashtable_lookup(const struct rhashtable *ht, const void *key) BUG_ON(!ht->p.key_len); - h = __hashfn(ht, key, ht->p.key_len, tbl->size); + h = key_hashfn(ht, key, ht->p.key_len); rht_for_each_rcu(he, tbl->buckets[h], ht) { if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key, ht->p.key_len)) @@ -467,7 +437,7 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup); /** * rhashtable_lookup_compare - search hash table with compare function * @ht: hash table - * @hash: hash value of desired entry + * @key: the pointer to the key * @compare: compare function, must return true on match * @arg: argument passed on to compare function * @@ -479,15 +449,14 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup); * * Returns the first entry on which the compare function returned true. */ -void *rhashtable_lookup_compare(const struct rhashtable *ht, u32 hash, +void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg) { const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); struct rhash_head *he; + u32 hash; - if (unlikely(hash >= tbl->size)) - return NULL; - + hash = key_hashfn(ht, key, ht->p.key_len); rht_for_each_rcu(he, tbl->buckets[hash], ht) { if (!compare(rht_obj(ht, he), arg)) continue; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 1e316ce4cb5d..614ee099ba36 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -94,28 +94,40 @@ static void nft_hash_remove(const struct nft_set *set, kfree(he); } +struct nft_compare_arg { + const struct nft_set *set; + struct nft_set_elem *elem; +}; + +static bool nft_hash_compare(void *ptr, void *arg) +{ + struct nft_hash_elem *he = ptr; + struct nft_compare_arg *x = arg; + + if (!nft_data_cmp(&he->key, &x->elem->key, x->set->klen)) { + x->elem->cookie = &he->node; + x->elem->flags = 0; + if (x->set->flags & NFT_SET_MAP) + nft_data_copy(&x->elem->data, he->data); + + return true; + } + + return false; +} + static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem) { const struct rhashtable *priv = nft_set_priv(set); - const struct bucket_table *tbl = rht_dereference_rcu(priv->tbl, priv); - struct rhash_head __rcu * const *pprev; - struct nft_hash_elem *he; - u32 h; - - h = rhashtable_hashfn(priv, &elem->key, set->klen); - pprev = &tbl->buckets[h]; - rht_for_each_entry_rcu(he, tbl->buckets[h], node) { - if (nft_data_cmp(&he->key, &elem->key, set->klen)) { - pprev = &he->node.next; - continue; - } + struct nft_compare_arg arg = { + .set = set, + .elem = elem, + }; - elem->cookie = (void *)pprev; - elem->flags = 0; - if (set->flags & NFT_SET_MAP) - nft_data_copy(&elem->data, he->data); + if 
(rhashtable_lookup_compare(priv, &elem->key, + &nft_hash_compare, &arg)) return 0; + return -ENOENT; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 84ea76ca3f1f..a5d7ed627563 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1002,11 +1002,8 @@ static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid, .net = net, .portid = portid, }; - u32 hash; - hash = rhashtable_hashfn(&table->hash, &portid, sizeof(portid)); - - return rhashtable_lookup_compare(&table->hash, hash, + return rhashtable_lookup_compare(&table->hash, &portid, &netlink_compare, &arg); } -- cgit v1.2.3 From 88d6ed15acff1cb44b1d1f3c0a393b7f7744957a Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:16 +0100 Subject: rhashtable: Convert bucket iterators to take table and index This patch is in preparation for introducing per-bucket spinlocks. It extends all iterator macros to take the bucket table and bucket index. It also introduces a new rht_dereference_bucket() to handle protected accesses to buckets. It introduces a barrier() in the RCU iterators to prevent the compiler from caching the first element. The lockdep verifier is introduced as a stub which always succeeds; it is properly implemented in the next patch, when the locks are introduced. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 173 ++++++++++++++++++++++++++++++--------------- lib/rhashtable.c | 30 +++++--- net/netfilter/nft_hash.c | 12 ++-- net/netlink/af_netlink.c | 12 ++-- net/netlink/diag.c | 4 +- 5 files changed, 152 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 1b51221c6bbd..b54e24a08806 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -87,11 +87,18 @@ struct rhashtable { #ifdef CONFIG_PROVE_LOCKING int lockdep_rht_mutex_is_held(const struct rhashtable *ht); +int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); #else static inline int lockdep_rht_mutex_is_held(const struct rhashtable *ht) { return 1; } + +static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, + u32 hash) +{ + return 1; +} #endif /* CONFIG_PROVE_LOCKING */ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params); @@ -119,92 +126,144 @@ void rhashtable_destroy(const struct rhashtable *ht); #define rht_dereference_rcu(p, ht) \ rcu_dereference_check(p, lockdep_rht_mutex_is_held(ht)) -#define rht_entry(ptr, type, member) container_of(ptr, type, member) -#define rht_entry_safe(ptr, type, member) \ -({ \ - typeof(ptr) __ptr = (ptr); \ - __ptr ? rht_entry(__ptr, type, member) : NULL; \ -}) +#define rht_dereference_bucket(p, tbl, hash) \ + rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash)) -#define rht_next_entry_safe(pos, ht, member) \ -({ \ - pos ? rht_entry_safe(rht_dereference((pos)->member.next, ht), \ - typeof(*(pos)), member) : NULL; \ -}) +#define rht_dereference_bucket_rcu(p, tbl, hash) \ + rcu_dereference_check(p, lockdep_rht_bucket_is_held(tbl, hash)) + +#define rht_entry(tpos, pos, member) \ + ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) /** - * rht_for_each - iterate over hash chain - * @pos: &struct rhash_head to use as a loop cursor. - * @head: head of the hash chain (struct rhash_head *) - * @ht: pointer to your struct rhashtable + * rht_for_each_continue - continue iterating over hash chain + * @pos: the &struct rhash_head to use as a loop cursor.
+ * @head: the previous &struct rhash_head to continue from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index */ -#define rht_for_each(pos, head, ht) \ - for (pos = rht_dereference(head, ht); \ +#define rht_for_each_continue(pos, head, tbl, hash) \ + for (pos = rht_dereference_bucket(head, tbl, hash); \ pos; \ - pos = rht_dereference((pos)->next, ht)) + pos = rht_dereference_bucket((pos)->next, tbl, hash)) + +/** + * rht_for_each - iterate over hash chain + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + */ +#define rht_for_each(pos, tbl, hash) \ + rht_for_each_continue(pos, (tbl)->buckets[hash], tbl, hash) + +/** + * rht_for_each_entry_continue - continue iterating over hash chain + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the previous &struct rhash_head to continue from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + */ +#define rht_for_each_entry_continue(tpos, pos, head, tbl, hash, member) \ + for (pos = rht_dereference_bucket(head, tbl, hash); \ + pos && rht_entry(tpos, pos, member); \ + pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each_entry - iterate over hash chain of given type - * @pos: type * to use as a loop cursor. - * @head: head of the hash chain (struct rhash_head *) - * @ht: pointer to your struct rhashtable - * @member: name of the rhash_head within the hashable struct. + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. */ -#define rht_for_each_entry(pos, head, ht, member) \ - for (pos = rht_entry_safe(rht_dereference(head, ht), \ - typeof(*(pos)), member); \ - pos; \ - pos = rht_next_entry_safe(pos, ht, member)) +#define rht_for_each_entry(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_continue(tpos, pos, (tbl)->buckets[hash], \ + tbl, hash, member) /** * rht_for_each_entry_safe - safely iterate over hash chain of given type - * @pos: type * to use as a loop cursor. - * @n: type * to use for temporary next object storage - * @head: head of the hash chain (struct rhash_head *) - * @ht: pointer to your struct rhashtable - * @member: name of the rhash_head within the hashable struct. + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @next: the &struct rhash_head to use as next in loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive allows for the looped code to * remove the loop cursor from the list. */ -#define rht_for_each_entry_safe(pos, n, head, ht, member) \ - for (pos = rht_entry_safe(rht_dereference(head, ht), \ - typeof(*(pos)), member), \ - n = rht_next_entry_safe(pos, ht, member); \ - pos; \ - pos = n, \ - n = rht_next_entry_safe(pos, ht, member)) +#define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ + for (pos = rht_dereference_bucket((tbl)->buckets[hash], tbl, hash), \ + next = pos ? 
rht_dereference_bucket(pos->next, tbl, hash) \ + : NULL; \ + pos && rht_entry(tpos, pos, member); \ + pos = next) + +/** + * rht_for_each_rcu_continue - continue iterating over rcu hash chain + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the previous &struct rhash_head to continue from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_rcu_continue(pos, head, tbl, hash) \ + for (({barrier(); }), \ + pos = rht_dereference_bucket_rcu(head, tbl, hash); \ + pos; \ + pos = rcu_dereference_raw(pos->next)) /** * rht_for_each_rcu - iterate over rcu hash chain - * @pos: &struct rhash_head to use as a loop cursor. - * @head: head of the hash chain (struct rhash_head *) - * @ht: pointer to your struct rhashtable + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with - * the _rcu fkht mutation primitives such as rht_insert() as long as the + * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ -#define rht_for_each_rcu(pos, head, ht) \ - for (pos = rht_dereference_rcu(head, ht); \ - pos; \ - pos = rht_dereference_rcu((pos)->next, ht)) +#define rht_for_each_rcu(pos, tbl, hash) \ + rht_for_each_rcu_continue(pos, (tbl)->buckets[hash], tbl, hash) + +/** + * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @head: the previous &struct rhash_head to continue from + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. + * + * This hash chain list-traversal primitive may safely run concurrently with + * the _rcu mutation primitives such as rhashtable_insert() as long as the + * traversal is guarded by rcu_read_lock(). + */ +#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \ + for (({barrier(); }), \ + pos = rht_dereference_bucket_rcu(head, tbl, hash); \ + pos && rht_entry(tpos, pos, member); \ + pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) /** * rht_for_each_entry_rcu - iterate over rcu hash chain of given type - * @pos: type * to use as a loop cursor. - * @head: head of the hash chain (struct rhash_head *) - * @member: name of the rhash_head within the hashable struct. + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rhash_head to use as a loop cursor. + * @tbl: the &struct bucket_table + * @hash: the hash value / bucket index + * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with - * the _rcu fkht mutation primitives such as rht_insert() as long as the + * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). 
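 *
 * Editorial sketch (not part of this patch): under the new signature a
 * caller supplies both an entry cursor and a plain &struct rhash_head
 * cursor. Assuming the test harness types from lib/rhashtable.c (struct
 * test_obj with an embedded 'node' member) and a tbl/hash pair obtained
 * by the caller, a bucket walk might look like:
 *
 *	struct test_obj *obj;
 *	struct rhash_head *pos;
 *
 *	rcu_read_lock();
 *	rht_for_each_entry_rcu(obj, pos, tbl, hash, node)
 *		pr_info("value=%d\n", obj->value);
 *	rcu_read_unlock();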
*/ -#define rht_for_each_entry_rcu(pos, head, member) \ - for (pos = rht_entry_safe(rcu_dereference_raw(head), \ - typeof(*(pos)), member); \ - pos; \ - pos = rht_entry_safe(rcu_dereference_raw((pos)->member.next), \ - typeof(*(pos)), member)) +#define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ + rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\ + tbl, hash, member) #endif /* _LINUX_RHASHTABLE_H */ diff --git a/lib/rhashtable.c b/lib/rhashtable.c index b658245826a1..ce450d095fdf 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -35,6 +35,12 @@ int lockdep_rht_mutex_is_held(const struct rhashtable *ht) return ht->p.mutex_is_held(ht->p.parent); } EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); + +int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) +{ + return 1; +} +EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #endif static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) @@ -141,7 +147,7 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, * previous node p. Call the previous node p; */ h = head_hashfn(ht, new_tbl, p); - rht_for_each(he, p->next, ht) { + rht_for_each_continue(he, p->next, old_tbl, n) { if (head_hashfn(ht, new_tbl, he) != h) break; p = he; @@ -153,7 +159,7 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, */ next = NULL; if (he) { - rht_for_each(he, he->next, ht) { + rht_for_each_continue(he, he->next, old_tbl, n) { if (head_hashfn(ht, new_tbl, he) == h) { next = he; break; @@ -208,7 +214,7 @@ int rhashtable_expand(struct rhashtable *ht) */ for (i = 0; i < new_tbl->size; i++) { h = rht_bucket_index(old_tbl, i); - rht_for_each(he, old_tbl->buckets[h], ht) { + rht_for_each(he, old_tbl, h) { if (head_hashfn(ht, new_tbl, he) == i) { RCU_INIT_POINTER(new_tbl->buckets[i], he); break; @@ -286,7 +292,7 @@ int rhashtable_shrink(struct rhashtable *ht) * to the new bucket. 
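 *
 * (Editorial worked example: halving the table makes old buckets i and
 * i + ntbl->size both map to new bucket i, so with ntbl->size == 4 the
 * chains of old buckets 1 and 5 are merged, bucket 5's chain being
 * appended to the tail of bucket 1's.)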
*/ for (pprev = &ntbl->buckets[i]; *pprev != NULL; - pprev = &rht_dereference(*pprev, ht)->next) + pprev = &rht_dereference_bucket(*pprev, ntbl, i)->next) ; RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]); } @@ -386,7 +392,7 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) h = head_hashfn(ht, tbl, obj); pprev = &tbl->buckets[h]; - rht_for_each(he, tbl->buckets[h], ht) { + rht_for_each(he, tbl, h) { if (he != obj) { pprev = &he->next; continue; @@ -423,7 +429,7 @@ void *rhashtable_lookup(const struct rhashtable *ht, const void *key) BUG_ON(!ht->p.key_len); h = key_hashfn(ht, key, ht->p.key_len); - rht_for_each_rcu(he, tbl->buckets[h], ht) { + rht_for_each_rcu(he, tbl, h) { if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key, ht->p.key_len)) continue; @@ -457,7 +463,7 @@ void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key, u32 hash; hash = key_hashfn(ht, key, ht->p.key_len); - rht_for_each_rcu(he, tbl->buckets[hash], ht) { + rht_for_each_rcu(he, tbl, hash) { if (!compare(rht_obj(ht, he), arg)) continue; return rht_obj(ht, he); @@ -625,6 +631,7 @@ static int __init test_rht_lookup(struct rhashtable *ht) static void test_bucket_stats(struct rhashtable *ht, bool quiet) { unsigned int cnt, rcu_cnt, i, total = 0; + struct rhash_head *pos; struct test_obj *obj; struct bucket_table *tbl; @@ -635,14 +642,14 @@ static void test_bucket_stats(struct rhashtable *ht, bool quiet) if (!quiet) pr_info(" [%#4x/%zu]", i, tbl->size); - rht_for_each_entry_rcu(obj, tbl->buckets[i], node) { + rht_for_each_entry_rcu(obj, pos, tbl, i, node) { cnt++; total++; if (!quiet) pr_cont(" [%p],", obj); } - rht_for_each_entry_rcu(obj, tbl->buckets[i], node) + rht_for_each_entry_rcu(obj, pos, tbl, i, node) rcu_cnt++; if (rcu_cnt != cnt) @@ -664,7 +671,8 @@ static void test_bucket_stats(struct rhashtable *ht, bool quiet) static int __init test_rhashtable(struct rhashtable *ht) { struct bucket_table *tbl; - struct test_obj *obj, *next; + struct test_obj *obj; + struct rhash_head *pos, *next; int err; unsigned int i; @@ -733,7 +741,7 @@ static int __init test_rhashtable(struct rhashtable *ht) error: tbl = rht_dereference_rcu(ht->tbl, ht); for (i = 0; i < tbl->size; i++) - rht_for_each_entry_safe(obj, next, tbl->buckets[i], ht, node) + rht_for_each_entry_safe(obj, pos, next, tbl, i, node) kfree(obj); return err; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 614ee099ba36..d93f1f4c22a9 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -142,7 +142,9 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, tbl = rht_dereference_rcu(priv->tbl, priv); for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_rcu(he, tbl->buckets[i], node) { + struct rhash_head *pos; + + rht_for_each_entry_rcu(he, pos, tbl, i, node) { if (iter->count < iter->skip) goto cont; @@ -197,15 +199,13 @@ static void nft_hash_destroy(const struct nft_set *set) { const struct rhashtable *priv = nft_set_priv(set); const struct bucket_table *tbl = priv->tbl; - struct nft_hash_elem *he, *next; + struct nft_hash_elem *he; + struct rhash_head *pos, *next; unsigned int i; for (i = 0; i < tbl->size; i++) { - for (he = rht_entry(tbl->buckets[i], struct nft_hash_elem, node); - he != NULL; he = next) { - next = rht_entry(he->node.next, struct nft_hash_elem, node); + rht_for_each_entry_safe(he, pos, next, tbl, i, node) nft_hash_elem_destroy(set, he); - } } rhashtable_destroy(priv); } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c 
index a5d7ed627563..57449b6089c2 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2898,7 +2898,9 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); for (j = 0; j < tbl->size; j++) { - rht_for_each_entry_rcu(nlk, tbl->buckets[j], node) { + struct rhash_head *node; + + rht_for_each_entry_rcu(nlk, node, tbl, j, node) { s = (struct sock *)nlk; if (sock_net(s) != seq_file_net(seq)) @@ -2926,6 +2928,8 @@ static void *netlink_seq_start(struct seq_file *seq, loff_t *pos) static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct rhashtable *ht; + const struct bucket_table *tbl; + struct rhash_head *node; struct netlink_sock *nlk; struct nl_seq_iter *iter; struct net *net; @@ -2942,17 +2946,17 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) i = iter->link; ht = &nl_table[i].hash; - rht_for_each_entry(nlk, nlk->node.next, ht, node) + tbl = rht_dereference_rcu(ht->tbl, ht); + rht_for_each_entry_rcu_continue(nlk, node, nlk->node.next, tbl, iter->hash_idx, node) if (net_eq(sock_net((struct sock *)nlk), net)) return nlk; j = iter->hash_idx + 1; do { - const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); for (; j < tbl->size; j++) { - rht_for_each_entry(nlk, tbl->buckets[j], ht, node) { + rht_for_each_entry_rcu(nlk, node, tbl, j, node) { if (net_eq(sock_net((struct sock *)nlk), net)) { iter->link = i; iter->hash_idx = j; diff --git a/net/netlink/diag.c b/net/netlink/diag.c index de8c74a3c061..fcca36d81a62 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -113,7 +113,9 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, req = nlmsg_data(cb->nlh); for (i = 0; i < htbl->size; i++) { - rht_for_each_entry(nlsk, htbl->buckets[i], ht, node) { + struct rhash_head *pos; + + rht_for_each_entry(nlsk, pos, htbl, i, node) { sk = (struct sock *)nlsk; if (!net_eq(sock_net(sk), net)) -- cgit v1.2.3 From 897362e446436d245972e72c6bc5b33bd7a5c659 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:18 +0100 Subject: nft_hash: Remove rhashtable_remove_pprev() The removal function of nft_hash currently stores a reference to the previous element during lookup which is used to optimize removal later on. This was possible because a lock is held throughout calling rhashtable_lookup() and rhashtable_remove(). With the introduction of deferred table resizing in parallel with lookups and insertions, the nftables lock will no longer synchronize all table mutations and the stored pprev may become invalid. Removing this optimization makes removal slightly more expensive on average but allows taking the resize cost out of the insert and remove path. Signed-off-by: Thomas Graf Cc: netfilter-devel@vger.kernel.org Signed-off-by: David S.
Miller --- include/linux/rhashtable.h | 2 -- lib/rhashtable.c | 34 +++++++--------------------------- net/netfilter/nft_hash.c | 11 +++-------- 3 files changed, 10 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index b54e24a08806..f624d4b5045f 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -105,8 +105,6 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params); void rhashtable_insert(struct rhashtable *ht, struct rhash_head *node); bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *node); -void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj, - struct rhash_head __rcu **pprev); bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size); bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 0bd29c178910..e6b85c4a5828 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -344,32 +344,6 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) } EXPORT_SYMBOL_GPL(rhashtable_insert); -/** - * rhashtable_remove_pprev - remove object from hash table given previous element - * @ht: hash table - * @obj: pointer to hash head inside object - * @pprev: pointer to previous element - * - * Identical to rhashtable_remove() but caller is alreayd aware of the element - * in front of the element to be deleted. This is in particular useful for - * deletion when combined with walking or lookup. - */ -void rhashtable_remove_pprev(struct rhashtable *ht, struct rhash_head *obj, - struct rhash_head __rcu **pprev) -{ - struct bucket_table *tbl = rht_dereference(ht->tbl, ht); - - ASSERT_RHT_MUTEX(ht); - - RCU_INIT_POINTER(*pprev, obj->next); - ht->nelems--; - - if (ht->p.shrink_decision && - ht->p.shrink_decision(ht, tbl->size)) - rhashtable_shrink(ht); -} -EXPORT_SYMBOL_GPL(rhashtable_remove_pprev); - /** * rhashtable_remove - remove object from hash table * @ht: hash table @@ -403,7 +377,13 @@ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) continue; } - rhashtable_remove_pprev(ht, he, pprev); + RCU_INIT_POINTER(*pprev, he->next); + ht->nelems--; + + if (ht->p.shrink_decision && + ht->p.shrink_decision(ht, tbl->size)) + rhashtable_shrink(ht); + return true; } diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index d93f1f4c22a9..7f903cf9a1b9 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -83,15 +83,10 @@ static void nft_hash_remove(const struct nft_set *set, const struct nft_set_elem *elem) { struct rhashtable *priv = nft_set_priv(set); - struct rhash_head *he, __rcu **pprev; - - pprev = elem->cookie; - he = rht_dereference((*pprev), priv); - - rhashtable_remove_pprev(priv, he, pprev); + rhashtable_remove(priv, elem->cookie); synchronize_rcu(); - kfree(he); + kfree(elem->cookie); } struct nft_compare_arg { @@ -105,7 +100,7 @@ static bool nft_hash_compare(void *ptr, void *arg) struct nft_compare_arg *x = arg; if (!nft_data_cmp(&he->key, &x->elem->key, x->set->klen)) { - x->elem->cookie = &he->node; + x->elem->cookie = he; x->elem->flags = 0; if (x->set->flags & NFT_SET_MAP) nft_data_copy(&x->elem->data, he->data); -- cgit v1.2.3 From 113948d841e8d78039e5dbbb5248f5b73e99eafa Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:19 +0100 Subject: spinlock: Add spin_lock_bh_nested() Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/linux/spinlock.h | 8 ++++++++ include/linux/spinlock_api_smp.h | 2 ++ include/linux/spinlock_api_up.h | 1 + kernel/locking/spinlock.c | 8 ++++++++ 4 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 262ba4ef9a8e..3e18379dfa6f 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -190,6 +190,8 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) #ifdef CONFIG_DEBUG_LOCK_ALLOC # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock_nested(lock, subclass) +# define raw_spin_lock_bh_nested(lock, subclass) \ + _raw_spin_lock_bh_nested(lock, subclass) # define raw_spin_lock_nest_lock(lock, nest_lock) \ do { \ @@ -205,6 +207,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) # define raw_spin_lock_nested(lock, subclass) \ _raw_spin_lock(((void)(subclass), (lock))) # define raw_spin_lock_nest_lock(lock, nest_lock) _raw_spin_lock(lock) +# define raw_spin_lock_bh_nested(lock, subclass) _raw_spin_lock_bh(lock) #endif #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) @@ -324,6 +327,11 @@ do { \ raw_spin_lock_nested(spinlock_check(lock), subclass); \ } while (0) +#define spin_lock_bh_nested(lock, subclass) \ +do { \ + raw_spin_lock_bh_nested(spinlock_check(lock), subclass);\ +} while (0) + #define spin_lock_nest_lock(lock, nest_lock) \ do { \ raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 42dfab89e740..5344268e6e62 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -22,6 +22,8 @@ int in_lock_functions(unsigned long addr); void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) __acquires(lock); void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) __acquires(lock); +void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) + __acquires(lock); void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, struct lockdep_map *map) __acquires(lock); diff --git a/include/linux/spinlock_api_up.h b/include/linux/spinlock_api_up.h index d0d188861ad6..d3afef9d8dbe 100644 --- a/include/linux/spinlock_api_up.h +++ b/include/linux/spinlock_api_up.h @@ -57,6 +57,7 @@ #define _raw_spin_lock(lock) __LOCK(lock) #define _raw_spin_lock_nested(lock, subclass) __LOCK(lock) +#define _raw_spin_lock_bh_nested(lock, subclass) __LOCK(lock) #define _raw_read_lock(lock) __LOCK(lock) #define _raw_write_lock(lock) __LOCK(lock) #define _raw_spin_lock_bh(lock) __LOCK_BH(lock) diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 4b082b5cac9e..db3ccb1dd614 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -363,6 +363,14 @@ void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) } EXPORT_SYMBOL(_raw_spin_lock_nested); +void __lockfunc _raw_spin_lock_bh_nested(raw_spinlock_t *lock, int subclass) +{ + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_bh_nested); + unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, int subclass) { -- cgit v1.2.3 From 97defe1ecf868b8127f8e62395499d6a06e4c4b1 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:20 +0100 Subject: rhashtable: Per bucket locks & deferred expansion/shrinking 
Introduces an array of spinlocks to protect bucket mutations. The number of spinlocks per CPU is configurable and selected based on the hash of the bucket. This allows for parallel insertions and removals of entries which do not share a lock. The patch also defers expansion and shrinking to a worker queue which allows insertion and removal from atomic context. Insertions and deletions may occur in parallel with it and are only held up briefly while the particular bucket is linked or unzipped. Mutations of the bucket table pointer are protected by a new mutex; read access is RCU protected. In the event of an expansion or shrinking, the new bucket table allocated is exposed as a so-called future table as soon as the resize process starts. Lookups, deletions, and insertions will briefly use both tables. The future table becomes the main table after an RCU grace period and initial linking of the old to the new table has been performed. Optimization of the chains to make use of the new number of buckets follows only once the new table is in use. The side effect of this is that during that RCU grace period, a bucket traversal using any rht_for_each() variant on the main table will not see any insertions performed during the RCU grace period which would at that point land in the future table. The lookup will see them as it searches both tables if needed. Having multiple insertions and removals occur in parallel requires nelems to become an atomic counter. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 37 ++-- lib/rhashtable.c | 458 ++++++++++++++++++++++++++++++++++----------- net/netfilter/nft_hash.c | 27 ++- net/netlink/af_netlink.c | 15 +- 4 files changed, 384 insertions(+), 153 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index f624d4b5045f..a1688f0a6193 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -19,6 +19,7 @@ #define _LINUX_RHASHTABLE_H #include +#include struct rhash_head { struct rhash_head __rcu *next; @@ -26,8 +27,17 @@ struct rhash_head { #define INIT_HASH_HEAD(ptr) ((ptr)->next = NULL) +/** + * struct bucket_table - Table of hash buckets + * @size: Number of hash buckets + * @locks_mask: Mask to apply before accessing locks[] + * @locks: Array of spinlocks protecting individual buckets + * @buckets: size * hash buckets + */ struct bucket_table { size_t size; + unsigned int locks_mask; + spinlock_t *locks; struct rhash_head __rcu *buckets[]; }; @@ -45,11 +55,11 @@ struct rhashtable; * @hash_rnd: Seed to use while hashing * @max_shift: Maximum number of shifts while expanding * @min_shift: Minimum number of shifts while shrinking + * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) * @hashfn: Function to hash key * @obj_hashfn: Function to hash object * @grow_decision: If defined, may return true if table should expand * @shrink_decision: If defined, may return true if table should shrink - * @mutex_is_held: Must return true if protecting mutex is held */ struct rhashtable_params { size_t nelem_hint; @@ -59,37 +69,42 @@ struct rhashtable_params { u32 hash_rnd; size_t max_shift; size_t min_shift; + size_t locks_mul; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; bool (*grow_decision)(const struct rhashtable *ht, size_t new_size); bool (*shrink_decision)(const struct rhashtable *ht, size_t new_size); -#ifdef CONFIG_PROVE_LOCKING - int (*mutex_is_held)(void *parent); - void *parent; -#endif }; /** * struct rhashtable - Hash table handle *
@tbl: Bucket table + * @future_tbl: Table under construction during expansion/shrinking * @nelems: Number of elements in table * @shift: Current size (1 << shift) * @p: Configuration parameters + * @run_work: Deferred worker to expand/shrink asynchronously + * @mutex: Mutex to protect current/future table swapping + * @being_destroyed: True if table is set up for destruction */ struct rhashtable { struct bucket_table __rcu *tbl; - size_t nelems; + struct bucket_table __rcu *future_tbl; + atomic_t nelems; size_t shift; struct rhashtable_params p; + struct delayed_work run_work; + struct mutex mutex; + bool being_destroyed; }; #ifdef CONFIG_PROVE_LOCKING -int lockdep_rht_mutex_is_held(const struct rhashtable *ht); +int lockdep_rht_mutex_is_held(struct rhashtable *ht); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); #else -static inline int lockdep_rht_mutex_is_held(const struct rhashtable *ht) +static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return 1; } @@ -112,11 +127,11 @@ bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size); int rhashtable_expand(struct rhashtable *ht); int rhashtable_shrink(struct rhashtable *ht); -void *rhashtable_lookup(const struct rhashtable *ht, const void *key); -void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key, +void *rhashtable_lookup(struct rhashtable *ht, const void *key); +void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg); -void rhashtable_destroy(const struct rhashtable *ht); +void rhashtable_destroy(struct rhashtable *ht); #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index e6b85c4a5828..312e3437c7bc 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -26,19 +26,42 @@ #define HASH_DEFAULT_SIZE 64UL #define HASH_MIN_SIZE 4UL +#define BUCKET_LOCKS_PER_CPU 128UL + +enum { + RHT_LOCK_NORMAL, + RHT_LOCK_NESTED, + RHT_LOCK_NESTED2, +}; + +/* The bucket lock is selected based on the hash and protects mutations + * on a group of hash buckets. + * + * IMPORTANT: When holding the bucket lock of both the old and new table + * during expansions and shrinking, the old bucket lock must always be + * acquired first. + */ +static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash) +{ + return &tbl->locks[hash & tbl->locks_mask]; +} #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) +#define ASSERT_BUCKET_LOCK(TBL, HASH) \ + BUG_ON(!lockdep_rht_bucket_is_held(TBL, HASH)) #ifdef CONFIG_PROVE_LOCKING -int lockdep_rht_mutex_is_held(const struct rhashtable *ht) +int lockdep_rht_mutex_is_held(struct rhashtable *ht) { - return ht->p.mutex_is_held(ht->p.parent); + return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; } EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { - return 1; + spinlock_t *lock = bucket_lock(tbl, hash); + + return (debug_locks) ? 
lockdep_is_held(lock) : 1; } EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); #endif @@ -66,7 +89,7 @@ static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr) return hash; } -static u32 key_hashfn(const struct rhashtable *ht, const void *key, u32 len) +static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len) { struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); u32 hash; @@ -95,7 +118,49 @@ static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n) return pprev; } -static struct bucket_table *bucket_table_alloc(size_t nbuckets) +static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl) +{ + unsigned int i, size; +#if defined(CONFIG_PROVE_LOCKING) + unsigned int nr_pcpus = 2; +#else + unsigned int nr_pcpus = num_possible_cpus(); +#endif + + nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL); + size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul); + + /* Never allocate more than one lock per bucket */ + size = min_t(unsigned int, size, tbl->size); + + if (sizeof(spinlock_t) != 0) { +#ifdef CONFIG_NUMA + if (size * sizeof(spinlock_t) > PAGE_SIZE) + tbl->locks = vmalloc(size * sizeof(spinlock_t)); + else +#endif + tbl->locks = kmalloc_array(size, sizeof(spinlock_t), + GFP_KERNEL); + if (!tbl->locks) + return -ENOMEM; + for (i = 0; i < size; i++) + spin_lock_init(&tbl->locks[i]); + } + tbl->locks_mask = size - 1; + + return 0; +} + +static void bucket_table_free(const struct bucket_table *tbl) +{ + if (tbl) + kvfree(tbl->locks); + + kvfree(tbl); +} + +static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, + size_t nbuckets) { struct bucket_table *tbl; size_t size; @@ -110,12 +175,12 @@ static struct bucket_table *bucket_table_alloc(size_t nbuckets) tbl->size = nbuckets; - return tbl; -} + if (alloc_bucket_locks(ht, tbl) < 0) { + bucket_table_free(tbl); + return NULL; + } -static void bucket_table_free(const struct bucket_table *tbl) -{ - kvfree(tbl); + return tbl; } /** @@ -126,7 +191,7 @@ static void bucket_table_free(const struct bucket_table *tbl) bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) { /* Expand table when exceeding 75% load */ - return ht->nelems > (new_size / 4 * 3); + return atomic_read(&ht->nelems) > (new_size / 4 * 3); } EXPORT_SYMBOL_GPL(rht_grow_above_75); @@ -138,41 +203,59 @@ EXPORT_SYMBOL_GPL(rht_grow_above_75); bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) { /* Shrink table beneath 30% load */ - return ht->nelems < (new_size * 3 / 10); + return atomic_read(&ht->nelems) < (new_size * 3 / 10); } EXPORT_SYMBOL_GPL(rht_shrink_below_30); static void hashtable_chain_unzip(const struct rhashtable *ht, const struct bucket_table *new_tbl, - struct bucket_table *old_tbl, size_t n) + struct bucket_table *old_tbl, + size_t old_hash) { struct rhash_head *he, *p, *next; - unsigned int h; + spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL; + unsigned int new_hash, new_hash2; + + ASSERT_BUCKET_LOCK(old_tbl, old_hash); /* Old bucket empty, no work needed. */ - p = rht_dereference(old_tbl->buckets[n], ht); + p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, + old_hash); if (!p) return; + new_hash = new_hash2 = head_hashfn(ht, new_tbl, p); + new_bucket_lock = bucket_lock(new_tbl, new_hash); + /* Advance the old bucket pointer one or more times until it * reaches a node that doesn't hash to the same bucket as the * previous node p. 
Call the previous node p; */ - h = head_hashfn(ht, new_tbl, p); - rht_for_each_continue(he, p->next, old_tbl, n) { - if (head_hashfn(ht, new_tbl, he) != h) + rht_for_each_continue(he, p->next, old_tbl, old_hash) { + new_hash2 = head_hashfn(ht, new_tbl, he); + if (new_hash != new_hash2) break; p = he; } - RCU_INIT_POINTER(old_tbl->buckets[n], p->next); + rcu_assign_pointer(old_tbl->buckets[old_hash], p->next); + + spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); + + /* If we have encountered an entry that maps to a different bucket in + * the new table, lock down that bucket as well as we might cut off + * the end of the chain. + */ + new_bucket_lock2 = bucket_lock(new_tbl, new_hash); + if (new_bucket_lock != new_bucket_lock2) + spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2); /* Find the subsequent node which does hash to the same * bucket as node P, or NULL if no such node exists. */ next = NULL; if (he) { - rht_for_each_continue(he, he->next, old_tbl, n) { - if (head_hashfn(ht, new_tbl, he) == h) { + rht_for_each_continue(he, he->next, old_tbl, old_hash) { + if (head_hashfn(ht, new_tbl, he) == new_hash) { next = he; break; } @@ -182,7 +265,23 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, /* Set p's next pointer to that subsequent node pointer, * bypassing the nodes which do not hash to p's bucket */ - RCU_INIT_POINTER(p->next, next); + rcu_assign_pointer(p->next, next); + + if (new_bucket_lock != new_bucket_lock2) + spin_unlock_bh(new_bucket_lock2); + spin_unlock_bh(new_bucket_lock); +} + +static void link_old_to_new(struct bucket_table *new_tbl, + unsigned int new_hash, struct rhash_head *entry) +{ + spinlock_t *new_bucket_lock; + + new_bucket_lock = bucket_lock(new_tbl, new_hash); + + spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); + rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry); + spin_unlock_bh(new_bucket_lock); } /** @@ -195,43 +294,59 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, * This function may only be called in a context where it is safe to call * synchronize_rcu(), e.g. not within a rcu_read_lock() section. * - * The caller must ensure that no concurrent table mutations take place. - * It is however valid to have concurrent lookups if they are RCU protected. + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. */ int rhashtable_expand(struct rhashtable *ht) { struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); struct rhash_head *he; - unsigned int i, h; - bool complete; + spinlock_t *old_bucket_lock; + unsigned int new_hash, old_hash; + bool complete = false; ASSERT_RHT_MUTEX(ht); if (ht->p.max_shift && ht->shift >= ht->p.max_shift) return 0; - new_tbl = bucket_table_alloc(old_tbl->size * 2); + new_tbl = bucket_table_alloc(ht, old_tbl->size * 2); if (new_tbl == NULL) return -ENOMEM; ht->shift++; - /* For each new bucket, search the corresponding old bucket - * for the first entry that hashes to the new bucket, and - * link the new bucket to that entry. Since all the entries - * which will end up in the new bucket appear in the same - * old bucket, this constructs an entirely valid new hash - * table, but with multiple buckets "zipped" together into a - * single imprecise chain. + /* Make insertions go into the new, empty table right away. 
Deletions + * and lookups will be attempted in both tables until we synchronize. + * The synchronize_rcu() guarantees for the new table to be picked up + * so no new additions go into the old table while we relink. + */ + rcu_assign_pointer(ht->future_tbl, new_tbl); + synchronize_rcu(); + + /* For each new bucket, search the corresponding old bucket for the + * first entry that hashes to the new bucket, and link the end of + * newly formed bucket chain (containing entries added to future + * table) to that entry. Since all the entries which will end up in + * the new bucket appear in the same old bucket, this constructs an + * entirely valid new hash table, but with multiple buckets + * "zipped" together into a single imprecise chain. */ - for (i = 0; i < new_tbl->size; i++) { - h = rht_bucket_index(old_tbl, i); - rht_for_each(he, old_tbl, h) { - if (head_hashfn(ht, new_tbl, he) == i) { - RCU_INIT_POINTER(new_tbl->buckets[i], he); + for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { + old_hash = rht_bucket_index(old_tbl, new_hash); + old_bucket_lock = bucket_lock(old_tbl, old_hash); + + spin_lock_bh(old_bucket_lock); + rht_for_each(he, old_tbl, old_hash) { + if (head_hashfn(ht, new_tbl, he) == new_hash) { + link_old_to_new(new_tbl, new_hash, he); break; } } + spin_unlock_bh(old_bucket_lock); } /* Publish the new table pointer. Lookups may now traverse @@ -241,7 +356,7 @@ int rhashtable_expand(struct rhashtable *ht) rcu_assign_pointer(ht->tbl, new_tbl); /* Unzip interleaved hash chains */ - do { + while (!complete && !ht->being_destroyed) { /* Wait for readers. All new readers will see the new * table, and thus no references to the old table will * remain. @@ -253,12 +368,17 @@ int rhashtable_expand(struct rhashtable *ht) * table): ... */ complete = true; - for (i = 0; i < old_tbl->size; i++) { - hashtable_chain_unzip(ht, new_tbl, old_tbl, i); - if (old_tbl->buckets[i] != NULL) + for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { + old_bucket_lock = bucket_lock(old_tbl, old_hash); + spin_lock_bh(old_bucket_lock); + + hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash); + if (old_tbl->buckets[old_hash] != NULL) complete = false; + + spin_unlock_bh(old_bucket_lock); } - } while (!complete); + } bucket_table_free(old_tbl); return 0; @@ -272,38 +392,65 @@ EXPORT_SYMBOL_GPL(rhashtable_expand); * This function may only be called in a context where it is safe to call * synchronize_rcu(), e.g. not within a rcu_read_lock() section. * + * The caller must ensure that no concurrent resizing occurs by holding + * ht->mutex. + * * The caller must ensure that no concurrent table mutations take place. * It is however valid to have concurrent lookups if they are RCU protected. + * + * It is valid to have concurrent insertions and deletions protected by per + * bucket locks or concurrent RCU protected lookups and traversals. 
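 *
 * Editorial sketch (mirroring the test harness later in this patch):
 * a direct caller serializes resizes through ht->mutex, e.g.
 *
 *	mutex_lock(&ht->mutex);
 *	rhashtable_shrink(ht);
 *	mutex_unlock(&ht->mutex);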
*/ int rhashtable_shrink(struct rhashtable *ht) { - struct bucket_table *ntbl, *tbl = rht_dereference(ht->tbl, ht); - unsigned int i; + struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht); + spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2; + unsigned int new_hash; ASSERT_RHT_MUTEX(ht); if (ht->shift <= ht->p.min_shift) return 0; - ntbl = bucket_table_alloc(tbl->size / 2); - if (ntbl == NULL) + new_tbl = bucket_table_alloc(ht, tbl->size / 2); + if (new_tbl == NULL) return -ENOMEM; - ht->shift--; + rcu_assign_pointer(ht->future_tbl, new_tbl); + synchronize_rcu(); - /* Link each bucket in the new table to the first bucket - * in the old table that contains entries which will hash - * to the new bucket. + /* Link the first entry in the old bucket to the end of the + * bucket in the new table. As entries are concurrently being + * added to the new table, lock down the new bucket. As we + * always divide the size in half when shrinking, each bucket + * in the new table maps to exactly two buckets in the old + * table. + * + * As removals can occur concurrently on the old table, we need + * to lock down both matching buckets in the old table. */ - for (i = 0; i < ntbl->size; i++) { - ntbl->buckets[i] = tbl->buckets[i]; - RCU_INIT_POINTER(*bucket_tail(ntbl, i), - tbl->buckets[i + ntbl->size]); - + for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { + old_bucket_lock1 = bucket_lock(tbl, new_hash); + old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size); + new_bucket_lock = bucket_lock(new_tbl, new_hash); + + spin_lock_bh(old_bucket_lock1); + spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED); + spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2); + + rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), + tbl->buckets[new_hash]); + rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), + tbl->buckets[new_hash + new_tbl->size]); + + spin_unlock_bh(new_bucket_lock); + spin_unlock_bh(old_bucket_lock2); + spin_unlock_bh(old_bucket_lock1); } /* Publish the new, valid hash table */ - rcu_assign_pointer(ht->tbl, ntbl); + rcu_assign_pointer(ht->tbl, new_tbl); + ht->shift--; /* Wait for readers. No new readers will have references to the * old hash table. @@ -316,31 +463,63 @@ int rhashtable_shrink(struct rhashtable *ht) } EXPORT_SYMBOL_GPL(rhashtable_shrink); +static void rht_deferred_worker(struct work_struct *work) +{ + struct rhashtable *ht; + struct bucket_table *tbl; + + ht = container_of(work, struct rhashtable, run_work.work); + mutex_lock(&ht->mutex); + tbl = rht_dereference(ht->tbl, ht); + + if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) + rhashtable_expand(ht); + else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size)) + rhashtable_shrink(ht); + + mutex_unlock(&ht->mutex); +} + /** * rhashtable_insert - insert object into hash hash table * @ht: hash table * @obj: pointer to hash head inside object * - * Will automatically grow the table via rhashtable_expand() if the the - * grow_decision function specified at rhashtable_init() returns true. + * Will take a per bucket spinlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket lock. * - * The caller must ensure that no concurrent table mutations occur. It is - * however valid to have concurrent lookups if they are RCU protected. + * It is safe to call this function from atomic context. 
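 *
 * Editorial sketch (not part of the patch; 'struct my_obj' and its
 * embedded 'node' member are hypothetical): insertion from atomic,
 * e.g. softirq, context might look like:
 *
 *	struct my_obj *obj = kmalloc(sizeof(*obj), GFP_ATOMIC);
 *
 *	if (obj)
 *		rhashtable_insert(ht, &obj->node);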
+ * + * Will trigger an automatic deferred table resizing if the size grows + * beyond the watermark indicated by grow_decision() which can be passed + * to rhashtable_init(). */ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) { - struct bucket_table *tbl = rht_dereference(ht->tbl, ht); - u32 hash; + struct bucket_table *tbl; + spinlock_t *lock; + unsigned hash; - ASSERT_RHT_MUTEX(ht); + rcu_read_lock(); + tbl = rht_dereference_rcu(ht->future_tbl, ht); hash = head_hashfn(ht, tbl, obj); + lock = bucket_lock(tbl, hash); + + spin_lock_bh(lock); RCU_INIT_POINTER(obj->next, tbl->buckets[hash]); rcu_assign_pointer(tbl->buckets[hash], obj); - ht->nelems++; + spin_unlock_bh(lock); - if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) - rhashtable_expand(ht); + atomic_inc(&ht->nelems); + + /* Only grow the table if no resizing is currently in progress. */ + if (ht->tbl != ht->future_tbl && + ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) + schedule_delayed_work(&ht->run_work, 0); + + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rhashtable_insert); @@ -361,32 +540,56 @@ EXPORT_SYMBOL_GPL(rhashtable_insert); */ bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) { - struct bucket_table *tbl = rht_dereference(ht->tbl, ht); + struct bucket_table *tbl; struct rhash_head __rcu **pprev; struct rhash_head *he; - u32 h; + spinlock_t *lock; + unsigned int hash; - ASSERT_RHT_MUTEX(ht); + rcu_read_lock(); + tbl = rht_dereference_rcu(ht->tbl, ht); + hash = head_hashfn(ht, tbl, obj); - h = head_hashfn(ht, tbl, obj); + lock = bucket_lock(tbl, hash); + spin_lock_bh(lock); - pprev = &tbl->buckets[h]; - rht_for_each(he, tbl, h) { +restart: + pprev = &tbl->buckets[hash]; + rht_for_each(he, tbl, hash) { if (he != obj) { pprev = &he->next; continue; } - RCU_INIT_POINTER(*pprev, he->next); - ht->nelems--; + rcu_assign_pointer(*pprev, obj->next); + atomic_dec(&ht->nelems); - if (ht->p.shrink_decision && + spin_unlock_bh(lock); + + if (ht->tbl != ht->future_tbl && + ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size)) - rhashtable_shrink(ht); + schedule_delayed_work(&ht->run_work, 0); + + rcu_read_unlock(); return true; } + if (tbl != rht_dereference_rcu(ht->tbl, ht)) { + spin_unlock_bh(lock); + + tbl = rht_dereference_rcu(ht->tbl, ht); + hash = head_hashfn(ht, tbl, obj); + + lock = bucket_lock(tbl, hash); + spin_lock_bh(lock); + goto restart; + } + + spin_unlock_bh(lock); + rcu_read_unlock(); + return false; } EXPORT_SYMBOL_GPL(rhashtable_remove); @@ -402,25 +605,35 @@ EXPORT_SYMBOL_GPL(rhashtable_remove); * This lookup function may only be used for fixed key hash table (key_len * paramter set). It will BUG() if used inappropriately. * - * Lookups may occur in parallel with hash mutations as long as the lookup is - * guarded by rcu_read_lock(). The caller must take care of this. + * Lookups may occur in parallel with hashtable mutations and resizing. 
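 *
 * Editorial sketch, reusing the fixed-length integer key of the test
 * harness below:
 *
 *	int key = 1;
 *	struct test_obj *obj;
 *
 *	obj = rhashtable_lookup(ht, &key);
 *	if (obj)
 *		pr_info("found value %d\n", obj->value);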
*/ -void *rhashtable_lookup(const struct rhashtable *ht, const void *key) +void *rhashtable_lookup(struct rhashtable *ht, const void *key) { - const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); + const struct bucket_table *tbl, *old_tbl; struct rhash_head *he; - u32 h; + u32 hash; BUG_ON(!ht->p.key_len); - h = key_hashfn(ht, key, ht->p.key_len); - rht_for_each_rcu(he, tbl, h) { + rcu_read_lock(); + old_tbl = rht_dereference_rcu(ht->tbl, ht); + tbl = rht_dereference_rcu(ht->future_tbl, ht); + hash = key_hashfn(ht, key, ht->p.key_len); +restart: + rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) { if (memcmp(rht_obj(ht, he) + ht->p.key_offset, key, ht->p.key_len)) continue; + rcu_read_unlock(); return rht_obj(ht, he); } + if (unlikely(tbl != old_tbl)) { + tbl = old_tbl; + goto restart; + } + + rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(rhashtable_lookup); @@ -435,25 +648,36 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup); * Traverses the bucket chain behind the provided hash value and calls the * specified compare function for each entry. * - * Lookups may occur in parallel with hash mutations as long as the lookup is - * guarded by rcu_read_lock(). The caller must take care of this. + * Lookups may occur in parallel with hashtable mutations and resizing. * * Returns the first entry on which the compare function returned true. */ -void *rhashtable_lookup_compare(const struct rhashtable *ht, const void *key, +void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg) { - const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); + const struct bucket_table *tbl, *old_tbl; struct rhash_head *he; u32 hash; + rcu_read_lock(); + + old_tbl = rht_dereference_rcu(ht->tbl, ht); + tbl = rht_dereference_rcu(ht->future_tbl, ht); hash = key_hashfn(ht, key, ht->p.key_len); - rht_for_each_rcu(he, tbl, hash) { +restart: + rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) { if (!compare(rht_obj(ht, he), arg)) continue; + rcu_read_unlock(); return rht_obj(ht, he); } + if (unlikely(tbl != old_tbl)) { + tbl = old_tbl; + goto restart; + } + rcu_read_unlock(); + return NULL; } EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); @@ -485,9 +709,6 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params) * .key_offset = offsetof(struct test_obj, key), * .key_len = sizeof(int), * .hashfn = jhash, - * #ifdef CONFIG_PROVE_LOCKING - * .mutex_is_held = &my_mutex_is_held, - * #endif * }; * * Configuration Example 2: Variable length keys @@ -507,9 +728,6 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params) * .head_offset = offsetof(struct test_obj, node), * .hashfn = jhash, * .obj_hashfn = my_hash_fn, - * #ifdef CONFIG_PROVE_LOCKING - * .mutex_is_held = &my_mutex_is_held, - * #endif * }; */ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) @@ -529,18 +747,29 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) if (params->nelem_hint) size = rounded_hashtable_size(params); - tbl = bucket_table_alloc(size); + memset(ht, 0, sizeof(*ht)); + mutex_init(&ht->mutex); + memcpy(&ht->p, params, sizeof(*params)); + + if (params->locks_mul) + ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); + else + ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; + + tbl = bucket_table_alloc(ht, size); if (tbl == NULL) return -ENOMEM; - memset(ht, 0, sizeof(*ht)); ht->shift = ilog2(tbl->size); - memcpy(&ht->p, params, sizeof(*params)); RCU_INIT_POINTER(ht->tbl, tbl); + 
RCU_INIT_POINTER(ht->future_tbl, tbl); if (!ht->p.hash_rnd) get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); + if (ht->p.grow_decision || ht->p.shrink_decision) + INIT_DEFERRABLE_WORK(&ht->run_work, rht_deferred_worker); + return 0; } EXPORT_SYMBOL_GPL(rhashtable_init); @@ -553,9 +782,16 @@ EXPORT_SYMBOL_GPL(rhashtable_init); * has to make sure that no resizing may happen by unpublishing the hashtable * and waiting for the quiescent cycle before releasing the bucket array. */ -void rhashtable_destroy(const struct rhashtable *ht) +void rhashtable_destroy(struct rhashtable *ht) { - bucket_table_free(ht->tbl); + ht->being_destroyed = true; + + mutex_lock(&ht->mutex); + + cancel_delayed_work(&ht->run_work); + bucket_table_free(rht_dereference(ht->tbl, ht)); + + mutex_unlock(&ht->mutex); } EXPORT_SYMBOL_GPL(rhashtable_destroy); @@ -570,13 +806,6 @@ EXPORT_SYMBOL_GPL(rhashtable_destroy); #define TEST_PTR ((void *) 0xdeadbeef) #define TEST_NEXPANDS 4 -#ifdef CONFIG_PROVE_LOCKING -static int test_mutex_is_held(void *parent) -{ - return 1; -} -#endif - struct test_obj { void *ptr; int value; @@ -646,10 +875,10 @@ static void test_bucket_stats(struct rhashtable *ht, bool quiet) i, tbl->buckets[i], cnt); } - pr_info(" Traversal complete: counted=%u, nelems=%zu, entries=%d\n", - total, ht->nelems, TEST_ENTRIES); + pr_info(" Traversal complete: counted=%u, nelems=%u, entries=%d\n", + total, atomic_read(&ht->nelems), TEST_ENTRIES); - if (total != ht->nelems || total != TEST_ENTRIES) + if (total != atomic_read(&ht->nelems) || total != TEST_ENTRIES) pr_warn("Test failed: Total count mismatch ^^^"); } @@ -688,7 +917,9 @@ static int __init test_rhashtable(struct rhashtable *ht) for (i = 0; i < TEST_NEXPANDS; i++) { pr_info(" Table expansion iteration %u...\n", i); + mutex_lock(&ht->mutex); rhashtable_expand(ht); + mutex_unlock(&ht->mutex); rcu_read_lock(); pr_info(" Verifying lookups...\n"); @@ -698,7 +929,9 @@ static int __init test_rhashtable(struct rhashtable *ht) for (i = 0; i < TEST_NEXPANDS; i++) { pr_info(" Table shrinkage iteration %u...\n", i); + mutex_lock(&ht->mutex); rhashtable_shrink(ht); + mutex_unlock(&ht->mutex); rcu_read_lock(); pr_info(" Verifying lookups...\n"); @@ -741,9 +974,6 @@ static int __init test_rht_init(void) .key_offset = offsetof(struct test_obj, value), .key_len = sizeof(int), .hashfn = jhash, -#ifdef CONFIG_PROVE_LOCKING - .mutex_is_held = &test_mutex_is_held, -#endif .grow_decision = rht_grow_above_75, .shrink_decision = rht_shrink_below_30, }; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 7f903cf9a1b9..75887d7d2c6a 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -33,7 +33,7 @@ static bool nft_hash_lookup(const struct nft_set *set, const struct nft_data *key, struct nft_data *data) { - const struct rhashtable *priv = nft_set_priv(set); + struct rhashtable *priv = nft_set_priv(set); const struct nft_hash_elem *he; he = rhashtable_lookup(priv, key); @@ -113,7 +113,7 @@ static bool nft_hash_compare(void *ptr, void *arg) static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem) { - const struct rhashtable *priv = nft_set_priv(set); + struct rhashtable *priv = nft_set_priv(set); struct nft_compare_arg arg = { .set = set, .elem = elem, @@ -129,7 +129,7 @@ static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem) static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, struct nft_set_iter *iter) { - const struct rhashtable *priv = nft_set_priv(set); + struct 
rhashtable *priv = nft_set_priv(set); const struct bucket_table *tbl; const struct nft_hash_elem *he; struct nft_set_elem elem; @@ -162,13 +162,6 @@ static unsigned int nft_hash_privsize(const struct nlattr * const nla[]) return sizeof(struct rhashtable); } -#ifdef CONFIG_PROVE_LOCKING -static int lockdep_nfnl_lock_is_held(void *parent) -{ - return lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES); -} -#endif - static int nft_hash_init(const struct nft_set *set, const struct nft_set_desc *desc, const struct nlattr * const tb[]) @@ -182,9 +175,6 @@ static int nft_hash_init(const struct nft_set *set, .hashfn = jhash, .grow_decision = rht_grow_above_75, .shrink_decision = rht_shrink_below_30, -#ifdef CONFIG_PROVE_LOCKING - .mutex_is_held = lockdep_nfnl_lock_is_held, -#endif }; return rhashtable_init(priv, ¶ms); @@ -192,16 +182,23 @@ static int nft_hash_init(const struct nft_set *set, static void nft_hash_destroy(const struct nft_set *set) { - const struct rhashtable *priv = nft_set_priv(set); - const struct bucket_table *tbl = priv->tbl; + struct rhashtable *priv = nft_set_priv(set); + const struct bucket_table *tbl; struct nft_hash_elem *he; struct rhash_head *pos, *next; unsigned int i; + /* Stop an eventual async resizing */ + priv->being_destroyed = true; + mutex_lock(&priv->mutex); + + tbl = rht_dereference(priv->tbl, priv); for (i = 0; i < tbl->size; i++) { rht_for_each_entry_safe(he, pos, next, tbl, i, node) nft_hash_elem_destroy(set, he); } + mutex_unlock(&priv->mutex); + rhashtable_destroy(priv); } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 57449b6089c2..738c3bfaa564 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -114,15 +114,6 @@ static atomic_t nl_table_users = ATOMIC_INIT(0); DEFINE_MUTEX(nl_sk_hash_lock); EXPORT_SYMBOL_GPL(nl_sk_hash_lock); -#ifdef CONFIG_PROVE_LOCKING -static int lockdep_nl_sk_hash_is_held(void *parent) -{ - if (debug_locks) - return lockdep_is_held(&nl_sk_hash_lock) || lockdep_is_held(&nl_table_lock); - return 1; -} -#endif - static ATOMIC_NOTIFIER_HEAD(netlink_chain); static DEFINE_SPINLOCK(netlink_tap_lock); @@ -1063,7 +1054,8 @@ static int netlink_insert(struct sock *sk, struct net *net, u32 portid) goto err; err = -ENOMEM; - if (BITS_PER_LONG > 32 && unlikely(table->hash.nelems >= UINT_MAX)) + if (BITS_PER_LONG > 32 && + unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX)) goto err; nlk_sk(sk)->portid = portid; @@ -3122,9 +3114,6 @@ static int __init netlink_proto_init(void) .max_shift = 16, /* 64K */ .grow_decision = rht_grow_above_75, .shrink_decision = rht_shrink_below_30, -#ifdef CONFIG_PROVE_LOCKING - .mutex_is_held = lockdep_nl_sk_hash_is_held, -#endif }; if (err != 0) -- cgit v1.2.3 From f89bd6f87a53ce5a7d60662429591ebac2745c10 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 2 Jan 2015 23:00:21 +0100 Subject: rhashtable: Supports for nulls marker In order to allow for wider usage of rhashtable, use a special nulls marker to terminate each chain. The reason for not using the existing nulls_list is that the prev pointer usage would not be valid as entries can be linked in two different buckets at the same time. The 4 nulls base bits can be set through the rhashtable_params structure like this: struct rhashtable_params params = { [...] .nulls_base = (1U << RHT_BASE_SHIFT), }; This reduces the hash length from 32 bits to 27 bits. Signed-off-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/linux/list_nulls.h | 3 ++- include/linux/rhashtable.h | 57 ++++++++++++++++++++++++++++++++++++++-------- lib/rhashtable.c | 37 ++++++++++++++++++++++++------ 3 files changed, 79 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index 5d10ae364b5e..e8c300e06438 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -21,8 +21,9 @@ struct hlist_nulls_head { struct hlist_nulls_node { struct hlist_nulls_node *next, **pprev; }; +#define NULLS_MARKER(value) (1UL | (((long)value) << 1)) #define INIT_HLIST_NULLS_HEAD(ptr, nulls) \ - ((ptr)->first = (struct hlist_nulls_node *) (1UL | (((long)nulls) << 1))) + ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) /** diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index a1688f0a6193..de7cac753b09 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -18,15 +18,32 @@ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H -#include +#include #include +/* + * The end of the chain is marked with a special nulls marks which has + * the following format: + * + * +-------+-----------------------------------------------------+-+ + * | Base | Hash |1| + * +-------+-----------------------------------------------------+-+ + * + * Base (4 bits) : Reserved to distinguish between multiple tables. + * Specified via &struct rhashtable_params.nulls_base. + * Hash (27 bits): Full hash (unmasked) of first element added to bucket + * 1 (1 bit) : Nulls marker (always set) + * + * The remaining bits of the next pointer remain unused for now. + */ +#define RHT_BASE_BITS 4 +#define RHT_HASH_BITS 27 +#define RHT_BASE_SHIFT RHT_HASH_BITS + struct rhash_head { struct rhash_head __rcu *next; }; -#define INIT_HASH_HEAD(ptr) ((ptr)->next = NULL) - /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets @@ -55,6 +72,7 @@ struct rhashtable; * @hash_rnd: Seed to use while hashing * @max_shift: Maximum number of shifts while expanding * @min_shift: Minimum number of shifts while shrinking + * @nulls_base: Base value to generate nulls marker * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) * @hashfn: Function to hash key * @obj_hashfn: Function to hash object @@ -69,6 +87,7 @@ struct rhashtable_params { u32 hash_rnd; size_t max_shift; size_t min_shift; + u32 nulls_base; size_t locks_mul; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; @@ -100,6 +119,24 @@ struct rhashtable { bool being_destroyed; }; +static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) +{ + return NULLS_MARKER(ht->p.nulls_base + hash); +} + +#define INIT_RHT_NULLS_HEAD(ptr, ht, hash) \ + ((ptr) = (typeof(ptr)) rht_marker(ht, hash)) + +static inline bool rht_is_a_nulls(const struct rhash_head *ptr) +{ + return ((unsigned long) ptr & 1); +} + +static inline unsigned long rht_get_nulls_value(const struct rhash_head *ptr) +{ + return ((unsigned long) ptr) >> 1; +} + #ifdef CONFIG_PROVE_LOCKING int lockdep_rht_mutex_is_held(struct rhashtable *ht); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); @@ -157,7 +194,7 @@ void rhashtable_destroy(struct rhashtable *ht); */ #define rht_for_each_continue(pos, head, tbl, hash) \ for (pos = rht_dereference_bucket(head, tbl, hash); \ - pos; \ + !rht_is_a_nulls(pos); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** @@ -180,7 +217,7 
@@ void rhashtable_destroy(struct rhashtable *ht); */ #define rht_for_each_entry_continue(tpos, pos, head, tbl, hash, member) \ for (pos = rht_dereference_bucket(head, tbl, hash); \ - pos && rht_entry(tpos, pos, member); \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** @@ -209,9 +246,9 @@ void rhashtable_destroy(struct rhashtable *ht); */ #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ for (pos = rht_dereference_bucket((tbl)->buckets[hash], tbl, hash), \ - next = pos ? rht_dereference_bucket(pos->next, tbl, hash) \ - : NULL; \ - pos && rht_entry(tpos, pos, member); \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = next) /** @@ -228,7 +265,7 @@ void rhashtable_destroy(struct rhashtable *ht); #define rht_for_each_rcu_continue(pos, head, tbl, hash) \ for (({barrier(); }), \ pos = rht_dereference_bucket_rcu(head, tbl, hash); \ - pos; \ + !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) /** @@ -260,7 +297,7 @@ void rhashtable_destroy(struct rhashtable *ht); #define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \ for (({barrier(); }), \ pos = rht_dereference_bucket_rcu(head, tbl, hash); \ - pos && rht_entry(tpos, pos, member); \ + (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) /** diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 312e3437c7bc..cbad192d3b3d 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -28,6 +28,9 @@ #define HASH_MIN_SIZE 4UL #define BUCKET_LOCKS_PER_CPU 128UL +/* Base bits plus 1 bit for nulls marker */ +#define HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) + enum { RHT_LOCK_NORMAL, RHT_LOCK_NESTED, @@ -86,7 +89,7 @@ static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr) hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len, ht->p.hash_rnd); - return hash; + return hash >> HASH_RESERVED_SPACE; } static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len) @@ -95,6 +98,7 @@ static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len) u32 hash; hash = ht->p.hashfn(key, len, ht->p.hash_rnd); + hash >>= HASH_RESERVED_SPACE; return rht_bucket_index(tbl, hash); } @@ -111,7 +115,7 @@ static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n) struct rhash_head __rcu **pprev; for (pprev = &tbl->buckets[n]; - rht_dereference_bucket(*pprev, tbl, n); + !rht_is_a_nulls(rht_dereference_bucket(*pprev, tbl, n)); pprev = &rht_dereference_bucket(*pprev, tbl, n)->next) ; @@ -164,6 +168,7 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, { struct bucket_table *tbl; size_t size; + int i; size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); tbl = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); @@ -180,6 +185,9 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, return NULL; } + for (i = 0; i < nbuckets; i++) + INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); + return tbl; } @@ -221,7 +229,7 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, /* Old bucket empty, no work needed. 
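 *
 * (Editorial sketch of the nulls convention this patch introduces: a
 * hand-rolled chain walk now terminates on the marker rather than on
 * NULL, e.g.
 *
 *	struct rhash_head *he;
 *
 *	for (he = rht_dereference_bucket(tbl->buckets[hash], tbl, hash);
 *	     !rht_is_a_nulls(he);
 *	     he = rht_dereference_bucket(he->next, tbl, hash))
 *		;
 *
 * which is essentially what the rht_for_each() variants expand to.)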
*/ p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, old_hash); - if (!p) + if (rht_is_a_nulls(p)) return; new_hash = new_hash2 = head_hashfn(ht, new_tbl, p); @@ -252,8 +260,8 @@ static void hashtable_chain_unzip(const struct rhashtable *ht, /* Find the subsequent node which does hash to the same * bucket as node P, or NULL if no such node exists. */ - next = NULL; - if (he) { + INIT_RHT_NULLS_HEAD(next, ht, old_hash); + if (!rht_is_a_nulls(he)) { rht_for_each_continue(he, he->next, old_tbl, old_hash) { if (head_hashfn(ht, new_tbl, he) == new_hash) { next = he; @@ -369,11 +377,15 @@ int rhashtable_expand(struct rhashtable *ht) */ complete = true; for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { + struct rhash_head *head; + old_bucket_lock = bucket_lock(old_tbl, old_hash); spin_lock_bh(old_bucket_lock); hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash); - if (old_tbl->buckets[old_hash] != NULL) + head = rht_dereference_bucket(old_tbl->buckets[old_hash], + old_tbl, old_hash); + if (!rht_is_a_nulls(head)) complete = false; spin_unlock_bh(old_bucket_lock); @@ -498,6 +510,7 @@ static void rht_deferred_worker(struct work_struct *work) void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) { struct bucket_table *tbl; + struct rhash_head *head; spinlock_t *lock; unsigned hash; @@ -508,7 +521,12 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) lock = bucket_lock(tbl, hash); spin_lock_bh(lock); - RCU_INIT_POINTER(obj->next, tbl->buckets[hash]); + head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); + if (rht_is_a_nulls(head)) + INIT_RHT_NULLS_HEAD(obj->next, ht, hash); + else + RCU_INIT_POINTER(obj->next, head); + rcu_assign_pointer(tbl->buckets[hash], obj); spin_unlock_bh(lock); @@ -709,6 +727,7 @@ static size_t rounded_hashtable_size(struct rhashtable_params *params) * .key_offset = offsetof(struct test_obj, key), * .key_len = sizeof(int), * .hashfn = jhash, + * .nulls_base = (1U << RHT_BASE_SHIFT), * }; * * Configuration Example 2: Variable length keys @@ -741,6 +760,9 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) (!params->key_len && !params->obj_hashfn)) return -EINVAL; + if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) + return -EINVAL; + params->min_shift = max_t(size_t, params->min_shift, ilog2(HASH_MIN_SIZE)); @@ -974,6 +996,7 @@ static int __init test_rht_init(void) .key_offset = offsetof(struct test_obj, value), .key_len = sizeof(int), .hashfn = jhash, + .nulls_base = (3U << RHT_BASE_SHIFT), .grow_decision = rht_grow_above_75, .shrink_decision = rht_shrink_below_30, }; -- cgit v1.2.3 From 86b35b64ed7b6b38305dee67a0f2ddff2ca5455d Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Sun, 4 Jan 2015 15:25:09 +0800 Subject: rhashtable: fix missing header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixup below build error: include/linux/rhashtable.h: At top level: include/linux/rhashtable.h:118:34: error: field ‘mutex’ has incomplete type Signed-off-by: Ying Xue Acked-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/linux/rhashtable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index de7cac753b09..de1459c74c4d 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -20,6 +20,7 @@ #include #include +#include /* * The end of the chain is marked with a special nulls marks which has -- cgit v1.2.3 From a3449ded128d037e6b2bd7a59b0741de56506066 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Sun, 4 Jan 2015 15:24:35 +0800 Subject: list_nulls: fix missing header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixup below build error: include/linux/list_nulls.h: In function ‘hlist_nulls_del’: include/linux/list_nulls.h:84:13: error: ‘LIST_POISON2’ undeclared (first use in this function) Signed-off-by: Ying Xue Signed-off-by: David S. Miller --- include/linux/list_nulls.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h index e8c300e06438..f266661d2666 100644 --- a/include/linux/list_nulls.h +++ b/include/linux/list_nulls.h @@ -1,6 +1,9 @@ #ifndef _LINUX_LIST_NULLS_H #define _LINUX_LIST_NULLS_H +#include +#include + /* * Special version of lists, where end of list is not a NULL pointer, * but a 'nulls' marker, which can have many different values. -- cgit v1.2.3 From 224d019c4fbba242041e9b25a926ba873b7da1e2 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 5 Jan 2015 13:56:14 -0800 Subject: ip: Move checksum convert defines to inet Move convert_csum from udp_sock to inet_sock. This allows the possibility that we can use convert checksum for different types of sockets and also allows convert checksum to be enabled from inet layer (what we'll want to do when enabling IP_CHECKSUM cmsg). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/udp.h | 16 +--------------- include/net/inet_sock.h | 17 +++++++++++++++++ net/ipv4/fou.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv4/udp_tunnel.c | 2 +- net/ipv6/udp.c | 2 +- 6 files changed, 22 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index ee3277593222..247cfdcc4b08 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -49,11 +49,7 @@ struct udp_sock { unsigned int corkflag; /* Cork is required */ __u8 encap_type; /* Is this an Encapsulation socket? */ unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */ - no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ - convert_csum:1;/* On receive, convert checksum - * unnecessary to checksum complete - * if possible. - */ + no_check6_rx:1;/* Allow zero UDP6 checksums on RX? */ /* * Following member retains the information to create a UDP header * when the socket is uncorked. 
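The hunks below also turn the knob from a boolean into a small counter: inet_inc_convert_csum() and inet_dec_convert_csum() replace the old udp_set_convert_csum(sk, true/false), so several encapsulation users of one socket can each request checksum conversion and it only switches off when the last of them is gone. A stand-alone C sketch of that counting pattern (struct and helper names here are invented for illustration, not the kernel's):

	#include <stdbool.h>
	#include <stdio.h>

	struct fake_sock {
		unsigned char convert_csum;	/* a counter, not a flag */
	};

	static void inc_convert_csum(struct fake_sock *sk)
	{
		sk->convert_csum++;
	}

	static void dec_convert_csum(struct fake_sock *sk)
	{
		if (sk->convert_csum > 0)	/* never underflow */
			sk->convert_csum--;
	}

	static bool get_convert_csum(struct fake_sock *sk)
	{
		return sk->convert_csum != 0;
	}

	int main(void)
	{
		struct fake_sock sk = { 0 };

		inc_convert_csum(&sk);			/* first tunnel enables it */
		inc_convert_csum(&sk);			/* a second user stacks on top */
		dec_convert_csum(&sk);			/* first user goes away... */
		printf("%d\n", get_convert_csum(&sk));	/* ...still on: prints 1 */
		return 0;
	}
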
@@ -102,16 +98,6 @@ static inline bool udp_get_no_check6_rx(struct sock *sk) return udp_sk(sk)->no_check6_rx; } -static inline void udp_set_convert_csum(struct sock *sk, bool val) -{ - udp_sk(sk)->convert_csum = val; -} - -static inline bool udp_get_convert_csum(struct sock *sk) -{ - return udp_sk(sk)->convert_csum; -} - #define udp_portaddr_for_each_entry(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a829b77523cf..360b110b3e36 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -184,6 +184,7 @@ struct inet_sock { mc_all:1, nodefrag:1; __u8 rcv_tos; + __u8 convert_csum; int uc_index; int mc_index; __be32 mc_addr; @@ -250,4 +251,20 @@ static inline __u8 inet_sk_flowi_flags(const struct sock *sk) return flags; } +static inline void inet_inc_convert_csum(struct sock *sk) +{ + inet_sk(sk)->convert_csum++; +} + +static inline void inet_dec_convert_csum(struct sock *sk) +{ + if (inet_sk(sk)->convert_csum > 0) + inet_sk(sk)->convert_csum--; +} + +static inline bool inet_get_convert_csum(struct sock *sk) +{ + return !!inet_sk(sk)->convert_csum; +} + #endif /* _INET_SOCK_H */ diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index b986298a7ba3..2197c36f722f 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -490,7 +490,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk->sk_user_data = fou; fou->sock = sock; - udp_set_convert_csum(sk, true); + inet_inc_convert_csum(sk); sk->sk_allocation = GFP_ATOMIC; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 13b4dcf86ef6..53358d88f110 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1806,7 +1806,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (sk != NULL) { int ret; - if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) + if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, inet_compute_pseudo); diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 1671263e5fa0..9996e63ed304 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -63,7 +63,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, inet_sk(sk)->mc_loop = 0; /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ - udp_set_convert_csum(sk, true); + inet_inc_convert_csum(sk); rcu_assign_sk_user_data(sk, cfg->sk_user_data); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 189dc4ae3eca..e41f017cd479 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -909,7 +909,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, goto csum_error; } - if (udp_sk(sk)->convert_csum && uh->check && !IS_UDPLITE(sk)) + if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check, ip6_compute_pseudo); -- cgit v1.2.3 From 6810e4a394f9d781050107529b8d1465c00b7b13 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Jan 2015 10:26:10 -0500 Subject: percpu_ref: remove unnecessary ACCESS_ONCE() in percpu_ref_tryget_live() __ref_is_percpu() needs the implied ACCESS_ONCE() in lockless_dereference() on @ref->percpu_count_ptr because the value is tested for !__PERCPU_REF_ATOMIC, which may be set asynchronously, and then used as a pointer. If the compiler generates a separate fetch when using it as a pointer, __PERCPU_REF_ATOMIC may be set in between contaminating the pointer value. 
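A user-space sketch of that trap (C11 atomics stand in for the kernel primitives; all names below are invented for illustration):

	#include <stdatomic.h>
	#include <stddef.h>
	#include <stdint.h>

	#define FLAG_ATOMIC 1UL	/* stand-in for __PERCPU_REF_ATOMIC */

	static _Atomic uintptr_t percpu_count_ptr;

	static unsigned long *ref_percpu_count(void)
	{
		uintptr_t v;

		/*
		 * BROKEN: two independent loads.  FLAG_ATOMIC may be set
		 * by another thread between them, so the value finally
		 * used as a pointer is not the value that was tested:
		 *
		 *	if (!(atomic_load(&percpu_count_ptr) & FLAG_ATOMIC))
		 *		return (unsigned long *)
		 *			atomic_load(&percpu_count_ptr);
		 */

		/* OK: one fetch, tested and then reused as the pointer.
		 * The kernel gets the same single fetch, plus the needed
		 * dependency barrier, from lockless_dereference(). */
		v = atomic_load_explicit(&percpu_count_ptr,
					 memory_order_acquire);
		if (v & FLAG_ATOMIC)
			return NULL;	/* atomic mode, no percpu counter */
		return (unsigned long *)v;
	}
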
percpu_ref_tryget_live() also uses ACCESS_ONCE() to test __PERCPU_REF_DEAD; however, there's no reason for this. I just copied ACCESS_ONCE() usage blindly from __ref_is_percpu(). All it does is confusing people trying to understand what's going on. This patch removes the unnecessary ACCESS_ONCE() usage from percpu_ref_tryget_live() and adds a comment explaining why __ref_is_percpu() needs it. Signed-off-by: Tejun Heo Cc: Kent Overstreet --- include/linux/percpu-refcount.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index b4337646388b..6a7a670366ab 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -128,8 +128,22 @@ static inline void percpu_ref_kill(struct percpu_ref *ref) static inline bool __ref_is_percpu(struct percpu_ref *ref, unsigned long __percpu **percpu_countp) { - /* paired with smp_store_release() in percpu_ref_reinit() */ - unsigned long percpu_ptr = lockless_dereference(ref->percpu_count_ptr); + unsigned long percpu_ptr; + + /* + * The value of @ref->percpu_count_ptr is tested for + * !__PERCPU_REF_ATOMIC, which may be set asynchronously, and then + * used as a pointer. If the compiler generates a separate fetch + * when using it as a pointer, __PERCPU_REF_ATOMIC may be set in + * between contaminating the pointer value, meaning that + * ACCESS_ONCE() is required when fetching it. + * + * Also, we need a data dependency barrier to be paired with + * smp_store_release() in __percpu_ref_switch_to_percpu(). + * + * Use lockless deref which contains both. + */ + percpu_ptr = lockless_dereference(ref->percpu_count_ptr); /* * Theoretically, the following could test just ATOMIC; however, @@ -233,7 +247,7 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) if (__ref_is_percpu(ref, &percpu_count)) { this_cpu_inc(*percpu_count); ret = true; - } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) { + } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) { ret = atomic_long_inc_not_zero(&ref->count); } -- cgit v1.2.3 From 4c907baf36d8339f393bb576d0bab29194d0e6ad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Jan 2015 10:26:10 -0500 Subject: percpu_ref: implement percpu_ref_is_dying() Implement percpu_ref_is_dying() which tests whether the ref is dying or dead. This is useful to determine the current state when a percpu_ref is used as a cyclic on/off switch via kill and reinit. Signed-off-by: Tejun Heo Cc: Kent Overstreet --- include/linux/percpu-refcount.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 6a7a670366ab..12c9b485beb7 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -294,6 +294,20 @@ static inline void percpu_ref_put(struct percpu_ref *ref) percpu_ref_put_many(ref, 1); } +/** + * percpu_ref_is_dying - test whether a percpu refcount is dying or dead + * @ref: percpu_ref to test + * + * Returns %true if @ref is dying or dead. + * + * This function is safe to call as long as @ref is between init and exit + * and the caller is responsible for synchronizing against state changes. 
+ */ +static inline bool percpu_ref_is_dying(struct percpu_ref *ref) +{ + return ref->percpu_count_ptr & __PERCPU_REF_DEAD; +} + /** * percpu_ref_is_zero - test whether a percpu refcount reached zero * @ref: percpu_ref to test -- cgit v1.2.3 From 24dab7a7b3534ef40ecec20cfd7fb3ad99d9ff33 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Jan 2015 12:02:46 -0500 Subject: cgroup: reorder SUBSYS(blkio) in cgroup_subsys.h The scheduled cgroup writeback support requires blkio to be initialized before memcg as memcg needs to provide certain blkcg related functionalities. Relocate blkio so that it's right above memory. Signed-off-by: Tejun Heo --- include/linux/cgroup_subsys.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 98c4f9b12b03..e4a96fb14403 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -15,6 +15,10 @@ SUBSYS(cpu) SUBSYS(cpuacct) #endif +#if IS_ENABLED(CONFIG_BLK_CGROUP) +SUBSYS(blkio) +#endif + #if IS_ENABLED(CONFIG_MEMCG) SUBSYS(memory) #endif @@ -31,10 +35,6 @@ SUBSYS(freezer) SUBSYS(net_cls) #endif -#if IS_ENABLED(CONFIG_BLK_CGROUP) -SUBSYS(blkio) -#endif - #if IS_ENABLED(CONFIG_CGROUP_PERF) SUBSYS(perf_event) #endif -- cgit v1.2.3 From f3ba53802eff25e3eedb60d7afe5262710e20bd5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Jan 2015 12:02:46 -0500 Subject: cgroup: add dummy css_put() for !CONFIG_CGROUPS This will later be depended upon by the scheduled cgroup writeback support. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index da0dae0600e6..b9cb94c3102a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -943,6 +943,8 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, #else /* !CONFIG_CGROUPS */ +struct cgroup_subsys_state; + static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_fork(struct task_struct *p) {} @@ -955,6 +957,8 @@ static inline int cgroupstats_build(struct cgroupstats *stats, return -EINVAL; } +static inline void css_put(struct cgroup_subsys_state *css) {} + /* No cgroups - nothing to do */ static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) -- cgit v1.2.3 From 536fa402221f09633e7c5801b327055ab716a363 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Sep 2014 11:14:48 -0700 Subject: compiler: Allow 1- and 2-byte smp_load_acquire() and smp_store_release() CPUs without single-byte and double-byte loads and stores place some "interesting" requirements on concurrent code. For example (adapted from Peter Hurley's test code), suppose we have the following structure: struct foo { spinlock_t lock1; spinlock_t lock2; char a; /* Protected by lock1. */ char b; /* Protected by lock2. */ }; struct foo *foop; Of course, it is common (and good) practice to place data protected by different locks in separate cache lines. However, if the locks are rarely acquired (for example, only in rare error cases), and there are a great many instances of the data structure, then memory footprint can trump false-sharing concerns, so that it can be better to place them in the same cache line as above.
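A hypothetical compiler targeting a CPU without byte stores then has to implement foop->a = 1 as a full-word read-modify-write, along these lines (user-space sketch only; little-endian byte order and a/b sharing one aligned word are assumptions of the illustration):

	struct foo_bytes {		/* locks omitted from the sketch */
		char a;			/* protected by lock1 */
		char b;			/* protected by lock2 */
	};

	void store_a(struct foo_bytes *foop)
	{
		unsigned long *wp = (unsigned long *)foop;	/* spans a AND b */
		unsigned long w;

		w = *wp;			/* loads b as well */
		w = (w & ~0xffUL) | 1;		/* update only a's byte */
		*wp = w;			/* writes back a possibly stale b,
						   racing with lock2's holder */
	}
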
But if the CPU does not support single-byte loads and stores, a store to foop->a will do a non-atomic read-modify-write operation on foop->b, which will come as a nasty surprise to someone holding foop->lock2. So we now require CPUs to support single-byte and double-byte loads and stores. Therefore, this commit adjusts the definition of __native_word() to allow these sizes to be used by smp_load_acquire() and smp_store_release(). Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index a1c81f80978e..49811cdddaa5 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -385,7 +385,7 @@ static __always_inline void __assign_once_size(volatile void *p, void *res, int /* Is this type a native word size -- useful for atomic operations */ #ifndef __native_word -# define __native_word(t) (sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) +# define __native_word(t) (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) #endif /* Compile time object size, -1 for unknown */ -- cgit v1.2.3 From ac59853c06993a442e8060bc19040b2ca3025aec Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Thu, 13 Nov 2014 14:24:14 -0500 Subject: rcupdate: Replace smp_read_barrier_depends() with lockless_dereference() Recently lockless_dereference() was added which can be used in place of hard-coding smp_read_barrier_depends(). The following PATCH makes the change. Signed-off-by: Pranith Kumar Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ed4f5939a452..386ba288084a 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -582,11 +582,11 @@ static inline void rcu_preempt_sleep_check(void) }) #define __rcu_dereference_check(p, c, space) \ ({ \ - typeof(*p) *_________p1 = (typeof(*p) *__force)ACCESS_ONCE(p); \ + /* Dependency order vs. p above. */ \ + typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \ rcu_lockdep_assert(c, "suspicious rcu_dereference_check() usage"); \ rcu_dereference_sparse(p, space); \ - smp_read_barrier_depends(); /* Dependency order vs. p above. */ \ - ((typeof(*p) __force __kernel *)(_________p1)); \ + ((typeof(*p) __force __kernel *)(________p1)); \ }) #define __rcu_dereference_protected(p, c, space) \ ({ \ @@ -603,10 +603,10 @@ static inline void rcu_preempt_sleep_check(void) }) #define __rcu_dereference_index_check(p, c) \ ({ \ - typeof(p) _________p1 = ACCESS_ONCE(p); \ + /* Dependency order vs. p above. */ \ + typeof(p) _________p1 = lockless_dereference(p); \ rcu_lockdep_assert(c, \ "suspicious rcu_dereference_index_check() usage"); \ - smp_read_barrier_depends(); /* Dependency order vs. p above. 
*/ \ (_________p1); \ }) -- cgit v1.2.3 From f520c98e3e5212d8c282a86d9b7697dd70326192 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Fri, 12 Dec 2014 09:36:14 +0800 Subject: rculist: Fix sparse warning This fixes the following sparse warnings: make C=1 CF=-D__CHECK_ENDIAN__ net/ipv6/addrconf.o net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces) net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces) net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces) net/ipv6/addrconf.c:3495:9: error: incompatible types in comparison expression (different address spaces) To silence these sparse complaints, an RCU annotation should be added to "next" pointer of hlist_node structure through hlist_next_rcu() macro when iterating over a hlist with hlist_for_each_entry_continue_rcu_bh(). By the way, this commit also resolves the same error appearing in hlist_for_each_entry_continue_rcu(). Signed-off-by: Ying Xue Signed-off-by: Paul E. McKenney --- include/linux/rculist.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 529bc946f450..a18b16f1dc0e 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -524,11 +524,11 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue_rcu(pos, member) \ - for (pos = hlist_entry_safe(rcu_dereference((pos)->member.next),\ - typeof(*(pos)), member); \ + for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ + &(pos)->member)), typeof(*(pos)), member); \ pos; \ - pos = hlist_entry_safe(rcu_dereference((pos)->member.next),\ - typeof(*(pos)), member)) + pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ + &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point @@ -536,11 +536,11 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue_rcu_bh(pos, member) \ - for (pos = hlist_entry_safe(rcu_dereference_bh((pos)->member.next),\ - typeof(*(pos)), member); \ + for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \ + &(pos)->member)), typeof(*(pos)), member); \ pos; \ - pos = hlist_entry_safe(rcu_dereference_bh((pos)->member.next),\ - typeof(*(pos)), member)) + pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \ + &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point -- cgit v1.2.3 From a5c198f4f7da6cc48116ca239c59c9f44b753364 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 23 Nov 2014 20:30:06 -0800 Subject: rcu: Expand SRCU ->completed to 64 bits When rcutorture used only the low-order 32 bits of the grace-period number, it was not a problem for SRCU to use a 32-bit completed field. However, rcutorture now uses the full 64 bits on 64-bit systems, so this commit converts SRCU's ->completed field to unsigned long so as to provide 64 bits on 64-bit systems. Signed-off-by: Paul E.
McKenney --- include/linux/srcu.h | 4 ++-- kernel/rcu/srcu.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a2783cb5d275..ef923dd96249 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -45,7 +45,7 @@ struct rcu_batch { #define RCU_BATCH_INIT(name) { NULL, &(name.head) } struct srcu_struct { - unsigned completed; + unsigned long completed; struct srcu_struct_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ bool running; @@ -135,7 +135,7 @@ int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); void synchronize_srcu(struct srcu_struct *sp); void synchronize_srcu_expedited(struct srcu_struct *sp); -long srcu_batches_completed(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); void srcu_barrier(struct srcu_struct *sp); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index e037f3eb2f7b..445bf8ffe3fb 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -546,7 +546,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier); * Report the number of batches, correlated with, but not necessarily * precisely the same as, the number of grace periods that have elapsed. */ -long srcu_batches_completed(struct srcu_struct *sp) +unsigned long srcu_batches_completed(struct srcu_struct *sp) { return sp->completed; } -- cgit v1.2.3 From 9735af5c78599703be633c057af3faee26482028 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 26 Nov 2014 10:42:50 -0800 Subject: rcu: Combine DEFINE_SRCU() and DEFINE_STATIC_SRCU() The DEFINE_SRCU() and DEFINE_STATIC_SRCU() definitions are quite similar, so this commit combines them, saving a bit of code and removing redundancy. Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ef923dd96249..9cfd9623fb03 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -102,13 +102,11 @@ void process_srcu(struct work_struct *work); * define and init a srcu struct at build time. * dont't call init_srcu_struct() nor cleanup_srcu_struct() on it. */ -#define DEFINE_SRCU(name) \ +#define __DEFINE_SRCU(name, is_static) \ static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ - struct srcu_struct name = __SRCU_STRUCT_INIT(name); - -#define DEFINE_STATIC_SRCU(name) \ - static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ - static struct srcu_struct name = __SRCU_STRUCT_INIT(name); + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) /** * call_srcu() - Queue a callback for invocation after an SRCU grace period -- cgit v1.2.3 From 8b3a38daff6f50027039d6979b9eb026907508eb Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Tue, 23 Dec 2014 19:04:23 +0100 Subject: brcmfmac: Add support for bcm43340/1 wireless chipsets This patch adds support for the bcm43340 and bcm43341 wireless chipsets. These two chipsets are identical from wireless parts perspective. As such they use the same firmware image. 
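Because both parts report the shared 43340 chip id (the squash note below drops the separate 43341 id), one firmware-table row can serve them. A sketch of that table-driven lookup, with the matching logic paraphrased rather than copied from brcmfmac:

	#include <stddef.h>

	struct fw_name_row {
		unsigned int chipid;
		unsigned int revmsk;	/* bit n set: chip revision n matches */
		const char *fw;
		const char *nvram;
	};

	static const struct fw_name_row fw_table[] = {
		{ 0x4334, 0xFFFFFFFF, "brcmfmac4334-sdio.bin",
				      "brcmfmac4334-sdio.txt" },
		{ 43340,  0xFFFFFFFF, "brcmfmac43340-sdio.bin",
				      "brcmfmac43340-sdio.txt" },
	};

	static const struct fw_name_row *fw_lookup(unsigned int chipid,
						   unsigned int chiprev)
	{
		size_t i;

		for (i = 0; i < sizeof(fw_table) / sizeof(fw_table[0]); i++)
			if (fw_table[i].chipid == chipid &&
			    (fw_table[i].revmsk & (1U << chiprev)))
				return &fw_table[i];
		return NULL;	/* no firmware known for this chip */
	}
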
Cc: Samuel Ortiz Cc: Rob Herring Signed-off-by: John Stultz [arend@broadcom.com: squash to single commit, remove 43341 chipid] Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Hante Meuleman Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c | 2 ++ drivers/net/wireless/brcm80211/brcmfmac/chip.c | 1 + drivers/net/wireless/brcm80211/brcmfmac/sdio.c | 5 +++++ drivers/net/wireless/brcm80211/include/brcm_hw_ids.h | 3 +++ include/linux/mmc/sdio_ids.h | 6 ++++-- 5 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c b/drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c index 9880dae2a569..dffd9e44f5b6 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/bcmsdh.c @@ -1005,6 +1005,8 @@ static const struct sdio_device_id brcmf_sdmmc_ids[] = { BRCMF_SDIO_DEVICE(BRCM_SDIO_4329_DEVICE_ID), BRCMF_SDIO_DEVICE(BRCM_SDIO_4330_DEVICE_ID), BRCMF_SDIO_DEVICE(BRCM_SDIO_4334_DEVICE_ID), + BRCMF_SDIO_DEVICE(BRCM_SDIO_43340_DEVICE_ID), + BRCMF_SDIO_DEVICE(BRCM_SDIO_43341_DEVICE_ID), BRCMF_SDIO_DEVICE(BRCM_SDIO_43362_DEVICE_ID), BRCMF_SDIO_DEVICE(BRCM_SDIO_4335_4339_DEVICE_ID), BRCMF_SDIO_DEVICE(BRCM_SDIO_4354_DEVICE_ID), diff --git a/drivers/net/wireless/brcm80211/brcmfmac/chip.c b/drivers/net/wireless/brcm80211/brcmfmac/chip.c index 519b79ebaabd..04d2ca0d87d6 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/chip.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/chip.c @@ -481,6 +481,7 @@ static void brcmf_chip_get_raminfo(struct brcmf_chip_priv *ci) ci->pub.ramsize = 0x48000; break; case BRCM_CC_4334_CHIP_ID: + case BRCM_CC_43340_CHIP_ID: ci->pub.ramsize = 0x80000; break; case BRCM_CC_4335_CHIP_ID: diff --git a/drivers/net/wireless/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/brcm80211/brcmfmac/sdio.c index 0b0d51a61060..551da356a5bd 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/sdio.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/sdio.c @@ -608,6 +608,8 @@ static const struct sdiod_drive_str sdiod_drvstr_tab2_3v3[] = { #define BCM4330_NVRAM_NAME "brcm/brcmfmac4330-sdio.txt" #define BCM4334_FIRMWARE_NAME "brcm/brcmfmac4334-sdio.bin" #define BCM4334_NVRAM_NAME "brcm/brcmfmac4334-sdio.txt" +#define BCM43340_FIRMWARE_NAME "brcm/brcmfmac43340-sdio.bin" +#define BCM43340_NVRAM_NAME "brcm/brcmfmac43340-sdio.txt" #define BCM4335_FIRMWARE_NAME "brcm/brcmfmac4335-sdio.bin" #define BCM4335_NVRAM_NAME "brcm/brcmfmac4335-sdio.txt" #define BCM43362_FIRMWARE_NAME "brcm/brcmfmac43362-sdio.bin" @@ -629,6 +631,8 @@ MODULE_FIRMWARE(BCM4330_FIRMWARE_NAME); MODULE_FIRMWARE(BCM4330_NVRAM_NAME); MODULE_FIRMWARE(BCM4334_FIRMWARE_NAME); MODULE_FIRMWARE(BCM4334_NVRAM_NAME); +MODULE_FIRMWARE(BCM43340_FIRMWARE_NAME); +MODULE_FIRMWARE(BCM43340_NVRAM_NAME); MODULE_FIRMWARE(BCM4335_FIRMWARE_NAME); MODULE_FIRMWARE(BCM4335_NVRAM_NAME); MODULE_FIRMWARE(BCM43362_FIRMWARE_NAME); @@ -660,6 +664,7 @@ static const struct brcmf_firmware_names brcmf_fwname_data[] = { { BRCM_CC_4329_CHIP_ID, 0xFFFFFFFF, BRCMF_FIRMWARE_NVRAM(BCM4329) }, { BRCM_CC_4330_CHIP_ID, 0xFFFFFFFF, BRCMF_FIRMWARE_NVRAM(BCM4330) }, { BRCM_CC_4334_CHIP_ID, 0xFFFFFFFF, BRCMF_FIRMWARE_NVRAM(BCM4334) }, + { BRCM_CC_43340_CHIP_ID, 0xFFFFFFFF, BRCMF_FIRMWARE_NVRAM(BCM43340) }, { BRCM_CC_4335_CHIP_ID, 0xFFFFFFFF, BRCMF_FIRMWARE_NVRAM(BCM4335) }, { BRCM_CC_43362_CHIP_ID, 0xFFFFFFFE, BRCMF_FIRMWARE_NVRAM(BCM43362) }, { BRCM_CC_4339_CHIP_ID, 0xFFFFFFFF, BRCMF_FIRMWARE_NVRAM(BCM4339) 
}, diff --git a/drivers/net/wireless/brcm80211/include/brcm_hw_ids.h b/drivers/net/wireless/brcm80211/include/brcm_hw_ids.h index 6996fcc144cf..00215efbc13b 100644 --- a/drivers/net/wireless/brcm80211/include/brcm_hw_ids.h +++ b/drivers/net/wireless/brcm80211/include/brcm_hw_ids.h @@ -34,6 +34,7 @@ #define BRCM_CC_4329_CHIP_ID 0x4329 #define BRCM_CC_4330_CHIP_ID 0x4330 #define BRCM_CC_4334_CHIP_ID 0x4334 +#define BRCM_CC_43340_CHIP_ID 43340 #define BRCM_CC_43362_CHIP_ID 43362 #define BRCM_CC_4335_CHIP_ID 0x4335 #define BRCM_CC_4339_CHIP_ID 0x4339 @@ -51,6 +52,8 @@ #define BRCM_SDIO_4329_DEVICE_ID BRCM_CC_4329_CHIP_ID #define BRCM_SDIO_4330_DEVICE_ID BRCM_CC_4330_CHIP_ID #define BRCM_SDIO_4334_DEVICE_ID BRCM_CC_4334_CHIP_ID +#define BRCM_SDIO_43340_DEVICE_ID BRCM_CC_43340_CHIP_ID +#define BRCM_SDIO_43341_DEVICE_ID 43341 #define BRCM_SDIO_43362_DEVICE_ID BRCM_CC_43362_CHIP_ID #define BRCM_SDIO_4335_4339_DEVICE_ID BRCM_CC_4335_CHIP_ID #define BRCM_SDIO_4354_DEVICE_ID BRCM_CC_4354_CHIP_ID diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 0f01fe065424..996807963716 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -24,13 +24,15 @@ * Vendors and devices. Sort key: vendor first, device next. */ #define SDIO_VENDOR_ID_BROADCOM 0x02d0 -#define SDIO_DEVICE_ID_BROADCOM_43143 43143 +#define SDIO_DEVICE_ID_BROADCOM_43143 0xa887 #define SDIO_DEVICE_ID_BROADCOM_43241 0x4324 #define SDIO_DEVICE_ID_BROADCOM_4329 0x4329 #define SDIO_DEVICE_ID_BROADCOM_4330 0x4330 #define SDIO_DEVICE_ID_BROADCOM_4334 0x4334 +#define SDIO_DEVICE_ID_BROADCOM_43340 0xa94c +#define SDIO_DEVICE_ID_BROADCOM_43341 0xa94d #define SDIO_DEVICE_ID_BROADCOM_4335_4339 0x4335 -#define SDIO_DEVICE_ID_BROADCOM_43362 43362 +#define SDIO_DEVICE_ID_BROADCOM_43362 0xa962 #define SDIO_DEVICE_ID_BROADCOM_4354 0x4354 #define SDIO_VENDOR_ID_INTEL 0x0089 -- cgit v1.2.3 From 2f4383667d57d1c719070db86b14277277752841 Mon Sep 17 00:00:00 2001 From: Ed Swierk Date: Fri, 2 Jan 2015 17:27:56 -0800 Subject: ethtool: Extend ethtool plugin module eeprom API to phylib This patch extends the ethtool plugin module eeprom API to support cards whose phy support is delegated to a separate driver. The handlers for ETHTOOL_GMODULEINFO and ETHTOOL_GMODULEEEPROM call the module_info and module_eeprom functions if the phy driver provides them; otherwise the handlers call the equivalent ethtool_ops functions provided by network drivers with built-in phy support. Signed-off-by: Ed Swierk Signed-off-by: David S. 
Miller --- include/linux/phy.h | 9 +++++++++ net/core/ethtool.c | 45 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 43 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 22af8f8f5802..9c189a1fa3a2 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -565,6 +565,15 @@ struct phy_driver { void (*write_mmd_indirect)(struct phy_device *dev, int ptrad, int devnum, int regnum, u32 val); + /* Get the size and type of the eeprom contained within a plug-in + * module */ + int (*module_info)(struct phy_device *dev, + struct ethtool_modinfo *modinfo); + + /* Get the eeprom information from the plug-in module */ + int (*module_eeprom)(struct phy_device *dev, + struct ethtool_eeprom *ee, u8 *data); + struct device_driver driver; }; #define to_phy_driver(d) container_of(d, struct phy_driver, driver) diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 550892cd6b3f..91f74f3eb204 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1597,20 +1597,31 @@ static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr) return err; } +static int __ethtool_get_module_info(struct net_device *dev, + struct ethtool_modinfo *modinfo) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct phy_device *phydev = dev->phydev; + + if (phydev && phydev->drv && phydev->drv->module_info) + return phydev->drv->module_info(phydev, modinfo); + + if (ops->get_module_info) + return ops->get_module_info(dev, modinfo); + + return -EOPNOTSUPP; +} + static int ethtool_get_module_info(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_modinfo modinfo; - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (!ops->get_module_info) - return -EOPNOTSUPP; if (copy_from_user(&modinfo, useraddr, sizeof(modinfo))) return -EFAULT; - ret = ops->get_module_info(dev, &modinfo); + ret = __ethtool_get_module_info(dev, &modinfo); if (ret) return ret; @@ -1620,21 +1631,33 @@ static int ethtool_get_module_info(struct net_device *dev, return 0; } +static int __ethtool_get_module_eeprom(struct net_device *dev, + struct ethtool_eeprom *ee, u8 *data) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct phy_device *phydev = dev->phydev; + + if (phydev && phydev->drv && phydev->drv->module_eeprom) + return phydev->drv->module_eeprom(phydev, ee, data); + + if (ops->get_module_eeprom) + return ops->get_module_eeprom(dev, ee, data); + + return -EOPNOTSUPP; +} + static int ethtool_get_module_eeprom(struct net_device *dev, void __user *useraddr) { int ret; struct ethtool_modinfo modinfo; - const struct ethtool_ops *ops = dev->ethtool_ops; - - if (!ops->get_module_info || !ops->get_module_eeprom) - return -EOPNOTSUPP; - ret = ops->get_module_info(dev, &modinfo); + ret = __ethtool_get_module_info(dev, &modinfo); if (ret) return ret; - return ethtool_get_any_eeprom(dev, useraddr, ops->get_module_eeprom, + return ethtool_get_any_eeprom(dev, useraddr, + __ethtool_get_module_eeprom, modinfo.eeprom_len); } -- cgit v1.2.3 From 9da7dae94fb8adab5cc5f395640e30736a66e910 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Tue, 6 Jan 2015 17:29:29 +0100 Subject: workqueue.h: remove loops of single statement macros checkpatch.pl complained about two single statement macros in do while (0) loops. The loops and the trailing semicolons are now removed, which makes checkpatch happy and the two macros consistent with the rest of the file. 
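For reference, the do { ... } while (0) wrapper only earns its keep when a macro expands to more than one statement; a stand-alone sketch (not taken from the patch) of why:

	static void stmt1(int x) { (void)x; }
	static void stmt2(int x) { (void)x; }

	/*
	 * A macro expanding to TWO statements needs the do/while(0)
	 * wrapper, or "if (cond) MACRO(x); else ..." would guard only
	 * the first statement and then break on the stray semicolon.
	 */
	#define TWO_STMTS(x)	do { stmt1(x); stmt2(x); } while (0)

	/* A single statement is already safe without any wrapper. */
	#define SINGLE(x)	stmt1(x)

	void example(int cond, int v)
	{
		if (cond)
			SINGLE(v);
		else
			TWO_STMTS(v);
	}
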
Signed-off-by: Valentin Rothberg Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index b996e6cde6bb..74db135f9957 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -220,14 +220,10 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } #endif #define INIT_WORK(_work, _func) \ - do { \ - __INIT_WORK((_work), (_func), 0); \ - } while (0) + __INIT_WORK((_work), (_func), 0) #define INIT_WORK_ONSTACK(_work, _func) \ - do { \ - __INIT_WORK((_work), (_func), 1); \ - } while (0) + __INIT_WORK((_work), (_func), 1) #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ -- cgit v1.2.3 From 6ada1fc0e1c4775de0e043e1bd3ae9d065491aa5 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 3 Dec 2014 19:22:48 -0500 Subject: time: settimeofday: Validate the values of tv from user An unvalidated user input is multiplied by a constant, which can result in an undefined behaviour for large values. While this is validated later, we should avoid triggering undefined behaviour. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: stable Signed-off-by: Sasha Levin [jstultz: include trivial milisecond->microsecond correction noticed by Andy] Signed-off-by: John Stultz --- include/linux/time.h | 13 +++++++++++++ kernel/time/time.c | 4 ++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/time.h b/include/linux/time.h index 8c42cf8d2444..5989b0ead1ec 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -99,6 +99,19 @@ static inline bool timespec_valid_strict(const struct timespec *ts) return true; } +static inline bool timeval_valid(const struct timeval *tv) +{ + /* Dates before 1970 are bogus */ + if (tv->tv_sec < 0) + return false; + + /* Can't have more microseconds then a second */ + if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) + return false; + + return true; +} + extern struct timespec timespec_trunc(struct timespec t, unsigned gran); #define CURRENT_TIME (current_kernel_time()) diff --git a/kernel/time/time.c b/kernel/time/time.c index a9ae20fb0b11..22d5d3b73970 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -196,6 +196,10 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, if (tv) { if (copy_from_user(&user_tv, tv, sizeof(*tv))) return -EFAULT; + + if (!timeval_valid(&user_tv)) + return -EINVAL; + new_ts.tv_sec = user_tv.tv_sec; new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; } -- cgit v1.2.3 From 26e022727f5e88c6e5054e14d954425deacbe56a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 18 Dec 2014 16:02:17 +0100 Subject: efi: Rename efi_guid_unparse to efi_guid_to_str Call it what it does - "unparse" is plain-misleading. 
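As the include/linux/efi.h hunk below shows, the renamed helper simply wraps sprintf(out, "%pUl", guid->b). A user-space sketch of the string it builds, assuming the EFI convention that the first three GUID fields are stored little-endian:

	#include <stdio.h>

	typedef struct {
		unsigned char b[16];
	} efi_guid_t;

	/* Needs 36 characters plus the NUL terminator. */
	static char *guid_to_str(const efi_guid_t *guid, char *out)
	{
		sprintf(out, "%02x%02x%02x%02x-%02x%02x-%02x%02x-"
			     "%02x%02x-%02x%02x%02x%02x%02x%02x",
			guid->b[3], guid->b[2], guid->b[1], guid->b[0],
			guid->b[5], guid->b[4],
			guid->b[7], guid->b[6],
			guid->b[8], guid->b[9],
			guid->b[10], guid->b[11], guid->b[12],
			guid->b[13], guid->b[14], guid->b[15]);
		return out;
	}
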
Signed-off-by: Borislav Petkov Signed-off-by: Ricardo Neri --- block/partitions/efi.c | 2 +- drivers/firmware/efi/efi.c | 4 ++-- drivers/firmware/efi/efivars.c | 6 +++--- fs/efivarfs/super.c | 2 +- include/linux/efi.h | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 56d08fd75b1a..26cb624ace05 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -715,7 +715,7 @@ int efi_partition(struct parsed_partitions *state) state->parts[i + 1].flags = ADDPART_FLAG_RAID; info = &state->parts[i + 1].info; - efi_guid_unparse(&ptes[i].unique_partition_guid, info->uuid); + efi_guid_to_str(&ptes[i].unique_partition_guid, info->uuid); /* Naively convert UTF16-LE to 7 bits. */ label_max = min(ARRAY_SIZE(info->volname) - 1, diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 8590099ac148..ff0bbe383b31 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -272,10 +272,10 @@ static __init int match_config_table(efi_guid_t *guid, int i; if (table_types) { - efi_guid_unparse(guid, str); + efi_guid_to_str(guid, str); for (i = 0; efi_guidcmp(table_types[i].guid, NULL_GUID); i++) { - efi_guid_unparse(&table_types[i].guid, str); + efi_guid_to_str(&table_types[i].guid, str); if (!efi_guidcmp(*guid, table_types[i].guid)) { *(table_types[i].ptr) = table; diff --git a/drivers/firmware/efi/efivars.c b/drivers/firmware/efi/efivars.c index f256ecd8a176..7b2e0496e0c0 100644 --- a/drivers/firmware/efi/efivars.c +++ b/drivers/firmware/efi/efivars.c @@ -39,7 +39,7 @@ * fix locking per Peter Chubb's findings * * 25 Mar 2002 - Matt Domsch - * move uuid_unparse() to include/asm-ia64/efi.h:efi_guid_unparse() + * move uuid_unparse() to include/asm-ia64/efi.h:efi_guid_to_str() * * 12 Feb 2002 - Matt Domsch * use list_for_each_safe when deleting vars. @@ -128,7 +128,7 @@ efivar_guid_read(struct efivar_entry *entry, char *buf) if (!entry || !buf) return 0; - efi_guid_unparse(&var->VendorGuid, str); + efi_guid_to_str(&var->VendorGuid, str); str += strlen(str); str += sprintf(str, "\n"); @@ -569,7 +569,7 @@ efivar_create_sysfs_entry(struct efivar_entry *new_var) private variables from another's. */ *(short_name + strlen(short_name)) = '-'; - efi_guid_unparse(&new_var->var.VendorGuid, + efi_guid_to_str(&new_var->var.VendorGuid, short_name + strlen(short_name)); new_var->kobj.kset = efivars_kset; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 6dad1176ec52..ddbce42548c9 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -140,7 +140,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, name[len] = '-'; - efi_guid_unparse(&entry->var.VendorGuid, name + len + 1); + efi_guid_to_str(&entry->var.VendorGuid, name + len + 1); name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; diff --git a/include/linux/efi.h b/include/linux/efi.h index 0949f9c7e872..d762c81e62a8 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -844,7 +844,7 @@ efi_guidcmp (efi_guid_t left, efi_guid_t right) } static inline char * -efi_guid_unparse(efi_guid_t *guid, char *out) +efi_guid_to_str(efi_guid_t *guid, char *out) { sprintf(out, "%pUl", guid->b); return out; -- cgit v1.2.3 From e61f7d1c3c07a7e51036b0796749edb00deff845 Mon Sep 17 00:00:00 2001 From: "Martin K. 
Petersen" Date: Thu, 8 Jan 2015 10:34:27 -0500 Subject: libata: Whitelist SSDs that are known to properly return zeroes after TRIM As defined, the DRAT (Deterministic Read After Trim) and RZAT (Return Zero After Trim) flags in the ATA Command Set are unreliable in the sense that they only define what happens if the device successfully executed the DSM TRIM command. TRIM is only advisory, however, and the device is free to silently ignore all or parts of the request. In practice this renders the DRAT and RZAT flags completely useless and because the results are unpredictable we decided to disable discard in MD for 3.18 to avoid the risk of data corruption. Hardware vendors in the real world obviously need better guarantees than what the standards bodies provide. Unfortuntely those guarantees are encoded in product requirements documents rather than somewhere we can key off of them programatically. So we are compelled to disabling discard_zeroes_data for all devices unless we explicitly have data to support whitelisting them. This patch whitelists SSDs from a few of the main vendors. None of the whitelists are based on written guarantees. They are purely based on empirical evidence collected from internal and external users that have tested or qualified these drives in RAID deployments. The whitelist is only meant as a starting point and is by no means comprehensive: - All intel SSD models except for 510 - Micron M5?0/M600 - Samsung SSDs - Seagate SSDs Signed-off-by: Martin K. Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 31 +++++++++++++++++++++++++++---- drivers/ata/libata-scsi.c | 10 ++++++---- include/linux/libata.h | 1 + 3 files changed, 34 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 5c84fb5c3372..23c2ae03a7ab 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4233,10 +4233,33 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "PIONEER DVD-RW DVR-216D", NULL, ATA_HORKAGE_NOSETXFER }, /* devices that don't properly handle queued TRIM commands */ - { "Micron_M500*", NULL, ATA_HORKAGE_NO_NCQ_TRIM, }, - { "Crucial_CT???M500SSD*", NULL, ATA_HORKAGE_NO_NCQ_TRIM, }, - { "Micron_M550*", NULL, ATA_HORKAGE_NO_NCQ_TRIM, }, - { "Crucial_CT*M550SSD*", NULL, ATA_HORKAGE_NO_NCQ_TRIM, }, + { "Micron_M[56]*", NULL, ATA_HORKAGE_NO_NCQ_TRIM | + ATA_HORKAGE_ZERO_AFTER_TRIM, }, + { "Crucial_CT*SSD*", NULL, ATA_HORKAGE_NO_NCQ_TRIM, }, + + /* + * As defined, the DRAT (Deterministic Read After Trim) and RZAT + * (Return Zero After Trim) flags in the ATA Command Set are + * unreliable in the sense that they only define what happens if + * the device successfully executed the DSM TRIM command. TRIM + * is only advisory, however, and the device is free to silently + * ignore all or parts of the request. + * + * Whitelist drives that are known to reliably return zeroes + * after TRIM. + */ + + /* + * The intel 510 drive has buggy DRAT/RZAT. Explicitly exclude + * that model before whitelisting all other intel SSDs. 
+ */ + { "INTEL*SSDSC2MH*", NULL, 0, }, + + { "INTEL*SSD*", NULL, ATA_HORKAGE_ZERO_AFTER_TRIM, }, + { "SSD*INTEL*", NULL, ATA_HORKAGE_ZERO_AFTER_TRIM, }, + { "Samsung*SSD*", NULL, ATA_HORKAGE_ZERO_AFTER_TRIM, }, + { "SAMSUNG*SSD*", NULL, ATA_HORKAGE_ZERO_AFTER_TRIM, }, + { "ST[1248][0248]0[FH]*", NULL, ATA_HORKAGE_ZERO_AFTER_TRIM, }, /* * Some WD SATA-I drives spin up and down erratically when the link diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index e364e86e84d7..6abd17a85b13 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -2532,13 +2532,15 @@ static unsigned int ata_scsiop_read_cap(struct ata_scsi_args *args, u8 *rbuf) rbuf[15] = lowest_aligned; if (ata_id_has_trim(args->id)) { - rbuf[14] |= 0x80; /* TPE */ + rbuf[14] |= 0x80; /* LBPME */ - if (ata_id_has_zero_after_trim(args->id)) - rbuf[14] |= 0x40; /* TPRZ */ + if (ata_id_has_zero_after_trim(args->id) && + dev->horkage & ATA_HORKAGE_ZERO_AFTER_TRIM) { + ata_dev_info(dev, "Enabling discard_zeroes_data\n"); + rbuf[14] |= 0x40; /* LBPRZ */ + } } } - return 0; } diff --git a/include/linux/libata.h b/include/linux/libata.h index 2d182413b1db..f2b440e44fd7 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -422,6 +422,7 @@ enum { ATA_HORKAGE_NO_NCQ_TRIM = (1 << 19), /* don't use queued TRIM */ ATA_HORKAGE_NOLPM = (1 << 20), /* don't use LPM */ ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21), /* some WDs have broken LPM */ + ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From ad26aa6c60974acf3228ed0ade97ba5793093dbe Mon Sep 17 00:00:00 2001 From: Jonghwa Lee Date: Thu, 8 Jan 2015 11:04:07 +0900 Subject: regulator: s2mps11: Fix wrong calculation of register offset This patch adds missing registers('BUCK7_SW' & 'LDO29_CTRL'). Since BUCK7 has 1 more register (BUCK7_SW) than others, register offset should be added one more for which has bigger address than BUCK7 registers. 
Fixes: 76b9840b24ae04(regulator: s2mps11: Add support S2MPS13 regulator device) Signed-off-by: Jonghwa Lee Signed-off-by: Chanwoo Choi Signed-off-by: Mark Brown Cc: --- drivers/regulator/s2mps11.c | 42 +++++++++++++++++++++++++++++++++---- include/linux/mfd/samsung/s2mps13.h | 2 ++ 2 files changed, 40 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/s2mps11.c b/drivers/regulator/s2mps11.c index c1444c3d84c2..13ca20ed33a6 100644 --- a/drivers/regulator/s2mps11.c +++ b/drivers/regulator/s2mps11.c @@ -405,6 +405,40 @@ static struct regulator_ops s2mps14_reg_ops; .enable_mask = S2MPS14_ENABLE_MASK \ } +#define regulator_desc_s2mps13_buck7(num, min, step, min_sel) { \ + .name = "BUCK"#num, \ + .id = S2MPS13_BUCK##num, \ + .ops = &s2mps14_reg_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = min, \ + .uV_step = step, \ + .linear_min_sel = min_sel, \ + .n_voltages = S2MPS14_BUCK_N_VOLTAGES, \ + .ramp_delay = S2MPS13_BUCK_RAMP_DELAY, \ + .vsel_reg = S2MPS13_REG_B1OUT + (num) * 2 - 1, \ + .vsel_mask = S2MPS14_BUCK_VSEL_MASK, \ + .enable_reg = S2MPS13_REG_B1CTRL + (num - 1) * 2, \ + .enable_mask = S2MPS14_ENABLE_MASK \ +} + +#define regulator_desc_s2mps13_buck8_10(num, min, step, min_sel) { \ + .name = "BUCK"#num, \ + .id = S2MPS13_BUCK##num, \ + .ops = &s2mps14_reg_ops, \ + .type = REGULATOR_VOLTAGE, \ + .owner = THIS_MODULE, \ + .min_uV = min, \ + .uV_step = step, \ + .linear_min_sel = min_sel, \ + .n_voltages = S2MPS14_BUCK_N_VOLTAGES, \ + .ramp_delay = S2MPS13_BUCK_RAMP_DELAY, \ + .vsel_reg = S2MPS13_REG_B1OUT + (num) * 2 - 1, \ + .vsel_mask = S2MPS14_BUCK_VSEL_MASK, \ + .enable_reg = S2MPS13_REG_B1CTRL + (num) * 2 - 1, \ + .enable_mask = S2MPS14_ENABLE_MASK \ +} + static const struct regulator_desc s2mps13_regulators[] = { regulator_desc_s2mps13_ldo(1, MIN_800_MV, STEP_12_5_MV, 0x00), regulator_desc_s2mps13_ldo(2, MIN_1400_MV, STEP_50_MV, 0x0C), @@ -452,10 +486,10 @@ static const struct regulator_desc s2mps13_regulators[] = { regulator_desc_s2mps13_buck(4, MIN_500_MV, STEP_6_25_MV, 0x10), regulator_desc_s2mps13_buck(5, MIN_500_MV, STEP_6_25_MV, 0x10), regulator_desc_s2mps13_buck(6, MIN_500_MV, STEP_6_25_MV, 0x10), - regulator_desc_s2mps13_buck(7, MIN_500_MV, STEP_6_25_MV, 0x10), - regulator_desc_s2mps13_buck(8, MIN_1000_MV, STEP_12_5_MV, 0x20), - regulator_desc_s2mps13_buck(9, MIN_1000_MV, STEP_12_5_MV, 0x20), - regulator_desc_s2mps13_buck(10, MIN_500_MV, STEP_6_25_MV, 0x10), + regulator_desc_s2mps13_buck7(7, MIN_500_MV, STEP_6_25_MV, 0x10), + regulator_desc_s2mps13_buck8_10(8, MIN_1000_MV, STEP_12_5_MV, 0x20), + regulator_desc_s2mps13_buck8_10(9, MIN_1000_MV, STEP_12_5_MV, 0x20), + regulator_desc_s2mps13_buck8_10(10, MIN_500_MV, STEP_6_25_MV, 0x10), }; static int s2mps14_regulator_enable(struct regulator_dev *rdev) diff --git a/include/linux/mfd/samsung/s2mps13.h b/include/linux/mfd/samsung/s2mps13.h index ce5dda8958fe..b1fd675fa36f 100644 --- a/include/linux/mfd/samsung/s2mps13.h +++ b/include/linux/mfd/samsung/s2mps13.h @@ -59,6 +59,7 @@ enum s2mps13_reg { S2MPS13_REG_B6CTRL, S2MPS13_REG_B6OUT, S2MPS13_REG_B7CTRL, + S2MPS13_REG_B7SW, S2MPS13_REG_B7OUT, S2MPS13_REG_B8CTRL, S2MPS13_REG_B8OUT, @@ -102,6 +103,7 @@ enum s2mps13_reg { S2MPS13_REG_L26CTRL, S2MPS13_REG_L27CTRL, S2MPS13_REG_L28CTRL, + S2MPS13_REG_L29CTRL, S2MPS13_REG_L30CTRL, S2MPS13_REG_L31CTRL, S2MPS13_REG_L32CTRL, -- cgit v1.2.3 From bfa21a0dfe6915dc85953b5d40ea9dae5fdf205f Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 5 Jan 2015 12:48:42 +0100 
Subject: regulator: Allow parsing custom properties when using simplified DT parsing When drivers use simplified DT parsing method (they provide 'regulator_desc.of_match') they still may want to parse custom properties for some of the regulators. For example some of the regulators support GPIO enable control. Add a driver-supplied callback for such case. This way the regulator core parses common bindings offloading a lot of code from drivers and still custom properties may be used. The callback, called for each parsed regulator, may modify the 'regulator_config' initially passed to regulator_register(). Signed-off-by: Krzysztof Kozlowski Signed-off-by: Mark Brown --- drivers/regulator/core.c | 2 +- drivers/regulator/internal.h | 2 ++ drivers/regulator/of_regulator.c | 11 +++++++++++ include/linux/regulator/driver.h | 13 +++++++++++++ 4 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index c13b557a560e..5fae8cabd254 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -3635,7 +3635,7 @@ regulator_register(const struct regulator_desc *regulator_desc, return ERR_PTR(-ENOMEM); } - init_data = regulator_of_get_init_data(dev, regulator_desc, + init_data = regulator_of_get_init_data(dev, regulator_desc, config, &rdev->dev.of_node); if (!init_data) { init_data = config->init_data; diff --git a/drivers/regulator/internal.h b/drivers/regulator/internal.h index 80ba2a35a04b..c74ac8734023 100644 --- a/drivers/regulator/internal.h +++ b/drivers/regulator/internal.h @@ -38,11 +38,13 @@ struct regulator { #ifdef CONFIG_OF struct regulator_init_data *regulator_of_get_init_data(struct device *dev, const struct regulator_desc *desc, + struct regulator_config *config, struct device_node **node); #else static inline struct regulator_init_data * regulator_of_get_init_data(struct device *dev, const struct regulator_desc *desc, + struct regulator_config *config, struct device_node **node) { return NULL; diff --git a/drivers/regulator/of_regulator.c b/drivers/regulator/of_regulator.c index 91eaaf010524..24e812c48d93 100644 --- a/drivers/regulator/of_regulator.c +++ b/drivers/regulator/of_regulator.c @@ -270,6 +270,7 @@ EXPORT_SYMBOL_GPL(of_regulator_match); struct regulator_init_data *regulator_of_get_init_data(struct device *dev, const struct regulator_desc *desc, + struct regulator_config *config, struct device_node **node) { struct device_node *search, *child; @@ -307,6 +308,16 @@ struct regulator_init_data *regulator_of_get_init_data(struct device *dev, break; } + if (desc->of_parse_cb) { + if (desc->of_parse_cb(child, desc, config)) { + dev_err(dev, + "driver callback failed to parse DT for regulator %s\n", + child->name); + init_data = NULL; + break; + } + } + of_node_get(child); *node = child; break; diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 5f1e9ca47417..d4ad5b5a02bb 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -21,6 +21,7 @@ struct regmap; struct regulator_dev; +struct regulator_config; struct regulator_init_data; struct regulator_enable_gpio; @@ -205,6 +206,15 @@ enum regulator_type { * @supply_name: Identifying the regulator supply * @of_match: Name used to identify regulator in DT. * @regulators_node: Name of node containing regulator definitions in DT. + * @of_parse_cb: Optional callback called only if of_match is present. 
+ * Will be called for each regulator parsed from DT, during + * init_data parsing. + * The regulator_config passed as argument to the callback will + * be a copy of config passed to regulator_register, valid only + * for this particular call. Callback may freely change the + * config but it cannot store it for later usage. + * Callback should return 0 on success or negative ERRNO + * indicating failure. * @id: Numerical identifier for the regulator. * @ops: Regulator operations table. * @irq: Interrupt number for the regulator. @@ -251,6 +261,9 @@ struct regulator_desc { const char *supply_name; const char *of_match; const char *regulators_node; + int (*of_parse_cb)(struct device_node *, + const struct regulator_desc *, + struct regulator_config *); int id; bool continuous_voltage_range; unsigned n_voltages; -- cgit v1.2.3 From db30485408326a6f466a843b291b23535f63eda0 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 7 Jan 2015 13:41:54 +0800 Subject: rhashtable: involve rhashtable_lookup_insert routine Involve a new function called rhashtable_lookup_insert() which makes lookup and insertion atomic under bucket lock protection, helping us avoid to introduce an extra lock when we search and insert an object into hash table. Signed-off-by: Ying Xue Signed-off-by: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 1 + lib/rhashtable.c | 97 +++++++++++++++++++++++++++++++++++++++------- 2 files changed, 83 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index de1459c74c4d..73c913f31574 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -168,6 +168,7 @@ int rhashtable_shrink(struct rhashtable *ht); void *rhashtable_lookup(struct rhashtable *ht, const void *key); void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg); +bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj); void rhashtable_destroy(struct rhashtable *ht); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 20006854fce0..4430233c4e11 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -505,8 +505,26 @@ static void rhashtable_wakeup_worker(struct rhashtable *ht) schedule_delayed_work(&ht->run_work, 0); } +static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, + struct bucket_table *tbl, u32 hash) +{ + struct rhash_head *head = rht_dereference_bucket(tbl->buckets[hash], + tbl, hash); + + if (rht_is_a_nulls(head)) + INIT_RHT_NULLS_HEAD(obj->next, ht, hash); + else + RCU_INIT_POINTER(obj->next, head); + + rcu_assign_pointer(tbl->buckets[hash], obj); + + atomic_inc(&ht->nelems); + + rhashtable_wakeup_worker(ht); +} + /** - * rhashtable_insert - insert object into hash hash table + * rhashtable_insert - insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @@ -523,7 +541,6 @@ static void rhashtable_wakeup_worker(struct rhashtable *ht) void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) { struct bucket_table *tbl; - struct rhash_head *head; spinlock_t *lock; unsigned hash; @@ -534,19 +551,9 @@ void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj) lock = bucket_lock(tbl, hash); spin_lock_bh(lock); - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); - if (rht_is_a_nulls(head)) - INIT_RHT_NULLS_HEAD(obj->next, ht, hash); - else - RCU_INIT_POINTER(obj->next, head); - - 
rcu_assign_pointer(tbl->buckets[hash], obj); + __rhashtable_insert(ht, obj, tbl, hash); spin_unlock_bh(lock); - atomic_inc(&ht->nelems); - - rhashtable_wakeup_worker(ht); - rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rhashtable_insert); @@ -560,7 +567,7 @@ EXPORT_SYMBOL_GPL(rhashtable_insert); * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * - * Will automatically shrink the table via rhashtable_expand() if the the + * Will automatically shrink the table via rhashtable_expand() if the * shrink_decision function specified at rhashtable_init() returns true. * * The caller must ensure that no concurrent table mutations occur. It is @@ -641,7 +648,7 @@ static bool rhashtable_compare(void *ptr, void *arg) * for a entry with an identical key. The first matching entry is returned. * * This lookup function may only be used for fixed key hash table (key_len - * paramter set). It will BUG() if used inappropriately. + * parameter set). It will BUG() if used inappropriately. * * Lookups may occur in parallel with hashtable mutations and resizing. */ @@ -702,6 +709,66 @@ restart: } EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); +/** + * rhashtable_lookup_insert - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * + * Locks down the bucket chain in both the old and new table if a resize + * is in progress to ensure that writers can't remove from the old table + * and can't insert to the new table during the atomic operation of search + * and insertion. Searches for duplicates in both the old and new table if + * a resize is in progress. + * + * This lookup function may only be used for fixed key hash table (key_len + * parameter set). It will BUG() if used inappropriately. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if the size grows + * beyond the watermark indicated by grow_decision() which can be passed + * to rhashtable_init(). 
+ */ +bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj) +{ + struct bucket_table *new_tbl, *old_tbl; + spinlock_t *new_bucket_lock, *old_bucket_lock; + u32 new_hash, old_hash; + bool success = true; + + BUG_ON(!ht->p.key_len); + + rcu_read_lock(); + + old_tbl = rht_dereference_rcu(ht->tbl, ht); + old_hash = head_hashfn(ht, old_tbl, obj); + old_bucket_lock = bucket_lock(old_tbl, old_hash); + spin_lock_bh(old_bucket_lock); + + new_tbl = rht_dereference_rcu(ht->future_tbl, ht); + new_hash = head_hashfn(ht, new_tbl, obj); + new_bucket_lock = bucket_lock(new_tbl, new_hash); + if (unlikely(old_tbl != new_tbl)) + spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); + + if (rhashtable_lookup(ht, rht_obj(ht, obj) + ht->p.key_offset)) { + success = false; + goto exit; + } + + __rhashtable_insert(ht, obj, new_tbl, new_hash); + +exit: + if (unlikely(old_tbl != new_tbl)) + spin_unlock_bh(new_bucket_lock); + spin_unlock_bh(old_bucket_lock); + + rcu_read_unlock(); + + return success; +} +EXPORT_SYMBOL_GPL(rhashtable_lookup_insert); + static size_t rounded_hashtable_size(struct rhashtable_params *params) { return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), -- cgit v1.2.3 From c0c09bfdc4150b3918526660768585cd477adf35 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 7 Jan 2015 13:41:56 +0800 Subject: rhashtable: avoid unnecessary wakeup for worker queue Move the checks of whether the hash table size exceeds its maximum threshold or has reached its minimum threshold from the resizing functions to the resizing decision functions, avoiding unnecessary wakeups of the worker queue thread. Signed-off-by: Ying Xue Cc: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 2 +- lib/rhashtable.c | 18 +++++++-----------  2 files changed, 8 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 73c913f31574..326acd8c2e9f 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -113,7 +113,7 @@ struct rhashtable { struct bucket_table __rcu *tbl; struct bucket_table __rcu *future_tbl; atomic_t nelems; - size_t shift; + atomic_t shift; struct rhashtable_params p; struct delayed_work run_work; struct mutex mutex; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 1aef942976fe..7fb474b18f1b 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -199,7 +199,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) { /* Expand table when exceeding 75% load */ - return atomic_read(&ht->nelems) > (new_size / 4 * 3); + return atomic_read(&ht->nelems) > (new_size / 4 * 3) && + (ht->p.max_shift && atomic_read(&ht->shift) < ht->p.max_shift); } EXPORT_SYMBOL_GPL(rht_grow_above_75); @@ -211,7 +212,8 @@ EXPORT_SYMBOL_GPL(rht_grow_above_75); bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) { /* Shrink table beneath 30% load */ - return atomic_read(&ht->nelems) < (new_size * 3 / 10); + return atomic_read(&ht->nelems) < (new_size * 3 / 10) && + (atomic_read(&ht->shift) > ht->p.min_shift); } EXPORT_SYMBOL_GPL(rht_shrink_below_30); @@ -318,14 +320,11 @@ int rhashtable_expand(struct rhashtable *ht) ASSERT_RHT_MUTEX(ht); - if (ht->p.max_shift && ht->shift >= ht->p.max_shift) - return 0; - new_tbl = bucket_table_alloc(ht, old_tbl->size * 2); if (new_tbl == NULL) return -ENOMEM; - ht->shift++; + atomic_inc(&ht->shift); /* Make insertions go into the new,
empty table right away. Deletions * and lookups will be attempted in both tables until we synchronize. @@ -421,9 +420,6 @@ int rhashtable_shrink(struct rhashtable *ht) ASSERT_RHT_MUTEX(ht); - if (ht->shift <= ht->p.min_shift) - return 0; - new_tbl = bucket_table_alloc(ht, tbl->size / 2); if (new_tbl == NULL) return -ENOMEM; @@ -462,7 +458,7 @@ int rhashtable_shrink(struct rhashtable *ht) /* Publish the new, valid hash table */ rcu_assign_pointer(ht->tbl, new_tbl); - ht->shift--; + atomic_dec(&ht->shift); /* Wait for readers. No new readers will have references to the * old hash table. @@ -851,7 +847,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) if (tbl == NULL) return -ENOMEM; - ht->shift = ilog2(tbl->size); + atomic_set(&ht->shift, ilog2(tbl->size)); RCU_INIT_POINTER(ht->tbl, tbl); RCU_INIT_POINTER(ht->future_tbl, tbl); -- cgit v1.2.3 From e5a7a72cd51a585b8f1a1e299bf88fff44b94440 Mon Sep 17 00:00:00 2001 From: Robin Gong Date: Fri, 9 Jan 2015 09:57:33 +0800 Subject: regulator: pfuze100-regulator: add pfuze3000 support Add pfuze3000 chip support. Signed-off-by: Robin Gong Signed-off-by: Mark Brown --- .../devicetree/bindings/regulator/pfuze100.txt | 94 ++++++++++++++- drivers/regulator/pfuze100-regulator.c | 134 +++++++++++++++++++-- include/linux/regulator/pfuze100.h | 14 +++ 3 files changed, 232 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/regulator/pfuze100.txt b/Documentation/devicetree/bindings/regulator/pfuze100.txt index 34ef5d16d0f1..9b40db88f637 100644 --- a/Documentation/devicetree/bindings/regulator/pfuze100.txt +++ b/Documentation/devicetree/bindings/regulator/pfuze100.txt @@ -1,7 +1,7 @@ PFUZE100 family of regulators Required properties: -- compatible: "fsl,pfuze100" or "fsl,pfuze200" +- compatible: "fsl,pfuze100", "fsl,pfuze200", "fsl,pfuze3000" - reg: I2C slave address Required child node: @@ -14,6 +14,8 @@ Required child node: sw1ab,sw1c,sw2,sw3a,sw3b,sw4,swbst,vsnvs,vrefddr,vgen1~vgen6 --PFUZE200 sw1ab,sw2,sw3a,sw3b,swbst,vsnvs,vrefddr,vgen1~vgen6 + --PFUZE3000 + sw1a,sw1b,sw2,sw3,swbst,vsnvs,vrefddr,vldo1,vldo2,vccsd,v33,vldo3,vldo4 Each regulator is defined using the standard binding for regulators. 
@@ -205,3 +207,93 @@ Example 2: PFUZE200 }; }; }; + +Example 3: PFUZE3000 + + pmic: pfuze3000@08 { + compatible = "fsl,pfuze3000"; + reg = <0x08>; + + regulators { + sw1a_reg: sw1a { + regulator-min-microvolt = <700000>; + regulator-max-microvolt = <1475000>; + regulator-boot-on; + regulator-always-on; + regulator-ramp-delay = <6250>; + }; + /* use sw1c_reg to align with pfuze100/pfuze200 */ + sw1c_reg: sw1b { + regulator-min-microvolt = <700000>; + regulator-max-microvolt = <1475000>; + regulator-boot-on; + regulator-always-on; + regulator-ramp-delay = <6250>; + }; + + sw2_reg: sw2 { + regulator-min-microvolt = <2500000>; + regulator-max-microvolt = <3300000>; + regulator-boot-on; + regulator-always-on; + }; + + sw3a_reg: sw3 { + regulator-min-microvolt = <900000>; + regulator-max-microvolt = <1650000>; + regulator-boot-on; + regulator-always-on; + }; + + swbst_reg: swbst { + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5150000>; + }; + + snvs_reg: vsnvs { + regulator-min-microvolt = <1000000>; + regulator-max-microvolt = <3000000>; + regulator-boot-on; + regulator-always-on; + }; + + vref_reg: vrefddr { + regulator-boot-on; + regulator-always-on; + }; + + vgen1_reg: vldo1 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + }; + + vgen2_reg: vldo2 { + regulator-min-microvolt = <800000>; + regulator-max-microvolt = <1550000>; + }; + + vgen3_reg: vccsd { + regulator-min-microvolt = <2850000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + }; + + vgen4_reg: v33 { + regulator-min-microvolt = <2850000>; + regulator-max-microvolt = <3300000>; + }; + + vgen5_reg: vldo3 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + }; + + vgen6_reg: vldo4 { + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + }; + }; + }; diff --git a/drivers/regulator/pfuze100-regulator.c b/drivers/regulator/pfuze100-regulator.c index c879dff597ee..8cc8d1877c44 100644 --- a/drivers/regulator/pfuze100-regulator.c +++ b/drivers/regulator/pfuze100-regulator.c @@ -56,7 +56,7 @@ #define PFUZE100_VGEN5VOL 0x70 #define PFUZE100_VGEN6VOL 0x71 -enum chips { PFUZE100, PFUZE200 }; +enum chips { PFUZE100, PFUZE200, PFUZE3000 = 3 }; struct pfuze_regulator { struct regulator_desc desc; @@ -80,9 +80,18 @@ static const int pfuze100_vsnvs[] = { 1000000, 1100000, 1200000, 1300000, 1500000, 1800000, 3000000, }; +static const int pfuze3000_sw2lo[] = { + 1500000, 1550000, 1600000, 1650000, 1700000, 1750000, 1800000, 1850000, +}; + +static const int pfuze3000_sw2hi[] = { + 2500000, 2800000, 2850000, 3000000, 3100000, 3150000, 3200000, 3300000, +}; + static const struct i2c_device_id pfuze_device_id[] = { {.name = "pfuze100", .driver_data = PFUZE100}, {.name = "pfuze200", .driver_data = PFUZE200}, + {.name = "pfuze3000", .driver_data = PFUZE3000}, { } }; MODULE_DEVICE_TABLE(i2c, pfuze_device_id); @@ -90,6 +99,7 @@ MODULE_DEVICE_TABLE(i2c, pfuze_device_id); static const struct of_device_id pfuze_dt_ids[] = { { .compatible = "fsl,pfuze100", .data = (void *)PFUZE100}, { .compatible = "fsl,pfuze200", .data = (void *)PFUZE200}, + { .compatible = "fsl,pfuze3000", .data = (void *)PFUZE3000}, { } }; MODULE_DEVICE_TABLE(of, pfuze_dt_ids); @@ -219,6 +229,60 @@ static struct regulator_ops pfuze100_swb_regulator_ops = { .stby_mask = 0x20, \ } +#define PFUZE3000_VCC_REG(_chip, _name, base, min, max, step) { \ + .desc = { \ + .name = #_name, \ + .n_voltages = ((max) - 
(min)) / (step) + 1, \ + .ops = &pfuze100_ldo_regulator_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = _chip ## _ ## _name, \ + .owner = THIS_MODULE, \ + .min_uV = (min), \ + .uV_step = (step), \ + .vsel_reg = (base), \ + .vsel_mask = 0x3, \ + .enable_reg = (base), \ + .enable_mask = 0x10, \ + }, \ + .stby_reg = (base), \ + .stby_mask = 0x20, \ +} + + +#define PFUZE3000_SW2_REG(_chip, _name, base, min, max, step) { \ + .desc = { \ + .name = #_name,\ + .n_voltages = ((max) - (min)) / (step) + 1, \ + .ops = &pfuze100_sw_regulator_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = _chip ## _ ## _name, \ + .owner = THIS_MODULE, \ + .min_uV = (min), \ + .uV_step = (step), \ + .vsel_reg = (base) + PFUZE100_VOL_OFFSET, \ + .vsel_mask = 0x7, \ + }, \ + .stby_reg = (base) + PFUZE100_STANDBY_OFFSET, \ + .stby_mask = 0x7, \ +} + +#define PFUZE3000_SW3_REG(_chip, _name, base, min, max, step) { \ + .desc = { \ + .name = #_name,\ + .n_voltages = ((max) - (min)) / (step) + 1, \ + .ops = &pfuze100_sw_regulator_ops, \ + .type = REGULATOR_VOLTAGE, \ + .id = _chip ## _ ## _name, \ + .owner = THIS_MODULE, \ + .min_uV = (min), \ + .uV_step = (step), \ + .vsel_reg = (base) + PFUZE100_VOL_OFFSET, \ + .vsel_mask = 0xf, \ + }, \ + .stby_reg = (base) + PFUZE100_STANDBY_OFFSET, \ + .stby_mask = 0xf, \ +} + /* PFUZE100 */ static struct pfuze_regulator pfuze100_regulators[] = { PFUZE100_SW_REG(PFUZE100, SW1AB, PFUZE100_SW1ABVOL, 300000, 1875000, 25000), @@ -254,6 +318,22 @@ static struct pfuze_regulator pfuze200_regulators[] = { PFUZE100_VGEN_REG(PFUZE200, VGEN6, PFUZE100_VGEN6VOL, 1800000, 3300000, 100000), }; +static struct pfuze_regulator pfuze3000_regulators[] = { + PFUZE100_SW_REG(PFUZE3000, SW1A, PFUZE100_SW1ABVOL, 700000, 1475000, 25000), + PFUZE100_SW_REG(PFUZE3000, SW1B, PFUZE100_SW1CVOL, 700000, 1475000, 25000), + PFUZE100_SWB_REG(PFUZE3000, SW2, PFUZE100_SW2VOL, 0x7, pfuze3000_sw2lo), + PFUZE3000_SW3_REG(PFUZE3000, SW3, PFUZE100_SW3AVOL, 900000, 1650000, 50000), + PFUZE100_SWB_REG(PFUZE3000, SWBST, PFUZE100_SWBSTCON1, 0x3, pfuze100_swbst), + PFUZE100_SWB_REG(PFUZE3000, VSNVS, PFUZE100_VSNVSVOL, 0x7, pfuze100_vsnvs), + PFUZE100_FIXED_REG(PFUZE3000, VREFDDR, PFUZE100_VREFDDRCON, 750000), + PFUZE100_VGEN_REG(PFUZE3000, VLDO1, PFUZE100_VGEN1VOL, 1800000, 3300000, 100000), + PFUZE100_VGEN_REG(PFUZE3000, VLDO2, PFUZE100_VGEN2VOL, 800000, 1550000, 50000), + PFUZE3000_VCC_REG(PFUZE3000, VCCSD, PFUZE100_VGEN3VOL, 2850000, 3300000, 150000), + PFUZE3000_VCC_REG(PFUZE3000, V33, PFUZE100_VGEN4VOL, 2850000, 3300000, 150000), + PFUZE100_VGEN_REG(PFUZE3000, VLDO3, PFUZE100_VGEN5VOL, 1800000, 3300000, 100000), + PFUZE100_VGEN_REG(PFUZE3000, VLDO4, PFUZE100_VGEN6VOL, 1800000, 3300000, 100000), +}; + static struct pfuze_regulator *pfuze_regulators; #ifdef CONFIG_OF @@ -294,6 +374,24 @@ static struct of_regulator_match pfuze200_matches[] = { { .name = "vgen6", }, }; +/* PFUZE3000 */ +static struct of_regulator_match pfuze3000_matches[] = { + + { .name = "sw1a", }, + { .name = "sw1b", }, + { .name = "sw2", }, + { .name = "sw3", }, + { .name = "swbst", }, + { .name = "vsnvs", }, + { .name = "vrefddr", }, + { .name = "vldo1", }, + { .name = "vldo2", }, + { .name = "vccsd", }, + { .name = "v33", }, + { .name = "vldo3", }, + { .name = "vldo4", }, +}; + static struct of_regulator_match *pfuze_matches; static int pfuze_parse_regulators_dt(struct pfuze_chip *chip) @@ -313,6 +411,11 @@ static int pfuze_parse_regulators_dt(struct pfuze_chip *chip) } switch (chip->chip_id) { + case PFUZE3000: + pfuze_matches = pfuze3000_matches; + ret = 
of_regulator_match(dev, parent, pfuze3000_matches, + ARRAY_SIZE(pfuze3000_matches)); + break; case PFUZE200: pfuze_matches = pfuze200_matches; ret = of_regulator_match(dev, parent, pfuze200_matches, @@ -378,7 +481,8 @@ static int pfuze_identify(struct pfuze_chip *pfuze_chip) * as ID=8 in PFUZE100 */ dev_info(pfuze_chip->dev, "Assuming misprogrammed ID=0x8"); - } else if ((value & 0x0f) != pfuze_chip->chip_id) { + } else if ((value & 0x0f) != pfuze_chip->chip_id && + (value & 0xf0) >> 4 != pfuze_chip->chip_id) { /* device id NOT match with your setting */ dev_warn(pfuze_chip->dev, "Illegal ID: %x\n", value); return -ENODEV; @@ -417,7 +521,7 @@ static int pfuze100_regulator_probe(struct i2c_client *client, int i, ret; const struct of_device_id *match; u32 regulator_num; - u32 sw_check_start, sw_check_end; + u32 sw_check_start, sw_check_end, sw_hi = 0x40; pfuze_chip = devm_kzalloc(&client->dev, sizeof(*pfuze_chip), GFP_KERNEL); @@ -458,13 +562,19 @@ static int pfuze100_regulator_probe(struct i2c_client *client, /* use the right regulators after identify the right device */ switch (pfuze_chip->chip_id) { + case PFUZE3000: + pfuze_regulators = pfuze3000_regulators; + regulator_num = ARRAY_SIZE(pfuze3000_regulators); + sw_check_start = PFUZE3000_SW2; + sw_check_end = PFUZE3000_SW2; + sw_hi = 1 << 3; + break; case PFUZE200: pfuze_regulators = pfuze200_regulators; regulator_num = ARRAY_SIZE(pfuze200_regulators); sw_check_start = PFUZE200_SW2; sw_check_end = PFUZE200_SW3B; break; - case PFUZE100: default: pfuze_regulators = pfuze100_regulators; @@ -474,7 +584,8 @@ static int pfuze100_regulator_probe(struct i2c_client *client, break; } dev_info(&client->dev, "pfuze%s found.\n", - (pfuze_chip->chip_id == PFUZE100) ? "100" : "200"); + (pfuze_chip->chip_id == PFUZE100) ? "100" : + ((pfuze_chip->chip_id == PFUZE200) ? "200" : "3000")); memcpy(pfuze_chip->regulator_descs, pfuze_regulators, sizeof(pfuze_chip->regulator_descs)); @@ -498,10 +609,15 @@ static int pfuze100_regulator_probe(struct i2c_client *client, /* SW2~SW4 high bit check and modify the voltage value table */ if (i >= sw_check_start && i <= sw_check_end) { regmap_read(pfuze_chip->regmap, desc->vsel_reg, &val); - if (val & 0x40) { - desc->min_uV = 800000; - desc->uV_step = 50000; - desc->n_voltages = 51; + if (val & sw_hi) { + if (pfuze_chip->chip_id == PFUZE3000) { + desc->volt_table = pfuze3000_sw2hi; + desc->n_voltages = ARRAY_SIZE(pfuze3000_sw2hi); + } else { + desc->min_uV = 800000; + desc->uV_step = 50000; + desc->n_voltages = 51; + } } } diff --git a/include/linux/regulator/pfuze100.h b/include/linux/regulator/pfuze100.h index 364f7a7c43db..70c6c66c5bcf 100644 --- a/include/linux/regulator/pfuze100.h +++ b/include/linux/regulator/pfuze100.h @@ -49,6 +49,20 @@ #define PFUZE200_VGEN5 11 #define PFUZE200_VGEN6 12 +#define PFUZE3000_SW1A 0 +#define PFUZE3000_SW1B 1 +#define PFUZE3000_SW2 2 +#define PFUZE3000_SW3 3 +#define PFUZE3000_SWBST 4 +#define PFUZE3000_VSNVS 5 +#define PFUZE3000_VREFDDR 6 +#define PFUZE3000_VLDO1 7 +#define PFUZE3000_VLDO2 8 +#define PFUZE3000_VCCSD 9 +#define PFUZE3000_V33 10 +#define PFUZE3000_VLDO3 11 +#define PFUZE3000_VLDO4 12 + struct regulator_init_data; struct pfuze_regulator_platform_data { -- cgit v1.2.3 From ed09dcc8bd7fe0991af7737e675996cbd022f38f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 30 Dec 2014 14:46:14 -0800 Subject: ses: close potential registration race The slot and address fields have a small window of instability when userspace can read them before initialization. 
Separate enclosure_component allocation from registration. Signed-off-by: Dan Williams Signed-off-by: Song Liu Reviewed-by: Jens Axboe Cc: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/misc/enclosure.c | 37 ++++++++++++++++++++++++++----------- drivers/scsi/ses.c | 21 ++++++++++++++------- include/linux/enclosure.h | 5 +++-- 3 files changed, 43 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c index 180a5442fd4b..e18278aa6bbb 100644 --- a/drivers/misc/enclosure.c +++ b/drivers/misc/enclosure.c @@ -273,27 +273,26 @@ enclosure_component_find_by_name(struct enclosure_device *edev, static const struct attribute_group *enclosure_component_groups[]; /** - * enclosure_component_register - add a particular component to an enclosure + * enclosure_component_alloc - prepare a new enclosure component * @edev: the enclosure to add the component * @num: the device number * @type: the type of component being added * @name: an optional name to appear in sysfs (leave NULL if none) * - * Registers the component. The name is optional for enclosures that - * give their components a unique name. If not, leave the field NULL - * and a name will be assigned. + * The name is optional for enclosures that give their components a unique + * name. If not, leave the field NULL and a name will be assigned. * * Returns a pointer to the enclosure component or an error. */ struct enclosure_component * -enclosure_component_register(struct enclosure_device *edev, - unsigned int number, - enum enclosure_component_type type, - const char *name) +enclosure_component_alloc(struct enclosure_device *edev, + unsigned int number, + enum enclosure_component_type type, + const char *name) { struct enclosure_component *ecomp; struct device *cdev; - int err, i; + int i; char newname[COMPONENT_NAME_SIZE]; if (number >= edev->components) @@ -327,14 +326,30 @@ enclosure_component_register(struct enclosure_device *edev, cdev->release = enclosure_component_release; cdev->groups = enclosure_component_groups; + return ecomp; +} +EXPORT_SYMBOL_GPL(enclosure_component_alloc); + +/** + * enclosure_component_register - publishes an initialized enclosure component + * @ecomp: component to add + * + * Returns 0 on successful registration, releases the component otherwise + */ +int enclosure_component_register(struct enclosure_component *ecomp) +{ + struct device *cdev; + int err; + + cdev = &ecomp->cdev; err = device_register(cdev); if (err) { ecomp->number = -1; put_device(cdev); - return ERR_PTR(err); + return err; } - return ecomp; + return 0; } EXPORT_SYMBOL_GPL(enclosure_component_register); diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index b7e79e7646ad..7dd9cf558380 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -423,16 +423,23 @@ static void ses_enclosure_data_process(struct enclosure_device *edev, type_ptr[0] == ENCLOSURE_COMPONENT_ARRAY_DEVICE) { if (create) - ecomp = enclosure_component_register(edev, - components++, - type_ptr[0], - name); + ecomp = enclosure_component_alloc( + edev, + components++, + type_ptr[0], + name); else ecomp = &edev->component[components++]; - if (!IS_ERR(ecomp) && addl_desc_ptr) - ses_process_descriptor(ecomp, - addl_desc_ptr); + if (!IS_ERR(ecomp)) { + if (addl_desc_ptr) + ses_process_descriptor( + ecomp, + addl_desc_ptr); + if (create) + enclosure_component_register( + ecomp); + } } if (desc_ptr) desc_ptr += len; diff --git a/include/linux/enclosure.h b/include/linux/enclosure.h index 
9a33c5f7e126..a835d335b924 100644 --- a/include/linux/enclosure.h +++ b/include/linux/enclosure.h @@ -120,8 +120,9 @@ enclosure_register(struct device *, const char *, int, struct enclosure_component_callbacks *); void enclosure_unregister(struct enclosure_device *); struct enclosure_component * -enclosure_component_register(struct enclosure_device *, unsigned int, - enum enclosure_component_type, const char *); +enclosure_component_alloc(struct enclosure_device *, unsigned int, + enum enclosure_component_type, const char *); +int enclosure_component_register(struct enclosure_component *); int enclosure_add_device(struct enclosure_device *enclosure, int component, struct device *dev); int enclosure_remove_device(struct enclosure_device *, struct device *); -- cgit v1.2.3 From 967f7bab0eaaa74d7d01a56d45aa309f78fb87dd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 30 Dec 2014 14:46:16 -0800 Subject: ses: add enclosure logical id Export the NAA logical id for the enclosure. This is optionally available from the sas_transport_class, but it is really a property of the enclosure. Signed-off-by: Dan Williams Signed-off-by: Song Liu Reviewed-by: Jens Axboe Cc: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/misc/enclosure.c | 13 +++++++++++++ drivers/scsi/ses.c | 9 +++++++++ include/linux/enclosure.h | 1 + 3 files changed, 23 insertions(+) (limited to 'include/linux') diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c index e18278aa6bbb..958ee988a1e1 100644 --- a/drivers/misc/enclosure.c +++ b/drivers/misc/enclosure.c @@ -432,8 +432,21 @@ static ssize_t components_show(struct device *cdev, } static DEVICE_ATTR_RO(components); +static ssize_t id_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + struct enclosure_device *edev = to_enclosure_device(cdev); + + if (edev->cb->show_id) + return edev->cb->show_id(edev, buf); + return -EINVAL; +} +static DEVICE_ATTR_RO(id); + static struct attribute *enclosure_class_attrs[] = { &dev_attr_components.attr, + &dev_attr_id.attr, NULL, }; ATTRIBUTE_GROUPS(enclosure_class); diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index 6662b0ca6cfd..1041556cdbf3 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -258,6 +258,14 @@ static int ses_set_active(struct enclosure_device *edev, return ses_set_page2_descriptor(edev, ecomp, desc); } +static int ses_show_id(struct enclosure_device *edev, char *buf) +{ + struct ses_device *ses_dev = edev->scratch; + unsigned long long id = get_unaligned_be64(ses_dev->page1+8+4); + + return sprintf(buf, "%#llx\n", id); +} + static struct enclosure_component_callbacks ses_enclosure_callbacks = { .get_fault = ses_get_fault, .set_fault = ses_set_fault, @@ -265,6 +273,7 @@ static struct enclosure_component_callbacks ses_enclosure_callbacks = { .get_locate = ses_get_locate, .set_locate = ses_set_locate, .set_active = ses_set_active, + .show_id = ses_show_id, }; struct ses_host_edev { diff --git a/include/linux/enclosure.h b/include/linux/enclosure.h index a835d335b924..807622b252a4 100644 --- a/include/linux/enclosure.h +++ b/include/linux/enclosure.h @@ -79,6 +79,7 @@ struct enclosure_component_callbacks { int (*set_locate)(struct enclosure_device *, struct enclosure_component *, enum enclosure_component_setting); + int (*show_id)(struct enclosure_device *, char *buf); }; -- cgit v1.2.3 From 921ce7f5786052749a22a75780f5ce1a456bcdc6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 30 Dec 2014 14:46:17 -0800 Subject: ses: add reliable slot attribute The name 
provided by firmware is in a vendor-specific format, so publish the slot number to provide a reliable mechanism for identifying slots across firmware implementations. If the enclosure does not provide a slot number, fall back to the component number, which is guaranteed unique and usually mirrors the slot number. The unused ses_component.desc is cleaned up in the process. Signed-off-by: Dan Williams Signed-off-by: Song Liu Reviewed-by: Jens Axboe Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/misc/enclosure.c | 20 +++++++++++++++++++- drivers/scsi/ses.c | 17 ++++++++++++----- include/linux/enclosure.h | 1 + 3 files changed, 32 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c index 958ee988a1e1..b62314d627ae 100644 --- a/drivers/misc/enclosure.c +++ b/drivers/misc/enclosure.c @@ -145,8 +145,10 @@ enclosure_register(struct device *dev, const char *name, int components, if (err) goto err; - for (i = 0; i < components; i++) + for (i = 0; i < components; i++) { edev->component[i].number = -1; + edev->component[i].slot = -1; + } mutex_lock(&container_list_lock); list_add_tail(&edev->node, &container_list); @@ -589,6 +591,20 @@ static ssize_t get_component_type(struct device *cdev, return snprintf(buf, 40, "%s\n", enclosure_type[ecomp->type]); } +static ssize_t get_component_slot(struct device *cdev, + struct device_attribute *attr, char *buf) +{ + struct enclosure_component *ecomp = to_enclosure_component(cdev); + int slot; + + /* if the enclosure does not override then use 'number' as a stand-in */ + if (ecomp->slot >= 0) + slot = ecomp->slot; + else + slot = ecomp->number; + + return snprintf(buf, 40, "%d\n", slot); +} static DEVICE_ATTR(fault, S_IRUGO | S_IWUSR, get_component_fault, set_component_fault); @@ -599,6 +615,7 @@ static DEVICE_ATTR(active, S_IRUGO | S_IWUSR, get_component_active, set_component_active); static DEVICE_ATTR(locate, S_IRUGO | S_IWUSR, get_component_locate, set_component_locate); static DEVICE_ATTR(type, S_IRUGO, get_component_type, NULL); +static DEVICE_ATTR(slot, S_IRUGO, get_component_slot, NULL); static struct attribute *enclosure_component_attrs[] = { &dev_attr_fault.attr, @@ -606,6 +623,7 @@ static struct attribute *enclosure_component_attrs[] = { &dev_attr_status.attr, &dev_attr_active.attr, &dev_attr_locate.attr, &dev_attr_type.attr, + &dev_attr_slot.attr, NULL }; ATTRIBUTE_GROUPS(enclosure_component); diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index 1041556cdbf3..433de8e6f538 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -47,7 +47,6 @@ struct ses_device { struct ses_component { u64 addr; - unsigned char *desc; }; static int ses_probe(struct device *dev) @@ -307,19 +306,26 @@ static void ses_process_descriptor(struct enclosure_component *ecomp, int invalid = desc[0] & 0x80; enum scsi_protocol proto = desc[0] & 0x0f; u64 addr = 0; + int slot = -1; struct ses_component *scomp = ecomp->scratch; unsigned char *d; - scomp->desc = desc; - if (invalid) return; switch (proto) { + case SCSI_PROTOCOL_FCP: + if (eip) { + d = desc + 4; + slot = d[3]; + } + break; case SCSI_PROTOCOL_SAS: - if (eip) + if (eip) { + d = desc + 4; + slot = d[3]; d = desc + 8; - else + } else d = desc + 4; /* only take the phy0 addr */ addr = (u64)d[12] << 56 | @@ -335,6 +341,7 @@ static void ses_process_descriptor(struct enclosure_component *ecomp, /* FIXME: Need to add more protocols than just SAS */ break; } + ecomp->slot = slot; scomp->addr = addr; } diff --git a/include/linux/enclosure.h b/include/linux/enclosure.h index 
807622b252a4..0f826c14a337 100644 --- a/include/linux/enclosure.h +++ b/include/linux/enclosure.h @@ -92,6 +92,7 @@ struct enclosure_component { int fault; int active; int locate; + int slot; enum enclosure_status status; }; -- cgit v1.2.3 From 08024885a2a3ed432716e9d50046a620a5b2df05 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 30 Dec 2014 14:46:18 -0800 Subject: ses: Add power_status to SES device slot Add power_status to SES device slot, so we can power on/off the HDDs behind the enclosure. Check firmware status in ses_set_* before sending control pages to firmware. Signed-off-by: Song Liu Acked-by: Dan Williams Reviewed-by: Jens Axboe Cc: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/misc/enclosure.c | 38 ++++++++++++++++++ drivers/scsi/ses.c | 98 ++++++++++++++++++++++++++++++++++++++++++----- include/linux/enclosure.h | 6 +++ 3 files changed, 133 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/enclosure.c b/drivers/misc/enclosure.c index b62314d627ae..38552a31304a 100644 --- a/drivers/misc/enclosure.c +++ b/drivers/misc/enclosure.c @@ -148,6 +148,7 @@ enclosure_register(struct device *dev, const char *name, int components, for (i = 0; i < components; i++) { edev->component[i].number = -1; edev->component[i].slot = -1; + edev->component[i].power_status = 1; } mutex_lock(&container_list_lock); @@ -583,6 +584,40 @@ static ssize_t set_component_locate(struct device *cdev, return count; } +static ssize_t get_component_power_status(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + struct enclosure_device *edev = to_enclosure_device(cdev->parent); + struct enclosure_component *ecomp = to_enclosure_component(cdev); + + if (edev->cb->get_power_status) + edev->cb->get_power_status(edev, ecomp); + return snprintf(buf, 40, "%s\n", ecomp->power_status ? 
"on" : "off"); +} + +static ssize_t set_component_power_status(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct enclosure_device *edev = to_enclosure_device(cdev->parent); + struct enclosure_component *ecomp = to_enclosure_component(cdev); + int val; + + if (strncmp(buf, "on", 2) == 0 && + (buf[2] == '\n' || buf[2] == '\0')) + val = 1; + else if (strncmp(buf, "off", 3) == 0 && + (buf[3] == '\n' || buf[3] == '\0')) + val = 0; + else + return -EINVAL; + + if (edev->cb->set_power_status) + edev->cb->set_power_status(edev, ecomp, val); + return count; +} + static ssize_t get_component_type(struct device *cdev, struct device_attribute *attr, char *buf) { @@ -614,6 +649,8 @@ static DEVICE_ATTR(active, S_IRUGO | S_IWUSR, get_component_active, set_component_active); static DEVICE_ATTR(locate, S_IRUGO | S_IWUSR, get_component_locate, set_component_locate); +static DEVICE_ATTR(power_status, S_IRUGO | S_IWUSR, get_component_power_status, + set_component_power_status); static DEVICE_ATTR(type, S_IRUGO, get_component_type, NULL); static DEVICE_ATTR(slot, S_IRUGO, get_component_slot, NULL); @@ -622,6 +659,7 @@ static struct attribute *enclosure_component_attrs[] = { &dev_attr_status.attr, &dev_attr_active.attr, &dev_attr_locate.attr, + &dev_attr_power_status.attr, &dev_attr_type.attr, &dev_attr_slot.attr, NULL diff --git a/drivers/scsi/ses.c b/drivers/scsi/ses.c index 433de8e6f538..dcb0d76d7312 100644 --- a/drivers/scsi/ses.c +++ b/drivers/scsi/ses.c @@ -67,6 +67,20 @@ static int ses_probe(struct device *dev) #define SES_TIMEOUT (30 * HZ) #define SES_RETRIES 3 +static void init_device_slot_control(unsigned char *dest_desc, + struct enclosure_component *ecomp, + unsigned char *status) +{ + memcpy(dest_desc, status, 4); + dest_desc[0] = 0; + /* only clear byte 1 for ENCLOSURE_COMPONENT_DEVICE */ + if (ecomp->type == ENCLOSURE_COMPONENT_DEVICE) + dest_desc[1] = 0; + dest_desc[2] &= 0xde; + dest_desc[3] &= 0x3c; +} + + static int ses_recv_diag(struct scsi_device *sdev, int page_code, void *buf, int bufflen) { @@ -178,14 +192,22 @@ static int ses_set_fault(struct enclosure_device *edev, struct enclosure_component *ecomp, enum enclosure_component_setting val) { - unsigned char desc[4] = {0 }; + unsigned char desc[4]; + unsigned char *desc_ptr; + + desc_ptr = ses_get_page2_descriptor(edev, ecomp); + + if (!desc_ptr) + return -EIO; + + init_device_slot_control(desc, ecomp, desc_ptr); switch (val) { case ENCLOSURE_SETTING_DISABLED: - /* zero is disabled */ + desc[3] &= 0xdf; break; case ENCLOSURE_SETTING_ENABLED: - desc[3] = 0x20; + desc[3] |= 0x20; break; default: /* SES doesn't do the SGPIO blink settings */ @@ -219,14 +241,22 @@ static int ses_set_locate(struct enclosure_device *edev, struct enclosure_component *ecomp, enum enclosure_component_setting val) { - unsigned char desc[4] = {0 }; + unsigned char desc[4]; + unsigned char *desc_ptr; + + desc_ptr = ses_get_page2_descriptor(edev, ecomp); + + if (!desc_ptr) + return -EIO; + + init_device_slot_control(desc, ecomp, desc_ptr); switch (val) { case ENCLOSURE_SETTING_DISABLED: - /* zero is disabled */ + desc[2] &= 0xfd; break; case ENCLOSURE_SETTING_ENABLED: - desc[2] = 0x02; + desc[2] |= 0x02; break; default: /* SES doesn't do the SGPIO blink settings */ @@ -239,15 +269,23 @@ static int ses_set_active(struct enclosure_device *edev, struct enclosure_component *ecomp, enum enclosure_component_setting val) { - unsigned char desc[4] = {0 }; + unsigned char desc[4]; + unsigned char *desc_ptr; + + desc_ptr = 
ses_get_page2_descriptor(edev, ecomp); + + if (!desc_ptr) + return -EIO; + + init_device_slot_control(desc, ecomp, desc_ptr); switch (val) { case ENCLOSURE_SETTING_DISABLED: - /* zero is disabled */ + desc[2] &= 0x7f; ecomp->active = 0; break; case ENCLOSURE_SETTING_ENABLED: - desc[2] = 0x80; + desc[2] |= 0x80; ecomp->active = 1; break; default: @@ -265,12 +303,53 @@ static int ses_show_id(struct enclosure_device *edev, char *buf) return sprintf(buf, "%#llx\n", id); } +static void ses_get_power_status(struct enclosure_device *edev, + struct enclosure_component *ecomp) +{ + unsigned char *desc; + + desc = ses_get_page2_descriptor(edev, ecomp); + if (desc) + ecomp->power_status = (desc[3] & 0x10) ? 0 : 1; +} + +static int ses_set_power_status(struct enclosure_device *edev, + struct enclosure_component *ecomp, + int val) +{ + unsigned char desc[4]; + unsigned char *desc_ptr; + + desc_ptr = ses_get_page2_descriptor(edev, ecomp); + + if (!desc_ptr) + return -EIO; + + init_device_slot_control(desc, ecomp, desc_ptr); + + switch (val) { + /* power = 1 is device_off = 0 and vice versa */ + case 0: + desc[3] |= 0x10; + break; + case 1: + desc[3] &= 0xef; + break; + default: + return -EINVAL; + } + ecomp->power_status = val; + return ses_set_page2_descriptor(edev, ecomp, desc); +} + static struct enclosure_component_callbacks ses_enclosure_callbacks = { .get_fault = ses_get_fault, .set_fault = ses_set_fault, .get_status = ses_get_status, .get_locate = ses_get_locate, .set_locate = ses_set_locate, + .get_power_status = ses_get_power_status, + .set_power_status = ses_set_power_status, .set_active = ses_set_active, .show_id = ses_show_id, }; @@ -449,6 +528,7 @@ static void ses_enclosure_data_process(struct enclosure_device *edev, ecomp = &edev->component[components++]; if (!IS_ERR(ecomp)) { + ses_get_power_status(edev, ecomp); if (addl_desc_ptr) ses_process_descriptor( ecomp, diff --git a/include/linux/enclosure.h b/include/linux/enclosure.h index 0f826c14a337..7be22da321f3 100644 --- a/include/linux/enclosure.h +++ b/include/linux/enclosure.h @@ -79,6 +79,11 @@ struct enclosure_component_callbacks { int (*set_locate)(struct enclosure_device *, struct enclosure_component *, enum enclosure_component_setting); + void (*get_power_status)(struct enclosure_device *, + struct enclosure_component *); + int (*set_power_status)(struct enclosure_device *, + struct enclosure_component *, + int); int (*show_id)(struct enclosure_device *, char *buf); }; @@ -94,6 +99,7 @@ struct enclosure_component { int locate; int slot; enum enclosure_status status; + int power_status; }; struct enclosure_device { -- cgit v1.2.3 From 9733e4f0a973a354034f5dd603b4142a3095c85f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 12:49:13 -0800 Subject: rcu: Make _batches_completed() functions return unsigned long Long ago, the various ->completed fields were of type long, but now are unsigned long due to signed-integer-overflow concerns. However, the various _batches_completed() functions remained of type long, even though their only purpose in life is to return the corresponding ->completed field. This patch cleans this up by changing these functions' return types to unsigned long. Signed-off-by: Paul E. 
McKenney --- include/linux/rcutiny.h | 4 ++-- include/linux/rcutree.h | 6 +++--- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0e5366200154..91f7e4c37800 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -94,7 +94,7 @@ static inline void rcu_virt_note_context_switch(int cpu) /* * Return the number of grace periods. */ -static inline long rcu_batches_completed(void) +static inline unsigned long rcu_batches_completed(void) { return 0; } @@ -102,7 +102,7 @@ static inline long rcu_batches_completed(void) /* * Return the number of bottom-half grace periods. */ -static inline long rcu_batches_completed_bh(void) +static inline unsigned long rcu_batches_completed_bh(void) { return 0; } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 52953790dcca..9885bfb6b123 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -81,9 +81,9 @@ void cond_synchronize_rcu(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; -long rcu_batches_completed(void); -long rcu_batches_completed_bh(void); -long rcu_batches_completed_sched(void); +unsigned long rcu_batches_completed(void); +unsigned long rcu_batches_completed_bh(void); +unsigned long rcu_batches_completed_sched(void); void show_rcu_gp_kthreads(void); void rcu_force_quiescent_state(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4c106fcc0d54..e26d78712e16 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -317,7 +317,7 @@ static int rcu_pending(void); /* * Return the number of RCU-sched batches processed thus far for debug & stats. */ -long rcu_batches_completed_sched(void) +unsigned long rcu_batches_completed_sched(void) { return rcu_sched_state.completed; } @@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* * Return the number of RCU BH batches processed thus far for debug & stats. */ -long rcu_batches_completed_bh(void) +unsigned long rcu_batches_completed_bh(void) { return rcu_bh_state.completed; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8e7b1843896e..1a07d7379ac6 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -546,7 +546,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); -long rcu_batches_completed(void); +unsigned long rcu_batches_completed(void); static void rcu_preempt_note_context_switch(void); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3ec85cb5d544..f69300d4a51f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -117,7 +117,7 @@ static void __init rcu_bootup_announce(void) * Return the number of RCU-preempt batches processed thus far * for debug and statistics. */ -static long rcu_batches_completed_preempt(void) +static unsigned long rcu_batches_completed_preempt(void) { return rcu_preempt_state.completed; } @@ -126,7 +126,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); /* * Return the number of RCU batches processed thus far for debug & stats. 
*/ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_preempt(); } @@ -935,7 +935,7 @@ static void __init rcu_bootup_announce(void) /* * Return the number of RCU batches processed thus far for debug & stats. */ -long rcu_batches_completed(void) +unsigned long rcu_batches_completed(void) { return rcu_batches_completed_sched(); } -- cgit v1.2.3 From c1fe9cde4ae904fffb5b4d975d0a37e99136ff50 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 15:45:27 -0800 Subject: rcu: Provide rcu_batches_completed_sched() for TINY_RCU A bug in rcutorture has caused it to ignore completed batches. In preparation for fixing that bug, this commit provides TINY_RCU with the required rcu_batches_completed_sched(). Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 91f7e4c37800..1ce2d6b8f0c3 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -107,6 +107,14 @@ static inline unsigned long rcu_batches_completed_bh(void) return 0; } +/* + * Return the number of sched grace periods. + */ +static inline unsigned long rcu_batches_completed_sched(void) +{ + return 0; +} + static inline void rcu_force_quiescent_state(void) { } -- cgit v1.2.3 From 917963d0b30f9c4153c372c165178501d97b6b55 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Nov 2014 17:10:16 -0800 Subject: rcutorture: Check from beginning to end of grace period Currently, rcutorture's Reader Batch checks measure from the end of the previous grace period to the end of the current one. This commit tightens up these checks by measuring from the start and end of the same grace period. This involves adding rcu_batches_started() and friends corresponding to the existing rcu_batches_completed() and friends. We leave SRCU alone for the moment, as it does not yet have a way of tracking both ends of its grace periods. Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 30 +++++++++++++++++++++++++++--- include/linux/rcutree.h | 3 +++ kernel/rcu/rcutorture.c | 37 +++++++++++++++++++++++++++---------- kernel/rcu/tree.c | 40 ++++++++++++++++++++++++++++++++++++++-- kernel/rcu/tree_plugin.h | 28 ---------------------------- 5 files changed, 95 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 1ce2d6b8f0c3..984192160e9b 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -92,7 +92,31 @@ static inline void rcu_virt_note_context_switch(int cpu) } /* - * Return the number of grace periods. + * Return the number of grace periods started. + */ +static inline unsigned long rcu_batches_started(void) +{ + return 0; +} + +/* + * Return the number of bottom-half grace periods started. + */ +static inline unsigned long rcu_batches_started_bh(void) +{ + return 0; +} + +/* + * Return the number of sched grace periods started. + */ +static inline unsigned long rcu_batches_started_sched(void) +{ + return 0; +} + +/* + * Return the number of grace periods completed. */ static inline unsigned long rcu_batches_completed(void) { @@ -100,7 +124,7 @@ static inline unsigned long rcu_batches_completed(void) } /* - * Return the number of bottom-half grace periods. + * Return the number of bottom-half grace periods completed. 
*/ static inline unsigned long rcu_batches_completed_bh(void) { @@ -108,7 +132,7 @@ static inline unsigned long rcu_batches_completed_bh(void) } /* - * Return the number of sched grace periods. + * Return the number of sched grace periods completed. */ static inline unsigned long rcu_batches_completed_sched(void) { diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 9885bfb6b123..c0dd124e69ec 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -81,6 +81,9 @@ void cond_synchronize_rcu(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; +unsigned long rcu_batches_started(void); +unsigned long rcu_batches_started_bh(void); +unsigned long rcu_batches_started_sched(void); unsigned long rcu_batches_completed(void); unsigned long rcu_batches_completed_bh(void); unsigned long rcu_batches_completed_sched(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index aadbc072ccf4..24142c200901 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -244,6 +244,7 @@ struct rcu_torture_ops { int (*readlock)(void); void (*read_delay)(struct torture_random_state *rrsp); void (*readunlock)(int idx); + unsigned long (*started)(void); unsigned long (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); @@ -372,6 +373,7 @@ static struct rcu_torture_ops rcu_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, + .started = rcu_batches_started, .completed = rcu_batches_completed, .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, @@ -413,6 +415,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, + .started = rcu_batches_started_bh, .completed = rcu_batches_completed_bh, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, @@ -456,6 +459,7 @@ static struct rcu_torture_ops rcu_busted_ops = { .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_busted_torture_deferred_free, .sync = synchronize_rcu_busted, @@ -554,6 +558,7 @@ static struct rcu_torture_ops srcu_ops = { .readlock = srcu_torture_read_lock, .read_delay = srcu_read_delay, .readunlock = srcu_torture_read_unlock, + .started = NULL, .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, @@ -590,6 +595,7 @@ static struct rcu_torture_ops sched_ops = { .readlock = sched_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = sched_torture_read_unlock, + .started = rcu_batches_started_sched, .completed = rcu_batches_completed_sched, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, @@ -628,6 +634,7 @@ static struct rcu_torture_ops tasks_ops = { .readlock = tasks_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. 
*/ .readunlock = tasks_torture_read_unlock, + .started = rcu_no_completed, .completed = rcu_no_completed, .deferred_free = rcu_tasks_torture_deferred_free, .sync = synchronize_rcu_tasks, @@ -1005,8 +1012,8 @@ static void rcutorture_trace_dump(void) static void rcu_torture_timer(unsigned long unused) { int idx; + unsigned long started; unsigned long completed; - unsigned long completed_end; static DEFINE_TORTURE_RANDOM(rand); static DEFINE_SPINLOCK(rand_lock); struct rcu_torture *p; @@ -1014,7 +1021,10 @@ static void rcu_torture_timer(unsigned long unused) unsigned long long ts; idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1037,14 +1047,16 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - completed, completed_end); + started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; @@ -1063,8 +1075,8 @@ static void rcu_torture_timer(unsigned long unused) static int rcu_torture_reader(void *arg) { + unsigned long started; unsigned long completed; - unsigned long completed_end; int idx; DEFINE_TORTURE_RANDOM(rand); struct rcu_torture *p; @@ -1083,7 +1095,10 @@ rcu_torture_reader(void *arg) mod_timer(&t, jiffies + 1); } idx = cur_ops->readlock(); - completed = cur_ops->completed(); + if (cur_ops->started) + started = cur_ops->started(); + else + started = cur_ops->completed(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || @@ -1104,14 +1119,16 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - completed_end = cur_ops->completed(); + completed = cur_ops->completed(); if (pipe_count > 1) { do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, completed, completed_end); + ts, started, completed); rcutorture_trace_dump(); } __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; + completed = completed - started; + if (cur_ops->started) + completed++; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e26d78712e16..c0faad51ae87 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -315,7 +315,43 @@ static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(void); /* - * Return the number of RCU-sched batches processed thus far for debug & stats. + * Return the number of RCU batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started(void) +{ + return rcu_state_p->gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started); + +/* + * Return the number of RCU-sched batches started thus far for debug & stats. 
+ */ +unsigned long rcu_batches_started_sched(void) +{ + return rcu_sched_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_sched); + +/* + * Return the number of RCU BH batches started thus far for debug & stats. + */ +unsigned long rcu_batches_started_bh(void) +{ + return rcu_bh_state.gpnum; +} +EXPORT_SYMBOL_GPL(rcu_batches_started_bh); + +/* + * Return the number of RCU batches completed thus far for debug & stats. + */ +unsigned long rcu_batches_completed(void) +{ + return rcu_state_p->completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Return the number of RCU-sched batches completed thus far for debug & stats. */ unsigned long rcu_batches_completed_sched(void) { @@ -324,7 +360,7 @@ unsigned long rcu_batches_completed_sched(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); /* - * Return the number of RCU BH batches processed thus far for debug & stats. + * Return the number of RCU BH batches completed thus far for debug & stats. */ unsigned long rcu_batches_completed_bh(void) { diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f69300d4a51f..07e61a04de1d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -113,25 +113,6 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } -/* - * Return the number of RCU-preempt batches processed thus far - * for debug and statistics. - */ -static unsigned long rcu_batches_completed_preempt(void) -{ - return rcu_preempt_state.completed; -} -EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); - -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_batches_completed_preempt(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - /* * Record a preemptible-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is @@ -932,15 +913,6 @@ static void __init rcu_bootup_announce(void) rcu_bootup_announce_oddness(); } -/* - * Return the number of RCU batches processed thus far for debug & stats. - */ -unsigned long rcu_batches_completed(void) -{ - return rcu_batches_completed_sched(); -} -EXPORT_SYMBOL_GPL(rcu_batches_completed); - /* * Because preemptible RCU does not exist, we never have to check for * CPUs being in quiescent states. -- cgit v1.2.3 From 31c89c959667194350f496947b576c149503ce98 Mon Sep 17 00:00:00 2001 From: Soren Brinkmann Date: Fri, 9 Jan 2015 07:43:45 -0800 Subject: pinctrl: pinconf-generic: Infer map type from DT property MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the new 'groups' property, the DT parser can infer the map type from whether 'pins' or 'groups' is used to specify the pin group to work on. To maintain backwards compatibility with current usage of the DT binding, this is only done when PIN_MAP_TYPE_INVALID is passed to the parsing function as the type. Also, a new helper 'pinconf_generic_dt_node_to_map_all()' is introduced, which can be used by drivers as a generic callback for dt_node_to_map() to leverage the new feature. 
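[Editor's note: a minimal usage sketch, not part of the patch. The foo_* callbacks are hypothetical placeholders; pinconf_generic_dt_node_to_map_all() is the helper introduced here, and pinctrl_utils_dt_free_map() is assumed as the usual companion free callback of this era.]

static const struct pinctrl_ops foo_pctrl_ops = {
	.get_groups_count = foo_get_groups_count,
	.get_group_name = foo_get_group_name,
	.get_group_pins = foo_get_group_pins,
	/* map type is inferred from whether the subnode uses "pins" or "groups" */
	.dt_node_to_map = pinconf_generic_dt_node_to_map_all,
	.dt_free_map = pinctrl_utils_dt_free_map,
};

With this wiring, a subnode using pins = "..." produces PIN_MAP_TYPE_CONFIGS_PIN maps while one using groups = "..." produces PIN_MAP_TYPE_CONFIGS_GROUP maps, with no driver-side dispatch needed.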
Changes since v2: - rename dt_pin_specifier to subnode_target_type - add additional comment in header file explaining passing an invalid map type - mention map_all() helper in commit message Changes since RFC v2: - none Signed-off-by: Soren Brinkmann Tested-by: Andreas Färber Signed-off-by: Linus Walleij --- drivers/pinctrl/pinconf-generic.c | 17 ++++++++++++++--- include/linux/pinctrl/pinconf-generic.h | 11 +++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index f78b416d7984..21b3d90ebb2d 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -264,6 +264,7 @@ int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev, unsigned reserve; struct property *prop; const char *group; + const char *subnode_target_type = "pins"; ret = of_property_read_string(np, "function", &function); if (ret < 0) { @@ -284,10 +285,20 @@ int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev, reserve++; if (num_configs) reserve++; + ret = of_property_count_strings(np, "pins"); if (ret < 0) { - dev_err(dev, "could not parse property pins\n"); - goto exit; + ret = of_property_count_strings(np, "groups"); + if (ret < 0) { + dev_err(dev, "could not parse property pins/groups\n"); + goto exit; + } + if (type == PIN_MAP_TYPE_INVALID) + type = PIN_MAP_TYPE_CONFIGS_GROUP; + subnode_target_type = "groups"; + } else { + if (type == PIN_MAP_TYPE_INVALID) + type = PIN_MAP_TYPE_CONFIGS_PIN; } reserve *= ret; @@ -296,7 +307,7 @@ int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev, if (ret < 0) goto exit; - of_property_for_each_string(np, "pins", prop, group) { + of_property_for_each_string(np, subnode_target_type, prop, group) { if (function) { ret = pinctrl_utils_add_map_mux(pctldev, map, reserved_maps, num_maps, group, diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index d578a60eff23..83c89f5ab705 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -174,6 +174,17 @@ static inline int pinconf_generic_dt_node_to_map_pin( PIN_MAP_TYPE_CONFIGS_PIN); } +static inline int pinconf_generic_dt_node_to_map_all( + struct pinctrl_dev *pctldev, struct device_node *np_config, + struct pinctrl_map **map, unsigned *num_maps) +{ + /* + * passing the type as PIN_MAP_TYPE_INVALID causes the underlying parser + * to infer the map type from the DT properties used. + */ + return pinconf_generic_dt_node_to_map(pctldev, np_config, map, num_maps, + PIN_MAP_TYPE_INVALID); +} #endif #endif /* CONFIG_GENERIC_PINCONF */ -- cgit v1.2.3 From dd4d01f7bad886c22687224bc7070b87de8deb51 Mon Sep 17 00:00:00 2001 From: Soren Brinkmann Date: Fri, 9 Jan 2015 07:43:46 -0800 Subject: pinctrl: pinconf-generic: Allow driver to specify DT params MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Additionally to the generic DT parameters, allow drivers to provide driver-specific DT parameters to be used with the generic parser infrastructure. To achieve this 'struct pinctrl_desc' is extended to pass custom pinconf option to the core. In order to pass this kind of information, the related data structures - 'struct pinconf_generic_dt_params', 'pin_config_item' - are moved from pinconf internals to the pinconf-generic header. 
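[Editor's note: an illustrative sketch of the driver-facing side, not taken from the patch. The foo_* names are hypothetical; num_dt_params and conf_items are the 'struct pinctrl_desc' members dereferenced by the dump code below, while the member carrying the parameter table itself is assumed, since the pinctrl.h hunk is not visible in this excerpt.]

/* hypothetical driver-private parameter, placed above the generic range */
#define FOO_PIN_CONFIG_DRIVE_MODE (PIN_CONFIG_OUTPUT + 1)

static const struct pinconf_generic_dt_params foo_dt_params[] = {
	/* { property, param, default_value } */
	{ "foo,drive-mode", FOO_PIN_CONFIG_DRIVE_MODE, 0 },
};

static const struct pin_config_item foo_conf_items[] = {
	PCONFDUMP(FOO_PIN_CONFIG_DRIVE_MODE, "drive mode", "mode", true),
};

static struct pinctrl_desc foo_desc = {
	.name = "foo-pinctrl",
	.confops = &foo_pinconf_ops,
	.dt_params = foo_dt_params, /* assumed member name */
	.num_dt_params = ARRAY_SIZE(foo_dt_params),
	.conf_items = foo_conf_items,
	.owner = THIS_MODULE,
};

A "foo,drive-mode = <2>;" entry in a config subnode would then be picked up by the shared parser alongside the generic properties and pretty-printed in debugfs via the PCONFDUMP entry.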
Additionally, pinconf-generic is refactored to iterate not only over the generic pinconf parameters but also over those provided through the driver's 'struct pinctrl_desc'. In particular, the 'pinconf_generic_parse_dt_config()' and 'pinconf_generic_dump' helpers are each split into two parts, in order to have a more generic helper that can process the generic parameters as well as the driver-specific ones. v2: - fix typo - add missing documentation for @conf_items member in struct - rebase to pinctrl/devel: conflict in abx500 - rename _pinconf_generic_dump() to pinconf_generic_dump_one() - removed '_' from _parse_dt_cfg() - removed BUG_ONs, error condition is handled in if statements - removed pinconf_generic_dump_group() & pinconf_generic_dump_pin helpers - fixed up corresponding call sites - renamed pinconf_generic_dump() to pinconf_generic_dump_pins() - added kernel-doc to pinconf_generic_dump_pins() - add kernel-doc - more verbose commit message Signed-off-by: Soren Brinkmann Tested-by: Andreas Färber Signed-off-by: Linus Walleij --- drivers/pinctrl/nomadik/pinctrl-abx500.c | 2 +- drivers/pinctrl/pinconf-generic.c | 182 +++++++++++++++++-------------- drivers/pinctrl/pinconf.c | 4 +- drivers/pinctrl/pinconf.h | 22 ++-- drivers/pinctrl/pinctrl-rockchip.c | 2 +- drivers/pinctrl/pinctrl-tz1090-pdc.c | 2 +- drivers/pinctrl/pinctrl-tz1090.c | 2 +- drivers/pinctrl/sh-pfc/pinctrl.c | 2 +- include/linux/pinctrl/pinconf-generic.h | 18 +++ include/linux/pinctrl/pinctrl.h | 9 ++ 10 files changed, 141 insertions(+), 104 deletions(-) (limited to 'include/linux') diff --git a/drivers/pinctrl/nomadik/pinctrl-abx500.c b/drivers/pinctrl/nomadik/pinctrl-abx500.c index 3d6d97228523..1806b24faa14 100644 --- a/drivers/pinctrl/nomadik/pinctrl-abx500.c +++ b/drivers/pinctrl/nomadik/pinctrl-abx500.c @@ -914,7 +914,7 @@ static int abx500_dt_subnode_to_map(struct pinctrl_dev *pctldev, } } - ret = pinconf_generic_parse_dt_config(np, &configs, &nconfigs); + ret = pinconf_generic_parse_dt_config(np, pctldev, &configs, &nconfigs); if (nconfigs) { const char *gpio_name; const char *pin; diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index 21b3d90ebb2d..e0886665b70a 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -27,17 +27,6 @@ #include "pinctrl-utils.h" #ifdef CONFIG_DEBUG_FS - -struct pin_config_item { - const enum pin_config_param param; - const char * const display; - const char * const format; - bool has_arg; -}; - -#define PCONFDUMP(a, b, c, d) { .param = a, .display = b, .format = c, \ - .has_arg = d } - static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_BIAS_DISABLE, "input bias disabled", NULL, false), PCONFDUMP(PIN_CONFIG_BIAS_HIGH_IMPEDANCE, "input bias high impedance", NULL, false), @@ -60,22 +49,25 @@ static const struct pin_config_item conf_items[] = { PCONFDUMP(PIN_CONFIG_OUTPUT, "pin output", "level", true), }; -void pinconf_generic_dump_pin(struct pinctrl_dev *pctldev, - struct seq_file *s, unsigned pin) +static void pinconf_generic_dump_one(struct pinctrl_dev *pctldev, + struct seq_file *s, const char *gname, + unsigned pin, + const struct pin_config_item *items, + int nitems) { - const struct pinconf_ops *ops = pctldev->desc->confops; int i; - if (!ops->is_generic) - return; - - for (i = 0; i < ARRAY_SIZE(conf_items); i++) { + for (i = 0; i < nitems; i++) { unsigned long config; int ret; /* We want to check out this parameter */ - config = 
pinconf_to_config_packed(conf_items[i].param, 0); - ret = pin_config_get_for_pin(pctldev, pin, &config); + config = pinconf_to_config_packed(items[i].param, 0); + if (gname) + ret = pin_config_group_get(dev_name(pctldev->dev), + gname, &config); + else + ret = pin_config_get_for_pin(pctldev, pin, &config); /* These are legal errors */ if (ret == -EINVAL || ret == -ENOTSUPP) continue; @@ -85,56 +77,46 @@ void pinconf_generic_dump_pin(struct pinctrl_dev *pctldev, } /* Space between multiple configs */ seq_puts(s, " "); - seq_puts(s, conf_items[i].display); + seq_puts(s, items[i].display); /* Print unit if available */ - if (conf_items[i].has_arg) { + if (items[i].has_arg) { seq_printf(s, " (%u", pinconf_to_config_argument(config)); - if (conf_items[i].format) - seq_printf(s, " %s)", conf_items[i].format); + if (items[i].format) + seq_printf(s, " %s)", items[i].format); else seq_puts(s, ")"); } } } -void pinconf_generic_dump_group(struct pinctrl_dev *pctldev, - struct seq_file *s, const char *gname) +/** + * pinconf_generic_dump_pins - Print information about pin or group of pins + * @pctldev: Pincontrol device + * @s: File to print to + * @gname: Group name specifying pins + * @pin: Pin number specifying the pin + * + * Print the pinconf configuration for the requested pin(s) to @s. Pins can be + * specified either by pin using @pin or by group using @gname. Only one needs + * to be specified; the other can be NULL/0. + */ +void pinconf_generic_dump_pins(struct pinctrl_dev *pctldev, struct seq_file *s, + const char *gname, unsigned pin) { const struct pinconf_ops *ops = pctldev->desc->confops; - int i; if (!ops->is_generic) return; - for (i = 0; i < ARRAY_SIZE(conf_items); i++) { - unsigned long config; - int ret; - - /* We want to check out this parameter */ - config = pinconf_to_config_packed(conf_items[i].param, 0); - ret = pin_config_group_get(dev_name(pctldev->dev), gname, - &config); - /* These are legal errors */ - if (ret == -EINVAL || ret == -ENOTSUPP) - continue; - if (ret) { - seq_printf(s, "ERROR READING CONFIG SETTING %d ", i); - continue; - } - /* Space between multiple configs */ - seq_puts(s, " "); - seq_puts(s, conf_items[i].display); - /* Print unit if available */ - if (conf_items[i].has_arg) { - seq_printf(s, " (%u", - pinconf_to_config_argument(config)); - if (conf_items[i].format) - seq_printf(s, " %s)", conf_items[i].format); - else - seq_puts(s, ")"); - } - } + /* generic parameters */ + pinconf_generic_dump_one(pctldev, s, gname, pin, conf_items, + ARRAY_SIZE(conf_items)); + /* driver-specific parameters */ + if (pctldev->desc->num_dt_params && pctldev->desc->conf_items) + pinconf_generic_dump_one(pctldev, s, gname, pin, + pctldev->desc->conf_items, + pctldev->desc->num_dt_params); } void pinconf_generic_dump_config(struct pinctrl_dev *pctldev, @@ -148,17 +130,21 @@ void pinconf_generic_dump_config(struct pinctrl_dev *pctldev, seq_printf(s, "%s: 0x%x", conf_items[i].display, pinconf_to_config_argument(config)); } + + if (!pctldev->desc->num_dt_params || !pctldev->desc->conf_items) + return; + + for (i = 0; i < pctldev->desc->num_dt_params; i++) { + if (pinconf_to_config_param(config) != pctldev->desc->conf_items[i].param) + continue; + seq_printf(s, "%s: 0x%x", pctldev->desc->conf_items[i].display, + pinconf_to_config_argument(config)); + } } EXPORT_SYMBOL_GPL(pinconf_generic_dump_config); #endif #ifdef CONFIG_OF -struct pinconf_generic_dt_params { - const char * const property; - enum pin_config_param param; - u32 default_value; -}; - static const struct 
pinconf_generic_dt_params dt_params[] = { { "bias-disable", PIN_CONFIG_BIAS_DISABLE, 0 }, { "bias-high-impedance", PIN_CONFIG_BIAS_HIGH_IMPEDANCE, 0 }, @@ -183,6 +169,47 @@ static const struct pinconf_generic_dt_params dt_params[] = { { "slew-rate", PIN_CONFIG_SLEW_RATE, 0}, }; +/** + * parse_dt_cfg - Parse DT pinconf parameters + * @np: DT node + * @params: Array describing the DT parameters + * @count: Number of entries in @params + * @cfg: Array of parsed config options + * @ncfg: Number of entries in @cfg + * + * Parse the config options described in @params from @np and put the result + * in @cfg. @cfg does not need to be empty; entries are added beginning at + * @ncfg. @ncfg is updated to reflect the number of entries after parsing. @cfg + * needs to have enough memory allocated to hold all possible entries. + */ +static void parse_dt_cfg(struct device_node *np, + const struct pinconf_generic_dt_params *params, + unsigned int count, unsigned long *cfg, + unsigned int *ncfg) +{ + int i; + + for (i = 0; i < count; i++) { + u32 val; + int ret; + const struct pinconf_generic_dt_params *par = &params[i]; + + ret = of_property_read_u32(np, par->property, &val); + + /* property not found */ + if (ret == -EINVAL) + continue; + + /* use default value, when no value is specified */ + if (ret) + val = par->default_value; + + pr_debug("found %s with value %u\n", par->property, val); + cfg[*ncfg] = pinconf_to_config_packed(par->param, val); + (*ncfg)++; + } +} + /** * pinconf_generic_parse_dt_config() * parse the config properties into generic pinconfig values. * @np: node containing the pinconfig values * @configs: array with nconfigs entries containing the generic pinconf values * @nconfigs: number of configurations */ int pinconf_generic_parse_dt_config(struct device_node *np, + struct pinctrl_dev *pctldev, unsigned long **configs, unsigned int *nconfigs) { unsigned long *cfg; - unsigned int ncfg = 0; + unsigned int max_cfg, ncfg = 0; int ret; - int i; - u32 val; if (!np) return -EINVAL; /* allocate a temporary array big enough to hold one of each option */ - cfg = kzalloc(sizeof(*cfg) * ARRAY_SIZE(dt_params), GFP_KERNEL); + max_cfg = ARRAY_SIZE(dt_params); + if (pctldev) + max_cfg += pctldev->desc->num_dt_params; + cfg = kcalloc(max_cfg, sizeof(*cfg), GFP_KERNEL); if (!cfg) return -ENOMEM; - for (i = 0; i < ARRAY_SIZE(dt_params); i++) { - const struct pinconf_generic_dt_params *par = &dt_params[i]; - ret = of_property_read_u32(np, par->property, &val); - - /* property not found */ - if (ret == -EINVAL) - continue; - - /* use default value, when no value is specified */ - if (ret) - val = par->default_value; - - pr_debug("found %s with value %u\n", par->property, val); - cfg[ncfg] = pinconf_to_config_packed(par->param, val); - ncfg++; - } + parse_dt_cfg(np, dt_params, ARRAY_SIZE(dt_params), cfg, &ncfg); + if (pctldev && pctldev->desc->num_dt_params && pctldev->desc->params) + parse_dt_cfg(np, pctldev->desc->params, + pctldev->desc->num_dt_params, cfg, &ncfg); ret = 0; @@ -274,7 +291,8 @@ int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev, function = NULL; } - ret = pinconf_generic_parse_dt_config(np, &configs, &num_configs); + ret = pinconf_generic_parse_dt_config(np, pctldev, &configs, + &num_configs); if (ret < 0) { dev_err(dev, "could not parse node property\n"); return ret; diff --git a/drivers/pinctrl/pinconf.c b/drivers/pinctrl/pinconf.c index 8bfa0643e5dc..1fc09dc20199 100644 --- a/drivers/pinctrl/pinconf.c +++ b/drivers/pinctrl/pinconf.c @@ -288,7 +288,7 @@ static void pinconf_dump_pin(struct pinctrl_dev 
*pctldev, const struct pinconf_ops *ops = pctldev->desc->confops; /* no-op when not using generic pin config */ - pinconf_generic_dump_pin(pctldev, s, pin); + pinconf_generic_dump_pins(pctldev, s, NULL, pin); if (ops && ops->pin_config_dbg_show) ops->pin_config_dbg_show(pctldev, s, pin); } @@ -333,7 +333,7 @@ static void pinconf_dump_group(struct pinctrl_dev *pctldev, const struct pinconf_ops *ops = pctldev->desc->confops; /* no-op when not using generic pin config */ - pinconf_generic_dump_group(pctldev, s, gname); + pinconf_generic_dump_pins(pctldev, s, gname, 0); if (ops && ops->pin_config_group_dbg_show) ops->pin_config_group_dbg_show(pctldev, s, selector); } diff --git a/drivers/pinctrl/pinconf.h b/drivers/pinctrl/pinconf.h index a4a5417e1413..55c75780b3b2 100644 --- a/drivers/pinctrl/pinconf.h +++ b/drivers/pinctrl/pinconf.h @@ -92,26 +92,17 @@ static inline void pinconf_init_device_debugfs(struct dentry *devroot, #if defined(CONFIG_GENERIC_PINCONF) && defined(CONFIG_DEBUG_FS) -void pinconf_generic_dump_pin(struct pinctrl_dev *pctldev, - struct seq_file *s, unsigned pin); - -void pinconf_generic_dump_group(struct pinctrl_dev *pctldev, - struct seq_file *s, const char *gname); +void pinconf_generic_dump_pins(struct pinctrl_dev *pctldev, + struct seq_file *s, const char *gname, + unsigned pin); void pinconf_generic_dump_config(struct pinctrl_dev *pctldev, struct seq_file *s, unsigned long config); #else -static inline void pinconf_generic_dump_pin(struct pinctrl_dev *pctldev, - struct seq_file *s, - unsigned pin) -{ - return; -} - -static inline void pinconf_generic_dump_group(struct pinctrl_dev *pctldev, - struct seq_file *s, - const char *gname) +static inline void pinconf_generic_dump_pins(struct pinctrl_dev *pctldev, + struct seq_file *s, + const char *gname, unsigned pin) { return; } @@ -126,6 +117,7 @@ static inline void pinconf_generic_dump_config(struct pinctrl_dev *pctldev, #if defined(CONFIG_GENERIC_PINCONF) && defined(CONFIG_OF) int pinconf_generic_parse_dt_config(struct device_node *np, + struct pinctrl_dev *pctldev, unsigned long **configs, unsigned int *nconfigs); #endif diff --git a/drivers/pinctrl/pinctrl-rockchip.c b/drivers/pinctrl/pinctrl-rockchip.c index ba74f0aa60c7..7625f333ab07 100644 --- a/drivers/pinctrl/pinctrl-rockchip.c +++ b/drivers/pinctrl/pinctrl-rockchip.c @@ -1140,7 +1140,7 @@ static int rockchip_pinctrl_parse_groups(struct device_node *np, return -EINVAL; np_config = of_find_node_by_phandle(be32_to_cpup(phandle)); - ret = pinconf_generic_parse_dt_config(np_config, + ret = pinconf_generic_parse_dt_config(np_config, NULL, &grp->data[j].configs, &grp->data[j].nconfigs); if (ret) return ret; diff --git a/drivers/pinctrl/pinctrl-tz1090-pdc.c b/drivers/pinctrl/pinctrl-tz1090-pdc.c index 146e48a9b839..fab6aafa6a9f 100644 --- a/drivers/pinctrl/pinctrl-tz1090-pdc.c +++ b/drivers/pinctrl/pinctrl-tz1090-pdc.c @@ -415,7 +415,7 @@ static int tz1090_pdc_pinctrl_dt_subnode_to_map(struct device *dev, function = NULL; } - ret = pinconf_generic_parse_dt_config(np, &configs, &num_configs); + ret = pinconf_generic_parse_dt_config(np, NULL, &configs, &num_configs); if (ret) return ret; diff --git a/drivers/pinctrl/pinctrl-tz1090.c b/drivers/pinctrl/pinctrl-tz1090.c index df8cb1e5b7b4..8bd73075f9dd 100644 --- a/drivers/pinctrl/pinctrl-tz1090.c +++ b/drivers/pinctrl/pinctrl-tz1090.c @@ -1131,7 +1131,7 @@ static int tz1090_pinctrl_dt_subnode_to_map(struct device *dev, function = NULL; } - ret = pinconf_generic_parse_dt_config(np, &configs, &num_configs); + ret = 
pinconf_generic_parse_dt_config(np, NULL, &configs, &num_configs); if (ret) return ret; diff --git a/drivers/pinctrl/sh-pfc/pinctrl.c b/drivers/pinctrl/sh-pfc/pinctrl.c index 910deaefa0ac..072e7c62cab7 100644 --- a/drivers/pinctrl/sh-pfc/pinctrl.c +++ b/drivers/pinctrl/sh-pfc/pinctrl.c @@ -122,7 +122,7 @@ static int sh_pfc_dt_subnode_to_map(struct device *dev, struct device_node *np, return ret; } - ret = pinconf_generic_parse_dt_config(np, &configs, &num_configs); + ret = pinconf_generic_parse_dt_config(np, NULL, &configs, &num_configs); if (ret < 0) return ret; diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 83c89f5ab705..342409f7f3ec 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -115,6 +115,18 @@ enum pin_config_param { PIN_CONFIG_END = 0x7FFF, }; +#ifdef CONFIG_DEBUG_FS +#define PCONFDUMP(a, b, c, d) { .param = a, .display = b, .format = c, \ + .has_arg = d } + +struct pin_config_item { + const enum pin_config_param param; + const char * const display; + const char * const format; + bool has_arg; +}; +#endif /* CONFIG_DEBUG_FS */ + /* * Helpful configuration macro to be used in tables etc. */ @@ -150,6 +162,12 @@ static inline unsigned long pinconf_to_config_packed(enum pin_config_param param struct pinctrl_dev; struct pinctrl_map; +struct pinconf_generic_dt_params { + const char * const property; + enum pin_config_param param; + u32 default_value; +}; + int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev, struct device_node *np, struct pinctrl_map **map, unsigned *reserved_maps, unsigned *num_maps, diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index cc8e1aff0e28..c58b3e11ba8e 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -24,6 +24,7 @@ struct pinctrl_dev; struct pinctrl_map; struct pinmux_ops; struct pinconf_ops; +struct pin_config_item; struct gpio_chip; struct device_node; @@ -117,6 +118,9 @@ struct pinctrl_ops { * @confops: pin config operations vtable, if you support pin configuration in * your driver * @owner: module providing the pin controller, used for refcounting + * @num_dt_params: Number of driver-specific DT parameters + * @params: List of DT parameters + * @conf_items: Information on how to print @params in debugfs */ struct pinctrl_desc { const char *name; @@ -126,6 +130,11 @@ struct pinctrl_desc { const struct pinmux_ops *pmxops; const struct pinconf_ops *confops; struct module *owner; +#if defined(CONFIG_GENERIC_PINCONF) && defined(CONFIG_OF) + unsigned int num_dt_params; + const struct pinconf_generic_dt_params *params; + const struct pin_config_item *conf_items; +#endif }; /* External interface to pin controller */ -- cgit v1.2.3 From 7bb68410ef22067b08fd52887875b8f337f89dcc Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 18 Oct 2014 15:04:15 +0200 Subject: efi: split off remapping code from efi_config_init() Split off the remapping code from efi_config_init() so that the caller can perform its own remapping. This is necessary to correctly handle virtually remapped UEFI memory regions under kexec, as efi.systab will have been updated to a virtual address. 
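As an illustration of the intended use (a hypothetical sketch, not part of this patch; the foo_ name is invented, and passing NULL for the arch tables is assumed to be acceptable when there are none to match), a caller that has performed its own remapping of the table array can now parse it directly:

/*
 * Hypothetical sketch: parse config tables from a mapping the caller
 * already owns, e.g. under kexec where efi.systab holds virtual
 * addresses and an early_memremap() of it is no longer appropriate.
 */
static int __init foo_parse_config_tables(void *tables, int nr_tables)
{
	int sz = efi_enabled(EFI_64BIT) ? sizeof(efi_config_table_64_t)
					: sizeof(efi_config_table_32_t);

	return efi_config_parse_tables(tables, nr_tables, sz, NULL);
}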
Acked-by: Matt Fleming Tested-by: Leif Lindholm Signed-off-by: Ard Biesheuvel --- drivers/firmware/efi/efi.c | 56 ++++++++++++++++++++++++++-------------------- include/linux/efi.h | 2 ++ 2 files changed, 34 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 9035c1b74d58..b7ba9d8ec4b0 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -293,29 +293,15 @@ static __init int match_config_table(efi_guid_t *guid, return 0; } -int __init efi_config_init(efi_config_table_type_t *arch_tables) +int __init efi_config_parse_tables(void *config_tables, int count, int sz, + efi_config_table_type_t *arch_tables) { - void *config_tables, *tablep; - int i, sz; - - if (efi_enabled(EFI_64BIT)) - sz = sizeof(efi_config_table_64_t); - else - sz = sizeof(efi_config_table_32_t); - - /* - * Let's see what config tables the firmware passed to us. - */ - config_tables = early_memremap(efi.systab->tables, - efi.systab->nr_tables * sz); - if (config_tables == NULL) { - pr_err("Could not map Configuration table!\n"); - return -ENOMEM; - } + void *tablep; + int i; tablep = config_tables; pr_info(""); - for (i = 0; i < efi.systab->nr_tables; i++) { + for (i = 0; i < count; i++) { efi_guid_t guid; unsigned long table; @@ -328,8 +314,6 @@ int __init efi_config_init(efi_config_table_type_t *arch_tables) if (table64 >> 32) { pr_cont("\n"); pr_err("Table located above 4GB, disabling EFI.\n"); - early_memunmap(config_tables, - efi.systab->nr_tables * sz); return -EINVAL; } #endif @@ -344,13 +328,37 @@ int __init efi_config_init(efi_config_table_type_t *arch_tables) tablep += sz; } pr_cont("\n"); - early_memunmap(config_tables, efi.systab->nr_tables * sz); - set_bit(EFI_CONFIG_TABLES, &efi.flags); - return 0; } +int __init efi_config_init(efi_config_table_type_t *arch_tables) +{ + void *config_tables; + int sz, ret; + + if (efi_enabled(EFI_64BIT)) + sz = sizeof(efi_config_table_64_t); + else + sz = sizeof(efi_config_table_32_t); + + /* + * Let's see what config tables the firmware passed to us. + */ + config_tables = early_memremap(efi.systab->tables, + efi.systab->nr_tables * sz); + if (config_tables == NULL) { + pr_err("Could not map Configuration table!\n"); + return -ENOMEM; + } + + ret = efi_config_parse_tables(config_tables, efi.systab->nr_tables, sz, + arch_tables); + + early_memunmap(config_tables, efi.systab->nr_tables * sz); + return ret; +} + #ifdef CONFIG_EFI_VARS_MODULE static int __init efi_load_efivars(void) { diff --git a/include/linux/efi.h b/include/linux/efi.h index 0238d612750e..5ffe5115951f 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -875,6 +875,8 @@ static inline efi_status_t efi_query_variable_store(u32 attributes, unsigned lon #endif extern void __iomem *efi_lookup_mapped_addr(u64 phys_addr); extern int efi_config_init(efi_config_table_type_t *arch_tables); +extern int efi_config_parse_tables(void *config_tables, int count, int sz, + efi_config_table_type_t *arch_tables); extern u64 efi_get_iobase (void); extern u32 efi_mem_type (unsigned long phys_addr); extern u64 efi_mem_attributes (unsigned long phys_addr); -- cgit v1.2.3 From cbf6ab52add20b845f903decc973afbd5463c527 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 5 Jan 2015 19:29:32 +0800 Subject: kprobes: Pass the original kprobe for preparing optimized kprobe Pass the original kprobe for preparing an optimized kprobe arch-dep part, since for some architecture (e.g. 
ARM32) the arch-dependent code requires information from the original kprobe. Signed-off-by: Masami Hiramatsu Signed-off-by: Wang Nan Signed-off-by: Jon Medhurst --- arch/x86/kernel/kprobes/opt.c | 3 ++- include/linux/kprobes.h | 3 ++- kernel/kprobes.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 7c523bbf3dc8..0dd8d089c315 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -322,7 +322,8 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) * Target instructions MUST be relocatable (checked inside) * This is called when new aggr(opt)probe is allocated or reused. */ -int arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, + struct kprobe *__unused) { u8 *buf; int ret; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 5297f9fa0ef2..1ab54754a86d 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -308,7 +308,8 @@ struct optimized_kprobe { /* Architecture dependent functions for direct jump optimization */ extern int arch_prepared_optinsn(struct arch_optimized_insn *optinsn); extern int arch_check_optimized_kprobe(struct optimized_kprobe *op); -extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op); +extern int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, + struct kprobe *orig); extern void arch_remove_optimized_kprobe(struct optimized_kprobe *op); extern void arch_optimize_kprobes(struct list_head *oplist); extern void arch_unoptimize_kprobes(struct list_head *oplist, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 06f58309fed2..bad4e959f2f7 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -717,7 +717,7 @@ static void prepare_optimized_kprobe(struct kprobe *p) struct optimized_kprobe *op; op = container_of(p, struct optimized_kprobe, kp); - arch_prepare_optimized_kprobe(op); + arch_prepare_optimized_kprobe(op, p); } /* Allocate new optimized_kprobe and try to prepare optimized instructions */ @@ -731,7 +731,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) INIT_LIST_HEAD(&op->list); op->kp.addr = p->addr; - arch_prepare_optimized_kprobe(op); + arch_prepare_optimized_kprobe(op, p); return &op->kp; } -- cgit v1.2.3 From 7a868d1e9ab3c534c5ad44e3e5dc46753a1e5636 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 12 Jan 2015 14:52:22 +0800 Subject: rhashtable: introduce rhashtable_lookup_compare_insert routine Introduce a new function called rhashtable_lookup_compare_insert() which is very similar to rhashtable_lookup_insert(), but the former uses a caller-supplied compare function to look for an object, and then inserts the new object into the hash table only if no matching object is already present. As the entire search-and-insert sequence runs under the protection of the per-bucket locks, this helps users avoid taking an extra lock. Signed-off-by: Ying Xue Cc: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S.
Miller --- include/linux/rhashtable.h | 5 +++++ lib/rhashtable.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 326acd8c2e9f..7b9bd77ed684 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -168,7 +168,12 @@ int rhashtable_shrink(struct rhashtable *ht); void *rhashtable_lookup(struct rhashtable *ht, const void *key); void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, bool (*compare)(void *, void *), void *arg); + bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj); +bool rhashtable_lookup_compare_insert(struct rhashtable *ht, + struct rhash_head *obj, + bool (*compare)(void *, void *), + void *arg); void rhashtable_destroy(struct rhashtable *ht); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 8023b554905c..ed6ae1ad304c 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -726,6 +726,43 @@ EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); * to rhashtable_init(). */ bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj) +{ + struct rhashtable_compare_arg arg = { + .ht = ht, + .key = rht_obj(ht, obj) + ht->p.key_offset, + }; + + BUG_ON(!ht->p.key_len); + + return rhashtable_lookup_compare_insert(ht, obj, &rhashtable_compare, + &arg); +} +EXPORT_SYMBOL_GPL(rhashtable_lookup_insert); + +/** + * rhashtable_lookup_compare_insert - search and insert object to hash table + * with compare function + * @ht: hash table + * @obj: pointer to hash head inside object + * @compare: compare function, must return true on match + * @arg: argument passed on to compare function + * + * Locks down the bucket chain in both the old and new table if a resize + * is in progress to ensure that writers can't remove from the old table + * and can't insert to the new table during the atomic operation of search + * and insertion. Searches for duplicates in both the old and new table if + * a resize is in progress. + * + * Lookups may occur in parallel with hashtable mutations and resizing. + * + * Will trigger an automatic deferred table resizing if the size grows + * beyond the watermark indicated by grow_decision() which can be passed + * to rhashtable_init(). 
+ */ +bool rhashtable_lookup_compare_insert(struct rhashtable *ht, + struct rhash_head *obj, + bool (*compare)(void *, void *), + void *arg) { struct bucket_table *new_tbl, *old_tbl; spinlock_t *new_bucket_lock, *old_bucket_lock; @@ -747,7 +784,8 @@ bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj) if (unlikely(old_tbl != new_tbl)) spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); - if (rhashtable_lookup(ht, rht_obj(ht, obj) + ht->p.key_offset)) { + if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset, + compare, arg)) { success = false; goto exit; } @@ -763,7 +801,7 @@ exit: return success; } -EXPORT_SYMBOL_GPL(rhashtable_lookup_insert); +EXPORT_SYMBOL_GPL(rhashtable_lookup_compare_insert); static size_t rounded_hashtable_size(struct rhashtable_params *params) { -- cgit v1.2.3 From 6f73d3b13dc5e16ae06025cd1b12a36b2857caa2 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Mon, 12 Jan 2015 14:52:24 +0800 Subject: rhashtable: add a note for grow and shrink decision functions As commit c0c09bfdc415 ("rhashtable: avoid unnecessary wakeup for worker queue") moved the checks of whether the hash table size exceeds its maximum threshold or reaches its minimum threshold from the resizing functions to the resizing decision functions, we should add a note in rhashtable.h indicating that implementations of the grow and shrink decision functions must enforce min/max shift; otherwise, the watermarks set via min/max shift take no effect. Signed-off-by: Ying Xue Cc: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 7b9bd77ed684..9570832ab07c 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -79,6 +79,10 @@ struct rhashtable; * @obj_hashfn: Function to hash object * @grow_decision: If defined, may return true if table should expand * @shrink_decision: If defined, may return true if table should shrink + * + * Note: when implementing the grow and shrink decision functions, min/max + * shift must be enforced; otherwise, the resizing watermarks they set may be + * useless. */ struct rhashtable_params { size_t nelem_hint; -- cgit v1.2.3 From df8a39defad46b83694ea6dd868d332976d62cc0 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 13 Jan 2015 17:13:44 +0100 Subject: net: rename vlan_tx_* helpers since "tx" is misleading there The same macros are used for rx as well. So rename them. Signed-off-by: Jiri Pirko Signed-off-by: David S.
Miller --- Documentation/networking/filter.txt | 4 ++-- drivers/infiniband/hw/nes/nes_nic.c | 13 +++++++------ drivers/net/ethernet/3com/typhoon.c | 4 ++-- drivers/net/ethernet/alteon/acenic.c | 8 ++++---- drivers/net/ethernet/amd/amd8111e.c | 4 ++-- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 8 ++++---- drivers/net/ethernet/atheros/atl1c/atl1c_main.c | 4 ++-- drivers/net/ethernet/atheros/atl1e/atl1e_main.c | 4 ++-- drivers/net/ethernet/atheros/atlx/atl1.c | 4 ++-- drivers/net/ethernet/atheros/atlx/atl2.c | 4 ++-- drivers/net/ethernet/broadcom/bnx2.c | 4 ++-- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 4 ++-- drivers/net/ethernet/broadcom/tg3.c | 4 ++-- drivers/net/ethernet/brocade/bna/bnad.c | 4 ++-- drivers/net/ethernet/chelsio/cxgb/sge.c | 4 ++-- drivers/net/ethernet/chelsio/cxgb3/sge.c | 6 +++--- drivers/net/ethernet/chelsio/cxgb4/sge.c | 4 ++-- drivers/net/ethernet/chelsio/cxgb4vf/sge.c | 4 ++-- drivers/net/ethernet/cisco/enic/enic_main.c | 4 ++-- drivers/net/ethernet/emulex/benet/be_main.c | 12 ++++++------ drivers/net/ethernet/freescale/gianfar.c | 4 ++-- drivers/net/ethernet/ibm/ehea/ehea_main.c | 4 ++-- drivers/net/ethernet/intel/e1000/e1000_main.c | 5 +++-- drivers/net/ethernet/intel/e1000e/netdev.c | 9 +++++---- drivers/net/ethernet/intel/fm10k/fm10k_main.c | 4 ++-- drivers/net/ethernet/intel/fm10k/fm10k_netdev.c | 2 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 4 ++-- drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 4 ++-- drivers/net/ethernet/intel/igb/igb_main.c | 4 ++-- drivers/net/ethernet/intel/igbvf/netdev.c | 5 +++-- drivers/net/ethernet/intel/ixgb/ixgb_main.c | 4 ++-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 ++-- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 4 ++-- drivers/net/ethernet/jme.c | 4 ++-- drivers/net/ethernet/marvell/sky2.c | 6 +++--- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 12 ++++++------ drivers/net/ethernet/natsemi/ns83820.c | 4 ++-- drivers/net/ethernet/neterion/s2io.c | 4 ++-- drivers/net/ethernet/neterion/vxge/vxge-main.c | 4 ++-- drivers/net/ethernet/nvidia/forcedeth.c | 4 ++-- drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c | 4 ++-- drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c | 8 ++++---- drivers/net/ethernet/qlogic/qlge/qlge_main.c | 6 +++--- drivers/net/ethernet/realtek/8139cp.c | 4 ++-- drivers/net/ethernet/realtek/r8169.c | 4 ++-- drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c | 2 +- drivers/net/ethernet/tehuti/tehuti.c | 4 ++-- drivers/net/ethernet/via/via-rhine.c | 6 +++--- drivers/net/ethernet/via/via-velocity.c | 4 ++-- drivers/net/macvtap.c | 6 +++--- drivers/net/tun.c | 4 ++-- drivers/net/usb/r8152.c | 4 ++-- drivers/net/vmxnet3/vmxnet3_drv.c | 4 ++-- drivers/net/vxlan.c | 4 ++-- drivers/s390/net/qeth_l3_main.c | 8 ++++---- drivers/vhost/net.c | 2 +- include/linux/if_vlan.h | 16 ++++++++-------- include/net/pkt_sched.h | 2 +- include/trace/events/net.h | 8 ++++---- net/8021q/vlan_core.c | 2 +- net/bridge/br_netfilter.c | 12 ++++++------ net/bridge/br_private.h | 4 ++-- net/bridge/br_vlan.c | 4 ++-- net/bridge/netfilter/ebt_vlan.c | 4 ++-- net/bridge/netfilter/ebtables.c | 2 +- net/core/dev.c | 10 +++++----- net/core/netpoll.c | 2 +- net/core/skbuff.c | 8 ++++---- net/ipv4/geneve.c | 2 +- net/openvswitch/actions.c | 4 ++-- net/openvswitch/datapath.c | 2 +- net/openvswitch/flow.c | 4 ++-- net/openvswitch/vport-gre.c | 2 +- net/openvswitch/vport.c | 3 ++- net/packet/af_packet.c | 12 ++++++------ net/sched/em_meta.c | 2 +- net/wireless/util.c | 4 ++-- 77 files changed, 195 insertions(+), 190 deletions(-) 
(limited to 'include/linux') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 58d08f8d8d80..9930ecfbb465 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -279,8 +279,8 @@ Possible BPF extensions are shown in the following table: hatype skb->dev->type rxhash skb->hash cpu raw_smp_processor_id() - vlan_tci vlan_tx_tag_get(skb) - vlan_pr vlan_tx_tag_present(skb) + vlan_tci skb_vlan_tag_get(skb) + vlan_pr skb_vlan_tag_present(skb) rand prandom_u32() These extensions can also be prefixed with '#'. diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 49eb5111d2cd..70acda91eb2a 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -373,11 +373,11 @@ static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev) wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; /* setup the VLAN tag if present */ - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n", - netdev->name, vlan_tx_tag_get(skb)); + netdev->name, skb_vlan_tag_get(skb)); wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE; - wqe_fragment_length[0] = (__force __le16) vlan_tx_tag_get(skb); + wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb); } else wqe_misc = 0; @@ -576,11 +576,12 @@ tso_sq_no_longer_full: wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX]; /* setup the VLAN tag if present */ - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n", - netdev->name, vlan_tx_tag_get(skb) ); + netdev->name, + skb_vlan_tag_get(skb)); wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE; - wqe_fragment_length[0] = (__force __le16) vlan_tx_tag_get(skb); + wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb); } else wqe_misc = 0; diff --git a/drivers/net/ethernet/3com/typhoon.c b/drivers/net/ethernet/3com/typhoon.c index dede43f4ce09..8f8418d2ac4a 100644 --- a/drivers/net/ethernet/3com/typhoon.c +++ b/drivers/net/ethernet/3com/typhoon.c @@ -769,11 +769,11 @@ typhoon_start_tx(struct sk_buff *skb, struct net_device *dev) first_txd->processFlags |= TYPHOON_TX_PF_IP_CHKSUM; } - if(vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { first_txd->processFlags |= TYPHOON_TX_PF_INSERT_VLAN | TYPHOON_TX_PF_VLAN_PRIORITY; first_txd->processFlags |= - cpu_to_le32(htons(vlan_tx_tag_get(skb)) << + cpu_to_le32(htons(skb_vlan_tag_get(skb)) << TYPHOON_TX_PF_VLAN_TAG_SHIFT); } diff --git a/drivers/net/ethernet/alteon/acenic.c b/drivers/net/ethernet/alteon/acenic.c index b68074803de3..b90a26b13fdf 100644 --- a/drivers/net/ethernet/alteon/acenic.c +++ b/drivers/net/ethernet/alteon/acenic.c @@ -2429,9 +2429,9 @@ restart: flagsize = (skb->len << 16) | (BD_FLG_END); if (skb->ip_summed == CHECKSUM_PARTIAL) flagsize |= BD_FLG_TCP_UDP_SUM; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { flagsize |= BD_FLG_VLAN_TAG; - vlan_tag = vlan_tx_tag_get(skb); + vlan_tag = skb_vlan_tag_get(skb); } desc = ap->tx_ring + idx; idx = (idx + 1) % ACE_TX_RING_ENTRIES(ap); @@ -2450,9 +2450,9 @@ restart: flagsize = (skb_headlen(skb) << 16); if (skb->ip_summed == CHECKSUM_PARTIAL) flagsize |= BD_FLG_TCP_UDP_SUM; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { flagsize |= BD_FLG_VLAN_TAG; - vlan_tag = vlan_tx_tag_get(skb); + vlan_tag = 
skb_vlan_tag_get(skb); } ace_load_tx_bd(ap, ap->tx_ring + idx, mapping, flagsize, vlan_tag); diff --git a/drivers/net/ethernet/amd/amd8111e.c b/drivers/net/ethernet/amd/amd8111e.c index 841e6558db68..4c2ae2221780 100644 --- a/drivers/net/ethernet/amd/amd8111e.c +++ b/drivers/net/ethernet/amd/amd8111e.c @@ -1299,11 +1299,11 @@ static netdev_tx_t amd8111e_start_xmit(struct sk_buff *skb, lp->tx_ring[tx_index].tx_flags = 0; #if AMD8111E_VLAN_TAG_USED - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { lp->tx_ring[tx_index].tag_ctrl_cmd |= cpu_to_le16(TCC_VLAN_INSERT); lp->tx_ring[tx_index].tag_ctrl_info = - cpu_to_le16(vlan_tx_tag_get(skb)); + cpu_to_le16(skb_vlan_tag_get(skb)); } #endif diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 7bb5f07dbeef..2ba1dd22ad64 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -1165,8 +1165,8 @@ static void xgbe_prep_tx_tstamp(struct xgbe_prv_data *pdata, static void xgbe_prep_vlan(struct sk_buff *skb, struct xgbe_packet_data *packet) { - if (vlan_tx_tag_present(skb)) - packet->vlan_ctag = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) + packet->vlan_ctag = skb_vlan_tag_get(skb); } static int xgbe_prep_tso(struct sk_buff *skb, struct xgbe_packet_data *packet) @@ -1247,9 +1247,9 @@ static void xgbe_packet_info(struct xgbe_prv_data *pdata, XGMAC_SET_BITS(packet->attributes, TX_PACKET_ATTRIBUTES, CSUM_ENABLE, 1); - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { /* VLAN requires an extra descriptor if tag is different */ - if (vlan_tx_tag_get(skb) != ring->tx.cur_vlan_ctag) + if (skb_vlan_tag_get(skb) != ring->tx.cur_vlan_ctag) /* We can share with the TSO context descriptor */ if (!context_desc) { context_desc = 1; diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c index c9946c6c119e..587f63e87588 100644 --- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c +++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c @@ -2235,8 +2235,8 @@ static netdev_tx_t atl1c_xmit_frame(struct sk_buff *skb, return NETDEV_TX_OK; } - if (unlikely(vlan_tx_tag_present(skb))) { - u16 vlan = vlan_tx_tag_get(skb); + if (unlikely(skb_vlan_tag_present(skb))) { + u16 vlan = skb_vlan_tag_get(skb); __le16 tag; vlan = cpu_to_le16(vlan); diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c index c88abf5b6415..59a03a193e83 100644 --- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c +++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c @@ -1892,8 +1892,8 @@ static netdev_tx_t atl1e_xmit_frame(struct sk_buff *skb, tpd = atl1e_get_tpd(adapter); - if (vlan_tx_tag_present(skb)) { - u16 vlan_tag = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + u16 vlan_tag = skb_vlan_tag_get(skb); u16 atl1e_vlan_tag; tpd->word3 |= 1 << TPD_INS_VL_TAG_SHIFT; diff --git a/drivers/net/ethernet/atheros/atlx/atl1.c b/drivers/net/ethernet/atheros/atlx/atl1.c index 2c8f398aeda9..eca1d113fee1 100644 --- a/drivers/net/ethernet/atheros/atlx/atl1.c +++ b/drivers/net/ethernet/atheros/atlx/atl1.c @@ -2415,8 +2415,8 @@ static netdev_tx_t atl1_xmit_frame(struct sk_buff *skb, (u16) atomic_read(&tpd_ring->next_to_use)); memset(ptpd, 0, sizeof(struct tx_packet_desc)); - if (vlan_tx_tag_present(skb)) { - vlan_tag = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + vlan_tag = skb_vlan_tag_get(skb); vlan_tag = (vlan_tag << 4) | (vlan_tag >> 13) | 
((vlan_tag >> 9) & 0x8); ptpd->word3 |= 1 << TPD_INS_VL_TAG_SHIFT; diff --git a/drivers/net/ethernet/atheros/atlx/atl2.c b/drivers/net/ethernet/atheros/atlx/atl2.c index 482a7cabb0a1..46a535318c7a 100644 --- a/drivers/net/ethernet/atheros/atlx/atl2.c +++ b/drivers/net/ethernet/atheros/atlx/atl2.c @@ -887,8 +887,8 @@ static netdev_tx_t atl2_xmit_frame(struct sk_buff *skb, offset = ((u32)(skb->len-copy_len + 3) & ~3); } #ifdef NETIF_F_HW_VLAN_CTAG_TX - if (vlan_tx_tag_present(skb)) { - u16 vlan_tag = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + u16 vlan_tag = skb_vlan_tag_get(skb); vlan_tag = (vlan_tag << 4) | (vlan_tag >> 13) | ((vlan_tag >> 9) & 0x8); diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index 823d01c5684c..02bf0b86995b 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -6597,9 +6597,9 @@ bnx2_start_xmit(struct sk_buff *skb, struct net_device *dev) vlan_tag_flags |= TX_BD_FLAGS_TCP_UDP_CKSUM; } - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { vlan_tag_flags |= - (TX_BD_FLAGS_VLAN_TAG | (vlan_tx_tag_get(skb) << 16)); + (TX_BD_FLAGS_VLAN_TAG | (skb_vlan_tag_get(skb) << 16)); } if ((mss = skb_shinfo(skb)->gso_size)) { diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c index 1d1147c93d59..b51a18a09d4d 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c @@ -3865,9 +3865,9 @@ netdev_tx_t bnx2x_start_xmit(struct sk_buff *skb, struct net_device *dev) "sending pkt %u @%p next_idx %u bd %u @%p\n", pkt_prod, tx_buf, txdata->tx_pkt_prod, bd_prod, tx_start_bd); - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { tx_start_bd->vlan_or_ethertype = - cpu_to_le16(vlan_tx_tag_get(skb)); + cpu_to_le16(skb_vlan_tag_get(skb)); tx_start_bd->bd_flags.as_bitfield |= (X_ETH_OUTBAND_VLAN << ETH_TX_BD_FLAGS_VLAN_MODE_SHIFT); } else { diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index 356bd5b022a5..4cf43bfbc955 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -8002,9 +8002,9 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) !mss && skb->len > VLAN_ETH_FRAME_LEN) base_flags |= TXD_FLAG_JMB_PKT; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { base_flags |= TXD_FLAG_VLAN; - vlan = vlan_tx_tag_get(skb); + vlan = skb_vlan_tag_get(skb); } if ((unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) && diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c index 323721838cf9..7714d7790089 100644 --- a/drivers/net/ethernet/brocade/bna/bnad.c +++ b/drivers/net/ethernet/brocade/bna/bnad.c @@ -2824,8 +2824,8 @@ bnad_txq_wi_prepare(struct bnad *bnad, struct bna_tcb *tcb, u32 gso_size; u16 vlan_tag = 0; - if (vlan_tx_tag_present(skb)) { - vlan_tag = (u16)vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + vlan_tag = (u16)skb_vlan_tag_get(skb); flags |= (BNA_TXQ_WI_CF_INS_PRIO | BNA_TXQ_WI_CF_INS_VLAN); } if (test_bit(BNAD_RF_CEE_RUNNING, &bnad->run_flags)) { diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c index babe2a915b00..526ea74e82d9 100644 --- a/drivers/net/ethernet/chelsio/cxgb/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb/sge.c @@ -1860,9 +1860,9 @@ netdev_tx_t t1_start_xmit(struct sk_buff *skb, struct net_device *dev) } cpl->iff 
= dev->if_port; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { cpl->vlan_valid = 1; - cpl->vlan = htons(vlan_tx_tag_get(skb)); + cpl->vlan = htons(skb_vlan_tag_get(skb)); st->vlan_insert++; } else cpl->vlan_valid = 0; diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c index 3dfcf600fcc6..d6aa602f168d 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c @@ -1148,8 +1148,8 @@ static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb, cpl->len = htonl(skb->len); cntrl = V_TXPKT_INTF(pi->port_id); - if (vlan_tx_tag_present(skb)) - cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(vlan_tx_tag_get(skb)); + if (skb_vlan_tag_present(skb)) + cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(skb_vlan_tag_get(skb)); tso_info = V_LSO_MSS(skb_shinfo(skb)->gso_size); if (tso_info) { @@ -1282,7 +1282,7 @@ netdev_tx_t t3_eth_xmit(struct sk_buff *skb, struct net_device *dev) qs->port_stats[SGE_PSTAT_TX_CSUM]++; if (skb_shinfo(skb)->gso_size) qs->port_stats[SGE_PSTAT_TSO]++; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) qs->port_stats[SGE_PSTAT_VLANINS]++; /* diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index ca42e2e9dec9..619156112b21 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -1154,9 +1154,9 @@ out_free: dev_kfree_skb_any(skb); cntrl = TXPKT_L4CSUM_DIS | TXPKT_IPCSUM_DIS; } - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { q->vlan_ins++; - cntrl |= TXPKT_VLAN_VLD | TXPKT_VLAN(vlan_tx_tag_get(skb)); + cntrl |= TXPKT_VLAN_VLD | TXPKT_VLAN(skb_vlan_tag_get(skb)); } cpl->ctrl0 = htonl(TXPKT_OPCODE(CPL_TX_PKT_XT) | diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c index 4424277a7e4d..0545f0de1c52 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c @@ -1326,9 +1326,9 @@ int t4vf_eth_xmit(struct sk_buff *skb, struct net_device *dev) * If there's a VLAN tag present, add that to the list of things to * do in this Work Request. 
*/ - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { txq->vlan_ins++; - cntrl |= TXPKT_VLAN_VLD | TXPKT_VLAN(vlan_tx_tag_get(skb)); + cntrl |= TXPKT_VLAN_VLD | TXPKT_VLAN(skb_vlan_tag_get(skb)); } /* diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c index 9a952df6606e..0535f6fbdc71 100644 --- a/drivers/net/ethernet/cisco/enic/enic_main.c +++ b/drivers/net/ethernet/cisco/enic/enic_main.c @@ -520,10 +520,10 @@ static inline void enic_queue_wq_skb(struct enic *enic, int loopback = 0; int err; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { /* VLAN tag from trunking driver */ vlan_tag_insert = 1; - vlan_tag = vlan_tx_tag_get(skb); + vlan_tag = skb_vlan_tag_get(skb); } else if (enic->loop_enable) { vlan_tag = enic->loop_tag; loopback = 1; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 37a26b0b7e33..ed46610e5453 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -694,7 +694,7 @@ static inline u16 be_get_tx_vlan_tag(struct be_adapter *adapter, u8 vlan_prio; u16 vlan_tag; - vlan_tag = vlan_tx_tag_get(skb); + vlan_tag = skb_vlan_tag_get(skb); vlan_prio = (vlan_tag & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; /* If vlan priority provided by OS is NOT in available bmap */ if (!(adapter->vlan_prio_bmap & (1 << vlan_prio))) @@ -745,7 +745,7 @@ static void wrb_fill_hdr(struct be_adapter *adapter, struct be_eth_hdr_wrb *hdr, SET_TX_WRB_HDR_BITS(udpcs, hdr, 1); } - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { SET_TX_WRB_HDR_BITS(vlan, hdr, 1); vlan_tag = be_get_tx_vlan_tag(adapter, skb); SET_TX_WRB_HDR_BITS(vlan_tag, hdr, vlan_tag); @@ -864,7 +864,7 @@ static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter, if (unlikely(!skb)) return skb; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) vlan_tag = be_get_tx_vlan_tag(adapter, skb); if (qnq_async_evt_rcvd(adapter) && adapter->pvid) { @@ -923,7 +923,7 @@ static bool be_ipv6_exthdr_check(struct sk_buff *skb) static int be_vlan_tag_tx_chk(struct be_adapter *adapter, struct sk_buff *skb) { - return vlan_tx_tag_present(skb) || adapter->pvid || adapter->qnq_vid; + return skb_vlan_tag_present(skb) || adapter->pvid || adapter->qnq_vid; } static int be_ipv6_tx_stall_chk(struct be_adapter *adapter, struct sk_buff *skb) @@ -946,7 +946,7 @@ static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter, eth_hdr_len = ntohs(skb->protocol) == ETH_P_8021Q ? VLAN_ETH_HLEN : ETH_HLEN; if (skb->len <= 60 && - (lancer_chip(adapter) || vlan_tx_tag_present(skb)) && + (lancer_chip(adapter) || skb_vlan_tag_present(skb)) && is_ipv4_pkt(skb)) { ip = (struct iphdr *)ip_hdr(skb); pskb_trim(skb, eth_hdr_len + ntohs(ip->tot_len)); @@ -964,7 +964,7 @@ static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter, * Manually insert VLAN in pkt. 
*/ if (skb->ip_summed != CHECKSUM_PARTIAL && - vlan_tx_tag_present(skb)) { + skb_vlan_tag_present(skb)) { skb = be_insert_vlan_in_pkt(adapter, skb, skip_hw_vlan); if (unlikely(!skb)) goto err; diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index e54b1e39f9b4..93ff846e96f1 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -2170,7 +2170,7 @@ static inline void gfar_tx_checksum(struct sk_buff *skb, struct txfcb *fcb, void inline gfar_tx_vlan(struct sk_buff *skb, struct txfcb *fcb) { fcb->flags |= TXFCB_VLN; - fcb->vlctl = vlan_tx_tag_get(skb); + fcb->vlctl = skb_vlan_tag_get(skb); } static inline struct txbd8 *skip_txbd(struct txbd8 *bdp, int stride, @@ -2230,7 +2230,7 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev) regs = tx_queue->grp->regs; do_csum = (CHECKSUM_PARTIAL == skb->ip_summed); - do_vlan = vlan_tx_tag_present(skb); + do_vlan = skb_vlan_tag_present(skb); do_tstamp = (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && priv->hwts_tx_en; diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c index 566b17db135a..e8a1adb7a962 100644 --- a/drivers/net/ethernet/ibm/ehea/ehea_main.c +++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c @@ -2064,9 +2064,9 @@ static int ehea_start_xmit(struct sk_buff *skb, struct net_device *dev) memset(swqe, 0, SWQE_HEADER_SIZE); atomic_dec(&pr->swqe_avail); - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { swqe->tx_control |= EHEA_SWQE_VLAN_INSERT; - swqe->vlan_tag = vlan_tx_tag_get(skb); + swqe->vlan_tag = skb_vlan_tag_get(skb); } pr->tx_packets++; diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c index 83140cbb5f01..9242982db3e0 100644 --- a/drivers/net/ethernet/intel/e1000/e1000_main.c +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c @@ -3226,9 +3226,10 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb, return NETDEV_TX_BUSY; } - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { tx_flags |= E1000_TX_FLAGS_VLAN; - tx_flags |= (vlan_tx_tag_get(skb) << E1000_TX_FLAGS_VLAN_SHIFT); + tx_flags |= (skb_vlan_tag_get(skb) << + E1000_TX_FLAGS_VLAN_SHIFT); } first = tx_ring->next_to_use; diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 332a298e95b5..38cb586b1bf4 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -5463,8 +5463,8 @@ static int e1000_transfer_dhcp_info(struct e1000_adapter *adapter, struct e1000_hw *hw = &adapter->hw; u16 length, offset; - if (vlan_tx_tag_present(skb) && - !((vlan_tx_tag_get(skb) == adapter->hw.mng_cookie.vlan_id) && + if (skb_vlan_tag_present(skb) && + !((skb_vlan_tag_get(skb) == adapter->hw.mng_cookie.vlan_id) && (adapter->hw.mng_cookie.status & E1000_MNG_DHCP_COOKIE_STATUS_VLAN))) return 0; @@ -5603,9 +5603,10 @@ static netdev_tx_t e1000_xmit_frame(struct sk_buff *skb, if (e1000_maybe_stop_tx(tx_ring, count + 2)) return NETDEV_TX_BUSY; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { tx_flags |= E1000_TX_FLAGS_VLAN; - tx_flags |= (vlan_tx_tag_get(skb) << E1000_TX_FLAGS_VLAN_SHIFT); + tx_flags |= (skb_vlan_tag_get(skb) << + E1000_TX_FLAGS_VLAN_SHIFT); } first = tx_ring->next_to_use; diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c index eb088b129bc7..caa43f7c2931 100644 --- 
a/drivers/net/ethernet/intel/fm10k/fm10k_main.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c @@ -965,8 +965,8 @@ static void fm10k_tx_map(struct fm10k_ring *tx_ring, tx_desc = FM10K_TX_DESC(tx_ring, i); /* add HW VLAN tag */ - if (vlan_tx_tag_present(skb)) - tx_desc->vlan = cpu_to_le16(vlan_tx_tag_get(skb)); + if (skb_vlan_tag_present(skb)) + tx_desc->vlan = cpu_to_le16(skb_vlan_tag_get(skb)); else tx_desc->vlan = 0; diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c index 8811364b91cb..945b35d31c71 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_netdev.c @@ -609,7 +609,7 @@ static netdev_tx_t fm10k_xmit_frame(struct sk_buff *skb, struct net_device *dev) int err; if ((skb->protocol == htons(ETH_P_8021Q)) && - !vlan_tx_tag_present(skb)) { + !skb_vlan_tag_present(skb)) { /* FM10K only supports hardware tagging, any tags in frame * are considered 2nd level or "outer" tags */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 04b441460bbd..9f536dd8e1ec 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1772,8 +1772,8 @@ static int i40e_tx_prepare_vlan_flags(struct sk_buff *skb, u32 tx_flags = 0; /* if we have a HW VLAN tag being added, default to the HW one */ - if (vlan_tx_tag_present(skb)) { - tx_flags |= vlan_tx_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT; + if (skb_vlan_tag_present(skb)) { + tx_flags |= skb_vlan_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT; tx_flags |= I40E_TX_FLAGS_HW_VLAN; /* else if it is a SW VLAN, check the next protocol and store the tag */ } else if (protocol == htons(ETH_P_8021Q)) { diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index 04c7c1557a0c..82c3798fdd36 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -1122,8 +1122,8 @@ static int i40e_tx_prepare_vlan_flags(struct sk_buff *skb, u32 tx_flags = 0; /* if we have a HW VLAN tag being added, default to the HW one */ - if (vlan_tx_tag_present(skb)) { - tx_flags |= vlan_tx_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT; + if (skb_vlan_tag_present(skb)) { + tx_flags |= skb_vlan_tag_get(skb) << I40E_TX_FLAGS_VLAN_SHIFT; tx_flags |= I40E_TX_FLAGS_HW_VLAN; /* else if it is a SW VLAN, check the next protocol and store the tag */ } else if (protocol == htons(ETH_P_8021Q)) { diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index ff59897a9463..6c25ec314183 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -5035,9 +5035,9 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb, skb_tx_timestamp(skb); - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { tx_flags |= IGB_TX_FLAGS_VLAN; - tx_flags |= (vlan_tx_tag_get(skb) << IGB_TX_FLAGS_VLAN_SHIFT); + tx_flags |= (skb_vlan_tag_get(skb) << IGB_TX_FLAGS_VLAN_SHIFT); } /* record initial flags and protocol */ diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c index 63c807c9b21c..ad2b4897b392 100644 --- a/drivers/net/ethernet/intel/igbvf/netdev.c +++ b/drivers/net/ethernet/intel/igbvf/netdev.c @@ -2234,9 +2234,10 @@ static netdev_tx_t igbvf_xmit_frame_ring_adv(struct sk_buff *skb, return NETDEV_TX_BUSY; } - if (vlan_tx_tag_present(skb)) { + if 
(skb_vlan_tag_present(skb)) { tx_flags |= IGBVF_TX_FLAGS_VLAN; - tx_flags |= (vlan_tx_tag_get(skb) << IGBVF_TX_FLAGS_VLAN_SHIFT); + tx_flags |= (skb_vlan_tag_get(skb) << + IGBVF_TX_FLAGS_VLAN_SHIFT); } if (skb->protocol == htons(ETH_P_IP)) diff --git a/drivers/net/ethernet/intel/ixgb/ixgb_main.c b/drivers/net/ethernet/intel/ixgb/ixgb_main.c index aa87605b144a..11a1bdbe3fd9 100644 --- a/drivers/net/ethernet/intel/ixgb/ixgb_main.c +++ b/drivers/net/ethernet/intel/ixgb/ixgb_main.c @@ -1532,9 +1532,9 @@ ixgb_xmit_frame(struct sk_buff *skb, struct net_device *netdev) DESC_NEEDED))) return NETDEV_TX_BUSY; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { tx_flags |= IXGB_TX_FLAGS_VLAN; - vlan_id = vlan_tx_tag_get(skb); + vlan_id = skb_vlan_tag_get(skb); } first = adapter->tx_ring.next_to_use; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 2ed2c7de2304..7bb421bfd84e 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -7217,8 +7217,8 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, first->gso_segs = 1; /* if we have a HW VLAN tag being added default to the HW one */ - if (vlan_tx_tag_present(skb)) { - tx_flags |= vlan_tx_tag_get(skb) << IXGBE_TX_FLAGS_VLAN_SHIFT; + if (skb_vlan_tag_present(skb)) { + tx_flags |= skb_vlan_tag_get(skb) << IXGBE_TX_FLAGS_VLAN_SHIFT; tx_flags |= IXGBE_TX_FLAGS_HW_VLAN; /* else if it is a SW VLAN check the next protocol and store the tag */ } else if (protocol == htons(ETH_P_8021Q)) { diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 62a0d8e0f17d..c9b49bfb51bb 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -3452,8 +3452,8 @@ static int ixgbevf_xmit_frame(struct sk_buff *skb, struct net_device *netdev) first->bytecount = skb->len; first->gso_segs = 1; - if (vlan_tx_tag_present(skb)) { - tx_flags |= vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + tx_flags |= skb_vlan_tag_get(skb); tx_flags <<= IXGBE_TX_FLAGS_VLAN_SHIFT; tx_flags |= IXGBE_TX_FLAGS_VLAN; } diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index 44ce7d88f554..6e9a792097d3 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -2154,9 +2154,9 @@ jme_tx_csum(struct jme_adapter *jme, struct sk_buff *skb, u8 *flags) static inline void jme_tx_vlan(struct sk_buff *skb, __le16 *vlan, u8 *flags) { - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { *flags |= TXFLAG_TAGON; - *vlan = cpu_to_le16(vlan_tx_tag_get(skb)); + *vlan = cpu_to_le16(skb_vlan_tag_get(skb)); } } diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index 867a6a3ef81f..d9f4498832a1 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -1895,14 +1895,14 @@ static netdev_tx_t sky2_xmit_frame(struct sk_buff *skb, ctrl = 0; /* Add VLAN tag, can piggyback on LRGLEN or ADDR64 */ - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { if (!le) { le = get_tx_le(sky2, &slot); le->addr = 0; le->opcode = OP_VLAN|HW_OWNER; } else le->opcode |= OP_VLAN; - le->length = cpu_to_be16(vlan_tx_tag_get(skb)); + le->length = cpu_to_be16(skb_vlan_tag_get(skb)); ctrl |= INS_VLAN; } @@ -2594,7 +2594,7 @@ static struct sk_buff *sky2_receive(struct net_device *dev, sky2->rx_next = (sky2->rx_next + 1) % sky2->rx_pending; 
prefetch(sky2->rx_ring + sky2->rx_next); - if (vlan_tx_tag_present(re->skb)) + if (skb_vlan_tag_present(re->skb)) count -= VLAN_HLEN; /* Account for vlan tag */ /* This chip has hardware problems that generates bogus status. diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index e3357bf523df..359bb1286eb5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -682,8 +682,8 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, if (dev->num_tc) return skb_tx_hash(dev, skb); - if (vlan_tx_tag_present(skb)) - up = vlan_tx_tag_get(skb) >> VLAN_PRIO_SHIFT; + if (skb_vlan_tag_present(skb)) + up = skb_vlan_tag_get(skb) >> VLAN_PRIO_SHIFT; return fallback(dev, skb) % rings_p_up + up * rings_p_up; } @@ -742,8 +742,8 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_drop; } - if (vlan_tx_tag_present(skb)) - vlan_tag = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) + vlan_tag = skb_vlan_tag_get(skb); netdev_txq_bql_enqueue_prefetchw(ring->tx_queue); @@ -930,7 +930,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) real_size = (real_size / 16) & 0x3f; if (ring->bf_enabled && desc_size <= MAX_BF && !bounce && - !vlan_tx_tag_present(skb) && send_doorbell) { + !skb_vlan_tag_present(skb) && send_doorbell) { tx_desc->ctrl.bf_qpn = ring->doorbell_qpn | cpu_to_be32(real_size); @@ -952,7 +952,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) } else { tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag); tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * - !!vlan_tx_tag_present(skb); + !!skb_vlan_tag_present(skb); tx_desc->ctrl.fence_size = real_size; /* Ensure new descriptor hits memory diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c index 2552e550a78c..eb807b0dc72a 100644 --- a/drivers/net/ethernet/natsemi/ns83820.c +++ b/drivers/net/ethernet/natsemi/ns83820.c @@ -1122,12 +1122,12 @@ again: } #ifdef NS83820_VLAN_ACCEL_SUPPORT - if(vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { /* fetch the vlan tag info out of the * ancillary data if the vlan code * is using hw vlan acceleration */ - short tag = vlan_tx_tag_get(skb); + short tag = skb_vlan_tag_get(skb); extsts |= (EXTSTS_VPKT | htons(tag)); } #endif diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index f5e4b820128b..0529cad75b10 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -4045,8 +4045,8 @@ static netdev_tx_t s2io_xmit(struct sk_buff *skb, struct net_device *dev) } queue = 0; - if (vlan_tx_tag_present(skb)) - vlan_tag = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) + vlan_tag = skb_vlan_tag_get(skb); if (sp->config.tx_steering_type == TX_DEFAULT_STEERING) { if (skb->protocol == htons(ETH_P_IP)) { struct iphdr *ip; diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c index cc0485e3c621..50d5604833ed 100644 --- a/drivers/net/ethernet/neterion/vxge/vxge-main.c +++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c @@ -890,8 +890,8 @@ vxge_xmit(struct sk_buff *skb, struct net_device *dev) dev->name, __func__, __LINE__, fifo_hw, dtr, dtr_priv); - if (vlan_tx_tag_present(skb)) { - u16 vlan_tag = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + u16 vlan_tag = skb_vlan_tag_get(skb); vxge_hw_fifo_txdl_vlan_set(dtr, vlan_tag); } diff --git 
a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c index f39cae620f61..a41bb5e6b954 100644 --- a/drivers/net/ethernet/nvidia/forcedeth.c +++ b/drivers/net/ethernet/nvidia/forcedeth.c @@ -2462,9 +2462,9 @@ static netdev_tx_t nv_start_xmit_optimized(struct sk_buff *skb, NV_TX2_CHECKSUM_L3 | NV_TX2_CHECKSUM_L4 : 0; /* vlan tag */ - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) start_tx->txvlan = cpu_to_le32(NV_TX3_VLAN_TAG_PRESENT | - vlan_tx_tag_get(skb)); + skb_vlan_tag_get(skb)); else start_tx->txvlan = 0; diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c index 613037584d08..a47fe67fdf58 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c @@ -1893,9 +1893,9 @@ netxen_tso_check(struct net_device *netdev, protocol = vh->h_vlan_encapsulated_proto; flags = FLAGS_VLAN_TAGGED; - } else if (vlan_tx_tag_present(skb)) { + } else if (skb_vlan_tag_present(skb)) { flags = FLAGS_VLAN_OOB; - vid = vlan_tx_tag_get(skb); + vid = skb_vlan_tag_get(skb); netxen_set_tx_vlan_tci(first_desc, vid); vlan_oob = 1; } diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c index d166e534925d..4d2496f28b85 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c @@ -321,8 +321,8 @@ static void qlcnic_send_filter(struct qlcnic_adapter *adapter, if (protocol == ETH_P_8021Q) { vh = (struct vlan_ethhdr *)skb->data; vlan_id = ntohs(vh->h_vlan_TCI); - } else if (vlan_tx_tag_present(skb)) { - vlan_id = vlan_tx_tag_get(skb); + } else if (skb_vlan_tag_present(skb)) { + vlan_id = skb_vlan_tag_get(skb); } } @@ -473,9 +473,9 @@ static int qlcnic_tx_pkt(struct qlcnic_adapter *adapter, flags = QLCNIC_FLAGS_VLAN_TAGGED; vlan_tci = ntohs(vh->h_vlan_TCI); protocol = ntohs(vh->h_vlan_encapsulated_proto); - } else if (vlan_tx_tag_present(skb)) { + } else if (skb_vlan_tag_present(skb)) { flags = QLCNIC_FLAGS_VLAN_OOB; - vlan_tci = vlan_tx_tag_get(skb); + vlan_tci = skb_vlan_tag_get(skb); } if (unlikely(adapter->tx_pvid)) { if (vlan_tci && !(adapter->flags & QLCNIC_TAGGING_ENABLED)) diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index 6c904a6cad2a..dc0058f90370 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -2660,11 +2660,11 @@ static netdev_tx_t qlge_send(struct sk_buff *skb, struct net_device *ndev) mac_iocb_ptr->frame_len = cpu_to_le16((u16) skb->len); - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { netif_printk(qdev, tx_queued, KERN_DEBUG, qdev->ndev, - "Adding a vlan tag %d.\n", vlan_tx_tag_get(skb)); + "Adding a vlan tag %d.\n", skb_vlan_tag_get(skb)); mac_iocb_ptr->flags3 |= OB_MAC_IOCB_V; - mac_iocb_ptr->vlan_tci = cpu_to_le16(vlan_tx_tag_get(skb)); + mac_iocb_ptr->vlan_tci = cpu_to_le16(skb_vlan_tag_get(skb)); } tso = ql_tso(skb, (struct ob_mac_tso_iocb_req *)mac_iocb_ptr); if (tso < 0) { diff --git a/drivers/net/ethernet/realtek/8139cp.c b/drivers/net/ethernet/realtek/8139cp.c index 9c31e46d1eee..d79e33b3c191 100644 --- a/drivers/net/ethernet/realtek/8139cp.c +++ b/drivers/net/ethernet/realtek/8139cp.c @@ -708,8 +708,8 @@ static void cp_tx (struct cp_private *cp) static inline u32 cp_tx_vlan_tag(struct sk_buff *skb) { - return vlan_tx_tag_present(skb) ? 
- TxVlanTag | swab16(vlan_tx_tag_get(skb)) : 0x00; + return skb_vlan_tag_present(skb) ? + TxVlanTag | swab16(skb_vlan_tag_get(skb)) : 0x00; } static void unwind_tx_frag_mapping(struct cp_private *cp, struct sk_buff *skb, diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 3a280598a15a..cd286b0356ab 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -2073,8 +2073,8 @@ static int rtl8169_set_features(struct net_device *dev, static inline u32 rtl8169_tx_vlan_tag(struct sk_buff *skb) { - return (vlan_tx_tag_present(skb)) ? - TxVlanTag | swab16(vlan_tx_tag_get(skb)) : 0x00; + return (skb_vlan_tag_present(skb)) ? + TxVlanTag | swab16(skb_vlan_tag_get(skb)) : 0x00; } static void rtl8169_rx_vlan_tag(struct RxDesc *desc, struct sk_buff *skb) diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c index b6612d6090ac..23545e1e605a 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c @@ -1272,7 +1272,7 @@ static netdev_tx_t sxgbe_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb_is_gso(skb) && tqueue->prev_mss != cur_mss)) ctxt_desc_req = 1; - if (unlikely(vlan_tx_tag_present(skb) || + if (unlikely(skb_vlan_tag_present(skb) || ((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && tqueue->hwts_tx_en))) ctxt_desc_req = 1; diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c index 6ab36d9ff2ab..a9cac8413e49 100644 --- a/drivers/net/ethernet/tehuti/tehuti.c +++ b/drivers/net/ethernet/tehuti/tehuti.c @@ -1650,9 +1650,9 @@ static netdev_tx_t bdx_tx_transmit(struct sk_buff *skb, txd_mss); } - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { /*Cut VLAN ID to 12 bits */ - txd_vlan_id = vlan_tx_tag_get(skb) & BITS_MASK(12); + txd_vlan_id = skb_vlan_tag_get(skb) & BITS_MASK(12); txd_vtag = 1; } diff --git a/drivers/net/ethernet/via/via-rhine.c b/drivers/net/ethernet/via/via-rhine.c index a191afc23b56..0ac76102b33d 100644 --- a/drivers/net/ethernet/via/via-rhine.c +++ b/drivers/net/ethernet/via/via-rhine.c @@ -1781,8 +1781,8 @@ static netdev_tx_t rhine_start_tx(struct sk_buff *skb, rp->tx_ring[entry].desc_length = cpu_to_le32(TXDESC | (skb->len >= ETH_ZLEN ? skb->len : ETH_ZLEN)); - if (unlikely(vlan_tx_tag_present(skb))) { - u16 vid_pcp = vlan_tx_tag_get(skb); + if (unlikely(skb_vlan_tag_present(skb))) { + u16 vid_pcp = skb_vlan_tag_get(skb); /* drop CFI/DEI bit, register needs VID and PCP */ vid_pcp = (vid_pcp & VLAN_VID_MASK) | @@ -1803,7 +1803,7 @@ static netdev_tx_t rhine_start_tx(struct sk_buff *skb, /* Non-x86 Todo: explicitly flush cache lines here. 
*/ - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) /* Tx queues are bits 7-0 (first Tx queue: bit 7) */ BYTE_REG_BITS_ON(1 << 7, ioaddr + TQWake); diff --git a/drivers/net/ethernet/via/via-velocity.c b/drivers/net/ethernet/via/via-velocity.c index 282f83a63b67..c20206f83cc1 100644 --- a/drivers/net/ethernet/via/via-velocity.c +++ b/drivers/net/ethernet/via/via-velocity.c @@ -2611,8 +2611,8 @@ static netdev_tx_t velocity_xmit(struct sk_buff *skb, td_ptr->tdesc1.cmd = TCPLS_NORMAL + (tdinfo->nskb_dma + 1) * 16; - if (vlan_tx_tag_present(skb)) { - td_ptr->tdesc1.vlan = cpu_to_le16(vlan_tx_tag_get(skb)); + if (skb_vlan_tag_present(skb)) { + td_ptr->tdesc1.vlan = cpu_to_le16(skb_vlan_tag_get(skb)); td_ptr->tdesc1.TCR |= TCR0_VETAG; } diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 7df221788cd4..d0ed5694dd7d 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -645,7 +645,7 @@ static void macvtap_skb_to_vnet_hdr(struct macvtap_queue *q, if (skb->ip_summed == CHECKSUM_PARTIAL) { vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) vnet_hdr->csum_start = cpu_to_macvtap16(q, skb_checksum_start_offset(skb) + VLAN_HLEN); else @@ -821,13 +821,13 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q, total = vnet_hdr_len; total += skb->len; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; } veth; veth.h_vlan_proto = skb->vlan_proto; - veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb)); + veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); total += VLAN_HLEN; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 74fdf1158448..be196e89ab6c 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1260,7 +1260,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, int vlan_hlen = 0; int vnet_hdr_sz = 0; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) vlan_hlen = VLAN_HLEN; if (tun->flags & IFF_VNET_HDR) @@ -1337,7 +1337,7 @@ static ssize_t tun_put_user(struct tun_struct *tun, } veth; veth.h_vlan_proto = skb->vlan_proto; - veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb)); + veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index b23426e4952c..e519e6a269b9 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -1421,10 +1421,10 @@ static int msdn_giant_send_check(struct sk_buff *skb) static inline void rtl_tx_vlan_tag(struct tx_desc *desc, struct sk_buff *skb) { - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { u32 opts2; - opts2 = TX_VLAN_TAG | swab16(vlan_tx_tag_get(skb)); + opts2 = TX_VLAN_TAG | swab16(skb_vlan_tag_get(skb)); desc->opts2 |= cpu_to_le32(opts2); } } diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 31439818c27e..294214c15292 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -1038,9 +1038,9 @@ vmxnet3_tq_xmit(struct sk_buff *skb, struct vmxnet3_tx_queue *tq, le32_add_cpu(&tq->shared->txNumDeferred, 1); } - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { gdesc->txd.ti = 1; - gdesc->txd.tci = vlan_tx_tag_get(skb); + gdesc->txd.tci = skb_vlan_tag_get(skb); } /* finally flips the GEN bit of the SOP desc. 
*/ diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 3a18d8ed89ca..985359dd6033 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1561,7 +1561,7 @@ static int vxlan6_xmit_skb(struct vxlan_sock *vs, min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + VXLAN_HLEN + sizeof(struct ipv6hdr) - + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); @@ -1607,7 +1607,7 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + VXLAN_HLEN + sizeof(struct iphdr) - + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 625227ad16ee..dd4ab8d73d34 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -2800,12 +2800,12 @@ static void qeth_l3_fill_header(struct qeth_card *card, struct qeth_hdr *hdr, * before we're going to overwrite this location with next hop ip. * v6 uses passthrough, v4 sets the tag in the QDIO header. */ - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { if ((ipv == 4) || (card->info.type == QETH_CARD_TYPE_IQD)) hdr->hdr.l3.ext_flags = QETH_HDR_EXT_VLAN_FRAME; else hdr->hdr.l3.ext_flags = QETH_HDR_EXT_INCLUDE_VLAN_TAG; - hdr->hdr.l3.vlan_id = vlan_tx_tag_get(skb); + hdr->hdr.l3.vlan_id = skb_vlan_tag_get(skb); } hdr->hdr.l3.length = skb->len - sizeof(struct qeth_hdr); @@ -2986,7 +2986,7 @@ static int qeth_l3_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) skb_pull(new_skb, ETH_HLEN); } - if (ipv != 4 && vlan_tx_tag_present(new_skb)) { + if (ipv != 4 && skb_vlan_tag_present(new_skb)) { skb_push(new_skb, VLAN_HLEN); skb_copy_to_linear_data(new_skb, new_skb->data + 4, 4); skb_copy_to_linear_data_offset(new_skb, 4, @@ -2995,7 +2995,7 @@ static int qeth_l3_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) new_skb->data + 12, 4); tag = (u16 *)(new_skb->data + 12); *tag = __constant_htons(ETH_P_8021Q); - *(tag + 1) = htons(vlan_tx_tag_get(new_skb)); + *(tag + 1) = htons(skb_vlan_tag_get(new_skb)); } } diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 14419a8ccbb6..bcaf4cabb858 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -469,7 +469,7 @@ static int peek_head_len(struct sock *sk) head = skb_peek(&sk->sk_receive_queue); if (likely(head)) { len = head->len; - if (vlan_tx_tag_present(head)) + if (skb_vlan_tag_present(head)) len += VLAN_HLEN; } diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 515a35e2a48a..bea465f24ebb 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -78,9 +78,9 @@ static inline bool is_vlan_dev(struct net_device *dev) return dev->priv_flags & IFF_802_1Q_VLAN; } -#define vlan_tx_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) -#define vlan_tx_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) -#define vlan_tx_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) +#define skb_vlan_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) +#define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) +#define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) /** * struct vlan_pcpu_stats - VLAN percpu rx/tx stats @@ -376,7 +376,7 @@ static inline struct sk_buff 
*vlan_insert_tag_set_proto(struct sk_buff *skb, static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb) { skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb_vlan_tag_get(skb)); if (likely(skb)) skb->vlan_tci = 0; return skb; @@ -393,7 +393,7 @@ static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb) */ static inline struct sk_buff *vlan_hwaccel_push_inside(struct sk_buff *skb) { - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) skb = __vlan_hwaccel_push_inside(skb); return skb; } @@ -442,8 +442,8 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { - if (vlan_tx_tag_present(skb)) { - *vlan_tci = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + *vlan_tci = skb_vlan_tag_get(skb); return 0; } else { *vlan_tci = 0; @@ -480,7 +480,7 @@ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) { __be16 protocol = 0; - if (vlan_tx_tag_present(skb) || + if (skb_vlan_tag_present(skb) || skb->protocol != cpu_to_be16(ETH_P_8021Q)) protocol = skb->protocol; else { diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index fe6e7aac3c56..2342bf12cb78 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -121,7 +121,7 @@ static inline __be16 tc_skb_protocol(const struct sk_buff *skb) * vlan accelerated path. In that case, use skb->vlan_proto * as the original vlan header was already stripped. */ - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) return skb->vlan_proto; return skb->protocol; } diff --git a/include/trace/events/net.h b/include/trace/events/net.h index 1de256b35807..49cc7c3de252 100644 --- a/include/trace/events/net.h +++ b/include/trace/events/net.h @@ -40,9 +40,9 @@ TRACE_EVENT(net_dev_start_xmit, __assign_str(name, dev->name); __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; - __entry->vlan_tagged = vlan_tx_tag_present(skb); + __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); - __entry->vlan_tci = vlan_tx_tag_get(skb); + __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->len = skb->len; @@ -174,9 +174,9 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, #endif __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; - __entry->vlan_tagged = vlan_tx_tag_present(skb); + __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); - __entry->vlan_tci = vlan_tx_tag_get(skb); + __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->hash = skb->hash; diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 90cc2bdd4064..61bf2a06e85d 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -9,7 +9,7 @@ bool vlan_do_receive(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; __be16 vlan_proto = skb->vlan_proto; - u16 vlan_id = vlan_tx_tag_get_id(skb); + u16 vlan_id = skb_vlan_tag_get_id(skb); struct net_device *vlan_dev; struct vlan_pcpu_stats *rx_stats; diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index c190d22b6b3d..65728e0dc4ff 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -66,17 +66,17 @@ static int brnf_pass_vlan_indev __read_mostly = 0; #endif #define IS_IP(skb) \ - (!vlan_tx_tag_present(skb) && 
skb->protocol == htons(ETH_P_IP)) + (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) #define IS_IPV6(skb) \ - (!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6)) + (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6)) #define IS_ARP(skb) \ - (!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_ARP)) + (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP)) static inline __be16 vlan_proto(const struct sk_buff *skb) { - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) return skb->protocol; else if (skb->protocol == htons(ETH_P_8021Q)) return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; @@ -436,11 +436,11 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct struct net_device *vlan, *br; br = bridge_parent(dev); - if (brnf_pass_vlan_indev == 0 || !vlan_tx_tag_present(skb)) + if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) return br; vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, - vlan_tx_tag_get(skb) & VLAN_VID_MASK); + skb_vlan_tag_get(skb) & VLAN_VID_MASK); return vlan ? vlan : br; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index aea3d1339b3f..d808d766334d 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -628,8 +628,8 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid) { int err = 0; - if (vlan_tx_tag_present(skb)) - *vid = vlan_tx_tag_get(skb) & VLAN_VID_MASK; + if (skb_vlan_tag_present(skb)) + *vid = skb_vlan_tag_get(skb) & VLAN_VID_MASK; else { *vid = 0; err = -EINVAL; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 97b8ddf57363..13013fe8db24 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -187,7 +187,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, * sent from vlan device on the bridge device, it does not have * HW accelerated vlan tag. 
*/ - if (unlikely(!vlan_tx_tag_present(skb) && + if (unlikely(!skb_vlan_tag_present(skb) && skb->protocol == proto)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) @@ -200,7 +200,7 @@ bool br_allowed_ingress(struct net_bridge *br, struct net_port_vlans *v, /* Protocol-mismatch, empty out vlan_tci for new tag */ skb_push(skb, ETH_HLEN); skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb_vlan_tag_get(skb)); if (unlikely(!skb)) return false; diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c index 8d3f8c7651f0..618568888128 100644 --- a/net/bridge/netfilter/ebt_vlan.c +++ b/net/bridge/netfilter/ebt_vlan.c @@ -45,8 +45,8 @@ ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par) /* VLAN encapsulated Type/Length field, given from orig frame */ __be16 encap; - if (vlan_tx_tag_present(skb)) { - TCI = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + TCI = skb_vlan_tag_get(skb); encap = skb->protocol; } else { const struct vlan_hdr *fp; diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index d9a8c05d995d..91180a7fc943 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -133,7 +133,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, __be16 ethproto; int verdict, i; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) ethproto = htons(ETH_P_8021Q); else ethproto = h->h_proto; diff --git a/net/core/dev.c b/net/core/dev.c index 805456147c30..1e325adc4367 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2578,7 +2578,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb) if (skb->encapsulation) features &= dev->hw_enc_features; - if (!vlan_tx_tag_present(skb)) { + if (!skb_vlan_tag_present(skb)) { if (unlikely(protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))) { struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; @@ -2659,7 +2659,7 @@ out: static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features) { - if (vlan_tx_tag_present(skb) && + if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) skb = __vlan_hwaccel_push_inside(skb); return skb; @@ -3676,7 +3676,7 @@ ncls: if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) goto drop; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; @@ -3708,8 +3708,8 @@ ncls: } } - if (unlikely(vlan_tx_tag_present(skb))) { - if (vlan_tx_tag_get_id(skb)) + if (unlikely(skb_vlan_tag_present(skb))) { + if (skb_vlan_tag_get_id(skb)) skb->pkt_type = PACKET_OTHERHOST; /* Note: we might in the future use prio bits * and set skb->priority like in vlan_do_receive() diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e0ad5d16c9c5..c126a878c47c 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -77,7 +77,7 @@ static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, features = netif_skb_features(skb); - if (vlan_tx_tag_present(skb) && + if (skb_vlan_tag_present(skb) && !vlan_hw_offload_capable(features, skb->vlan_proto)) { skb = __vlan_hwaccel_push_inside(skb); if (unlikely(!skb)) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5a2a2e887a12..56db472e9b86 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4197,7 +4197,7 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb) struct vlan_hdr *vhdr; u16 vlan_tci; - if (unlikely(vlan_tx_tag_present(skb))) { + if 
(unlikely(skb_vlan_tag_present(skb))) { /* vlan_tci is already set-up so leave this for another time */ return skb; } @@ -4283,7 +4283,7 @@ int skb_vlan_pop(struct sk_buff *skb) __be16 vlan_proto; int err; - if (likely(vlan_tx_tag_present(skb))) { + if (likely(skb_vlan_tag_present(skb))) { skb->vlan_tci = 0; } else { if (unlikely((skb->protocol != htons(ETH_P_8021Q) && @@ -4313,7 +4313,7 @@ EXPORT_SYMBOL(skb_vlan_pop); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { unsigned int offset = skb->data - skb_mac_header(skb); int err; @@ -4323,7 +4323,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) */ __skb_push(skb, offset); err = __vlan_insert_tag(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); + skb_vlan_tag_get(skb)); if (err) return err; skb->protocol = skb->vlan_proto; diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 5b52046ec7a2..23744c7a9718 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -119,7 +119,7 @@ int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) - + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); err = skb_cow_head(skb, min_headroom); if (unlikely(err)) { diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 770064c83711..b4cffe686126 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -212,7 +212,7 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) int err; err = skb_vlan_pop(skb); - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) invalidate_flow_key(key); else key->eth.tci = 0; @@ -222,7 +222,7 @@ static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_vlan *vlan) { - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) invalidate_flow_key(key); else key->eth.tci = vlan->vlan_tci; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 4e9a5f035cbc..54854e3ecd83 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -419,7 +419,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, if (!dp_ifindex) return -ENODEV; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index da2fae0873a5..df334fe43d7f 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -70,7 +70,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, { struct flow_stats *stats; int node = numa_node_id(); - int len = skb->len + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + int len = skb->len + (skb_vlan_tag_present(skb) ? 
VLAN_HLEN : 0); stats = rcu_dereference(flow->stats[node]); @@ -472,7 +472,7 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) */ key->eth.tci = 0; - if (vlan_tx_tag_present(skb)) + if (skb_vlan_tag_present(skb)) key->eth.tci = htons(skb->vlan_tci); else if (eth->h_proto == htons(ETH_P_8021Q)) if (unlikely(parse_vlan(skb, key))) diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index d4168c442db5..e9aedb7c7106 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -166,7 +166,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + tunnel_hlen + sizeof(struct iphdr) - + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { int head_delta = SKB_DATA_ALIGN(min_headroom - skb_headroom(skb) + diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 2034c6d9cb5a..464739aac0f3 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -480,7 +480,8 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, stats = this_cpu_ptr(vport->percpu_stats); u64_stats_update_begin(&stats->syncp); stats->rx_packets++; - stats->rx_bytes += skb->len + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); + stats->rx_bytes += skb->len + + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); u64_stats_update_end(&stats->syncp); OVS_CB(skb)->input_vport = vport; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 0f02668dc219..d37075b0d6d5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -986,8 +986,8 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { - if (vlan_tx_tag_present(pkc->skb)) { - ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb); + if (skb_vlan_tag_present(pkc->skb)) { + ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { @@ -2000,8 +2000,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, h.h2->tp_net = netoff; h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; - if (vlan_tx_tag_present(skb)) { - h.h2->tp_vlan_tci = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + h.h2->tp_vlan_tci = skb_vlan_tag_get(skb); h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { @@ -3010,8 +3010,8 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); - if (vlan_tx_tag_present(skb)) { - aux.tp_vlan_tci = vlan_tx_tag_get(skb); + if (skb_vlan_tag_present(skb)) { + aux.tp_vlan_tci = skb_vlan_tag_get(skb); aux.tp_vlan_tpid = ntohs(skb->vlan_proto); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 2159981b604e..b5294ce20cd4 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -176,7 +176,7 @@ META_COLLECTOR(int_vlan_tag) { unsigned short tag; - tag = vlan_tx_tag_get(skb); + tag = skb_vlan_tag_get(skb); if (!tag && __vlan_get_tag(skb, &tag)) *err = -1; else diff --git a/net/wireless/util.c b/net/wireless/util.c index d0ac795445b7..1d2fcfad06cc 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -708,8 +708,8 @@ 
unsigned int cfg80211_classify8021d(struct sk_buff *skb, if (skb->priority >= 256 && skb->priority <= 263) return skb->priority - 256; - if (vlan_tx_tag_present(skb)) { - vlan_priority = (vlan_tx_tag_get(skb) & VLAN_PRIO_MASK) + if (skb_vlan_tag_present(skb)) { + vlan_priority = (skb_vlan_tag_get(skb) & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; if (vlan_priority > 0) return vlan_priority; -- cgit v1.2.3 From f684e4ac9f4bae4e6ecff92eef9645a44764fc04 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 12 Jan 2015 00:45:55 +0100 Subject: pinctrl: pinconf-generic: loose DT dependence New pin controllers such as ACPI-based may also have custom properties to parse, and should be able to use generic pin config. Let's make the code compile on !OF systems and rename members a bit to underscore it is custom parameters and not necessarily DT parameters. This fixes a build regression for x86_64 on the zeroday kernel builds. Reported-by: kbuild test robot Reviewed-and-tested-by: Soren Brinkmann Signed-off-by: Linus Walleij --- drivers/pinctrl/pinconf-generic.c | 39 ++++++++++++++++++-------------- drivers/pinctrl/pinctrl-zynq.c | 8 +++---- drivers/pinctrl/qcom/pinctrl-spmi-gpio.c | 8 +++---- include/linux/pinctrl/pinconf-generic.h | 2 +- include/linux/pinctrl/pinctrl.h | 17 ++++++++------ 5 files changed, 41 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c index e0886665b70a..4db92f64b4de 100644 --- a/drivers/pinctrl/pinconf-generic.c +++ b/drivers/pinctrl/pinconf-generic.c @@ -113,10 +113,11 @@ void pinconf_generic_dump_pins(struct pinctrl_dev *pctldev, struct seq_file *s, pinconf_generic_dump_one(pctldev, s, gname, pin, conf_items, ARRAY_SIZE(conf_items)); /* driver-specific parameters */ - if (pctldev->desc->num_dt_params && pctldev->desc->conf_items) + if (pctldev->desc->num_custom_params && + pctldev->desc->custom_conf_items) pinconf_generic_dump_one(pctldev, s, gname, pin, - pctldev->desc->conf_items, - pctldev->desc->num_dt_params); + pctldev->desc->custom_conf_items, + pctldev->desc->num_custom_params); } void pinconf_generic_dump_config(struct pinctrl_dev *pctldev, @@ -131,21 +132,24 @@ void pinconf_generic_dump_config(struct pinctrl_dev *pctldev, pinconf_to_config_argument(config)); } - if (!pctldev->desc->num_dt_params || !pctldev->desc->conf_items) + if (!pctldev->desc->num_custom_params || + !pctldev->desc->custom_conf_items) return; - for (i = 0; i < pctldev->desc->num_dt_params; i++) { - if (pinconf_to_config_param(config) != pctldev->desc->conf_items[i].param) + for (i = 0; i < pctldev->desc->num_custom_params; i++) { + if (pinconf_to_config_param(config) != + pctldev->desc->custom_conf_items[i].param) continue; - seq_printf(s, "%s: 0x%x", pctldev->desc->conf_items[i].display, - pinconf_to_config_argument(config)); + seq_printf(s, "%s: 0x%x", + pctldev->desc->custom_conf_items[i].display, + pinconf_to_config_argument(config)); } } EXPORT_SYMBOL_GPL(pinconf_generic_dump_config); #endif #ifdef CONFIG_OF -static const struct pinconf_generic_dt_params dt_params[] = { +static const struct pinconf_generic_params dt_params[] = { { "bias-disable", PIN_CONFIG_BIAS_DISABLE, 0 }, { "bias-high-impedance", PIN_CONFIG_BIAS_HIGH_IMPEDANCE, 0 }, { "bias-bus-hold", PIN_CONFIG_BIAS_BUS_HOLD, 0 }, @@ -170,9 +174,9 @@ static const struct pinconf_generic_dt_params dt_params[] = { }; /** - * parse_dt_cfg - Parse DT pinconf parameters + * parse_dt_cfg() - Parse DT pinconf parameters * @np: DT node - * @params: Array of 
describing DT parameters + * @params: Array of describing generic parameters * @count: Number of entries in @params * @cfg: Array of parsed config options * @ncfg: Number of entries in @cfg @@ -183,7 +187,7 @@ static const struct pinconf_generic_dt_params dt_params[] = { * needs to have enough memory allocated to hold all possible entries. */ static void parse_dt_cfg(struct device_node *np, - const struct pinconf_generic_dt_params *params, + const struct pinconf_generic_params *params, unsigned int count, unsigned long *cfg, unsigned int *ncfg) { @@ -192,7 +196,7 @@ static void parse_dt_cfg(struct device_node *np, for (i = 0; i < count; i++) { u32 val; int ret; - const struct pinconf_generic_dt_params *par = ¶ms[i]; + const struct pinconf_generic_params *par = ¶ms[i]; ret = of_property_read_u32(np, par->property, &val); @@ -232,15 +236,16 @@ int pinconf_generic_parse_dt_config(struct device_node *np, /* allocate a temporary array big enough to hold one of each option */ max_cfg = ARRAY_SIZE(dt_params); if (pctldev) - max_cfg += pctldev->desc->num_dt_params; + max_cfg += pctldev->desc->num_custom_params; cfg = kcalloc(max_cfg, sizeof(*cfg), GFP_KERNEL); if (!cfg) return -ENOMEM; parse_dt_cfg(np, dt_params, ARRAY_SIZE(dt_params), cfg, &ncfg); - if (pctldev && pctldev->desc->num_dt_params && pctldev->desc->params) - parse_dt_cfg(np, pctldev->desc->params, - pctldev->desc->num_dt_params, cfg, &ncfg); + if (pctldev && pctldev->desc->num_custom_params && + pctldev->desc->custom_params) + parse_dt_cfg(np, pctldev->desc->custom_params, + pctldev->desc->num_custom_params, cfg, &ncfg); ret = 0; diff --git a/drivers/pinctrl/pinctrl-zynq.c b/drivers/pinctrl/pinctrl-zynq.c index 62534234da78..8aa05e2eb705 100644 --- a/drivers/pinctrl/pinctrl-zynq.c +++ b/drivers/pinctrl/pinctrl-zynq.c @@ -920,7 +920,7 @@ enum zynq_pin_config_param { PIN_CONFIG_IOSTANDARD = PIN_CONFIG_END + 1, }; -static const struct pinconf_generic_dt_params zynq_dt_params[] = { +static const struct pinconf_generic_params zynq_dt_params[] = { {"io-standard", PIN_CONFIG_IOSTANDARD, zynq_iostd_lvcmos18}, }; @@ -1099,9 +1099,9 @@ static struct pinctrl_desc zynq_desc = { .pctlops = &zynq_pctrl_ops, .pmxops = &zynq_pinmux_ops, .confops = &zynq_pinconf_ops, - .num_dt_params = ARRAY_SIZE(zynq_dt_params), - .params = zynq_dt_params, - .conf_items = zynq_conf_items, + .num_custom_params = ARRAY_SIZE(zynq_dt_params), + .custom_params = zynq_dt_params, + .custom_conf_items = zynq_conf_items, .owner = THIS_MODULE, }; diff --git a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c index 17f811c9c2c0..bbf99a715b63 100644 --- a/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c +++ b/drivers/pinctrl/qcom/pinctrl-spmi-gpio.c @@ -131,7 +131,7 @@ struct pmic_gpio_state { struct gpio_chip chip; }; -static const struct pinconf_generic_dt_params pmic_gpio_bindings[] = { +static const struct pinconf_generic_params pmic_gpio_bindings[] = { {"qcom,pull-up-strength", PMIC_GPIO_CONF_PULL_UP, 0}, {"qcom,drive-strength", PMIC_GPIO_CONF_STRENGTH, 0}, }; @@ -742,9 +742,9 @@ static int pmic_gpio_probe(struct platform_device *pdev) pctrldesc->name = dev_name(dev); pctrldesc->pins = pindesc; pctrldesc->npins = npins; - pctrldesc->num_dt_params = ARRAY_SIZE(pmic_gpio_bindings); - pctrldesc->params = pmic_gpio_bindings; - pctrldesc->conf_items = pmic_conf_items; + pctrldesc->num_custom_params = ARRAY_SIZE(pmic_gpio_bindings); + pctrldesc->custom_params = pmic_gpio_bindings; + pctrldesc->custom_conf_items = pmic_conf_items; for (i = 0; i < npins; 
i++, pindesc++) { pad = &pads[i]; diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 342409f7f3ec..fe65962b264f 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -162,7 +162,7 @@ static inline unsigned long pinconf_to_config_packed(enum pin_config_param param struct pinctrl_dev; struct pinctrl_map; -struct pinconf_generic_dt_params { +struct pinconf_generic_params { const char * const property; enum pin_config_param param; u32 default_value; diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index c58b3e11ba8e..66e4697516de 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -118,9 +118,12 @@ struct pinctrl_ops { * @confops: pin config operations vtable, if you support pin configuration in * your driver * @owner: module providing the pin controller, used for refcounting - * @num_dt_params: Number of driver-specific DT parameters - * @params: List of DT parameters - * @conf_items: Information how to print @params in debugfs + * @num_custom_params: Number of driver-specific custom parameters to be parsed + * from the hardware description + * @custom_params: List of driver_specific custom parameters to be parsed from + * the hardware description + * @custom_conf_items: Information how to print @params in debugfs, must be + * the same size as the @custom_params, i.e. @num_custom_params */ struct pinctrl_desc { const char *name; @@ -130,10 +133,10 @@ struct pinctrl_desc { const struct pinmux_ops *pmxops; const struct pinconf_ops *confops; struct module *owner; -#if defined(CONFIG_GENERIC_PINCONF) && defined(CONFIG_OF) - unsigned int num_dt_params; - const struct pinconf_generic_dt_params *params; - const struct pin_config_item *conf_items; +#ifdef CONFIG_GENERIC_PINCONF + unsigned int num_custom_params; + const struct pinconf_generic_params *custom_params; + const struct pin_config_item *custom_conf_items; #endif }; -- cgit v1.2.3 From 5a7d2efdd93f6c4bb6cd3d5df3d2f5611c9b87ac Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 16 Dec 2014 10:59:17 +0100 Subject: pinctrl: consumer: use correct retval for placeholder functions These functions are supposed to return an error pointer, not NULL. 
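As a caller-side illustration (hypothetical consumer code, not part of this patch): pinctrl consumers test the returned handle with IS_ERR(), so a stub returning NULL would pass that test and be carried around as if it were a valid handle, while ERR_PTR(-ENOSYS) propagates a real error code. A minimal sketch, assuming <linux/err.h> and <linux/pinctrl/consumer.h> are included:

	/* Hypothetical probe path; with the fixed stubs, a kernel built
	 * without CONFIG_PINCTRL fails here with -ENOSYS instead of
	 * silently continuing with a NULL handle.
	 */
	static int example_probe(struct device *dev)
	{
		struct pinctrl *p;
		struct pinctrl_state *s;

		p = devm_pinctrl_get(dev);
		if (IS_ERR(p))
			return PTR_ERR(p);

		s = pinctrl_lookup_state(p, "default");
		if (IS_ERR(s))
			return PTR_ERR(s);

		return pinctrl_select_state(p, s);
	}
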
Signed-off-by: Wolfram Sang Signed-off-by: Linus Walleij --- include/linux/pinctrl/consumer.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h index 18eccefea06e..72c0415d6c21 100644 --- a/include/linux/pinctrl/consumer.h +++ b/include/linux/pinctrl/consumer.h @@ -82,7 +82,7 @@ static inline int pinctrl_gpio_direction_output(unsigned gpio) static inline struct pinctrl * __must_check pinctrl_get(struct device *dev) { - return NULL; + return ERR_PTR(-ENOSYS); } static inline void pinctrl_put(struct pinctrl *p) @@ -93,7 +93,7 @@ static inline struct pinctrl_state * __must_check pinctrl_lookup_state( struct pinctrl *p, const char *name) { - return NULL; + return ERR_PTR(-ENOSYS); } static inline int pinctrl_select_state(struct pinctrl *p, @@ -104,7 +104,7 @@ static inline struct pinctrl * __must_check devm_pinctrl_get(struct device *dev) { - return NULL; + return ERR_PTR(-ENOSYS); } static inline void devm_pinctrl_put(struct pinctrl *p) -- cgit v1.2.3 From d84b6728c54dcf73bcef3e3f7cf6767e2d224e39 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 6 Jan 2015 11:45:07 -0800 Subject: locking/mcs: Better differentiate between MCS variants We have two flavors of the MCS spinlock: standard and cancelable (OSQ). While each one is independent of the other, we currently mix and match them. This patch: - Moves the OSQ code out of mcs_spinlock.h (which only deals with the traditional version) into include/linux/osq_lock.h. No unnecessary code is added to the more global header file, any locks that make use of OSQ must include it anyway. - Renames mcs_spinlock.c to osq_lock.c. This file only contains osq code. - Introduces a CONFIG_LOCK_SPIN_ON_OWNER in order to only build osq_lock if there is support for it. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: "Paul E. McKenney" Cc: Jason Low Cc: Linus Torvalds Cc: Mikulas Patocka Cc: Waiman Long Link: http://lkml.kernel.org/r/1420573509-24774-5-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- include/linux/osq_lock.h | 12 ++- kernel/Kconfig.locks | 4 + kernel/locking/Makefile | 3 +- kernel/locking/mcs_spinlock.c | 208 ------------------------------------------ kernel/locking/mcs_spinlock.h | 16 ---- kernel/locking/osq_lock.c | 203 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 219 insertions(+), 227 deletions(-) delete mode 100644 kernel/locking/mcs_spinlock.c create mode 100644 kernel/locking/osq_lock.c (limited to 'include/linux') diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h index 90230d5811c5..3a6490e81b28 100644 --- a/include/linux/osq_lock.h +++ b/include/linux/osq_lock.h @@ -5,8 +5,11 @@ * An MCS like lock especially tailored for optimistic spinning for sleeping * lock implementations (mutex, rwsem, etc). */ - -#define OSQ_UNLOCKED_VAL (0) +struct optimistic_spin_node { + struct optimistic_spin_node *next, *prev; + int locked; /* 1 if lock acquired */ + int cpu; /* encoded CPU # + 1 value */ +}; struct optimistic_spin_queue { /* @@ -16,6 +19,8 @@ struct optimistic_spin_queue { atomic_t tail; }; +#define OSQ_UNLOCKED_VAL (0) + /* Init macro and function.
*/ #define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } @@ -24,4 +29,7 @@ static inline void osq_lock_init(struct optimistic_spin_queue *lock) atomic_set(&lock->tail, OSQ_UNLOCKED_VAL); } +extern bool osq_lock(struct optimistic_spin_queue *lock); +extern void osq_unlock(struct optimistic_spin_queue *lock); + #endif diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 76768ee812b2..08561f1acd13 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -231,6 +231,10 @@ config RWSEM_SPIN_ON_OWNER def_bool y depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW +config LOCK_SPIN_ON_OWNER + def_bool y + depends on MUTEX_SPIN_ON_OWNER || RWSEM_SPIN_ON_OWNER + config ARCH_USE_QUEUE_RWLOCK bool diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8541bfdfd232..4ca8eb151975 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o +obj-y += mutex.o semaphore.o rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c deleted file mode 100644 index 9887a905a762..000000000000 --- a/kernel/locking/mcs_spinlock.c +++ /dev/null @@ -1,208 +0,0 @@ -#include -#include -#include "mcs_spinlock.h" - -#ifdef CONFIG_SMP - -/* - * An MCS like lock especially tailored for optimistic spinning for sleeping - * lock implementations (mutex, rwsem, etc). - * - * Using a single mcs node per CPU is safe because sleeping locks should not be - * called from interrupt context and we have preemption disabled while - * spinning. - */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); - -/* - * We use the value 0 to represent "no CPU", thus the encoded value - * will be the CPU number incremented by 1. - */ -static inline int encode_cpu(int cpu_nr) -{ - return cpu_nr + 1; -} - -static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) -{ - int cpu_nr = encoded_cpu_val - 1; - - return per_cpu_ptr(&osq_node, cpu_nr); -} - -/* - * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. - * Can return NULL in case we were the last queued and we updated @lock instead. - */ -static inline struct optimistic_spin_node * -osq_wait_next(struct optimistic_spin_queue *lock, - struct optimistic_spin_node *node, - struct optimistic_spin_node *prev) -{ - struct optimistic_spin_node *next = NULL; - int curr = encode_cpu(smp_processor_id()); - int old; - - /* - * If there is a prev node in queue, then the 'old' value will be - * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if - * we're currently last in queue, then the queue will then become empty. - */ - old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; - - for (;;) { - if (atomic_read(&lock->tail) == curr && - atomic_cmpxchg(&lock->tail, curr, old) == curr) { - /* - * We were the last queued, we moved @lock back. @prev - * will now observe @lock and will complete its - * unlock()/unqueue(). - */ - break; - } - - /* - * We must xchg() the @node->next value, because if we were to - * leave it in, a concurrent unlock()/unqueue() from - * @node->next might complete Step-A and think its @prev is - * still valid. 
- * - * If the concurrent unlock()/unqueue() wins the race, we'll - * wait for either @lock to point to us, through its Step-B, or - * wait for a new @node->next from its Step-C. - */ - if (node->next) { - next = xchg(&node->next, NULL); - if (next) - break; - } - - cpu_relax_lowlatency(); - } - - return next; -} - -bool osq_lock(struct optimistic_spin_queue *lock) -{ - struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); - struct optimistic_spin_node *prev, *next; - int curr = encode_cpu(smp_processor_id()); - int old; - - node->locked = 0; - node->next = NULL; - node->cpu = curr; - - old = atomic_xchg(&lock->tail, curr); - if (old == OSQ_UNLOCKED_VAL) - return true; - - prev = decode_cpu(old); - node->prev = prev; - ACCESS_ONCE(prev->next) = node; - - /* - * Normally @prev is untouchable after the above store; because at that - * moment unlock can proceed and wipe the node element from stack. - * - * However, since our nodes are static per-cpu storage, we're - * guaranteed their existence -- this allows us to apply - * cmpxchg in an attempt to undo our queueing. - */ - - while (!smp_load_acquire(&node->locked)) { - /* - * If we need to reschedule bail... so we can block. - */ - if (need_resched()) - goto unqueue; - - cpu_relax_lowlatency(); - } - return true; - -unqueue: - /* - * Step - A -- stabilize @prev - * - * Undo our @prev->next assignment; this will make @prev's - * unlock()/unqueue() wait for a next pointer since @lock points to us - * (or later). - */ - - for (;;) { - if (prev->next == node && - cmpxchg(&prev->next, node, NULL) == node) - break; - - /* - * We can only fail the cmpxchg() racing against an unlock(), - * in which case we should observe @node->locked becomming - * true. - */ - if (smp_load_acquire(&node->locked)) - return true; - - cpu_relax_lowlatency(); - - /* - * Or we race against a concurrent unqueue()'s step-B, in which - * case its step-C will write us a new @node->prev pointer. - */ - prev = ACCESS_ONCE(node->prev); - } - - /* - * Step - B -- stabilize @next - * - * Similar to unlock(), wait for @node->next or move @lock from @node - * back to @prev. - */ - - next = osq_wait_next(lock, node, prev); - if (!next) - return false; - - /* - * Step - C -- unlink - * - * @prev is stable because its still waiting for a new @prev->next - * pointer, @next is stable because our @node->next pointer is NULL and - * it will wait in Step-A. - */ - - ACCESS_ONCE(next->prev) = prev; - ACCESS_ONCE(prev->next) = next; - - return false; -} - -void osq_unlock(struct optimistic_spin_queue *lock) -{ - struct optimistic_spin_node *node, *next; - int curr = encode_cpu(smp_processor_id()); - - /* - * Fast path for the uncontended case. - */ - if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) - return; - - /* - * Second most likely case. - */ - node = this_cpu_ptr(&osq_node); - next = xchg(&node->next, NULL); - if (next) { - ACCESS_ONCE(next->locked) = 1; - return; - } - - next = osq_wait_next(lock, node, NULL); - if (next) - ACCESS_ONCE(next->locked) = 1; -} - -#endif - diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 4d60986fcbee..d1fe2ba5bac9 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -108,20 +108,4 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) arch_mcs_spin_unlock_contended(&next->locked); } -/* - * Cancellable version of the MCS lock above. - * - * Intended for adaptive spinning of sleeping locks: - * mutex_lock()/rwsem_down_{read,write}() etc. 
- */ - -struct optimistic_spin_node { - struct optimistic_spin_node *next, *prev; - int locked; /* 1 if lock acquired */ - int cpu; /* encoded CPU # value */ -}; - -extern bool osq_lock(struct optimistic_spin_queue *lock); -extern void osq_unlock(struct optimistic_spin_queue *lock); - #endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c new file mode 100644 index 000000000000..ec83d4db8ec6 --- /dev/null +++ b/kernel/locking/osq_lock.c @@ -0,0 +1,203 @@ +#include +#include +#include + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node); + +/* + * We use the value 0 to represent "no CPU", thus the encoded value + * will be the CPU number incremented by 1. + */ +static inline int encode_cpu(int cpu_nr) +{ + return cpu_nr + 1; +} + +static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) +{ + int cpu_nr = encoded_cpu_val - 1; + + return per_cpu_ptr(&osq_node, cpu_nr); +} + +/* + * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. + * Can return NULL in case we were the last queued and we updated @lock instead. + */ +static inline struct optimistic_spin_node * +osq_wait_next(struct optimistic_spin_queue *lock, + struct optimistic_spin_node *node, + struct optimistic_spin_node *prev) +{ + struct optimistic_spin_node *next = NULL; + int curr = encode_cpu(smp_processor_id()); + int old; + + /* + * If there is a prev node in queue, then the 'old' value will be + * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if + * we're currently last in queue, then the queue will then become empty. + */ + old = prev ? prev->cpu : OSQ_UNLOCKED_VAL; + + for (;;) { + if (atomic_read(&lock->tail) == curr && + atomic_cmpxchg(&lock->tail, curr, old) == curr) { + /* + * We were the last queued, we moved @lock back. @prev + * will now observe @lock and will complete its + * unlock()/unqueue(). + */ + break; + } + + /* + * We must xchg() the @node->next value, because if we were to + * leave it in, a concurrent unlock()/unqueue() from + * @node->next might complete Step-A and think its @prev is + * still valid. + * + * If the concurrent unlock()/unqueue() wins the race, we'll + * wait for either @lock to point to us, through its Step-B, or + * wait for a new @node->next from its Step-C. + */ + if (node->next) { + next = xchg(&node->next, NULL); + if (next) + break; + } + + cpu_relax_lowlatency(); + } + + return next; +} + +bool osq_lock(struct optimistic_spin_queue *lock) +{ + struct optimistic_spin_node *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_node *prev, *next; + int curr = encode_cpu(smp_processor_id()); + int old; + + node->locked = 0; + node->next = NULL; + node->cpu = curr; + + old = atomic_xchg(&lock->tail, curr); + if (old == OSQ_UNLOCKED_VAL) + return true; + + prev = decode_cpu(old); + node->prev = prev; + ACCESS_ONCE(prev->next) = node; + + /* + * Normally @prev is untouchable after the above store; because at that + * moment unlock can proceed and wipe the node element from stack. + * + * However, since our nodes are static per-cpu storage, we're + * guaranteed their existence -- this allows us to apply + * cmpxchg in an attempt to undo our queueing. 
+ */ + + while (!smp_load_acquire(&node->locked)) { + /* + * If we need to reschedule bail... so we can block. + */ + if (need_resched()) + goto unqueue; + + cpu_relax_lowlatency(); + } + return true; + +unqueue: + /* + * Step - A -- stabilize @prev + * + * Undo our @prev->next assignment; this will make @prev's + * unlock()/unqueue() wait for a next pointer since @lock points to us + * (or later). + */ + + for (;;) { + if (prev->next == node && + cmpxchg(&prev->next, node, NULL) == node) + break; + + /* + * We can only fail the cmpxchg() racing against an unlock(), + * in which case we should observe @node->locked becomming + * true. + */ + if (smp_load_acquire(&node->locked)) + return true; + + cpu_relax_lowlatency(); + + /* + * Or we race against a concurrent unqueue()'s step-B, in which + * case its step-C will write us a new @node->prev pointer. + */ + prev = ACCESS_ONCE(node->prev); + } + + /* + * Step - B -- stabilize @next + * + * Similar to unlock(), wait for @node->next or move @lock from @node + * back to @prev. + */ + + next = osq_wait_next(lock, node, prev); + if (!next) + return false; + + /* + * Step - C -- unlink + * + * @prev is stable because its still waiting for a new @prev->next + * pointer, @next is stable because our @node->next pointer is NULL and + * it will wait in Step-A. + */ + + ACCESS_ONCE(next->prev) = prev; + ACCESS_ONCE(prev->next) = next; + + return false; +} + +void osq_unlock(struct optimistic_spin_queue *lock) +{ + struct optimistic_spin_node *node, *next; + int curr = encode_cpu(smp_processor_id()); + + /* + * Fast path for the uncontended case. + */ + if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr)) + return; + + /* + * Second most likely case. + */ + node = this_cpu_ptr(&osq_node); + next = xchg(&node->next, NULL); + if (next) { + ACCESS_ONCE(next->locked) = 1; + return; + } + + next = osq_wait_next(lock, node, NULL); + if (next) + ACCESS_ONCE(next->locked) = 1; +} -- cgit v1.2.3 From 86038c5ea81b519a8a1fcfcd5e4599aab0cdd119 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 16 Dec 2014 12:47:34 +0100 Subject: perf: Avoid horrible stack usage Both Linus (most recent) and Steve (a while ago) reported that perf related callbacks have massive stack bloat. The problem is that software events need a pt_regs in order to properly report the event location and unwind stack. And because we could not assume one was present we allocated one on stack and filled it with minimal bits required for operation. Now, pt_regs is quite large, so this is undesirable. Furthermore it turns out that most sites actually have a pt_regs pointer available, making this even more onerous, as the stack space is pointless waste. This patch addresses the problem by observing that software events have well defined nesting semantics, therefore we can use static per-cpu storage instead of on-stack. Linus made the further observation that all but the scheduler callers of perf_sw_event() have a pt_regs available, so we change the regular perf_sw_event() to require a valid pt_regs (where it used to be optional) and add perf_sw_event_sched() for the scheduler. We have a scheduler specific call instead of a more generic _noregs() like construct because we can assume non-recursion from the scheduler and thereby simplify the code further (_noregs would have to put the recursion context call inline in order to assertain which __perf_regs element to use). 
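For illustration only, the generic _noregs() construct alluded to above could look roughly like this (a hypothetical sketch assembled from the helpers this patch touches; the patch deliberately adds only the scheduler-specific variant instead):

	static __always_inline void
	perf_sw_event_noregs(u32 event_id, u64 nr, u64 addr)
	{
		struct pt_regs *regs;
		int rctx;

		if (!static_key_false(&perf_swevent_enabled[event_id]))
			return;

		preempt_disable_notrace();
		/* The recursion context doubles as the index into the
		 * per-cpu pt_regs slots, so the four nesting levels
		 * (task, softirq, hardirq, NMI) never clobber each
		 * other's saved registers.
		 */
		rctx = perf_swevent_get_recursion_context();
		if (unlikely(rctx < 0))
			goto out;

		regs = this_cpu_ptr(&__perf_regs[rctx]);
		perf_fetch_caller_regs(regs);
		___perf_sw_event(event_id, nr, regs, addr);

		perf_swevent_put_recursion_context(rctx);
	out:
		preempt_enable_notrace();
	}

The scheduler-only perf_sw_event_sched() below can instead hard-code slot 0, because the scheduler is known not to recurse into other software events.
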
One last note on the implementation of perf_trace_buf_prepare(); we allow .regs = NULL for those cases where we already have a pt_regs pointer available and do not need another. Reported-by: Linus Torvalds Reported-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Javi Merino Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Petr Mladek Cc: Steven Rostedt Cc: Tom Zanussi Cc: Vaibhav Nagarnaik Link: http://lkml.kernel.org/r/20141216115041.GW3337@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 2 +- include/linux/perf_event.h | 28 +++++++++++++++++++++------- include/trace/ftrace.h | 7 ++++--- kernel/events/core.c | 23 +++++++++++++++++------ kernel/sched/core.c | 2 +- kernel/trace/trace_event_perf.c | 4 +++- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_syscalls.c | 4 ++-- kernel/trace/trace_uprobe.c | 2 +- 9 files changed, 52 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 0bebb5c348b8..d36f68b08acc 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -595,7 +595,7 @@ extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); extern void ftrace_profile_free_filter(struct perf_event *event); extern void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp); + struct pt_regs **regs, int *rctxp); static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4f7a61ca4b39..3a7bd80b4db8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -665,6 +665,7 @@ static inline int is_software_event(struct perf_event *event) extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64); extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); #ifndef perf_arch_fetch_caller_regs @@ -689,14 +690,25 @@ static inline void perf_fetch_caller_regs(struct pt_regs *regs) static __always_inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { - struct pt_regs hot_regs; + if (static_key_false(&perf_swevent_enabled[event_id])) + __perf_sw_event(event_id, nr, regs, addr); +} + +DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]); +/* + * 'Special' version for the scheduler, it hard assumes no recursion, + * which is guaranteed by us not actually scheduling inside other swevents + * because those disable preemption. 
+ */ +static __always_inline void +perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) +{ if (static_key_false(&perf_swevent_enabled[event_id])) { - if (!regs) { - perf_fetch_caller_regs(&hot_regs); - regs = &hot_regs; - } - __perf_sw_event(event_id, nr, regs, addr); + struct pt_regs *regs = this_cpu_ptr(&__perf_regs[0]); + + perf_fetch_caller_regs(regs); + ___perf_sw_event(event_id, nr, regs, addr); } } @@ -712,7 +724,7 @@ static inline void perf_event_task_sched_in(struct task_struct *prev, static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); + perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0); if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_out(prev, next); @@ -823,6 +835,8 @@ static inline int perf_event_refresh(struct perf_event *event, int refresh) static inline void perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { } static inline void +perf_sw_event_sched(u32 event_id, u64 nr, u64 addr) { } +static inline void perf_bp_event(struct perf_event *event, void *data) { } static inline int perf_register_guest_info_callbacks diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 139b5067345b..27609dfcce25 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -763,7 +763,7 @@ perf_trace_##call(void *__data, proto) \ struct ftrace_event_call *event_call = __data; \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ struct ftrace_raw_##call *entry; \ - struct pt_regs __regs; \ + struct pt_regs *__regs; \ u64 __addr = 0, __count = 1; \ struct task_struct *__task = NULL; \ struct hlist_head *head; \ @@ -782,18 +782,19 @@ perf_trace_##call(void *__data, proto) \ sizeof(u64)); \ __entry_size -= sizeof(u32); \ \ - perf_fetch_caller_regs(&__regs); \ entry = perf_trace_buf_prepare(__entry_size, \ event_call->event.type, &__regs, &rctx); \ if (!entry) \ return; \ \ + perf_fetch_caller_regs(__regs); \ + \ tstruct \ \ { assign; } \ \ perf_trace_buf_submit(entry, __entry_size, rctx, __addr, \ - __count, &__regs, head, __task); \ + __count, __regs, head, __task); \ } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 882f835a0d85..c10124b772c4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5889,6 +5889,8 @@ end: rcu_read_unlock(); } +DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]); + int perf_swevent_get_recursion_context(void) { struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable); @@ -5904,21 +5906,30 @@ inline void perf_swevent_put_recursion_context(int rctx) put_recursion_context(swhash->recursion, rctx); } -void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct perf_sample_data data; - int rctx; - preempt_disable_notrace(); - rctx = perf_swevent_get_recursion_context(); - if (rctx < 0) + if (WARN_ON_ONCE(!regs)) return; perf_sample_data_init(&data, addr, 0); - do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs); +} + +void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) +{ + int rctx; + + preempt_disable_notrace(); + rctx = perf_swevent_get_recursion_context(); + if (unlikely(rctx < 0)) + goto fail; + + ___perf_sw_event(event_id, nr, regs, addr); perf_swevent_put_recursion_context(rctx); +fail: preempt_enable_notrace(); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c0accc00566e..d22fb16a7153 100644 --- 
a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1082,7 +1082,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); + perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); } __set_task_cpu(p, new_cpu); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 4b9c114ee9de..6fa484de2ba1 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -261,7 +261,7 @@ void perf_trace_del(struct perf_event *p_event, int flags) } void *perf_trace_buf_prepare(int size, unsigned short type, - struct pt_regs *regs, int *rctxp) + struct pt_regs **regs, int *rctxp) { struct trace_entry *entry; unsigned long flags; @@ -280,6 +280,8 @@ void *perf_trace_buf_prepare(int size, unsigned short type, if (*rctxp < 0) return NULL; + if (regs) + *regs = this_cpu_ptr(&__perf_regs[*rctxp]); raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); /* zero the dead bytes from align to not leak stack to user */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5edb518be345..296079ae6583 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1148,7 +1148,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) return; @@ -1179,7 +1179,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) return; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index c6ee36fcbf90..f97f6e3a676c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -574,7 +574,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) size -= sizeof(u32); rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, - sys_data->enter_event->event.type, regs, &rctx); + sys_data->enter_event->event.type, NULL, &rctx); if (!rec) return; @@ -647,7 +647,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) size -= sizeof(u32); rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, - sys_data->exit_event->event.type, regs, &rctx); + sys_data->exit_event->event.type, NULL, &rctx); if (!rec) return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8520acc34b18..b11441321e7a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1111,7 +1111,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, if (hlist_empty(head)) goto out; - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + entry = perf_trace_buf_prepare(size, call->event.type, NULL, &rctx); if (!entry) goto out; -- cgit v1.2.3 From a2b12f3c7ac1ea43ae646db74faf0b56c2bba563 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 12 Jan 2015 17:00:37 -0800 Subject: udp: pass udp_offload struct to UDP gro callbacks This patch introduces udp_offload_callbacks which has the same GRO functions (but not a GSO function) as offload_callbacks, except that each callback takes an additional argument: a pointer to the
udp_offload struct passed to gro_receive and gro_complete functions. This additional argument can be used to retrieve the per port structure of the encapsulation for use in gro processing (mostly by doing container_of on the structure). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 7 +++++-- include/linux/netdevice.h | 15 +++++++++++++-- net/ipv4/fou.c | 12 ++++++++---- net/ipv4/geneve.c | 6 ++++-- net/ipv4/udp_offload.c | 7 +++++-- 5 files changed, 35 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 985359dd6033..5c56a3ff25aa 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -539,7 +539,9 @@ static int vxlan_fdb_append(struct vxlan_fdb *f, return 1; } -static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff *skb) +static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, + struct sk_buff *skb, + struct udp_offload *uoff) { struct sk_buff *p, **pp = NULL; struct vxlanhdr *vh, *vh2; @@ -578,7 +580,8 @@ out: return pp; } -static int vxlan_gro_complete(struct sk_buff *skb, int nhoff) +static int vxlan_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) { udp_tunnel_gro_complete(skb, nhoff); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 679e6e90aa4c..47921c291dd6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1969,7 +1969,7 @@ struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); struct sk_buff **(*gro_receive)(struct sk_buff **head, - struct sk_buff *skb); + struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); }; @@ -1979,10 +1979,21 @@ struct packet_offload { struct list_head list; }; +struct udp_offload; + +struct udp_offload_callbacks { + struct sk_buff **(*gro_receive)(struct sk_buff **head, + struct sk_buff *skb, + struct udp_offload *uoff); + int (*gro_complete)(struct sk_buff *skb, + int nhoff, + struct udp_offload *uoff); +}; + struct udp_offload { __be16 port; u8 ipproto; - struct offload_callbacks callbacks; + struct udp_offload_callbacks callbacks; }; /* often modified stats are per cpu, other are shared (netdev->stats) */ diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 2197c36f722f..3bc0cf07661c 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -174,7 +174,8 @@ drop: } static struct sk_buff **fou_gro_receive(struct sk_buff **head, - struct sk_buff *skb) + struct sk_buff *skb, + struct udp_offload *uoff) { const struct net_offload *ops; struct sk_buff **pp = NULL; @@ -195,7 +196,8 @@ out_unlock: return pp; } -static int fou_gro_complete(struct sk_buff *skb, int nhoff) +static int fou_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) { const struct net_offload *ops; u8 proto = NAPI_GRO_CB(skb)->proto; @@ -254,7 +256,8 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, } static struct sk_buff **gue_gro_receive(struct sk_buff **head, - struct sk_buff *skb) + struct sk_buff *skb, + struct udp_offload *uoff) { const struct net_offload **offloads; const struct net_offload *ops; @@ -360,7 +363,8 @@ out: return pp; } -static int gue_gro_complete(struct sk_buff *skb, int nhoff) +static int gue_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) { const struct net_offload **offloads; struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); diff --git a/net/ipv4/geneve.c b/net/ipv4/geneve.c index 
23744c7a9718..9568594ca2f1 100644 --- a/net/ipv4/geneve.c +++ b/net/ipv4/geneve.c @@ -147,7 +147,8 @@ static int geneve_hlen(struct genevehdr *gh) } static struct sk_buff **geneve_gro_receive(struct sk_buff **head, - struct sk_buff *skb) + struct sk_buff *skb, + struct udp_offload *uoff) { struct sk_buff *p, **pp = NULL; struct genevehdr *gh, *gh2; @@ -211,7 +212,8 @@ out: return pp; } -static int geneve_gro_complete(struct sk_buff *skb, int nhoff) +static int geneve_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) { struct genevehdr *gh; struct packet_offload *ptype; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index d3e537ef6b7f..d10f6f4ead27 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -339,7 +339,8 @@ unflush: skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; - pp = uo_priv->offload->callbacks.gro_receive(head, skb); + pp = uo_priv->offload->callbacks.gro_receive(head, skb, + uo_priv->offload); out_unlock: rcu_read_unlock(); @@ -395,7 +396,9 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) if (uo_priv != NULL) { NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; - err = uo_priv->offload->callbacks.gro_complete(skb, nhoff + sizeof(struct udphdr)); + err = uo_priv->offload->callbacks.gro_complete(skb, + nhoff + sizeof(struct udphdr), + uo_priv->offload); } rcu_read_unlock(); -- cgit v1.2.3 From c38fda3fe8163898f538a45d3c1419e6870625ed Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 14 Jan 2015 22:59:13 -0800 Subject: jbd: drop jbd_ENOSYS debug A quick search shows that there are no users, drop the macro for both jbd and jbd2. Signed-off-by: Davidlohr Bueso Cc: Jan Kara Signed-off-by: Jan Kara --- include/linux/jbd.h | 9 --------- include/linux/jbd2.h | 9 --------- 2 files changed, 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 31229e0be90b..d32615280be9 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -956,15 +956,6 @@ void __log_wait_for_space(journal_t *journal); extern void __journal_drop_transaction(journal_t *, transaction_t *); extern int cleanup_journal_tail(journal_t *); -/* Debugging code only: */ - -#define jbd_ENOSYS() \ -do { \ - printk (KERN_ERR "JBD unimplemented function %s\n", __func__); \ - current->state = TASK_UNINTERRUPTIBLE; \ - schedule(); \ -} while (1) - /* * is_journal_abort * diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 704b9a599b26..20e7f78041c8 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1251,15 +1251,6 @@ void __jbd2_log_wait_for_space(journal_t *journal); extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); extern int jbd2_cleanup_journal_tail(journal_t *); -/* Debugging code only: */ - -#define jbd_ENOSYS() \ -do { \ - printk (KERN_ERR "JBD unimplemented function %s\n", __func__); \ - current->state = TASK_UNINTERRUPTIBLE; \ - schedule(); \ -} while (1) - /* * is_journal_abort * -- cgit v1.2.3 From 37715f556a0776356300391f8ac41ace91bea447 Mon Sep 17 00:00:00 2001 From: David Ung Date: Tue, 13 Jan 2015 19:04:25 -0800 Subject: video: fbdev: Add additional vesa modes Add high resolution modes to vesa_modes struct. 
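As a quick sanity check on the new entries below (a standalone sketch; it only assumes that fb_videomode.pixclock is a pixel-clock period in picoseconds, which is what the PICOS2KHZ()-style conversion in fb.h expects):

#include <stdio.h>

/* The new 1920x1200-60 RB entry carries pixclock = 6493 picoseconds;
 * CVT reduced blanking specifies a 154.00 MHz pixel clock for that mode. */
int main(void)
{
	unsigned long pixclock_ps = 6493;			/* from the mode entry */
	unsigned long khz = 1000000000UL / pixclock_ps;		/* PICOS2KHZ-style */

	printf("pixel clock ~ %lu kHz (~%.2f MHz)\n", khz, khz / 1000.0);
	return 0;
}

This prints roughly 154012 kHz, i.e. ~154 MHz, matching the expected reduced-blanking timing.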
Signed-off-by: David Ung Signed-off-by: Tomi Valkeinen --- drivers/video/fbdev/core/modedb.c | 27 +++++++++++++++++++++++++++ include/linux/fb.h | 2 +- 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/modedb.c b/drivers/video/fbdev/core/modedb.c index 388f7971494b..0b57c1df73e3 100644 --- a/drivers/video/fbdev/core/modedb.c +++ b/drivers/video/fbdev/core/modedb.c @@ -468,6 +468,33 @@ const struct fb_videomode vesa_modes[] = { /* 33 1920x1440-75 VESA */ { NULL, 75, 1920, 1440, 3367, 352, 144, 56, 1, 224, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 34 1920x1200-60 RB VESA */ + { NULL, 60, 1920, 1200, 6493, 80, 48, 26, 3, 32, 6, + FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 35 1920x1200-60 VESA */ + { NULL, 60, 1920, 1200, 5174, 336, 136, 36, 3, 200, 6, + FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 36 1920x1200-75 VESA */ + { NULL, 75, 1920, 1200, 4077, 344, 136, 46, 3, 208, 6, + FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 37 1920x1200-85 VESA */ + { NULL, 85, 1920, 1200, 3555, 352, 144, 53, 3, 208, 6, + FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 38 2560x1600-60 RB VESA */ + { NULL, 60, 2560, 1600, 3724, 80, 48, 37, 3, 32, 6, + FB_SYNC_HOR_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 39 2560x1600-60 VESA */ + { NULL, 60, 2560, 1600, 2869, 472, 192, 49, 3, 280, 6, + FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 40 2560x1600-75 VESA */ + { NULL, 75, 2560, 1600, 2256, 488, 208, 63, 3, 280, 6, + FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 41 2560x1600-85 VESA */ + { NULL, 85, 2560, 1600, 1979, 488, 208, 73, 3, 280, 6, + FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, + /* 42 2560x1600-120 RB VESA */ + { NULL, 120, 2560, 1600, 1809, 80, 48, 85, 3, 32, 6, + FB_SYNC_HOR_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, }; EXPORT_SYMBOL(vesa_modes); #endif /* CONFIG_FB_MODE_HELPERS */ diff --git a/include/linux/fb.h b/include/linux/fb.h index 09bb7a18d287..882dbd1a87b7 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -726,7 +726,7 @@ extern int fb_videomode_from_videomode(const struct videomode *vm, struct fb_videomode *fbmode); /* drivers/video/modedb.c */ -#define VESA_MODEDB_SIZE 34 +#define VESA_MODEDB_SIZE 43 extern void fb_var_to_videomode(struct fb_videomode *mode, const struct fb_var_screeninfo *var); extern void fb_videomode_to_var(struct fb_var_screeninfo *var, -- cgit v1.2.3 From 8f5ee77bb8d162abe28ff8cd56f36e825d143207 Mon Sep 17 00:00:00 2001 From: David Ung Date: Tue, 13 Jan 2015 19:04:26 -0800 Subject: video: fbdev: Check Standard Timing against DMT Add the VESA Display Monitor Timing (DMT) table. During parsing of the Standard Timings, it compares the 2-byte STD code against the DMT table to see which VESA mode it should be. If there is no entry in the vesa_modes table or no match is found, it falls back to the GTF timings.
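For reference, here is a standalone sketch of how such a 2-byte Standard Timing code decodes; it mirrors the EDID 1.3 rules used in get_std_timing() below and is an illustration, not part of the patch:

#include <stdio.h>

/* Decode an EDID Standard Timing code like the std_2byte_code values in
 * the dmt_modes table added by this patch. Assumes EDID >= 1.3, where
 * aspect code 0 means 16:10. */
static void decode_std(unsigned short code)
{
	unsigned char b0 = code >> 8, b1 = code & 0xff;
	int xres = (b0 + 31) * 8;
	int yres;

	switch (b1 >> 6) {
	case 0: yres = xres * 10 / 16; break;	/* 16:10 */
	case 1: yres = xres * 3 / 4;   break;	/*  4:3  */
	case 2: yres = xres * 4 / 5;   break;	/*  5:4  */
	default: yres = xres * 9 / 16; break;	/* 16:9  */
	}
	printf("0x%04x -> %dx%d@%dHz\n", code, xres, yres, (b1 & 0x3f) + 60);
}

int main(void)
{
	decode_std(0xd100);	/* 1920x1200@60, DMT ID 0x45 in the table */
	decode_std(0x3140);	/* 640x480@60,   DMT ID 0x04 in the table */
	return 0;
}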
Signed-off-by: David Ung Signed-off-by: Tomi Valkeinen --- drivers/video/fbdev/core/fbmon.c | 76 +++++++++++++++++++---------------- drivers/video/fbdev/core/modedb.c | 84 +++++++++++++++++++++++++++++++++++++++ include/linux/fb.h | 10 +++++ 3 files changed, 136 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/fbmon.c b/drivers/video/fbdev/core/fbmon.c index 5b0e313849bd..0f234c15ece3 100644 --- a/drivers/video/fbdev/core/fbmon.c +++ b/drivers/video/fbdev/core/fbmon.c @@ -498,44 +498,52 @@ static int get_est_timing(unsigned char *block, struct fb_videomode *mode) static int get_std_timing(unsigned char *block, struct fb_videomode *mode, int ver, int rev) { - int xres, yres = 0, refresh, ratio, i; - - xres = (block[0] + 31) * 8; - if (xres <= 256) - return 0; + int i; - ratio = (block[1] & 0xc0) >> 6; - switch (ratio) { - case 0: - /* in EDID 1.3 the meaning of 0 changed to 16:10 (prior 1:1) */ - if (ver < 1 || (ver == 1 && rev < 3)) - yres = xres; - else - yres = (xres * 10)/16; - break; - case 1: - yres = (xres * 3)/4; - break; - case 2: - yres = (xres * 4)/5; - break; - case 3: - yres = (xres * 9)/16; - break; + for (i = 0; i < DMT_SIZE; i++) { + u32 std_2byte_code = block[0] << 8 | block[1]; + if (std_2byte_code == dmt_modes[i].std_2byte_code) + break; } - refresh = (block[1] & 0x3f) + 60; - - DPRINTK(" %dx%d@%dHz\n", xres, yres, refresh); - for (i = 0; i < VESA_MODEDB_SIZE; i++) { - if (vesa_modes[i].xres == xres && - vesa_modes[i].yres == yres && - vesa_modes[i].refresh == refresh) { - *mode = vesa_modes[i]; - mode->flag |= FB_MODE_IS_STANDARD; - return 1; + + if (i < DMT_SIZE && dmt_modes[i].mode) { + /* DMT mode found */ + *mode = *dmt_modes[i].mode; + mode->flag |= FB_MODE_IS_STANDARD; + DPRINTK(" DMT id=%d\n", dmt_modes[i].dmt_id); + + } else { + int xres, yres = 0, refresh, ratio; + + xres = (block[0] + 31) * 8; + if (xres <= 256) + return 0; + + ratio = (block[1] & 0xc0) >> 6; + switch (ratio) { + case 0: + /* in EDID 1.3 the meaning of 0 changed to 16:10 (prior 1:1) */ + if (ver < 1 || (ver == 1 && rev < 3)) + yres = xres; + else + yres = (xres * 10)/16; + break; + case 1: + yres = (xres * 3)/4; + break; + case 2: + yres = (xres * 4)/5; + break; + case 3: + yres = (xres * 9)/16; + break; } + refresh = (block[1] & 0x3f) + 60; + DPRINTK(" %dx%d@%dHz\n", xres, yres, refresh); + + calc_mode_timings(xres, yres, refresh, mode); } - calc_mode_timings(xres, yres, refresh, mode); + return 1; } diff --git a/drivers/video/fbdev/core/modedb.c b/drivers/video/fbdev/core/modedb.c index 0b57c1df73e3..858a97e03200 100644 --- a/drivers/video/fbdev/core/modedb.c +++ b/drivers/video/fbdev/core/modedb.c @@ -497,6 +497,90 @@ const struct fb_videomode vesa_modes[] = { FB_SYNC_HOR_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, }; EXPORT_SYMBOL(vesa_modes); + +const struct dmt_videomode dmt_modes[DMT_SIZE] = { + { 0x01, 0x0000, 0x000000, &vesa_modes[0] }, + { 0x02, 0x3119, 0x000000, &vesa_modes[1] }, + { 0x03, 0x0000, 0x000000, &vesa_modes[2] }, + { 0x04, 0x3140, 0x000000, &vesa_modes[3] }, + { 0x05, 0x314c, 0x000000, &vesa_modes[4] }, + { 0x06, 0x314f, 0x000000, &vesa_modes[5] }, + { 0x07, 0x3159, 0x000000, &vesa_modes[6] }, + { 0x08, 0x0000, 0x000000, &vesa_modes[7] }, + { 0x09, 0x4540, 0x000000, &vesa_modes[8] }, + { 0x0a, 0x454c, 0x000000, &vesa_modes[9] }, + { 0x0b, 0x454f, 0x000000, &vesa_modes[10] }, + { 0x0c, 0x4559, 0x000000, &vesa_modes[11] }, + { 0x0d, 0x0000, 0x000000, 0 }, + { 0x0e, 0x0000, 0x000000, 0 }, + { 0x0f, 0x0000, 0x000000, 
&vesa_modes[12] }, + { 0x10, 0x6140, 0x000000, &vesa_modes[13] }, + { 0x11, 0x614a, 0x000000, &vesa_modes[14] }, + { 0x12, 0x614f, 0x000000, &vesa_modes[15] }, + { 0x13, 0x6159, 0x000000, &vesa_modes[16] }, + { 0x14, 0x0000, 0x000000, 0 }, + { 0x15, 0x714f, 0x000000, &vesa_modes[17] }, + { 0x16, 0x0000, 0x7f1c21, 0 }, + { 0x17, 0x0000, 0x7f1c28, 0 }, + { 0x18, 0x0000, 0x7f1c44, 0 }, + { 0x19, 0x0000, 0x7f1c62, 0 }, + { 0x1a, 0x0000, 0x000000, 0 }, + { 0x1b, 0x0000, 0x8f1821, 0 }, + { 0x1c, 0x8100, 0x8f1828, 0 }, + { 0x1d, 0x810f, 0x8f1844, 0 }, + { 0x1e, 0x8119, 0x8f1862, 0 }, + { 0x1f, 0x0000, 0x000000, 0 }, + { 0x20, 0x8140, 0x000000, &vesa_modes[18] }, + { 0x21, 0x8159, 0x000000, &vesa_modes[19] }, + { 0x22, 0x0000, 0x000000, 0 }, + { 0x23, 0x8180, 0x000000, &vesa_modes[20] }, + { 0x24, 0x818f, 0x000000, &vesa_modes[21] }, + { 0x25, 0x8199, 0x000000, &vesa_modes[22] }, + { 0x26, 0x0000, 0x000000, 0 }, + { 0x27, 0x0000, 0x000000, 0 }, + { 0x28, 0x0000, 0x000000, 0 }, + { 0x29, 0x0000, 0x0c2021, 0 }, + { 0x2a, 0x9040, 0x0c2028, 0 }, + { 0x2b, 0x904f, 0x0c2044, 0 }, + { 0x2c, 0x9059, 0x0c2062, 0 }, + { 0x2d, 0x0000, 0x000000, 0 }, + { 0x2e, 0x9500, 0xc11821, 0 }, + { 0x2f, 0x9500, 0xc11828, 0 }, + { 0x30, 0x950f, 0xc11844, 0 }, + { 0x31, 0x9519, 0xc11868, 0 }, + { 0x32, 0x0000, 0x000000, 0 }, + { 0x33, 0xa940, 0x000000, &vesa_modes[23] }, + { 0x34, 0xa945, 0x000000, &vesa_modes[24] }, + { 0x35, 0xa94a, 0x000000, &vesa_modes[25] }, + { 0x36, 0xa94f, 0x000000, &vesa_modes[26] }, + { 0x37, 0xa959, 0x000000, &vesa_modes[27] }, + { 0x38, 0x0000, 0x000000, 0 }, + { 0x39, 0x0000, 0x0c2821, 0 }, + { 0x3a, 0xb300, 0x0c2828, 0 }, + { 0x3b, 0xb30f, 0x0c2844, 0 }, + { 0x3c, 0xb319, 0x0c2868, 0 }, + { 0x3d, 0x0000, 0x000000, 0 }, + { 0x3e, 0xc140, 0x000000, &vesa_modes[28] }, + { 0x3f, 0xc14f, 0x000000, &vesa_modes[29] }, + { 0x40, 0x0000, 0x000000, 0 }, + { 0x41, 0xc940, 0x000000, &vesa_modes[30] }, + { 0x42, 0xc94f, 0x000000, &vesa_modes[31] }, + { 0x43, 0x0000, 0x000000, 0 }, + { 0x44, 0x0000, 0x572821, &vesa_modes[34] }, + { 0x45, 0xd100, 0x572828, &vesa_modes[35] }, + { 0x46, 0xd10f, 0x572844, &vesa_modes[36] }, + { 0x47, 0xd119, 0x572862, &vesa_modes[37] }, + { 0x48, 0x0000, 0x000000, 0 }, + { 0x49, 0xd140, 0x000000, &vesa_modes[32] }, + { 0x4a, 0xd14f, 0x000000, &vesa_modes[33] }, + { 0x4b, 0x0000, 0x000000, 0 }, + { 0x4c, 0x0000, 0x1f3821, &vesa_modes[38] }, + { 0x4d, 0x0000, 0x1f3828, &vesa_modes[39] }, + { 0x4e, 0x0000, 0x1f3844, &vesa_modes[40] }, + { 0x4f, 0x0000, 0x1f3862, &vesa_modes[41] }, + { 0x50, 0x0000, 0x000000, &vesa_modes[42] }, +}; +EXPORT_SYMBOL(dmt_modes); #endif /* CONFIG_FB_MODE_HELPERS */ /** diff --git a/include/linux/fb.h b/include/linux/fb.h index 882dbd1a87b7..043f3283b71c 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -727,6 +727,8 @@ extern int fb_videomode_from_videomode(const struct videomode *vm, /* drivers/video/modedb.c */ #define VESA_MODEDB_SIZE 43 +#define DMT_SIZE 0x50 + extern void fb_var_to_videomode(struct fb_videomode *mode, const struct fb_var_screeninfo *var); extern void fb_videomode_to_var(struct fb_var_screeninfo *var, @@ -777,9 +779,17 @@ struct fb_videomode { u32 flag; }; +struct dmt_videomode { + u32 dmt_id; + u32 std_2byte_code; + u32 cvt_3byte_code; + const struct fb_videomode *mode; +}; + extern const char *fb_mode_option; extern const struct fb_videomode vesa_modes[]; extern const struct fb_videomode cea_modes[64]; +extern const struct dmt_videomode dmt_modes[]; struct fb_modelist { struct list_head list; -- cgit v1.2.3 From 
d621e8bae5ac9c67de4de90c5cded12adc8ee1e1 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Delgado Date: Wed, 17 Dec 2014 16:51:13 +0100 Subject: gpio/gpiolib-of: Create of_mm_gpiochip_remove Create counterpart of of_mm_gpiochip_add(). This way, modules that can be removed do not have to duplicate the cleanup code. Suggested-by: Alexandre Courbot Signed-off-by: Ricardo Ribalda Delgado Signed-off-by: Linus Walleij --- drivers/gpio/gpiolib-of.c | 17 +++++++++++++++++ include/linux/of_gpio.h | 1 + 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpio/gpiolib-of.c b/drivers/gpio/gpiolib-of.c index 604dbe60bdee..3e2c6afeab11 100644 --- a/drivers/gpio/gpiolib-of.c +++ b/drivers/gpio/gpiolib-of.c @@ -204,6 +204,23 @@ err0: } EXPORT_SYMBOL(of_mm_gpiochip_add); +/** + * of_mm_gpiochip_remove - Remove memory mapped GPIO chip (bank) + * @mm_gc: pointer to the of_mm_gpio_chip allocated structure + */ +void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc) +{ + if (!mm_gc) + return; + + gpiochip_remove(&mm_gc->gc); + iounmap(mm_gc->regs); + kfree(mm_gc->gc.label); +} +EXPORT_SYMBOL(of_mm_gpiochip_remove); + #ifdef CONFIG_PINCTRL static void of_gpiochip_add_pin_range(struct gpio_chip *chip) { diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index 38fc05036015..69dbe312b11b 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -52,6 +52,7 @@ extern int of_get_named_gpio_flags(struct device_node *np, extern int of_mm_gpiochip_add(struct device_node *np, struct of_mm_gpio_chip *mm_gc); +extern void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc); extern void of_gpiochip_add(struct gpio_chip *gc); extern void of_gpiochip_remove(struct gpio_chip *gc); -- cgit v1.2.3 From 53e41f554a0cbad139ee5072bbb49b4951f680c2 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 15 Dec 2014 10:39:47 +0100 Subject: gpio: tc3589x: get rid of platform data This device is only used from the device tree, and the startup() and remove() callbacks are not used anywhere in the kernel, so retire them and the pdata altogether.
Cc: Samuel Ortiz Acked-by: Lee Jones Signed-off-by: Linus Walleij --- drivers/gpio/gpio-tc3589x.c | 15 ++------------- include/linux/mfd/tc3589x.h | 12 ------------ 2 files changed, 2 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-tc3589x.c b/drivers/gpio/gpio-tc3589x.c index abdcf58935f5..11aed2671065 100644 --- a/drivers/gpio/gpio-tc3589x.c +++ b/drivers/gpio/gpio-tc3589x.c @@ -232,16 +232,13 @@ static irqreturn_t tc3589x_gpio_irq(int irq, void *dev) static int tc3589x_gpio_probe(struct platform_device *pdev) { struct tc3589x *tc3589x = dev_get_drvdata(pdev->dev.parent); - struct tc3589x_gpio_platform_data *pdata; struct device_node *np = pdev->dev.of_node; struct tc3589x_gpio *tc3589x_gpio; int ret; int irq; - pdata = tc3589x->pdata->gpio; - - if (!(pdata || np)) { - dev_err(&pdev->dev, "No platform data or Device Tree found\n"); + if (!np) { + dev_err(&pdev->dev, "No Device Tree node found\n"); return -EINVAL; } @@ -305,9 +302,6 @@ static int tc3589x_gpio_probe(struct platform_device *pdev) irq, NULL); - if (pdata && pdata->setup) - pdata->setup(tc3589x, tc3589x_gpio->chip.base); - platform_set_drvdata(pdev, tc3589x_gpio); return 0; @@ -316,11 +310,6 @@ static int tc3589x_gpio_probe(struct platform_device *pdev) static int tc3589x_gpio_remove(struct platform_device *pdev) { struct tc3589x_gpio *tc3589x_gpio = platform_get_drvdata(pdev); - struct tc3589x *tc3589x = tc3589x_gpio->tc3589x; - struct tc3589x_gpio_platform_data *pdata = tc3589x->pdata->gpio; - - if (pdata && pdata->remove) - pdata->remove(tc3589x, tc3589x_gpio->chip.base); gpiochip_remove(&tc3589x_gpio->chip); diff --git a/include/linux/mfd/tc3589x.h b/include/linux/mfd/tc3589x.h index e1c12d84c26a..c203c9c56776 100644 --- a/include/linux/mfd/tc3589x.h +++ b/include/linux/mfd/tc3589x.h @@ -162,25 +162,13 @@ struct tc3589x_keypad_platform_data { bool no_autorepeat; }; -/** - * struct tc3589x_gpio_platform_data - TC3589x GPIO platform data - * @setup: callback for board-specific initialization - * @remove: callback for board-specific teardown - */ -struct tc3589x_gpio_platform_data { - void (*setup)(struct tc3589x *tc3589x, unsigned gpio_base); - void (*remove)(struct tc3589x *tc3589x, unsigned gpio_base); -}; - /** * struct tc3589x_platform_data - TC3589x platform data * @block: bitmask of blocks to enable (use TC3589x_BLOCK_*) - * @gpio: GPIO-specific platform data * @keypad: keypad-specific platform data */ struct tc3589x_platform_data { unsigned int block; - struct tc3589x_gpio_platform_data *gpio; const struct tc3589x_keypad_platform_data *keypad; }; -- cgit v1.2.3 From d34541bc48eb7d7cb2ead5ff0284acf65af96f17 Mon Sep 17 00:00:00 2001 From: Olliver Schinagl Date: Wed, 7 Jan 2015 09:44:57 +0100 Subject: gpio: Make the vararg hacks not pass magic values Right now, in consumer.h, there are some vararg hacks that pass 0 as the flags. What is actually passed, however, is GPIOD_ASIS, which naturally is also 0. Using the define/enum rather than the magic 0 makes the define more readable to a passer-by. Signed-off-by: Olliver Schinagl Signed-off-by: Linus Walleij --- include/linux/gpio/consumer.h | 17 +++++++++--------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index fd85cb120ee0..45afc2dee560 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -340,31 +340,32 @@ static inline int desc_to_gpio(const struct gpio_desc *desc) * etc.
*/ #define __gpiod_get(dev, con_id, flags, ...) __gpiod_get(dev, con_id, flags) -#define gpiod_get(varargs...) __gpiod_get(varargs, 0) +#define gpiod_get(varargs...) __gpiod_get(varargs, GPIOD_ASIS) #define __gpiod_get_index(dev, con_id, index, flags, ...) \ __gpiod_get_index(dev, con_id, index, flags) -#define gpiod_get_index(varargs...) __gpiod_get_index(varargs, 0) +#define gpiod_get_index(varargs...) __gpiod_get_index(varargs, GPIOD_ASIS) #define __gpiod_get_optional(dev, con_id, flags, ...) \ __gpiod_get_optional(dev, con_id, flags) -#define gpiod_get_optional(varargs...) __gpiod_get_optional(varargs, 0) +#define gpiod_get_optional(varargs...) __gpiod_get_optional(varargs, GPIOD_ASIS) #define __gpiod_get_index_optional(dev, con_id, index, flags, ...) \ __gpiod_get_index_optional(dev, con_id, index, flags) #define gpiod_get_index_optional(varargs...) \ - __gpiod_get_index_optional(varargs, 0) + __gpiod_get_index_optional(varargs, GPIOD_ASIS) #define __devm_gpiod_get(dev, con_id, flags, ...) \ __devm_gpiod_get(dev, con_id, flags) -#define devm_gpiod_get(varargs...) __devm_gpiod_get(varargs, 0) +#define devm_gpiod_get(varargs...) __devm_gpiod_get(varargs, GPIOD_ASIS) #define __devm_gpiod_get_index(dev, con_id, index, flags, ...) \ __devm_gpiod_get_index(dev, con_id, index, flags) -#define devm_gpiod_get_index(varargs...) __devm_gpiod_get_index(varargs, 0) +#define devm_gpiod_get_index(varargs...) \ + __devm_gpiod_get_index(varargs, GPIOD_ASIS) #define __devm_gpiod_get_optional(dev, con_id, flags, ...) \ __devm_gpiod_get_optional(dev, con_id, flags) #define devm_gpiod_get_optional(varargs...) \ - __devm_gpiod_get_optional(varargs, 0) + __devm_gpiod_get_optional(varargs, GPIOD_ASIS) #define __devm_gpiod_get_index_optional(dev, con_id, index, flags, ...) \ __devm_gpiod_get_index_optional(dev, con_id, index, flags) #define devm_gpiod_get_index_optional(varargs...) \ - __devm_gpiod_get_index_optional(varargs, 0) + __devm_gpiod_get_index_optional(varargs, GPIOD_ASIS) #if IS_ENABLED(CONFIG_GPIOLIB) && IS_ENABLED(CONFIG_GPIO_SYSFS) -- cgit v1.2.3 From 2397aa8b515f7bd77c8d5698170b6a98fdd6721c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Jan 2015 11:02:54 -0500 Subject: svcrdma: Clean up read chunk counting The byte_count argument is not used, and the function is called only from one place. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 -- net/sunrpc/xprtrdma/svc_rdma_marshal.c | 16 ---------------- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 15 ++++++++++++--- 3 files changed, 12 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 975da754c778..2280325e4c88 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -178,8 +178,6 @@ struct svcxprt_rdma { #define RPCRDMA_MAX_REQ_SIZE 4096 /* svc_rdma_marshal.c */ -extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *, - int *, int *); extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index 65b146297f5a..b681855cf970 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -70,22 +70,6 @@ static u32 *decode_read_list(u32 *va, u32 *vaend) return (u32 *)&ch->rc_position; } -/* - * Determine number of chunks and total bytes in chunk list. The chunk - * list has already been verified to fit within the RPCRDMA header. - */ -void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch, - int *ch_count, int *byte_count) -{ - /* compute the number of bytes represented by read chunks */ - *byte_count = 0; - *ch_count = 0; - for (; ch->rc_discrim != 0; ch++) { - *byte_count = *byte_count + ntohl(ch->rc_target.rs_length); - *ch_count = *ch_count + 1; - } -} - /* * Decodes a write chunk list. The expected format is as follows: * descrim : xdr_one diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 2c67de032009..b3b7bb85844d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -365,12 +365,22 @@ static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, return ret; } +static unsigned int +rdma_rcl_chunk_count(struct rpcrdma_read_chunk *ch) +{ + unsigned int count; + + for (count = 0; ch->rc_discrim != xdr_zero; ch++) + count++; + return count; +} + static int rdma_read_chunks(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head) { - int page_no, ch_count, ret; + int page_no, ret; struct rpcrdma_read_chunk *ch; u32 page_offset, byte_count; u64 rs_offset; @@ -381,8 +391,7 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, if (!ch) return 0; - svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); - if (ch_count > RPCSVC_MAXPAGES) + if (rdma_rcl_chunk_count(ch) > RPCSVC_MAXPAGES) return -EINVAL; /* The request is completed when the RDMA_READs complete. The -- cgit v1.2.3 From e54524111f51eac1900cf91aca3d38a92a6b11c0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Jan 2015 11:03:20 -0500 Subject: svcrdma: Plant reader function in struct svcxprt_rdma The RDMA reader function doesn't change once an svcxprt_rdma is instantiated. Instead of checking sc_devcap during every incoming RPC, set the reader function once when the connection is accepted. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 10 +++++ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 71 ++++++++++++-------------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 + 3 files changed, 39 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 2280325e4c88..f161e309f25e 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -150,6 +150,10 @@ struct svcxprt_rdma { struct ib_cq *sc_rq_cq; struct ib_cq *sc_sq_cq; struct ib_mr *sc_phys_mr; /* MR for server memory */ + int (*sc_reader)(struct svcxprt_rdma *, + struct svc_rqst *, + struct svc_rdma_op_ctxt *, + int *, u32 *, u32, u32, u64, bool); u32 sc_dev_caps; /* distilled device caps */ u32 sc_dma_lkey; /* local dma key */ unsigned int sc_frmr_pg_list_len; @@ -195,6 +199,12 @@ extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *); /* svc_rdma_recvfrom.c */ extern int svc_rdma_recvfrom(struct svc_rqst *); +extern int rdma_read_chunk_lcl(struct svcxprt_rdma *, struct svc_rqst *, + struct svc_rdma_op_ctxt *, int *, u32 *, + u32, u32, u64, bool); +extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, + struct svc_rdma_op_ctxt *, int *, u32 *, + u32, u32, u64, bool); /* svc_rdma_sendto.c */ extern int svc_rdma_sendto(struct svc_rqst *); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 577f8659ca30..c3aebc1bf0a6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -117,26 +117,16 @@ static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) return min_t(int, sge_count, xprt->sc_max_sge); } -typedef int (*rdma_reader_fn)(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - int *page_no, - u32 *page_offset, - u32 rs_handle, - u32 rs_length, - u64 rs_offset, - int last); - /* Issue an RDMA_READ using the local lkey to map the data sink */ -static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - int *page_no, - u32 *page_offset, - u32 rs_handle, - u32 rs_length, - u64 rs_offset, - int last) +int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + bool last) { struct ib_send_wr read_wr; int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; @@ -221,15 +211,15 @@ static int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, } /* Issue an RDMA_READ using an FRMR to map the data sink */ -static int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - int *page_no, - u32 *page_offset, - u32 rs_handle, - u32 rs_length, - u64 rs_offset, - int last) +int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + int *page_no, + u32 *page_offset, + u32 rs_handle, + u32 rs_length, + u64 rs_offset, + bool last) { struct ib_send_wr read_wr; struct ib_send_wr inv_wr; @@ -374,9 +364,9 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, { int page_no, ret; struct rpcrdma_read_chunk *ch; - u32 page_offset, byte_count; + u32 handle, page_offset, byte_count; u64 rs_offset; - rdma_reader_fn reader; + bool last; /* If no read list is present, return 0 */ ch = svc_rdma_get_read_chunk(rmsgp); @@ -399,27 +389,20 @@ static int 
rdma_read_chunks(struct svcxprt_rdma *xprt, head->arg.len = rqstp->rq_arg.len; head->arg.buflen = rqstp->rq_arg.buflen; - /* Use FRMR if supported */ - if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) - reader = rdma_read_chunk_frmr; - else - reader = rdma_read_chunk_lcl; - page_no = 0; page_offset = 0; for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; ch->rc_discrim != 0; ch++) { - + handle = be32_to_cpu(ch->rc_target.rs_handle); + byte_count = be32_to_cpu(ch->rc_target.rs_length); xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset, &rs_offset); - byte_count = ntohl(ch->rc_target.rs_length); while (byte_count > 0) { - ret = reader(xprt, rqstp, head, - &page_no, &page_offset, - ntohl(ch->rc_target.rs_handle), - byte_count, rs_offset, - ((ch+1)->rc_discrim == 0) /* last */ - ); + last = (ch + 1)->rc_discrim == xdr_zero; + ret = xprt->sc_reader(xprt, rqstp, head, + &page_no, &page_offset, + handle, byte_count, + rs_offset, last); if (ret < 0) goto err; byte_count -= ret; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index f2e059bbab42..f609c1c2d38d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -974,10 +974,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) * NB: iWARP requires remote write access for the data sink * of an RDMA_READ. IB does not. */ + newxprt->sc_reader = rdma_read_chunk_lcl; if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { newxprt->sc_frmr_pg_list_len = devattr.max_fast_reg_page_list_len; newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; + newxprt->sc_reader = rdma_read_chunk_frmr; } /* -- cgit v1.2.3 From 0b056c224bea63060ce8a981e84193c93fac6f5d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 13 Jan 2015 11:03:37 -0500 Subject: svcrdma: Support RDMA_NOMSG requests Currently the Linux server cannot decode RDMA_NOMSG type requests. Operations whose length exceeds the fixed size of RDMA SEND buffers, like large NFSv4 CREATE(NF4LNK) operations, must be conveyed via RDMA_NOMSG. For an RDMA_MSG type request, the client sends the RPC/RDMA, RPC headers, and some or all of the NFS arguments via RDMA SEND. For an RDMA_NOMSG type request, the client sends just the RPC/RDMA header via RDMA SEND. The request's read list contains elements for the entire RPC message, including the RPC header. NFSD expects the RPC/RDMA header and RPC header to be contiguous in page zero of the XDR buffer. Add logic in the RDMA READ path to make the read list contents land where the server prefers, when the incoming message is an RDMA_NOMSG type message. Signed-off-by: Chuck Lever Reviewed-by: Steve Wise Signed-off-by: J.
Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 + net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 39 ++++++++++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index f161e309f25e..c343a94bc791 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -79,6 +79,7 @@ struct svc_rdma_op_ctxt { enum ib_wr_opcode wr_op; enum ib_wc_status wc_status; u32 byte_len; + u32 position; struct svcxprt_rdma *xprt; unsigned long flags; enum dma_data_direction direction; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index a67dd1a081dd..36cf51a3eab7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -60,6 +60,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *ctxt, u32 byte_count) { + struct rpcrdma_msg *rmsgp; struct page *page; u32 bc; int sge_no; @@ -82,7 +83,14 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, /* If data remains, store it in the pagelist */ rqstp->rq_arg.page_len = bc; rqstp->rq_arg.page_base = 0; - rqstp->rq_arg.pages = &rqstp->rq_pages[1]; + + /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; + if (be32_to_cpu(rmsgp->rm_type) == RDMA_NOMSG) + rqstp->rq_arg.pages = &rqstp->rq_pages[0]; + else + rqstp->rq_arg.pages = &rqstp->rq_pages[1]; + sge_no = 1; while (bc && sge_no < ctxt->count) { page = ctxt->pages[sge_no]; @@ -383,7 +391,6 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, */ head->arg.head[0] = rqstp->rq_arg.head[0]; head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->arg.pages = &head->pages[head->count]; head->hdr_count = head->count; head->arg.page_base = 0; head->arg.page_len = 0; @@ -393,9 +400,17 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; position = be32_to_cpu(ch->rc_position); + /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ + if (position == 0) { + head->arg.pages = &head->pages[0]; + page_offset = head->byte_len; + } else { + head->arg.pages = &head->pages[head->count]; + page_offset = 0; + } + ret = 0; page_no = 0; - page_offset = 0; for (; ch->rc_discrim != xdr_zero; ch++) { if (be32_to_cpu(ch->rc_position) != position) goto err; @@ -418,7 +433,10 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, head->arg.buflen += ret; } } + ret = 1; + head->position = position; + err: /* Detach arg pages. 
svc_recv will replenish them */ for (page_no = 0; @@ -465,6 +483,21 @@ static int rdma_read_complete(struct svc_rqst *rqstp, put_page(rqstp->rq_pages[page_no]); rqstp->rq_pages[page_no] = head->pages[page_no]; } + + /* Adjustments made for RDMA_NOMSG type requests */ + if (head->position == 0) { + if (head->arg.len <= head->sge[0].length) { + head->arg.head[0].iov_len = head->arg.len - + head->byte_len; + head->arg.page_len = 0; + } else { + head->arg.head[0].iov_len = head->sge[0].length - + head->byte_len; + head->arg.page_len = head->arg.len - + head->sge[0].length; + } + } + /* Point rq_arg.pages past header */ rdma_fix_xdr_pad(&head->arg); rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; -- cgit v1.2.3 From 57699a40b4f2694d3ee63fd5e6465ec8f600b620 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Fri, 16 Jan 2015 11:13:09 +0800 Subject: rhashtable: Fix race in rhashtable_destroy() and use regular work_struct When we put our declared work task in the global workqueue with schedule_delayed_work(), its delay parameter is always zero. Therefore, we should define a regular work item in the rhashtable structure instead of a delayed work. We also add a check that the resizing functions are non-NULL before cancelling the work, to avoid cancelling an uninitialized work item. Lastly, ht->mutex is taken in rhashtable_destroy() while we wait with cancel_delayed_work() for all previously submitted work items to run to completion. The cancel call doesn't return until all work items have finished, and a scheduled work item runs rht_deferred_worker(), which also needs to acquire the lock; a deadlock can therefore occur because the lock is already held. Moving the cancel call out of the lock-covered scope avoids the deadlock. Fixes: 97defe1 ("rhashtable: Per bucket locks & deferred expansion/shrinking") Signed-off-by: Ying Xue Cc: Thomas Graf Acked-by: Thomas Graf Signed-off-by: David S.
Miller --- include/linux/rhashtable.h | 2 +- lib/rhashtable.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 9570832ab07c..a2562ed53ea3 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -119,7 +119,7 @@ struct rhashtable { atomic_t nelems; atomic_t shift; struct rhashtable_params p; - struct delayed_work run_work; + struct work_struct run_work; struct mutex mutex; bool being_destroyed; }; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index aca699813ba9..84a78e396a56 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -485,7 +485,7 @@ static void rht_deferred_worker(struct work_struct *work) struct rhashtable *ht; struct bucket_table *tbl; - ht = container_of(work, struct rhashtable, run_work.work); + ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); tbl = rht_dereference(ht->tbl, ht); @@ -507,7 +507,7 @@ static void rhashtable_wakeup_worker(struct rhashtable *ht) if (tbl == new_tbl && ((ht->p.grow_decision && ht->p.grow_decision(ht, size)) || (ht->p.shrink_decision && ht->p.shrink_decision(ht, size)))) - schedule_delayed_work(&ht->run_work, 0); + schedule_work(&ht->run_work); } static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, @@ -903,7 +903,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); if (ht->p.grow_decision || ht->p.shrink_decision) - INIT_DEFERRABLE_WORK(&ht->run_work, rht_deferred_worker); + INIT_WORK(&ht->run_work, rht_deferred_worker); return 0; } @@ -921,11 +921,11 @@ void rhashtable_destroy(struct rhashtable *ht) { ht->being_destroyed = true; - mutex_lock(&ht->mutex); + if (ht->p.grow_decision || ht->p.shrink_decision) + cancel_work_sync(&ht->run_work); - cancel_delayed_work(&ht->run_work); + mutex_lock(&ht->mutex); bucket_table_free(rht_dereference(ht->tbl, ht)); - mutex_unlock(&ht->mutex); } EXPORT_SYMBOL_GPL(rhashtable_destroy); -- cgit v1.2.3 From 5cd37193ce8539be1e6ef76be226f4bcc984e0f5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 13 Dec 2014 20:32:04 -0800 Subject: rcu: Make cond_resched_rcu_qs() apply to normal RCU flavors Although cond_resched_rcu_qs() only applies to TASKS_RCU, it is used in places where it would be useful for it to apply to the normal RCU flavors, rcu_preempt, rcu_sched, and rcu_bh. This is especially the case for workloads that aggressively overload the system, particularly those that generate large numbers of RCU updates on systems running NO_HZ_FULL CPUs. This commit therefore communicates quiescent states from cond_resched_rcu_qs() to the normal RCU flavors. Note that it is unfortunately necessary to leave the old ->passed_quiesce mechanism in place to allow quiescent states that apply to only one flavor to be recorded. (Yes, we could decrement ->rcu_qs_ctr_snap in that case, but that is not so good for debugging of RCU internals.) In addition, if one of the RCU flavor's grace period has stalled, this will invoke rcu_momentary_dyntick_idle(), resulting in a heavy-weight quiescent state visible from other CPUs. Reported-by: Sasha Levin Reported-by: Dave Jones Signed-off-by: Paul E. McKenney [ paulmck: Merge commit from Sasha Levin fixing a bug where __this_cpu() was used in preemptible code. 
] --- Documentation/RCU/trace.txt | 32 ++++++++++++++++---------------- include/linux/rcupdate.h | 3 ++- include/linux/rcutiny.h | 5 ++++- include/linux/rcutree.h | 2 ++ kernel/rcu/tree.c | 38 +++++++++++++++++++++++++++++++++----- kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_trace.c | 8 ++++++-- 7 files changed, 65 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index b63b9bb3bc0c..08651da15448 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -56,14 +56,14 @@ rcuboost: The output of "cat rcu/rcu_preempt/rcudata" looks as follows: - 0!c=30455 g=30456 pq=1 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 - 1!c=30719 g=30720 pq=1 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 - 2!c=30150 g=30151 pq=1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 - 3 c=31249 g=31250 pq=1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 - 4!c=29502 g=29503 pq=1 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 - 5 c=31201 g=31202 pq=1 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 - 6!c=30253 g=30254 pq=1 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 - 7 c=31178 g=31178 pq=1 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 + 0!c=30455 g=30456 pq=1/0 qp=1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716 + 1!c=30719 g=30720 pq=1/0 qp=0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982 + 2!c=30150 g=30151 pq=1/1 qp=1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458 + 3 c=31249 g=31250 pq=1/1 qp=0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622 + 4!c=29502 g=29503 pq=1/0 qp=1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521 + 5 c=31201 g=31202 pq=1/0 qp=1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698 + 6!c=30253 g=30254 pq=1/0 qp=1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353 + 7 c=31178 g=31178 pq=1/0 qp=0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969 This file has one line per CPU, or eight for this 8-CPU system. The fields are as follows: @@ -188,14 +188,14 @@ o "ca" is the number of RCU callbacks that have been adopted by this Kernels compiled with CONFIG_RCU_BOOST=y display the following from /debug/rcu/rcu_preempt/rcudata: - 0!c=12865 g=12866 pq=1 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 - 1 c=14407 g=14408 pq=1 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 - 2 c=14407 g=14408 pq=1 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 - 3 c=14407 g=14408 pq=1 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 - 4 c=14405 g=14406 pq=1 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 - 5!c=14168 g=14169 pq=1 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... 
kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 - 6 c=14404 g=14405 pq=1 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 - 7 c=14407 g=14408 pq=1 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 + 0!c=12865 g=12866 pq=1/0 qp=1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871 + 1 c=14407 g=14408 pq=1/0 qp=0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485 + 2 c=14407 g=14408 pq=1/0 qp=0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490 + 3 c=14407 g=14408 pq=1/0 qp=0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290 + 4 c=14405 g=14406 pq=1/0 qp=1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114 + 5!c=14168 g=14169 pq=1/0 qp=0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722 + 6 c=14404 g=14405 pq=1/0 qp=0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811 + 7 c=14407 g=14408 pq=1/0 qp=1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042 This is similar to the output discussed above, but contains the following additional fields: diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index ed4f5939a452..468228750299 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -331,12 +331,13 @@ static inline void rcu_init_nohz(void) extern struct srcu_struct tasks_rcu_exit_srcu; #define rcu_note_voluntary_context_switch(t) \ do { \ + rcu_all_qs(); \ if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \ ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \ } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) -#define rcu_note_voluntary_context_switch(t) do { } while (0) +#define rcu_note_voluntary_context_switch(t) rcu_all_qs() #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 0e5366200154..fabd3fad8516 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -154,7 +154,10 @@ static inline bool rcu_is_watching(void) return true; } - #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ +static inline void rcu_all_qs(void) +{ +} + #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 52953790dcca..ddba927f7316 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -97,4 +97,6 @@ extern int rcu_scheduler_active __read_mostly; bool rcu_is_watching(void); +void rcu_all_qs(void); + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 75ce12316b4c..cb00e038c2f2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -219,6 +219,9 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; +DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); +EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); + /* * Let the RCU core know that this CPU has gone through the scheduler, * which is a quiescent state. This is called when the need for a @@ -288,6 +291,22 @@ void rcu_note_context_switch(void) } EXPORT_SYMBOL_GPL(rcu_note_context_switch); +/* + * Register a quiescent state for all RCU flavors. 
If there is an + * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight + * dyntick-idle quiescent state visible to other CPUs (but only for those + * RCU flavors in desperate need of a quiescent state, which will normally + * be none of them). Either way, do a lightweight quiescent state for + * all RCU flavors. + */ +void rcu_all_qs(void) +{ + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + rcu_momentary_dyntick_idle(); + this_cpu_inc(rcu_qs_ctr); +} +EXPORT_SYMBOL_GPL(rcu_all_qs); + static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static long qhimark = 10000; /* If this many pending, ignore blimit. */ static long qlowmark = 100; /* Once only this many pending, use blimit. */ @@ -1609,6 +1628,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, rdp->gpnum = rnp->gpnum; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); ACCESS_ONCE(rdp->gpwrap) = false; @@ -2075,8 +2095,10 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || - rnp->completed == rnp->gpnum || rdp->gpwrap) { + if ((rdp->passed_quiesce == 0 && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || + rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || + rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2085,6 +2107,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->passed_quiesce = 0; /* need qs for new gp. */ + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -2129,7 +2152,8 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesce) + if (!rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) return; /* @@ -3174,9 +3198,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && - rdp->qs_pending && !rdp->passed_quiesce) { + rdp->qs_pending && !rdp->passed_quiesce && + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_qs_pending++; - } else if (rdp->qs_pending && rdp->passed_quiesce) { + } else if (rdp->qs_pending && + (rdp->passed_quiesce || + rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { rdp->n_rp_report_qs++; return 1; } @@ -3510,6 +3537,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; rdp->completed = rnp->completed; rdp->passed_quiesce = 0; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); rdp->qs_pending = 0; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7472ff388d55..1e7f8b05714e 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -257,6 +257,8 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ + unsigned long rcu_qs_ctr_snap;/* Snapshot of rcu_qs_ctr to check */ + /* for rcu_all_qs() invocations. 
*/ bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 5cdc62e1beeb..fbb6240509ea 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -46,6 +46,8 @@ #define RCU_TREE_NONCORE #include "tree.h" +DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); + static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { @@ -115,11 +117,13 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", + seq_printf(m, "%3d%cc=%ld g=%ld pq=%d/%d qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), - rdp->passed_quiesce, rdp->qs_pending); + rdp->passed_quiesce, + rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), + rdp->qs_pending); seq_printf(m, " dt=%d/%llx/%d df=%lu", atomic_read(&rdp->dynticks->dynticks), rdp->dynticks->dynticks_nesting, -- cgit v1.2.3 From 076c3b8e03e2737659a89660bb8e54e13587d974 Mon Sep 17 00:00:00 2001 From: James Ban Date: Fri, 16 Jan 2015 12:13:27 +0900 Subject: regulator: da9211: fix unmatched of_node This is a patch for fixing unmatched of_node. Signed-off-by: James Ban Signed-off-by: Mark Brown --- drivers/regulator/da9211-regulator.c | 4 ++-- include/linux/regulator/da9211.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/da9211-regulator.c b/drivers/regulator/da9211-regulator.c index c78d2106d6cb..8e6957c63a69 100644 --- a/drivers/regulator/da9211-regulator.c +++ b/drivers/regulator/da9211-regulator.c @@ -276,7 +276,7 @@ static struct da9211_pdata *da9211_parse_regulators_dt( continue; pdata->init_data[n] = da9211_matches[i].init_data; - + pdata->reg_node[n] = da9211_matches[i].of_node; n++; } @@ -364,7 +364,7 @@ static int da9211_regulator_init(struct da9211 *chip) config.dev = chip->dev; config.driver_data = chip; config.regmap = chip->regmap; - config.of_node = chip->dev->of_node; + config.of_node = chip->pdata->reg_node[i]; chip->rdev[i] = devm_regulator_register(chip->dev, &da9211_regulators[i], &config); diff --git a/include/linux/regulator/da9211.h b/include/linux/regulator/da9211.h index 5479394fefce..d1d9d3849bdb 100644 --- a/include/linux/regulator/da9211.h +++ b/include/linux/regulator/da9211.h @@ -32,6 +32,7 @@ struct da9211_pdata { * 2 : 2 phase 2 buck */ int num_buck; + struct device_node *reg_node[DA9211_MAX_REGULATORS]; struct regulator_init_data *init_data[DA9211_MAX_REGULATORS]; }; #endif -- cgit v1.2.3 From 67c2b9cb30f561325b010e046b7bbe2a327e69a0 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 18 Nov 2014 12:18:18 +0100 Subject: ARM: 8207/1: amba: Use inlines instead of macros for amba_pclk_enable/disable Replace the amba_pclk_enable and amba_pclk_disable macros with static inline functions and remove checks for IS_ERR. The amba bus clock won't be ERR because probe would fail before the use of these functions. 
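As an illustrative sketch only (not part of this patch; foo_probe() is a made-up driver), a bus user can now rely on the clock being valid and simply propagate the clk_enable() return value:

	/* hypothetical AMBA driver probe; adev->pclk is known-valid here */
	static int foo_probe(struct amba_device *adev, const struct amba_id *id)
	{
		int ret;

		ret = amba_pclk_enable(adev);	/* no IS_ERR() check needed */
		if (ret)
			return ret;

		/* ... access device registers ... */

		amba_pclk_disable(adev);
		return 0;
	}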
Signed-off-by: Krzysztof Kozlowski Signed-off-by: Russell King --- include/linux/amba/bus.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 2afc618b15ce..0ab5f8e0dea2 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -92,11 +92,15 @@ struct amba_device *amba_find_device(const char *, struct device *, unsigned int int amba_request_regions(struct amba_device *, const char *); void amba_release_regions(struct amba_device *); -#define amba_pclk_enable(d) \ - (IS_ERR((d)->pclk) ? 0 : clk_enable((d)->pclk)) +static inline int amba_pclk_enable(struct amba_device *dev) +{ + return clk_enable(dev->pclk); +} -#define amba_pclk_disable(d) \ - do { if (!IS_ERR((d)->pclk)) clk_disable((d)->pclk); } while (0) +static inline void amba_pclk_disable(struct amba_device *dev) +{ + clk_disable(dev->pclk); +} static inline int amba_pclk_prepare(struct amba_device *dev) { -- cgit v1.2.3 From 78e1f974dd351ac82d978e3aac2d27422013f914 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 14 Dec 2014 02:37:09 +0200 Subject: iommu/ipmmu-vmsa: Remove platform data support No board file instantiates the IPMMU using platform data. Now that we have DT support, get rid of platform data. Signed-off-by: Laurent Pinchart --- drivers/iommu/ipmmu-vmsa.c | 24 ------------------------ include/linux/platform_data/ipmmu-vmsa.h | 24 ------------------------ 2 files changed, 48 deletions(-) delete mode 100644 include/linux/platform_data/ipmmu-vmsa.h (limited to 'include/linux') diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 5d080cf11ba5..791c3daec7c0 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -30,7 +29,6 @@ struct ipmmu_vmsa_device { void __iomem *base; struct list_head list; - const struct ipmmu_vmsa_platform_data *pdata; unsigned int num_utlbs; struct dma_iommu_mapping *mapping; @@ -1015,27 +1013,6 @@ static int ipmmu_find_utlbs(struct ipmmu_vmsa_device *mmu, struct device *dev, unsigned int i; int count; - if (mmu->pdata) { - const struct ipmmu_vmsa_master *master = mmu->pdata->masters; - const char *devname = dev_name(dev); - unsigned int i; - - for (i = 0; i < mmu->pdata->num_masters; ++i, ++master) { - if (strcmp(master->name, devname) == 0) { - utlbs = kmalloc(sizeof(*utlbs), GFP_KERNEL); - if (!utlbs) - return -ENOMEM; - - utlbs[0] = master->utlb; - - *_utlbs = utlbs; - return 1; - } - } - - return -EINVAL; - } - count = of_count_phandle_with_args(dev->of_node, "iommus", "#iommu-cells"); if (count < 0) @@ -1246,7 +1223,6 @@ static int ipmmu_probe(struct platform_device *pdev) } mmu->dev = &pdev->dev; - mmu->pdata = pdev->dev.platform_data; mmu->num_utlbs = 32; /* Map I/O memory and request IRQ. */ diff --git a/include/linux/platform_data/ipmmu-vmsa.h b/include/linux/platform_data/ipmmu-vmsa.h deleted file mode 100644 index 5275b3ac6d37..000000000000 --- a/include/linux/platform_data/ipmmu-vmsa.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * IPMMU VMSA Platform Data - * - * Copyright (C) 2014 Renesas Electronics Corporation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 of the License. 
- */ - -#ifndef __IPMMU_VMSA_H__ -#define __IPMMU_VMSA_H__ - -struct ipmmu_vmsa_master { - const char *name; - unsigned int utlb; -}; - -struct ipmmu_vmsa_platform_data { - const struct ipmmu_vmsa_master *masters; - unsigned int num_masters; -}; - -#endif /* __IPMMU_VMSA_H__ */ -- cgit v1.2.3 From f331a859e0ee5a898c1f47596eddad4c4f02d657 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 15 Jan 2015 18:16:04 -0600 Subject: PCI: Add flag for devices where we can't use bus reset Enable a mechanism for devices to quirk that they do not behave when doing a PCI bus reset. We require a modest level of spec compliant behavior in order to do a reset, for instance the device should come out of reset without throwing errors and PCI config space should be accessible after reset. This is too much to ask for some devices. Link: http://lkml.kernel.org/r/20140923210318.498dacbd@dualc.maya.org Signed-off-by: Alex Williamson Signed-off-by: Bjorn Helgaas CC: stable@vger.kernel.org # v3.14+ --- drivers/pci/pci.c | 40 ++++++++++++++++++++++++++++++++++++---- include/linux/pci.h | 2 ++ 2 files changed, 38 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index cab05f31223f..e9d4fd861ba1 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3271,7 +3271,8 @@ static int pci_parent_bus_reset(struct pci_dev *dev, int probe) { struct pci_dev *pdev; - if (pci_is_root_bus(dev->bus) || dev->subordinate || !dev->bus->self) + if (pci_is_root_bus(dev->bus) || dev->subordinate || + !dev->bus->self || dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET) return -ENOTTY; list_for_each_entry(pdev, &dev->bus->devices, bus_list) @@ -3305,7 +3306,8 @@ static int pci_dev_reset_slot_function(struct pci_dev *dev, int probe) { struct pci_dev *pdev; - if (dev->subordinate || !dev->slot) + if (dev->subordinate || !dev->slot || + dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET) return -ENOTTY; list_for_each_entry(pdev, &dev->bus->devices, bus_list) @@ -3557,6 +3559,20 @@ int pci_try_reset_function(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_try_reset_function); +/* Do any devices on or below this bus prevent a bus reset? */ +static bool pci_bus_resetable(struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + if (dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET || + (dev->subordinate && !pci_bus_resetable(dev->subordinate))) + return false; + } + + return true; +} + /* Lock devices from the top of the tree down */ static void pci_bus_lock(struct pci_bus *bus) { @@ -3607,6 +3623,22 @@ unlock: return 0; } +/* Do any devices on or below this slot prevent a bus reset? 
*/ +static bool pci_slot_resetable(struct pci_slot *slot) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &slot->bus->devices, bus_list) { + if (!dev->slot || dev->slot != slot) + continue; + if (dev->dev_flags & PCI_DEV_FLAGS_NO_BUS_RESET || + (dev->subordinate && !pci_bus_resetable(dev->subordinate))) + return false; + } + + return true; +} + /* Lock devices from the top of the tree down */ static void pci_slot_lock(struct pci_slot *slot) { @@ -3728,7 +3760,7 @@ static int pci_slot_reset(struct pci_slot *slot, int probe) { int rc; - if (!slot) + if (!slot || !pci_slot_resetable(slot)) return -ENOTTY; if (!probe) @@ -3820,7 +3852,7 @@ EXPORT_SYMBOL_GPL(pci_try_reset_slot); static int pci_bus_reset(struct pci_bus *bus, int probe) { - if (!bus->self) + if (!bus->self || !pci_bus_resetable(bus)) return -ENOTTY; if (probe) diff --git a/include/linux/pci.h b/include/linux/pci.h index 360a966a97a5..44627f1df4ca 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -175,6 +175,8 @@ enum pci_dev_flags { PCI_DEV_FLAGS_DMA_ALIAS_DEVFN = (__force pci_dev_flags_t) (1 << 4), /* Use a PCIe-to-PCI bridge alias even if !pci_is_pcie */ PCI_DEV_FLAG_PCIE_BRIDGE_ALIAS = (__force pci_dev_flags_t) (1 << 5), + /* Do not use bus resets for device */ + PCI_DEV_FLAGS_NO_BUS_RESET = (__force pci_dev_flags_t) (1 << 6), }; enum pci_irq_reroute_variant { -- cgit v1.2.3 From 8505e729a2f6eb0803ff943a15f133dd10afff3a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 15 Jan 2015 16:21:49 -0600 Subject: PCI: Add pci_claim_bridge_resource() to clip window if necessary Add pci_claim_bridge_resource() to claim a PCI-PCI bridge window. This is like regular pci_claim_resource(), except that if we fail to claim the window, we check to see if we can reduce the size of the window and try again. This is for scenarios like this: pci_bus 0000:00: root bus resource [mem 0xc0000000-0xffffffff] pci 0000:00:01.0: bridge window [mem 0xbdf00000-0xddefffff 64bit pref] pci 0000:01:00.0: reg 0x10: [mem 0xc0000000-0xcfffffff pref] The 00:01.0 window is illegal: it starts before the host bridge window, so we have to assume the [0xbdf00000-0xbfffffff] region is inaccessible. We can make it legal by clipping it to [mem 0xc0000000-0xddefffff 64bit pref]. Previously we discarded the 00:01.0 window and tried to reassign that part of the hierarchy from scratch. That is a problem because Linux doesn't always assign things optimally. For example, in this case, BIOS put the 01:00.0 device in a prefetchable window below 4GB, but after 5b28541552ef, Linux puts the prefetchable window above 4GB where the 32-bit 01:00.0 device can't use it. Clipping the 00:01.0 window is less intrusive than completely reassigning things and is sufficient to let us use most of the BIOS configuration. Of course, it's possible that devices below 00:01.0 will no longer fit. If that's the case, we'll have to reassign things. But that's a separate problem. 
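A rough sketch of how a resource-fixup path might use the new helper (the claim_bridge_windows() wrapper below is hypothetical, not part of this patch):

	/* try to claim every window of a bridge, clipping where necessary */
	static void claim_bridge_windows(struct pci_dev *bridge)
	{
		int i;

		for (i = PCI_BRIDGE_RESOURCES; i <= PCI_BRIDGE_RESOURCE_END; i++)
			if (pci_claim_bridge_resource(bridge, i))
				dev_warn(&bridge->dev,
					 "could not claim window %d\n", i);
	}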
[bhelgaas: changelog, split into separate patch] Link: https://bugzilla.kernel.org/show_bug.cgi?id=85491 Reported-by: Marek Kordik Fixes: 5b28541552ef ("PCI: Restrict 64-bit prefetchable bridge windows to 64-bit resources") Signed-off-by: Yinghai Lu Signed-off-by: Bjorn Helgaas CC: stable@vger.kernel.org # v3.16+ --- drivers/pci/setup-bus.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/pci.h | 1 + 2 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 802f56be2149..e3e17f3c0f0f 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -646,6 +646,41 @@ void pci_setup_bridge(struct pci_bus *bus) __pci_setup_bridge(bus, type); } + +int pci_claim_bridge_resource(struct pci_dev *bridge, int i) +{ + if (i < PCI_BRIDGE_RESOURCES || i > PCI_BRIDGE_RESOURCE_END) + return 0; + + if (pci_claim_resource(bridge, i) == 0) + return 0; /* claimed the window */ + + if ((bridge->class >> 8) != PCI_CLASS_BRIDGE_PCI) + return 0; + + if (!pci_bus_clip_resource(bridge, i)) + return -EINVAL; /* clipping didn't change anything */ + + switch (i - PCI_BRIDGE_RESOURCES) { + case 0: + pci_setup_bridge_io(bridge); + break; + case 1: + pci_setup_bridge_mmio(bridge); + break; + case 2: + pci_setup_bridge_mmio_pref(bridge); + break; + default: + return -EINVAL; + } + + if (pci_claim_resource(bridge, i) == 0) + return 0; /* claimed a smaller window */ + + return -EINVAL; +} + /* Check whether the bridge supports optional I/O and prefetchable memory ranges. If not, the respective base/limit registers must be read-only and read as 0. */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 44627f1df4ca..9603094ed59b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1067,6 +1067,7 @@ resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx); void pci_bus_assign_resources(const struct pci_bus *bus); void pci_bus_size_bridges(struct pci_bus *bus); int pci_claim_resource(struct pci_dev *, int); +int pci_claim_bridge_resource(struct pci_dev *bridge, int i); void pci_assign_unassigned_resources(void); void pci_assign_unassigned_bridge_resources(struct pci_dev *bridge); void pci_assign_unassigned_bus_resources(struct pci_bus *bus); -- cgit v1.2.3 From 51e537387990dc1f00752103f314fd135cb94bc6 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Nov 2014 11:24:08 -0700 Subject: PCI: Add flag for devices that don't reset on D3hot->D0 transition Per the PCI Power Management spec r1.2, sec 3.2.4, a device that advertises No_Soft_Reset == 0 in the PMCSR register (reported by lspci as "NoSoftRst-") should perform an internal reset when transitioning from D3hot to D0 via software control. Configuration context is lost and the device requires a full reinitialization sequence. Unfortunately the definition of "internal reset", beyond the application of the configuration context, is largely left to the interpretation of the specific device. Some devices don't seem to perform an "internal reset" even if they report No_Soft_Reset == 0. We still need to honor the PCI specification and restore PCI config context in the event that we do a PM reset, so we don't cache and modify the PCI_PM_CTRL_NO_SOFT_RESET bit for the device, but for interfaces where the intention is to reset the device, like pci_reset_function(), we need a mechanism to flag that PM reset (a D3hot->D0 transition) doesn't perform any significant "internal reset" of the device. 
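A device known to keep running state across D3hot->D0 could then be flagged with a header quirk along these lines (sketch only; the vendor/device IDs are placeholders, not a real quirk entry):

	static void quirk_no_pm_reset(struct pci_dev *dev)
	{
		/* D3hot->D0 does not reset this device, so don't rely on it */
		dev->dev_flags |= PCI_DEV_FLAGS_NO_PM_RESET;
	}
	DECLARE_PCI_FIXUP_HEADER(0x1234, 0x5678, quirk_no_pm_reset);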
Signed-off-by: Alex Williamson Signed-off-by: Bjorn Helgaas --- drivers/pci/pci.c | 2 +- include/linux/pci.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index e9d4fd861ba1..422bc0179e90 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3197,7 +3197,7 @@ static int pci_pm_reset(struct pci_dev *dev, int probe) { u16 csr; - if (!dev->pm_cap) + if (!dev->pm_cap || dev->dev_flags & PCI_DEV_FLAGS_NO_PM_RESET) return -ENOTTY; pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &csr); diff --git a/include/linux/pci.h b/include/linux/pci.h index 44627f1df4ca..7bed32b3fd54 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -177,6 +177,8 @@ enum pci_dev_flags { PCI_DEV_FLAG_PCIE_BRIDGE_ALIAS = (__force pci_dev_flags_t) (1 << 5), /* Do not use bus resets for device */ PCI_DEV_FLAGS_NO_BUS_RESET = (__force pci_dev_flags_t) (1 << 6), + /* Do not use PM reset even if device advertises NoSoftRst- */ + PCI_DEV_FLAGS_NO_PM_RESET = (__force pci_dev_flags_t) (1 << 7), }; enum pci_irq_reroute_variant { -- cgit v1.2.3 From 6dee60f69d48fcef021b4b53b3431797ec440764 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:54 -0500 Subject: locks: add new struct list_head to struct file_lock ...that we can use to queue file_locks to per-ctx list_heads. Go ahead and convert locks_delete_lock and locks_dispose_list to use it instead of the fl_block list. Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- fs/locks.c | 8 +++++--- include/linux/fs.h | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index 59e2f905e4ff..bfe5f17401de 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -207,6 +207,7 @@ static struct kmem_cache *filelock_cache __read_mostly; static void locks_init_lock_heads(struct file_lock *fl) { INIT_HLIST_NODE(&fl->fl_link); + INIT_LIST_HEAD(&fl->fl_list); INIT_LIST_HEAD(&fl->fl_block); init_waitqueue_head(&fl->fl_wait); } @@ -243,6 +244,7 @@ EXPORT_SYMBOL_GPL(locks_release_private); void locks_free_lock(struct file_lock *fl) { BUG_ON(waitqueue_active(&fl->fl_wait)); + BUG_ON(!list_empty(&fl->fl_list)); BUG_ON(!list_empty(&fl->fl_block)); BUG_ON(!hlist_unhashed(&fl->fl_link)); @@ -257,8 +259,8 @@ locks_dispose_list(struct list_head *dispose) struct file_lock *fl; while (!list_empty(dispose)) { - fl = list_first_entry(dispose, struct file_lock, fl_block); - list_del_init(&fl->fl_block); + fl = list_first_entry(dispose, struct file_lock, fl_list); + list_del_init(&fl->fl_list); locks_free_lock(fl); } } @@ -691,7 +693,7 @@ static void locks_delete_lock(struct file_lock **thisfl_p, locks_unlink_lock(thisfl_p); if (dispose) - list_add(&fl->fl_block, dispose); + list_add(&fl->fl_list, dispose); else locks_free_lock(fl); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13077b6..cd6818115162 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -934,6 +934,7 @@ int locks_in_grace(struct net *); */ struct file_lock { struct file_lock *fl_next; /* singly linked list for this inode */ + struct list_head fl_list; /* link into file_lock_context */ struct hlist_node fl_link; /* node in global lists */ struct list_head fl_block; /* circular list of blocked processes */ fl_owner_t fl_owner; -- cgit v1.2.3 From 4a075e39c86490cc0f0c10ac6abe3592d1689463 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:54 -0500 Subject: locks: add a new struct file_lock_context pointer to struct 
inode The current scheme of using the i_flock list is really difficult to manage. There is also a legitimate desire for a per-inode spinlock to manage these lists that isn't the i_lock. Start conversion to a new scheme to eventually replace the old i_flock list with a new "file_lock_context" object. We start by adding a new i_flctx to struct inode. For now, it lives in parallel with i_flock list, but will eventually replace it. The idea is to allocate a structure to sit in that pointer and act as a locus for all things file locking. We allocate a file_lock_context for an inode when the first lock is added to it, and it's only freed when the inode is freed. We use the i_lock to protect the assignment, but afterward it should mostly be accessed locklessly. Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- fs/inode.c | 3 ++- fs/locks.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 11 +++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index aa149e7262ac..f30872ade6d7 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -194,7 +194,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) #ifdef CONFIG_FSNOTIFY inode->i_fsnotify_mask = 0; #endif - + inode->i_flctx = NULL; this_cpu_inc(nr_inodes); return 0; @@ -237,6 +237,7 @@ void __destroy_inode(struct inode *inode) BUG_ON(inode_has_buffers(inode)); security_inode_free(inode); fsnotify_inode_delete(inode); + locks_free_lock_context(inode->i_flctx); if (!inode->i_nlink) { WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); atomic_long_dec(&inode->i_sb->s_remove_count); diff --git a/fs/locks.c b/fs/locks.c index ae1e7cf721d6..526d5fca67c8 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -202,8 +202,49 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); */ static DEFINE_SPINLOCK(blocked_lock_lock); +static struct kmem_cache *flctx_cache __read_mostly; static struct kmem_cache *filelock_cache __read_mostly; +static struct file_lock_context * +locks_get_lock_context(struct inode *inode) +{ + struct file_lock_context *new; + + if (likely(inode->i_flctx)) + goto out; + + new = kmem_cache_alloc(flctx_cache, GFP_KERNEL); + if (!new) + goto out; + + INIT_LIST_HEAD(&new->flc_flock); + + /* + * Assign the pointer if it's not already assigned. If it is, then + * free the context we just allocated. 
+ */ + spin_lock(&inode->i_lock); + if (likely(!inode->i_flctx)) { + inode->i_flctx = new; + new = NULL; + } + spin_unlock(&inode->i_lock); + + if (new) + kmem_cache_free(flctx_cache, new); +out: + return inode->i_flctx; +} + +void +locks_free_lock_context(struct file_lock_context *ctx) +{ + if (ctx) { + WARN_ON_ONCE(!list_empty(&ctx->flc_flock)); + kmem_cache_free(flctx_cache, ctx); + } +} + static void locks_init_lock_heads(struct file_lock *fl) { INIT_HLIST_NODE(&fl->fl_link); @@ -2636,6 +2677,9 @@ static int __init filelock_init(void) { int i; + flctx_cache = kmem_cache_create("file_lock_ctx", + sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL); + filelock_cache = kmem_cache_create("file_lock_cache", sizeof(struct file_lock), 0, SLAB_PANIC, NULL); diff --git a/include/linux/fs.h b/include/linux/fs.h index cd6818115162..dec0d38b05de 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -626,6 +626,7 @@ struct inode { #endif const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct file_lock *i_flock; + struct file_lock_context *i_flctx; struct address_space i_data; struct list_head i_devices; union { @@ -965,6 +966,10 @@ struct file_lock { } fl_u; }; +struct file_lock_context { + struct list_head flc_flock; +}; + /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) @@ -991,6 +996,7 @@ extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); extern int fcntl_getlease(struct file *filp); /* fs/locks.c */ +void locks_free_lock_context(struct file_lock_context *ctx); void locks_free_lock(struct file_lock *fl); extern void locks_init_lock(struct file_lock *); extern struct file_lock * locks_alloc_lock(void); @@ -1048,6 +1054,11 @@ static inline int fcntl_getlease(struct file *filp) return F_UNLCK; } +static inline void +locks_free_lock_context(struct file_lock_context *ctx) +{ +} + static inline void locks_init_lock(struct file_lock *fl) { return; -- cgit v1.2.3 From bd61e0a9c852de2d705b6f1bb2cc54c5774db570 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:55 -0500 Subject: locks: convert posix locks to file_lock_context Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- fs/ceph/locks.c | 58 +++++++++++++--------------- fs/cifs/file.c | 26 +++++-------- fs/lockd/svcsubs.c | 20 ++++++---- fs/locks.c | 108 +++++++++++++++++++++++++++------------------------- fs/nfs/delegation.c | 28 +++++--------- fs/nfs/nfs4state.c | 52 +++++-------------------- fs/nfs/pagelist.c | 8 ++-- fs/nfs/write.c | 30 +++++++-------- fs/nfsd/nfs4state.c | 18 +++++---- fs/read_write.c | 2 +- include/linux/fs.h | 3 +- 11 files changed, 155 insertions(+), 198 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 917656ea8dcf..19beeed83233 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -253,18 +253,15 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) *fcntl_count = 0; *flock_count = 0; - spin_lock(&inode->i_lock); - for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { - if (lock->fl_flags & FL_POSIX) - ++(*fcntl_count); - } - ctx = inode->i_flctx; if (ctx) { + spin_lock(&inode->i_lock); + list_for_each_entry(lock, &ctx->flc_posix, fl_list) + ++(*fcntl_count); list_for_each_entry(lock, &ctx->flc_flock, fl_list) ++(*flock_count); + spin_unlock(&inode->i_lock); } - spin_unlock(&inode->i_lock); dout("counted %d flock locks and %d fcntl locks", *flock_count, 
*fcntl_count); } @@ -279,7 +276,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, int num_fcntl_locks, int num_flock_locks) { struct file_lock *lock; - struct file_lock_context *ctx; + struct file_lock_context *ctx = inode->i_flctx; int err = 0; int seen_fcntl = 0; int seen_flock = 0; @@ -288,34 +285,31 @@ int ceph_encode_locks_to_buffer(struct inode *inode, dout("encoding %d flock and %d fcntl locks", num_flock_locks, num_fcntl_locks); + if (!ctx) + return 0; + spin_lock(&inode->i_lock); - for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { - if (lock->fl_flags & FL_POSIX) { - ++seen_fcntl; - if (seen_fcntl > num_fcntl_locks) { - err = -ENOSPC; - goto fail; - } - err = lock_to_ceph_filelock(lock, &flocks[l]); - if (err) - goto fail; - ++l; + list_for_each_entry(lock, &ctx->flc_posix, fl_list) { + ++seen_fcntl; + if (seen_fcntl > num_fcntl_locks) { + err = -ENOSPC; + goto fail; } + err = lock_to_ceph_filelock(lock, &flocks[l]); + if (err) + goto fail; + ++l; } - - ctx = inode->i_flctx; - if (ctx) { - list_for_each_entry(lock, &ctx->flc_flock, fl_list) { - ++seen_flock; - if (seen_flock > num_flock_locks) { - err = -ENOSPC; - goto fail; - } - err = lock_to_ceph_filelock(lock, &flocks[l]); - if (err) - goto fail; - ++l; + list_for_each_entry(lock, &ctx->flc_flock, fl_list) { + ++seen_flock; + if (seen_flock > num_flock_locks) { + err = -ENOSPC; + goto fail; } + err = lock_to_ceph_filelock(lock, &flocks[l]); + if (err) + goto fail; + ++l; } fail: spin_unlock(&inode->i_lock); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 96b7e9b7706d..ea78f6f81ce2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1109,11 +1109,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) return rc; } -/* copied from fs/locks.c with a name change */ -#define cifs_for_each_lock(inode, lockp) \ - for (lockp = &inode->i_flock; *lockp != NULL; \ - lockp = &(*lockp)->fl_next) - struct lock_to_push { struct list_head llist; __u64 offset; @@ -1128,8 +1123,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) { struct inode *inode = cfile->dentry->d_inode; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - struct file_lock *flock, **before; - unsigned int count = 0, i = 0; + struct file_lock *flock; + struct file_lock_context *flctx = inode->i_flctx; + unsigned int count = 0, i; int rc = 0, xid, type; struct list_head locks_to_send, *el; struct lock_to_push *lck, *tmp; @@ -1137,10 +1133,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) xid = get_xid(); + if (!flctx) + goto out; + spin_lock(&inode->i_lock); - cifs_for_each_lock(inode, before) { - if ((*before)->fl_flags & FL_POSIX) - count++; + list_for_each(el, &flctx->flc_posix) { + count++; } spin_unlock(&inode->i_lock); @@ -1151,7 +1149,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) * added to the list while we are holding cinode->lock_sem that * protects locking operations of this inode. */ - for (; i < count; i++) { + for (i = 0; i < count; i++) { lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL); if (!lck) { rc = -ENOMEM; goto err_out; } @@ -1162,10 +1160,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) el = locks_to_send.next; spin_lock(&inode->i_lock); - cifs_for_each_lock(inode, before) { - flock = *before; - if ((flock->fl_flags & FL_POSIX) == 0) - continue; + list_for_each_entry(flock, &flctx->flc_posix, fl_list) { if (el == &locks_to_send) { /* * The list ended. 
We don't have enough allocated @@ -1185,7 +1180,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) lck->length = length; lck->type = type; lck->offset = flock->fl_start; - el = el->next; } spin_unlock(&inode->i_lock); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index d12ff4e2dbe7..5300bb53835f 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -164,12 +164,15 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; + struct file_lock_context *flctx = inode->i_flctx; struct nlm_host *lockhost; + if (!flctx || list_empty_careful(&flctx->flc_posix)) + return 0; again: file->f_locks = 0; spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl; fl = fl->fl_next) { + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { if (fl->fl_lmops != &nlmsvc_lock_operations) continue; @@ -223,18 +226,21 @@ nlm_file_inuse(struct nlm_file *file) { struct inode *inode = nlmsvc_file_inode(file); struct file_lock *fl; + struct file_lock_context *flctx = inode->i_flctx; if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) return 1; - spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl; fl = fl->fl_next) { - if (fl->fl_lmops == &nlmsvc_lock_operations) { - spin_unlock(&inode->i_lock); - return 1; + if (flctx && !list_empty_careful(&flctx->flc_posix)) { + spin_lock(&inode->i_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + if (fl->fl_lmops == &nlmsvc_lock_operations) { + spin_unlock(&inode->i_lock); + return 1; + } } + spin_unlock(&inode->i_lock); } - spin_unlock(&inode->i_lock); file->f_locks = 0; return 0; } diff --git a/fs/locks.c b/fs/locks.c index 055df53f19de..e50bb4d9e757 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -157,9 +157,6 @@ static int target_leasetype(struct file_lock *fl) int leases_enable = 1; int lease_break_time = 45; -#define for_each_lock(inode, lockp) \ - for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next) - /* * The global file_lock_list is only used for displaying /proc/locks, so we * keep a list on each CPU, with each list protected by its own spinlock via @@ -218,6 +215,7 @@ locks_get_lock_context(struct inode *inode) goto out; INIT_LIST_HEAD(&new->flc_flock); + INIT_LIST_HEAD(&new->flc_posix); /* * Assign the pointer if it's not already assigned. 
If it is, then @@ -241,6 +239,7 @@ locks_free_lock_context(struct file_lock_context *ctx) { if (ctx) { WARN_ON_ONCE(!list_empty(&ctx->flc_flock)); + WARN_ON_ONCE(!list_empty(&ctx->flc_posix)); kmem_cache_free(flctx_cache, ctx); } } @@ -809,21 +808,26 @@ void posix_test_lock(struct file *filp, struct file_lock *fl) { struct file_lock *cfl; + struct file_lock_context *ctx; struct inode *inode = file_inode(filp); + ctx = inode->i_flctx; + if (!ctx || list_empty_careful(&ctx->flc_posix)) { + fl->fl_type = F_UNLCK; + return; + } + spin_lock(&inode->i_lock); - for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) { - if (!IS_POSIX(cfl)) - continue; - if (posix_locks_conflict(fl, cfl)) - break; + list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { + if (posix_locks_conflict(fl, cfl)) { + locks_copy_conflock(fl, cfl); + if (cfl->fl_nspid) + fl->fl_pid = pid_vnr(cfl->fl_nspid); + goto out; + } } - if (cfl) { - locks_copy_conflock(fl, cfl); - if (cfl->fl_nspid) - fl->fl_pid = pid_vnr(cfl->fl_nspid); - } else - fl->fl_type = F_UNLCK; + fl->fl_type = F_UNLCK; +out: spin_unlock(&inode->i_lock); return; } @@ -983,16 +987,20 @@ out: static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) { - struct file_lock *fl; + struct file_lock *fl, *tmp; struct file_lock *new_fl = NULL; struct file_lock *new_fl2 = NULL; struct file_lock *left = NULL; struct file_lock *right = NULL; - struct file_lock **before; + struct file_lock_context *ctx; int error; bool added = false; LIST_HEAD(dispose); + ctx = locks_get_lock_context(inode); + if (!ctx) + return -ENOMEM; + /* * We may need two file_lock structures for this operation, * so we get them in advance to avoid races. @@ -1013,8 +1021,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str * blocker's list of waiters and the global blocked_hash. */ if (request->fl_type != F_UNLCK) { - for_each_lock(inode, before) { - fl = *before; + list_for_each_entry(fl, &ctx->flc_posix, fl_list) { if (!IS_POSIX(fl)) continue; if (!posix_locks_conflict(request, fl)) @@ -1044,29 +1051,25 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str if (request->fl_flags & FL_ACCESS) goto out; - /* - * Find the first old lock with the same owner as the new lock. - */ - - before = &inode->i_flock; - - /* First skip locks owned by other processes. */ - while ((fl = *before) && (!IS_POSIX(fl) || - !posix_same_owner(request, fl))) { - before = &fl->fl_next; + /* Find the first old lock with the same owner as the new lock */ + list_for_each_entry(fl, &ctx->flc_posix, fl_list) { + if (posix_same_owner(request, fl)) + break; } /* Process locks with this owner. */ - while ((fl = *before) && posix_same_owner(request, fl)) { - /* Detect adjacent or overlapping regions (if same lock type) - */ + list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) { + if (!posix_same_owner(request, fl)) + break; + + /* Detect adjacent or overlapping regions (if same lock type) */ if (request->fl_type == fl->fl_type) { /* In all comparisons of start vs end, use * "start - 1" rather than "end + 1". If end * is OFFSET_MAX, end + 1 will become negative. */ if (fl->fl_end < request->fl_start - 1) - goto next_lock; + continue; /* If the next lock in the list has entirely bigger * addresses than the new one, insert the lock here. 
*/ @@ -1087,18 +1090,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str else request->fl_end = fl->fl_end; if (added) { - locks_delete_lock(before, &dispose); + locks_delete_lock_ctx(fl, &dispose); continue; } request = fl; added = true; - } - else { + } else { /* Processing for different lock types is a bit * more complex. */ if (fl->fl_end < request->fl_start) - goto next_lock; + continue; if (fl->fl_start > request->fl_end) break; if (request->fl_type == F_UNLCK) @@ -1117,7 +1119,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str * one (This may happen several times). */ if (added) { - locks_delete_lock(before, &dispose); + locks_delete_lock_ctx(fl, &dispose); continue; } /* @@ -1133,15 +1135,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_copy_lock(new_fl, request); request = new_fl; new_fl = NULL; - locks_delete_lock(before, &dispose); - locks_insert_lock(before, request); + locks_insert_lock_ctx(request, &fl->fl_list); + locks_delete_lock_ctx(fl, &dispose); added = true; } } - /* Go on to next lock. - */ - next_lock: - before = &fl->fl_next; } /* @@ -1166,7 +1164,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str goto out; } locks_copy_lock(new_fl, request); - locks_insert_lock(before, new_fl); + locks_insert_lock_ctx(new_fl, &fl->fl_list); new_fl = NULL; } if (right) { @@ -1177,7 +1175,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); - locks_insert_lock(before, left); + locks_insert_lock_ctx(left, &fl->fl_list); } right->fl_start = request->fl_end + 1; locks_wake_up_blocks(right); @@ -1257,22 +1255,29 @@ EXPORT_SYMBOL(posix_lock_file_wait); */ int locks_mandatory_locked(struct file *file) { + int ret; struct inode *inode = file_inode(file); + struct file_lock_context *ctx; struct file_lock *fl; + ctx = inode->i_flctx; + if (!ctx || list_empty_careful(&ctx->flc_posix)) + return 0; + /* * Search the lock list for this inode for any POSIX locks. */ spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (!IS_POSIX(fl)) - continue; + ret = 0; + list_for_each_entry(fl, &ctx->flc_posix, fl_list) { if (fl->fl_owner != current->files && - fl->fl_owner != file) + fl->fl_owner != file) { + ret = -EAGAIN; break; + } } spin_unlock(&inode->i_lock); - return fl ? -EAGAIN : 0; + return ret; } /** @@ -2389,13 +2394,14 @@ out: void locks_remove_posix(struct file *filp, fl_owner_t owner) { struct file_lock lock; + struct file_lock_context *ctx = file_inode(filp)->i_flctx; /* * If there are no locks held on this file, we don't need to call * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. 
*/ - if (!file_inode(filp)->i_flock) + if (!ctx || list_empty(&ctx->flc_posix)) return; lock.fl_type = F_UNLCK; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 9f9f67b17e2b..3fb1caa3874d 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -85,17 +85,17 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ { struct inode *inode = state->inode; struct file_lock *fl; - struct file_lock_context *flctx; + struct file_lock_context *flctx = inode->i_flctx; + struct list_head *list; int status = 0; - if (inode->i_flock == NULL && inode->i_flctx == NULL) + if (flctx == NULL) goto out; - /* Protect inode->i_flock using the i_lock */ + list = &flctx->flc_posix; spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (!(fl->fl_flags & (FL_POSIX))) - continue; +restart: + list_for_each_entry(fl, list, fl_list) { if (nfs_file_open_context(fl->fl_file) != ctx) continue; spin_unlock(&inode->i_lock); @@ -104,19 +104,9 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ goto out; spin_lock(&inode->i_lock); } - - flctx = inode->i_flctx; - if (flctx) { - list_for_each_entry(fl, &flctx->flc_flock, fl_list) { - if (nfs_file_open_context(fl->fl_file) != ctx) - continue; - spin_unlock(&inode->i_lock); - status = nfs4_lock_delegation_recall(fl, state, - stateid); - if (status < 0) - goto out; - spin_lock(&inode->i_lock); - } + if (list == &flctx->flc_posix) { + list = &flctx->flc_flock; + goto restart; } spin_unlock(&inode->i_lock); out: diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 65c404bf61ae..6084c267f3a0 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1367,53 +1367,18 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ struct file_lock *fl; int status = 0; struct file_lock_context *flctx = inode->i_flctx; + struct list_head *list; - if (inode->i_flock == NULL && flctx == NULL) + if (flctx == NULL) return 0; + list = &flctx->flc_posix; + /* Guard against delegation returns and new lock/unlock calls */ down_write(&nfsi->rwsem); - /* Protect inode->i_flock using the BKL */ spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (!(fl->fl_flags & FL_POSIX)) - continue; - if (nfs_file_open_context(fl->fl_file)->state != state) - continue; - spin_unlock(&inode->i_lock); - status = ops->recover_lock(state, fl); - switch (status) { - case 0: - break; - case -ESTALE: - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_NO_GRACE: - case -NFS4ERR_STALE_CLIENTID: - case -NFS4ERR_BADSESSION: - case -NFS4ERR_BADSLOT: - case -NFS4ERR_BAD_HIGH_SLOT: - case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: - goto out; - default: - printk(KERN_ERR "NFS: %s: unhandled error %d\n", - __func__, status); - case -ENOMEM: - case -NFS4ERR_DENIED: - case -NFS4ERR_RECLAIM_BAD: - case -NFS4ERR_RECLAIM_CONFLICT: - /* kill_proc(fl->fl_pid, SIGLOST, 1); */ - status = 0; - } - spin_lock(&inode->i_lock); - } - - if (!flctx) - goto out_unlock; - - list_for_each_entry(fl, &flctx->flc_flock, fl_list) { +restart: + list_for_each_entry(fl, list, fl_list) { if (nfs_file_open_context(fl->fl_file)->state != state) continue; spin_unlock(&inode->i_lock); @@ -1445,7 +1410,10 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ } spin_lock(&inode->i_lock); } -out_unlock: + if (list == &flctx->flc_posix) { + list = &flctx->flc_flock; + goto restart; 
+ } spin_unlock(&inode->i_lock); out: up_write(&nfsi->rwsem); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index a3b62e15b444..29c7f33c9cf1 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -831,12 +831,10 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, if (prev) { if (!nfs_match_open_context(req->wb_context, prev->wb_context)) return false; - if (req->wb_context->dentry->d_inode->i_flock != NULL && - !nfs_match_lock_context(req->wb_lock_context, - prev->wb_lock_context)) - return false; flctx = req->wb_context->dentry->d_inode->i_flctx; - if (flctx != NULL && !list_empty_careful(&flctx->flc_flock) && + if (flctx != NULL && + !(list_empty_careful(&flctx->flc_posix) && + list_empty_careful(&flctx->flc_flock)) && !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context)) return false; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e072aeb34195..784c13485b3f 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1091,6 +1091,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) { struct nfs_open_context *ctx = nfs_file_open_context(file); struct nfs_lock_context *l_ctx; + struct file_lock_context *flctx = file_inode(file)->i_flctx; struct nfs_page *req; int do_flush, status; /* @@ -1109,12 +1110,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page) do_flush = req->wb_page != page || req->wb_context != ctx; /* for now, flush if more than 1 request in page_group */ do_flush |= req->wb_this_page != req; - if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) { - do_flush |= l_ctx->lockowner.l_owner != current->files - || l_ctx->lockowner.l_pid != current->tgid; - } - if (l_ctx && ctx->dentry->d_inode->i_flctx && - !list_empty_careful(&ctx->dentry->d_inode->i_flctx->flc_flock)) { + if (l_ctx && flctx && + !(list_empty_careful(&flctx->flc_posix) && + list_empty_careful(&flctx->flc_flock))) { do_flush |= l_ctx->lockowner.l_owner != current->files || l_ctx->lockowner.l_pid != current->tgid; } @@ -1202,26 +1200,24 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino return 0; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) return 1; - if (!inode->i_flock && !flctx) + if (!flctx || (list_empty_careful(&flctx->flc_flock) && + list_empty_careful(&flctx->flc_posix))) return 0; /* Check to see if there are whole file write locks */ - spin_lock(&inode->i_lock); ret = 0; - - fl = inode->i_flock; - if (fl && is_whole_file_wrlock(fl)) { - ret = 1; - goto out; - } - - if (!list_empty(&flctx->flc_flock)) { + spin_lock(&inode->i_lock); + if (!list_empty(&flctx->flc_posix)) { + fl = list_first_entry(&flctx->flc_posix, struct file_lock, + fl_list); + if (is_whole_file_wrlock(fl)) + ret = 1; + } else if (!list_empty(&flctx->flc_flock)) { fl = list_first_entry(&flctx->flc_flock, struct file_lock, fl_list); if (fl->fl_type == F_WRLCK) ret = 1; } -out: spin_unlock(&inode->i_lock); return ret; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c06a1ba80d73..fad821991369 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5556,10 +5556,11 @@ out_nfserr: static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { - struct file_lock **flpp; + struct file_lock *fl; int status = false; struct file *filp = find_any_file(fp); struct inode *inode; + struct file_lock_context *flctx; if (!filp) { /* Any valid lock stateid should have some sort of access */ @@ -5568,15 +5569,18 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } inode = 
file_inode(filp); + flctx = inode->i_flctx; - spin_lock(&inode->i_lock); - for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { - if ((*flpp)->fl_owner == (fl_owner_t)lowner) { - status = true; - break; + if (flctx && !list_empty_careful(&flctx->flc_posix)) { + spin_lock(&inode->i_lock); + list_for_each_entry(fl, &flctx->flc_posix, fl_list) { + if (fl->fl_owner == (fl_owner_t)lowner) { + status = true; + break; + } } + spin_unlock(&inode->i_lock); } - spin_unlock(&inode->i_lock); fput(filp); return status; } diff --git a/fs/read_write.c b/fs/read_write.c index c0805c93b6fa..4060691e78f7 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -358,7 +358,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t return retval; } - if (unlikely(inode->i_flock && mandatory_lock(inode))) { + if (unlikely(inode->i_flctx && mandatory_lock(inode))) { retval = locks_mandatory_area( read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); diff --git a/include/linux/fs.h b/include/linux/fs.h index dec0d38b05de..571f113588e9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -968,6 +968,7 @@ struct file_lock { struct file_lock_context { struct list_head flc_flock; + struct list_head flc_posix; }; /* The following constant reflects the upper bound of the file/locking space */ @@ -1971,7 +1972,7 @@ static inline int locks_verify_truncate(struct inode *inode, struct file *filp, loff_t size) { - if (inode->i_flock && mandatory_lock(inode)) + if (inode->i_flctx && mandatory_lock(inode)) return locks_mandatory_area( FLOCK_VERIFY_WRITE, inode, filp, size < inode->i_size ? size : inode->i_size, -- cgit v1.2.3 From 8634b51f6ca298fb8b07aa4847340764903533ab Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:55 -0500 Subject: locks: convert lease handling to file_lock_context Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- fs/locks.c | 252 +++++++++++++++++++++-------------------------------- include/linux/fs.h | 5 +- 2 files changed, 102 insertions(+), 155 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index e50bb4d9e757..d46e70567b99 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -216,6 +216,7 @@ locks_get_lock_context(struct inode *inode) INIT_LIST_HEAD(&new->flc_flock); INIT_LIST_HEAD(&new->flc_posix); + INIT_LIST_HEAD(&new->flc_lease); /* * Assign the pointer if it's not already assigned. If it is, then @@ -240,6 +241,7 @@ locks_free_lock_context(struct file_lock_context *ctx) if (ctx) { WARN_ON_ONCE(!list_empty(&ctx->flc_flock)); WARN_ON_ONCE(!list_empty(&ctx->flc_posix)); + WARN_ON_ONCE(!list_empty(&ctx->flc_lease)); kmem_cache_free(flctx_cache, ctx); } } @@ -677,22 +679,6 @@ static void locks_wake_up_blocks(struct file_lock *blocker) spin_unlock(&blocked_lock_lock); } -/* Insert file lock fl into an inode's lock list at the position indicated - * by pos. At the same time add the lock to the global file lock list. - * - * Must be called with the i_lock held! - */ -static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl) -{ - fl->fl_nspid = get_pid(task_tgid(current)); - - /* insert into file's list */ - fl->fl_next = *pos; - *pos = fl; - - locks_insert_global_locks(fl); -} - static void locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) { @@ -701,63 +687,28 @@ locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) locks_insert_global_locks(fl); } -/** - * locks_delete_lock - Delete a lock and then free it. 
- * @thisfl_p: pointer that points to the fl_next field of the previous - * inode->i_flock list entry - * - * Unlink a lock from all lists and free the namespace reference, but don't - * free it yet. Wake up processes that are blocked waiting for this lock and - * notify the FS that the lock has been cleared. - * - * Must be called with the i_lock held! - */ -static void locks_unlink_lock(struct file_lock **thisfl_p) +static void +locks_unlink_lock_ctx(struct file_lock *fl) { - struct file_lock *fl = *thisfl_p; - locks_delete_global_locks(fl); - - *thisfl_p = fl->fl_next; - fl->fl_next = NULL; - + list_del_init(&fl->fl_list); if (fl->fl_nspid) { put_pid(fl->fl_nspid); fl->fl_nspid = NULL; } - locks_wake_up_blocks(fl); } -/* - * Unlink a lock from all lists and free it. - * - * Must be called with i_lock held! - */ -static void locks_delete_lock(struct file_lock **thisfl_p, - struct list_head *dispose) +static void +locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose) { - struct file_lock *fl = *thisfl_p; - - locks_unlink_lock(thisfl_p); + locks_unlink_lock_ctx(fl); if (dispose) list_add(&fl->fl_list, dispose); else locks_free_lock(fl); } -static void -locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose) -{ - locks_delete_global_locks(fl); - if (fl->fl_nspid) { - put_pid(fl->fl_nspid); - fl->fl_nspid = NULL; - } - locks_wake_up_blocks(fl); - list_move(&fl->fl_list, dispose); -} - /* Determine if lock sys_fl blocks lock caller_fl. Common functionality * checks for shared/exclusive status of overlapping locks. */ @@ -1376,7 +1327,7 @@ int lease_modify(struct file_lock **before, int arg, struct list_head *dispose) printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); fl->fl_fasync = NULL; } - locks_delete_lock(before, dispose); + locks_delete_lock_ctx(fl, dispose); } return 0; } @@ -1392,20 +1343,17 @@ static bool past_time(unsigned long then) static void time_out_leases(struct inode *inode, struct list_head *dispose) { - struct file_lock **before; - struct file_lock *fl; + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *fl, *tmp; lockdep_assert_held(&inode->i_lock); - before = &inode->i_flock; - while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) - lease_modify(before, F_RDLCK, dispose); + lease_modify(&fl, F_RDLCK, dispose); if (past_time(fl->fl_break_time)) - lease_modify(before, F_UNLCK, dispose); - if (fl == *before) /* lease_modify may have freed fl */ - before = &fl->fl_next; + lease_modify(&fl, F_UNLCK, dispose); } } @@ -1419,11 +1367,12 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) static bool any_leases_conflict(struct inode *inode, struct file_lock *breaker) { + struct file_lock_context *ctx = inode->i_flctx; struct file_lock *fl; lockdep_assert_held(&inode->i_lock); - for (fl = inode->i_flock ; fl && IS_LEASE(fl); fl = fl->fl_next) { + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (leases_conflict(fl, breaker)) return true; } @@ -1447,7 +1396,8 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { int error = 0; struct file_lock *new_fl; - struct file_lock *fl, **before; + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *fl; unsigned long break_time; int want_write = (mode & O_ACCMODE) != O_RDONLY; LIST_HEAD(dispose); @@ -1457,6 +1407,12 @@ int __break_lease(struct inode 
*inode, unsigned int mode, unsigned int type) return PTR_ERR(new_fl); new_fl->fl_flags = type; + /* typically we will check that ctx is non-NULL before calling */ + if (!ctx) { + WARN_ON_ONCE(1); + return error; + } + spin_lock(&inode->i_lock); time_out_leases(inode, &dispose); @@ -1471,9 +1427,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) break_time++; /* so that 0 means no break time */ } - for (before = &inode->i_flock; - ((fl = *before) != NULL) && IS_LEASE(fl); - before = &fl->fl_next) { + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (!leases_conflict(fl, new_fl)) continue; if (want_write) { @@ -1482,17 +1436,16 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) fl->fl_flags |= FL_UNLOCK_PENDING; fl->fl_break_time = break_time; } else { - if (lease_breaking(inode->i_flock)) + if (lease_breaking(fl)) continue; fl->fl_flags |= FL_DOWNGRADE_PENDING; fl->fl_downgrade_time = break_time; } if (fl->fl_lmops->lm_break(fl)) - locks_delete_lock(before, &dispose); + locks_delete_lock_ctx(fl, &dispose); } - fl = inode->i_flock; - if (!fl || !IS_LEASE(fl)) + if (list_empty(&ctx->flc_lease)) goto out; if (mode & O_NONBLOCK) { @@ -1502,12 +1455,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) } restart: - break_time = inode->i_flock->fl_break_time; + fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list); + break_time = fl->fl_break_time; if (break_time != 0) break_time -= jiffies; if (break_time == 0) break_time++; - locks_insert_block(inode->i_flock, new_fl); + locks_insert_block(fl, new_fl); trace_break_lease_block(inode, new_fl); spin_unlock(&inode->i_lock); locks_dispose_list(&dispose); @@ -1525,10 +1479,8 @@ restart: time_out_leases(inode, &dispose); if (any_leases_conflict(inode, new_fl)) goto restart; - error = 0; } - out: spin_unlock(&inode->i_lock); locks_dispose_list(&dispose); @@ -1550,13 +1502,17 @@ EXPORT_SYMBOL(__break_lease); void lease_get_mtime(struct inode *inode, struct timespec *time) { bool has_lease = false; - struct file_lock *flock; + struct file_lock_context *ctx = inode->i_flctx; + struct file_lock *fl; - if (inode->i_flock) { + if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&inode->i_lock); - flock = inode->i_flock; - if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK)) - has_lease = true; + if (!list_empty(&ctx->flc_lease)) { + fl = list_first_entry(&ctx->flc_lease, + struct file_lock, fl_list); + if (fl->fl_type == F_WRLCK) + has_lease = true; + } spin_unlock(&inode->i_lock); } @@ -1595,20 +1551,22 @@ int fcntl_getlease(struct file *filp) { struct file_lock *fl; struct inode *inode = file_inode(filp); + struct file_lock_context *ctx = inode->i_flctx; int type = F_UNLCK; LIST_HEAD(dispose); - spin_lock(&inode->i_lock); - time_out_leases(file_inode(filp), &dispose); - for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl); - fl = fl->fl_next) { - if (fl->fl_file == filp) { + if (ctx && !list_empty_careful(&ctx->flc_lease)) { + spin_lock(&inode->i_lock); + time_out_leases(file_inode(filp), &dispose); + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (fl->fl_file != filp) + continue; type = target_leasetype(fl); break; } + spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); } - spin_unlock(&inode->i_lock); - locks_dispose_list(&dispose); return type; } @@ -1641,9 +1599,10 @@ check_conflicting_open(const struct dentry *dentry, const long arg) static int generic_add_lease(struct file *filp, long arg, struct file_lock 
**flp, void **priv) { - struct file_lock *fl, **before, **my_before = NULL, *lease; + struct file_lock *fl, *my_fl = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + struct file_lock_context *ctx; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; LIST_HEAD(dispose); @@ -1651,6 +1610,10 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr lease = *flp; trace_generic_add_lease(inode, lease); + ctx = locks_get_lock_context(inode); + if (!ctx) + return -ENOMEM; + /* * In the delegation case we need mutual exclusion with * a number of operations that take the i_mutex. We trylock @@ -1684,13 +1647,12 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr * except for this filp. */ error = -EAGAIN; - for (before = &inode->i_flock; - ((fl = *before) != NULL) && IS_LEASE(fl); - before = &fl->fl_next) { + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file == filp) { - my_before = before; + my_fl = fl; continue; } + /* * No exclusive leases if someone else has a lease on * this file: @@ -1705,9 +1667,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr goto out; } - if (my_before != NULL) { - lease = *my_before; - error = lease->fl_lmops->lm_change(my_before, arg, &dispose); + if (my_fl != NULL) { + error = lease->fl_lmops->lm_change(&my_fl, arg, &dispose); if (error) goto out; goto out_setup; @@ -1717,7 +1678,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr if (!leases_enable) goto out; - locks_insert_lock(before, lease); + locks_insert_lock_ctx(lease, &ctx->flc_lease); /* * The check in break_lease() is lockless. It's possible for another * open to race in after we did the earlier check for a conflicting @@ -1729,8 +1690,10 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr */ smp_mb(); error = check_conflicting_open(dentry, arg); - if (error) - goto out_unlink; + if (error) { + locks_unlink_lock_ctx(lease); + goto out; + } out_setup: if (lease->fl_lmops->lm_setup) @@ -1740,33 +1703,35 @@ out: locks_dispose_list(&dispose); if (is_deleg) mutex_unlock(&inode->i_mutex); - if (!error && !my_before) + if (!error && !my_fl) *flp = NULL; return error; -out_unlink: - locks_unlink_lock(before); - goto out; } static int generic_delete_lease(struct file *filp) { int error = -EAGAIN; - struct file_lock *fl, **before; + struct file_lock *fl, *victim = NULL; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + struct file_lock_context *ctx = inode->i_flctx; LIST_HEAD(dispose); + if (!ctx) { + trace_generic_delete_lease(inode, NULL); + return error; + } + spin_lock(&inode->i_lock); - time_out_leases(inode, &dispose); - for (before = &inode->i_flock; - ((fl = *before) != NULL) && IS_LEASE(fl); - before = &fl->fl_next) { - if (fl->fl_file == filp) + list_for_each_entry(fl, &ctx->flc_lease, fl_list) { + if (fl->fl_file == filp) { + victim = fl; break; + } } trace_generic_delete_lease(inode, fl); - if (fl && IS_LEASE(fl)) - error = fl->fl_lmops->lm_change(before, F_UNLCK, &dispose); + if (victim) + error = fl->fl_lmops->lm_change(&victim, F_UNLCK, &dispose); spin_unlock(&inode->i_lock); locks_dispose_list(&dispose); return error; @@ -2447,56 +2412,37 @@ locks_remove_flock(struct file *filp) fl.fl_ops->fl_release_private(&fl); } +static void +locks_remove_lease(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct file_lock_context 
*ctx = inode->i_flctx; + struct file_lock *fl, *tmp; + LIST_HEAD(dispose); + + if (!ctx || list_empty(&ctx->flc_lease)) + return; + + spin_lock(&inode->i_lock); + list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) + lease_modify(&fl, F_UNLCK, &dispose); + spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); +} + /* * This function is called on the last close of an open file. */ void locks_remove_file(struct file *filp) { - struct inode * inode = file_inode(filp); - struct file_lock *fl; - struct file_lock **before; - LIST_HEAD(dispose); - /* remove any OFD locks */ locks_remove_posix(filp, filp); /* remove flock locks */ locks_remove_flock(filp); - if (!inode->i_flock) - return; - - spin_lock(&inode->i_lock); - before = &inode->i_flock; - - while ((fl = *before) != NULL) { - if (fl->fl_file == filp) { - if (IS_LEASE(fl)) { - lease_modify(before, F_UNLCK, &dispose); - continue; - } - - /* - * There's a leftover lock on the list of a type that - * we didn't expect to see. Most likely a classic - * POSIX lock that ended up not getting released - * properly, or that raced onto the list somehow. Log - * some info about it and then just remove it from - * the list. - */ - WARN(1, "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n", - MAJOR(inode->i_sb->s_dev), - MINOR(inode->i_sb->s_dev), inode->i_ino, - fl->fl_type, fl->fl_flags, - fl->fl_start, fl->fl_end); - - locks_delete_lock(before, &dispose); - continue; - } - before = &fl->fl_next; - } - spin_unlock(&inode->i_lock); - locks_dispose_list(&dispose); + /* remove any leases */ + locks_remove_lease(filp); } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index 571f113588e9..2ddec3cf81b9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -969,6 +969,7 @@ struct file_lock { struct file_lock_context { struct list_head flc_flock; struct list_head flc_posix; + struct list_head flc_lease; }; /* The following constant reflects the upper bound of the file/locking space */ @@ -1990,7 +1991,7 @@ static inline int break_lease(struct inode *inode, unsigned int mode) * end up racing with tasks trying to set a new lease on this file. */ smp_mb(); - if (inode->i_flock) + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) return __break_lease(inode, mode, FL_LEASE); return 0; } @@ -2003,7 +2004,7 @@ static inline int break_deleg(struct inode *inode, unsigned int mode) * end up racing with tasks trying to set a new lease on this file. */ smp_mb(); - if (inode->i_flock) + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) return __break_lease(inode, mode, FL_DELEG); return 0; } -- cgit v1.2.3 From a7231a97467d5a0c36f82f581c76c12c034e4b80 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:56 -0500 Subject: locks: remove i_flock field from struct inode Nothing uses it anymore. Also add a forward declaration for struct file_lock to silence some compiler warnings that the removal triggers. 
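The warnings arise because, with the i_flock member gone, struct file_lock is mentioned in include/linux/fs.h only inside prototypes and pointer declarations; without a file-scope declaration of the tag, the compiler warns that a struct declared inside a parameter list has limited scope. A minimal sketch of the pattern (hypothetical stand-in names, not the kernel headers themselves):

struct file_lock;	/* incomplete type; enough for pointer parameters */

struct example_lock_ops {
	void (*copy)(struct file_lock *dst, const struct file_lock *src);
	void (*release)(struct file_lock *fl);
};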
Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2ddec3cf81b9..ce0873af0b97 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -625,7 +625,6 @@ struct inode { atomic_t i_readcount; /* struct files open RO */ #endif const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ - struct file_lock *i_flock; struct file_lock_context *i_flctx; struct address_space i_data; struct list_head i_devices; @@ -886,6 +885,8 @@ static inline struct file *get_file(struct file *f) /* legacy typedef, should eventually be removed */ typedef void *fl_owner_t; +struct file_lock; + struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); -- cgit v1.2.3 From 6109c85037e53443f29fd39c0de69f578a1cf285 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:57 -0500 Subject: locks: add a dedicated spinlock to protect i_flctx lists We can now add a dedicated spinlock without expanding struct inode. Change to using that to protect the various i_flctx lists. Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- fs/ceph/locks.c | 8 ++--- fs/cifs/file.c | 8 ++--- fs/lockd/svcsubs.c | 12 ++++---- fs/locks.c | 87 +++++++++++++++++++++++++++-------------------------- fs/nfs/delegation.c | 8 ++--- fs/nfs/nfs4state.c | 8 ++--- fs/nfs/write.c | 4 +-- fs/nfsd/nfs4state.c | 4 +-- include/linux/fs.h | 1 + 9 files changed, 71 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 19beeed83233..0303da8e3233 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -255,12 +255,12 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) ctx = inode->i_flctx; if (ctx) { - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); list_for_each_entry(lock, &ctx->flc_posix, fl_list) ++(*fcntl_count); list_for_each_entry(lock, &ctx->flc_flock, fl_list) ++(*flock_count); - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); } dout("counted %d flock locks and %d fcntl locks", *flock_count, *fcntl_count); @@ -288,7 +288,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, if (!ctx) return 0; - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); list_for_each_entry(lock, &ctx->flc_flock, fl_list) { ++seen_fcntl; if (seen_fcntl > num_fcntl_locks) { @@ -312,7 +312,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, ++l; } fail: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); return err; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ea78f6f81ce2..b65166eb111e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1136,11 +1136,11 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) if (!flctx) goto out; - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); list_for_each(el, &flctx->flc_posix) { count++; } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); INIT_LIST_HEAD(&locks_to_send); @@ -1159,7 +1159,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) } el = locks_to_send.next; - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); list_for_each_entry(flock, &flctx->flc_posix, fl_list) { if (el == &locks_to_send) { /* @@ -1181,7 +1181,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) lck->type = type; lck->offset = flock->fl_start; } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); 
list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { int stored_rc; diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 5300bb53835f..665ef5a05183 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -171,7 +171,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, return 0; again: file->f_locks = 0; - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); list_for_each_entry(fl, &flctx->flc_posix, fl_list) { if (fl->fl_lmops != &nlmsvc_lock_operations) continue; @@ -183,7 +183,7 @@ again: if (match(lockhost, host)) { struct file_lock lock = *fl; - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; @@ -195,7 +195,7 @@ again: goto again; } } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); return 0; } @@ -232,14 +232,14 @@ nlm_file_inuse(struct nlm_file *file) return 1; if (flctx && !list_empty_careful(&flctx->flc_posix)) { - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); list_for_each_entry(fl, &flctx->flc_posix, fl_list) { if (fl->fl_lmops == &nlmsvc_lock_operations) { - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); return 1; } } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); } file->f_locks = 0; return 0; diff --git a/fs/locks.c b/fs/locks.c index d46e70567b99..a268d959ccd6 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -161,7 +161,7 @@ int lease_break_time = 45; * The global file_lock_list is only used for displaying /proc/locks, so we * keep a list on each CPU, with each list protected by its own spinlock via * the file_lock_lglock. Note that alterations to the list also require that - * the relevant i_lock is held. + * the relevant flc_lock is held. */ DEFINE_STATIC_LGLOCK(file_lock_lglock); static DEFINE_PER_CPU(struct hlist_head, file_lock_list); @@ -189,13 +189,13 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); * contrast to those that are acting as records of acquired locks). * * Note that when we acquire this lock in order to change the above fields, - * we often hold the i_lock as well. In certain cases, when reading the fields + * we often hold the flc_lock as well. In certain cases, when reading the fields * protected by this lock, we can skip acquiring it iff we already hold the - * i_lock. + * flc_lock. * * In particular, adding an entry to the fl_block list requires that you hold - * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting - * an entry from the list however only requires the file_lock_lock. + * both the flc_lock and the blocked_lock_lock (acquired in that order). + * Deleting an entry from the list however only requires the file_lock_lock. */ static DEFINE_SPINLOCK(blocked_lock_lock); @@ -214,6 +214,7 @@ locks_get_lock_context(struct inode *inode) if (!new) goto out; + spin_lock_init(&new->flc_lock); INIT_LIST_HEAD(&new->flc_flock); INIT_LIST_HEAD(&new->flc_posix); INIT_LIST_HEAD(&new->flc_lease); @@ -557,7 +558,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2) return fl1->fl_owner == fl2->fl_owner; } -/* Must be called with the i_lock held! */ +/* Must be called with the flc_lock held! */ static void locks_insert_global_locks(struct file_lock *fl) { lg_local_lock(&file_lock_lglock); @@ -566,12 +567,12 @@ static void locks_insert_global_locks(struct file_lock *fl) lg_local_unlock(&file_lock_lglock); } -/* Must be called with the i_lock held! */ +/* Must be called with the flc_lock held! 
*/ static void locks_delete_global_locks(struct file_lock *fl) { /* * Avoid taking lock if already unhashed. This is safe since this check - * is done while holding the i_lock, and new insertions into the list + * is done while holding the flc_lock, and new insertions into the list * also require that it be held. */ if (hlist_unhashed(&fl->fl_link)) @@ -623,10 +624,10 @@ static void locks_delete_block(struct file_lock *waiter) * the order they blocked. The documentation doesn't require this but * it seems like the reasonable thing to do. * - * Must be called with both the i_lock and blocked_lock_lock held. The fl_block - * list itself is protected by the blocked_lock_lock, but by ensuring that the - * i_lock is also held on insertions we can avoid taking the blocked_lock_lock - * in some cases when we see that the fl_block list is empty. + * Must be called with both the flc_lock and blocked_lock_lock held. The + * fl_block list itself is protected by the blocked_lock_lock, but by ensuring + * that the flc_lock is also held on insertions we can avoid taking the + * blocked_lock_lock in some cases when we see that the fl_block list is empty. */ static void __locks_insert_block(struct file_lock *blocker, struct file_lock *waiter) @@ -638,7 +639,7 @@ static void __locks_insert_block(struct file_lock *blocker, locks_insert_global_blocked(waiter); } -/* Must be called with i_lock held. */ +/* Must be called with flc_lock held. */ static void locks_insert_block(struct file_lock *blocker, struct file_lock *waiter) { @@ -650,15 +651,15 @@ static void locks_insert_block(struct file_lock *blocker, /* * Wake up processes blocked waiting for blocker. * - * Must be called with the inode->i_lock held! + * Must be called with the inode->flc_lock held! */ static void locks_wake_up_blocks(struct file_lock *blocker) { /* * Avoid taking global lock if list is empty. This is safe since new - * blocked requests are only added to the list under the i_lock, and - * the i_lock is always held here. Note that removal from the fl_block - * list does not require the i_lock, so we must recheck list_empty() + * blocked requests are only added to the list under the flc_lock, and + * the flc_lock is always held here. Note that removal from the fl_block + * list does not require the flc_lock, so we must recheck list_empty() * after acquiring the blocked_lock_lock. */ if (list_empty(&blocker->fl_block)) @@ -768,7 +769,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) return; } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { if (posix_locks_conflict(fl, cfl)) { locks_copy_conflock(fl, cfl); @@ -779,7 +780,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) } fl->fl_type = F_UNLCK; out: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); return; } EXPORT_SYMBOL(posix_test_lock); @@ -880,7 +881,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) return -ENOMEM; } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); if (request->fl_flags & FL_ACCESS) goto find_conflict; @@ -905,9 +906,9 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) * give it the opportunity to lock the file. 
*/ if (found) { - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); cond_resched(); - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); } find_conflict: @@ -929,7 +930,7 @@ find_conflict: error = 0; out: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); if (new_fl) locks_free_lock(new_fl); locks_dispose_list(&dispose); @@ -965,7 +966,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str new_fl2 = locks_alloc_lock(); } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); /* * New lock request. Walk all POSIX locks and look for conflicts. If * there are any, either return error or put the request on the @@ -1136,7 +1137,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_wake_up_blocks(left); } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); /* * Free any unused locks. */ @@ -1218,7 +1219,7 @@ int locks_mandatory_locked(struct file *file) /* * Search the lock list for this inode for any POSIX locks. */ - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); ret = 0; list_for_each_entry(fl, &ctx->flc_posix, fl_list) { if (fl->fl_owner != current->files && @@ -1227,7 +1228,7 @@ int locks_mandatory_locked(struct file *file) break; } } - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); return ret; } @@ -1346,7 +1347,7 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) struct file_lock_context *ctx = inode->i_flctx; struct file_lock *fl, *tmp; - lockdep_assert_held(&inode->i_lock); + lockdep_assert_held(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { trace_time_out_leases(inode, fl); @@ -1370,7 +1371,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker) struct file_lock_context *ctx = inode->i_flctx; struct file_lock *fl; - lockdep_assert_held(&inode->i_lock); + lockdep_assert_held(&ctx->flc_lock); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (leases_conflict(fl, breaker)) @@ -1413,7 +1414,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) return error; } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); @@ -1463,11 +1464,11 @@ restart: break_time++; locks_insert_block(fl, new_fl); trace_break_lease_block(inode, new_fl); - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); if (error >= 0) { @@ -1482,7 +1483,7 @@ restart: error = 0; } out: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); locks_free_lock(new_fl); return error; @@ -1506,14 +1507,14 @@ void lease_get_mtime(struct inode *inode, struct timespec *time) struct file_lock *fl; if (ctx && !list_empty_careful(&ctx->flc_lease)) { - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); if (!list_empty(&ctx->flc_lease)) { fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list); if (fl->fl_type == F_WRLCK) has_lease = true; } - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); } if (has_lease) @@ -1556,7 +1557,7 @@ int fcntl_getlease(struct file *filp) LIST_HEAD(dispose); if (ctx && !list_empty_careful(&ctx->flc_lease)) { - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); time_out_leases(file_inode(filp), &dispose); 
list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file != filp) @@ -1564,7 +1565,7 @@ int fcntl_getlease(struct file *filp) type = target_leasetype(fl); break; } - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); } return type; @@ -1632,7 +1633,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr return -EINVAL; } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); error = check_conflicting_open(dentry, arg); if (error) @@ -1699,7 +1700,7 @@ out_setup: if (lease->fl_lmops->lm_setup) lease->fl_lmops->lm_setup(lease, priv); out: - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); if (is_deleg) mutex_unlock(&inode->i_mutex); @@ -1722,7 +1723,7 @@ static int generic_delete_lease(struct file *filp) return error; } - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); list_for_each_entry(fl, &ctx->flc_lease, fl_list) { if (fl->fl_file == filp) { victim = fl; @@ -1732,7 +1733,7 @@ static int generic_delete_lease(struct file *filp) trace_generic_delete_lease(inode, fl); if (victim) error = fl->fl_lmops->lm_change(&victim, F_UNLCK, &dispose); - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); return error; } @@ -2423,10 +2424,10 @@ locks_remove_lease(struct file *filp) if (!ctx || list_empty(&ctx->flc_lease)) return; - spin_lock(&inode->i_lock); + spin_lock(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) lease_modify(&fl, F_UNLCK, &dispose); - spin_unlock(&inode->i_lock); + spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 3fb1caa3874d..8cdb2b28a104 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -93,22 +93,22 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ goto out; list = &flctx->flc_posix; - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); restart: list_for_each_entry(fl, list, fl_list) { if (nfs_file_open_context(fl->fl_file) != ctx) continue; - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); status = nfs4_lock_delegation_recall(fl, state, stateid); if (status < 0) goto out; - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); } if (list == &flctx->flc_posix) { list = &flctx->flc_flock; goto restart; } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); out: return status; } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 6084c267f3a0..a3bb22ab68c5 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1376,12 +1376,12 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ /* Guard against delegation returns and new lock/unlock calls */ down_write(&nfsi->rwsem); - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); restart: list_for_each_entry(fl, list, fl_list) { if (nfs_file_open_context(fl->fl_file)->state != state) continue; - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); status = ops->recover_lock(state, fl); switch (status) { case 0: @@ -1408,13 +1408,13 @@ restart: /* kill_proc(fl->fl_pid, SIGLOST, 1); */ status = 0; } - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); } if (list == &flctx->flc_posix) { list = &flctx->flc_flock; goto restart; } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); out: up_write(&nfsi->rwsem); return status; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 
784c13485b3f..4ae66f416eb9 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1206,7 +1206,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino /* Check to see if there are whole file write locks */ ret = 0; - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); if (!list_empty(&flctx->flc_posix)) { fl = list_first_entry(&flctx->flc_posix, struct file_lock, fl_list); @@ -1218,7 +1218,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino if (fl->fl_type == F_WRLCK) ret = 1; } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); return ret; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fad821991369..80242f5bd621 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5572,14 +5572,14 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) flctx = inode->i_flctx; if (flctx && !list_empty_careful(&flctx->flc_posix)) { - spin_lock(&inode->i_lock); + spin_lock(&flctx->flc_lock); list_for_each_entry(fl, &flctx->flc_posix, fl_list) { if (fl->fl_owner == (fl_owner_t)lowner) { status = true; break; } } - spin_unlock(&inode->i_lock); + spin_unlock(&flctx->flc_lock); } fput(filp); return status; diff --git a/include/linux/fs.h b/include/linux/fs.h index ce0873af0b97..32eafa9b5c9f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -968,6 +968,7 @@ struct file_lock { }; struct file_lock_context { + spinlock_t flc_lock; struct list_head flc_flock; struct list_head flc_posix; struct list_head flc_lease; -- cgit v1.2.3 From 7448cc37b1a6b620d948aaee3bb30960c06d5d5d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:57 -0500 Subject: locks: clean up the lm_change prototype Now that we use standard list_heads for tracking leases, we can have lm_change take a pointer to the lease to be modified instead of a double pointer. 
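The gain is visible in the callback type alone: with leases kept on a doubly linked list, the callee can unlink an entry through its own list node, so callers no longer pass the address of the chain pointer. A before/after sketch of the signature (hypothetical typedef names, for illustration only):

struct file_lock;
struct list_head;

/* old: double pointer so the callee could splice the singly linked chain */
typedef int (*lm_change_old_t)(struct file_lock **before, int arg,
			       struct list_head *dispose);

/* new: the lease itself is enough; unlinking works from its list node */
typedef int (*lm_change_new_t)(struct file_lock *fl, int arg,
			       struct list_head *dispose);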
Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- fs/locks.c | 13 ++++++------- fs/nfsd/nfs4state.c | 3 ++- include/linux/fs.h | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index a268d959ccd6..864f2460a0ad 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1309,9 +1309,8 @@ static void lease_clear_pending(struct file_lock *fl, int arg) } /* We already had a lease on this file; just change its type */ -int lease_modify(struct file_lock **before, int arg, struct list_head *dispose) +int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) { - struct file_lock *fl = *before; int error = assign_type(fl, arg); if (error) @@ -1352,9 +1351,9 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) { trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) - lease_modify(&fl, F_RDLCK, dispose); + lease_modify(fl, F_RDLCK, dispose); if (past_time(fl->fl_break_time)) - lease_modify(&fl, F_UNLCK, dispose); + lease_modify(fl, F_UNLCK, dispose); } } @@ -1669,7 +1668,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr } if (my_fl != NULL) { - error = lease->fl_lmops->lm_change(&my_fl, arg, &dispose); + error = lease->fl_lmops->lm_change(my_fl, arg, &dispose); if (error) goto out; goto out_setup; @@ -1732,7 +1731,7 @@ static int generic_delete_lease(struct file *filp) } trace_generic_delete_lease(inode, fl); if (victim) - error = fl->fl_lmops->lm_change(&victim, F_UNLCK, &dispose); + error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); return error; @@ -2426,7 +2425,7 @@ locks_remove_lease(struct file *filp) spin_lock(&ctx->flc_lock); list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) - lease_modify(&fl, F_UNLCK, &dispose); + lease_modify(fl, F_UNLCK, &dispose); spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 80242f5bd621..532a60cca2fb 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3477,7 +3477,8 @@ nfsd_break_deleg_cb(struct file_lock *fl) } static int -nfsd_change_deleg_cb(struct file_lock **onlist, int arg, struct list_head *dispose) +nfsd_change_deleg_cb(struct file_lock *onlist, int arg, + struct list_head *dispose) { if (arg & F_UNLCK) return lease_modify(onlist, arg, dispose); diff --git a/include/linux/fs.h b/include/linux/fs.h index 32eafa9b5c9f..94e706a0a408 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -900,7 +900,7 @@ struct lock_manager_operations { void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, int); bool (*lm_break)(struct file_lock *); - int (*lm_change)(struct file_lock **, int, struct list_head *); + int (*lm_change)(struct file_lock *, int, struct list_head *); void (*lm_setup)(struct file_lock *, void **); }; @@ -1021,7 +1021,7 @@ extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int t extern void lease_get_mtime(struct inode *, struct timespec *time); extern int generic_setlease(struct file *, long, struct file_lock **, void **priv); extern int vfs_setlease(struct file *, long, struct file_lock **, void **); -extern int lease_modify(struct file_lock **, int, struct list_head *); +extern int lease_modify(struct file_lock *, int, struct list_head *); #else /* !CONFIG_FILE_LOCKING */ static inline int 
fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) @@ -1153,7 +1153,7 @@ static inline int vfs_setlease(struct file *filp, long arg, return -EINVAL; } -static inline int lease_modify(struct file_lock **before, int arg, +static inline int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) { return -EINVAL; -- cgit v1.2.3 From 9bd0f45b7037fcfa8b575c7e27d0431d6e6dc3bb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 16 Jan 2015 15:05:57 -0500 Subject: locks: keep a count of locks on the flctx lists This makes things a bit more efficient in the cifs and ceph lock pushing code. Signed-off-by: Jeff Layton Acked-by: Christoph Hellwig --- fs/ceph/locks.c | 11 ++--------- fs/cifs/file.c | 14 ++++---------- fs/locks.c | 45 +++++++++++++++++++++++++++++---------------- include/linux/fs.h | 3 +++ 4 files changed, 38 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 0303da8e3233..06ea5cd05cd9 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -242,12 +242,9 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) /* * Fills in the passed counter variables, so you can prepare pagelist metadata * before calling ceph_encode_locks. - * - * FIXME: add counters to struct file_lock_context so we don't need to do this? */ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) { - struct file_lock *lock; struct file_lock_context *ctx; *fcntl_count = 0; @@ -255,12 +252,8 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) ctx = inode->i_flctx; if (ctx) { - spin_lock(&ctx->flc_lock); - list_for_each_entry(lock, &ctx->flc_posix, fl_list) - ++(*fcntl_count); - list_for_each_entry(lock, &ctx->flc_flock, fl_list) - ++(*flock_count); - spin_unlock(&ctx->flc_lock); + *fcntl_count = ctx->flc_posix_cnt; + *flock_count = ctx->flc_flock_cnt; } dout("counted %d flock locks and %d fcntl locks", *flock_count, *fcntl_count); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index b65166eb111e..8c2ca6f62bad 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1125,7 +1125,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock; struct file_lock_context *flctx = inode->i_flctx; - unsigned int count = 0, i; + unsigned int i; int rc = 0, xid, type; struct list_head locks_to_send, *el; struct lock_to_push *lck, *tmp; @@ -1136,20 +1136,14 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) if (!flctx) goto out; - spin_lock(&flctx->flc_lock); - list_for_each(el, &flctx->flc_posix) { - count++; - } - spin_unlock(&flctx->flc_lock); - INIT_LIST_HEAD(&locks_to_send); /* - * Allocating count locks is enough because no FL_POSIX locks can be - * added to the list while we are holding cinode->lock_sem that + * Allocating flc_posix_cnt locks is enough because no FL_POSIX locks + * can be added to the list while we are holding cinode->lock_sem that * protects locking operations of this inode. 
*/ - for (i = 0; i < count; i++) { + for (i = 0; i < flctx->flc_posix_cnt; i++) { lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL); if (!lck) { rc = -ENOMEM; diff --git a/fs/locks.c b/fs/locks.c index 864f2460a0ad..bd578700342d 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -681,18 +681,21 @@ static void locks_wake_up_blocks(struct file_lock *blocker) } static void -locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) +locks_insert_lock_ctx(struct file_lock *fl, int *counter, + struct list_head *before) { fl->fl_nspid = get_pid(task_tgid(current)); list_add_tail(&fl->fl_list, before); + ++*counter; locks_insert_global_locks(fl); } static void -locks_unlink_lock_ctx(struct file_lock *fl) +locks_unlink_lock_ctx(struct file_lock *fl, int *counter) { locks_delete_global_locks(fl); list_del_init(&fl->fl_list); + --*counter; if (fl->fl_nspid) { put_pid(fl->fl_nspid); fl->fl_nspid = NULL; @@ -701,9 +704,10 @@ locks_unlink_lock_ctx(struct file_lock *fl) } static void -locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose) +locks_delete_lock_ctx(struct file_lock *fl, int *counter, + struct list_head *dispose) { - locks_unlink_lock_ctx(fl); + locks_unlink_lock_ctx(fl, counter); if (dispose) list_add(&fl->fl_list, dispose); else @@ -891,7 +895,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) if (request->fl_type == fl->fl_type) goto out; found = true; - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(fl, &ctx->flc_flock_cnt, &dispose); break; } @@ -925,7 +929,7 @@ find_conflict: if (request->fl_flags & FL_ACCESS) goto out; locks_copy_lock(new_fl, request); - locks_insert_lock_ctx(new_fl, &ctx->flc_flock); + locks_insert_lock_ctx(new_fl, &ctx->flc_flock_cnt, &ctx->flc_flock); new_fl = NULL; error = 0; @@ -1042,7 +1046,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str else request->fl_end = fl->fl_end; if (added) { - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(fl, &ctx->flc_posix_cnt, + &dispose); continue; } request = fl; @@ -1071,7 +1076,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str * one (This may happen several times). 
*/ if (added) { - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(fl, + &ctx->flc_posix_cnt, &dispose); continue; } /* @@ -1087,8 +1093,10 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str locks_copy_lock(new_fl, request); request = new_fl; new_fl = NULL; - locks_insert_lock_ctx(request, &fl->fl_list); - locks_delete_lock_ctx(fl, &dispose); + locks_insert_lock_ctx(request, + &ctx->flc_posix_cnt, &fl->fl_list); + locks_delete_lock_ctx(fl, + &ctx->flc_posix_cnt, &dispose); added = true; } } @@ -1116,7 +1124,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str goto out; } locks_copy_lock(new_fl, request); - locks_insert_lock_ctx(new_fl, &fl->fl_list); + locks_insert_lock_ctx(new_fl, &ctx->flc_posix_cnt, + &fl->fl_list); new_fl = NULL; } if (right) { @@ -1127,7 +1136,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); - locks_insert_lock_ctx(left, &fl->fl_list); + locks_insert_lock_ctx(left, &ctx->flc_posix_cnt, + &fl->fl_list); } right->fl_start = request->fl_end + 1; locks_wake_up_blocks(right); @@ -1311,6 +1321,7 @@ static void lease_clear_pending(struct file_lock *fl, int arg) /* We already had a lease on this file; just change its type */ int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) { + struct file_lock_context *flctx; int error = assign_type(fl, arg); if (error) @@ -1320,6 +1331,7 @@ int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) if (arg == F_UNLCK) { struct file *filp = fl->fl_file; + flctx = file_inode(filp)->i_flctx; f_delown(filp); filp->f_owner.signum = 0; fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync); @@ -1327,7 +1339,7 @@ int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); fl->fl_fasync = NULL; } - locks_delete_lock_ctx(fl, dispose); + locks_delete_lock_ctx(fl, &flctx->flc_lease_cnt, dispose); } return 0; } @@ -1442,7 +1454,8 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) fl->fl_downgrade_time = break_time; } if (fl->fl_lmops->lm_break(fl)) - locks_delete_lock_ctx(fl, &dispose); + locks_delete_lock_ctx(fl, &ctx->flc_lease_cnt, + &dispose); } if (list_empty(&ctx->flc_lease)) @@ -1678,7 +1691,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr if (!leases_enable) goto out; - locks_insert_lock_ctx(lease, &ctx->flc_lease); + locks_insert_lock_ctx(lease, &ctx->flc_lease_cnt, &ctx->flc_lease); /* * The check in break_lease() is lockless. 
It's possible for another * open to race in after we did the earlier check for a conflicting @@ -1691,7 +1704,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr smp_mb(); error = check_conflicting_open(dentry, arg); if (error) { - locks_unlink_lock_ctx(lease); + locks_unlink_lock_ctx(lease, &ctx->flc_lease_cnt); goto out; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 94e706a0a408..f87cb2f03103 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -972,6 +972,9 @@ struct file_lock_context { struct list_head flc_flock; struct list_head flc_posix; struct list_head flc_lease; + int flc_flock_cnt; + int flc_posix_cnt; + int flc_lease_cnt; }; /* The following constant reflects the upper bound of the file/locking space */ -- cgit v1.2.3 From ee1c244219fd652964710a6cc3e4f922e86aa492 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 16 Jan 2015 11:37:14 +0100 Subject: genetlink: synchronize socket closing and family removal In addition to the problem Jeff Layton reported, I looked at the code and reproduced the same warning by subscribing and removing the genl family with a socket still open. This is a fairly tricky race which originates in the fact that generic netlink allows the family to go away while sockets are still open - unlike regular netlink, which has a module refcount for every open socket, so in general this cannot be triggered. Trying to resolve this issue with the obvious locking isn't possible as it will result in deadlocks between unregistration and group unbind notification (which incidentally lockdep doesn't find due to the home-grown locking in the netlink table.) To really resolve this, introduce a "closing socket" reference counter (for generic netlink only, as it's the only affected family) in the core netlink code and use that in generic netlink to wait for all the sockets that are being closed at the same time as a generic netlink family is removed. This fixes the race in which a socket being closed should call the unbind, but if the family is removed at the same time the unbind will not find it, leading to the warning. The real problem though is that in this case the unbind could actually find a new family that is registered to have a multicast group with the same ID, and call its mcast_unbind(), leading to confusion. Also remove the warning since it would still trigger, but is now no longer a problem. This also moves the code in af_netlink.c to before unreferencing the module to avoid having the same problem in the normal non-genl case. Signed-off-by: Johannes Berg Signed-off-by: David S.
Miller --- include/linux/genetlink.h | 4 ++++ include/net/genetlink.h | 5 ++++- net/netlink/af_netlink.c | 24 +++++++++++++++++------- net/netlink/af_netlink.h | 1 + net/netlink/genetlink.c | 16 +++++++++------- 5 files changed, 35 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h index 55b685719d52..09460d6d6682 100644 --- a/include/linux/genetlink.h +++ b/include/linux/genetlink.h @@ -11,6 +11,10 @@ extern void genl_unlock(void); extern int lockdep_genl_is_held(void); #endif +/* for synchronisation between af_netlink and genetlink */ +extern atomic_t genl_sk_destructing_cnt; +extern wait_queue_head_t genl_sk_destructing_waitq; + /** * rcu_dereference_genl - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 2ea2c55bdc87..6c92415311ca 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -35,7 +35,10 @@ struct genl_info; * undo operations done by pre_doit, for example release locks * @mcast_bind: a socket bound to the given multicast group (which * is given as the offset into the groups array) - * @mcast_unbind: a socket was unbound from the given multicast group + * @mcast_unbind: a socket was unbound from the given multicast group. + * Note that unbind() will not be called symmetrically if the + * generic netlink family is removed while there are still open + * sockets. * @attrbuf: buffer to store parsed attributes * @family_list: family list * @mcgrps: multicast groups used by this family (private) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 84ea76ca3f1f..02fdde28dada 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include @@ -1095,6 +1096,8 @@ static void netlink_remove(struct sock *sk) __sk_del_bind_node(sk); netlink_update_listeners(sk); } + if (sk->sk_protocol == NETLINK_GENERIC) + atomic_inc(&genl_sk_destructing_cnt); netlink_table_ungrab(); } @@ -1211,6 +1214,20 @@ static int netlink_release(struct socket *sock) * will be purged. 
*/ + /* must not acquire netlink_table_lock in any way again before unbind + * and notifying genetlink is done as otherwise it might deadlock + */ + if (nlk->netlink_unbind) { + int i; + + for (i = 0; i < nlk->ngroups; i++) + if (test_bit(i, nlk->groups)) + nlk->netlink_unbind(sock_net(sk), i + 1); + } + if (sk->sk_protocol == NETLINK_GENERIC && + atomic_dec_return(&genl_sk_destructing_cnt) == 0) + wake_up(&genl_sk_destructing_waitq); + sock->sk = NULL; wake_up_interruptible_all(&nlk->wait); @@ -1246,13 +1263,6 @@ static int netlink_release(struct socket *sock) netlink_table_ungrab(); } - if (nlk->netlink_unbind) { - int i; - - for (i = 0; i < nlk->ngroups; i++) - if (test_bit(i, nlk->groups)) - nlk->netlink_unbind(sock_net(sk), i + 1); - } kfree(nlk->groups); nlk->groups = NULL; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index f123a88496f8..f1c31b39aa3e 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -2,6 +2,7 @@ #define _AF_NETLINK_H #include +#include #include #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index c18d3f5624b2..ee57459fc258 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -23,6 +23,9 @@ static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */ static DECLARE_RWSEM(cb_lock); +atomic_t genl_sk_destructing_cnt = ATOMIC_INIT(0); +DECLARE_WAIT_QUEUE_HEAD(genl_sk_destructing_waitq); + void genl_lock(void) { mutex_lock(&genl_mutex); @@ -435,15 +438,18 @@ int genl_unregister_family(struct genl_family *family) genl_lock_all(); - genl_unregister_mc_groups(family); - list_for_each_entry(rc, genl_family_chain(family->id), family_list) { if (family->id != rc->id || strcmp(rc->name, family->name)) continue; + genl_unregister_mc_groups(family); + list_del(&rc->family_list); family->n_ops = 0; - genl_unlock_all(); + up_write(&cb_lock); + wait_event(genl_sk_destructing_waitq, + atomic_read(&genl_sk_destructing_cnt) == 0); + genl_unlock(); kfree(family->attrbuf); genl_ctrl_event(CTRL_CMD_DELFAMILY, family, NULL, 0); @@ -1014,7 +1020,6 @@ static int genl_bind(struct net *net, int group) static void genl_unbind(struct net *net, int group) { int i; - bool found = false; down_read(&cb_lock); for (i = 0; i < GENL_FAM_TAB_SIZE; i++) { @@ -1027,14 +1032,11 @@ static void genl_unbind(struct net *net, int group) if (f->mcast_unbind) f->mcast_unbind(net, fam_grp); - found = true; break; } } } up_read(&cb_lock); - - WARN_ON(!found); } static int __net_init genl_pernet_init(struct net *net) -- cgit v1.2.3 From 9d5438f462abd6398cdb7b3211bdcec271873a3b Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Fri, 16 Jan 2015 16:01:10 -0800 Subject: pstore: Add pmsg - user-space accessible pstore object A secured, user-space accessible pstore object. Writes to /dev/pmsg0 are appended to the buffer; on reboot the persistent contents are available in /sys/fs/pstore/pmsg-ramoops-[ID]. One possible use is that syslogd, or another daemon, can write messages there; on reboot the log provides a means to triage user-space activities leading up to a panic, as a companion to the pstore dmesg or console logs.
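To make the interface concrete, here is a minimal user-space sketch (illustrative only; error handling trimmed, device node name as given above):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	static const char msg[] = "boot-stage: mounting filesystems\n";
	int fd = open("/dev/pmsg0", O_WRONLY);

	if (fd < 0)
		return 1;
	/* each write() is appended to the persistent pmsg buffer */
	if (write(fd, msg, sizeof(msg) - 1) < 0) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}

After a reboot, the accumulated writes reappear under /sys/fs/pstore/pmsg-ramoops-[ID].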
Signed-off-by: Mark Salyzyn Acked-by: Kees Cook Signed-off-by: Tony Luck --- fs/pstore/Kconfig | 10 ++++ fs/pstore/Makefile | 2 + fs/pstore/inode.c | 3 ++ fs/pstore/internal.h | 6 +++ fs/pstore/platform.c | 1 + fs/pstore/pmsg.c | 114 +++++++++++++++++++++++++++++++++++++++++++++ fs/pstore/ram.c | 34 +++++++++++++- include/linux/pstore.h | 1 + include/linux/pstore_ram.h | 1 + 9 files changed, 170 insertions(+), 2 deletions(-) create mode 100644 fs/pstore/pmsg.c (limited to 'include/linux') diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 983d9510becc..916b8e23d968 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -21,6 +21,16 @@ config PSTORE_CONSOLE When the option is enabled, pstore will log all kernel messages, even if no oops or panic happened. +config PSTORE_PMSG + bool "Log user space messages" + depends on PSTORE + help + When the option is enabled, pstore will export a character + interface /dev/pmsg0 to log user space messages. On reboot + data can be retrieved from /sys/fs/pstore/pmsg-ramoops-[ID]. + + If unsure, say N. + config PSTORE_FTRACE bool "Persistent function tracer" depends on PSTORE diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index 4c9095c2781e..e647d8e81712 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -7,5 +7,7 @@ obj-y += pstore.o pstore-objs += inode.o platform.o obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o +obj-$(CONFIG_PSTORE_PMSG) += pmsg.o + ramoops-objs += ram.o ram_core.o obj-$(CONFIG_PSTORE_RAM) += ramoops.o diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d69586f09ffd..b32ce53d24ee 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -361,6 +361,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, scnprintf(name, sizeof(name), "powerpc-common-%s-%lld", psname, id); break; + case PSTORE_TYPE_PMSG: + scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id); + break; case PSTORE_TYPE_UNKNOWN: scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id); break; diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 3b3d305277c4..c36ba2cd0b5d 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -45,6 +45,12 @@ extern void pstore_register_ftrace(void); static inline void pstore_register_ftrace(void) {} #endif +#ifdef CONFIG_PSTORE_PMSG +extern void pstore_register_pmsg(void); +#else +static inline void pstore_register_pmsg(void) {} +#endif + extern struct pstore_info *psinfo; extern void pstore_set_kmsg_bytes(int); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 0a9b72cdfeca..15ee78c5020b 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -447,6 +447,7 @@ int pstore_register(struct pstore_info *psi) if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { pstore_register_console(); pstore_register_ftrace(); + pstore_register_pmsg(); } if (pstore_update_ms >= 0) { diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c new file mode 100644 index 000000000000..feb5dd2948b4 --- /dev/null +++ b/fs/pstore/pmsg.c @@ -0,0 +1,114 @@ +/* + * Copyright 2014 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include +#include "internal.h" + +static DEFINE_MUTEX(pmsg_lock); +#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE) + +static ssize_t write_pmsg(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + size_t i, buffer_size; + char *buffer; + + if (!count) + return 0; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + buffer_size = count; + if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE) + buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE; + buffer = vmalloc(buffer_size); + + mutex_lock(&pmsg_lock); + for (i = 0; i < count; ) { + size_t c = min(count - i, buffer_size); + u64 id; + long ret; + + ret = __copy_from_user(buffer, buf + i, c); + if (unlikely(ret != 0)) { + mutex_unlock(&pmsg_lock); + vfree(buffer); + return -EFAULT; + } + psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c, + psinfo); + + i += c; + } + + mutex_unlock(&pmsg_lock); + vfree(buffer); + return count; +} + +static const struct file_operations pmsg_fops = { + .owner = THIS_MODULE, + .llseek = noop_llseek, + .write = write_pmsg, +}; + +static struct class *pmsg_class; +static int pmsg_major; +#define PMSG_NAME "pmsg" +#undef pr_fmt +#define pr_fmt(fmt) PMSG_NAME ": " fmt + +static char *pmsg_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0220; + return NULL; +} + +void pstore_register_pmsg(void) +{ + struct device *pmsg_device; + + pmsg_major = register_chrdev(0, PMSG_NAME, &pmsg_fops); + if (pmsg_major < 0) { + pr_err("register_chrdev failed\n"); + goto err; + } + + pmsg_class = class_create(THIS_MODULE, PMSG_NAME); + if (IS_ERR(pmsg_class)) { + pr_err("device class file already in use\n"); + goto err_class; + } + pmsg_class->devnode = pmsg_devnode; + + pmsg_device = device_create(pmsg_class, NULL, MKDEV(pmsg_major, 0), + NULL, "%s%d", PMSG_NAME, 0); + if (IS_ERR(pmsg_device)) { + pr_err("failed to create device\n"); + goto err_device; + } + return; + +err_device: + class_destroy(pmsg_class); +err_class: + unregister_chrdev(pmsg_major, PMSG_NAME); +err: + return; +} diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 6150e54eed30..39d1373128e9 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -51,6 +51,10 @@ static ulong ramoops_ftrace_size = MIN_MEM_SIZE; module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400); MODULE_PARM_DESC(ftrace_size, "size of ftrace log"); +static ulong ramoops_pmsg_size = MIN_MEM_SIZE; +module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400); +MODULE_PARM_DESC(pmsg_size, "size of user space message log"); + static ulong mem_address; module_param(mem_address, ulong, 0400); MODULE_PARM_DESC(mem_address, @@ -82,12 +86,14 @@ struct ramoops_context { struct persistent_ram_zone **przs; struct persistent_ram_zone *cprz; struct persistent_ram_zone *fprz; + struct persistent_ram_zone *mprz; phys_addr_t phys_addr; unsigned long size; unsigned int memtype; size_t record_size; size_t console_size; size_t ftrace_size; + size_t pmsg_size; int dump_oops; struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; @@ -96,6 +102,7 @@ struct ramoops_context { unsigned int dump_read_cnt; unsigned int console_read_cnt; unsigned int ftrace_read_cnt; + unsigned int pmsg_read_cnt; struct pstore_info pstore; }; @@ -109,6 +116,7 @@ static int ramoops_pstore_open(struct pstore_info *psi) cxt->dump_read_cnt = 0; cxt->console_read_cnt = 0; cxt->ftrace_read_cnt = 0; + cxt->pmsg_read_cnt = 0; return 0; } @@ -190,6 +198,9 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id 
*type, if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt, 1, id, type, PSTORE_TYPE_FTRACE, 0); + if (!prz_ok(prz)) + prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt, + 1, id, type, PSTORE_TYPE_PMSG, 0); if (!prz_ok(prz)) return 0; @@ -258,6 +269,11 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, return -ENOMEM; persistent_ram_write(cxt->fprz, buf, size); return 0; + } else if (type == PSTORE_TYPE_PMSG) { + if (!cxt->mprz) + return -ENOMEM; + persistent_ram_write(cxt->mprz, buf, size); + return 0; } if (type != PSTORE_TYPE_DMESG) @@ -315,6 +331,9 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, case PSTORE_TYPE_FTRACE: prz = cxt->fprz; break; + case PSTORE_TYPE_PMSG: + prz = cxt->mprz; + break; default: return -EINVAL; } @@ -441,7 +460,7 @@ static int ramoops_probe(struct platform_device *pdev) goto fail_out; if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size && - !pdata->ftrace_size)) { + !pdata->ftrace_size && !pdata->pmsg_size)) { pr_err("The memory size and the record/console size must be " "non-zero\n"); goto fail_out; @@ -453,6 +472,8 @@ static int ramoops_probe(struct platform_device *pdev) pdata->console_size = rounddown_pow_of_two(pdata->console_size); if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); + if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size)) + pdata->pmsg_size = rounddown_pow_of_two(pdata->pmsg_size); cxt->size = pdata->mem_size; cxt->phys_addr = pdata->mem_address; @@ -460,12 +481,14 @@ static int ramoops_probe(struct platform_device *pdev) cxt->record_size = pdata->record_size; cxt->console_size = pdata->console_size; cxt->ftrace_size = pdata->ftrace_size; + cxt->pmsg_size = pdata->pmsg_size; cxt->dump_oops = pdata->dump_oops; cxt->ecc_info = pdata->ecc_info; paddr = cxt->phys_addr; - dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size; + dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size + - cxt->pmsg_size; err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz); if (err) goto fail_out; @@ -480,6 +503,10 @@ static int ramoops_probe(struct platform_device *pdev) if (err) goto fail_init_fprz; + err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0); + if (err) + goto fail_init_mprz; + cxt->pstore.data = cxt; /* * Console can handle any buffer size, so prefer LOG_LINE_MAX. 
If we @@ -523,6 +550,8 @@ fail_buf: kfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; + kfree(cxt->mprz); +fail_init_mprz: kfree(cxt->fprz); fail_init_fprz: kfree(cxt->cprz); @@ -580,6 +609,7 @@ static void ramoops_register_dummy(void) dummy_data->record_size = record_size; dummy_data->console_size = ramoops_console_size; dummy_data->ftrace_size = ramoops_ftrace_size; + dummy_data->pmsg_size = ramoops_pmsg_size; dummy_data->dump_oops = dump_oops; /* * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC diff --git a/include/linux/pstore.h b/include/linux/pstore.h index ece0c6bbfcc5..8884f6e507f7 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -39,6 +39,7 @@ enum pstore_type_id { PSTORE_TYPE_PPC_RTAS = 4, PSTORE_TYPE_PPC_OF = 5, PSTORE_TYPE_PPC_COMMON = 6, + PSTORE_TYPE_PMSG = 7, PSTORE_TYPE_UNKNOWN = 255 }; diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h index 4af3fdc85b01..9c9d6c154c8e 100644 --- a/include/linux/pstore_ram.h +++ b/include/linux/pstore_ram.h @@ -81,6 +81,7 @@ struct ramoops_platform_data { unsigned long record_size; unsigned long console_size; unsigned long ftrace_size; + unsigned long pmsg_size; int dump_oops; struct persistent_ram_ecc_info ecc_info; }; -- cgit v1.2.3 From b9626f32876debc356fadb6e19aebcfe9d70c5ff Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Mon, 1 Dec 2014 19:32:51 +0100 Subject: tpm/tpm_i2c_stm_st33: Add new tpm_stm_dev structure and remove tpm_i2c_buffer[0], [1] buffer. In order to remove the big buffers from the st33zp24_platform_data structure, replace it with tpm_stm_dev for driver-internal usage. As only one buffer is really necessary, replace the pair with a single buf field. At the same time, move tpm_i2c_stm_st33.h to include/linux/platform_data. Reviewed-by: Jason Gunthorpe Signed-off-by: Christophe Ricard Signed-off-by: Peter Huewe --- drivers/char/tpm/tpm_i2c_stm_st33.c | 204 +++++++++++-------------- drivers/char/tpm/tpm_i2c_stm_st33.h | 44 ------ include/linux/platform_data/tpm_i2c_stm_st33.h | 40 +++++ 3 files changed, 132 insertions(+), 156 deletions(-) delete mode 100644 drivers/char/tpm/tpm_i2c_stm_st33.h create mode 100644 include/linux/platform_data/tpm_i2c_stm_st33.h (limited to 'include/linux') diff --git a/drivers/char/tpm/tpm_i2c_stm_st33.c b/drivers/char/tpm/tpm_i2c_stm_st33.c index 44a02103ad49..4f725386385f 100644 --- a/drivers/char/tpm/tpm_i2c_stm_st33.c +++ b/drivers/char/tpm/tpm_i2c_stm_st33.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,7 @@ #include #include +#include #include "tpm.h" #define TPM_ACCESS 0x0 @@ -66,7 +66,7 @@ #define TPM_BUFSIZE 2048 #define LOCALITY0 0 -#include "tpm_i2c_stm_st33.h" + enum stm33zp24_access { TPM_ACCESS_VALID = 0x80, @@ -98,6 +98,15 @@ enum tis_defaults { TIS_LONG_TIMEOUT = 2000, }; +struct tpm_stm_dev { + struct i2c_client *client; + struct completion irq_detection; + struct tpm_chip *chip; + u8 buf[TPM_BUFSIZE + 1]; + int io_serirq; + int io_lpcpd; +}; + /* * write8_reg * Send byte to the TIS register according to the ST33ZP24 I2C protocol. * @param: tpm_register, the tpm tis register where the data should be written * @param: tpm_data, the tpm_data to write inside the tpm_register * @param: tpm_size, The length of the data * @return: Returns negative errno, or else the number of bytes written.
*/ -static int write8_reg(struct i2c_client *client, u8 tpm_register, +static int write8_reg(struct tpm_stm_dev *tpm_dev, u8 tpm_register, u8 *tpm_data, u16 tpm_size) { - struct st33zp24_platform_data *pin_infos; - - pin_infos = client->dev.platform_data; - - pin_infos->tpm_i2c_buffer[0][0] = tpm_register; - memcpy(&pin_infos->tpm_i2c_buffer[0][1], tpm_data, tpm_size); - return i2c_master_send(client, pin_infos->tpm_i2c_buffer[0], - tpm_size + 1); + tpm_dev->buf[0] = tpm_register; + memcpy(tpm_dev->buf + 1, tpm_data, tpm_size); + return i2c_master_send(tpm_dev->client, tpm_dev->buf, tpm_size + 1); } /* write8_reg() */ /* @@ -127,56 +131,56 @@ static int write8_reg(struct i2c_client *client, u8 tpm_register, * @param: tpm_size, tpm TPM response size to read. * @return: number of byte read successfully: should be one if success. */ -static int read8_reg(struct i2c_client *client, u8 tpm_register, +static int read8_reg(struct tpm_stm_dev *tpm_dev, u8 tpm_register, u8 *tpm_data, int tpm_size) { u8 status = 0; u8 data; data = TPM_DUMMY_BYTE; - status = write8_reg(client, tpm_register, &data, 1); + status = write8_reg(tpm_dev, tpm_register, &data, 1); if (status == 2) - status = i2c_master_recv(client, tpm_data, tpm_size); + status = i2c_master_recv(tpm_dev->client, tpm_data, tpm_size); return status; } /* read8_reg() */ /* * I2C_WRITE_DATA * Send byte to the TIS register according to the ST33ZP24 I2C protocol. - * @param: client, the chip description + * @param: tpm_dev, the chip description * @param: tpm_register, the tpm tis register where the data should be written * @param: tpm_data, the tpm_data to write inside the tpm_register * @param: tpm_size, The length of the data * @return: number of byte written successfully: should be one if success. */ -#define I2C_WRITE_DATA(client, tpm_register, tpm_data, tpm_size) \ - (write8_reg(client, tpm_register | \ +#define I2C_WRITE_DATA(tpm_dev, tpm_register, tpm_data, tpm_size) \ + (write8_reg(tpm_dev, tpm_register | \ TPM_WRITE_DIRECTION, tpm_data, tpm_size)) /* * I2C_READ_DATA * Recv byte from the TIS register according to the ST33ZP24 I2C protocol. - * @param: tpm, the chip description + * @param: tpm_dev, the chip description * @param: tpm_register, the tpm tis register where the data should be read * @param: tpm_data, the TPM response * @param: tpm_size, tpm TPM response size to read. * @return: number of byte read successfully: should be one if success. */ -#define I2C_READ_DATA(client, tpm_register, tpm_data, tpm_size) \ - (read8_reg(client, tpm_register, tpm_data, tpm_size)) +#define I2C_READ_DATA(tpm_dev, tpm_register, tpm_data, tpm_size) \ + (read8_reg(tpm_dev, tpm_register, tpm_data, tpm_size)) /* * clear_interruption * clear the TPM interrupt register. 
* @param: tpm, the chip description */ -static void clear_interruption(struct i2c_client *client) +static void clear_interruption(struct tpm_stm_dev *tpm_dev) { u8 interrupt; - I2C_READ_DATA(client, TPM_INT_STATUS, &interrupt, 1); - I2C_WRITE_DATA(client, TPM_INT_STATUS, &interrupt, 1); - I2C_READ_DATA(client, TPM_INT_STATUS, &interrupt, 1); + I2C_READ_DATA(tpm_dev, TPM_INT_STATUS, &interrupt, 1); + I2C_WRITE_DATA(tpm_dev, TPM_INT_STATUS, &interrupt, 1); + I2C_READ_DATA(tpm_dev, TPM_INT_STATUS, &interrupt, 1); } /* clear_interruption() */ /* @@ -190,17 +194,16 @@ static long _wait_for_interrupt_serirq_timeout(struct tpm_chip *chip, { long status; struct i2c_client *client; - struct st33zp24_platform_data *pin_infos; + struct tpm_stm_dev *tpm_dev; - client = (struct i2c_client *)TPM_VPRIV(chip); - pin_infos = client->dev.platform_data; + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); + client = tpm_dev->client; status = wait_for_completion_interruptible_timeout( - &pin_infos->irq_detection, - timeout); + &tpm_dev->irq_detection, + timeout); if (status > 0) - enable_irq(gpio_to_irq(pin_infos->io_serirq)); - gpio_direction_input(pin_infos->io_serirq); + enable_irq(client->irq); return status; } /* wait_for_interrupt_serirq_timeout() */ @@ -209,15 +212,15 @@ static int wait_for_serirq_timeout(struct tpm_chip *chip, bool condition, unsigned long timeout) { int status = 2; - struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); status = _wait_for_interrupt_serirq_timeout(chip, timeout); if (!status) { status = -EBUSY; } else { - clear_interruption(client); + clear_interruption(tpm_dev); if (condition) status = 1; } @@ -230,13 +233,13 @@ static int wait_for_serirq_timeout(struct tpm_chip *chip, bool condition, */ static void tpm_stm_i2c_cancel(struct tpm_chip *chip) { - struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; u8 data; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); data = TPM_STS_COMMAND_READY; - I2C_WRITE_DATA(client, TPM_STS, &data, 1); + I2C_WRITE_DATA(tpm_dev, TPM_STS, &data, 1); if (chip->vendor.irq) wait_for_serirq_timeout(chip, 1, chip->vendor.timeout_a); } /* tpm_stm_i2c_cancel() */ @@ -248,12 +251,12 @@ static void tpm_stm_i2c_cancel(struct tpm_chip *chip) */ static u8 tpm_stm_i2c_status(struct tpm_chip *chip) { - struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; u8 data; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); - I2C_READ_DATA(client, TPM_STS, &data, 1); + I2C_READ_DATA(tpm_dev, TPM_STS, &data, 1); return data; } /* tpm_stm_i2c_status() */ @@ -265,13 +268,13 @@ static u8 tpm_stm_i2c_status(struct tpm_chip *chip) */ static int check_locality(struct tpm_chip *chip) { - struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; u8 data; u8 status; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); - status = I2C_READ_DATA(client, TPM_ACCESS, &data, 1); + status = I2C_READ_DATA(tpm_dev, TPM_ACCESS, &data, 1); if (status && (data & (TPM_ACCESS_ACTIVE_LOCALITY | TPM_ACCESS_VALID)) == (TPM_ACCESS_ACTIVE_LOCALITY | TPM_ACCESS_VALID)) @@ -290,16 +293,16 @@ static int request_locality(struct tpm_chip *chip) { unsigned long stop; long rc; - struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; u8 data; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); if 
(check_locality(chip) == chip->vendor.locality) return chip->vendor.locality; data = TPM_ACCESS_REQUEST_USE; - rc = I2C_WRITE_DATA(client, TPM_ACCESS, &data, 1); + rc = I2C_WRITE_DATA(tpm_dev, TPM_ACCESS, &data, 1); if (rc < 0) goto end; @@ -328,13 +331,13 @@ end: */ static void release_locality(struct tpm_chip *chip) { - struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; u8 data; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); data = TPM_ACCESS_ACTIVE_LOCALITY; - I2C_WRITE_DATA(client, TPM_ACCESS, &data, 1); + I2C_WRITE_DATA(tpm_dev, TPM_ACCESS, &data, 1); } /* @@ -347,19 +350,20 @@ static int get_burstcount(struct tpm_chip *chip) unsigned long stop; int burstcnt, status; u8 tpm_reg, temp; + struct tpm_stm_dev *tpm_dev; - struct i2c_client *client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); stop = jiffies + chip->vendor.timeout_d; do { tpm_reg = TPM_STS + 1; - status = I2C_READ_DATA(client, tpm_reg, &temp, 1); + status = I2C_READ_DATA(tpm_dev, tpm_reg, &temp, 1); if (status < 0) goto end; tpm_reg = tpm_reg + 1; burstcnt = temp; - status = I2C_READ_DATA(client, tpm_reg, &temp, 1); + status = I2C_READ_DATA(tpm_dev, tpm_reg, &temp, 1); if (status < 0) goto end; @@ -416,9 +420,9 @@ static int wait_for_stat(struct tpm_chip *chip, u8 mask, unsigned long timeout, static int recv_data(struct tpm_chip *chip, u8 *buf, size_t count) { int size = 0, burstcnt, len; - struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); while (size < count && wait_for_stat(chip, @@ -430,7 +434,7 @@ static int recv_data(struct tpm_chip *chip, u8 *buf, size_t count) if (burstcnt < 0) return burstcnt; len = min_t(int, burstcnt, count - size); - I2C_READ_DATA(client, TPM_DATA_FIFO, buf + size, len); + I2C_READ_DATA(tpm_dev, TPM_DATA_FIFO, buf + size, len); size += len; } return size; @@ -446,14 +450,14 @@ static irqreturn_t tpm_ioserirq_handler(int irq, void *dev_id) { struct tpm_chip *chip = dev_id; struct i2c_client *client; - struct st33zp24_platform_data *pin_infos; + struct tpm_stm_dev *tpm_dev; disable_irq_nosync(irq); - client = (struct i2c_client *)TPM_VPRIV(chip); - pin_infos = client->dev.platform_data; + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); + client = tpm_dev->client; - complete(&pin_infos->irq_detection); + complete(&tpm_dev->irq_detection); return IRQ_HANDLED; } /* tpm_ioserirq_handler() */ @@ -475,13 +479,15 @@ static int tpm_stm_i2c_send(struct tpm_chip *chip, unsigned char *buf, int ret; u8 data; struct i2c_client *client; + struct tpm_stm_dev *tpm_dev; if (chip == NULL) return -EBUSY; if (len < TPM_HEADER_SIZE) return -EBUSY; - client = (struct i2c_client *)TPM_VPRIV(chip); + tpm_dev = (struct tpm_stm_dev *)TPM_VPRIV(chip); + client = tpm_dev->client; client->flags = 0; @@ -505,7 +511,7 @@ static int tpm_stm_i2c_send(struct tpm_chip *chip, unsigned char *buf, if (burstcnt < 0) return burstcnt; size = min_t(int, len - i - 1, burstcnt); - ret = I2C_WRITE_DATA(client, TPM_DATA_FIFO, buf + i, size); + ret = I2C_WRITE_DATA(tpm_dev, TPM_DATA_FIFO, buf + i, size); if (ret < 0) goto out_err; @@ -518,7 +524,7 @@ static int tpm_stm_i2c_send(struct tpm_chip *chip, unsigned char *buf, goto out_err; } - ret = I2C_WRITE_DATA(client, TPM_DATA_FIFO, buf + len - 1, 1); + ret = I2C_WRITE_DATA(tpm_dev, TPM_DATA_FIFO, buf + len - 1, 1); if (ret < 0) goto out_err; @@ -529,7 +535,7 @@ static int 
tpm_stm_i2c_send(struct tpm_chip *chip, unsigned char *buf, } data = TPM_STS_GO; - I2C_WRITE_DATA(client, TPM_STS, &data, 1); + I2C_WRITE_DATA(tpm_dev, TPM_STS, &data, 1); return len; out_err: @@ -623,6 +629,7 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) u8 intmask; struct tpm_chip *chip; struct st33zp24_platform_data *platform_data; + struct tpm_stm_dev *tpm_dev; if (client == NULL) { pr_info("%s: i2c client is NULL. Device not accessible.\n", @@ -637,11 +644,11 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) goto end; } - chip = tpm_register_hardware(&client->dev, &st_i2c_tpm); - if (!chip) { - dev_info(&client->dev, "fail chip\n"); - err = -ENODEV; - goto end; + tpm_dev = devm_kzalloc(&client->dev, sizeof(struct tpm_stm_dev), + GFP_KERNEL); + if (!tpm_dev) { + err = -ENOMEM; + goto _tpm_clean_answer; } platform_data = client->dev.platform_data; @@ -652,20 +659,14 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) goto _tpm_clean_answer; } - platform_data->tpm_i2c_buffer[0] = - kmalloc(TPM_BUFSIZE, GFP_KERNEL); - if (platform_data->tpm_i2c_buffer[0] == NULL) { - err = -ENOMEM; - goto _tpm_clean_answer; - } - platform_data->tpm_i2c_buffer[1] = - kmalloc(TPM_BUFSIZE, GFP_KERNEL); - if (platform_data->tpm_i2c_buffer[1] == NULL) { - err = -ENOMEM; - goto _tpm_clean_response1; + chip = tpm_register_hardware(&client->dev, &st_i2c_tpm); + if (!chip) { + dev_info(&client->dev, "fail chip\n"); + return -ENODEV; } - TPM_VPRIV(chip) = client; + TPM_VPRIV(chip) = tpm_dev; + tpm_dev->client = client; chip->vendor.timeout_a = msecs_to_jiffies(TIS_SHORT_TIMEOUT); chip->vendor.timeout_b = msecs_to_jiffies(TIS_LONG_TIMEOUT); @@ -682,16 +683,16 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) } if (interrupts) { - init_completion(&platform_data->irq_detection); + init_completion(&tpm_dev->irq_detection); if (request_locality(chip) != LOCALITY0) { err = -ENODEV; - goto _tpm_clean_response2; + goto _tpm_clean_answer; } err = gpio_request(platform_data->io_serirq, "TPM IO_SERIRQ"); if (err) goto _gpio_init2; - clear_interruption(client); + clear_interruption(tpm_dev); err = request_irq(gpio_to_irq(platform_data->io_serirq), &tpm_ioserirq_handler, IRQF_TRIGGER_HIGH, @@ -702,7 +703,7 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) goto _irq_set; } - err = I2C_READ_DATA(client, TPM_INT_ENABLE, &intmask, 1); + err = I2C_READ_DATA(tpm_dev, TPM_INT_ENABLE, &intmask, 1); if (err < 0) goto _irq_set; @@ -713,16 +714,17 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) | TPM_INTF_STS_VALID_INT | TPM_INTF_DATA_AVAIL_INT; - err = I2C_WRITE_DATA(client, TPM_INT_ENABLE, &intmask, 1); + err = I2C_WRITE_DATA(tpm_dev, TPM_INT_ENABLE, &intmask, 1); if (err < 0) goto _irq_set; intmask = TPM_GLOBAL_INT_ENABLE; - err = I2C_WRITE_DATA(client, (TPM_INT_ENABLE + 3), &intmask, 1); + err = I2C_WRITE_DATA(tpm_dev, (TPM_INT_ENABLE + 3), + &intmask, 1); if (err < 0) goto _irq_set; - err = I2C_READ_DATA(client, TPM_INT_STATUS, &intmask, 1); + err = I2C_READ_DATA(tpm_dev, TPM_INT_STATUS, &intmask, 1); if (err < 0) goto _irq_set; @@ -744,12 +746,6 @@ _gpio_init2: _gpio_init1: if (power_mgt) gpio_free(platform_data->io_lpcpd); -_tpm_clean_response2: - kzfree(platform_data->tpm_i2c_buffer[1]); - platform_data->tpm_i2c_buffer[1] = NULL; -_tpm_clean_response1: - kzfree(platform_data->tpm_i2c_buffer[0]); - platform_data->tpm_i2c_buffer[0] = NULL; 
_tpm_clean_answer: tpm_remove_hardware(chip->dev); end: @@ -765,28 +761,12 @@ end: */ static int tpm_st33_i2c_remove(struct i2c_client *client) { - struct tpm_chip *chip = (struct tpm_chip *)i2c_get_clientdata(client); - struct st33zp24_platform_data *pin_infos = - ((struct i2c_client *)TPM_VPRIV(chip))->dev.platform_data; - - if (pin_infos != NULL) { - free_irq(pin_infos->io_serirq, chip); - - gpio_free(pin_infos->io_serirq); - gpio_free(pin_infos->io_lpcpd); + struct tpm_chip *chip = + (struct tpm_chip *) i2c_get_clientdata(client); + if (chip) tpm_remove_hardware(chip->dev); - if (pin_infos->tpm_i2c_buffer[1] != NULL) { - kzfree(pin_infos->tpm_i2c_buffer[1]); - pin_infos->tpm_i2c_buffer[1] = NULL; - } - if (pin_infos->tpm_i2c_buffer[0] != NULL) { - kzfree(pin_infos->tpm_i2c_buffer[0]); - pin_infos->tpm_i2c_buffer[0] = NULL; - } - } - return 0; } diff --git a/drivers/char/tpm/tpm_i2c_stm_st33.h b/drivers/char/tpm/tpm_i2c_stm_st33.h deleted file mode 100644 index 3041271342eb..000000000000 --- a/drivers/char/tpm/tpm_i2c_stm_st33.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * STMicroelectronics TPM I2C Linux driver for TPM ST33ZP24 - * Copyright (C) 2009, 2010 STMicroelectronics - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * STMicroelectronics version 1.2.0, Copyright (C) 2010 - * STMicroelectronics comes with ABSOLUTELY NO WARRANTY. - * This is free software, and you are welcome to redistribute it - * under certain conditions. - * - * @Author: Christophe RICARD tpmsupport@st.com - * - * @File: stm_st33_tpm_i2c.h - * - * @Date: 09/15/2010 - */ -#ifndef __STM_ST33_TPM_I2C_MAIN_H__ -#define __STM_ST33_TPM_I2C_MAIN_H__ - -#define TPM_ST33_I2C "st33zp24_i2c" - -struct st33zp24_platform_data { - int io_serirq; - int io_lpcpd; - struct i2c_client *client; - u8 *tpm_i2c_buffer[2]; /* 0 Request 1 Response */ - struct completion irq_detection; - struct mutex lock; -}; - -#endif /* __STM_ST33_TPM_I2C_MAIN_H__ */ diff --git a/include/linux/platform_data/tpm_i2c_stm_st33.h b/include/linux/platform_data/tpm_i2c_stm_st33.h new file mode 100644 index 000000000000..88f9cb18113a --- /dev/null +++ b/include/linux/platform_data/tpm_i2c_stm_st33.h @@ -0,0 +1,40 @@ +/* + * STMicroelectronics TPM I2C Linux driver for TPM ST33ZP24 + * Copyright (C) 2009, 2010 STMicroelectronics + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * STMicroelectronics version 1.2.0, Copyright (C) 2010 + * STMicroelectronics comes with ABSOLUTELY NO WARRANTY. + * This is free software, and you are welcome to redistribute it + * under certain conditions. + * + * @Author: Christophe RICARD tpmsupport@st.com + * + * @File: stm_st33_tpm_i2c.h + * + * @Date: 09/15/2010 + */ +#ifndef __STM_ST33_TPM_I2C_MAIN_H__ +#define __STM_ST33_TPM_I2C_MAIN_H__ + + +#define TPM_ST33_I2C "st33zp24_i2c" + +struct st33zp24_platform_data { + int io_serirq; + int io_lpcpd; +}; + +#endif /* __STM_ST33_TPM_I2C_MAIN_H__ */ -- cgit v1.2.3 From 76182b6b008b694d095aec1e2eb6c07fae787128 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Mon, 1 Dec 2014 19:32:52 +0100 Subject: tpm/tpm_i2c_stm_st33: Remove reference to io_serirq The serirq gpio pin is used only as an interrupt. After driver initialization, the serirq signal is always used through the interrupt API and never through the gpio kernel API. The irq can therefore be initialized when the platform_data is defined, through the client->irq field. Reviewed-by: Jason Gunthorpe Signed-off-by: Christophe Ricard Signed-off-by: Peter Huewe --- drivers/char/tpm/tpm_i2c_stm_st33.c | 13 +++---------- include/linux/platform_data/tpm_i2c_stm_st33.h | 1 - 2 files changed, 3 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/tpm/tpm_i2c_stm_st33.c b/drivers/char/tpm/tpm_i2c_stm_st33.c index 4f725386385f..728611638d1c 100644 --- a/drivers/char/tpm/tpm_i2c_stm_st33.c +++ b/drivers/char/tpm/tpm_i2c_stm_st33.c @@ -103,7 +103,6 @@ struct tpm_stm_dev { struct completion irq_detection; struct tpm_chip *chip; u8 buf[TPM_BUFSIZE + 1]; - int io_serirq; int io_lpcpd; }; @@ -688,18 +687,15 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) err = -ENODEV; goto _tpm_clean_answer; } - err = gpio_request(platform_data->io_serirq, "TPM IO_SERIRQ"); - if (err) - goto _gpio_init2; clear_interruption(tpm_dev); - err = request_irq(gpio_to_irq(platform_data->io_serirq), + err = request_irq(client->irq, &tpm_ioserirq_handler, IRQF_TRIGGER_HIGH, "TPM SERIRQ management", chip); if (err < 0) { dev_err(chip->dev , "TPM SERIRQ signals %d not available\n", - gpio_to_irq(platform_data->io_serirq)); + client->irq); goto _irq_set; } @@ -739,10 +735,7 @@ tpm_st33_i2c_probe(struct i2c_client *client, const struct i2c_device_id *id) dev_info(chip->dev, "TPM I2C Initialized\n"); return 0; _irq_set: - free_irq(gpio_to_irq(platform_data->io_serirq), (void *)chip); -_gpio_init2: - if (interrupts) - gpio_free(platform_data->io_serirq); + free_irq(client->irq, (void *)chip); _gpio_init1: if (power_mgt) gpio_free(platform_data->io_lpcpd); diff --git a/include/linux/platform_data/tpm_i2c_stm_st33.h b/include/linux/platform_data/tpm_i2c_stm_st33.h index 88f9cb18113a..85775cf5f9a5 100644 --- a/include/linux/platform_data/tpm_i2c_stm_st33.h +++ b/include/linux/platform_data/tpm_i2c_stm_st33.h @@ -33,7 +33,6 @@ #define TPM_ST33_I2C "st33zp24_i2c" struct st33zp24_platform_data { - int io_serirq; int io_lpcpd; }; -- cgit v1.2.3 From 3eda7d0ea3a0365aa72a2007f9450f314d92f065 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Tue, 13 Jan 2015 23:13:13 +0100 Subject: tpm/tpm_i2c_stm_st33: Change tpm_i2c_stm_st33.h to tpm_stm_st33.h include/linux/platform_data/tpm_i2c_stm_st33.h can be used by other st33 tpm device drivers that do not use the i2c protocol.
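To make the io_serirq removal above concrete, here is a minimal board-file sketch of how client->irq gets populated before the driver probes. The bus number, IRQ number, gpio number and I2C slave address are illustrative assumptions, not values taken from these patches:

#include <linux/i2c.h>
#include <linux/platform_data/tpm_i2c_stm_st33.h>	/* tpm_stm_st33.h after the rename below */

static struct st33zp24_platform_data st33_pdata = {
	.io_lpcpd = 56,			/* assumed LPCPD gpio */
};

static struct i2c_board_info st33_board_info __initdata = {
	I2C_BOARD_INFO(TPM_ST33_I2C, 0x13),	/* 0x13 is an assumed slave address */
	.platform_data = &st33_pdata,
	.irq = 117,			/* SERIRQ line, consumed by the driver as client->irq */
};

/* Registered once from machine init, e.g.:
 * i2c_register_board_info(1, &st33_board_info, 1);
 */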
Reviewed-by: Jason Gunthorpe Signed-off-by: Christophe Ricard Reviewed-by: Peter Huewe Signed-off-by: Peter Huewe --- drivers/char/tpm/tpm_i2c_stm_st33.c | 2 +- include/linux/platform_data/tpm_i2c_stm_st33.h | 39 -------------------------- include/linux/platform_data/tpm_stm_st33.h | 39 ++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 40 deletions(-) delete mode 100644 include/linux/platform_data/tpm_i2c_stm_st33.h create mode 100644 include/linux/platform_data/tpm_stm_st33.h (limited to 'include/linux') diff --git a/drivers/char/tpm/tpm_i2c_stm_st33.c b/drivers/char/tpm/tpm_i2c_stm_st33.c index 86a24ced66c0..dbab8d0d875e 100644 --- a/drivers/char/tpm/tpm_i2c_stm_st33.c +++ b/drivers/char/tpm/tpm_i2c_stm_st33.c @@ -50,7 +50,7 @@ #include #include -#include +#include #include "tpm.h" #define TPM_ACCESS 0x0 diff --git a/include/linux/platform_data/tpm_i2c_stm_st33.h b/include/linux/platform_data/tpm_i2c_stm_st33.h deleted file mode 100644 index 85775cf5f9a5..000000000000 --- a/include/linux/platform_data/tpm_i2c_stm_st33.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * STMicroelectronics TPM I2C Linux driver for TPM ST33ZP24 - * Copyright (C) 2009, 2010 STMicroelectronics - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - * - * STMicroelectronics version 1.2.0, Copyright (C) 2010 - * STMicroelectronics comes with ABSOLUTELY NO WARRANTY. - * This is free software, and you are welcome to redistribute it - * under certain conditions. - * - * @Author: Christophe RICARD tpmsupport@st.com - * - * @File: stm_st33_tpm_i2c.h - * - * @Date: 09/15/2010 - */ -#ifndef __STM_ST33_TPM_I2C_MAIN_H__ -#define __STM_ST33_TPM_I2C_MAIN_H__ - - -#define TPM_ST33_I2C "st33zp24_i2c" - -struct st33zp24_platform_data { - int io_lpcpd; -}; - -#endif /* __STM_ST33_TPM_I2C_MAIN_H__ */ diff --git a/include/linux/platform_data/tpm_stm_st33.h b/include/linux/platform_data/tpm_stm_st33.h new file mode 100644 index 000000000000..ff75310c0f47 --- /dev/null +++ b/include/linux/platform_data/tpm_stm_st33.h @@ -0,0 +1,39 @@ +/* + * STMicroelectronics TPM I2C Linux driver for TPM ST33ZP24 + * Copyright (C) 2009, 2010 STMicroelectronics + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + * + * STMicroelectronics version 1.2.0, Copyright (C) 2010 + * STMicroelectronics comes with ABSOLUTELY NO WARRANTY. 
+ * This is free software, and you are welcome to redistribute it + * under certain conditions. + * + * @Author: Christophe RICARD tpmsupport@st.com + * + * @File: stm_st33_tpm.h + * + * @Date: 09/15/2010 + */ +#ifndef __STM_ST33_TPM_H__ +#define __STM_ST33_TPM_H__ + +#define TPM_ST33_I2C "st33zp24-i2c" +#define TPM_ST33_SPI "st33zp24-spi" + +struct st33zp24_platform_data { + int io_lpcpd; +}; + +#endif /* __STM_ST33_TPM_H__ */ -- cgit v1.2.3 From 3aeb66176ffa8fefd7a9f7d37bda1d8adcf469a1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 15 Jan 2015 23:49:37 +0100 Subject: net: replace br_fdb_external_learn_* calls with switchdev notifier events This patch benefits from the newly introduced switchdev notifier and uses it to propagate fdb learn events from the rocker driver to the bridge. That avoids direct function calls and opens the events up for possible use by other listeners (ovs). Suggested-by: Thomas Graf Signed-off-by: Jiri Pirko Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker.c | 10 +++++-- include/linux/if_bridge.h | 18 ------------- include/net/switchdev.h | 11 ++++++++ net/bridge/br.c | 52 +++++++++++++++++++++++++++++++++++- net/bridge/br_fdb.c | 38 +++----------------------- net/bridge/br_private.h | 4 +++ 6 files changed, 78 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index cad8cf962cdf..964d719b150f 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -3026,11 +3026,17 @@ static void rocker_port_fdb_learn_work(struct work_struct *work) container_of(work, struct rocker_fdb_learn_work, work); bool removing = (lw->flags & ROCKER_OP_FLAG_REMOVE); bool learned = (lw->flags & ROCKER_OP_FLAG_LEARNED); + struct netdev_switch_notifier_fdb_info info; + + info.addr = lw->addr; + info.vid = lw->vid; if (learned && removing) - br_fdb_external_learn_del(lw->dev, lw->addr, lw->vid); + call_netdev_switch_notifiers(NETDEV_SWITCH_FDB_DEL, + lw->dev, &info.info); else if (learned && !removing) - br_fdb_external_learn_add(lw->dev, lw->addr, lw->vid); + call_netdev_switch_notifiers(NETDEV_SWITCH_FDB_ADD, + lw->dev, &info.info); kfree(work); } diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 0a8ce762a47f..a57bca2ea97e 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -50,24 +50,6 @@ extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __use typedef int br_should_route_hook_t(struct sk_buff *skb); extern br_should_route_hook_t __rcu *br_should_route_hook; -#if IS_ENABLED(CONFIG_BRIDGE) -int br_fdb_external_learn_add(struct net_device *dev, - const unsigned char *addr, u16 vid); -int br_fdb_external_learn_del(struct net_device *dev, - const unsigned char *addr, u16 vid); -#else -static inline int br_fdb_external_learn_add(struct net_device *dev, - const unsigned char *addr, u16 vid) -{ - return 0; -} -static inline int br_fdb_external_learn_del(struct net_device *dev, - const unsigned char *addr, u16 vid) -{ - return 0; -} -#endif - #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING) int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list); diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 7f8d74372d87..201120e18e4d 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -13,10 +13,21 @@ #include #include +enum netdev_switch_notifier_type { + NETDEV_SWITCH_FDB_ADD = 1, +
NETDEV_SWITCH_FDB_DEL, +}; + struct netdev_switch_notifier_info { struct net_device *dev; }; +struct netdev_switch_notifier_fdb_info { + struct netdev_switch_notifier_info info; /* must be first */ + const unsigned char *addr; + u16 vid; +}; + static inline struct net_device * netdev_switch_notifier_info_to_dev(const struct netdev_switch_notifier_info *info) { diff --git a/net/bridge/br.c b/net/bridge/br.c index 44425aff7cba..fb57ab6b24f9 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "br_private.h" @@ -120,6 +121,48 @@ static struct notifier_block br_device_notifier = { .notifier_call = br_device_event }; +static int br_netdev_switch_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_switch_notifier_info_to_dev(ptr); + struct net_bridge_port *p; + struct net_bridge *br; + struct netdev_switch_notifier_fdb_info *fdb_info; + int err = NOTIFY_DONE; + + rtnl_lock(); + p = br_port_get_rtnl(dev); + if (!p) + goto out; + + br = p->br; + + switch (event) { + case NETDEV_SWITCH_FDB_ADD: + fdb_info = ptr; + err = br_fdb_external_learn_add(br, p, fdb_info->addr, + fdb_info->vid); + if (err) + err = notifier_from_errno(err); + break; + case NETDEV_SWITCH_FDB_DEL: + fdb_info = ptr; + err = br_fdb_external_learn_del(br, p, fdb_info->addr, + fdb_info->vid); + if (err) + err = notifier_from_errno(err); + break; + } + +out: + rtnl_unlock(); + return err; +} + +static struct notifier_block br_netdev_switch_notifier = { + .notifier_call = br_netdev_switch_event, +}; + static void __net_exit br_net_exit(struct net *net) { struct net_device *dev; @@ -169,10 +212,14 @@ static int __init br_init(void) if (err) goto err_out3; - err = br_netlink_init(); + err = register_netdev_switch_notifier(&br_netdev_switch_notifier); if (err) goto err_out4; + err = br_netlink_init(); + if (err) + goto err_out5; + brioctl_set(br_ioctl_deviceless_stub); #if IS_ENABLED(CONFIG_ATM_LANE) @@ -185,6 +232,8 @@ static int __init br_init(void) return 0; +err_out5: + unregister_netdev_switch_notifier(&br_netdev_switch_notifier); err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: @@ -202,6 +251,7 @@ static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); br_netlink_fini(); + unregister_netdev_switch_notifier(&br_netdev_switch_notifier); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); unregister_pernet_subsys(&br_net_ops); diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e6e0372bc3cd..03667e65cc29 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -990,26 +990,14 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) } } -int br_fdb_external_learn_add(struct net_device *dev, +int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid) { - struct net_bridge_port *p; - struct net_bridge *br; struct hlist_head *head; struct net_bridge_fdb_entry *fdb; int err = 0; - rtnl_lock(); - - p = br_port_get_rtnl(dev); - if (!p) { - pr_info("bridge: %s not a bridge port\n", dev->name); - err = -EINVAL; - goto err_rtnl_unlock; - } - - br = p->br; - + ASSERT_RTNL(); spin_lock_bh(&br->hash_lock); head = &br->hash[br_mac_hash(addr, vid)]; @@ -1034,33 +1022,18 @@ int br_fdb_external_learn_add(struct net_device *dev, err_unlock: spin_unlock_bh(&br->hash_lock); -err_rtnl_unlock: - rtnl_unlock(); return err; } -EXPORT_SYMBOL(br_fdb_external_learn_add); -int 
br_fdb_external_learn_del(struct net_device *dev, +int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid) { - struct net_bridge_port *p; - struct net_bridge *br; struct hlist_head *head; struct net_bridge_fdb_entry *fdb; int err = 0; - rtnl_lock(); - - p = br_port_get_rtnl(dev); - if (!p) { - pr_info("bridge: %s not a bridge port\n", dev->name); - err = -EINVAL; - goto err_rtnl_unlock; - } - - br = p->br; - + ASSERT_RTNL(); spin_lock_bh(&br->hash_lock); head = &br->hash[br_mac_hash(addr, vid)]; @@ -1071,9 +1044,6 @@ int br_fdb_external_learn_del(struct net_device *dev, err = -ENOENT; spin_unlock_bh(&br->hash_lock); -err_rtnl_unlock: - rtnl_unlock(); return err; } -EXPORT_SYMBOL(br_fdb_external_learn_del); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index d808d766334d..e8e3f3681680 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -402,6 +402,10 @@ int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *fdev, int idx); int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); +int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, + const unsigned char *addr, u16 vid); +int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, + const unsigned char *addr, u16 vid); /* br_forward.c */ void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb); -- cgit v1.2.3 From f8c58c1136349fdfa9b605c501f2f911622d3a9a Mon Sep 17 00:00:00 2001 From: Doug Anderson Date: Tue, 2 Dec 2014 15:42:47 -0800 Subject: mmc: dw_mmc: Protect read-modify-write of INTMASK with a lock We're running into cases where our enabling of the SDIO interrupt in dw_mmc doesn't actually take effect. Specifically, adding a patch like this: +++ b/drivers/mmc/host/dw_mmc.c @@ -1076,6 +1076,9 @@ static void dw_mci_enable_sdio_irq(struct mmc_host *mmc, int enb) mci_writel(host, INTMASK, (int_mask | SDMMC_INT_SDIO(slot->id))); + int_mask = mci_readl(host, INTMASK); + if (!(int_mask & SDMMC_INT_SDIO(slot->id))) + dev_err(&mmc->class_dev, "failed to enable sdio irq\n"); } else { ...actually triggers the error message. That's because dw_mci_enable_sdio_irq() unsafely does a read-modify-write of the INTMASK register. We can't just use the standard host->lock, since that lock is not irq safe and mmc_signal_sdio_irq() (called from interrupt context) calls dw_mci_enable_sdio_irq(). Add a new irq-safe lock to protect INTMASK. An alternate solution is to punt mmc_signal_sdio_irq() to the tasklet and then protect INTMASK modifications with the standard host lock, but that seemed like a higher-latency change.
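In outline, the fixed dw_mci_enable_sdio_irq() follows the classic irq-safe read-modify-write pattern; a condensed sketch of just that pattern (not the full driver function):

	unsigned long irqflags;
	u32 int_mask;

	/* The irqsave variant also masks local interrupts, so the ISR
	 * (and mmc_signal_sdio_irq() called from it) cannot run between
	 * the read and the write below. */
	spin_lock_irqsave(&host->irq_lock, irqflags);

	int_mask = mci_readl(host, INTMASK);			/* read */
	if (enb)
		int_mask |= SDMMC_INT_SDIO(slot->sdio_id);	/* modify */
	else
		int_mask &= ~SDMMC_INT_SDIO(slot->sdio_id);
	mci_writel(host, INTMASK, int_mask);			/* write */

	spin_unlock_irqrestore(&host->irq_lock, irqflags);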
Reported-by: Bing Zhao Signed-off-by: Doug Anderson Reviewed-by: James Hogan Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc.c | 13 +++++++++++++ include/linux/mmc/dw_mmc.h | 6 ++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index ae10a021765c..64ea04274913 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -759,6 +759,7 @@ disable: static int dw_mci_submit_data_dma(struct dw_mci *host, struct mmc_data *data) { + unsigned long irqflags; int sg_len; u32 temp; @@ -795,9 +796,11 @@ static int dw_mci_submit_data_dma(struct dw_mci *host, struct mmc_data *data) mci_writel(host, CTRL, temp); /* Disable RX/TX IRQs, let DMA handle it */ + spin_lock_irqsave(&host->irq_lock, irqflags); temp = mci_readl(host, INTMASK); temp &= ~(SDMMC_INT_RXDR | SDMMC_INT_TXDR); mci_writel(host, INTMASK, temp); + spin_unlock_irqrestore(&host->irq_lock, irqflags); host->dma_ops->start(host, sg_len); @@ -806,6 +809,7 @@ static int dw_mci_submit_data_dma(struct dw_mci *host, struct mmc_data *data) static void dw_mci_submit_data(struct dw_mci *host, struct mmc_data *data) { + unsigned long irqflags; u32 temp; data->error = -EINPROGRESS; @@ -834,9 +838,12 @@ static void dw_mci_submit_data(struct dw_mci *host, struct mmc_data *data) host->part_buf_count = 0; mci_writel(host, RINTSTS, SDMMC_INT_TXDR | SDMMC_INT_RXDR); + + spin_lock_irqsave(&host->irq_lock, irqflags); temp = mci_readl(host, INTMASK); temp |= SDMMC_INT_TXDR | SDMMC_INT_RXDR; mci_writel(host, INTMASK, temp); + spin_unlock_irqrestore(&host->irq_lock, irqflags); temp = mci_readl(host, CTRL); temp &= ~SDMMC_CTRL_DMA_ENABLE; @@ -1284,8 +1291,11 @@ static void dw_mci_enable_sdio_irq(struct mmc_host *mmc, int enb) { struct dw_mci_slot *slot = mmc_priv(mmc); struct dw_mci *host = slot->host; + unsigned long irqflags; u32 int_mask; + spin_lock_irqsave(&host->irq_lock, irqflags); + /* Enable/disable Slot Specific SDIO interrupt */ int_mask = mci_readl(host, INTMASK); if (enb) @@ -1293,6 +1303,8 @@ static void dw_mci_enable_sdio_irq(struct mmc_host *mmc, int enb) else int_mask &= ~SDMMC_INT_SDIO(slot->sdio_id); mci_writel(host, INTMASK, int_mask); + + spin_unlock_irqrestore(&host->irq_lock, irqflags); } static int dw_mci_execute_tuning(struct mmc_host *mmc, u32 opcode) @@ -2661,6 +2673,7 @@ int dw_mci_probe(struct dw_mci *host) host->quirks = host->pdata->quirks; spin_lock_init(&host->lock); + spin_lock_init(&host->irq_lock); INIT_LIST_HEAD(&host->queue); /* diff --git a/include/linux/mmc/dw_mmc.h b/include/linux/mmc/dw_mmc.h index 42b724e8d503..471fb3116dbe 100644 --- a/include/linux/mmc/dw_mmc.h +++ b/include/linux/mmc/dw_mmc.h @@ -106,6 +106,11 @@ struct mmc_data; * @cur_slot, @mrq and @state. These must always be updated * at the same time while holding @lock. * + * @irq_lock is an irq-safe spinlock protecting the INTMASK register + * to allow the interrupt handler to modify it directly. Held for only long + * enough to read-modify-write INTMASK and no other locks are grabbed when + * holding this one. + * * The @mrq field of struct dw_mci_slot is also protected by @lock, * and must always be written at the same time as the slot is added to * @queue. 
@@ -125,6 +130,7 @@ struct mmc_data; */ struct dw_mci { spinlock_t lock; + spinlock_t irq_lock; void __iomem *regs; struct scatterlist *sg; -- cgit v1.2.3 From 61bd8a04b379e4fefbc0b0d06941d3f91071bb8c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 12 Dec 2014 08:43:08 +1100 Subject: mmc: omap_hsmmc: remove prepare/complete system suspend support. The only function of these 'prepare' and 'complete' is to disable the 'card detect' irq during suspend. The commit which added this, commit a48ce884d5819d5df2cf1139ab3c43f8e9e419b3 mmc: omap_hsmmc: Introduce omap_hsmmc_prepare/complete justified it by the need to avoid the registration of new devices during suspend. However mmc_pm_notify will set ->rescan_disable in the 'prepare' stage and clear it in the 'complete' stage, so no card detection will actually happen. Also the interrupt will be disabled before final suspend as part of common suspend processing. So this disabling of the interrupt is unnecessary, and interferes with a transition to using common code for card-detect management. Cc: Felipe Balbi Cc: Venkatraman S Cc: Chris Ball Signed-off-by: NeilBrown Signed-off-by: Ulf Hansson --- drivers/mmc/host/omap_hsmmc.c | 50 ---------------------------------- include/linux/platform_data/mmc-omap.h | 4 --- 2 files changed, 54 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 7c71dcdcba8b..537cba8f1de1 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -275,31 +275,6 @@ static int omap_hsmmc_get_cover_state(struct device *dev) return !gpio_get_value_cansleep(mmc->switch_pin); } -#ifdef CONFIG_PM - -static int omap_hsmmc_suspend_cdirq(struct device *dev) -{ - struct omap_hsmmc_host *host = dev_get_drvdata(dev); - - disable_irq(host->card_detect_irq); - return 0; -} - -static int omap_hsmmc_resume_cdirq(struct device *dev) -{ - struct omap_hsmmc_host *host = dev_get_drvdata(dev); - - enable_irq(host->card_detect_irq); - return 0; -} - -#else - -#define omap_hsmmc_suspend_cdirq NULL -#define omap_hsmmc_resume_cdirq NULL - -#endif - #ifdef CONFIG_REGULATOR static int omap_hsmmc_set_power(struct device *dev, int power_on, int vdd) @@ -2234,8 +2209,6 @@ static int omap_hsmmc_probe(struct platform_device *pdev) "Unable to grab MMC CD IRQ\n"); goto err_irq_cd; } - host->suspend = omap_hsmmc_suspend_cdirq; - host->resume = omap_hsmmc_resume_cdirq; } omap_hsmmc_disable_irq(host); @@ -2322,25 +2295,6 @@ static int omap_hsmmc_remove(struct platform_device *pdev) } #ifdef CONFIG_PM -static int omap_hsmmc_prepare(struct device *dev) -{ - struct omap_hsmmc_host *host = dev_get_drvdata(dev); - - if (host->suspend) - return host->suspend(dev); - - return 0; -} - -static void omap_hsmmc_complete(struct device *dev) -{ - struct omap_hsmmc_host *host = dev_get_drvdata(dev); - - if (host->resume) - host->resume(dev); - -} - static int omap_hsmmc_suspend(struct device *dev) { struct omap_hsmmc_host *host = dev_get_drvdata(dev); @@ -2398,8 +2352,6 @@ static int omap_hsmmc_resume(struct device *dev) } #else -#define omap_hsmmc_prepare NULL -#define omap_hsmmc_complete NULL #define omap_hsmmc_suspend NULL #define omap_hsmmc_resume NULL #endif @@ -2484,8 +2436,6 @@ static int omap_hsmmc_runtime_resume(struct device *dev) static struct dev_pm_ops omap_hsmmc_dev_pm_ops = { .suspend = omap_hsmmc_suspend, .resume = omap_hsmmc_resume, - .prepare = omap_hsmmc_prepare, - .complete = omap_hsmmc_complete, .runtime_suspend = omap_hsmmc_runtime_suspend, .runtime_resume = 
omap_hsmmc_runtime_resume, }; diff --git a/include/linux/platform_data/mmc-omap.h b/include/linux/platform_data/mmc-omap.h index 5c188f4e9bec..929469291406 100644 --- a/include/linux/platform_data/mmc-omap.h +++ b/include/linux/platform_data/mmc-omap.h @@ -31,10 +31,6 @@ struct omap_mmc_platform_data { void (*cleanup)(struct device *dev); void (*shutdown)(struct device *dev); - /* To handle board related suspend/resume functionality for MMC */ - int (*suspend)(struct device *dev, int slot); - int (*resume)(struct device *dev, int slot); - /* Return context loss count due to PM states changing */ int (*get_context_loss_count)(struct device *dev); -- cgit v1.2.3 From eddbc3abc5bf11bdfc92ef84fd97ec4d379b7278 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 18 Dec 2014 15:44:32 +0100 Subject: mmc: slot-gpio: Remove option to explicitly free requested CD/WP GPIOs The slot-gpio code uses the devm_* managed functions. Still, it provides APIs to explicitly free requested CD/WP GPIOs, but these APIs aren't being used. Therefore, let's simplify slot-gpio by removing these unused APIs. If it later turns out that we need some of them, we can always consider restoring the code. Signed-off-by: Ulf Hansson --- drivers/mmc/core/slot-gpio.c | 92 ++----------------------------------------- include/linux/mmc/slot-gpio.h | 3 -- 2 files changed, 4 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/slot-gpio.c b/drivers/mmc/core/slot-gpio.c index 69bbf2adb329..a428f10747d2 100644 --- a/drivers/mmc/core/slot-gpio.c +++ b/drivers/mmc/core/slot-gpio.c @@ -103,11 +103,7 @@ EXPORT_SYMBOL(mmc_gpio_get_cd); * @gpio: gpio number requested * * As devm_* managed functions are used in mmc_gpio_request_ro(), client - * drivers do not need to explicitly call mmc_gpio_free_ro() for freeing up, - * if the requesting and freeing are only needed at probing and unbinding time - * for once. However, if client drivers do something special like runtime - * switching for write-protection, they are responsible for calling - * mmc_gpio_request_ro() and mmc_gpio_free_ro() as a pair on their own. + * drivers do not need to worry about freeing up memory. * * Returns zero on success, else an error. */ @@ -178,11 +174,7 @@ EXPORT_SYMBOL(mmc_gpiod_request_cd_irq); * @debounce: debounce time in microseconds * * As devm_* managed functions are used in mmc_gpio_request_cd(), client - * drivers do not need to explicitly call mmc_gpio_free_cd() for freeing up, - * if the requesting and freeing are only needed at probing and unbinding time - * for once. However, if client drivers do something special like runtime - * switching for card-detection, they are responsible for calling - * mmc_gpio_request_cd() and mmc_gpio_free_cd() as a pair on their own. + * drivers do not need to worry about freeing up memory. * * If GPIO debouncing is desired, set the debounce parameter to a non-zero * value. The caller is responsible for ensuring that the GPIO driver associated @@ -225,55 +217,6 @@ int mmc_gpio_request_cd(struct mmc_host *host, unsigned int gpio, } EXPORT_SYMBOL(mmc_gpio_request_cd); -/** - * mmc_gpio_free_ro - free the write-protection gpio - * @host: mmc host - * - * It's provided only for cases that client drivers need to manually free - * up the write-protection gpio requested by mmc_gpio_request_ro().
- */ -void mmc_gpio_free_ro(struct mmc_host *host) -{ - struct mmc_gpio *ctx = host->slot.handler_priv; - int gpio; - - if (!ctx || !ctx->ro_gpio) - return; - - gpio = desc_to_gpio(ctx->ro_gpio); - ctx->ro_gpio = NULL; - - devm_gpio_free(&host->class_dev, gpio); -} -EXPORT_SYMBOL(mmc_gpio_free_ro); - -/** - * mmc_gpio_free_cd - free the card-detection gpio - * @host: mmc host - * - * It's provided only for cases that client drivers need to manually free - * up the card-detection gpio requested by mmc_gpio_request_cd(). - */ -void mmc_gpio_free_cd(struct mmc_host *host) -{ - struct mmc_gpio *ctx = host->slot.handler_priv; - int gpio; - - if (!ctx || !ctx->cd_gpio) - return; - - if (host->slot.cd_irq >= 0) { - devm_free_irq(&host->class_dev, host->slot.cd_irq, host); - host->slot.cd_irq = -EINVAL; - } - - gpio = desc_to_gpio(ctx->cd_gpio); - ctx->cd_gpio = NULL; - - devm_gpio_free(&host->class_dev, gpio); -} -EXPORT_SYMBOL(mmc_gpio_free_cd); - /** * mmc_gpiod_request_cd - request a gpio descriptor for card-detection * @host: mmc host @@ -285,8 +228,7 @@ EXPORT_SYMBOL(mmc_gpio_free_cd); * to NULL to ignore * * Use this function in place of mmc_gpio_request_cd() to use the GPIO - * descriptor API. Note that it is paired with mmc_gpiod_free_cd() not - * mmc_gpio_free_cd(). Note also that it must be called prior to mmc_add_host() + * descriptor API. Note that it must be called prior to mmc_add_host() * otherwise the caller must also call mmc_gpiod_request_cd_irq(). * * Returns zero on success, else an error. @@ -339,8 +281,7 @@ EXPORT_SYMBOL(mmc_gpiod_request_cd); * set to NULL to ignore * * Use this function in place of mmc_gpio_request_ro() to use the GPIO - * descriptor API. Note that it is paired with mmc_gpiod_free_ro() not - * mmc_gpio_free_ro(). + * descriptor API. * * Returns zero on success, else an error. */ @@ -380,28 +321,3 @@ int mmc_gpiod_request_ro(struct mmc_host *host, const char *con_id, return 0; } EXPORT_SYMBOL(mmc_gpiod_request_ro); - -/** - * mmc_gpiod_free_cd - free the card-detection gpio descriptor - * @host: mmc host - * - * It's provided only for cases that client drivers need to manually free - * up the card-detection gpio requested by mmc_gpiod_request_cd(). 
- */ -void mmc_gpiod_free_cd(struct mmc_host *host) -{ - struct mmc_gpio *ctx = host->slot.handler_priv; - - if (!ctx || !ctx->cd_gpio) - return; - - if (host->slot.cd_irq >= 0) { - devm_free_irq(&host->class_dev, host->slot.cd_irq, host); - host->slot.cd_irq = -EINVAL; - } - - devm_gpiod_put(host->parent, ctx->cd_gpio); - - ctx->cd_gpio = NULL; -} -EXPORT_SYMBOL(mmc_gpiod_free_cd); diff --git a/include/linux/mmc/slot-gpio.h b/include/linux/mmc/slot-gpio.h index e56fa24c9322..4a36d6954631 100644 --- a/include/linux/mmc/slot-gpio.h +++ b/include/linux/mmc/slot-gpio.h @@ -15,12 +15,10 @@ struct mmc_host; int mmc_gpio_get_ro(struct mmc_host *host); int mmc_gpio_request_ro(struct mmc_host *host, unsigned int gpio); -void mmc_gpio_free_ro(struct mmc_host *host); int mmc_gpio_get_cd(struct mmc_host *host); int mmc_gpio_request_cd(struct mmc_host *host, unsigned int gpio, unsigned int debounce); -void mmc_gpio_free_cd(struct mmc_host *host); int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id, unsigned int idx, bool override_active_level, @@ -28,7 +26,6 @@ int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id, int mmc_gpiod_request_ro(struct mmc_host *host, const char *con_id, unsigned int idx, bool override_active_level, unsigned int debounce, bool *gpio_invert); -void mmc_gpiod_free_cd(struct mmc_host *host); void mmc_gpiod_request_cd_irq(struct mmc_host *host); #endif -- cgit v1.2.3 From df8aca162e5ff2b20c7a4de3e64e5b96ff838ab0 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 18 Dec 2014 15:44:36 +0100 Subject: mmc: slot-gpio: Rework how to handle allocation of slot-gpio data By moving the allocation of the slot-gpio data into mmc_alloc_host(), we can remove the slot-gpio internal calls to mmc_gpio_alloc(). This means mmc_gpio_alloc() now has only one caller left, which consequently allows us to simplify and remove some of the slot-gpio code. Additionally, this makes the slot-gpio mutex redundant, so let's remove it.
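One subtlety in the reworked mmc_alloc_host() error handling deserves a note: once device_initialize() has run, the class release callback owns the host allocation, so later failure paths must drop the device reference instead of freeing directly. A condensed sketch of the ordering (alloc_host_index() is hypothetical shorthand for the idr bookkeeping):

	struct mmc_host *host;

	host = kzalloc(sizeof(struct mmc_host) + extra, GFP_KERNEL);
	if (!host)
		return NULL;

	if (alloc_host_index(host) < 0) {
		kfree(host);		/* before device_initialize(): plain kfree() is correct */
		return NULL;
	}

	device_initialize(&host->class_dev);

	if (mmc_gpio_alloc(host)) {
		put_device(&host->class_dev);	/* after: the release callback frees the host */
		return NULL;
	}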
Signed-off-by: Ulf Hansson --- drivers/mmc/core/host.c | 20 +++++++------- drivers/mmc/core/slot-gpio.c | 62 ++++++++++---------------------------------- include/linux/mmc/host.h | 2 -- 3 files changed, 23 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index fcb7f06373cf..07636449b4de 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -29,6 +29,7 @@ #include "core.h" #include "host.h" +#include "slot-gpio.h" #define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) @@ -38,7 +39,6 @@ static DEFINE_SPINLOCK(mmc_host_lock); static void mmc_host_classdev_release(struct device *dev) { struct mmc_host *host = cls_dev_to_mmc_host(dev); - mutex_destroy(&host->slot.lock); spin_lock(&mmc_host_lock); idr_remove(&mmc_host_idr, host->index); spin_unlock(&mmc_host_lock); @@ -478,8 +478,10 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) host->index = err; spin_unlock(&mmc_host_lock); idr_preload_end(); - if (err < 0) - goto free; + if (err < 0) { + kfree(host); + return NULL; + } dev_set_name(&host->class_dev, "mmc%d", host->index); @@ -488,10 +490,12 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) host->class_dev.class = &mmc_host_class; device_initialize(&host->class_dev); - mmc_host_clk_init(host); + if (mmc_gpio_alloc(host)) { + put_device(&host->class_dev); + return NULL; + } - mutex_init(&host->slot.lock); - host->slot.cd_irq = -EINVAL; + mmc_host_clk_init(host); spin_lock_init(&host->lock); init_waitqueue_head(&host->wq); @@ -512,10 +516,6 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) host->max_blk_count = PAGE_CACHE_SIZE / 512; return host; - -free: - kfree(host); - return NULL; } EXPORT_SYMBOL(mmc_alloc_host); diff --git a/drivers/mmc/core/slot-gpio.c b/drivers/mmc/core/slot-gpio.c index ec918c27e77f..1a3edbd47719 100644 --- a/drivers/mmc/core/slot-gpio.c +++ b/drivers/mmc/core/slot-gpio.c @@ -43,29 +43,17 @@ static irqreturn_t mmc_gpio_cd_irqt(int irq, void *dev_id) int mmc_gpio_alloc(struct mmc_host *host) { size_t len = strlen(dev_name(host->parent)) + 4; - struct mmc_gpio *ctx; - - mutex_lock(&host->slot.lock); - - ctx = host->slot.handler_priv; - if (!ctx) { - /* - * devm_kzalloc() can be called after device_initialize(), even - * before device_add(), i.e., between mmc_alloc_host() and - * mmc_add_host() - */ - ctx = devm_kzalloc(host->parent, sizeof(*ctx) + 2 * len, - GFP_KERNEL); - if (ctx) { - ctx->ro_label = ctx->cd_label + len; - snprintf(ctx->cd_label, len, "%s cd", dev_name(host->parent)); - snprintf(ctx->ro_label, len, "%s ro", dev_name(host->parent)); - host->slot.handler_priv = ctx; - } + struct mmc_gpio *ctx = devm_kzalloc(host->parent, + sizeof(*ctx) + 2 * len, GFP_KERNEL); + + if (ctx) { + ctx->ro_label = ctx->cd_label + len; + snprintf(ctx->cd_label, len, "%s cd", dev_name(host->parent)); + snprintf(ctx->ro_label, len, "%s ro", dev_name(host->parent)); + host->slot.handler_priv = ctx; + host->slot.cd_irq = -EINVAL; } - mutex_unlock(&host->slot.lock); - return ctx ? 
0 : -ENOMEM; } @@ -111,18 +99,12 @@ EXPORT_SYMBOL(mmc_gpio_get_cd); */ int mmc_gpio_request_ro(struct mmc_host *host, unsigned int gpio) { - struct mmc_gpio *ctx; + struct mmc_gpio *ctx = host->slot.handler_priv; int ret; if (!gpio_is_valid(gpio)) return -EINVAL; - ret = mmc_gpio_alloc(host); - if (ret < 0) - return ret; - - ctx = host->slot.handler_priv; - ret = devm_gpio_request_one(host->parent, gpio, GPIOF_DIR_IN, ctx->ro_label); if (ret < 0) @@ -187,15 +169,9 @@ EXPORT_SYMBOL(mmc_gpiod_request_cd_irq); int mmc_gpio_request_cd(struct mmc_host *host, unsigned int gpio, unsigned int debounce) { - struct mmc_gpio *ctx; + struct mmc_gpio *ctx = host->slot.handler_priv; int ret; - ret = mmc_gpio_alloc(host); - if (ret < 0) - return ret; - - ctx = host->slot.handler_priv; - ret = devm_gpio_request_one(host->parent, gpio, GPIOF_DIR_IN, ctx->cd_label); if (ret < 0) @@ -239,16 +215,10 @@ int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id, unsigned int idx, bool override_active_level, unsigned int debounce, bool *gpio_invert) { - struct mmc_gpio *ctx; + struct mmc_gpio *ctx = host->slot.handler_priv; struct gpio_desc *desc; int ret; - ret = mmc_gpio_alloc(host); - if (ret < 0) - return ret; - - ctx = host->slot.handler_priv; - if (!con_id) con_id = ctx->cd_label; @@ -291,16 +261,10 @@ int mmc_gpiod_request_ro(struct mmc_host *host, const char *con_id, unsigned int idx, bool override_active_level, unsigned int debounce, bool *gpio_invert) { - struct mmc_gpio *ctx; + struct mmc_gpio *ctx = host->slot.handler_priv; struct gpio_desc *desc; int ret; - ret = mmc_gpio_alloc(host); - if (ret < 0) - return ret; - - ctx = host->slot.handler_priv; - if (!con_id) con_id = ctx->ro_label; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 9f322706f7cb..b6bf718c3498 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -166,7 +166,6 @@ struct mmc_async_req { * struct mmc_slot - MMC slot functions * * @cd_irq: MMC/SD-card slot hotplug detection IRQ or -EINVAL - * @lock: protect the @handler_priv pointer * @handler_priv: MMC/SD-card slot context * * Some MMC/SD host controllers implement slot-functions like card and @@ -176,7 +175,6 @@ struct mmc_async_req { */ struct mmc_slot { int cd_irq; - struct mutex lock; void *handler_priv; }; -- cgit v1.2.3 From 04cdbbfa73ebac57a30ec2ebebfd7e9342bbdc44 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Mon, 1 Dec 2014 16:53:34 +0100 Subject: mmc: core: Make tuning block patterns static Since previous patches removed the need for the tuning block patterns to be exported, let's move them close to the mmc_send_tuning() API. Those are now intended to be used only by the mmc core. 
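For context, a simplified sketch of how the core's tuning helper now selects between the two static patterns; this illustrates the choice mmc_send_tuning() makes, not its full body:

	const u8 *tuning_block_pattern = tuning_blk_pattern_4bit;
	int size = sizeof(tuning_blk_pattern_4bit);	/* 64 bytes */

	if (opcode == MMC_SEND_TUNING_BLOCK_HS200 &&
	    host->ios.bus_width == MMC_BUS_WIDTH_8) {
		/* HS200 on an 8-bit bus uses the longer 128-byte block */
		tuning_block_pattern = tuning_blk_pattern_8bit;
		size = sizeof(tuning_blk_pattern_8bit);
	}
	/* The block is then read with CMD19/CMD21 and compared against
	 * the expected pattern to judge the current sample point. */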
Signed-off-by: Ulf Hansson Reviewed-by: Stephen Boyd Acked-by: Jaehoon Chung --- drivers/mmc/core/mmc.c | 32 -------------------------------- drivers/mmc/core/mmc_ops.c | 30 ++++++++++++++++++++++++++++++ include/linux/mmc/mmc.h | 5 ----- 3 files changed, 30 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index d854bff037a2..0b8ec87fc517 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1155,38 +1155,6 @@ bus_speed: return err; } -const u8 tuning_blk_pattern_4bit[MMC_TUNING_BLK_PATTERN_4BIT_SIZE] = { - 0xff, 0x0f, 0xff, 0x00, 0xff, 0xcc, 0xc3, 0xcc, - 0xc3, 0x3c, 0xcc, 0xff, 0xfe, 0xff, 0xfe, 0xef, - 0xff, 0xdf, 0xff, 0xdd, 0xff, 0xfb, 0xff, 0xfb, - 0xbf, 0xff, 0x7f, 0xff, 0x77, 0xf7, 0xbd, 0xef, - 0xff, 0xf0, 0xff, 0xf0, 0x0f, 0xfc, 0xcc, 0x3c, - 0xcc, 0x33, 0xcc, 0xcf, 0xff, 0xef, 0xff, 0xee, - 0xff, 0xfd, 0xff, 0xfd, 0xdf, 0xff, 0xbf, 0xff, - 0xbb, 0xff, 0xf7, 0xff, 0xf7, 0x7f, 0x7b, 0xde, -}; -EXPORT_SYMBOL(tuning_blk_pattern_4bit); - -const u8 tuning_blk_pattern_8bit[MMC_TUNING_BLK_PATTERN_8BIT_SIZE] = { - 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00, 0x00, - 0xff, 0xff, 0xcc, 0xcc, 0xcc, 0x33, 0xcc, 0xcc, - 0xcc, 0x33, 0x33, 0xcc, 0xcc, 0xcc, 0xff, 0xff, - 0xff, 0xee, 0xff, 0xff, 0xff, 0xee, 0xee, 0xff, - 0xff, 0xff, 0xdd, 0xff, 0xff, 0xff, 0xdd, 0xdd, - 0xff, 0xff, 0xff, 0xbb, 0xff, 0xff, 0xff, 0xbb, - 0xbb, 0xff, 0xff, 0xff, 0x77, 0xff, 0xff, 0xff, - 0x77, 0x77, 0xff, 0x77, 0xbb, 0xdd, 0xee, 0xff, - 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00, - 0x00, 0xff, 0xff, 0xcc, 0xcc, 0xcc, 0x33, 0xcc, - 0xcc, 0xcc, 0x33, 0x33, 0xcc, 0xcc, 0xcc, 0xff, - 0xff, 0xff, 0xee, 0xff, 0xff, 0xff, 0xee, 0xee, - 0xff, 0xff, 0xff, 0xdd, 0xff, 0xff, 0xff, 0xdd, - 0xdd, 0xff, 0xff, 0xff, 0xbb, 0xff, 0xff, 0xff, - 0xbb, 0xbb, 0xff, 0xff, 0xff, 0x77, 0xff, 0xff, - 0xff, 0x77, 0x77, 0xff, 0x77, 0xbb, 0xdd, 0xee, -}; -EXPORT_SYMBOL(tuning_blk_pattern_8bit); - /* * Execute tuning sequence to seek the proper bus operating * conditions for HS200 and HS400, which sends CMD21 to the device. 
diff --git a/drivers/mmc/core/mmc_ops.c b/drivers/mmc/core/mmc_ops.c index 3b044c5b029c..0ea042dc7443 100644 --- a/drivers/mmc/core/mmc_ops.c +++ b/drivers/mmc/core/mmc_ops.c @@ -23,6 +23,36 @@ #define MMC_OPS_TIMEOUT_MS (10 * 60 * 1000) /* 10 minute timeout */ +static const u8 tuning_blk_pattern_4bit[] = { + 0xff, 0x0f, 0xff, 0x00, 0xff, 0xcc, 0xc3, 0xcc, + 0xc3, 0x3c, 0xcc, 0xff, 0xfe, 0xff, 0xfe, 0xef, + 0xff, 0xdf, 0xff, 0xdd, 0xff, 0xfb, 0xff, 0xfb, + 0xbf, 0xff, 0x7f, 0xff, 0x77, 0xf7, 0xbd, 0xef, + 0xff, 0xf0, 0xff, 0xf0, 0x0f, 0xfc, 0xcc, 0x3c, + 0xcc, 0x33, 0xcc, 0xcf, 0xff, 0xef, 0xff, 0xee, + 0xff, 0xfd, 0xff, 0xfd, 0xdf, 0xff, 0xbf, 0xff, + 0xbb, 0xff, 0xf7, 0xff, 0xf7, 0x7f, 0x7b, 0xde, +}; + +static const u8 tuning_blk_pattern_8bit[] = { + 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00, 0x00, + 0xff, 0xff, 0xcc, 0xcc, 0xcc, 0x33, 0xcc, 0xcc, + 0xcc, 0x33, 0x33, 0xcc, 0xcc, 0xcc, 0xff, 0xff, + 0xff, 0xee, 0xff, 0xff, 0xff, 0xee, 0xee, 0xff, + 0xff, 0xff, 0xdd, 0xff, 0xff, 0xff, 0xdd, 0xdd, + 0xff, 0xff, 0xff, 0xbb, 0xff, 0xff, 0xff, 0xbb, + 0xbb, 0xff, 0xff, 0xff, 0x77, 0xff, 0xff, 0xff, + 0x77, 0x77, 0xff, 0x77, 0xbb, 0xdd, 0xee, 0xff, + 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00, + 0x00, 0xff, 0xff, 0xcc, 0xcc, 0xcc, 0x33, 0xcc, + 0xcc, 0xcc, 0x33, 0x33, 0xcc, 0xcc, 0xcc, 0xff, + 0xff, 0xff, 0xee, 0xff, 0xff, 0xff, 0xee, 0xee, + 0xff, 0xff, 0xff, 0xdd, 0xff, 0xff, 0xff, 0xdd, + 0xdd, 0xff, 0xff, 0xff, 0xbb, 0xff, 0xff, 0xff, + 0xbb, 0xbb, 0xff, 0xff, 0xff, 0x77, 0xff, 0xff, + 0xff, 0x77, 0x77, 0xff, 0x77, 0xbb, 0xdd, 0xee, +}; + static inline int __mmc_send_status(struct mmc_card *card, u32 *status, bool ignore_crc) { diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index 49ad7a943638..fb97b5cc91cd 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -53,11 +53,6 @@ #define MMC_SEND_TUNING_BLOCK 19 /* adtc R1 */ #define MMC_SEND_TUNING_BLOCK_HS200 21 /* adtc R1 */ -#define MMC_TUNING_BLK_PATTERN_4BIT_SIZE 64 -#define MMC_TUNING_BLK_PATTERN_8BIT_SIZE 128 -extern const u8 tuning_blk_pattern_4bit[MMC_TUNING_BLK_PATTERN_4BIT_SIZE]; -extern const u8 tuning_blk_pattern_8bit[MMC_TUNING_BLK_PATTERN_8BIT_SIZE]; - /* class 3 */ #define MMC_WRITE_DAT_UNTIL_STOP 20 /* adtc [31:0] data addr R1 */ -- cgit v1.2.3 From 348487cb28e66b032bae1b38424d81bf5b444408 Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Tue, 9 Dec 2014 17:04:05 +0800 Subject: mmc: sdhci: use pipeline mmc requests to improve performance This patch is based on the patches by Per Forlin, Tony Lin and Ryan QIAN. This patch completes the 'pre_req' and 'post_req' API on the sdhci host side. Test Env: 1. i.MX6Q-SABREAUTO board, CPU @ 996MHz, use ADMA in uSDHC controller. 2. Test command: $ echo 1 > /proc/sys/vm/drop_caches write to sd card: $ dd if=/dev/zero of=/dev/mmcblk0 bs=1M count=2000 conv=fsync read the sd card: $ dd if=/dev/mmcblk0 of=/dev/null bs=1M count=2000 3. TOSHIBA 16GB SD3.0 card, running at 4 bit, SDR104 @ 198MHZ Performance with and without this patch: ------------------------------------------------- | | read speed | write speed | |------------------------------------------------ | with this patch | ~76.7 MB/s | ~23.3 MB/s | |------------------------------------------------ |without this patch | ~60.5 MB/s | ~22.5 MB/s | ------------------------------------------------- 4.
SanDisk 8GB SD3.0 card, running at 4 bit, DDR50 @ 50MHZ Performance with and without this patch: ------------------------------------------------- | | read speed | write speed | |------------------------------------------------ | with this patch | ~40.5 MB/s | ~15.6 MB/s | |------------------------------------------------ |without this patch | ~36.1 MB/s | ~14.1 MB/s | ------------------------------------------------- 5. Kingston 8GB SD2.0 card, running at 4 bit, High-speed @ 50MHZ Performance with and without this patch: ------------------------------------------------- | | read speed | write speed | |------------------------------------------------ | with this patch | ~22.7 MB/s | ~8.2 MB/s | |------------------------------------------------ |without this patch | ~21.3 MB/s | ~8.0 MB/s | ------------------------------------------------- 6. About eMMC, Sandisk 8GB eMMC on i.MX6DL-sabresd board, CPU @ 792MHZ, eMMC running at 8 bit, DDR52 @ 52MHZ. Performance with and without this patch: ------------------------------------------------- | | read speed | write speed | |------------------------------------------------ | with this patch | ~37.3 MB/s | ~10.5 MB/s | |------------------------------------------------ |without this patch | ~33.4 MB/s | ~10.5 MB/s | ------------------------------------------------- Signed-off-by: Haibo Chen Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 99 +++++++++++++++++++++++++++++++++++++++++------ include/linux/mmc/sdhci.h | 6 +++ 2 files changed, 93 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 2558d705c92e..1aab8a120d02 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -53,6 +53,9 @@ static void sdhci_finish_command(struct sdhci_host *); static int sdhci_execute_tuning(struct mmc_host *mmc, u32 opcode); static void sdhci_tuning_timer(unsigned long data); static void sdhci_enable_preset_value(struct sdhci_host *host, bool enable); +static int sdhci_pre_dma_transfer(struct sdhci_host *host, + struct mmc_data *data, + struct sdhci_host_next *next); #ifdef CONFIG_PM static int sdhci_runtime_pm_get(struct sdhci_host *host); @@ -505,9 +508,8 @@ static int sdhci_adma_table_pre(struct sdhci_host *host, goto fail; BUG_ON(host->align_addr & host->align_mask); - host->sg_count = dma_map_sg(mmc_dev(host->mmc), - data->sg, data->sg_len, direction); - if (host->sg_count == 0) + host->sg_count = sdhci_pre_dma_transfer(host, data, NULL); + if (host->sg_count < 0) goto unmap_align; desc = host->adma_table; @@ -645,8 +647,9 @@ static void sdhci_adma_table_post(struct sdhci_host *host, } } - dma_unmap_sg(mmc_dev(host->mmc), data->sg, - data->sg_len, direction); + if (!data->host_cookie) + dma_unmap_sg(mmc_dev(host->mmc), data->sg, + data->sg_len, direction); } static u8 sdhci_calc_timeout(struct sdhci_host *host, struct mmc_command *cmd) @@ -842,11 +845,7 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_command *cmd) } else { int sg_cnt; - sg_cnt = dma_map_sg(mmc_dev(host->mmc), - data->sg, data->sg_len, - (data->flags & MMC_DATA_READ) ? - DMA_FROM_DEVICE : - DMA_TO_DEVICE); + sg_cnt = sdhci_pre_dma_transfer(host, data, NULL); if (sg_cnt == 0) { /* * This only happens when someone fed @@ -959,8 +958,10 @@ static void sdhci_finish_data(struct sdhci_host *host) if (host->flags & SDHCI_USE_ADMA) sdhci_adma_table_post(host, data); else { - dma_unmap_sg(mmc_dev(host->mmc), data->sg, - data->sg_len, (data->flags & MMC_DATA_READ) ? 
+ if (!data->host_cookie) + dma_unmap_sg(mmc_dev(host->mmc), + data->sg, data->sg_len, + (data->flags & MMC_DATA_READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } } @@ -2125,6 +2126,77 @@ static void sdhci_enable_preset_value(struct sdhci_host *host, bool enable) } } +static void sdhci_post_req(struct mmc_host *mmc, struct mmc_request *mrq, + int err) +{ + struct sdhci_host *host = mmc_priv(mmc); + struct mmc_data *data = mrq->data; + + if (host->flags & SDHCI_REQ_USE_DMA) { + if (data->host_cookie) + dma_unmap_sg(mmc_dev(host->mmc), data->sg, data->sg_len, + data->flags & MMC_DATA_WRITE ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + mrq->data->host_cookie = 0; + } +} + +static int sdhci_pre_dma_transfer(struct sdhci_host *host, + struct mmc_data *data, + struct sdhci_host_next *next) +{ + int sg_count; + + if (!next && data->host_cookie && + data->host_cookie != host->next_data.cookie) { + pr_debug(DRIVER_NAME "[%s] invalid cookie: %d, next-cookie %d\n", + __func__, data->host_cookie, host->next_data.cookie); + data->host_cookie = 0; + } + + /* Check if next job is already prepared */ + if (next || + (!next && data->host_cookie != host->next_data.cookie)) { + sg_count = dma_map_sg(mmc_dev(host->mmc), data->sg, + data->sg_len, + data->flags & MMC_DATA_WRITE ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + + } else { + sg_count = host->next_data.sg_count; + host->next_data.sg_count = 0; + } + + + if (sg_count == 0) + return -EINVAL; + + if (next) { + next->sg_count = sg_count; + data->host_cookie = ++next->cookie < 0 ? 1 : next->cookie; + } else + host->sg_count = sg_count; + + return sg_count; +} + +static void sdhci_pre_req(struct mmc_host *mmc, struct mmc_request *mrq, + bool is_first_req) +{ + struct sdhci_host *host = mmc_priv(mmc); + + if (mrq->data->host_cookie) { + mrq->data->host_cookie = 0; + return; + } + + if (host->flags & SDHCI_REQ_USE_DMA) + if (sdhci_pre_dma_transfer(host, + mrq->data, + &host->next_data) < 0) + mrq->data->host_cookie = 0; +} + static void sdhci_card_event(struct mmc_host *mmc) { struct sdhci_host *host = mmc_priv(mmc); @@ -2158,6 +2230,8 @@ static void sdhci_card_event(struct mmc_host *mmc) static const struct mmc_host_ops sdhci_ops = { .request = sdhci_request, + .post_req = sdhci_post_req, + .pre_req = sdhci_pre_req, .set_ios = sdhci_set_ios, .get_cd = sdhci_get_cd, .get_ro = sdhci_get_ro, @@ -3015,6 +3089,7 @@ int sdhci_add_host(struct sdhci_host *host) host->max_clk = host->ops->get_max_clock(host); } + host->next_data.cookie = 1; /* * In case of Host Controller v3.00, find out whether clock * multiplier is supported. diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index f767a0de611f..cb8b94ff6a26 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -17,6 +17,11 @@ #include #include +struct sdhci_host_next { + unsigned int sg_count; + s32 cookie; +}; + struct sdhci_host { /* Data set by hardware interface driver */ const char *hw_name; /* Hardware bus name */ @@ -203,6 +208,7 @@ struct sdhci_host { #define SDHCI_TUNING_MODE_1 0 struct timer_list tuning_timer; /* Timer for tuning */ + struct sdhci_host_next next_data; unsigned long private[0] ____cacheline_aligned; }; #endif /* LINUX_MMC_SDHCI_H */ -- cgit v1.2.3 From 83533ab28380f6957af39a7b322e639e42dbdaf1 Mon Sep 17 00:00:00 2001 From: Johan Rudholm Date: Mon, 12 Jan 2015 15:38:04 +0100 Subject: mmc: core: always check status after reset Always check if the card is alive after a successful reset. 
This allows us to remove mmc_hw_reset_check(), leaving mmc_hw_reset() as the only card reset interface. Signed-off-by: Johan Rudholm Signed-off-by: Ulf Hansson --- drivers/mmc/card/mmc_test.c | 18 +++++++----------- drivers/mmc/core/core.c | 24 +++++------------------- include/linux/mmc/core.h | 1 - 3 files changed, 12 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/card/mmc_test.c b/drivers/mmc/card/mmc_test.c index 0a7430f94d29..7dac4695163b 100644 --- a/drivers/mmc/card/mmc_test.c +++ b/drivers/mmc/card/mmc_test.c @@ -2342,20 +2342,16 @@ static int mmc_test_hw_reset(struct mmc_test_card *test) struct mmc_host *host = card->host; int err; - err = mmc_hw_reset_check(host); + if (!mmc_card_mmc(card) || !mmc_can_reset(card)) + return RESULT_UNSUP_CARD; + + err = mmc_hw_reset(host); if (!err) return RESULT_OK; + else if (err == -EOPNOTSUPP) + return RESULT_UNSUP_HOST; - if (err == -ENOSYS) - return RESULT_FAIL; - - if (err != -EOPNOTSUPP) - return err; - - if (!mmc_can_reset(card)) - return RESULT_UNSUP_CARD; - - return RESULT_UNSUP_HOST; + return RESULT_FAIL; } static const struct mmc_test_case mmc_test_cases[] = { diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index d3bfbdfab052..72070f188cc5 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -2286,9 +2286,10 @@ int mmc_can_reset(struct mmc_card *card) } EXPORT_SYMBOL(mmc_can_reset); -static int mmc_do_hw_reset(struct mmc_host *host, int check) +int mmc_hw_reset(struct mmc_host *host) { struct mmc_card *card = host->card; + u32 status; if (!(host->caps & MMC_CAP_HW_RESET) || !host->ops->hw_reset) return -EOPNOTSUPP; @@ -2305,13 +2306,9 @@ static int mmc_do_hw_reset(struct mmc_host *host, int check) host->ops->hw_reset(host); /* If the reset has happened, then a status command will fail */ - if (check) { - u32 status; - - if (!mmc_send_status(card, &status)) { - mmc_host_clk_release(host); - return -ENOSYS; - } + if (!mmc_send_status(card, &status)) { + mmc_host_clk_release(host); + return -ENOSYS; } /* Set initial state and call mmc_set_ios */ @@ -2321,19 +2318,8 @@ static int mmc_do_hw_reset(struct mmc_host *host, int check) return host->bus_ops->power_restore(host); } - -int mmc_hw_reset(struct mmc_host *host) -{ - return mmc_do_hw_reset(host, 0); -} EXPORT_SYMBOL(mmc_hw_reset); -int mmc_hw_reset_check(struct mmc_host *host) -{ - return mmc_do_hw_reset(host, 1); -} -EXPORT_SYMBOL(mmc_hw_reset_check); - static int mmc_rescan_try_freq(struct mmc_host *host, unsigned freq) { host->f_init = freq; diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h index cb2b0400d284..160448f920ac 100644 --- a/include/linux/mmc/core.h +++ b/include/linux/mmc/core.h @@ -182,7 +182,6 @@ extern int mmc_set_blocklen(struct mmc_card *card, unsigned int blocklen); extern int mmc_set_blockcount(struct mmc_card *card, unsigned int blockcount, bool is_rel_write); extern int mmc_hw_reset(struct mmc_host *host); -extern int mmc_hw_reset_check(struct mmc_host *host); extern int mmc_can_reset(struct mmc_card *card); extern void mmc_set_data_timeout(struct mmc_data *, const struct mmc_card *); -- cgit v1.2.3 From c7ea834d81904b71505093f7ec50d036132cf628 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 13 Jan 2015 08:23:18 +1300 Subject: mmc: slot-gpio: Allow host driver to provide isr for card-detect interrupts One of the reasons omap_hsmmc doesn't use the slot-gpio library is that it has some non-standard functionality in the card-detect interrupt service routine. 
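The hook added below, mmc_gpio_set_cd_isr(), addresses exactly this. A hedged sketch of how a host driver might use it; the ISR body is hypothetical, only the registration calls come from this patch:

	/* Hypothetical driver ISR; sketch only. */
	static irqreturn_t my_cd_isr(int irq, void *dev_id)
	{
		struct mmc_host *mmc = dev_id;

		/* non-standard, driver-specific card-detect handling here */
		mmc_detect_change(mmc, msecs_to_jiffies(200));
		return IRQ_HANDLED;
	}

	/* In probe: install the custom ISR before requesting the CD IRQ,
	 * otherwise slot-gpio falls back to its default mmc_gpio_cd_irqt. */
	mmc_gpio_set_cd_isr(mmc, my_cd_isr);
	mmc_gpiod_request_cd_irq(mmc);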
To make it possible for omap_hsmmc (and maybe others) to be converted to use slot-gpio, add 'mmc_gpio_set_cd_isr', which provides an alternate isr to be registered by the slot-gpio code. Signed-off-by: NeilBrown Signed-off-by: Ulf Hansson --- drivers/mmc/core/slot-gpio.c | 18 +++++++++++++++++- include/linux/mmc/slot-gpio.h | 2 ++ 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/slot-gpio.c b/drivers/mmc/core/slot-gpio.c index 1a3edbd47719..27117ba47073 100644 --- a/drivers/mmc/core/slot-gpio.c +++ b/drivers/mmc/core/slot-gpio.c @@ -25,6 +25,7 @@ struct mmc_gpio { struct gpio_desc *cd_gpio; bool override_ro_active_level; bool override_cd_active_level; + irqreturn_t (*cd_gpio_isr)(int irq, void *dev_id); char *ro_label; char cd_label[0]; }; @@ -136,8 +137,10 @@ void mmc_gpiod_request_cd_irq(struct mmc_host *host) irq = -EINVAL; if (irq >= 0) { + if (!ctx->cd_gpio_isr) + ctx->cd_gpio_isr = mmc_gpio_cd_irqt; ret = devm_request_threaded_irq(host->parent, irq, - NULL, mmc_gpio_cd_irqt, + NULL, ctx->cd_gpio_isr, IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING | IRQF_ONESHOT, ctx->cd_label, host); if (ret < 0) @@ -151,6 +154,19 @@ void mmc_gpiod_request_cd_irq(struct mmc_host *host) } EXPORT_SYMBOL(mmc_gpiod_request_cd_irq); +/* Register an alternate interrupt service routine for + * the card-detect GPIO. + */ +void mmc_gpio_set_cd_isr(struct mmc_host *host, + irqreturn_t (*isr)(int irq, void *dev_id)) +{ + struct mmc_gpio *ctx = host->slot.handler_priv; + + WARN_ON(ctx->cd_gpio_isr); + ctx->cd_gpio_isr = isr; +} +EXPORT_SYMBOL(mmc_gpio_set_cd_isr); + /** * mmc_gpio_request_cd - request a gpio for card-detection * @host: mmc host diff --git a/include/linux/mmc/slot-gpio.h b/include/linux/mmc/slot-gpio.h index 4a36d6954631..3945a8c9d3cb 100644 --- a/include/linux/mmc/slot-gpio.h +++ b/include/linux/mmc/slot-gpio.h @@ -26,6 +26,8 @@ int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id, int mmc_gpiod_request_ro(struct mmc_host *host, const char *con_id, unsigned int idx, bool override_active_level, unsigned int debounce, bool *gpio_invert); +void mmc_gpio_set_cd_isr(struct mmc_host *host, + irqreturn_t (*isr)(int irq, void *dev_id)); void mmc_gpiod_request_cd_irq(struct mmc_host *host); #endif -- cgit v1.2.3 From 85b4545629663486b7f71047ce3b54fa0ad3eb28 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jan 2015 17:51:14 +0000 Subject: iommu: Consolidate IOVA allocator code In order to share the IOVA allocator with other architectures, break the unnecessary dependency on the Intel IOMMU driver and move the remaining IOVA internals to iova.c. Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 33 ++------------------------------- drivers/iommu/iova.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/iova.h | 3 +++ 3 files changed, 40 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 40dfbc0444c0..e758d8ed8fb5 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -485,7 +485,6 @@ __setup("intel_iommu=", intel_iommu_setup); static struct kmem_cache *iommu_domain_cache; static struct kmem_cache *iommu_devinfo_cache; -static struct kmem_cache *iommu_iova_cache; static inline void *alloc_pgtable_page(int node) { @@ -523,16 +522,6 @@ static inline void free_devinfo_mem(void *vaddr) kmem_cache_free(iommu_devinfo_cache, vaddr); } -struct iova *alloc_iova_mem(void) -{ - return
kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC); -} - -void free_iova_mem(struct iova *iova) -{ - kmem_cache_free(iommu_iova_cache, iova); -} - static inline int domain_type_is_vm(struct dmar_domain *domain) { return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE; @@ -3427,23 +3416,6 @@ static inline int iommu_devinfo_cache_init(void) return ret; } -static inline int iommu_iova_cache_init(void) -{ - int ret = 0; - - iommu_iova_cache = kmem_cache_create("iommu_iova", - sizeof(struct iova), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!iommu_iova_cache) { - printk(KERN_ERR "Couldn't create iova cache\n"); - ret = -ENOMEM; - } - - return ret; -} - static int __init iommu_init_mempool(void) { int ret; @@ -3461,7 +3433,7 @@ static int __init iommu_init_mempool(void) kmem_cache_destroy(iommu_domain_cache); domain_error: - kmem_cache_destroy(iommu_iova_cache); + iommu_iova_cache_destroy(); return -ENOMEM; } @@ -3470,8 +3442,7 @@ static void __init iommu_exit_mempool(void) { kmem_cache_destroy(iommu_devinfo_cache); kmem_cache_destroy(iommu_domain_cache); - kmem_cache_destroy(iommu_iova_cache); - + iommu_iova_cache_destroy(); } static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index f6b17e6af2fb..520b8c8ae0c4 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -18,6 +18,41 @@ */ #include +#include + +static struct kmem_cache *iommu_iova_cache; + +int iommu_iova_cache_init(void) +{ + int ret = 0; + + iommu_iova_cache = kmem_cache_create("iommu_iova", + sizeof(struct iova), + 0, + SLAB_HWCACHE_ALIGN, + NULL); + if (!iommu_iova_cache) { + pr_err("Couldn't create iova cache\n"); + ret = -ENOMEM; + } + + return ret; +} + +void iommu_iova_cache_destroy(void) +{ + kmem_cache_destroy(iommu_iova_cache); +} + +struct iova *alloc_iova_mem(void) +{ + return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC); +} + +void free_iova_mem(struct iova *iova) +{ + kmem_cache_free(iommu_iova_cache, iova); +} void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit) diff --git a/include/linux/iova.h b/include/linux/iova.h index 19e81d5ccb6d..ad0507c61cc7 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -39,6 +39,9 @@ static inline unsigned long iova_size(struct iova *iova) return iova->pfn_hi - iova->pfn_lo + 1; } +int iommu_iova_cache_init(void); +void iommu_iova_cache_destroy(void); + struct iova *alloc_iova_mem(void); void free_iova_mem(struct iova *iova); void free_iova(struct iova_domain *iovad, unsigned long pfn); -- cgit v1.2.3 From 1b72250076dde4276acecf3a7da722b185703e78 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jan 2015 17:51:15 +0000 Subject: iommu: Make IOVA domain low limit flexible To share the IOVA allocator with other architectures, it needs to accommodate more general aperture restrictions; move the lower limit from a compile-time constant to a runtime domain property to allow IOVA domains with different requirements to co-exist. Also reword the slightly unclear description of alloc_iova since we're touching it anyway. 
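As a worked example of the new interface, a caller that previously relied on the implicit lower limit now passes it explicitly. The values below are illustrative; IOVA_PFN is the Intel driver's pfn-conversion macro:

	struct iova_domain iovad;
	unsigned long nr_pages = 16;	/* illustrative size */

	/* pfn 1 as the lower limit, matching the old IOVA_START_PFN default */
	init_iova_domain(&iovad, 1, IOVA_PFN(DMA_BIT_MASK(32)));

	/* allocation searches top-down from limit_pfn toward iovad->start_pfn */
	struct iova *iova = alloc_iova(&iovad, nr_pages,
				       IOVA_PFN(DMA_BIT_MASK(32)), true);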
Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 9 ++++++--- drivers/iommu/iova.c | 10 ++++++---- include/linux/iova.h | 7 +++---- 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index e758d8ed8fb5..86f9e82b015b 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -71,6 +71,9 @@ __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) +/* IO virtual address start page frame number */ +#define IOVA_START_PFN (1) + #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32)) #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64)) @@ -1632,7 +1635,7 @@ static int dmar_init_reserved_ranges(void) struct iova *iova; int i; - init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN); + init_iova_domain(&reserved_iova_list, IOVA_START_PFN, DMA_32BIT_PFN); lockdep_set_class(&reserved_iova_list.iova_rbtree_lock, &reserved_rbtree_key); @@ -1690,7 +1693,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width) int adjust_width, agaw; unsigned long sagaw; - init_iova_domain(&domain->iovad, DMA_32BIT_PFN); + init_iova_domain(&domain->iovad, IOVA_START_PFN, DMA_32BIT_PFN); domain_reserve_special_ranges(domain); /* calculate AGAW */ @@ -4313,7 +4316,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) { int adjust_width; - init_iova_domain(&domain->iovad, DMA_32BIT_PFN); + init_iova_domain(&domain->iovad, IOVA_START_PFN, DMA_32BIT_PFN); domain_reserve_special_ranges(domain); /* calculate AGAW */ diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 520b8c8ae0c4..a3dbba8caa19 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -55,11 +55,13 @@ void free_iova_mem(struct iova *iova) } void -init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit) +init_iova_domain(struct iova_domain *iovad, unsigned long start_pfn, + unsigned long pfn_32bit) { spin_lock_init(&iovad->iova_rbtree_lock); iovad->rbroot = RB_ROOT; iovad->cached32_node = NULL; + iovad->start_pfn = start_pfn; iovad->dma_32bit_pfn = pfn_32bit; } @@ -162,7 +164,7 @@ move_left: if (!curr) { if (size_aligned) pad_size = iova_get_pad_size(size, limit_pfn); - if ((IOVA_START_PFN + size + pad_size) > limit_pfn) { + if ((iovad->start_pfn + size + pad_size) > limit_pfn) { spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags); return -ENOMEM; } @@ -237,8 +239,8 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova) * @size: - size of page frames to allocate * @limit_pfn: - max limit address * @size_aligned: - set if size_aligned address range is required - * This function allocates an iova in the range limit_pfn to IOVA_START_PFN - * looking from limit_pfn instead from IOVA_START_PFN. If the size_aligned + * This function allocates an iova in the range iovad->start_pfn to limit_pfn, + * searching top-down from limit_pfn to iovad->start_pfn. If the size_aligned * flag is set then the allocated address iova->pfn_lo will be naturally * aligned on roundup_power_of_two(size). 
*/ diff --git a/include/linux/iova.h b/include/linux/iova.h index ad0507c61cc7..591b19626b46 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -16,9 +16,6 @@ #include #include -/* IO virtual address start page frame number */ -#define IOVA_START_PFN (1) - /* iova structure */ struct iova { struct rb_node node; @@ -31,6 +28,7 @@ struct iova_domain { spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */ struct rb_root rbroot; /* iova domain rbtree root */ struct rb_node *cached32_node; /* Save last alloced node */ + unsigned long start_pfn; /* Lower limit for this domain */ unsigned long dma_32bit_pfn; }; @@ -52,7 +50,8 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size, struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, unsigned long pfn_hi); void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); -void init_iova_domain(struct iova_domain *iovad, unsigned long pfn_32bit); +void init_iova_domain(struct iova_domain *iovad, unsigned long start_pfn, + unsigned long pfn_32bit); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); struct iova *split_and_remove_iova(struct iova_domain *iovad, -- cgit v1.2.3 From 0fb5fe874c42942e16c450ae05da453e13a1c09e Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 Jan 2015 17:51:16 +0000 Subject: iommu: Make IOVA domain page size explicit Systems may contain heterogeneous IOMMUs supporting differing minimum page sizes, which may also not be common with the CPU page size. Thus it is practical to have an explicit notion of IOVA granularity to simplify handling of mapping and allocation constraints. As an initial step, move the IOVA page granularity from an implicit compile-time constant to a per-domain property so we can make use of it in IOVA domain context at runtime. To keep the abstraction tidy, extend the little API of inline iova_* helpers to parallel some of the equivalent PAGE_* macros. 
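To see how the helpers compose, here is a sketch of a map path built on them; phys, size and dma are assumed to come from the surrounding DMA layer, and only the helper names are from this patch:

	struct iova_domain iovad;

	/* 4 KiB granule (illustrative), pfn 1 lower limit, 32-bit ceiling */
	init_iova_domain(&iovad, SZ_4K, 1, IOVA_PFN(DMA_BIT_MASK(32)));

	/* map 'size' bytes at physical address 'phys' (sketch only) */
	size_t off = iova_offset(&iovad, phys);
	size_t len = iova_align(&iovad, size + off);
	struct iova *iova = alloc_iova(&iovad, len >> iova_shift(&iovad),
				       IOVA_PFN(DMA_BIT_MASK(32)), true);
	if (iova)
		dma = iova_dma_addr(&iovad, iova) + off;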
Signed-off-by: Robin Murphy Signed-off-by: Joerg Roedel --- drivers/iommu/intel-iommu.c | 9 ++++++--- drivers/iommu/iova.c | 12 ++++++++++-- include/linux/iova.h | 35 +++++++++++++++++++++++++++++++++-- 3 files changed, 49 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 86f9e82b015b..ae4c1a854e57 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1635,7 +1635,8 @@ static int dmar_init_reserved_ranges(void) struct iova *iova; int i; - init_iova_domain(&reserved_iova_list, IOVA_START_PFN, DMA_32BIT_PFN); + init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN, + DMA_32BIT_PFN); lockdep_set_class(&reserved_iova_list.iova_rbtree_lock, &reserved_rbtree_key); @@ -1693,7 +1694,8 @@ static int domain_init(struct dmar_domain *domain, int guest_width) int adjust_width, agaw; unsigned long sagaw; - init_iova_domain(&domain->iovad, IOVA_START_PFN, DMA_32BIT_PFN); + init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN, + DMA_32BIT_PFN); domain_reserve_special_ranges(domain); /* calculate AGAW */ @@ -4316,7 +4318,8 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) { int adjust_width; - init_iova_domain(&domain->iovad, IOVA_START_PFN, DMA_32BIT_PFN); + init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN, + DMA_32BIT_PFN); domain_reserve_special_ranges(domain); /* calculate AGAW */ diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index a3dbba8caa19..9dd8208312c2 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -55,12 +55,20 @@ void free_iova_mem(struct iova *iova) } void -init_iova_domain(struct iova_domain *iovad, unsigned long start_pfn, - unsigned long pfn_32bit) +init_iova_domain(struct iova_domain *iovad, unsigned long granule, + unsigned long start_pfn, unsigned long pfn_32bit) { + /* + * IOVA granularity will normally be equal to the smallest + * supported IOMMU page size; both *must* be capable of + * representing individual CPU pages exactly. 
+ */ + BUG_ON((granule > PAGE_SIZE) || !is_power_of_2(granule)); + spin_lock_init(&iovad->iova_rbtree_lock); iovad->rbroot = RB_ROOT; iovad->cached32_node = NULL; + iovad->granule = granule; iovad->start_pfn = start_pfn; iovad->dma_32bit_pfn = pfn_32bit; } diff --git a/include/linux/iova.h b/include/linux/iova.h index 591b19626b46..3920a19d8194 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -28,6 +28,7 @@ struct iova_domain { spinlock_t iova_rbtree_lock; /* Lock to protect update of rbtree */ struct rb_root rbroot; /* iova domain rbtree root */ struct rb_node *cached32_node; /* Save last alloced node */ + unsigned long granule; /* pfn granularity for this domain */ unsigned long start_pfn; /* Lower limit for this domain */ unsigned long dma_32bit_pfn; }; @@ -37,6 +38,36 @@ static inline unsigned long iova_size(struct iova *iova) return iova->pfn_hi - iova->pfn_lo + 1; } +static inline unsigned long iova_shift(struct iova_domain *iovad) +{ + return __ffs(iovad->granule); +} + +static inline unsigned long iova_mask(struct iova_domain *iovad) +{ + return iovad->granule - 1; +} + +static inline size_t iova_offset(struct iova_domain *iovad, dma_addr_t iova) +{ + return iova & iova_mask(iovad); +} + +static inline size_t iova_align(struct iova_domain *iovad, size_t size) +{ + return ALIGN(size, iovad->granule); +} + +static inline dma_addr_t iova_dma_addr(struct iova_domain *iovad, struct iova *iova) +{ + return (dma_addr_t)iova->pfn_lo << iova_shift(iovad); +} + +static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova) +{ + return iova >> iova_shift(iovad); +} + int iommu_iova_cache_init(void); void iommu_iova_cache_destroy(void); @@ -50,8 +81,8 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size, struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo, unsigned long pfn_hi); void copy_reserved_iova(struct iova_domain *from, struct iova_domain *to); -void init_iova_domain(struct iova_domain *iovad, unsigned long start_pfn, - unsigned long pfn_32bit); +void init_iova_domain(struct iova_domain *iovad, unsigned long granule, + unsigned long start_pfn, unsigned long pfn_32bit); struct iova *find_iova(struct iova_domain *iovad, unsigned long pfn); void put_iova_domain(struct iova_domain *iovad); struct iova *split_and_remove_iova(struct iova_domain *iovad, -- cgit v1.2.3 From 72dd299d5039a336493993dcc63413cf31d0e662 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 16 Jan 2015 15:13:02 -0800 Subject: libata: allow sata_sil24 to opt-out of tag ordered submission Ronny reports: https://bugzilla.kernel.org/show_bug.cgi?id=87101 "Since commit 8a4aeec8d "libata/ahci: accommodate tag ordered controllers" the access to the harddisk on the first SATA-port is failing on its first access. The access to the harddisk on the second port is working normal. When reverting the above commit, access to both harddisks is working fine again." Maintain tag ordered submission as the default, but allow sata_sil24 to continue with the old behavior. 
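To make the behavioural difference concrete, a standalone sketch of the two allocation orders, simplified from ata_qc_new() (not kernel code):

	/* Default: round-robin, continuing after the last allocated tag. */
	static unsigned int next_tag_default(unsigned int last_tag, unsigned int i,
					     unsigned int max_queue)
	{
		return (last_tag + 1 + i) % max_queue;
	}

	/* ATA_FLAG_LOWTAG: always probe from the lowest tag upward, the
	 * behaviour sata_sil24 depended on before commit 8a4aeec8d. */
	static unsigned int next_tag_lowtag(unsigned int i)
	{
		return i;
	}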
Cc: Cc: Tejun Heo Reported-by: Ronny Hegewald Signed-off-by: Dan Williams Signed-off-by: Tejun Heo --- drivers/ata/libata-core.c | 5 ++++- drivers/ata/sata_sil24.c | 2 +- include/linux/libata.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 23c2ae03a7ab..d1a05f9bb91f 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4771,7 +4771,10 @@ static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap) return NULL; for (i = 0, tag = ap->last_tag + 1; i < max_queue; i++, tag++) { - tag = tag < max_queue ? tag : 0; + if (ap->flags & ATA_FLAG_LOWTAG) + tag = i; + else + tag = tag < max_queue ? tag : 0; /* the last tag is reserved for internal command. */ if (tag == ATA_TAG_INTERNAL) diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c index d81b20ddb527..ea655949023f 100644 --- a/drivers/ata/sata_sil24.c +++ b/drivers/ata/sata_sil24.c @@ -246,7 +246,7 @@ enum { /* host flags */ SIL24_COMMON_FLAGS = ATA_FLAG_SATA | ATA_FLAG_PIO_DMA | ATA_FLAG_NCQ | ATA_FLAG_ACPI_SATA | - ATA_FLAG_AN | ATA_FLAG_PMP, + ATA_FLAG_AN | ATA_FLAG_PMP | ATA_FLAG_LOWTAG, SIL24_FLAG_PCIX_IRQ_WOC = (1 << 24), /* IRQ loss errata on PCI-X */ IRQ_STAT_4PORTS = 0xf, diff --git a/include/linux/libata.h b/include/linux/libata.h index f2b440e44fd7..91f705de2c0b 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -231,6 +231,7 @@ enum { ATA_FLAG_SW_ACTIVITY = (1 << 22), /* driver supports sw activity * led */ ATA_FLAG_NO_DIPM = (1 << 23), /* host not happy with DIPM */ + ATA_FLAG_LOWTAG = (1 << 24), /* host wants lowest available tag */ /* bits 24:31 of ap->flags are reserved for LLD specific flags */ -- cgit v1.2.3 From c7d7ddee7e24eedde6149eefbcfbfbc7125b9ff0 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Thu, 15 Jan 2015 15:09:37 +0100 Subject: ata: libahci: Allow using multiple regulators The current implementation of libahci allows using multiple PHYs but not multiple regulators. This patch adds support for multiple regulators. Until now it was mandatory to have a PHY under a subnode; now a port subnode can contain either a regulator or a PHY (or both). In order to be able to associate a port with a regulator, the ports are now platform devices in the device tree case. There was only one driver which used the regulator field of the ahci_host_priv structure directly. To preserve bisectability, the change in the ahci_imx driver was done in the same patch. Signed-off-by: Gregory CLEMENT Acked-by: Hans de Goede Signed-off-by: Tejun Heo --- drivers/ata/ahci.h | 2 +- drivers/ata/ahci_imx.c | 14 +-- drivers/ata/libahci_platform.c | 230 +++++++++++++++++++++++++++++------------ include/linux/ahci_platform.h | 2 + 4 files changed, 173 insertions(+), 75 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index 40f0e34f17af..275358ae0b3f 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -333,7 +333,7 @@ struct ahci_host_priv { u32 em_msg_type; /* EM message type */ bool got_runtime_pm; /* Did we do pm_runtime_get? */ struct clk *clks[AHCI_MAX_CLKS]; /* Optional */ - struct regulator *target_pwr; /* Optional */ + struct regulator **target_pwrs; /* Optional */ /* * If platform uses PHYs. There is a 1:1 relation between the port number and * the PHY position in this array.
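The ahci_imx conversion below shows the intended usage; more generally, a libahci-based driver now brackets its power sequencing with the new helpers instead of touching a single regulator field. A sketch under that assumption (the function name is hypothetical, error handling trimmed):

	static int my_ahci_power_up(struct ahci_host_priv *hpriv)
	{
		int rc;

		rc = ahci_platform_enable_regulators(hpriv);
		if (rc)
			return rc;

		rc = ahci_platform_enable_clks(hpriv);
		if (rc)
			ahci_platform_disable_regulators(hpriv);

		return rc;
	}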
diff --git a/drivers/ata/ahci_imx.c b/drivers/ata/ahci_imx.c index 35d51c59a370..41632e57d46f 100644 --- a/drivers/ata/ahci_imx.c +++ b/drivers/ata/ahci_imx.c @@ -221,11 +221,9 @@ static int imx_sata_enable(struct ahci_host_priv *hpriv) if (imxpriv->no_device) return 0; - if (hpriv->target_pwr) { - ret = regulator_enable(hpriv->target_pwr); - if (ret) - return ret; - } + ret = ahci_platform_enable_regulators(hpriv); + if (ret) + return ret; ret = clk_prepare_enable(imxpriv->sata_ref_clk); if (ret < 0) @@ -270,8 +268,7 @@ static int imx_sata_enable(struct ahci_host_priv *hpriv) disable_clk: clk_disable_unprepare(imxpriv->sata_ref_clk); disable_regulator: - if (hpriv->target_pwr) - regulator_disable(hpriv->target_pwr); + ahci_platform_disable_regulators(hpriv); return ret; } @@ -291,8 +288,7 @@ static void imx_sata_disable(struct ahci_host_priv *hpriv) clk_disable_unprepare(imxpriv->sata_ref_clk); - if (hpriv->target_pwr) - regulator_disable(hpriv->target_pwr); + ahci_platform_disable_regulators(hpriv); } static void ahci_imx_error_handler(struct ata_port *ap) diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index a147aaadca85..73a086664ee7 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "ahci.h" static void ahci_host_stop(struct ata_host *host); @@ -137,6 +138,59 @@ void ahci_platform_disable_clks(struct ahci_host_priv *hpriv) } EXPORT_SYMBOL_GPL(ahci_platform_disable_clks); +/** + * ahci_platform_enable_regulators - Enable regulators + * @hpriv: host private area to store config values + * + * This function enables all the regulators found in + * hpriv->target_pwrs, if any. If a regulator fails to be enabled, it + * disables all the regulators already enabled in reverse order and + * returns an error. + * + * RETURNS: + * 0 on success otherwise a negative error code + */ +int ahci_platform_enable_regulators(struct ahci_host_priv *hpriv) +{ + int rc, i; + + for (i = 0; i < hpriv->nports; i++) { + if (!hpriv->target_pwrs[i]) + continue; + + rc = regulator_enable(hpriv->target_pwrs[i]); + if (rc) + goto disable_target_pwrs; + } + + return 0; + +disable_target_pwrs: + while (--i >= 0) + if (hpriv->target_pwrs[i]) + regulator_disable(hpriv->target_pwrs[i]); + + return rc; +} +EXPORT_SYMBOL_GPL(ahci_platform_enable_regulators); + +/** + * ahci_platform_disable_regulators - Disable regulators + * @hpriv: host private area to store config values + * + * This function disables all regulators found in hpriv->target_pwrs. 
+ */ +void ahci_platform_disable_regulators(struct ahci_host_priv *hpriv) +{ + int i; + + for (i = 0; i < hpriv->nports; i++) { + if (!hpriv->target_pwrs[i]) + continue; + regulator_disable(hpriv->target_pwrs[i]); + } +} +EXPORT_SYMBOL_GPL(ahci_platform_disable_regulators); /** * ahci_platform_enable_resources - Enable platform resources * @hpriv: host private area to store config values @@ -157,11 +211,9 @@ int ahci_platform_enable_resources(struct ahci_host_priv *hpriv) { int rc; - if (hpriv->target_pwr) { - rc = regulator_enable(hpriv->target_pwr); - if (rc) - return rc; - } + rc = ahci_platform_enable_regulators(hpriv); + if (rc) + return rc; rc = ahci_platform_enable_clks(hpriv); if (rc) @@ -177,8 +229,8 @@ disable_clks: ahci_platform_disable_clks(hpriv); disable_regulator: - if (hpriv->target_pwr) - regulator_disable(hpriv->target_pwr); + ahci_platform_disable_regulators(hpriv); + return rc; } EXPORT_SYMBOL_GPL(ahci_platform_enable_resources); @@ -199,8 +251,7 @@ void ahci_platform_disable_resources(struct ahci_host_priv *hpriv) ahci_platform_disable_clks(hpriv); - if (hpriv->target_pwr) - regulator_disable(hpriv->target_pwr); + ahci_platform_disable_regulators(hpriv); } EXPORT_SYMBOL_GPL(ahci_platform_disable_resources); @@ -216,6 +267,68 @@ static void ahci_platform_put_resources(struct device *dev, void *res) for (c = 0; c < AHCI_MAX_CLKS && hpriv->clks[c]; c++) clk_put(hpriv->clks[c]); + /* + * The regulators are tied to child node device and not to the + * SATA device itself. So we can't use devm for automatically + * releasing them. We have to do it manually here. + */ + for (c = 0; c < hpriv->nports; c++) + if (hpriv->target_pwrs && hpriv->target_pwrs[c]) + regulator_put(hpriv->target_pwrs[c]); + +} + +static int ahci_platform_get_phy(struct ahci_host_priv *hpriv, u32 port, + struct device *dev, struct device_node *node) +{ + int rc; + + hpriv->phys[port] = devm_of_phy_get(dev, node, NULL); + + if (!IS_ERR(hpriv->phys[port])) + return 0; + + rc = PTR_ERR(hpriv->phys[port]); + switch (rc) { + case -ENOSYS: + /* No PHY support. Check if PHY is required. 
*/ + if (of_find_property(node, "phys", NULL)) { + dev_err(dev, + "couldn't get PHY in node %s: ENOSYS\n", + node->name); + break; + } + case -ENODEV: + /* continue normally */ + hpriv->phys[port] = NULL; + rc = 0; + break; + + default: + dev_err(dev, + "couldn't get PHY in node %s: %d\n", + node->name, rc); + + break; + } + + return rc; +} + +static int ahci_platform_get_regulator(struct ahci_host_priv *hpriv, u32 port, + struct device *dev) +{ + struct regulator *target_pwr; + int rc = 0; + + target_pwr = regulator_get_optional(dev, "target"); + + if (!IS_ERR(target_pwr)) + hpriv->target_pwrs[port] = target_pwr; + else + rc = PTR_ERR(target_pwr); + + return rc; } /** @@ -240,7 +353,7 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev) struct ahci_host_priv *hpriv; struct clk *clk; struct device_node *child; - int i, enabled_ports = 0, rc = -ENOMEM; + int i, sz, enabled_ports = 0, rc = -ENOMEM, child_nodes; u32 mask_port_map = 0; if (!devres_open_group(dev, NULL, GFP_KERNEL)) @@ -261,14 +374,6 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev) goto err_out; } - hpriv->target_pwr = devm_regulator_get_optional(dev, "target"); - if (IS_ERR(hpriv->target_pwr)) { - rc = PTR_ERR(hpriv->target_pwr); - if (rc == -EPROBE_DEFER) - goto err_out; - hpriv->target_pwr = NULL; - } - for (i = 0; i < AHCI_MAX_CLKS; i++) { /* * For now we must use clk_get(dev, NULL) for the first clock, @@ -290,19 +395,33 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev) hpriv->clks[i] = clk; } - hpriv->nports = of_get_child_count(dev->of_node); + hpriv->nports = child_nodes = of_get_child_count(dev->of_node); - if (hpriv->nports) { - hpriv->phys = devm_kzalloc(dev, - hpriv->nports * sizeof(*hpriv->phys), - GFP_KERNEL); - if (!hpriv->phys) { - rc = -ENOMEM; - goto err_out; - } + /* + * If no sub-node was found, we still need to set nports to + * one in order to be able to use the + * ahci_platform_[en|dis]able_[phys|regulators] functions. 
+ */ + if (!child_nodes) + hpriv->nports = 1; + sz = hpriv->nports * sizeof(*hpriv->phys); + hpriv->phys = devm_kzalloc(dev, sz, GFP_KERNEL); + if (!hpriv->phys) { + rc = -ENOMEM; + goto err_out; + } + sz = hpriv->nports * sizeof(*hpriv->target_pwrs); + hpriv->target_pwrs = devm_kzalloc(dev, sz, GFP_KERNEL); + if (!hpriv->target_pwrs) { + rc = -ENOMEM; + goto err_out; + } + + if (child_nodes) { for_each_child_of_node(dev->of_node, child) { u32 port; + struct platform_device *port_dev; if (!of_device_is_available(child)) continue; @@ -316,18 +435,23 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev) dev_warn(dev, "invalid port number %d\n", port); continue; } - mask_port_map |= BIT(port); - hpriv->phys[port] = devm_of_phy_get(dev, child, NULL); - if (IS_ERR(hpriv->phys[port])) { - rc = PTR_ERR(hpriv->phys[port]); - dev_err(dev, - "couldn't get PHY in node %s: %d\n", - child->name, rc); - goto err_out; + of_platform_device_create(child, NULL, NULL); + + port_dev = of_find_device_by_node(child); + + if (port_dev) { + rc = ahci_platform_get_regulator(hpriv, port, + &port_dev->dev); + if (rc == -EPROBE_DEFER) + goto err_out; } + rc = ahci_platform_get_phy(hpriv, port, dev, child); + if (rc) + goto err_out; + enabled_ports++; } if (!enabled_ports) { @@ -343,38 +467,14 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev) * If no sub-node was found, keep this for device tree * compatibility */ - struct phy *phy = devm_phy_get(dev, "sata-phy"); - if (!IS_ERR(phy)) { - hpriv->phys = devm_kzalloc(dev, sizeof(*hpriv->phys), - GFP_KERNEL); - if (!hpriv->phys) { - rc = -ENOMEM; - goto err_out; - } - - hpriv->phys[0] = phy; - hpriv->nports = 1; - } else { - rc = PTR_ERR(phy); - switch (rc) { - case -ENOSYS: - /* No PHY support. Check if PHY is required. */ - if (of_find_property(dev->of_node, "phys", NULL)) { - dev_err(dev, "couldn't get sata-phy: ENOSYS\n"); - goto err_out; - } - case -ENODEV: - /* continue normally */ - hpriv->phys = NULL; - break; - - default: - goto err_out; + rc = ahci_platform_get_phy(hpriv, 0, dev, dev->of_node); + if (rc) + goto err_out; - } - } + rc = ahci_platform_get_regulator(hpriv, 0, dev); + if (rc == -EPROBE_DEFER) + goto err_out; } - pm_runtime_enable(dev); pm_runtime_get_sync(dev); hpriv->got_runtime_pm = true; diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index 642d6ae4030c..f65b33809170 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -24,6 +24,8 @@ struct platform_device; int ahci_platform_enable_clks(struct ahci_host_priv *hpriv); void ahci_platform_disable_clks(struct ahci_host_priv *hpriv); +int ahci_platform_enable_regulators(struct ahci_host_priv *hpriv); +void ahci_platform_disable_regulators(struct ahci_host_priv *hpriv); int ahci_platform_enable_resources(struct ahci_host_priv *hpriv); void ahci_platform_disable_resources(struct ahci_host_priv *hpriv); struct ahci_host_priv *ahci_platform_get_resources( -- cgit v1.2.3 From 54c523127bcca986c6f9b04c7b56a949ea011899 Mon Sep 17 00:00:00 2001 From: Matt Wagantall Date: Mon, 15 Dec 2014 23:47:23 +0000 Subject: iopoll: Introduce memory-mapped IO polling macros It is sometimes necessary to poll a memory-mapped register until its value satisfies some condition. Introduce a family of convenience macros that do this. Tight-looping, sleeping, and timing out can all be accomplished using these macros. 
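For example, a driver might wait for a ready bit with bounded sleeping; the register name and bit below are hypothetical, only the macro comes from this patch:

	u32 val;
	int ret;

	/* poll STATUS until READY, sleeping ~10 us between reads, 100 ms cap */
	ret = readl_poll_timeout(base + MY_STATUS_REG, val,
				 val & MY_STATUS_READY, 10, 100 * USEC_PER_MSEC);
	if (ret)
		dev_err(dev, "timed out waiting for ready, status 0x%x\n", val);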
Cc: Thierry Reding Cc: Andrew Morton Cc: Robert Elliott Acked-by: Arnd Bergmann Acked-by: Will Deacon Signed-off-by: Matt Wagantall Signed-off-by: Mitchel Humpherys Signed-off-by: Will Deacon --- include/linux/iopoll.h | 144 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 include/linux/iopoll.h (limited to 'include/linux') diff --git a/include/linux/iopoll.h b/include/linux/iopoll.h new file mode 100644 index 000000000000..1c30014ed176 --- /dev/null +++ b/include/linux/iopoll.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2012-2014 The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef _LINUX_IOPOLL_H +#define _LINUX_IOPOLL_H + +#include +#include +#include +#include +#include +#include + +/** + * readx_poll_timeout - Periodically poll an address until a condition is met or a timeout occurs + * @op: accessor function (takes @addr as its only argument) + * @addr: Address to poll + * @val: Variable to read the value into + * @cond: Break condition (usually involving @val) + * @sleep_us: Maximum time to sleep between reads in us (0 + * tight-loops). Should be less than ~20ms since usleep_range + * is used (see Documentation/timers/timers-howto.txt). + * @timeout_us: Timeout in us, 0 means never timeout + * + * Returns 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @addr is stored in @val. Must not + * be called from atomic context if sleep_us or timeout_us are used. + * + * When available, you'll probably want to use one of the specialized + * macros defined below rather than this macro directly. + */ +#define readx_poll_timeout(op, addr, val, cond, sleep_us, timeout_us) \ +({ \ + ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ + might_sleep_if(sleep_us); \ + for (;;) { \ + (val) = op(addr); \ + if (cond) \ + break; \ + if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + (val) = op(addr); \ + break; \ + } \ + if (sleep_us) \ + usleep_range((sleep_us >> 2) + 1, sleep_us); \ + } \ + (cond) ? 0 : -ETIMEDOUT; \ +}) + +/** + * readx_poll_timeout_atomic - Periodically poll an address until a condition is met or a timeout occurs + * @op: accessor function (takes @addr as its only argument) + * @addr: Address to poll + * @val: Variable to read the value into + * @cond: Break condition (usually involving @val) + * @delay_us: Time to udelay between reads in us (0 tight-loops). Should + * be less than ~10us since udelay is used (see + * Documentation/timers/timers-howto.txt). + * @timeout_us: Timeout in us, 0 means never timeout + * + * Returns 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @addr is stored in @val. + * + * When available, you'll probably want to use one of the specialized + * macros defined below rather than this macro directly. 
+ */ +#define readx_poll_timeout_atomic(op, addr, val, cond, delay_us, timeout_us) \ +({ \ + ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ + for (;;) { \ + (val) = op(addr); \ + if (cond) \ + break; \ + if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + (val) = op(addr); \ + break; \ + } \ + if (delay_us) \ + udelay(delay_us); \ + } \ + (cond) ? 0 : -ETIMEDOUT; \ +}) + + +#define readb_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readb, addr, val, cond, delay_us, timeout_us) + +#define readb_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readb, addr, val, cond, delay_us, timeout_us) + +#define readw_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readw, addr, val, cond, delay_us, timeout_us) + +#define readw_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readw, addr, val, cond, delay_us, timeout_us) + +#define readl_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readl, addr, val, cond, delay_us, timeout_us) + +#define readl_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readl, addr, val, cond, delay_us, timeout_us) + +#define readq_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readq, addr, val, cond, delay_us, timeout_us) + +#define readq_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readq, addr, val, cond, delay_us, timeout_us) + +#define readb_relaxed_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readb_relaxed, addr, val, cond, delay_us, timeout_us) + +#define readb_relaxed_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readb_relaxed, addr, val, cond, delay_us, timeout_us) + +#define readw_relaxed_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readw_relaxed, addr, val, cond, delay_us, timeout_us) + +#define readw_relaxed_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readw_relaxed, addr, val, cond, delay_us, timeout_us) + +#define readl_relaxed_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readl_relaxed, addr, val, cond, delay_us, timeout_us) + +#define readl_relaxed_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readl_relaxed, addr, val, cond, delay_us, timeout_us) + +#define readq_relaxed_poll_timeout(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout(readq_relaxed, addr, val, cond, delay_us, timeout_us) + +#define readq_relaxed_poll_timeout_atomic(addr, val, cond, delay_us, timeout_us) \ + readx_poll_timeout_atomic(readq_relaxed, addr, val, cond, delay_us, timeout_us) + +#endif /* _LINUX_IOPOLL_H */ -- cgit v1.2.3 From d453cded05ee219b77815ea194dc36efa5398bca Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 20 Jan 2015 09:07:04 +1030 Subject: module_arch_freeing_init(): new hook for archs before module->module_init freed. Archs have been abusing module_free() to clean up their arch-specific allocations. Since module_free() is also (ab)used by BPF and trace code, let's keep it to simple allocations, and provide a hook called before that. This means that avr32, ia64, parisc and s390 no longer need to implement their own module_free() at all. avr32 doesn't need module_finalize() either. 
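One property worth making explicit: as the kernel/module.c hunks below show, the hook runs on the init-success path (do_init_module) and again at unload (free_module), so an arch implementation should free and then clear its state so a second call is harmless. A sketch mirroring the s390/avr32 versions in the diff:

	void module_arch_freeing_init(struct module *mod)
	{
		vfree(mod->arch.syminfo);	/* vfree(NULL) is a no-op */
		mod->arch.syminfo = NULL;	/* keeps a repeat call harmless */
	}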
Signed-off-by: Rusty Russell Cc: Chris Metcalf Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux-kernel@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linux-s390@vger.kernel.org --- arch/avr32/kernel/module.c | 13 +------------ arch/ia64/kernel/module.c | 6 ++---- arch/parisc/kernel/module.c | 6 +----- arch/s390/kernel/module.c | 10 +++------- arch/tile/kernel/module.c | 2 +- include/linux/moduleloader.h | 2 ++ kernel/module.c | 7 +++++++ 7 files changed, 17 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/arch/avr32/kernel/module.c b/arch/avr32/kernel/module.c index 2c9412908024..164efa009e5b 100644 --- a/arch/avr32/kernel/module.c +++ b/arch/avr32/kernel/module.c @@ -19,12 +19,10 @@ #include #include -void module_free(struct module *mod, void *module_region) +void module_arch_freeing_init(struct module *mod) { vfree(mod->arch.syminfo); mod->arch.syminfo = NULL; - - vfree(module_region); } static inline int check_rela(Elf32_Rela *rela, struct module *module, @@ -291,12 +289,3 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab, return ret; } - -int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, - struct module *module) -{ - vfree(module->arch.syminfo); - module->arch.syminfo = NULL; - - return 0; -} diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c index 24603be24c14..29754aae5177 100644 --- a/arch/ia64/kernel/module.c +++ b/arch/ia64/kernel/module.c @@ -305,14 +305,12 @@ plt_target (struct plt_entry *plt) #endif /* !USE_BRL */ void -module_free (struct module *mod, void *module_region) +module_arch_freeing_init (struct module *mod) { - if (mod && mod->arch.init_unw_table && - module_region == mod->module_init) { + if (mod->arch.init_unw_table) { unw_remove_unwind_table(mod->arch.init_unw_table); mod->arch.init_unw_table = NULL; } - vfree(module_region); } /* Have we already seen one of these relocations? 
*/ diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index 50dfafc3f2c1..5822e8e200e6 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -298,14 +298,10 @@ static inline unsigned long count_stubs(const Elf_Rela *rela, unsigned long n) } #endif - -/* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_arch_freeing_init(struct module *mod) { kfree(mod->arch.section); mod->arch.section = NULL; - - vfree(module_region); } /* Additional bytes needed in front of individual sections */ diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index b89b59158b95..409d152585be 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -55,14 +55,10 @@ void *module_alloc(unsigned long size) } #endif -/* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_arch_freeing_init(struct module *mod) { - if (mod) { - vfree(mod->arch.syminfo); - mod->arch.syminfo = NULL; - } - vfree(module_region); + vfree(mod->arch.syminfo); + mod->arch.syminfo = NULL; } static void check_rela(Elf_Rela *rela, struct module *me) diff --git a/arch/tile/kernel/module.c b/arch/tile/kernel/module.c index 96447c9160a0..62a597e810d6 100644 --- a/arch/tile/kernel/module.c +++ b/arch/tile/kernel/module.c @@ -83,7 +83,7 @@ void module_free(struct module *mod, void *module_region) 0, 0, 0, NULL, NULL, 0); /* - * FIXME: If module_region == mod->module_init, trim exception + * FIXME: Add module_arch_freeing_init to trim exception * table entries. */ } diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 7eeb9bbfb816..054eac853090 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -82,4 +82,6 @@ int module_finalize(const Elf_Ehdr *hdr, /* Any cleanup needed when module leaves. */ void module_arch_cleanup(struct module *mod); +/* Any cleanup before freeing mod->module_init */ +void module_arch_freeing_init(struct module *mod); #endif diff --git a/kernel/module.c b/kernel/module.c index 3965511ae133..68be0b1f9e7f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1804,6 +1804,10 @@ void __weak module_arch_cleanup(struct module *mod) { } +void __weak module_arch_freeing_init(struct module *mod) +{ +} + /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -1841,6 +1845,7 @@ static void free_module(struct module *mod) /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); + module_arch_freeing_init(mod); module_free(mod, mod->module_init); kfree(mod->args); percpu_modfree(mod); @@ -2930,6 +2935,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); + module_arch_freeing_init(mod); module_free(mod, mod->module_init); module_free(mod, mod->module_core); } @@ -3055,6 +3061,7 @@ static int do_init_module(struct module *mod) mod->strtab = mod->core_strtab; #endif unset_module_init_ro_nx(mod); + module_arch_freeing_init(mod); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; -- cgit v1.2.3 From be1f221c0445a4157d177197c236f888d3581914 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 20 Jan 2015 09:07:05 +1030 Subject: module: remove mod arg from module_free, rename module_memfree(). 
Nothing needs the module pointer any more, and the next patch will call it from RCU, where the module itself might no longer exist. Removing the arg is the safest approach. This just codifies the use of the module_alloc/module_free pattern which ftrace and bpf use. Signed-off-by: Rusty Russell Acked-by: Alexei Starovoitov Cc: Mikael Starvik Cc: Jesper Nilsson Cc: Ralf Baechle Cc: Ley Foon Tan Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Steven Rostedt Cc: x86@kernel.org Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Masami Hiramatsu Cc: linux-cris-kernel@axis.com Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: nios2-dev@lists.rocketboards.org Cc: linuxppc-dev@lists.ozlabs.org Cc: sparclinux@vger.kernel.org Cc: netdev@vger.kernel.org --- arch/cris/kernel/module.c | 2 +- arch/mips/net/bpf_jit.c | 2 +- arch/nios2/kernel/module.c | 2 +- arch/powerpc/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp.c | 4 ++-- arch/tile/kernel/module.c | 2 +- arch/x86/kernel/ftrace.c | 2 +- include/linux/moduleloader.h | 2 +- kernel/bpf/core.c | 2 +- kernel/kprobes.c | 2 +- kernel/module.c | 14 +++++++------- 11 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/arch/cris/kernel/module.c b/arch/cris/kernel/module.c index 51123f985eb5..af04cb6b6dc9 100644 --- a/arch/cris/kernel/module.c +++ b/arch/cris/kernel/module.c @@ -36,7 +36,7 @@ void *module_alloc(unsigned long size) } /* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_memfree(void *module_region) { kfree(module_region); } diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 9fd6834a2172..5d6139390bf8 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1388,7 +1388,7 @@ out: void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) - module_free(NULL, fp->bpf_func); + module_memfree(fp->bpf_func); bpf_prog_unlock_free(fp); } diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c index cc924a38f22a..e2e3f13f98d5 100644 --- a/arch/nios2/kernel/module.c +++ b/arch/nios2/kernel/module.c @@ -36,7 +36,7 @@ void *module_alloc(unsigned long size) } /* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_memfree(void *module_region) { kfree(module_region); } diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 1ca125b9c226..d1916b577f2c 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -699,7 +699,7 @@ out: void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) - module_free(NULL, fp->bpf_func); + module_memfree(fp->bpf_func); bpf_prog_unlock_free(fp); } diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index f33e7c7a3bf7..7931eeeb649a 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -776,7 +776,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; if (unlikely(proglen + ilen > oldproglen)) { pr_err("bpb_jit_compile fatal error\n"); kfree(addrs); - module_free(NULL, image); + module_memfree(image); return; } memcpy(image + proglen, temp, ilen); @@ -822,7 +822,7 @@ out: void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) - module_free(NULL, fp->bpf_func); + module_memfree(fp->bpf_func); bpf_prog_unlock_free(fp); } diff --git a/arch/tile/kernel/module.c b/arch/tile/kernel/module.c index 62a597e810d6..2305084c9b93 100644 --- a/arch/tile/kernel/module.c +++ b/arch/tile/kernel/module.c @@ 
-74,7 +74,7 @@ error: /* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_memfree(void *module_region) { vfree(module_region); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 2142376dc8c6..8b7b0a51e742 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -674,7 +674,7 @@ static inline void *alloc_tramp(unsigned long size) } static inline void tramp_free(void *tramp) { - module_free(NULL, tramp); + module_memfree(tramp); } #else /* Trampolines can only be created if modules are supported */ diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 054eac853090..f7556261fe3c 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -26,7 +26,7 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section); void *module_alloc(unsigned long size); /* Free memory returned from module_alloc. */ -void module_free(struct module *mod, void *module_region); +void module_memfree(void *module_region); /* * Apply the given relocation to the (simplified) ELF. Return -error diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d6594e457a25..a64e7a207d2b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -163,7 +163,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { - module_free(NULL, hdr); + module_memfree(hdr); } #endif /* CONFIG_BPF_JIT */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 06f58309fed2..ee619929cf90 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -127,7 +127,7 @@ static void *alloc_insn_page(void) static void free_insn_page(void *page) { - module_free(NULL, page); + module_memfree(page); } struct kprobe_insn_cache kprobe_insn_slots = { diff --git a/kernel/module.c b/kernel/module.c index 68be0b1f9e7f..1f85fd5c89d3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1795,7 +1795,7 @@ static void unset_module_core_ro_nx(struct module *mod) { } static void unset_module_init_ro_nx(struct module *mod) { } #endif -void __weak module_free(struct module *mod, void *module_region) +void __weak module_memfree(void *module_region) { vfree(module_region); } @@ -1846,7 +1846,7 @@ static void free_module(struct module *mod) /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); module_arch_freeing_init(mod); - module_free(mod, mod->module_init); + module_memfree(mod->module_init); kfree(mod->args); percpu_modfree(mod); @@ -1855,7 +1855,7 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ unset_module_core_ro_nx(mod); - module_free(mod, mod->module_core); + module_memfree(mod->module_core); #ifdef CONFIG_MPU update_protections(current->mm); @@ -2790,7 +2790,7 @@ static int move_module(struct module *mod, struct load_info *info) */ kmemleak_ignore(ptr); if (!ptr) { - module_free(mod, mod->module_core); + module_memfree(mod->module_core); return -ENOMEM; } memset(ptr, 0, mod->init_size); @@ -2936,8 +2936,8 @@ static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); module_arch_freeing_init(mod); - module_free(mod, mod->module_init); - module_free(mod, mod->module_core); + module_memfree(mod->module_init); + module_memfree(mod->module_core); } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -3062,7 +3062,7 @@ static int do_init_module(struct module *mod) #endif unset_module_init_ro_nx(mod); module_arch_freeing_init(mod); - 
module_free(mod, mod->module_init); + module_memfree(mod->module_init); mod->module_init = NULL; mod->init_size = 0; mod->init_ro_size = 0; -- cgit v1.2.3 From 67d0d04a762db4bd610fd628ad683b5d7dc905e7 Mon Sep 17 00:00:00 2001 From: Vincent Yang Date: Tue, 20 Jan 2015 16:05:16 +0800 Subject: mmc: sdhci: add a quirk for tuning work around This patch defines a quirk for tuning work around for some sdhci host controller. It sets both SDHCI_CTRL_EXEC_TUNING and SDHCI_CTRL_TUNED_CLK for tuning. It is a preparation and will be used by Fujitsu SDHCI controller f_sdh30 driver. Signed-off-by: Vincent Yang Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 2 ++ include/linux/mmc/sdhci.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 3e4fe967d7e5..a3ecd20e5510 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1961,6 +1961,8 @@ static int sdhci_execute_tuning(struct mmc_host *mmc, u32 opcode) ctrl = sdhci_readw(host, SDHCI_HOST_CONTROL2); ctrl |= SDHCI_CTRL_EXEC_TUNING; + if (host->quirks2 & SDHCI_QUIRK2_TUNING_WORK_AROUND) + ctrl |= SDHCI_CTRL_TUNED_CLK; sdhci_writew(host, ctrl, SDHCI_HOST_CONTROL2); /* diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index cb8b94ff6a26..933b897ca095 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -111,6 +111,8 @@ struct sdhci_host { #define SDHCI_QUIRK2_CLEAR_TRANSFERMODE_REG_BEFORE_CMD (1<<10) /* Capability register bit-63 indicates HS400 support */ #define SDHCI_QUIRK2_CAPS_BIT63_FOR_HS400 (1<<11) +/* forced tuned clock */ +#define SDHCI_QUIRK2_TUNING_WORK_AROUND (1<<12) int irq; /* Device IRQ */ void __iomem *ioaddr; /* Mapped address */ -- cgit v1.2.3 From d3fc5d71ac4dfd28a66689cfd1eea84c4dba8bde Mon Sep 17 00:00:00 2001 From: Vincent Yang Date: Tue, 20 Jan 2015 16:05:17 +0800 Subject: mmc: sdhci: add a quirk for single block transactions This patch defines a quirk to disable the block count for single block transactions. It is a preparation and will be used by Fujitsu SDHCI controller f_sdh30 driver. Signed-off-by: Vincent Yang Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci.c | 8 +++++--- include/linux/mmc/sdhci.h | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index a3ecd20e5510..c9881ca131d5 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -904,7 +904,7 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_command *cmd) static void sdhci_set_transfer_mode(struct sdhci_host *host, struct mmc_command *cmd) { - u16 mode; + u16 mode = 0; struct mmc_data *data = cmd->data; if (data == NULL) { @@ -922,9 +922,11 @@ static void sdhci_set_transfer_mode(struct sdhci_host *host, WARN_ON(!host->data); - mode = SDHCI_TRNS_BLK_CNT_EN; + if (!(host->quirks2 & SDHCI_QUIRK2_SUPPORT_SINGLE)) + mode = SDHCI_TRNS_BLK_CNT_EN; + if (mmc_op_multi(cmd->opcode) || data->blocks > 1) { - mode |= SDHCI_TRNS_MULTI; + mode = SDHCI_TRNS_BLK_CNT_EN | SDHCI_TRNS_MULTI; /* * If we are sending CMD23, CMD12 never gets sent * on successful completion (so no Auto-CMD12). 
diff --git a/include/linux/mmc/sdhci.h b/include/linux/mmc/sdhci.h index 933b897ca095..c3e3db196738 100644 --- a/include/linux/mmc/sdhci.h +++ b/include/linux/mmc/sdhci.h @@ -113,6 +113,8 @@ struct sdhci_host { #define SDHCI_QUIRK2_CAPS_BIT63_FOR_HS400 (1<<11) /* forced tuned clock */ #define SDHCI_QUIRK2_TUNING_WORK_AROUND (1<<12) +/* disable the block count for single block transactions */ +#define SDHCI_QUIRK2_SUPPORT_SINGLE (1<<13) int irq; /* Device IRQ */ void __iomem *ioaddr; /* Mapped address */ -- cgit v1.2.3 From 94b110aff8679b14f46fd6653ea87b42fe1555be Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:57:22 +0000 Subject: mmc: tmio: add tmio_mmc_host_alloc/free() Current tmio_mmc driver is using tmio_mmc_data for driver/platform specific data/callback, and it is needed for tmio_mmc_host_probe() function. Because of this style, include/linux/mfd/tmio.h header has tmio driver/framework specific data which is not needed from platform. This patch adds new tmio_mmc_host_alloc/free() as cleanup preparation. tmio driver specific data/callback will be implemented in tmio_mmc_host, and platform specific data/callback will be implemented in tmio_mmc_data in this cleanup. Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 14 +++++++++--- drivers/mmc/host/tmio_mmc.c | 10 +++++++-- drivers/mmc/host/tmio_mmc.h | 5 +++-- drivers/mmc/host/tmio_mmc_pio.c | 45 +++++++++++++++++++++++++-------------- include/linux/mfd/tmio.h | 1 - 5 files changed, 51 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 00c8ebdf8ec7..cf062c4d87a9 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -113,7 +113,7 @@ static int sh_mobile_sdhi_wait_idle(struct tmio_mmc_host *host) udelay(1); if (!timeout) { - dev_warn(host->pdata->dev, "timeout waiting for SD bus idle\n"); + dev_warn(&host->pdev->dev, "timeout waiting for SD bus idle\n"); return -EBUSY; } @@ -207,6 +207,12 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) goto eclkget; } + host = tmio_mmc_host_alloc(pdev); + if (!host) { + ret = -ENOMEM; + goto eprobe; + } + mmc_data->clk_enable = sh_mobile_sdhi_clk_enable; mmc_data->clk_disable = sh_mobile_sdhi_clk_disable; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; @@ -274,9 +280,9 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) /* SD control register space size is 0x100, 0x200 for bus_shift=1 */ mmc_data->bus_shift = resource_size(res) >> 9; - ret = tmio_mmc_host_probe(&host, pdev, mmc_data); + ret = tmio_mmc_host_probe(host, mmc_data); if (ret < 0) - goto eprobe; + goto efree; /* * FIXME: @@ -351,6 +357,8 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) eirq: tmio_mmc_host_remove(host); +efree: + tmio_mmc_host_free(host); eprobe: eclkget: if (p && p->cleanup) diff --git a/drivers/mmc/host/tmio_mmc.c b/drivers/mmc/host/tmio_mmc.c index 2ca0afaab792..a7c2e459187c 100644 --- a/drivers/mmc/host/tmio_mmc.c +++ b/drivers/mmc/host/tmio_mmc.c @@ -92,10 +92,14 @@ static int tmio_mmc_probe(struct platform_device *pdev) pdata->bus_shift = resource_size(res) >> 10; pdata->flags |= TMIO_MMC_HAVE_HIGH_REG; - ret = tmio_mmc_host_probe(&host, pdev, pdata); - if (ret) + host = tmio_mmc_host_alloc(pdev); + if (!host) goto cell_disable; + ret = tmio_mmc_host_probe(host, pdata); + if (ret) + goto host_free; + ret = request_irq(irq, tmio_mmc_irq, 
IRQF_TRIGGER_FALLING, dev_name(&pdev->dev), host); if (ret) @@ -108,6 +112,8 @@ static int tmio_mmc_probe(struct platform_device *pdev) host_remove: tmio_mmc_host_remove(host); +host_free: + tmio_mmc_host_free(host); cell_disable: if (cell->disable) cell->disable(pdev); diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index a34ecbe1c1ad..60d6747f0df5 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -85,8 +85,9 @@ struct tmio_mmc_host { bool sdio_irq_enabled; }; -int tmio_mmc_host_probe(struct tmio_mmc_host **host, - struct platform_device *pdev, +struct tmio_mmc_host *tmio_mmc_host_alloc(struct platform_device *pdev); +void tmio_mmc_host_free(struct tmio_mmc_host *host); +int tmio_mmc_host_probe(struct tmio_mmc_host *host, struct tmio_mmc_data *pdata); void tmio_mmc_host_remove(struct tmio_mmc_host *host); void tmio_mmc_do_data_irq(struct tmio_mmc_host *host); diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index 250bf8c9f998..396be0e95a98 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -1054,12 +1054,37 @@ static void tmio_mmc_of_parse(struct platform_device *pdev, pdata->flags |= TMIO_MMC_WRPROTECT_DISABLE; } -int tmio_mmc_host_probe(struct tmio_mmc_host **host, - struct platform_device *pdev, - struct tmio_mmc_data *pdata) +struct tmio_mmc_host* +tmio_mmc_host_alloc(struct platform_device *pdev) { - struct tmio_mmc_host *_host; + struct tmio_mmc_host *host; struct mmc_host *mmc; + + mmc = mmc_alloc_host(sizeof(struct tmio_mmc_host), &pdev->dev); + if (!mmc) + return NULL; + + host = mmc_priv(mmc); + host->mmc = mmc; + host->pdev = pdev; + + return host; +} +EXPORT_SYMBOL(tmio_mmc_host_alloc); + +void tmio_mmc_host_free(struct tmio_mmc_host *host) +{ + mmc_free_host(host->mmc); + + host->mmc = NULL; +} +EXPORT_SYMBOL(tmio_mmc_host_free); + +int tmio_mmc_host_probe(struct tmio_mmc_host *_host, + struct tmio_mmc_data *pdata) +{ + struct platform_device *pdev = _host->pdev; + struct mmc_host *mmc = _host->mmc; struct resource *res_ctl; int ret; u32 irq_mask = TMIO_MASK_CMD; @@ -1073,19 +1098,11 @@ int tmio_mmc_host_probe(struct tmio_mmc_host **host, if (!res_ctl) return -EINVAL; - mmc = mmc_alloc_host(sizeof(struct tmio_mmc_host), &pdev->dev); - if (!mmc) - return -ENOMEM; - ret = mmc_of_parse(mmc); if (ret < 0) goto host_free; - pdata->dev = &pdev->dev; - _host = mmc_priv(mmc); _host->pdata = pdata; - _host->mmc = mmc; - _host->pdev = pdev; platform_set_drvdata(pdev, mmc); _host->set_pwr = pdata->set_pwr; @@ -1192,12 +1209,9 @@ int tmio_mmc_host_probe(struct tmio_mmc_host **host, mmc_gpiod_request_cd_irq(mmc); } - *host = _host; - return 0; host_free: - mmc_free_host(mmc); return ret; } @@ -1222,7 +1236,6 @@ void tmio_mmc_host_remove(struct tmio_mmc_host *host) pm_runtime_disable(&pdev->dev); iounmap(host->ctl); - mmc_free_host(mmc); } EXPORT_SYMBOL(tmio_mmc_host_remove); diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 57388171610d..c7d9af042d09 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -135,7 +135,6 @@ struct tmio_mmc_data { unsigned long bus_shift; u32 ocr_mask; /* available voltages */ struct tmio_mmc_dma *dma; - struct device *dev; unsigned int cd_gpio; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); -- cgit v1.2.3 From 7ecc09bab1e856e6730a4dd8a3bc1c28bb6ab3be Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:57:33 +0000 
Subject: mmc: tmio: tmio_mmc_host has .dma Current .dma is implemented under tmio_mmc_data. It goes to tmio_mmc_host by this patch. Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 4 ++-- drivers/mmc/host/tmio_mmc.h | 11 +++++++++++ drivers/mmc/host/tmio_mmc_dma.c | 24 +++++++++++------------- include/linux/mfd/tmio.h | 11 ----------- 4 files changed, 24 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index cf062c4d87a9..288e78d2c7a1 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -213,6 +213,8 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) goto eprobe; } + host->dma = dma_priv; + mmc_data->clk_enable = sh_mobile_sdhi_clk_enable; mmc_data->clk_disable = sh_mobile_sdhi_clk_disable; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; @@ -241,8 +243,6 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) dma_priv->alignment_shift = 1; /* 2-byte alignment */ dma_priv->filter = shdma_chan_filter; - mmc_data->dma = dma_priv; - /* * All SDHI blocks support 2-byte and larger block sizes in 4-bit * bus width mode. diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 60d6747f0df5..49a2559ff489 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -40,6 +40,16 @@ struct tmio_mmc_data; +struct tmio_mmc_dma { + void *chan_priv_tx; + void *chan_priv_rx; + int slave_id_tx; + int slave_id_rx; + int alignment_shift; + dma_addr_t dma_rx_offset; + bool (*filter)(struct dma_chan *chan, void *arg); +}; + struct tmio_mmc_host { void __iomem *ctl; struct mmc_command *cmd; @@ -59,6 +69,7 @@ struct tmio_mmc_host { struct platform_device *pdev; struct tmio_mmc_data *pdata; + struct tmio_mmc_dma *dma; /* DMA support */ bool force_pio; diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index 7d077388b9eb..6c214d60bbb6 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -49,11 +49,10 @@ static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host) struct scatterlist *sg = host->sg_ptr, *sg_tmp; struct dma_async_tx_descriptor *desc = NULL; struct dma_chan *chan = host->chan_rx; - struct tmio_mmc_data *pdata = host->pdata; dma_cookie_t cookie; int ret, i; bool aligned = true, multiple = true; - unsigned int align = (1 << pdata->dma->alignment_shift) - 1; + unsigned int align = (1 << host->dma->alignment_shift) - 1; for_each_sg(sg, sg_tmp, host->sg_len, i) { if (sg_tmp->offset & align) @@ -126,11 +125,10 @@ static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) struct scatterlist *sg = host->sg_ptr, *sg_tmp; struct dma_async_tx_descriptor *desc = NULL; struct dma_chan *chan = host->chan_tx; - struct tmio_mmc_data *pdata = host->pdata; dma_cookie_t cookie; int ret, i; bool aligned = true, multiple = true; - unsigned int align = (1 << pdata->dma->alignment_shift) - 1; + unsigned int align = (1 << host->dma->alignment_shift) - 1; for_each_sg(sg, sg_tmp, host->sg_len, i) { if (sg_tmp->offset & align) @@ -262,8 +260,8 @@ out: void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdata) { /* We can only either use DMA for both Tx and Rx or not use it at all */ - if (!pdata->dma || (!host->pdev->dev.of_node && - (!pdata->dma->chan_priv_tx || !pdata->dma->chan_priv_rx))) + if (!host->dma || (!host->pdev->dev.of_node && + (!host->dma->chan_priv_tx || 
!host->dma->chan_priv_rx))) return; if (!host->chan_tx && !host->chan_rx) { @@ -280,7 +278,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat dma_cap_set(DMA_SLAVE, mask); host->chan_tx = dma_request_slave_channel_compat(mask, - pdata->dma->filter, pdata->dma->chan_priv_tx, + host->dma->filter, host->dma->chan_priv_tx, &host->pdev->dev, "tx"); dev_dbg(&host->pdev->dev, "%s: TX: got channel %p\n", __func__, host->chan_tx); @@ -288,8 +286,8 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat if (!host->chan_tx) return; - if (pdata->dma->chan_priv_tx) - cfg.slave_id = pdata->dma->slave_id_tx; + if (host->dma->chan_priv_tx) + cfg.slave_id = host->dma->slave_id_tx; cfg.direction = DMA_MEM_TO_DEV; cfg.dst_addr = res->start + (CTL_SD_DATA_PORT << host->pdata->bus_shift); cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES; @@ -299,7 +297,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat goto ecfgtx; host->chan_rx = dma_request_slave_channel_compat(mask, - pdata->dma->filter, pdata->dma->chan_priv_rx, + host->dma->filter, host->dma->chan_priv_rx, &host->pdev->dev, "rx"); dev_dbg(&host->pdev->dev, "%s: RX: got channel %p\n", __func__, host->chan_rx); @@ -307,10 +305,10 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat if (!host->chan_rx) goto ereqrx; - if (pdata->dma->chan_priv_rx) - cfg.slave_id = pdata->dma->slave_id_rx; + if (host->dma->chan_priv_rx) + cfg.slave_id = host->dma->slave_id_rx; cfg.direction = DMA_DEV_TO_MEM; - cfg.src_addr = cfg.dst_addr + pdata->dma->dma_rx_offset; + cfg.src_addr = cfg.dst_addr + host->dma->dma_rx_offset; cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES; cfg.dst_addr = 0; ret = dmaengine_slave_config(host->chan_rx, &cfg); diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index c7d9af042d09..8d708c7cf681 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -112,16 +112,6 @@ void tmio_core_mmc_clk_div(void __iomem *cnf, int shift, int state); struct dma_chan; -struct tmio_mmc_dma { - void *chan_priv_tx; - void *chan_priv_rx; - int slave_id_tx; - int slave_id_rx; - int alignment_shift; - dma_addr_t dma_rx_offset; - bool (*filter)(struct dma_chan *chan, void *arg); -}; - struct tmio_mmc_host; /* @@ -134,7 +124,6 @@ struct tmio_mmc_data { unsigned long flags; unsigned long bus_shift; u32 ocr_mask; /* available voltages */ - struct tmio_mmc_dma *dma; unsigned int cd_gpio; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); -- cgit v1.2.3 From dfe9a229e0a66b6a00439cea2885ad3b5d3e0840 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:57:42 +0000 Subject: mmc: tmio: tmio_mmc_host has .write16_hook Current .write16_hook is implemented under tmio_mmc_data. It goes to tmio_mmc_host by this patch. 
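To make the relocated hook concrete, here is a minimal sketch of how a glue driver wires it up after this patch; it is not part of the patch, and the my_sdhi_* names are invented. Only tmio_mmc_host_alloc(), tmio_mmc_host_probe() and the hook signature come from the series:

#include <linux/platform_device.h>
#include "tmio_mmc.h"

/* Return non-zero to make sd_ctrl_write16() skip the register write,
 * e.g. while the bus is not yet idle on TMIO_MMC_HAS_IDLE_WAIT parts. */
static int my_sdhi_write16_hook(struct tmio_mmc_host *host, int addr)
{
	return 0;	/* 0 means: go ahead with the write */
}

static int my_sdhi_probe(struct platform_device *pdev)
{
	struct tmio_mmc_host *host = tmio_mmc_host_alloc(pdev);

	if (!host)
		return -ENOMEM;

	host->write16_hook = my_sdhi_write16_hook;	/* was pdata->write16_hook */

	return tmio_mmc_host_probe(host, dev_get_platdata(&pdev->dev));
}

Note that, per the tmio_mmc_pio.c hunk above, the core still NULLs the hook when the platform lacks TMIO_MMC_HAS_IDLE_WAIT, just as it previously did on pdata.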
Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 2 +- drivers/mmc/host/tmio_mmc.h | 4 +++- drivers/mmc/host/tmio_mmc_pio.c | 2 +- include/linux/mfd/tmio.h | 3 --- 4 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 288e78d2c7a1..77ff0620912e 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -214,11 +214,11 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) } host->dma = dma_priv; + host->write16_hook = sh_mobile_sdhi_write16_hook; mmc_data->clk_enable = sh_mobile_sdhi_clk_enable; mmc_data->clk_disable = sh_mobile_sdhi_clk_disable; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; - mmc_data->write16_hook = sh_mobile_sdhi_write16_hook; mmc_data->multi_io_quirk = sh_mobile_sdhi_multi_io_quirk; if (p) { mmc_data->flags = p->tmio_flags; diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 49a2559ff489..3fa96b11f76c 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -94,6 +94,8 @@ struct tmio_mmc_host { struct mutex ios_lock; /* protect set_ios() context */ bool native_hotplug; bool sdio_irq_enabled; + + int (*write16_hook)(struct tmio_mmc_host *host, int addr); }; struct tmio_mmc_host *tmio_mmc_host_alloc(struct platform_device *pdev); @@ -183,7 +185,7 @@ static inline void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val /* If there is a hook and it returns non-zero then there * is an error and the write should be skipped */ - if (host->pdata->write16_hook && host->pdata->write16_hook(host, addr)) + if (host->write16_hook && host->write16_hook(host, addr)) return; writew(val, host->ctl + (addr << host->pdata->bus_shift)); } diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index 396be0e95a98..58f4e47aac8c 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -1092,7 +1092,7 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host, tmio_mmc_of_parse(pdev, pdata); if (!(pdata->flags & TMIO_MMC_HAS_IDLE_WAIT)) - pdata->write16_hook = NULL; + _host->write16_hook = NULL; res_ctl = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res_ctl) diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 8d708c7cf681..0872ca1e5dc2 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -112,8 +112,6 @@ void tmio_core_mmc_clk_div(void __iomem *cnf, int shift, int state); struct dma_chan; -struct tmio_mmc_host; - /* * data for the MMC controller */ @@ -127,7 +125,6 @@ struct tmio_mmc_data { unsigned int cd_gpio; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); - int (*write16_hook)(struct tmio_mmc_host *host, int addr); /* clock management callbacks */ int (*clk_enable)(struct platform_device *pdev, unsigned int *f); void (*clk_disable)(struct platform_device *pdev); -- cgit v1.2.3 From 4fe2ec57a15f98c232536cf04e7c139d830955d4 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:57:52 +0000 Subject: mmc: tmio: tmio_mmc_host has .clk_enable Current .clk_enable is implemented under tmio_mmc_data. It goes to tmio_mmc_host by this patch. 
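A sketch of the callback in its new home, loosely modelled on sh_mobile_sdhi_clk_enable(); the my_sdhi container type is invented for illustration. As the tmio_mmc_clk_update() hunk below shows, the core passes &mmc->f_max and, on success, derives mmc->f_min = mmc->f_max / 512:

#include <linux/clk.h>
#include <linux/mmc/host.h>
#include <linux/platform_device.h>
#include "tmio_mmc.h"

struct my_sdhi {
	struct clk *clk;		/* from devm_clk_get() at probe time */
	struct tmio_mmc_data mmc_data;
};

static int my_sdhi_clk_enable(struct platform_device *pdev, unsigned int *f)
{
	struct mmc_host *mmc = platform_get_drvdata(pdev);
	struct tmio_mmc_host *host = mmc_priv(mmc);
	struct my_sdhi *priv = container_of(host->pdata, struct my_sdhi,
					    mmc_data);
	int ret = clk_prepare_enable(priv->clk);

	if (ret < 0)
		return ret;

	*f = clk_get_rate(priv->clk);	/* becomes mmc->f_max */
	return 0;
}

/* wired in probe, after tmio_mmc_host_alloc():
 *	host->clk_enable = my_sdhi_clk_enable;
 */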
Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 2 +- drivers/mmc/host/tmio_mmc.h | 1 + drivers/mmc/host/tmio_mmc_pio.c | 5 ++--- include/linux/mfd/tmio.h | 1 - 4 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 77ff0620912e..87af77dd660b 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -215,8 +215,8 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) host->dma = dma_priv; host->write16_hook = sh_mobile_sdhi_write16_hook; + host->clk_enable = sh_mobile_sdhi_clk_enable; - mmc_data->clk_enable = sh_mobile_sdhi_clk_enable; mmc_data->clk_disable = sh_mobile_sdhi_clk_disable; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; mmc_data->multi_io_quirk = sh_mobile_sdhi_multi_io_quirk; diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 3fa96b11f76c..5cd298665b94 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -96,6 +96,7 @@ struct tmio_mmc_host { bool sdio_irq_enabled; int (*write16_hook)(struct tmio_mmc_host *host, int addr); + int (*clk_enable)(struct platform_device *pdev, unsigned int *f); }; struct tmio_mmc_host *tmio_mmc_host_alloc(struct platform_device *pdev); diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index 58f4e47aac8c..38bcf0ca31c5 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -835,13 +835,12 @@ fail: static int tmio_mmc_clk_update(struct tmio_mmc_host *host) { struct mmc_host *mmc = host->mmc; - struct tmio_mmc_data *pdata = host->pdata; int ret; - if (!pdata->clk_enable) + if (!host->clk_enable) return -ENOTSUPP; - ret = pdata->clk_enable(host->pdev, &mmc->f_max); + ret = host->clk_enable(host->pdev, &mmc->f_max); if (!ret) mmc->f_min = mmc->f_max / 512; diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 0872ca1e5dc2..472587a02a42 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -126,7 +126,6 @@ struct tmio_mmc_data { void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); /* clock management callbacks */ - int (*clk_enable)(struct platform_device *pdev, unsigned int *f); void (*clk_disable)(struct platform_device *pdev); int (*multi_io_quirk)(struct mmc_card *card, unsigned int direction, int blk_size); -- cgit v1.2.3 From 00452c11ea0e4e5822edf0ac46853933860d0f53 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:58:01 +0000 Subject: mmc: tmio: tmio_mmc_host has .clk_disable Current .clk_disable is implemented under tmio_mmc_data. It goes to tmio_mmc_host by this patch. 
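The matching teardown half, sketched with the same invented my_sdhi container as in the previous example. After this patch the runtime-PM path calls host->clk_disable(host->pdev) from tmio_mmc_host_runtime_suspend(), as the hunk below shows:

static void my_sdhi_clk_disable(struct platform_device *pdev)
{
	struct mmc_host *mmc = platform_get_drvdata(pdev);
	struct tmio_mmc_host *host = mmc_priv(mmc);
	struct my_sdhi *priv = container_of(host->pdata, struct my_sdhi,
					    mmc_data);

	clk_disable_unprepare(priv->clk);
}

/* paired in probe:
 *	host->clk_disable = my_sdhi_clk_disable;
 */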
Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 2 +- drivers/mmc/host/tmio_mmc.h | 1 + drivers/mmc/host/tmio_mmc_pio.c | 4 ++-- include/linux/mfd/tmio.h | 3 +-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 87af77dd660b..5e02a6a302e7 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -216,8 +216,8 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) host->dma = dma_priv; host->write16_hook = sh_mobile_sdhi_write16_hook; host->clk_enable = sh_mobile_sdhi_clk_enable; + host->clk_disable = sh_mobile_sdhi_clk_disable; - mmc_data->clk_disable = sh_mobile_sdhi_clk_disable; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; mmc_data->multi_io_quirk = sh_mobile_sdhi_multi_io_quirk; if (p) { diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 5cd298665b94..28234ec5caa3 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -97,6 +97,7 @@ struct tmio_mmc_host { int (*write16_hook)(struct tmio_mmc_host *host, int addr); int (*clk_enable)(struct platform_device *pdev, unsigned int *f); + void (*clk_disable)(struct platform_device *pdev); }; struct tmio_mmc_host *tmio_mmc_host_alloc(struct platform_device *pdev); diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index 38bcf0ca31c5..e33eb8ee5eb5 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -1249,8 +1249,8 @@ int tmio_mmc_host_runtime_suspend(struct device *dev) if (host->clk_cache) tmio_mmc_clk_stop(host); - if (host->pdata->clk_disable) - host->pdata->clk_disable(host->pdev); + if (host->clk_disable) + host->clk_disable(host->pdev); return 0; } diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 472587a02a42..a3f78da70af7 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -125,8 +125,7 @@ struct tmio_mmc_data { unsigned int cd_gpio; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); - /* clock management callbacks */ - void (*clk_disable)(struct platform_device *pdev); + int (*multi_io_quirk)(struct mmc_card *card, unsigned int direction, int blk_size); }; -- cgit v1.2.3 From 85c02ddd591e5252eb1cbe8743a839638d7415fd Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:58:10 +0000 Subject: mmc: tmio: tmio_mmc_host has .multi_io_quirk Current .multi_io_quirk is implemented under tmio_mmc_data. It goes to tmio_mmc_host by this patch. 
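For illustration, a hypothetical implementation of the relocated hook; the shape is loosely modelled on sh_mobile_sdhi_multi_io_quirk(). The hook returns an adjusted blk_size, and the core's tmio_multi_io_quirk() below falls back to returning blk_size unchanged when no hook is set:

static int my_sdhi_multi_io_quirk(struct mmc_card *card,
				  unsigned int direction, int blk_size)
{
	/*
	 * Invented erratum for this sketch: short multi-block reads
	 * confuse the controller, so fall back to single-block I/O.
	 */
	if (direction == MMC_DATA_READ && blk_size == 2)
		return 1;

	return blk_size;
}

/* in probe:
 *	host->multi_io_quirk = my_sdhi_multi_io_quirk;
 */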
Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 2 +- drivers/mmc/host/tmio_mmc.h | 2 ++ drivers/mmc/host/tmio_mmc_pio.c | 5 ++--- include/linux/mfd/tmio.h | 3 --- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 5e02a6a302e7..c92efe59284d 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -217,9 +217,9 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) host->write16_hook = sh_mobile_sdhi_write16_hook; host->clk_enable = sh_mobile_sdhi_clk_enable; host->clk_disable = sh_mobile_sdhi_clk_disable; + host->multi_io_quirk = sh_mobile_sdhi_multi_io_quirk; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; - mmc_data->multi_io_quirk = sh_mobile_sdhi_multi_io_quirk; if (p) { mmc_data->flags = p->tmio_flags; mmc_data->ocr_mask = p->tmio_ocr_mask; diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 28234ec5caa3..263256c897ed 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -98,6 +98,8 @@ struct tmio_mmc_host { int (*write16_hook)(struct tmio_mmc_host *host, int addr); int (*clk_enable)(struct platform_device *pdev, unsigned int *f); void (*clk_disable)(struct platform_device *pdev); + int (*multi_io_quirk)(struct mmc_card *card, + unsigned int direction, int blk_size); }; struct tmio_mmc_host *tmio_mmc_host_alloc(struct platform_device *pdev); diff --git a/drivers/mmc/host/tmio_mmc_pio.c b/drivers/mmc/host/tmio_mmc_pio.c index e33eb8ee5eb5..a31c3573d386 100644 --- a/drivers/mmc/host/tmio_mmc_pio.c +++ b/drivers/mmc/host/tmio_mmc_pio.c @@ -1004,10 +1004,9 @@ static int tmio_multi_io_quirk(struct mmc_card *card, unsigned int direction, int blk_size) { struct tmio_mmc_host *host = mmc_priv(card->host); - struct tmio_mmc_data *pdata = host->pdata; - if (pdata->multi_io_quirk) - return pdata->multi_io_quirk(card, direction, blk_size); + if (host->multi_io_quirk) + return host->multi_io_quirk(card, direction, blk_size); return blk_size; } diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index a3f78da70af7..3edaa17d1ccd 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -125,9 +125,6 @@ struct tmio_mmc_data { unsigned int cd_gpio; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); - - int (*multi_io_quirk)(struct mmc_card *card, - unsigned int direction, int blk_size); }; /* -- cgit v1.2.3 From 7445bf9e6f4e5d7755e22c7c9b06f4ae0d6160c6 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:58:20 +0000 Subject: mmc: tmio: tmio_mmc_host has .bus_shift Current .bus_shift is implemented under tmio_mmc_data. It goes to tmio_mmc_host by this patch. 
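What bus_shift encodes, spelled out: nominal 16-bit register offsets are scaled by it to the controller's actual register spacing. With the probe-side derivation shown below, a 0x100-byte SD control window (resource_size(res) >> 9 == 0) keeps registers 2 bytes apart, while a 0x200-byte window (>> 9 == 1) doubles the stride. A hypothetical helper, equivalent to what the accessors now open-code:

static inline void __iomem *my_tmio_reg(struct tmio_mmc_host *host, int addr)
{
	/* e.g. addr 0x10 maps to ctl + 0x10 with bus_shift == 0,
	 * and to ctl + 0x20 with bus_shift == 1 */
	return host->ctl + (addr << host->bus_shift);
}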
Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 5 ++--- drivers/mmc/host/tmio_mmc.c | 5 +++-- drivers/mmc/host/tmio_mmc.h | 17 +++++++++-------- drivers/mmc/host/tmio_mmc_dma.c | 2 +- include/linux/mfd/tmio.h | 1 - 5 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index c92efe59284d..0db2a0a0376b 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -218,6 +218,8 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) host->clk_enable = sh_mobile_sdhi_clk_enable; host->clk_disable = sh_mobile_sdhi_clk_disable; host->multi_io_quirk = sh_mobile_sdhi_multi_io_quirk; + /* SD control register space size is 0x100, 0x200 for bus_shift=1 */ + host->bus_shift = resource_size(res) >> 9; mmc_data->capabilities = MMC_CAP_MMC_HIGHSPEED; if (p) { @@ -277,9 +279,6 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) dma_priv->dma_rx_offset = of_data->dma_rx_offset; } - /* SD control register space size is 0x100, 0x200 for bus_shift=1 */ - mmc_data->bus_shift = resource_size(res) >> 9; - ret = tmio_mmc_host_probe(host, mmc_data); if (ret < 0) goto efree; diff --git a/drivers/mmc/host/tmio_mmc.c b/drivers/mmc/host/tmio_mmc.c index a7c2e459187c..f746df493892 100644 --- a/drivers/mmc/host/tmio_mmc.c +++ b/drivers/mmc/host/tmio_mmc.c @@ -88,14 +88,15 @@ static int tmio_mmc_probe(struct platform_device *pdev) if (!res) return -EINVAL; - /* SD control register space size is 0x200, 0x400 for bus_shift=1 */ - pdata->bus_shift = resource_size(res) >> 10; pdata->flags |= TMIO_MMC_HAVE_HIGH_REG; host = tmio_mmc_host_alloc(pdev); if (!host) goto cell_disable; + /* SD control register space size is 0x200, 0x400 for bus_shift=1 */ + host->bus_shift = resource_size(res) >> 10; + ret = tmio_mmc_host_probe(host, pdata); if (ret) goto host_free; diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 263256c897ed..a5d30bfa7e64 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -66,6 +66,7 @@ struct tmio_mmc_host { struct scatterlist *sg_orig; unsigned int sg_len; unsigned int sg_off; + unsigned long bus_shift; struct platform_device *pdev; struct tmio_mmc_data *pdata; @@ -169,19 +170,19 @@ int tmio_mmc_host_runtime_resume(struct device *dev); static inline u16 sd_ctrl_read16(struct tmio_mmc_host *host, int addr) { - return readw(host->ctl + (addr << host->pdata->bus_shift)); + return readw(host->ctl + (addr << host->bus_shift)); } static inline void sd_ctrl_read16_rep(struct tmio_mmc_host *host, int addr, u16 *buf, int count) { - readsw(host->ctl + (addr << host->pdata->bus_shift), buf, count); + readsw(host->ctl + (addr << host->bus_shift), buf, count); } static inline u32 sd_ctrl_read32(struct tmio_mmc_host *host, int addr) { - return readw(host->ctl + (addr << host->pdata->bus_shift)) | - readw(host->ctl + ((addr + 2) << host->pdata->bus_shift)) << 16; + return readw(host->ctl + (addr << host->bus_shift)) | + readw(host->ctl + ((addr + 2) << host->bus_shift)) << 16; } static inline void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val) @@ -191,19 +192,19 @@ static inline void sd_ctrl_write16(struct tmio_mmc_host *host, int addr, u16 val */ if (host->write16_hook && host->write16_hook(host, addr)) return; - writew(val, host->ctl + (addr << host->pdata->bus_shift)); + writew(val, host->ctl + (addr << host->bus_shift)); } 
static inline void sd_ctrl_write16_rep(struct tmio_mmc_host *host, int addr, u16 *buf, int count) { - writesw(host->ctl + (addr << host->pdata->bus_shift), buf, count); + writesw(host->ctl + (addr << host->bus_shift), buf, count); } static inline void sd_ctrl_write32(struct tmio_mmc_host *host, int addr, u32 val) { - writew(val, host->ctl + (addr << host->pdata->bus_shift)); - writew(val >> 16, host->ctl + ((addr + 2) << host->pdata->bus_shift)); + writew(val, host->ctl + (addr << host->bus_shift)); + writew(val >> 16, host->ctl + ((addr + 2) << host->bus_shift)); } diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index 6c214d60bbb6..ee0131eb078d 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -289,7 +289,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat if (host->dma->chan_priv_tx) cfg.slave_id = host->dma->slave_id_tx; cfg.direction = DMA_MEM_TO_DEV; - cfg.dst_addr = res->start + (CTL_SD_DATA_PORT << host->pdata->bus_shift); + cfg.dst_addr = res->start + (CTL_SD_DATA_PORT << host->bus_shift); cfg.dst_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES; cfg.src_addr = 0; ret = dmaengine_slave_config(host->chan_tx, &cfg); diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 3edaa17d1ccd..7a5c27948315 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -120,7 +120,6 @@ struct tmio_mmc_data { unsigned long capabilities; unsigned long capabilities2; unsigned long flags; - unsigned long bus_shift; u32 ocr_mask; /* available voltages */ unsigned int cd_gpio; void (*set_pwr)(struct platform_device *host, int state); -- cgit v1.2.3 From e471df0bcaa137f1bbe7c5f75db6ce7566caa292 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:58:46 +0000 Subject: mmc: tmio: tmio_mmc_data has .alignment_shift Current .alignment_shift is implemented under tmio_mmc_dma. It goes to tmio_mmc_data by this patch. Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 4 ++-- drivers/mmc/host/tmio_mmc.h | 1 - drivers/mmc/host/tmio_mmc_dma.c | 4 ++-- include/linux/mfd/tmio.h | 1 + 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 0db2a0a0376b..782d8cf942a7 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -241,10 +241,10 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) dma_priv->slave_id_rx = p->dma_slave_rx; } } - - dma_priv->alignment_shift = 1; /* 2-byte alignment */ dma_priv->filter = shdma_chan_filter; + mmc_data->alignment_shift = 1; /* 2-byte alignment */ + /* * All SDHI blocks support 2-byte and larger block sizes in 4-bit * bus width mode. 
diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index a5d30bfa7e64..9482abceeb9c 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -45,7 +45,6 @@ struct tmio_mmc_dma { void *chan_priv_rx; int slave_id_tx; int slave_id_rx; - int alignment_shift; dma_addr_t dma_rx_offset; bool (*filter)(struct dma_chan *chan, void *arg); }; diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index ee0131eb078d..d2b02de2f378 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -52,7 +52,7 @@ static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host) dma_cookie_t cookie; int ret, i; bool aligned = true, multiple = true; - unsigned int align = (1 << host->dma->alignment_shift) - 1; + unsigned int align = (1 << host->pdata->alignment_shift) - 1; for_each_sg(sg, sg_tmp, host->sg_len, i) { if (sg_tmp->offset & align) @@ -128,7 +128,7 @@ static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) dma_cookie_t cookie; int ret, i; bool aligned = true, multiple = true; - unsigned int align = (1 << host->dma->alignment_shift) - 1; + unsigned int align = (1 << host->pdata->alignment_shift) - 1; for_each_sg(sg, sg_tmp, host->sg_len, i) { if (sg_tmp->offset & align) diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 7a5c27948315..28a12d10e509 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -122,6 +122,7 @@ struct tmio_mmc_data { unsigned long flags; u32 ocr_mask; /* available voltages */ unsigned int cd_gpio; + int alignment_shift; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); }; -- cgit v1.2.3 From 8b4c8f32da91681c0dcd321c9e3cd14f866c5517 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:58:56 +0000 Subject: mmc: tmio: tmio_mmc_data has .dma_rx_offset Current .dma_rx_offset is implemented under tmio_mmc_dma. It goes to tmio_mmc_data by this patch. 
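Spelled out, the RX slave address after this patch is the SD data port plus a platform displacement now carried in tmio_mmc_data. A sketch equivalent to the tmio_mmc_dma.c hunk below, expanding the cfg.dst_addr that the TX side computed earlier; example_rx_slave_config() is an invented wrapper:

static int example_rx_slave_config(struct tmio_mmc_host *host,
				   struct resource *res)
{
	struct dma_slave_config cfg = { };

	cfg.direction = DMA_DEV_TO_MEM;
	/* SD data port address, then the platform-specific RX offset: */
	cfg.src_addr = res->start + (CTL_SD_DATA_PORT << host->bus_shift) +
		       host->pdata->dma_rx_offset;
	cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES;

	return dmaengine_slave_config(host->chan_rx, &cfg);
}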
Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 2 +- drivers/mmc/host/tmio_mmc.h | 1 - drivers/mmc/host/tmio_mmc_dma.c | 2 +- include/linux/mfd/tmio.h | 1 + 4 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 782d8cf942a7..c06e93b77995 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -276,7 +276,7 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) mmc_data->flags |= of_data->tmio_flags; mmc_data->capabilities |= of_data->capabilities; mmc_data->capabilities2 |= of_data->capabilities2; - dma_priv->dma_rx_offset = of_data->dma_rx_offset; + mmc_data->dma_rx_offset = of_data->dma_rx_offset; } ret = tmio_mmc_host_probe(host, mmc_data); diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 9482abceeb9c..c9a84695cff2 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -45,7 +45,6 @@ struct tmio_mmc_dma { void *chan_priv_rx; int slave_id_tx; int slave_id_rx; - dma_addr_t dma_rx_offset; bool (*filter)(struct dma_chan *chan, void *arg); }; diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index d2b02de2f378..634b2700cb77 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -308,7 +308,7 @@ void tmio_mmc_request_dma(struct tmio_mmc_host *host, struct tmio_mmc_data *pdat if (host->dma->chan_priv_rx) cfg.slave_id = host->dma->slave_id_rx; cfg.direction = DMA_DEV_TO_MEM; - cfg.src_addr = cfg.dst_addr + host->dma->dma_rx_offset; + cfg.src_addr = cfg.dst_addr + host->pdata->dma_rx_offset; cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES; cfg.dst_addr = 0; ret = dmaengine_slave_config(host->chan_rx, &cfg); diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 28a12d10e509..807ecfb162d7 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -123,6 +123,7 @@ struct tmio_mmc_data { u32 ocr_mask; /* available voltages */ unsigned int cd_gpio; int alignment_shift; + dma_addr_t dma_rx_offset; void (*set_pwr)(struct platform_device *host, int state); void (*set_clk_div)(struct platform_device *host, int state); }; -- cgit v1.2.3 From 010f4aa758f437647799b1fd677a5e2cf31714e9 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 04:59:24 +0000 Subject: mmc: sh_mobile_sdhi: remove .init/.cleanup No one is using the .init/.cleanup callback functions. Let's remove them. 
sdhi_ops and .cd_wakeup are also removed Signed-off-by: Kuninori Morimoto Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 26 +------------------------- include/linux/mmc/sh_mobile_sdhi.h | 15 --------------- 2 files changed, 1 insertion(+), 40 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index c06e93b77995..48a3be804cc0 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -156,15 +156,6 @@ static int sh_mobile_sdhi_multi_io_quirk(struct mmc_card *card, return blk_size; } -static void sh_mobile_sdhi_cd_wakeup(const struct platform_device *pdev) -{ - mmc_detect_change(platform_get_drvdata(pdev), msecs_to_jiffies(100)); -} - -static const struct sh_mobile_sdhi_ops sdhi_ops = { - .cd_wakeup = sh_mobile_sdhi_cd_wakeup, -}; - static int sh_mobile_sdhi_probe(struct platform_device *pdev) { const struct of_device_id *of_id = @@ -192,19 +183,11 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) mmc_data = &priv->mmc_data; dma_priv = &priv->dma_priv; - if (p) { - if (p->init) { - ret = p->init(pdev, &sdhi_ops); - if (ret) - return ret; - } - } - priv->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(priv->clk)) { ret = PTR_ERR(priv->clk); dev_err(&pdev->dev, "cannot get clock: %d\n", ret); - goto eclkget; + goto eprobe; } host = tmio_mmc_host_alloc(pdev); @@ -359,9 +342,6 @@ eirq: efree: tmio_mmc_host_free(host); eprobe: -eclkget: - if (p && p->cleanup) - p->cleanup(pdev); return ret; } @@ -369,13 +349,9 @@ static int sh_mobile_sdhi_remove(struct platform_device *pdev) { struct mmc_host *mmc = platform_get_drvdata(pdev); struct tmio_mmc_host *host = mmc_priv(mmc); - struct sh_mobile_sdhi_info *p = pdev->dev.platform_data; tmio_mmc_host_remove(host); - if (p && p->cleanup) - p->cleanup(pdev); - return 0; } diff --git a/include/linux/mmc/sh_mobile_sdhi.h b/include/linux/mmc/sh_mobile_sdhi.h index 68927ae50845..da77e5e2041d 100644 --- a/include/linux/mmc/sh_mobile_sdhi.h +++ b/include/linux/mmc/sh_mobile_sdhi.h @@ -3,20 +3,10 @@ #include -struct platform_device; - #define SH_MOBILE_SDHI_IRQ_CARD_DETECT "card_detect" #define SH_MOBILE_SDHI_IRQ_SDCARD "sdcard" #define SH_MOBILE_SDHI_IRQ_SDIO "sdio" -/** - * struct sh_mobile_sdhi_ops - SDHI driver callbacks - * @cd_wakeup: trigger a card-detection run - */ -struct sh_mobile_sdhi_ops { - void (*cd_wakeup)(const struct platform_device *pdev); -}; - struct sh_mobile_sdhi_info { int dma_slave_tx; int dma_slave_rx; @@ -25,11 +15,6 @@ struct sh_mobile_sdhi_info { unsigned long tmio_caps2; u32 tmio_ocr_mask; /* available MMC voltages */ unsigned int cd_gpio; - - /* callbacks for board specific setup code */ - int (*init)(struct platform_device *pdev, - const struct sh_mobile_sdhi_ops *ops); - void (*cleanup)(struct platform_device *pdev); }; #endif /* LINUX_MMC_SH_MOBILE_SDHI_H */ -- cgit v1.2.3 From de122cb1745313f331dc7c7923b484343d455e64 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 13 Jan 2015 05:00:39 +0000 Subject: mmc: tmio: remove TMIO_MMC_HAVE_CTL_DMA_REG flag tmio_mmc_host has .enable_dma callback now. We don't need TMIO_MMC_HAVE_CTL_DMA_REG anymore. 
Let's remove it Signed-off-by: Kuninori Morimoto Acked-by: Lee Jones Signed-off-by: Ulf Hansson --- drivers/mmc/host/sh_mobile_sdhi.c | 5 ----- drivers/mmc/host/tmio_mmc_dma.c | 3 --- include/linux/mfd/tmio.h | 5 ----- 3 files changed, 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/host/sh_mobile_sdhi.c b/drivers/mmc/host/sh_mobile_sdhi.c index 7dbcb577f596..6906a905cd54 100644 --- a/drivers/mmc/host/sh_mobile_sdhi.c +++ b/drivers/mmc/host/sh_mobile_sdhi.c @@ -291,11 +291,6 @@ static int sh_mobile_sdhi_probe(struct platform_device *pdev) */ mmc_data->flags |= TMIO_MMC_SDIO_STATUS_QUIRK; - /* - * All SDHI have DMA control register - */ - mmc_data->flags |= TMIO_MMC_HAVE_CTL_DMA_REG; - if (of_id && of_id->data) { const struct sh_mobile_sdhi_of_data *of_data = of_id->data; mmc_data->flags |= of_data->tmio_flags; diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index aa5f4b6e790d..331bb618e398 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -28,9 +28,6 @@ void tmio_mmc_enable_dma(struct tmio_mmc_host *host, bool enable) if (!host->chan_tx || !host->chan_rx) return; - if (host->pdata->flags & TMIO_MMC_HAVE_CTL_DMA_REG) - sd_ctrl_write16(host, CTL_DMA_ENABLE, enable ? 2 : 0); - if (host->dma->enable) host->dma->enable(host, enable); } diff --git a/include/linux/mfd/tmio.h b/include/linux/mfd/tmio.h index 807ecfb162d7..605812820e48 100644 --- a/include/linux/mfd/tmio.h +++ b/include/linux/mfd/tmio.h @@ -95,11 +95,6 @@ */ #define TMIO_MMC_SDIO_STATUS_QUIRK (1 << 8) -/* - * Some controllers have DMA enable/disable register - */ -#define TMIO_MMC_HAVE_CTL_DMA_REG (1 << 9) - /* * Some controllers allows to set SDx actual clock */ -- cgit v1.2.3 From 1dfb4a0d7615811ec4a61b0a7631c8ddc0baf335 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 13 Jan 2015 08:00:29 +0100 Subject: gpio: stmpe: enforce device tree only mode Require that device tree be used with STMPE (all platforms use this) and enforce OF_GPIO, then delete the platform data. 
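The driver-side consequence, distilled into a sketch (the helper name is invented): the device tree is now the only configuration source, and because the stmpe_gpio structure is kzalloc()ed, an absent "st,norequest-mask" property simply leaves the mask at zero, since of_property_read_u32() does not touch its output argument on failure.

#include <linux/of.h>

static u32 stmpe_gpio_norequest_mask(struct device_node *np)
{
	u32 mask = 0;	/* no property: every GPIO stays requestable */

	of_property_read_u32(np, "st,norequest-mask", &mask);
	return mask;
}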
Signed-off-by: Linus Walleij --- drivers/gpio/Kconfig | 1 + drivers/gpio/gpio-stmpe.c | 23 +++-------------------- include/linux/mfd/stmpe.h | 16 ---------------- 3 files changed, 4 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 4187fcbc88fd..22b46567b3bb 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -644,6 +644,7 @@ config GPIO_SX150X config GPIO_STMPE bool "STMPE GPIOs" depends on MFD_STMPE + depends on OF_GPIO select GPIOLIB_IRQCHIP help This enables support for the GPIOs found on the STMPE I/O diff --git a/drivers/gpio/gpio-stmpe.c b/drivers/gpio/gpio-stmpe.c index 85c5b1974294..dabfb99dddef 100644 --- a/drivers/gpio/gpio-stmpe.c +++ b/drivers/gpio/gpio-stmpe.c @@ -30,7 +30,7 @@ struct stmpe_gpio { struct stmpe *stmpe; struct device *dev; struct mutex irq_lock; - unsigned norequest_mask; + u32 norequest_mask; /* Caches of interrupt control registers for bus_lock */ u8 regs[CACHE_NR_REGS][CACHE_NR_BANKS]; u8 oldregs[CACHE_NR_REGS][CACHE_NR_BANKS]; @@ -340,13 +340,10 @@ static int stmpe_gpio_probe(struct platform_device *pdev) { struct stmpe *stmpe = dev_get_drvdata(pdev->dev.parent); struct device_node *np = pdev->dev.of_node; - struct stmpe_gpio_platform_data *pdata; struct stmpe_gpio *stmpe_gpio; int ret; int irq = 0; - pdata = stmpe->pdata->gpio; - irq = platform_get_irq(pdev, 0); stmpe_gpio = kzalloc(sizeof(struct stmpe_gpio), GFP_KERNEL); @@ -360,19 +357,14 @@ static int stmpe_gpio_probe(struct platform_device *pdev) stmpe_gpio->chip = template_chip; stmpe_gpio->chip.ngpio = stmpe->num_gpios; stmpe_gpio->chip.dev = &pdev->dev; -#ifdef CONFIG_OF stmpe_gpio->chip.of_node = np; -#endif stmpe_gpio->chip.base = -1; if (IS_ENABLED(CONFIG_DEBUG_FS)) stmpe_gpio->chip.dbg_show = stmpe_dbg_show; - if (pdata) - stmpe_gpio->norequest_mask = pdata->norequest_mask; - else if (np) - of_property_read_u32(np, "st,norequest-mask", - &stmpe_gpio->norequest_mask); + of_property_read_u32(np, "st,norequest-mask", + &stmpe_gpio->norequest_mask); if (irq < 0) dev_info(&pdev->dev, @@ -414,9 +406,6 @@ static int stmpe_gpio_probe(struct platform_device *pdev) NULL); } - if (pdata && pdata->setup) - pdata->setup(stmpe, stmpe_gpio->chip.base); - platform_set_drvdata(pdev, stmpe_gpio); return 0; @@ -433,15 +422,9 @@ static int stmpe_gpio_remove(struct platform_device *pdev) { struct stmpe_gpio *stmpe_gpio = platform_get_drvdata(pdev); struct stmpe *stmpe = stmpe_gpio->stmpe; - struct stmpe_gpio_platform_data *pdata = stmpe->pdata->gpio; - - if (pdata && pdata->remove) - pdata->remove(stmpe, stmpe_gpio->chip.base); gpiochip_remove(&stmpe_gpio->chip); - stmpe_disable(stmpe, STMPE_BLOCK_GPIO); - kfree(stmpe_gpio); return 0; diff --git a/include/linux/mfd/stmpe.h b/include/linux/mfd/stmpe.h index f742b6717d52..c9d869027300 100644 --- a/include/linux/mfd/stmpe.h +++ b/include/linux/mfd/stmpe.h @@ -117,20 +117,6 @@ extern int stmpe_disable(struct stmpe *stmpe, unsigned int blocks); #define STMPE_GPIO_NOREQ_811_TOUCH (0xf0) -/** - * struct stmpe_gpio_platform_data - STMPE GPIO platform data - * @norequest_mask: bitmask specifying which GPIOs should _not_ be - * requestable due to different usage (e.g. touch, keypad) - * STMPE_GPIO_NOREQ_* macros can be used here. - * @setup: board specific setup callback. 
- * @remove: board specific remove callback - */ -struct stmpe_gpio_platform_data { - unsigned norequest_mask; - void (*setup)(struct stmpe *stmpe, unsigned gpio_base); - void (*remove)(struct stmpe *stmpe, unsigned gpio_base); -}; - /** * struct stmpe_ts_platform_data - stmpe811 touch screen controller platform * data @@ -182,7 +168,6 @@ struct stmpe_ts_platform_data { * @irq_over_gpio: true if gpio is used to get irq * @irq_gpio: gpio number over which irq will be requested (significant only if * irq_over_gpio is true) - * @gpio: GPIO-specific platform data * @ts: touchscreen-specific platform data */ struct stmpe_platform_data { @@ -194,7 +179,6 @@ struct stmpe_platform_data { int irq_gpio; int autosleep_timeout; - struct stmpe_gpio_platform_data *gpio; struct stmpe_ts_platform_data *ts; }; -- cgit v1.2.3 From 4b6eade76ad19183464b739e9af1efacdb1bbda8 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 20 Jan 2015 11:00:53 +0100 Subject: mfd: max77693: Add defines for MAX77693 charger driver Prepare for adding support for Maxim 77693 charger by adding necessary new defines. Signed-off-by: Krzysztof Kozlowski Acked-by: Lee Jones Signed-off-by: Sebastian Reichel --- include/linux/mfd/max77693-private.h | 108 +++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h index 08dae01258b9..955dd990beaf 100644 --- a/include/linux/mfd/max77693-private.h +++ b/include/linux/mfd/max77693-private.h @@ -143,10 +143,118 @@ enum max77693_pmic_reg { #define FLASH_INT_FLED1_SHORT BIT(3) #define FLASH_INT_OVER_CURRENT BIT(4) +/* Fast charge timer in in hours */ +#define DEFAULT_FAST_CHARGE_TIMER 4 +/* microamps */ +#define DEFAULT_TOP_OFF_THRESHOLD_CURRENT 150000 +/* minutes */ +#define DEFAULT_TOP_OFF_TIMER 30 +/* microvolts */ +#define DEFAULT_CONSTANT_VOLT 4200000 +/* microvolts */ +#define DEFAULT_MIN_SYSTEM_VOLT 3600000 +/* celsius */ +#define DEFAULT_THERMAL_REGULATION_TEMP 100 +/* microamps */ +#define DEFAULT_BATTERY_OVERCURRENT 3500000 +/* microvolts */ +#define DEFAULT_CHARGER_INPUT_THRESHOLD_VOLT 4300000 + +/* MAX77693_CHG_REG_CHG_INT_OK register */ +#define CHG_INT_OK_BYP_SHIFT 0 +#define CHG_INT_OK_BAT_SHIFT 3 +#define CHG_INT_OK_CHG_SHIFT 4 +#define CHG_INT_OK_CHGIN_SHIFT 6 +#define CHG_INT_OK_DETBAT_SHIFT 7 +#define CHG_INT_OK_BYP_MASK BIT(CHG_INT_OK_BYP_SHIFT) +#define CHG_INT_OK_BAT_MASK BIT(CHG_INT_OK_BAT_SHIFT) +#define CHG_INT_OK_CHG_MASK BIT(CHG_INT_OK_CHG_SHIFT) +#define CHG_INT_OK_CHGIN_MASK BIT(CHG_INT_OK_CHGIN_SHIFT) +#define CHG_INT_OK_DETBAT_MASK BIT(CHG_INT_OK_DETBAT_SHIFT) + +/* MAX77693_CHG_REG_CHG_DETAILS_00 register */ +#define CHG_DETAILS_00_CHGIN_SHIFT 5 +#define CHG_DETAILS_00_CHGIN_MASK (0x3 << CHG_DETAILS_00_CHGIN_SHIFT) + +/* MAX77693_CHG_REG_CHG_DETAILS_01 register */ +#define CHG_DETAILS_01_CHG_SHIFT 0 +#define CHG_DETAILS_01_BAT_SHIFT 4 +#define CHG_DETAILS_01_TREG_SHIFT 7 +#define CHG_DETAILS_01_CHG_MASK (0xf << CHG_DETAILS_01_CHG_SHIFT) +#define CHG_DETAILS_01_BAT_MASK (0x7 << CHG_DETAILS_01_BAT_SHIFT) +#define CHG_DETAILS_01_TREG_MASK BIT(7) + +/* MAX77693_CHG_REG_CHG_DETAILS_01/CHG field */ +enum max77693_charger_charging_state { + MAX77693_CHARGING_PREQUALIFICATION = 0x0, + MAX77693_CHARGING_FAST_CONST_CURRENT, + MAX77693_CHARGING_FAST_CONST_VOLTAGE, + MAX77693_CHARGING_TOP_OFF, + MAX77693_CHARGING_DONE, + MAX77693_CHARGING_HIGH_TEMP, + MAX77693_CHARGING_TIMER_EXPIRED, + MAX77693_CHARGING_THERMISTOR_SUSPEND, + 
MAX77693_CHARGING_OFF, + MAX77693_CHARGING_RESERVED, + MAX77693_CHARGING_OVER_TEMP, + MAX77693_CHARGING_WATCHDOG_EXPIRED, +}; + +/* MAX77693_CHG_REG_CHG_DETAILS_01/BAT field */ +enum max77693_charger_battery_state { + MAX77693_BATTERY_NOBAT = 0x0, + /* Dead-battery or low-battery prequalification */ + MAX77693_BATTERY_PREQUALIFICATION, + MAX77693_BATTERY_TIMER_EXPIRED, + MAX77693_BATTERY_GOOD, + MAX77693_BATTERY_LOWVOLTAGE, + MAX77693_BATTERY_OVERVOLTAGE, + MAX77693_BATTERY_OVERCURRENT, + MAX77693_BATTERY_RESERVED, +}; + +/* MAX77693_CHG_REG_CHG_DETAILS_02 register */ +#define CHG_DETAILS_02_BYP_SHIFT 0 +#define CHG_DETAILS_02_BYP_MASK (0xf << CHG_DETAILS_02_BYP_SHIFT) + /* MAX77693 CHG_CNFG_00 register */ #define CHG_CNFG_00_CHG_MASK 0x1 #define CHG_CNFG_00_BUCK_MASK 0x4 +/* MAX77693_CHG_REG_CHG_CNFG_01 register */ +#define CHG_CNFG_01_FCHGTIME_SHIFT 0 +#define CHG_CNFG_01_CHGRSTRT_SHIFT 4 +#define CHG_CNFG_01_PQEN_SHIFT 7 +#define CHG_CNFG_01_FCHGTIME_MASK (0x7 << CHG_CNFG_01_FCHGTIME_SHIFT) +#define CHG_CNFG_01_CHGRSTRT_MASK (0x3 << CHG_CNFG_01_CHGRSTRT_SHIFT) +#define CHG_CNFG_01_PQEN_MAKS BIT(CHG_CNFG_01_PQEN_SHIFT) + +/* MAX77693_CHG_REG_CHG_CNFG_03 register */ +#define CHG_CNFG_03_TOITH_SHIFT 0 +#define CHG_CNFG_03_TOTIME_SHIFT 3 +#define CHG_CNFG_03_TOITH_MASK (0x7 << CHG_CNFG_03_TOITH_SHIFT) +#define CHG_CNFG_03_TOTIME_MASK (0x7 << CHG_CNFG_03_TOTIME_SHIFT) + +/* MAX77693_CHG_REG_CHG_CNFG_04 register */ +#define CHG_CNFG_04_CHGCVPRM_SHIFT 0 +#define CHG_CNFG_04_MINVSYS_SHIFT 5 +#define CHG_CNFG_04_CHGCVPRM_MASK (0x1f << CHG_CNFG_04_CHGCVPRM_SHIFT) +#define CHG_CNFG_04_MINVSYS_MASK (0x7 << CHG_CNFG_04_MINVSYS_SHIFT) + +/* MAX77693_CHG_REG_CHG_CNFG_06 register */ +#define CHG_CNFG_06_CHGPROT_SHIFT 2 +#define CHG_CNFG_06_CHGPROT_MASK (0x3 << CHG_CNFG_06_CHGPROT_SHIFT) + +/* MAX77693_CHG_REG_CHG_CNFG_07 register */ +#define CHG_CNFG_07_REGTEMP_SHIFT 5 +#define CHG_CNFG_07_REGTEMP_MASK (0x3 << CHG_CNFG_07_REGTEMP_SHIFT) + +/* MAX77693_CHG_REG_CHG_CNFG_12 register */ +#define CHG_CNFG_12_B2SOVRC_SHIFT 0 +#define CHG_CNFG_12_VCHGINREG_SHIFT 3 +#define CHG_CNFG_12_B2SOVRC_MASK (0x7 << CHG_CNFG_12_B2SOVRC_SHIFT) +#define CHG_CNFG_12_VCHGINREG_MASK (0x3 << CHG_CNFG_12_VCHGINREG_SHIFT) + /* MAX77693 CHG_CNFG_09 Register */ #define CHG_CNFG_09_CHGIN_ILIM_MASK 0x7F -- cgit v1.2.3 From 2fded7f44b8fcf79e274c3f0cfbd0298f95308f3 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 23 Dec 2014 16:39:54 -0500 Subject: audit: remove vestiges of vers_ops Should have been removed with commit 18900909 ("audit: remove the old depricated kernel interface"). 
Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 1 - kernel/auditfilter.c | 2 -- 2 files changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 93331929d643..b481779a8dee 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -46,7 +46,6 @@ struct audit_tree; struct sk_buff; struct audit_krule { - int vers_ops; u32 pflags; u32 flags; u32 listnr; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 103586e239a2..81c94d739e3f 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -425,7 +425,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, goto exit_nofree; bufp = data->buf; - entry->rule.vers_ops = 2; for (i = 0; i < data->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; @@ -758,7 +757,6 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old) return ERR_PTR(-ENOMEM); new = &entry->rule; - new->vers_ops = old->vers_ops; new->flags = old->flags; new->pflags = old->pflags; new->listnr = old->listnr; -- cgit v1.2.3 From 39e794bff718188cfb9ace2032cbe4fd86048dc6 Mon Sep 17 00:00:00 2001 From: Yaniv Gardi Date: Thu, 15 Jan 2015 16:32:36 +0200 Subject: phy: qcom-ufs: add support for 20nm phy This change adds support for a 20nm qcom-ufs phy that is required in platforms that use the ufs-qcom controller. Signed-off-by: Yaniv Gardi Reviewed-by: Dov Levenglick Signed-off-by: Christoph Hellwig --- drivers/phy/Makefile | 1 + drivers/phy/phy-qcom-ufs-i.h | 43 +++++- drivers/phy/phy-qcom-ufs-qmp-20nm.c | 257 ++++++++++++++++++++++++++++++++++++ drivers/phy/phy-qcom-ufs-qmp-20nm.h | 235 +++++++++++++++++++++++++++++++++ include/linux/phy/phy-qcom-ufs.h | 59 +++++++++ 5 files changed, 594 insertions(+), 1 deletion(-) create mode 100644 drivers/phy/phy-qcom-ufs-qmp-20nm.c create mode 100644 drivers/phy/phy-qcom-ufs-qmp-20nm.h create mode 100644 include/linux/phy/phy-qcom-ufs.h (limited to 'include/linux') diff --git a/drivers/phy/Makefile b/drivers/phy/Makefile index 335965d3a9b7..781b2fa46d09 100644 --- a/drivers/phy/Makefile +++ b/drivers/phy/Makefile @@ -35,3 +35,4 @@ obj-$(CONFIG_PHY_XGENE) += phy-xgene.o obj-$(CONFIG_PHY_STIH407_USB) += phy-stih407-usb.o obj-$(CONFIG_PHY_STIH41X_USB) += phy-stih41x-usb.o obj-$(CONFIG_PHY_QCOM_UFS) += phy-qcom-ufs.o +obj-$(CONFIG_PHY_QCOM_UFS) += phy-qcom-ufs-qmp-20nm.o diff --git a/drivers/phy/phy-qcom-ufs-i.h b/drivers/phy/phy-qcom-ufs-i.h index dac200f4d639..591a39175e8a 100644 --- a/drivers/phy/phy-qcom-ufs-i.h +++ b/drivers/phy/phy-qcom-ufs-i.h @@ -15,15 +15,56 @@ #ifndef UFS_QCOM_PHY_I_H_ #define UFS_QCOM_PHY_I_H_ +#include #include +#include #include -#include #include #include #include +#define readl_poll_timeout(addr, val, cond, sleep_us, timeout_us) \ +({ \ + ktime_t timeout = ktime_add_us(ktime_get(), timeout_us); \ + might_sleep_if(timeout_us); \ + for (;;) { \ + (val) = readl(addr); \ + if (cond) \ + break; \ + if (timeout_us && ktime_compare(ktime_get(), timeout) > 0) { \ + (val) = readl(addr); \ + break; \ + } \ + if (sleep_us) \ + usleep_range(DIV_ROUND_UP(sleep_us, 4), sleep_us); \ + } \ + (cond) ? 
0 : -ETIMEDOUT; \ +}) + +#define UFS_QCOM_PHY_CAL_ENTRY(reg, val) \ + { \ + .reg_offset = reg, \ + .cfg_value = val, \ + } + #define UFS_QCOM_PHY_NAME_LEN 30 +enum { + MASK_SERDES_START = 0x1, + MASK_PCS_READY = 0x1, +}; + +enum { + OFFSET_SERDES_START = 0x0, +}; + +struct ufs_qcom_phy_stored_attributes { + u32 att; + u32 value; +}; + + struct ufs_qcom_phy_calibration { u32 reg_offset; u32 cfg_value; diff --git a/drivers/phy/phy-qcom-ufs-qmp-20nm.c b/drivers/phy/phy-qcom-ufs-qmp-20nm.c new file mode 100644 index 000000000000..8332f96b2c4a --- /dev/null +++ b/drivers/phy/phy-qcom-ufs-qmp-20nm.c @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2013-2015, Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include "phy-qcom-ufs-qmp-20nm.h" + +#define UFS_PHY_NAME "ufs_phy_qmp_20nm" + +static +int ufs_qcom_phy_qmp_20nm_phy_calibrate(struct ufs_qcom_phy *ufs_qcom_phy, + bool is_rate_B) +{ + struct ufs_qcom_phy_calibration *tbl_A, *tbl_B; + int tbl_size_A, tbl_size_B; + u8 major = ufs_qcom_phy->host_ctrl_rev_major; + u16 minor = ufs_qcom_phy->host_ctrl_rev_minor; + u16 step = ufs_qcom_phy->host_ctrl_rev_step; + int err; + + if ((major == 0x1) && (minor == 0x002) && (step == 0x0000)) { + tbl_size_A = ARRAY_SIZE(phy_cal_table_rate_A_1_2_0); + tbl_A = phy_cal_table_rate_A_1_2_0; + } else if ((major == 0x1) && (minor == 0x003) && (step == 0x0000)) { + tbl_size_A = ARRAY_SIZE(phy_cal_table_rate_A_1_3_0); + tbl_A = phy_cal_table_rate_A_1_3_0; + } else { + dev_err(ufs_qcom_phy->dev, "%s: Unknown UFS-PHY version, no calibration values\n", + __func__); + err = -ENODEV; + goto out; + } + + tbl_size_B = ARRAY_SIZE(phy_cal_table_rate_B); + tbl_B = phy_cal_table_rate_B; + + err = ufs_qcom_phy_calibrate(ufs_qcom_phy, tbl_A, tbl_size_A, + tbl_B, tbl_size_B, is_rate_B); + + if (err) + dev_err(ufs_qcom_phy->dev, "%s: ufs_qcom_phy_calibrate() failed %d\n", + __func__, err); + +out: + return err; +} + +static +void ufs_qcom_phy_qmp_20nm_advertise_quirks(struct ufs_qcom_phy *phy_common) +{ + phy_common->quirks = + UFS_QCOM_PHY_QUIRK_HIBERN8_EXIT_AFTER_PHY_PWR_COLLAPSE; +} + +static int ufs_qcom_phy_qmp_20nm_init(struct phy *generic_phy) +{ + struct ufs_qcom_phy_qmp_20nm *phy = phy_get_drvdata(generic_phy); + struct ufs_qcom_phy *phy_common = &phy->common_cfg; + int err = 0; + + err = ufs_qcom_phy_init_clks(generic_phy, phy_common); + if (err) { + dev_err(phy_common->dev, "%s: ufs_qcom_phy_init_clks() failed %d\n", + __func__, err); + goto out; + } + + err = ufs_qcom_phy_init_vregulators(generic_phy, phy_common); + if (err) { + dev_err(phy_common->dev, "%s: ufs_qcom_phy_init_vregulators() failed %d\n", + __func__, err); + goto out; + } + + ufs_qcom_phy_qmp_20nm_advertise_quirks(phy_common); + +out: + return err; +} + +static +void ufs_qcom_phy_qmp_20nm_power_control(struct ufs_qcom_phy *phy, bool val) +{ + bool hibern8_exit_after_pwr_collapse = phy->quirks & + UFS_QCOM_PHY_QUIRK_HIBERN8_EXIT_AFTER_PHY_PWR_COLLAPSE; + + if (val) { + writel_relaxed(0x1, phy->mmio + UFS_PHY_POWER_DOWN_CONTROL); + /* + * Before any transactions involving PHY, ensure PHY knows + * that 
its analog rail is powered ON. + */ + mb(); + + if (hibern8_exit_after_pwr_collapse) { + /* + * Give at least 1us delay after restoring PHY analog + * power. + */ + usleep_range(1, 2); + writel_relaxed(0x0A, phy->mmio + + QSERDES_COM_SYSCLK_EN_SEL_TXBAND); + writel_relaxed(0x08, phy->mmio + + QSERDES_COM_SYSCLK_EN_SEL_TXBAND); + /* + * Make sure workaround is deactivated before proceeding + * with normal PHY operations. + */ + mb(); + } + } else { + if (hibern8_exit_after_pwr_collapse) { + writel_relaxed(0x0A, phy->mmio + + QSERDES_COM_SYSCLK_EN_SEL_TXBAND); + writel_relaxed(0x02, phy->mmio + + QSERDES_COM_SYSCLK_EN_SEL_TXBAND); + /* + * Make sure that above workaround is activated before + * PHY analog power collapse. + */ + mb(); + } + + writel_relaxed(0x0, phy->mmio + UFS_PHY_POWER_DOWN_CONTROL); + /* + * ensure that PHY knows its PHY analog rail is going + * to be powered down + */ + mb(); + } +} + +static +void ufs_qcom_phy_qmp_20nm_set_tx_lane_enable(struct ufs_qcom_phy *phy, u32 val) +{ + writel_relaxed(val & UFS_PHY_TX_LANE_ENABLE_MASK, + phy->mmio + UFS_PHY_TX_LANE_ENABLE); + mb(); +} + +static inline void ufs_qcom_phy_qmp_20nm_start_serdes(struct ufs_qcom_phy *phy) +{ + u32 tmp; + + tmp = readl_relaxed(phy->mmio + UFS_PHY_PHY_START); + tmp &= ~MASK_SERDES_START; + tmp |= (1 << OFFSET_SERDES_START); + writel_relaxed(tmp, phy->mmio + UFS_PHY_PHY_START); + mb(); +} + +static int ufs_qcom_phy_qmp_20nm_is_pcs_ready(struct ufs_qcom_phy *phy_common) +{ + int err = 0; + u32 val; + + err = readl_poll_timeout(phy_common->mmio + UFS_PHY_PCS_READY_STATUS, + val, (val & MASK_PCS_READY), 10, 1000000); + if (err) + dev_err(phy_common->dev, "%s: poll for pcs failed err = %d\n", + __func__, err); + return err; +} + +static struct phy_ops ufs_qcom_phy_qmp_20nm_phy_ops = { + .init = ufs_qcom_phy_qmp_20nm_init, + .exit = ufs_qcom_phy_exit, + .power_on = ufs_qcom_phy_power_on, + .power_off = ufs_qcom_phy_power_off, + .owner = THIS_MODULE, +}; + +static struct ufs_qcom_phy_specific_ops phy_20nm_ops = { + .calibrate_phy = ufs_qcom_phy_qmp_20nm_phy_calibrate, + .start_serdes = ufs_qcom_phy_qmp_20nm_start_serdes, + .is_physical_coding_sublayer_ready = ufs_qcom_phy_qmp_20nm_is_pcs_ready, + .set_tx_lane_enable = ufs_qcom_phy_qmp_20nm_set_tx_lane_enable, + .power_control = ufs_qcom_phy_qmp_20nm_power_control, +}; + +static int ufs_qcom_phy_qmp_20nm_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct phy *generic_phy; + struct ufs_qcom_phy_qmp_20nm *phy; + int err = 0; + + phy = devm_kzalloc(dev, sizeof(*phy), GFP_KERNEL); + if (!phy) { + dev_err(dev, "%s: failed to allocate phy\n", __func__); + err = -ENOMEM; + goto out; + } + + generic_phy = ufs_qcom_phy_generic_probe(pdev, &phy->common_cfg, + &ufs_qcom_phy_qmp_20nm_phy_ops, &phy_20nm_ops); + + if (!generic_phy) { + dev_err(dev, "%s: ufs_qcom_phy_generic_probe() failed\n", + __func__); + err = -EIO; + goto out; + } + + phy_set_drvdata(generic_phy, phy); + + strlcpy(phy->common_cfg.name, UFS_PHY_NAME, + sizeof(phy->common_cfg.name)); + +out: + return err; +} + +static int ufs_qcom_phy_qmp_20nm_remove(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + struct phy *generic_phy = to_phy(dev); + struct ufs_qcom_phy *ufs_qcom_phy = get_ufs_qcom_phy(generic_phy); + int err = 0; + + err = ufs_qcom_phy_remove(generic_phy, ufs_qcom_phy); + if (err) + dev_err(dev, "%s: ufs_qcom_phy_remove failed = %d\n", + __func__, err); + + return err; +} + +static const struct of_device_id ufs_qcom_phy_qmp_20nm_of_match[] = { + 
{.compatible = "qcom,ufs-phy-qmp-20nm"}, + {}, +}; +MODULE_DEVICE_TABLE(of, ufs_qcom_phy_qmp_20nm_of_match); + +static struct platform_driver ufs_qcom_phy_qmp_20nm_driver = { + .probe = ufs_qcom_phy_qmp_20nm_probe, + .remove = ufs_qcom_phy_qmp_20nm_remove, + .driver = { + .of_match_table = ufs_qcom_phy_qmp_20nm_of_match, + .name = "ufs_qcom_phy_qmp_20nm", + .owner = THIS_MODULE, + }, +}; + +module_platform_driver(ufs_qcom_phy_qmp_20nm_driver); + +MODULE_DESCRIPTION("Universal Flash Storage (UFS) QCOM PHY QMP 20nm"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/phy/phy-qcom-ufs-qmp-20nm.h b/drivers/phy/phy-qcom-ufs-qmp-20nm.h new file mode 100644 index 000000000000..4f3076bb3d71 --- /dev/null +++ b/drivers/phy/phy-qcom-ufs-qmp-20nm.h @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2013-2015, Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef UFS_QCOM_PHY_QMP_20NM_H_ +#define UFS_QCOM_PHY_QMP_20NM_H_ + +#include "phy-qcom-ufs-i.h" + +/* QCOM UFS PHY control registers */ + +#define COM_OFF(x) (0x000 + x) +#define PHY_OFF(x) (0xC00 + x) +#define TX_OFF(n, x) (0x400 + (0x400 * n) + x) +#define RX_OFF(n, x) (0x600 + (0x400 * n) + x) + +/* UFS PHY PLL block registers */ +#define QSERDES_COM_SYS_CLK_CTRL COM_OFF(0x0) +#define QSERDES_COM_PLL_VCOTAIL_EN COM_OFF(0x04) +#define QSERDES_COM_PLL_CNTRL COM_OFF(0x14) +#define QSERDES_COM_PLL_IP_SETI COM_OFF(0x24) +#define QSERDES_COM_CORE_CLK_IN_SYNC_SEL COM_OFF(0x28) +#define QSERDES_COM_BIAS_EN_CLKBUFLR_EN COM_OFF(0x30) +#define QSERDES_COM_PLL_CP_SETI COM_OFF(0x34) +#define QSERDES_COM_PLL_IP_SETP COM_OFF(0x38) +#define QSERDES_COM_PLL_CP_SETP COM_OFF(0x3C) +#define QSERDES_COM_SYSCLK_EN_SEL_TXBAND COM_OFF(0x48) +#define QSERDES_COM_RESETSM_CNTRL COM_OFF(0x4C) +#define QSERDES_COM_RESETSM_CNTRL2 COM_OFF(0x50) +#define QSERDES_COM_PLLLOCK_CMP1 COM_OFF(0x90) +#define QSERDES_COM_PLLLOCK_CMP2 COM_OFF(0x94) +#define QSERDES_COM_PLLLOCK_CMP3 COM_OFF(0x98) +#define QSERDES_COM_PLLLOCK_CMP_EN COM_OFF(0x9C) +#define QSERDES_COM_BGTC COM_OFF(0xA0) +#define QSERDES_COM_DEC_START1 COM_OFF(0xAC) +#define QSERDES_COM_PLL_AMP_OS COM_OFF(0xB0) +#define QSERDES_COM_RES_CODE_UP_OFFSET COM_OFF(0xD8) +#define QSERDES_COM_RES_CODE_DN_OFFSET COM_OFF(0xDC) +#define QSERDES_COM_DIV_FRAC_START1 COM_OFF(0x100) +#define QSERDES_COM_DIV_FRAC_START2 COM_OFF(0x104) +#define QSERDES_COM_DIV_FRAC_START3 COM_OFF(0x108) +#define QSERDES_COM_DEC_START2 COM_OFF(0x10C) +#define QSERDES_COM_PLL_RXTXEPCLK_EN COM_OFF(0x110) +#define QSERDES_COM_PLL_CRCTRL COM_OFF(0x114) +#define QSERDES_COM_PLL_CLKEPDIV COM_OFF(0x118) + +/* TX LANE n (0, 1) registers */ +#define QSERDES_TX_EMP_POST1_LVL(n) TX_OFF(n, 0x08) +#define QSERDES_TX_DRV_LVL(n) TX_OFF(n, 0x0C) +#define QSERDES_TX_LANE_MODE(n) TX_OFF(n, 0x54) + +/* RX LANE n (0, 1) registers */ +#define QSERDES_RX_CDR_CONTROL1(n) RX_OFF(n, 0x0) +#define QSERDES_RX_CDR_CONTROL_HALF(n) RX_OFF(n, 0x8) +#define QSERDES_RX_RX_EQ_GAIN1_LSB(n) RX_OFF(n, 0xA8) +#define QSERDES_RX_RX_EQ_GAIN1_MSB(n) RX_OFF(n, 0xAC) +#define QSERDES_RX_RX_EQ_GAIN2_LSB(n) RX_OFF(n, 0xB0) +#define 
QSERDES_RX_RX_EQ_GAIN2_MSB(n) RX_OFF(n, 0xB4) +#define QSERDES_RX_RX_EQU_ADAPTOR_CNTRL2(n) RX_OFF(n, 0xBC) +#define QSERDES_RX_CDR_CONTROL_QUARTER(n) RX_OFF(n, 0xC) +#define QSERDES_RX_SIGDET_CNTRL(n) RX_OFF(n, 0x100) + +/* UFS PHY registers */ +#define UFS_PHY_PHY_START PHY_OFF(0x00) +#define UFS_PHY_POWER_DOWN_CONTROL PHY_OFF(0x4) +#define UFS_PHY_TX_LANE_ENABLE PHY_OFF(0x44) +#define UFS_PHY_PWM_G1_CLK_DIVIDER PHY_OFF(0x08) +#define UFS_PHY_PWM_G2_CLK_DIVIDER PHY_OFF(0x0C) +#define UFS_PHY_PWM_G3_CLK_DIVIDER PHY_OFF(0x10) +#define UFS_PHY_PWM_G4_CLK_DIVIDER PHY_OFF(0x14) +#define UFS_PHY_CORECLK_PWM_G1_CLK_DIVIDER PHY_OFF(0x34) +#define UFS_PHY_CORECLK_PWM_G2_CLK_DIVIDER PHY_OFF(0x38) +#define UFS_PHY_CORECLK_PWM_G3_CLK_DIVIDER PHY_OFF(0x3C) +#define UFS_PHY_CORECLK_PWM_G4_CLK_DIVIDER PHY_OFF(0x40) +#define UFS_PHY_OMC_STATUS_RDVAL PHY_OFF(0x68) +#define UFS_PHY_LINE_RESET_TIME PHY_OFF(0x28) +#define UFS_PHY_LINE_RESET_GRANULARITY PHY_OFF(0x2C) +#define UFS_PHY_TSYNC_RSYNC_CNTL PHY_OFF(0x48) +#define UFS_PHY_PLL_CNTL PHY_OFF(0x50) +#define UFS_PHY_TX_LARGE_AMP_DRV_LVL PHY_OFF(0x54) +#define UFS_PHY_TX_SMALL_AMP_DRV_LVL PHY_OFF(0x5C) +#define UFS_PHY_TX_LARGE_AMP_POST_EMP_LVL PHY_OFF(0x58) +#define UFS_PHY_TX_SMALL_AMP_POST_EMP_LVL PHY_OFF(0x60) +#define UFS_PHY_CFG_CHANGE_CNT_VAL PHY_OFF(0x64) +#define UFS_PHY_RX_SYNC_WAIT_TIME PHY_OFF(0x6C) +#define UFS_PHY_TX_MIN_SLEEP_NOCONFIG_TIME_CAPABILITY PHY_OFF(0xB4) +#define UFS_PHY_RX_MIN_SLEEP_NOCONFIG_TIME_CAPABILITY PHY_OFF(0xE0) +#define UFS_PHY_TX_MIN_STALL_NOCONFIG_TIME_CAPABILITY PHY_OFF(0xB8) +#define UFS_PHY_RX_MIN_STALL_NOCONFIG_TIME_CAPABILITY PHY_OFF(0xE4) +#define UFS_PHY_TX_MIN_SAVE_CONFIG_TIME_CAPABILITY PHY_OFF(0xBC) +#define UFS_PHY_RX_MIN_SAVE_CONFIG_TIME_CAPABILITY PHY_OFF(0xE8) +#define UFS_PHY_RX_PWM_BURST_CLOSURE_LENGTH_CAPABILITY PHY_OFF(0xFC) +#define UFS_PHY_RX_MIN_ACTIVATETIME_CAPABILITY PHY_OFF(0x100) +#define UFS_PHY_RX_SIGDET_CTRL3 PHY_OFF(0x14c) +#define UFS_PHY_RMMI_ATTR_CTRL PHY_OFF(0x160) +#define UFS_PHY_RMMI_RX_CFGUPDT_L1 (1 << 7) +#define UFS_PHY_RMMI_TX_CFGUPDT_L1 (1 << 6) +#define UFS_PHY_RMMI_CFGWR_L1 (1 << 5) +#define UFS_PHY_RMMI_CFGRD_L1 (1 << 4) +#define UFS_PHY_RMMI_RX_CFGUPDT_L0 (1 << 3) +#define UFS_PHY_RMMI_TX_CFGUPDT_L0 (1 << 2) +#define UFS_PHY_RMMI_CFGWR_L0 (1 << 1) +#define UFS_PHY_RMMI_CFGRD_L0 (1 << 0) +#define UFS_PHY_RMMI_ATTRID PHY_OFF(0x164) +#define UFS_PHY_RMMI_ATTRWRVAL PHY_OFF(0x168) +#define UFS_PHY_RMMI_ATTRRDVAL_L0_STATUS PHY_OFF(0x16C) +#define UFS_PHY_RMMI_ATTRRDVAL_L1_STATUS PHY_OFF(0x170) +#define UFS_PHY_PCS_READY_STATUS PHY_OFF(0x174) + +#define UFS_PHY_TX_LANE_ENABLE_MASK 0x3 + +/* + * This structure represents the 20nm specific phy. + * common_cfg MUST remain the first field in this structure + * in case extra fields are added. This way, when calling + * get_ufs_qcom_phy() of generic phy, we can extract the + * common phy structure (struct ufs_qcom_phy) out of it + * regardless of the relevant specific phy. 
+ */ +struct ufs_qcom_phy_qmp_20nm { + struct ufs_qcom_phy common_cfg; +}; + +static struct ufs_qcom_phy_calibration phy_cal_table_rate_A_1_2_0[] = { + UFS_QCOM_PHY_CAL_ENTRY(UFS_PHY_POWER_DOWN_CONTROL, 0x01), + UFS_QCOM_PHY_CAL_ENTRY(UFS_PHY_RX_SIGDET_CTRL3, 0x0D), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_VCOTAIL_EN, 0xe1), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CRCTRL, 0xcc), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_SYSCLK_EN_SEL_TXBAND, 0x08), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CLKEPDIV, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_RXTXEPCLK_EN, 0x10), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DEC_START1, 0x82), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DEC_START2, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DIV_FRAC_START1, 0x80), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DIV_FRAC_START2, 0x80), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DIV_FRAC_START3, 0x40), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP1, 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP2, 0x19), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP3, 0x00), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP_EN, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_RESETSM_CNTRL, 0x90), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_RESETSM_CNTRL2, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL1(0), 0xf2), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_HALF(0), 0x0c), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_QUARTER(0), 0x12), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL1(1), 0xf2), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_HALF(1), 0x0c), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_QUARTER(1), 0x12), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_LSB(0), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_MSB(0), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_LSB(0), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_MSB(0), 0x00), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_LSB(1), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_MSB(1), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_LSB(1), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_MSB(1), 0x00), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CP_SETI, 0x3f), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_IP_SETP, 0x1b), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CP_SETP, 0x0f), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_IP_SETI, 0x01), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_EMP_POST1_LVL(0), 0x2F), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_DRV_LVL(0), 0x20), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_EMP_POST1_LVL(1), 0x2F), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_DRV_LVL(1), 0x20), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_LANE_MODE(0), 0x68), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_LANE_MODE(1), 0x68), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQU_ADAPTOR_CNTRL2(1), 0xdc), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQU_ADAPTOR_CNTRL2(0), 0xdc), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_BIAS_EN_CLKBUFLR_EN, 0x3), +}; + +static struct ufs_qcom_phy_calibration phy_cal_table_rate_A_1_3_0[] = { + UFS_QCOM_PHY_CAL_ENTRY(UFS_PHY_POWER_DOWN_CONTROL, 0x01), + UFS_QCOM_PHY_CAL_ENTRY(UFS_PHY_RX_SIGDET_CTRL3, 0x0D), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_VCOTAIL_EN, 0xe1), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CRCTRL, 0xcc), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_SYSCLK_EN_SEL_TXBAND, 0x08), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CLKEPDIV, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_RXTXEPCLK_EN, 0x10), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DEC_START1, 0x82), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DEC_START2, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DIV_FRAC_START1, 0x80), + 
UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DIV_FRAC_START2, 0x80), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DIV_FRAC_START3, 0x40), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP1, 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP2, 0x19), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP3, 0x00), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP_EN, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_RESETSM_CNTRL, 0x90), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_RESETSM_CNTRL2, 0x03), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL1(0), 0xf2), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_HALF(0), 0x0c), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_QUARTER(0), 0x12), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL1(1), 0xf2), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_HALF(1), 0x0c), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_CDR_CONTROL_QUARTER(1), 0x12), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_LSB(0), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_MSB(0), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_LSB(0), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_MSB(0), 0x00), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_LSB(1), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN1_MSB(1), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_LSB(1), 0xff), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQ_GAIN2_MSB(1), 0x00), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CP_SETI, 0x2b), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_IP_SETP, 0x38), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CP_SETP, 0x3c), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_RES_CODE_UP_OFFSET, 0x02), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_RES_CODE_DN_OFFSET, 0x02), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_IP_SETI, 0x01), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLL_CNTRL, 0x40), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_LANE_MODE(0), 0x68), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_TX_LANE_MODE(1), 0x68), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQU_ADAPTOR_CNTRL2(1), 0xdc), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_RX_RX_EQU_ADAPTOR_CNTRL2(0), 0xdc), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_BIAS_EN_CLKBUFLR_EN, 0x3), +}; + +static struct ufs_qcom_phy_calibration phy_cal_table_rate_B[] = { + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_DEC_START1, 0x98), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP1, 0x65), + UFS_QCOM_PHY_CAL_ENTRY(QSERDES_COM_PLLLOCK_CMP2, 0x1e), +}; + +#endif diff --git a/include/linux/phy/phy-qcom-ufs.h b/include/linux/phy/phy-qcom-ufs.h new file mode 100644 index 000000000000..9d18e9f948e9 --- /dev/null +++ b/include/linux/phy/phy-qcom-ufs.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2013-2015, Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef PHY_QCOM_UFS_H_ +#define PHY_QCOM_UFS_H_ + +#include "phy.h" + +/** + * ufs_qcom_phy_enable_ref_clk() - Enable the phy + * ref clock. + * @phy: reference to a generic phy + * + * returns 0 for success, and non-zero for error. + */ +int ufs_qcom_phy_enable_ref_clk(struct phy *phy); + +/** + * ufs_qcom_phy_disable_ref_clk() - Disable the phy + * ref clock. + * @phy: reference to a generic phy. 
+ */ +void ufs_qcom_phy_disable_ref_clk(struct phy *phy); + +/** + * ufs_qcom_phy_enable_dev_ref_clk() - Enable the device + * ref clock. + * @phy: reference to a generic phy. + */ +void ufs_qcom_phy_enable_dev_ref_clk(struct phy *phy); + +/** + * ufs_qcom_phy_disable_dev_ref_clk() - Disable the device + * ref clock. + * @phy: reference to a generic phy. + */ +void ufs_qcom_phy_disable_dev_ref_clk(struct phy *phy); + +int ufs_qcom_phy_enable_iface_clk(struct phy *phy); +void ufs_qcom_phy_disable_iface_clk(struct phy *phy); +int ufs_qcom_phy_start_serdes(struct phy *phy); +int ufs_qcom_phy_set_tx_lane_enable(struct phy *phy, u32 tx_lanes); +int ufs_qcom_phy_calibrate_phy(struct phy *phy, bool is_rate_B); +int ufs_qcom_phy_is_pcs_ready(struct phy *phy); +void ufs_qcom_phy_save_controller_version(struct phy *phy, + u8 major, u16 minor, u16 step); + +#endif /* PHY_QCOM_UFS_H_ */ -- cgit v1.2.3 From 3c33f5b99d688deafd21d4a770303691c7c3a320 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 20 Jan 2015 09:26:19 -0600 Subject: livepatch: support for repatching a function Add support for patching a function multiple times. If multiple patches affect a function, the function in the most recently enabled patch "wins". This enables a cumulative patch upgrade path, where each patch is a superset of previous patches. This requires restructuring the data a little bit. With the current design, where each klp_func struct has its own ftrace_ops, we'd have to unregister the old ops and then register the new ops, because FTRACE_OPS_FL_IPMODIFY prevents us from having two ops registered for the same function at the same time. That would leave a regression window where the function isn't patched at all (not good for a patch upgrade path). This patch replaces the per-klp_func ftrace_ops with a global klp_ops list, with one ftrace_ops per original function. A single ftrace_ops is shared between all klp_funcs which have the same old_addr. This allows the switch between function versions to happen instantaneously by updating the klp_ops struct's func_stack list. The winner is the klp_func at the top of the func_stack (front of the list). 
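To make the stacking rule concrete, here is a small stand-alone C model of the behaviour described above (illustrative only: the names below are invented for the example, and the real kernel data structures appear in the diff that follows). Each patched function gets a stack of replacement versions; enabling a patch pushes its version, callers are always redirected to whichever version is on top, and disabling the newest patch instantly falls back to the previous one.

/* Build with: cc -o klp_model klp_model.c */
#include <stdio.h>

/* One replacement version of a function, provided by one patch. */
struct func_version {
	const char *(*new_func)(void);
	struct func_version *next;	/* previously enabled version, if any */
};

/* Per-function stack; the top entry is the active "winner". */
static struct func_version *func_stack;

/* Enabling a patch pushes its version: most recently enabled wins. */
static void enable_version(struct func_version *v)
{
	v->next = func_stack;
	func_stack = v;
}

/* Disabling pops the top; the previous version takes over instantly. */
static void disable_top(void)
{
	if (func_stack)
		func_stack = func_stack->next;
}

/* Stand-in for the ftrace handler: redirect to the top of the stack. */
static const char *call_patched(void)
{
	return func_stack ? func_stack->new_func() : "original function";
}

static const char *v1(void) { return "version from patch 1"; }
static const char *v2(void) { return "version from patch 2 (superset of 1)"; }

int main(void)
{
	struct func_version p1 = { v1, NULL };
	struct func_version p2 = { v2, NULL };

	enable_version(&p1);
	enable_version(&p2);			/* cumulative upgrade */
	printf("%s\n", call_patched());		/* patch 2 wins */
	disable_top();
	printf("%s\n", call_patched());		/* patch 1 wins again */
	return 0;
}

The kernel version of this idea additionally keys the stack by old_addr (one klp_ops per original function) and uses RCU list primitives so the ftrace handler can pick the winner without taking klp_mutex, as the diff below shows.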
[ jkosina@suse.cz: turn WARN_ON() into WARN_ON_ONCE() in ftrace handler to avoid storm in pathological cases ] Signed-off-by: Josh Poimboeuf Reviewed-by: Jiri Slaby Signed-off-by: Jiri Kosina --- include/linux/livepatch.h | 4 +- kernel/livepatch/core.c | 170 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 121 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 950bc615842f..f14c6fb262b4 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -40,8 +40,8 @@ enum klp_state { * @old_addr: a hint conveying at what address the old function * can be found (optional, vmlinux patches only) * @kobj: kobject for sysfs resources - * @fops: ftrace operations structure * @state: tracks function-level patch application state + * @stack_node: list node for klp_ops func_stack list */ struct klp_func { /* external */ @@ -59,8 +59,8 @@ struct klp_func { /* internal */ struct kobject kobj; - struct ftrace_ops *fops; enum klp_state state; + struct list_head stack_node; }; /** diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 2401e7f955d3..bc05d390ce85 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -29,17 +29,53 @@ #include #include -/* - * The klp_mutex protects the klp_patches list and state transitions of any - * structure reachable from the patches list. References to any structure must - * be obtained under mutex protection. +/** + * struct klp_ops - structure for tracking registered ftrace ops structs + * + * A single ftrace_ops is shared between all enabled replacement functions + * (klp_func structs) which have the same old_addr. This allows the switch + * between function versions to happen instantaneously by updating the klp_ops + * struct's func_stack list. The winner is the klp_func at the top of the + * func_stack (front of the list). + * + * @node: node for the global klp_ops list + * @func_stack: list head for the stack of klp_func's (active func is on top) + * @fops: registered ftrace ops struct */ +struct klp_ops { + struct list_head node; + struct list_head func_stack; + struct ftrace_ops fops; +}; +/* + * The klp_mutex protects the global lists and state transitions of any + * structure reachable from them. References to any structure must be obtained + * under mutex protection (except in klp_ftrace_handler(), which uses RCU to + * ensure it gets consistent data). 
+ */ static DEFINE_MUTEX(klp_mutex); + static LIST_HEAD(klp_patches); +static LIST_HEAD(klp_ops); static struct kobject *klp_root_kobj; +static struct klp_ops *klp_find_ops(unsigned long old_addr) +{ + struct klp_ops *ops; + struct klp_func *func; + + list_for_each_entry(ops, &klp_ops, node) { + func = list_first_entry(&ops->func_stack, struct klp_func, + stack_node); + if (func->old_addr == old_addr) + return ops; + } + + return NULL; +} + static bool klp_is_module(struct klp_object *obj) { return obj->name; @@ -267,16 +303,28 @@ static int klp_write_object_relocations(struct module *pmod, static void notrace klp_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, + struct ftrace_ops *fops, struct pt_regs *regs) { - struct klp_func *func = ops->private; + struct klp_ops *ops; + struct klp_func *func; + + ops = container_of(fops, struct klp_ops, fops); + + rcu_read_lock(); + func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, + stack_node); + rcu_read_unlock(); + + if (WARN_ON_ONCE(!func)) + return; klp_arch_set_pc(regs, (unsigned long)func->new_func); } static int klp_disable_func(struct klp_func *func) { + struct klp_ops *ops; int ret; if (WARN_ON(func->state != KLP_ENABLED)) @@ -285,16 +333,28 @@ static int klp_disable_func(struct klp_func *func) if (WARN_ON(!func->old_addr)) return -EINVAL; - ret = unregister_ftrace_function(func->fops); - if (ret) { - pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", - func->old_name, ret); - return ret; - } + ops = klp_find_ops(func->old_addr); + if (WARN_ON(!ops)) + return -EINVAL; - ret = ftrace_set_filter_ip(func->fops, func->old_addr, 1, 0); - if (ret) - pr_warn("function unregister succeeded but failed to clear the filter\n"); + if (list_is_singular(&ops->func_stack)) { + ret = unregister_ftrace_function(&ops->fops); + if (ret) { + pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + return ret; + } + + ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); + if (ret) + pr_warn("function unregister succeeded but failed to clear the filter\n"); + + list_del_rcu(&func->stack_node); + list_del(&ops->node); + kfree(ops); + } else { + list_del_rcu(&func->stack_node); + } func->state = KLP_DISABLED; @@ -303,6 +363,7 @@ static int klp_disable_func(struct klp_func *func) static int klp_enable_func(struct klp_func *func) { + struct klp_ops *ops; int ret; if (WARN_ON(!func->old_addr)) @@ -311,22 +372,50 @@ static int klp_enable_func(struct klp_func *func) if (WARN_ON(func->state != KLP_DISABLED)) return -EINVAL; - ret = ftrace_set_filter_ip(func->fops, func->old_addr, 0, 0); - if (ret) { - pr_err("failed to set ftrace filter for function '%s' (%d)\n", - func->old_name, ret); - return ret; - } + ops = klp_find_ops(func->old_addr); + if (!ops) { + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + ops->fops.func = klp_ftrace_handler; + ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | + FTRACE_OPS_FL_DYNAMIC | + FTRACE_OPS_FL_IPMODIFY; + + list_add(&ops->node, &klp_ops); + + INIT_LIST_HEAD(&ops->func_stack); + list_add_rcu(&func->stack_node, &ops->func_stack); + + ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 0, 0); + if (ret) { + pr_err("failed to set ftrace filter for function '%s' (%d)\n", + func->old_name, ret); + goto err; + } + + ret = register_ftrace_function(&ops->fops); + if (ret) { + pr_err("failed to register ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + 
ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); + goto err; + } + + - ret = register_ftrace_function(func->fops); - if (ret) { - pr_err("failed to register ftrace handler for function '%s' (%d)\n", - func->old_name, ret); - ftrace_set_filter_ip(func->fops, func->old_addr, 1, 0); } else { - func->state = KLP_ENABLED; + list_add_rcu(&func->stack_node, &ops->func_stack); } + func->state = KLP_ENABLED; + + return ret; + +err: + list_del_rcu(&func->stack_node); + list_del(&ops->node); + kfree(ops); return ret; } @@ -582,10 +671,6 @@ static struct kobj_type klp_ktype_patch = { static void klp_kobj_release_func(struct kobject *kobj) { - struct klp_func *func; - - func = container_of(kobj, struct klp_func, kobj); - kfree(func->fops); } static struct kobj_type klp_ktype_func = { @@ -642,28 +727,11 @@ static void klp_free_patch(struct klp_patch *patch) static int klp_init_func(struct klp_object *obj, struct klp_func *func) { - struct ftrace_ops *ops; - int ret; - - ops = kzalloc(sizeof(*ops), GFP_KERNEL); - if (!ops) - return -ENOMEM; - - ops->private = func; - ops->func = klp_ftrace_handler; - ops->flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_DYNAMIC | - FTRACE_OPS_FL_IPMODIFY; - func->fops = ops; + INIT_LIST_HEAD(&func->stack_node); func->state = KLP_DISABLED; - ret = kobject_init_and_add(&func->kobj, &klp_ktype_func, - obj->kobj, func->old_name); - if (ret) { - kfree(func->fops); - return ret; - } - - return 0; + return kobject_init_and_add(&func->kobj, &klp_ktype_func, + obj->kobj, func->old_name); } /* parts of the initialization that are done only when the object is loaded */ -- cgit v1.2.3 From 97b713ba3ebaa6c8d84c2c720f5468a7c6a6eb4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:31 +0100 Subject: fs: kill BDI_CAP_SWAP_BACKED This bdi flag isn't too useful - we can determine that a vma is backed by either swap or shmem trivially in the caller. This also allows removing the backing_dev_info instances for swap and shmem in favor of noop_backing_dev_info. Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 13 ------------- mm/madvise.c | 17 ++++++++++------- mm/shmem.c | 25 +++++++------------------ mm/swap.c | 2 -- mm/swap_state.c | 7 +------ 5 files changed, 18 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5da6012b7a14..e936cea856dc 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -238,8 +238,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_WRITE_MAP: Can be mapped for writing * BDI_CAP_EXEC_MAP: Can be mapped for execution * - * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. - * * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. 
*/ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 @@ -250,7 +248,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_WRITE_MAP 0x00000020 #define BDI_CAP_EXEC_MAP 0x00000040 #define BDI_CAP_NO_ACCT_WB 0x00000080 -#define BDI_CAP_SWAP_BACKED 0x00000100 #define BDI_CAP_STABLE_WRITES 0x00000200 #define BDI_CAP_STRICTLIMIT 0x00000400 @@ -329,11 +326,6 @@ static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) BDI_CAP_NO_WRITEBACK)); } -static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) -{ - return bdi->capabilities & BDI_CAP_SWAP_BACKED; -} - static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); @@ -344,11 +336,6 @@ static inline bool mapping_cap_account_dirty(struct address_space *mapping) return bdi_cap_account_dirty(mapping->backing_dev_info); } -static inline bool mapping_cap_swap_backed(struct address_space *mapping) -{ - return bdi_cap_swap_backed(mapping->backing_dev_info); -} - static inline int bdi_sched_wait(void *word) { schedule(); diff --git a/mm/madvise.c b/mm/madvise.c index a271adc93289..1383a8916bc3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -222,19 +222,22 @@ static long madvise_willneed(struct vm_area_struct *vma, struct file *file = vma->vm_file; #ifdef CONFIG_SWAP - if (!file || mapping_cap_swap_backed(file->f_mapping)) { + if (!file) { *prev = vma; - if (!file) - force_swapin_readahead(vma, start, end); - else - force_shm_swapin_readahead(vma, start, end, - file->f_mapping); + force_swapin_readahead(vma, start, end); return 0; } -#endif + if (shmem_mapping(file->f_mapping)) { + *prev = vma; + force_shm_swapin_readahead(vma, start, end, + file->f_mapping); + return 0; + } +#else if (!file) return -EBADF; +#endif if (file->f_mapping->a_ops->get_xip_mem) { /* no bad return value, but ignore advice */ diff --git a/mm/shmem.c b/mm/shmem.c index 73ba1df7c8ba..1b77eaf589fd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -191,11 +191,6 @@ static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; -static struct backing_dev_info shmem_backing_dev_info __read_mostly = { - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, -}; - static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); @@ -765,11 +760,11 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) goto redirty; /* - * shmem_backing_dev_info's capabilities prevent regular writeback or - * sync from ever calling shmem_writepage; but a stacking filesystem - * might use ->writepage of its underlying filesystem, in which case - * tmpfs should write out to swap only in response to memory pressure, - * and not for the writeback threads or sync. + * Our capabilities prevent regular writeback or sync from ever calling + * shmem_writepage; but a stacking filesystem might use ->writepage of + * its underlying filesystem, in which case tmpfs should write out to + * swap only in response to memory pressure, and not for the writeback + * threads or sync. */ if (!wbc->for_reclaim) { WARN_ON_ONCE(1); /* Still happens? Tell us about it! 
*/ @@ -1415,7 +1410,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_blocks = 0; - inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; + inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_generation = get_seconds(); info = SHMEM_I(inode); @@ -1461,7 +1456,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode bool shmem_mapping(struct address_space *mapping) { - return mapping->backing_dev_info == &shmem_backing_dev_info; + return mapping->host->i_sb->s_op == &shmem_ops; } #ifdef CONFIG_TMPFS @@ -3226,10 +3221,6 @@ int __init shmem_init(void) if (shmem_inode_cachep) return 0; - error = bdi_init(&shmem_backing_dev_info); - if (error) - goto out4; - error = shmem_init_inodecache(); if (error) goto out3; @@ -3253,8 +3244,6 @@ out1: out2: shmem_destroy_inodecache(); out3: - bdi_destroy(&shmem_backing_dev_info); -out4: shm_mnt = ERR_PTR(error); return error; } diff --git a/mm/swap.c b/mm/swap.c index 8a12b33936b4..4e0109a2f37b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1138,8 +1138,6 @@ void __init swap_setup(void) #ifdef CONFIG_SWAP int i; - if (bdi_init(swapper_spaces[0].backing_dev_info)) - panic("Failed to init swap bdi"); for (i = 0; i < MAX_SWAPFILES; i++) { spin_lock_init(&swapper_spaces[i].tree_lock); INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); diff --git a/mm/swap_state.c b/mm/swap_state.c index 9711342987a0..1c137b69ecde 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -32,17 +32,12 @@ static const struct address_space_operations swap_aops = { #endif }; -static struct backing_dev_info swap_backing_dev_info = { - .name = "swap", - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, -}; - struct address_space swapper_spaces[MAX_SWAPFILES] = { [0 ... MAX_SWAPFILES - 1] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, - .backing_dev_info = &swap_backing_dev_info, + .backing_dev_info = &noop_backing_dev_info, } }; -- cgit v1.2.3 From b4caecd48005fbed3949dde6c1cb233142fd69e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:32 +0100 Subject: fs: introduce f_op->mmap_capabilities for nommu mmap support Since "BDI: Provide backing device capability information [try #3]" the backing_dev_info structure also provides flags for the kind of mmap operation available in a nommu environment, which is entirely unrelated to its original purpose. Introduce a new nommu-only file operation to provide this information to the nommu mmap code instead. Splitting this from the backing_dev_info structure allows removing lots of backing_dev_info instances that aren't otherwise needed, and entirely gets rid of the concept of providing a backing_dev_info for a character device. It also removes the need for the mtd_inodefs filesystem. 
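As a sketch of what the new hook looks like from a driver's point of view, a hypothetical nommu-only character driver that supports only copy mappings might wire it up as below (the example_* names are invented for illustration; the real conversions of mem.c and mtdchar.c follow in the diff):

#include <linux/fs.h>
#include <linux/module.h>

#ifndef CONFIG_MMU
/*
 * Advertise what kind of nommu mmap this file supports: here only
 * "copy" mappings, i.e. mmap() gets a private RAM copy of the data
 * rather than a direct window onto the device.
 */
static unsigned example_mmap_capabilities(struct file *file)
{
	return NOMMU_MAP_COPY;
}
#endif

static int example_open(struct inode *inode, struct file *file)
{
	return 0;
}

static const struct file_operations example_fops = {
	.owner	= THIS_MODULE,
	.open	= example_open,
#ifndef CONFIG_MMU
	/* replaces the old per-bdi BDI_CAP_MAP_* capability bits */
	.mmap_capabilities = example_mmap_capabilities,
#endif
};

Keying the capabilities off the file rather than the backing_dev_info is also what lets a driver return a different answer per device, as mtd_mmap_capabilities() does below by switching on the MTD type.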
Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Acked-by: Brian Norris Signed-off-by: Jens Axboe --- Documentation/nommu-mmap.txt | 8 +-- block/blk-core.c | 2 +- drivers/char/mem.c | 64 ++++++++++---------- drivers/mtd/mtdchar.c | 72 ++++------------------ drivers/mtd/mtdconcat.c | 10 ---- drivers/mtd/mtdcore.c | 80 +++++++------------------ drivers/mtd/mtdpart.c | 1 - drivers/staging/lustre/lustre/llite/llite_lib.c | 2 +- fs/9p/v9fs.c | 2 +- fs/afs/volume.c | 2 +- fs/aio.c | 14 +---- fs/btrfs/disk-io.c | 3 +- fs/char_dev.c | 24 -------- fs/cifs/connect.c | 2 +- fs/coda/inode.c | 2 +- fs/configfs/configfs_internal.h | 2 - fs/configfs/inode.c | 18 +----- fs/configfs/mount.c | 11 +--- fs/ecryptfs/main.c | 2 +- fs/exofs/super.c | 2 +- fs/ncpfs/inode.c | 2 +- fs/ramfs/file-nommu.c | 7 +++ fs/ramfs/inode.c | 22 +------ fs/romfs/mmap-nommu.c | 10 ++++ fs/ubifs/super.c | 2 +- include/linux/backing-dev.h | 33 ++-------- include/linux/cdev.h | 2 - include/linux/fs.h | 23 +++++++ include/linux/mtd/mtd.h | 2 + mm/backing-dev.c | 7 +-- mm/nommu.c | 69 ++++++++++----------- security/security.c | 13 ++-- 32 files changed, 169 insertions(+), 346 deletions(-) (limited to 'include/linux') diff --git a/Documentation/nommu-mmap.txt b/Documentation/nommu-mmap.txt index 8e1ddec2c78a..ae57b9ea0d41 100644 --- a/Documentation/nommu-mmap.txt +++ b/Documentation/nommu-mmap.txt @@ -43,12 +43,12 @@ and it's also much more restricted in the latter case: even if this was created by another process. - If possible, the file mapping will be directly on the backing device - if the backing device has the BDI_CAP_MAP_DIRECT capability and + if the backing device has the NOMMU_MAP_DIRECT capability and appropriate mapping protection capabilities. Ramfs, romfs, cramfs and mtd might all permit this. - If the backing device can't or won't permit direct sharing, - but does have the BDI_CAP_MAP_COPY capability, then a copy of the + but does have the NOMMU_MAP_COPY capability, then a copy of the appropriate bit of the file will be read into a contiguous bit of memory and any extraneous space beyond the EOF will be cleared @@ -220,7 +220,7 @@ directly (can't be copied). The file->f_op->mmap() operation will be called to actually inaugurate the mapping. It can be rejected at that point. Returning the ENOSYS error will -cause the mapping to be copied instead if BDI_CAP_MAP_COPY is specified. +cause the mapping to be copied instead if NOMMU_MAP_COPY is specified. The vm_ops->close() routine will be invoked when the last mapping on a chardev is removed. An existing mapping will be shared, partially or not, if possible @@ -232,7 +232,7 @@ want to handle it, despite the fact it's got an operation. For instance, it might try directing the call to a secondary driver which turns out not to implement it. Such is the case for the framebuffer driver which attempts to direct the call to the device-specific driver. Under such circumstances, the -mapping request will be rejected if BDI_CAP_MAP_COPY is not specified, and a +mapping request will be rejected if NOMMU_MAP_COPY is not specified, and a copy mapped otherwise. 
IMPORTANT NOTE: diff --git a/block/blk-core.c b/block/blk-core.c index 3ad405571dcc..928aac29bccd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -607,7 +607,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q->backing_dev_info.state = 0; - q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; + q->backing_dev_info.capabilities = 0; q->backing_dev_info.name = "block"; q->node = node_id; diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 4c58333b4257..9a6b63783a94 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -287,13 +287,24 @@ static unsigned long get_unmapped_area_mem(struct file *file, return pgoff << PAGE_SHIFT; } +/* permit direct mmap, for read, write or exec */ +static unsigned memory_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | + NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC; +} + +static unsigned zero_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_COPY; +} + /* can't do an in-place private mapping if there's no MMU */ static inline int private_mapping_ok(struct vm_area_struct *vma) { return vma->vm_flags & VM_MAYSHARE; } #else -#define get_unmapped_area_mem NULL static inline int private_mapping_ok(struct vm_area_struct *vma) { @@ -721,7 +732,10 @@ static const struct file_operations mem_fops = { .write = write_mem, .mmap = mmap_mem, .open = open_mem, +#ifndef CONFIG_MMU .get_unmapped_area = get_unmapped_area_mem, + .mmap_capabilities = memory_mmap_capabilities, +#endif }; #ifdef CONFIG_DEVKMEM @@ -731,7 +745,10 @@ static const struct file_operations kmem_fops = { .write = write_kmem, .mmap = mmap_kmem, .open = open_kmem, +#ifndef CONFIG_MMU .get_unmapped_area = get_unmapped_area_mem, + .mmap_capabilities = memory_mmap_capabilities, +#endif }; #endif @@ -760,16 +777,9 @@ static const struct file_operations zero_fops = { .read_iter = read_iter_zero, .aio_write = aio_write_zero, .mmap = mmap_zero, -}; - -/* - * capabilities for /dev/zero - * - permits private mappings, "copies" are taken of the source of zeros - * - no writeback happens - */ -static struct backing_dev_info zero_bdi = { - .name = "char/mem", - .capabilities = BDI_CAP_MAP_COPY | BDI_CAP_NO_ACCT_AND_WRITEBACK, +#ifndef CONFIG_MMU + .mmap_capabilities = zero_mmap_capabilities, +#endif }; static const struct file_operations full_fops = { @@ -783,22 +793,22 @@ static const struct memdev { const char *name; umode_t mode; const struct file_operations *fops; - struct backing_dev_info *dev_info; + fmode_t fmode; } devlist[] = { - [1] = { "mem", 0, &mem_fops, &directly_mappable_cdev_bdi }, + [1] = { "mem", 0, &mem_fops, FMODE_UNSIGNED_OFFSET }, #ifdef CONFIG_DEVKMEM - [2] = { "kmem", 0, &kmem_fops, &directly_mappable_cdev_bdi }, + [2] = { "kmem", 0, &kmem_fops, FMODE_UNSIGNED_OFFSET }, #endif - [3] = { "null", 0666, &null_fops, NULL }, + [3] = { "null", 0666, &null_fops, 0 }, #ifdef CONFIG_DEVPORT - [4] = { "port", 0, &port_fops, NULL }, + [4] = { "port", 0, &port_fops, 0 }, #endif - [5] = { "zero", 0666, &zero_fops, &zero_bdi }, - [7] = { "full", 0666, &full_fops, NULL }, - [8] = { "random", 0666, &random_fops, NULL }, - [9] = { "urandom", 0666, &urandom_fops, NULL }, + [5] = { "zero", 0666, &zero_fops, 0 }, + [7] = { "full", 0666, &full_fops, 0 }, + [8] = { "random", 0666, &random_fops, 0 }, + [9] = { "urandom", 0666, &urandom_fops, 0 }, #ifdef CONFIG_PRINTK - [11] = { "kmsg", 0644, &kmsg_fops, NULL }, + [11] = { "kmsg", 0644, &kmsg_fops, 0 }, #endif }; @@ 
-816,12 +826,7 @@ static int memory_open(struct inode *inode, struct file *filp) return -ENXIO; filp->f_op = dev->fops; - if (dev->dev_info) - filp->f_mapping->backing_dev_info = dev->dev_info; - - /* Is /dev/mem or /dev/kmem ? */ - if (dev->dev_info == &directly_mappable_cdev_bdi) - filp->f_mode |= FMODE_UNSIGNED_OFFSET; + filp->f_mode |= dev->fmode; if (dev->fops->open) return dev->fops->open(inode, filp); @@ -846,11 +851,6 @@ static struct class *mem_class; static int __init chr_dev_init(void) { int minor; - int err; - - err = bdi_init(&zero_bdi); - if (err) - return err; if (register_chrdev(MEM_MAJOR, "mem", &memory_fops)) printk("unable to get major %d for memory devs\n", MEM_MAJOR); diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 53563955931b..55fa27ecf4e1 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -49,7 +49,6 @@ static DEFINE_MUTEX(mtd_mutex); */ struct mtd_file_info { struct mtd_info *mtd; - struct inode *ino; enum mtd_file_modes mode; }; @@ -59,10 +58,6 @@ static loff_t mtdchar_lseek(struct file *file, loff_t offset, int orig) return fixed_size_llseek(file, offset, orig, mfi->mtd->size); } -static int count; -static struct vfsmount *mnt; -static struct file_system_type mtd_inodefs_type; - static int mtdchar_open(struct inode *inode, struct file *file) { int minor = iminor(inode); @@ -70,7 +65,6 @@ static int mtdchar_open(struct inode *inode, struct file *file) int ret = 0; struct mtd_info *mtd; struct mtd_file_info *mfi; - struct inode *mtd_ino; pr_debug("MTD_open\n"); @@ -78,10 +72,6 @@ static int mtdchar_open(struct inode *inode, struct file *file) if ((file->f_mode & FMODE_WRITE) && (minor & 1)) return -EACCES; - ret = simple_pin_fs(&mtd_inodefs_type, &mnt, &count); - if (ret) - return ret; - mutex_lock(&mtd_mutex); mtd = get_mtd_device(NULL, devnum); @@ -95,43 +85,26 @@ static int mtdchar_open(struct inode *inode, struct file *file) goto out1; } - mtd_ino = iget_locked(mnt->mnt_sb, devnum); - if (!mtd_ino) { - ret = -ENOMEM; - goto out1; - } - if (mtd_ino->i_state & I_NEW) { - mtd_ino->i_private = mtd; - mtd_ino->i_mode = S_IFCHR; - mtd_ino->i_data.backing_dev_info = mtd->backing_dev_info; - unlock_new_inode(mtd_ino); - } - file->f_mapping = mtd_ino->i_mapping; - /* You can't open it RW if it's not a writeable device */ if ((file->f_mode & FMODE_WRITE) && !(mtd->flags & MTD_WRITEABLE)) { ret = -EACCES; - goto out2; + goto out1; } mfi = kzalloc(sizeof(*mfi), GFP_KERNEL); if (!mfi) { ret = -ENOMEM; - goto out2; + goto out1; } - mfi->ino = mtd_ino; mfi->mtd = mtd; file->private_data = mfi; mutex_unlock(&mtd_mutex); return 0; -out2: - iput(mtd_ino); out1: put_mtd_device(mtd); out: mutex_unlock(&mtd_mutex); - simple_release_fs(&mnt, &count); return ret; } /* mtdchar_open */ @@ -148,12 +121,9 @@ static int mtdchar_close(struct inode *inode, struct file *file) if ((file->f_mode & FMODE_WRITE)) mtd_sync(mtd); - iput(mfi->ino); - put_mtd_device(mtd); file->private_data = NULL; kfree(mfi); - simple_release_fs(&mnt, &count); return 0; } /* mtdchar_close */ @@ -1117,6 +1087,13 @@ static unsigned long mtdchar_get_unmapped_area(struct file *file, ret = mtd_get_unmapped_area(mtd, len, offset, flags); return ret == -EOPNOTSUPP ? 
-ENODEV : ret; } + +static unsigned mtdchar_mmap_capabilities(struct file *file) +{ + struct mtd_file_info *mfi = file->private_data; + + return mtd_mmap_capabilities(mfi->mtd); +} #endif /* @@ -1160,27 +1137,10 @@ static const struct file_operations mtd_fops = { .mmap = mtdchar_mmap, #ifndef CONFIG_MMU .get_unmapped_area = mtdchar_get_unmapped_area, + .mmap_capabilities = mtdchar_mmap_capabilities, #endif }; -static const struct super_operations mtd_ops = { - .drop_inode = generic_delete_inode, - .statfs = simple_statfs, -}; - -static struct dentry *mtd_inodefs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_pseudo(fs_type, "mtd_inode:", &mtd_ops, NULL, MTD_INODE_FS_MAGIC); -} - -static struct file_system_type mtd_inodefs_type = { - .name = "mtd_inodefs", - .mount = mtd_inodefs_mount, - .kill_sb = kill_anon_super, -}; -MODULE_ALIAS_FS("mtd_inodefs"); - int __init init_mtdchar(void) { int ret; @@ -1193,23 +1153,11 @@ int __init init_mtdchar(void) return ret; } - ret = register_filesystem(&mtd_inodefs_type); - if (ret) { - pr_err("Can't register mtd_inodefs filesystem, error %d\n", - ret); - goto err_unregister_chdev; - } - - return ret; - -err_unregister_chdev: - __unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd"); return ret; } void __exit cleanup_mtdchar(void) { - unregister_filesystem(&mtd_inodefs_type); __unregister_chrdev(MTD_CHAR_MAJOR, 0, 1 << MINORBITS, "mtd"); } diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index b9000563b9f4..eacc3aac7327 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -732,8 +732,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c concat->mtd.ecc_stats.badblocks = subdev[0]->ecc_stats.badblocks; - concat->mtd.backing_dev_info = subdev[0]->backing_dev_info; - concat->subdev[0] = subdev[0]; for (i = 1; i < num_devs; i++) { @@ -761,14 +759,6 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c subdev[i]->flags & MTD_WRITEABLE; } - /* only permit direct mapping if the BDIs are all the same - * - copy-mapping is still permitted - */ - if (concat->mtd.backing_dev_info != - subdev[i]->backing_dev_info) - concat->mtd.backing_dev_info = - &default_backing_dev_info; - concat->mtd.size += subdev[i]->size; concat->mtd.ecc_stats.badblocks += subdev[i]->ecc_stats.badblocks; diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 4c611871d7e6..ff38a1df22f7 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -43,33 +43,7 @@ #include "mtdcore.h" -/* - * backing device capabilities for non-mappable devices (such as NAND flash) - * - permits private mappings, copies are taken of the data - */ -static struct backing_dev_info mtd_bdi_unmappable = { - .capabilities = BDI_CAP_MAP_COPY, -}; - -/* - * backing device capabilities for R/O mappable devices (such as ROM) - * - permits private mappings, copies are taken of the data - * - permits non-writable shared mappings - */ -static struct backing_dev_info mtd_bdi_ro_mappable = { - .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT | - BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP), -}; - -/* - * backing device capabilities for writable mappable devices (such as RAM) - * - permits private mappings, copies are taken of the data - * - permits non-writable shared mappings - */ -static struct backing_dev_info mtd_bdi_rw_mappable = { - .capabilities = (BDI_CAP_MAP_COPY | BDI_CAP_MAP_DIRECT | - BDI_CAP_EXEC_MAP | BDI_CAP_READ_MAP | - BDI_CAP_WRITE_MAP), 
+static struct backing_dev_info mtd_bdi = { }; static int mtd_cls_suspend(struct device *dev, pm_message_t state); @@ -365,6 +339,22 @@ static struct device_type mtd_devtype = { .release = mtd_release, }; +#ifndef CONFIG_MMU +unsigned mtd_mmap_capabilities(struct mtd_info *mtd) +{ + switch (mtd->type) { + case MTD_RAM: + return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC | + NOMMU_MAP_READ | NOMMU_MAP_WRITE; + case MTD_ROM: + return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_EXEC | + NOMMU_MAP_READ; + default: + return NOMMU_MAP_COPY; + } +} +#endif + /** * add_mtd_device - register an MTD device * @mtd: pointer to new MTD device info structure @@ -380,19 +370,7 @@ int add_mtd_device(struct mtd_info *mtd) struct mtd_notifier *not; int i, error; - if (!mtd->backing_dev_info) { - switch (mtd->type) { - case MTD_RAM: - mtd->backing_dev_info = &mtd_bdi_rw_mappable; - break; - case MTD_ROM: - mtd->backing_dev_info = &mtd_bdi_ro_mappable; - break; - default: - mtd->backing_dev_info = &mtd_bdi_unmappable; - break; - } - } + mtd->backing_dev_info = &mtd_bdi; BUG_ON(mtd->writesize == 0); mutex_lock(&mtd_table_mutex); @@ -1237,17 +1215,9 @@ static int __init init_mtd(void) if (ret) goto err_reg; - ret = mtd_bdi_init(&mtd_bdi_unmappable, "mtd-unmap"); - if (ret) - goto err_bdi1; - - ret = mtd_bdi_init(&mtd_bdi_ro_mappable, "mtd-romap"); - if (ret) - goto err_bdi2; - - ret = mtd_bdi_init(&mtd_bdi_rw_mappable, "mtd-rwmap"); + ret = mtd_bdi_init(&mtd_bdi, "mtd"); if (ret) - goto err_bdi3; + goto err_bdi; proc_mtd = proc_create("mtd", 0, NULL, &mtd_proc_ops); @@ -1260,11 +1230,7 @@ static int __init init_mtd(void) out_procfs: if (proc_mtd) remove_proc_entry("mtd", NULL); -err_bdi3: - bdi_destroy(&mtd_bdi_ro_mappable); -err_bdi2: - bdi_destroy(&mtd_bdi_unmappable); -err_bdi1: +err_bdi: class_unregister(&mtd_class); err_reg: pr_err("Error registering mtd class or bdi: %d\n", ret); @@ -1277,9 +1243,7 @@ static void __exit cleanup_mtd(void) if (proc_mtd) remove_proc_entry("mtd", NULL); class_unregister(&mtd_class); - bdi_destroy(&mtd_bdi_unmappable); - bdi_destroy(&mtd_bdi_ro_mappable); - bdi_destroy(&mtd_bdi_rw_mappable); + bdi_destroy(&mtd_bdi); } module_init(init_mtd); diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index a3e3a7d074d5..e779de315ade 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -378,7 +378,6 @@ static struct mtd_part *allocate_partition(struct mtd_info *master, slave->mtd.name = name; slave->mtd.owner = master->owner; - slave->mtd.backing_dev_info = master->backing_dev_info; /* NOTE: we don't arrange MTDs as a tree; it'd be error-prone * to have the same data be in two different partitions. 
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c index a3367bfb1456..d5b149c5542d 100644 --- a/drivers/staging/lustre/lustre/llite/llite_lib.c +++ b/drivers/staging/lustre/lustre/llite/llite_lib.c @@ -987,7 +987,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) if (err) goto out_free; lsi->lsi_flags |= LSI_BDI_INITIALIZED; - lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY; + lsi->lsi_bdi.capabilities = 0; err = ll_bdi_register(&lsi->lsi_bdi); if (err) goto out_free; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 6894b085f0ee..620d93489539 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -335,7 +335,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, } init_rwsem(&v9ses->rename_sem); - rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY); + rc = bdi_setup_and_register(&v9ses->bdi, "9p"); if (rc) { kfree(v9ses->aname); kfree(v9ses->uname); diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 2b607257820c..d142a2449e65 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -106,7 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) volume->cell = params->cell; volume->vid = vlocation->vldb.vid[params->type]; - ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY); + ret = bdi_setup_and_register(&volume->bdi, "afs"); if (ret) goto error_bdi; diff --git a/fs/aio.c b/fs/aio.c index 1b7893ecc296..6f13d3fab07f 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -165,15 +165,6 @@ static struct vfsmount *aio_mnt; static const struct file_operations aio_ring_fops; static const struct address_space_operations aio_ctx_aops; -/* Backing dev info for aio fs. - * -no dirty page accounting or writeback happens - */ -static struct backing_dev_info aio_fs_backing_dev_info = { - .name = "aiofs", - .state = 0, - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY, -}; - static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) { struct qstr this = QSTR_INIT("[aio]", 5); @@ -185,7 +176,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages) inode->i_mapping->a_ops = &aio_ctx_aops; inode->i_mapping->private_data = ctx; - inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info; + inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_size = PAGE_SIZE * nr_pages; path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this); @@ -230,9 +221,6 @@ static int __init aio_setup(void) if (IS_ERR(aio_mnt)) panic("Failed to create aio fs mount."); - if (bdi_init(&aio_fs_backing_dev_info)) - panic("Failed to init aio fs backing dev info."); - kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8c63419a7f70..afc4092989cd 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1715,8 +1715,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; - bdi->capabilities = BDI_CAP_MAP_COPY; - err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY); + err = bdi_setup_and_register(bdi, "btrfs"); if (err) return err; diff --git a/fs/char_dev.c b/fs/char_dev.c index 67b2007f10fe..ea06a3d0364c 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -24,27 +24,6 @@ #include "internal.h" -/* - * capabilities for /dev/mem, /dev/kmem and similar directly mappable character - * devices - * - permits shared-mmap for read, write and/or exec - * - does not permit private 
mmap in NOMMU mode (can't do COW) - * - no readahead or I/O queue unplugging required - */ -struct backing_dev_info directly_mappable_cdev_bdi = { - .name = "char", - .capabilities = ( -#ifdef CONFIG_MMU - /* permit private copies of the data to be taken */ - BDI_CAP_MAP_COPY | -#endif - /* permit direct mmap, for read, write or exec */ - BDI_CAP_MAP_DIRECT | - BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP | - /* no writeback happens */ - BDI_CAP_NO_ACCT_AND_WRITEBACK), -}; - static struct kobj_map *cdev_map; static DEFINE_MUTEX(chrdevs_lock); @@ -575,8 +554,6 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data) void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); - if (bdi_init(&directly_mappable_cdev_bdi)) - panic("Failed to init directly mappable cdev bdi"); } @@ -590,4 +567,3 @@ EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); -EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2a772da16b83..d3aa999ab785 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3446,7 +3446,7 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) int referral_walks_count = 0; #endif - rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); + rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs"); if (rc) return rc; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index b945410bfcd5..82ec68b59208 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -183,7 +183,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) goto unlock_out; } - error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); + error = bdi_setup_and_register(&vc->bdi, "coda"); if (error) goto unlock_out; diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index bd4a3c167091..a315677e44d3 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -70,8 +70,6 @@ extern int configfs_is_root(struct config_item *item); extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *); extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *)); -extern int configfs_inode_init(void); -extern void configfs_inode_exit(void); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); extern int configfs_make_dirent(struct configfs_dirent *, diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 5946ad98053f..0ad6b4d6de00 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -50,12 +50,6 @@ static const struct address_space_operations configfs_aops = { .write_end = simple_write_end, }; -static struct backing_dev_info configfs_backing_dev_info = { - .name = "configfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - static const struct inode_operations configfs_inode_operations ={ .setattr = configfs_setattr, }; @@ -137,7 +131,7 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, if (inode) { inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &configfs_aops; - inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; + inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_op = &configfs_inode_operations; if (sd->s_iattr) { @@ -283,13 +277,3 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) } mutex_unlock(&dir->d_inode->i_mutex); } 
- -int __init configfs_inode_init(void) -{ - return bdi_init(&configfs_backing_dev_info); -} - -void configfs_inode_exit(void) -{ - bdi_destroy(&configfs_backing_dev_info); -} diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index f6c285833390..da94e41bdbf6 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -145,19 +145,13 @@ static int __init configfs_init(void) if (!config_kobj) goto out2; - err = configfs_inode_init(); - if (err) - goto out3; - err = register_filesystem(&configfs_fs_type); if (err) - goto out4; + goto out3; return 0; -out4: - pr_err("Unable to register filesystem!\n"); - configfs_inode_exit(); out3: + pr_err("Unable to register filesystem!\n"); kobject_put(config_kobj); out2: kmem_cache_destroy(configfs_dir_cachep); @@ -172,7 +166,6 @@ static void __exit configfs_exit(void) kobject_put(config_kobj); kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; - configfs_inode_exit(); } MODULE_AUTHOR("Oracle"); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index d9eb84bda559..1895d60f4122 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -520,7 +520,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } - rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); + rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs"); if (rc) goto out1; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 95965503afcb..fcc2e565f540 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -836,7 +836,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); + ret = bdi_setup_and_register(&sbi->bdi, "exofs"); if (ret) { EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); dput(sb->s_root); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index e31e589369a4..a699a3fc62c0 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -560,7 +560,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) server = NCP_SBP(sb); memset(server, 0, sizeof(*server)); - error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); + error = bdi_setup_and_register(&server->bdi, "ncpfs"); if (error) goto out_fput; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index bbafbde3471a..f6ab41b39612 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -34,7 +34,14 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long flags); static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); +static unsigned ramfs_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | NOMMU_MAP_COPY | NOMMU_MAP_READ | + NOMMU_MAP_WRITE | NOMMU_MAP_EXEC; +} + const struct file_operations ramfs_file_operations = { + .mmap_capabilities = ramfs_mmap_capabilities, .mmap = ramfs_nommu_mmap, .get_unmapped_area = ramfs_nommu_get_unmapped_area, .read = new_sync_read, diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index d365b1c4eb3c..ad4d712002f4 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -50,14 +50,6 @@ static const struct address_space_operations ramfs_aops = { .set_page_dirty = __set_page_dirty_no_writeback, }; -static struct backing_dev_info ramfs_backing_dev_info = { - .name = "ramfs", - .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | - BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | - BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, -}; - struct inode 
*ramfs_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode, dev_t dev) { @@ -67,7 +59,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_mapping->a_ops = &ramfs_aops; - inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; + inode->i_mapping->backing_dev_info = &noop_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -267,19 +259,9 @@ static struct file_system_type ramfs_fs_type = { int __init init_ramfs_fs(void) { static unsigned long once; - int err; if (test_and_set_bit(0, &once)) return 0; - - err = bdi_init(&ramfs_backing_dev_info); - if (err) - return err; - - err = register_filesystem(&ramfs_fs_type); - if (err) - bdi_destroy(&ramfs_backing_dev_info); - - return err; + return register_filesystem(&ramfs_fs_type); } fs_initcall(init_ramfs_fs); diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index ea06c7554860..7da9e2153953 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -70,6 +70,15 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma) return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; } +static unsigned romfs_mmap_capabilities(struct file *file) +{ + struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd; + + if (!mtd) + return NOMMU_MAP_COPY; + return mtd_mmap_capabilities(mtd); +} + const struct file_operations romfs_ro_fops = { .llseek = generic_file_llseek, .read = new_sync_read, @@ -77,4 +86,5 @@ const struct file_operations romfs_ro_fops = { .splice_read = generic_file_splice_read, .mmap = romfs_mmap, .get_unmapped_area = romfs_get_unmapped_area, + .mmap_capabilities = romfs_mmap_capabilities, }; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 106bf20629ce..ed93dc6ae245 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2017,7 +2017,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) * Read-ahead will be disabled because @c->bdi.ra_pages is 0. */ c->bdi.name = "ubifs", - c->bdi.capabilities = BDI_CAP_MAP_COPY; + c->bdi.capabilities = 0; err = bdi_init(&c->bdi); if (err) goto out_close; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e936cea856dc..478f95d92d73 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -114,7 +114,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...); int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); void bdi_unregister(struct backing_dev_info *bdi); -int __must_check bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); +int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); @@ -228,42 +228,17 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_NO_ACCT_DIRTY: Dirty pages shouldn't contribute to accounting * BDI_CAP_NO_WRITEBACK: Don't write pages back * BDI_CAP_NO_ACCT_WB: Don't automatically account writeback pages - * - * These flags let !MMU mmap() govern direct device mapping vs immediate - * copying more easily for MAP_PRIVATE, especially for ROM filesystems. 
- * - * BDI_CAP_MAP_COPY: Copy can be mapped (MAP_PRIVATE) - * BDI_CAP_MAP_DIRECT: Can be mapped directly (MAP_SHARED) - * BDI_CAP_READ_MAP: Can be mapped for reading - * BDI_CAP_WRITE_MAP: Can be mapped for writing - * BDI_CAP_EXEC_MAP: Can be mapped for execution - * * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 -#define BDI_CAP_MAP_COPY 0x00000004 -#define BDI_CAP_MAP_DIRECT 0x00000008 -#define BDI_CAP_READ_MAP 0x00000010 -#define BDI_CAP_WRITE_MAP 0x00000020 -#define BDI_CAP_EXEC_MAP 0x00000040 -#define BDI_CAP_NO_ACCT_WB 0x00000080 -#define BDI_CAP_STABLE_WRITES 0x00000200 -#define BDI_CAP_STRICTLIMIT 0x00000400 - -#define BDI_CAP_VMFLAGS \ - (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) +#define BDI_CAP_NO_ACCT_WB 0x00000004 +#define BDI_CAP_STABLE_WRITES 0x00000008 +#define BDI_CAP_STRICTLIMIT 0x00000010 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) -#if defined(VM_MAYREAD) && \ - (BDI_CAP_READ_MAP != VM_MAYREAD || \ - BDI_CAP_WRITE_MAP != VM_MAYWRITE || \ - BDI_CAP_EXEC_MAP != VM_MAYEXEC) -#error please change backing_dev_info::capabilities flags -#endif - extern struct backing_dev_info default_backing_dev_info; extern struct backing_dev_info noop_backing_dev_info; diff --git a/include/linux/cdev.h b/include/linux/cdev.h index fb4591977b03..f8763615a5f2 100644 --- a/include/linux/cdev.h +++ b/include/linux/cdev.h @@ -30,6 +30,4 @@ void cdev_del(struct cdev *); void cd_forget(struct inode *); -extern struct backing_dev_info directly_mappable_cdev_bdi; - #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13077b6..1dada399aa23 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1502,6 +1502,26 @@ struct block_device_operations; #define HAVE_COMPAT_IOCTL 1 #define HAVE_UNLOCKED_IOCTL 1 +/* + * These flags let !MMU mmap() govern direct device mapping vs immediate + * copying more easily for MAP_PRIVATE, especially for ROM filesystems. 
+ * + * NOMMU_MAP_COPY: Copy can be mapped (MAP_PRIVATE) + * NOMMU_MAP_DIRECT: Can be mapped directly (MAP_SHARED) + * NOMMU_MAP_READ: Can be mapped for reading + * NOMMU_MAP_WRITE: Can be mapped for writing + * NOMMU_MAP_EXEC: Can be mapped for execution + */ +#define NOMMU_MAP_COPY 0x00000001 +#define NOMMU_MAP_DIRECT 0x00000008 +#define NOMMU_MAP_READ VM_MAYREAD +#define NOMMU_MAP_WRITE VM_MAYWRITE +#define NOMMU_MAP_EXEC VM_MAYEXEC + +#define NOMMU_VMFLAGS \ + (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC) + + struct iov_iter; struct file_operations { @@ -1536,6 +1556,9 @@ struct file_operations { long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); +#ifndef CONFIG_MMU + unsigned (*mmap_capabilities)(struct file *); +#endif }; struct inode_operations { diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 031ff3a9a0bd..3301c4c289d6 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -408,4 +408,6 @@ static inline int mtd_is_bitflip_or_eccerr(int err) { return mtd_is_bitflip(err) || mtd_is_eccerr(err); } +unsigned mtd_mmap_capabilities(struct mtd_info *mtd); + #endif /* __MTD_MTD_H__ */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0ae0df55000b..16c68958aeda 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -17,8 +17,6 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); struct backing_dev_info default_backing_dev_info = { .name = "default", .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, - .state = 0, - .capabilities = BDI_CAP_MAP_COPY, }; EXPORT_SYMBOL_GPL(default_backing_dev_info); @@ -513,13 +511,12 @@ EXPORT_SYMBOL(bdi_destroy); * For use from filesystems to quickly init and register a bdi associated * with dirty writeback */ -int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, - unsigned int cap) +int bdi_setup_and_register(struct backing_dev_info *bdi, char *name) { int err; bdi->name = name; - bdi->capabilities = cap; + bdi->capabilities = 0; err = bdi_init(bdi); if (err) return err; diff --git a/mm/nommu.c b/mm/nommu.c index b51eadf6d952..13af96f35a4b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -946,9 +946,6 @@ static int validate_mmap_request(struct file *file, return -EOVERFLOW; if (file) { - /* validate file mapping requests */ - struct address_space *mapping; - /* files must support mmap */ if (!file->f_op->mmap) return -ENODEV; @@ -957,28 +954,22 @@ static int validate_mmap_request(struct file *file, * - we support chardevs that provide their own "memory" * - we support files/blockdevs that are memory backed */ - mapping = file->f_mapping; - if (!mapping) - mapping = file_inode(file)->i_mapping; - - capabilities = 0; - if (mapping && mapping->backing_dev_info) - capabilities = mapping->backing_dev_info->capabilities; - - if (!capabilities) { + if (file->f_op->mmap_capabilities) { + capabilities = file->f_op->mmap_capabilities(file); + } else { /* no explicit capabilities set, so assume some * defaults */ switch (file_inode(file)->i_mode & S_IFMT) { case S_IFREG: case S_IFBLK: - capabilities = BDI_CAP_MAP_COPY; + capabilities = NOMMU_MAP_COPY; break; case S_IFCHR: capabilities = - BDI_CAP_MAP_DIRECT | - BDI_CAP_READ_MAP | - BDI_CAP_WRITE_MAP; + NOMMU_MAP_DIRECT | + NOMMU_MAP_READ | + NOMMU_MAP_WRITE; break; default: @@ -989,9 +980,9 @@ static int validate_mmap_request(struct file *file, /* eliminate any capabilities that we can't support on this * device */ if (!file->f_op->get_unmapped_area) - capabilities &= 
~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; if (!file->f_op->read) - capabilities &= ~BDI_CAP_MAP_COPY; + capabilities &= ~NOMMU_MAP_COPY; /* The file shall have been opened with read permission. */ if (!(file->f_mode & FMODE_READ)) @@ -1010,29 +1001,29 @@ static int validate_mmap_request(struct file *file, if (locks_verify_locked(file)) return -EAGAIN; - if (!(capabilities & BDI_CAP_MAP_DIRECT)) + if (!(capabilities & NOMMU_MAP_DIRECT)) return -ENODEV; /* we mustn't privatise shared mappings */ - capabilities &= ~BDI_CAP_MAP_COPY; + capabilities &= ~NOMMU_MAP_COPY; } else { /* we're going to read the file into private memory we * allocate */ - if (!(capabilities & BDI_CAP_MAP_COPY)) + if (!(capabilities & NOMMU_MAP_COPY)) return -ENODEV; /* we don't permit a private writable mapping to be * shared with the backing device */ if (prot & PROT_WRITE) - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; } - if (capabilities & BDI_CAP_MAP_DIRECT) { - if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || - ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || - ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) + if (capabilities & NOMMU_MAP_DIRECT) { + if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) || + ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) || + ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC)) ) { - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; if (flags & MAP_SHARED) { printk(KERN_WARNING "MAP_SHARED not completely supported on !MMU\n"); @@ -1049,21 +1040,21 @@ static int validate_mmap_request(struct file *file, } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { /* handle implication of PROT_EXEC by PROT_READ */ if (current->personality & READ_IMPLIES_EXEC) { - if (capabilities & BDI_CAP_EXEC_MAP) + if (capabilities & NOMMU_MAP_EXEC) prot |= PROT_EXEC; } } else if ((prot & PROT_READ) && (prot & PROT_EXEC) && - !(capabilities & BDI_CAP_EXEC_MAP) + !(capabilities & NOMMU_MAP_EXEC) ) { /* backing file is not executable, try to copy */ - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; } } else { /* anonymous mappings are always memory backed and can be * privately mapped */ - capabilities = BDI_CAP_MAP_COPY; + capabilities = NOMMU_MAP_COPY; /* handle PROT_EXEC implication by PROT_READ */ if ((prot & PROT_READ) && @@ -1095,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file, vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); /* vm_flags |= mm->def_flags; */ - if (!(capabilities & BDI_CAP_MAP_DIRECT)) { + if (!(capabilities & NOMMU_MAP_DIRECT)) { /* attempt to share read-only copies of mapped file chunks */ vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (file && !(prot & PROT_WRITE)) @@ -1104,7 +1095,7 @@ static unsigned long determine_vm_flags(struct file *file, /* overlay a shareable mapping on the backing device or inode * if possible - used for chardevs, ramfs/tmpfs/shmfs and * romfs/cramfs */ - vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS); + vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS); if (flags & MAP_SHARED) vm_flags |= VM_SHARED; } @@ -1157,7 +1148,7 @@ static int do_mmap_private(struct vm_area_struct *vma, * shared mappings on devices or memory * - VM_MAYSHARE will be set if it may attempt to share */ - if (capabilities & BDI_CAP_MAP_DIRECT) { + if (capabilities & NOMMU_MAP_DIRECT) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { /* shouldn't 
return success if we're not sharing */ @@ -1346,7 +1337,7 @@ unsigned long do_mmap_pgoff(struct file *file, if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { /* new mapping is not a subset of the region */ - if (!(capabilities & BDI_CAP_MAP_DIRECT)) + if (!(capabilities & NOMMU_MAP_DIRECT)) goto sharing_violation; continue; } @@ -1385,7 +1376,7 @@ unsigned long do_mmap_pgoff(struct file *file, * - this is the hook for quasi-memory character devices to * tell us the location of a shared mapping */ - if (capabilities & BDI_CAP_MAP_DIRECT) { + if (capabilities & NOMMU_MAP_DIRECT) { addr = file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); if (IS_ERR_VALUE(addr)) { @@ -1397,10 +1388,10 @@ unsigned long do_mmap_pgoff(struct file *file, * the mapping so we'll have to attempt to copy * it */ ret = -ENODEV; - if (!(capabilities & BDI_CAP_MAP_COPY)) + if (!(capabilities & NOMMU_MAP_COPY)) goto error_just_free; - capabilities &= ~BDI_CAP_MAP_DIRECT; + capabilities &= ~NOMMU_MAP_DIRECT; } else { vma->vm_start = region->vm_start = addr; vma->vm_end = region->vm_end = addr + len; @@ -1411,7 +1402,7 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_region = region; /* set up the mapping - * - the region is filled in if BDI_CAP_MAP_DIRECT is still set + * - the region is filled in if NOMMU_MAP_DIRECT is still set */ if (file && vma->vm_flags & VM_SHARED) ret = do_mmap_shared_file(vma); diff --git a/security/security.c b/security/security.c index 18b35c63fc0c..a0442b20f001 100644 --- a/security/security.c +++ b/security/security.c @@ -726,16 +726,15 @@ static inline unsigned long mmap_prot(struct file *file, unsigned long prot) return prot | PROT_EXEC; /* * ditto if it's not on noexec mount, except that on !MMU we need - * BDI_CAP_EXEC_MMAP (== VM_MAYEXEC) in this case + * NOMMU_MAP_EXEC (== VM_MAYEXEC) in this case */ if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) { #ifndef CONFIG_MMU - unsigned long caps = 0; - struct address_space *mapping = file->f_mapping; - if (mapping && mapping->backing_dev_info) - caps = mapping->backing_dev_info->capabilities; - if (!(caps & BDI_CAP_EXEC_MAP)) - return prot; + if (file->f_op->mmap_capabilities) { + unsigned caps = file->f_op->mmap_capabilities(file); + if (!(caps & NOMMU_MAP_EXEC)) + return prot; + } #endif return prot | PROT_EXEC; } -- cgit v1.2.3 From de1414a654e66b81b5348dbc5259ecf2fb61655e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:36 +0100 Subject: fs: export inode_to_bdi and use it in favor of mapping->backing_dev_info Now that we got rid of the bdi abuse on character devices we can always use sb->s_bdi to get at the backing_dev_info for a file, except for the block device special case. Export inode_to_bdi and replace uses of mapping->backing_dev_info with it to prepare for the removal of mapping->backing_dev_info. 
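To make the conversion concrete, here is a sketch of the exported helper and of the mechanical call-site rewrite (illustrative only, not part of the patch; the CONFIG_BLOCK branch is an assumption reconstructed from the context elided between the two fs-writeback.c hunks below):

  struct backing_dev_info *inode_to_bdi(struct inode *inode)
  {
  	struct super_block *sb = inode->i_sb;
  #ifdef CONFIG_BLOCK
  	/* block device inodes live on the bdev pseudo-fs, so their
  	 * bdi must come from the underlying request queue instead
  	 * of sb->s_bdi (assumed from the elided hunk context) */
  	if (sb_is_blkdev_sb(sb))
  		return blk_get_backing_dev_info(I_BDEV(inode));
  #endif
  	return sb->s_bdi;
  }

Call sites then convert one for one, for example in the buffered write paths:

  -	current->backing_dev_info = mapping->backing_dev_info;
  +	current->backing_dev_info = inode_to_bdi(inode);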
Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/btrfs/file.c | 2 +- fs/ceph/file.c | 2 +- fs/ext2/ialloc.c | 2 +- fs/ext4/super.c | 2 +- fs/fs-writeback.c | 3 ++- fs/fuse/file.c | 10 +++++----- fs/gfs2/aops.c | 2 +- fs/gfs2/super.c | 2 +- fs/nfs/filelayout/filelayout.c | 2 +- fs/nfs/write.c | 6 +++--- fs/ntfs/file.c | 3 ++- fs/ocfs2/file.c | 2 +- fs/xfs/xfs_file.c | 2 +- include/linux/backing-dev.h | 6 ++++-- include/trace/events/writeback.h | 6 +++--- mm/fadvise.c | 4 ++-- mm/filemap.c | 4 ++-- mm/filemap_xip.c | 3 ++- mm/page-writeback.c | 29 +++++++++++++---------------- mm/readahead.c | 4 ++-- mm/truncate.c | 2 +- mm/vmscan.c | 4 ++-- 22 files changed, 52 insertions(+), 50 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e4090259569b..835c04a874fd 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1746,7 +1746,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, mutex_lock(&inode->i_mutex); - current->backing_dev_info = inode->i_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) { mutex_unlock(&inode->i_mutex); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ce74b394b49d..905986dd4c3c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -945,7 +945,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ - current->backing_dev_info = file->f_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 7d66fb0e4cca..6c14bb8322fa 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -170,7 +170,7 @@ static void ext2_preread_inode(struct inode *inode) struct ext2_group_desc * gdp; struct backing_dev_info *bdi; - bdi = inode->i_mapping->backing_dev_info; + bdi = inode_to_bdi(inode); if (bdi_read_congested(bdi)) return; if (bdi_write_congested(bdi)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 74c5f53595fb..ad88e601a6cd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -334,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func, static int block_device_ejected(struct super_block *sb) { struct inode *bd_inode = sb->s_bdev->bd_inode; - struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(bd_inode); return bdi->dev == NULL; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e8116a44cc29..a20b1145f4d5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -66,7 +66,7 @@ int writeback_in_progress(struct backing_dev_info *bdi) } EXPORT_SYMBOL(writeback_in_progress); -static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) +struct backing_dev_info *inode_to_bdi(struct inode *inode) { struct super_block *sb = inode->i_sb; #ifdef CONFIG_BLOCK @@ -75,6 +75,7 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) #endif return sb->s_bdi; } +EXPORT_SYMBOL_GPL(inode_to_bdi); static inline struct inode *wb_inode(struct list_head *head) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 760b2c552197..19d80b82d344 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1159,7 +1159,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 
mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) @@ -1464,7 +1464,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) { struct inode *inode = req->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(inode); int i; list_del(&req->writepages_entry); @@ -1658,7 +1658,7 @@ static int fuse_writepage_locked(struct page *page) req->end = fuse_writepage_end; req->inode = inode; - inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); + inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fc->lock); @@ -1768,7 +1768,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || old_req->state == FUSE_REQ_PENDING)) { - struct backing_dev_info *bdi = page->mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host); copy_highpage(old_req->pages[0], page); spin_unlock(&fc->lock); @@ -1872,7 +1872,7 @@ static int fuse_writepages_fill(struct page *page, req->page_descs[req->num_pages].offset = 0; req->page_descs[req->num_pages].length = PAGE_SIZE; - inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK); + inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 805b37fed638..4ad4f94edebe 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -289,7 +289,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - trace_wbc_writepage(wbc, mapping->backing_dev_info); + trace_wbc_writepage(wbc, inode_to_bdi(inode)); ret = __gfs2_jdata_writepage(page, wbc); if (unlikely(ret)) { diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 5b327f837de7..1666382b198d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -743,7 +743,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); - struct backing_dev_info *bdi = metamapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(metamapping->host); int ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 7afb52f6a25a..51aa889611cf 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1081,7 +1081,7 @@ mds_commit: spin_unlock(cinfo->lock); if (!cinfo->dreq) { inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), BDI_RECLAIMABLE); __mark_inode_dirty(req->wb_context->dentry->d_inode, I_DIRTY_DATASYNC); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index af3af685a9e3..298abcc5281b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -786,7 +786,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, spin_unlock(cinfo->lock); if (!cinfo->dreq) { inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + 
inc_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), BDI_RECLAIMABLE); __mark_inode_dirty(req->wb_context->dentry->d_inode, I_DIRTY_DATASYNC); @@ -853,7 +853,7 @@ static void nfs_clear_page_commit(struct page *page) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE); } /* Called holding inode (/cinfo) lock */ @@ -1564,7 +1564,7 @@ void nfs_retry_commit(struct list_head *page_list, nfs_mark_request_commit(req, lseg, cinfo); if (!cinfo->dreq) { dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, + dec_bdi_stat(inode_to_bdi(page_file_mapping(req->wb_page)->host), BDI_RECLAIMABLE); } nfs_unlock_and_release_request(req); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 643faa44f22b..1da9b2d184dc 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -19,6 +19,7 @@ * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/backing-dev.h> #include <linux/buffer_head.h> #include <linux/gfp.h> #include <linux/pagemap.h> @@ -2091,7 +2092,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, count = iov_length(iov, nr_segs); pos = *ppos; /* We can write back this queue in page reclaim. */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); written = 0; err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 3950693dd0f6..abe7d98d6178 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2363,7 +2363,7 @@ relock: goto out_dio; } } else { - current->backing_dev_info = file->f_mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); written = generic_perform_write(file, from, *ppos); if (likely(written >= 0)) iocb->ki_pos = *ppos + written; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 13e974e6a889..5684ac3e7d18 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -699,7 +699,7 @@ xfs_file_buffered_aio_write( iov_iter_truncate(from, count); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 478f95d92d73..ed59dee03a71 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -106,6 +106,8 @@ struct backing_dev_info { #endif }; +struct backing_dev_info *inode_to_bdi(struct inode *inode); + int __must_check bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); @@ -303,12 +305,12 @@ static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { - return bdi_cap_writeback_dirty(mapping->backing_dev_info); + return bdi_cap_writeback_dirty(inode_to_bdi(mapping->host)); } static inline bool mapping_cap_account_dirty(struct address_space *mapping) { - return bdi_cap_account_dirty(mapping->backing_dev_info); + return bdi_cap_account_dirty(inode_to_bdi(mapping->host)); } static inline int bdi_sched_wait(void *word) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index cee02d65ab3f..74f5207bd090 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -47,7 +47,7 @@ 
TRACE_EVENT(writeback_dirty_page, TP_fast_assign( strncpy(__entry->name, - mapping ? dev_name(mapping->backing_dev_info->dev) : "(unknown)", 32); + mapping ? dev_name(inode_to_bdi(mapping->host)->dev) : "(unknown)", 32); __entry->ino = mapping ? mapping->host->i_ino : 0; __entry->index = page->index; ), @@ -72,7 +72,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, ), TP_fast_assign( - struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ strncpy(__entry->name, @@ -116,7 +116,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_fast_assign( strncpy(__entry->name, - dev_name(inode->i_mapping->backing_dev_info->dev), 32); + dev_name(inode_to_bdi(inode)->dev), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; ), diff --git a/mm/fadvise.c b/mm/fadvise.c index 2ad7adf4f0a4..fac23ecf8d72 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -73,7 +73,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) else endbyte--; /* inclusive */ - bdi = mapping->backing_dev_info; + bdi = inode_to_bdi(mapping->host); switch (advice) { case POSIX_FADV_NORMAL: @@ -113,7 +113,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) case POSIX_FADV_NOREUSE: break; case POSIX_FADV_DONTNEED: - if (!bdi_write_congested(mapping->backing_dev_info)) + if (!bdi_write_congested(bdi)) __filemap_fdatawrite_range(mapping, offset, endbyte, WB_SYNC_NONE); diff --git a/mm/filemap.c b/mm/filemap.c index 673e4581a2e5..5d7c23c26f81 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -211,7 +211,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) */ if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); } } @@ -2565,7 +2565,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) size_t count = iov_iter_count(from); /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 0d105aeff82f..26897fbfbe19 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -9,6 +9,7 @@ */ #include <linux/fs.h> +#include <linux/backing-dev.h> #include <linux/pagemap.h> #include <linux/export.h> #include <linux/uio.h> @@ -410,7 +411,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, count = len; /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; + current->backing_dev_info = inode_to_bdi(inode); ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); if (ret) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6f4335238e33..d4cbb4bd7d1c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1351,7 +1351,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long task_ratelimit; unsigned long dirty_ratelimit; unsigned long pos_ratio; - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; @@ -1574,7 +1574,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; */ void 
balance_dirty_pages_ratelimited(struct address_space *mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); int ratelimit; int *p; @@ -1929,7 +1929,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - trace_wbc_writepage(wbc, mapping->backing_dev_info); + trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); ret = (*writepage)(page, wbc, data); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { @@ -2094,10 +2094,12 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) trace_writeback_dirty_page(page, mapping); if (mapping_cap_account_dirty(mapping)) { + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); - __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); - __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); + __inc_bdi_stat(bdi, BDI_RECLAIMABLE); + __inc_bdi_stat(bdi, BDI_DIRTIED); task_io_account_write(PAGE_CACHE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); @@ -2156,7 +2158,7 @@ void account_page_redirty(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { current->nr_dirtied--; dec_zone_page_state(page, NR_DIRTIED); - dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED); } } EXPORT_SYMBOL(account_page_redirty); @@ -2295,7 +2297,7 @@ int clear_page_dirty_for_io(struct page *page) */ if (TestClearPageDirty(page)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); return 1; } @@ -2315,7 +2317,7 @@ int test_clear_page_writeback(struct page *page) memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2352,7 +2354,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); if (mapping) { - struct backing_dev_info *bdi = mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2406,12 +2408,7 @@ EXPORT_SYMBOL(mapping_tagged); */ void wait_for_stable_page(struct page *page) { - struct address_space *mapping = page_mapping(page); - struct backing_dev_info *bdi = mapping->backing_dev_info; - - if (!bdi_cap_stable_pages_required(bdi)) - return; - - wait_on_page_writeback(page); + if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) + wait_on_page_writeback(page); } EXPORT_SYMBOL_GPL(wait_for_stable_page); diff --git a/mm/readahead.c b/mm/readahead.c index 17b9172ec37f..935675844b2e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -27,7 +27,7 @@ void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) { - ra->ra_pages = mapping->backing_dev_info->ra_pages; + ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; ra->prev_pos = -1; } EXPORT_SYMBOL_GPL(file_ra_state_init); @@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping, /* * Defer asynchronous read-ahead on IO congestion. 
*/ - if (bdi_read_congested(mapping->backing_dev_info)) + if (bdi_read_congested(inode_to_bdi(mapping->host))) return; /* do read-ahead */ diff --git a/mm/truncate.c b/mm/truncate.c index f1e4d6052369..ddec5a5966d7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -112,7 +112,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) struct address_space *mapping = page->mapping; if (mapping && mapping_cap_account_dirty(mapping)) { dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(mapping->backing_dev_info, + dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); if (account_size) task_io_account_cancelled_write(account_size); diff --git a/mm/vmscan.c b/mm/vmscan.c index ab2505c3ef54..e00a16393f21 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -497,7 +497,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info, sc)) + if (!may_write_to_queue(inode_to_bdi(mapping->host), sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -876,7 +876,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ mapping = page_mapping(page); if (((dirty || writeback) && mapping && - bdi_write_congested(mapping->backing_dev_info)) || + bdi_write_congested(inode_to_bdi(mapping->host))) || (writeback && PageReclaim(page))) nr_congested++; -- cgit v1.2.3 From b83ae6d421435c6204150300f1c25bfbd39cd62b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:37 +0100 Subject: fs: remove mapping->backing_dev_info Now that we never use the backing_dev_info pointer in struct address_space we can simply remove it and save 4 to 8 bytes in every inode. Signed-off-by: Christoph Hellwig Acked-by: Ryusuke Konishi Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/char/raw.c | 4 +--- fs/aio.c | 1 - fs/block_dev.c | 26 +------------------------- fs/btrfs/disk-io.c | 1 - fs/btrfs/inode.c | 6 ------ fs/ceph/inode.c | 2 -- fs/cifs/inode.c | 2 -- fs/configfs/inode.c | 1 - fs/ecryptfs/inode.c | 1 - fs/exofs/inode.c | 2 -- fs/fuse/inode.c | 1 - fs/gfs2/glock.c | 1 - fs/gfs2/ops_fstype.c | 1 - fs/hugetlbfs/inode.c | 1 - fs/inode.c | 13 ------------- fs/kernfs/inode.c | 1 - fs/ncpfs/inode.c | 1 - fs/nfs/inode.c | 1 - fs/nilfs2/gcinode.c | 1 - fs/nilfs2/mdt.c | 6 ++---- fs/nilfs2/page.c | 4 +--- fs/nilfs2/page.h | 3 +-- fs/nilfs2/super.c | 2 +- fs/ocfs2/dlmfs/dlmfs.c | 2 -- fs/ramfs/inode.c | 1 - fs/romfs/super.c | 3 --- fs/ubifs/dir.c | 2 -- fs/ubifs/super.c | 3 --- include/linux/fs.h | 3 +-- mm/backing-dev.c | 1 - mm/shmem.c | 1 - mm/swap_state.c | 1 - 32 files changed, 8 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/raw.c b/drivers/char/raw.c index a24891b97547..6e29bf2db536 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -104,11 +104,9 @@ static int raw_release(struct inode *inode, struct file *filp) mutex_lock(&raw_mutex); bdev = raw_devices[minor].binding; - if (--raw_devices[minor].inuse == 0) { + if (--raw_devices[minor].inuse == 0) /* Here inode->i_mapping == bdev->bd_inode->i_mapping */ inode->i_mapping = &inode->i_data; - inode->i_mapping->backing_dev_info = &default_backing_dev_info; - } mutex_unlock(&raw_mutex); blkdev_put(bdev, filp->f_mode | FMODE_EXCL); diff --git a/fs/aio.c b/fs/aio.c index 6f13d3fab07f..3bf8b1d250c3 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -176,7 +176,6 @@ static struct file *aio_private_file(struct kioctx *ctx, 
loff_t nr_pages) inode->i_mapping->a_ops = &aio_ctx_aops; inode->i_mapping->private_data = ctx; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_size = PAGE_SIZE * nr_pages; path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this); diff --git a/fs/block_dev.c b/fs/block_dev.c index 026ca7b8431c..a9f92794d7a0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -60,19 +60,6 @@ static void bdev_write_inode(struct inode *inode) spin_unlock(&inode->i_lock); } -/* - * Move the inode from its current bdi to a new bdi. Make sure the inode - * is clean before moving so that it doesn't linger on the old bdi. - */ -static void bdev_inode_switch_bdi(struct inode *inode, - struct backing_dev_info *dst) -{ - spin_lock(&inode->i_lock); - WARN_ON_ONCE(inode->i_state & I_DIRTY); - inode->i_data.backing_dev_info = dst; - spin_unlock(&inode->i_lock); -} - /* Kill _all_ buffers and pagecache , dirty or not.. */ void kill_bdev(struct block_device *bdev) { @@ -589,7 +576,6 @@ struct block_device *bdget(dev_t dev) inode->i_bdev = bdev; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); - inode->i_data.backing_dev_info = &default_backing_dev_info; spin_lock(&bdev_lock); list_add(&bdev->bd_list, &all_bdevs); spin_unlock(&bdev_lock); @@ -1150,8 +1136,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; if (!partno) { - struct backing_dev_info *bdi; - ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); if (!bdev->bd_part) @@ -1177,11 +1161,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) { + if (!ret) bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - bdi = blk_get_backing_dev_info(bdev); - bdev_inode_switch_bdi(bdev->bd_inode, bdi); - } /* * If the device is invalidated, rescan partition @@ -1208,8 +1189,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (ret) goto out_clear; bdev->bd_contains = whole; - bdev_inode_switch_bdi(bdev->bd_inode, - whole->bd_inode->i_data.backing_dev_info); bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { @@ -1249,7 +1228,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_queue = NULL; - bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; @@ -1474,8 +1452,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) * dirty data before. 
*/ bdev_write_inode(bdev->bd_inode); - bdev_inode_switch_bdi(bdev->bd_inode, - &default_backing_dev_info); } if (bdev->bd_contains == bdev) { if (disk->fops->release) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index afc4092989cd..1ec872e3a926 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2318,7 +2318,6 @@ int open_ctree(struct super_block *sb, */ fs_info->btree_inode->i_size = OFFSET_MAX; fs_info->btree_inode->i_mapping->a_ops = &btree_aops; - fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8bf326affb94..54bcf639d1cf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3608,7 +3608,6 @@ cache_acl: switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; @@ -3623,7 +3622,6 @@ cache_acl: case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; inode->i_mapping->a_ops = &btrfs_symlink_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; break; default: inode->i_op = &btrfs_special_inode_operations; @@ -6088,7 +6086,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) @@ -9203,7 +9200,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); @@ -9247,7 +9243,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &btrfs_symlink_inode_operations; inode->i_mapping->a_ops = &btrfs_symlink_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); @@ -9459,7 +9454,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; ret = btrfs_init_inode_security(trans, inode, dir, NULL); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f61a74115beb..6b5173605154 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -783,8 +783,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page, } inode->i_mapping->a_ops = &ceph_aops; - inode->i_mapping->backing_dev_info = - &ceph_sb_to_client(inode->i_sb)->backing_dev_info; switch (inode->i_mode & S_IFMT) { case S_IFIFO: diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 0c3ce464cae4..2d4f37235ed0 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -937,8 +937,6 @@ retry_iget5_locked: inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; - if (S_ISREG(inode->i_mode)) - inode->i_data.backing_dev_info 
= sb->s_bdi; #ifdef CONFIG_CIFS_FSCACHE /* initialize per-inode cache cookie pointer */ CIFS_I(inode)->fscache = NULL; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 0ad6b4d6de00..65af86147154 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -131,7 +131,6 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, if (inode) { inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &configfs_aops; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_op = &configfs_inode_operations; if (sd->s_iattr) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 1686dc2da9fd..34b36a504059 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -67,7 +67,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque) inode->i_ino = lower_inode->i_ino; inode->i_version++; inode->i_mapping->a_ops = &ecryptfs_aops; - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; if (S_ISLNK(inode->i_mode)) inode->i_op = &ecryptfs_symlink_iops; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index f1d3d4eb8c4f..6fc91df99ff8 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1214,7 +1214,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); } - inode->i_mapping->backing_dev_info = sb->s_bdi; if (S_ISREG(inode->i_mode)) { inode->i_op = &exofs_file_inode_operations; inode->i_fop = &exofs_file_operations; @@ -1314,7 +1313,6 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode) set_obj_2bcreated(oi); - inode->i_mapping->backing_dev_info = sb->s_bdi; inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; inode->i_blkbits = EXOFS_BLKSHIFT; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f38256e4476e..e8799c11424b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -308,7 +308,6 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; - inode->i_data.backing_dev_info = &fc->bdi; fuse_init_inode(inode, attr); unlock_new_inode(inode); } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a23524aa3eac..08ea717981f7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -775,7 +775,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = s->s_bdi; mapping->writeback_index = 0; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 8633ad328ee2..efc8e254787c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -112,7 +112,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = sb->s_bdi; mapping->writeback_index = 0; spin_lock_init(&sdp->sd_log_lock); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index de7c95c7d840..c274aca8e8dc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -492,7 +492,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->private_data = resv_map; info = HUGETLBFS_I(inode); 
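The fs/inode.c hunk below removes the default initialization of mapping->backing_dev_info in inode_init_always(), so writeback now reaches a filesystem's bdi only through the superblock. A minimal mount-time sketch under the new model (the examplefs names are hypothetical; the pattern mirrors the bdi_setup_and_register() call sites converted earlier in the series):

  static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
  {
  	struct examplefs_sb_info *sbi = sb->s_fs_info;	/* hypothetical per-mount info */
  	int err;

  	/* one bdi per mount, registered for dirty writeback */
  	err = bdi_setup_and_register(&sbi->bdi, "examplefs");
  	if (err)
  		return err;
  	sb->s_bdi = &sbi->bdi;	/* inode_to_bdi() resolves to this */
  	return 0;
  }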
diff --git a/fs/inode.c b/fs/inode.c index aa149e7262ac..e4e8caa7464c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -170,20 +170,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&mapping->i_mmap_writable, 0); mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; - mapping->backing_dev_info = &default_backing_dev_info; mapping->writeback_index = 0; - - /* - * If the block_device provides a backing_dev_info for client - * inodes then use that. Otherwise the inode share the bdev's - * backing_dev_info. - */ - if (sb->s_bdev) { - struct backing_dev_info *bdi; - - bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; - mapping->backing_dev_info = bdi; - } inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 06f06887b2d2..9000874a945b 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -286,7 +286,6 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) kernfs_get(kn); inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_op = &kernfs_iops; set_default_inode_attr(inode, kn->mode); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index a699a3fc62c0..01a9e16e9782 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -267,7 +267,6 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) if (inode) { atomic_set(&NCP_FINFO(inode)->opened, info->opened); - inode->i_mapping->backing_dev_info = sb->s_bdi; inode->i_ino = info->ino; ncp_set_attr(inode, info); if (S_ISREG(inode->i_mode)) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 4bffe637ea32..24aac72420f4 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -387,7 +387,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st if (S_ISREG(inode->i_mode)) { inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; - inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info; } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 57ceaf33d177..748ca238915a 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -172,7 +172,6 @@ int nilfs_init_gcinode(struct inode *inode) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); inode->i_mapping->a_ops = &empty_aops; - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; ii->i_flags = 0; nilfs_bmap_init_gc(ii->i_bmap); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index c4dcd1db57ee..892cf5ffdb8e 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -429,7 +429,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, gfp_mask); - inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; inode->i_op = &def_mdt_iops; inode->i_fop = &def_mdt_fops; @@ -457,13 +456,12 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, struct nilfs_shadow_map *shadow) { struct nilfs_mdt_info *mi = NILFS_MDT(inode); - struct backing_dev_info *bdi = inode->i_sb->s_bdi; INIT_LIST_HEAD(&shadow->frozen_buffers); address_space_init_once(&shadow->frozen_data); - nilfs_mapping_init(&shadow->frozen_data, inode, bdi); + nilfs_mapping_init(&shadow->frozen_data, inode); 
address_space_init_once(&shadow->frozen_btnodes); - nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi); + nilfs_mapping_init(&shadow->frozen_btnodes, inode); mi->mi_shadow = shadow; return 0; } diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index da276640f776..700ecbcca55d 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -461,14 +461,12 @@ unsigned nilfs_page_count_clean_buffers(struct page *page, return nc; } -void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, - struct backing_dev_info *bdi) +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode) { mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->private_data = NULL; - mapping->backing_dev_info = bdi; mapping->a_ops = &empty_aops; } diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index ef30c5c2426f..a43b8287d012 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -57,8 +57,7 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_page(struct page *, bool); void nilfs_clear_dirty_pages(struct address_space *, bool); -void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, - struct backing_dev_info *bdi); +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode); unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, sector_t start_blk, diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 3d4bbac36bea..5bc2a1cf73c3 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -166,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_state = 0; ii->i_cno = 0; ii->vfs_inode.i_version = 1; - nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi); + nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); return &ii->vfs_inode; } diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 6000d3029b26..061ba6a91bf2 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -398,7 +398,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); inode_init_owner(inode, NULL, mode); - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inc_nlink(inode); @@ -422,7 +421,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inode->i_ino = get_next_ino(); inode_init_owner(inode, parent, mode); - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; ip = DLMFS_I(inode); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index ad4d712002f4..889d558b4e05 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -59,7 +59,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_mapping->a_ops = &ramfs_aops; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index e98dd88197d5..268733cda397 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -355,9 +355,6 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) case ROMFH_REG: i->i_fop 
= &romfs_ro_fops; i->i_data.a_ops = &romfs_aops; - if (i->i_sb->s_mtd) - i->i_data.backing_dev_info = - i->i_sb->s_mtd->backing_dev_info; if (nextfh & ROMFH_EXEC) mode |= S_IXUGO; break; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ea41649e4ca5..c49b1981ac95 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -108,8 +108,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, inode->i_mtime = inode->i_atime = inode->i_ctime = ubifs_current_time(inode); inode->i_mapping->nrpages = 0; - /* Disable readahead */ - inode->i_mapping->backing_dev_info = &c->bdi; switch (mode & S_IFMT) { case S_IFREG: diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index ed93dc6ae245..6197154f36ca 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -156,9 +156,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) if (err) goto out_invalid; - /* Disable read-ahead */ - inode->i_mapping->backing_dev_info = &c->bdi; - switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &ubifs_file_address_operations; diff --git a/include/linux/fs.h b/include/linux/fs.h index 1dada399aa23..65d02de342e1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -34,6 +34,7 @@ #include #include +struct backing_dev_info; struct export_operations; struct hd_geometry; struct iovec; @@ -394,7 +395,6 @@ int pagecache_write_end(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); -struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ @@ -409,7 +409,6 @@ struct address_space { pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ unsigned long flags; /* error bits/gfp mask */ - struct backing_dev_info *backing_dev_info; /* device readahead, etc */ spinlock_t private_lock; /* for use by the address_space */ struct list_head private_list; /* ditto */ void *private_data; /* ditto */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 16c68958aeda..52e0c7652448 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -24,7 +24,6 @@ struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; diff --git a/mm/shmem.c b/mm/shmem.c index 1b77eaf589fd..4c61d3d5bfb4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1410,7 +1410,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); inode->i_blocks = 0; - inode->i_mapping->backing_dev_info = &noop_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_generation = get_seconds(); info = SHMEM_I(inode); diff --git a/mm/swap_state.c b/mm/swap_state.c index 1c137b69ecde..405923f77334 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,7 +37,6 @@ struct address_space swapper_spaces[MAX_SWAPFILES] = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), .i_mmap_writable = ATOMIC_INIT(0), .a_ops = &swap_aops, - .backing_dev_info = &noop_backing_dev_info, } }; -- cgit v1.2.3 From df0ce26cb4ee8bc233d50213b97213532aff0a3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jan 2015 10:42:41 +0100 Subject: fs: remove default_backing_dev_info Now that default_backing_dev_info is not used for writeback purposes we can get rid of
it easily: - instead of using its name for tracing unregistered bdi we just use "unknown" - btrfs and ceph can just assign the default read ahead window themselves like several other filesystems already do. - we can assign noop_backing_dev_info as the default one in alloc_super. All filesystems already either assigned their own or noop_backing_dev_info. Signed-off-by: Christoph Hellwig Reviewed-by: Tejun Heo Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- fs/btrfs/disk-io.c | 2 +- fs/ceph/super.c | 2 +- fs/super.c | 8 ++------ include/linux/backing-dev.h | 1 - include/trace/events/writeback.h | 6 ++---- mm/backing-dev.c | 9 --------- 6 files changed, 6 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1ec872e3a926..1afb18226da8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1719,7 +1719,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) if (err) return err; - bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; bdi->congested_fn = btrfs_congested_fn; bdi->congested_data = info; return 0; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e350cc1611e4..5ae62587a71d 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -899,7 +899,7 @@ static int ceph_register_bdi(struct super_block *sb, >> PAGE_SHIFT; else fsc->backing_dev_info.ra_pages = - default_backing_dev_info.ra_pages; + VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE; err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", atomic_long_inc_return(&bdi_seq)); diff --git a/fs/super.c b/fs/super.c index eae088f6aaae..3b4dadafdd60 100644 --- a/fs/super.c +++ b/fs/super.c @@ -185,8 +185,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) } init_waitqueue_head(&s->s_writers.wait); init_waitqueue_head(&s->s_writers.wait_unfrozen); + s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; - s->s_bdi = &default_backing_dev_info; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); @@ -863,10 +863,7 @@ EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { - int error = get_anon_bdev(&s->s_dev); - if (!error) - s->s_bdi = &noop_backing_dev_info; - return error; + return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); @@ -1111,7 +1108,6 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) sb = root->d_sb; BUG_ON(!sb); WARN_ON(!sb->s_bdi); - WARN_ON(sb->s_bdi == &default_backing_dev_info); sb->s_flags |= MS_BORN; error = security_sb_kern_mount(sb, flags, secdata); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index ed59dee03a71..d94077fea1f8 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -241,7 +241,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) -extern struct backing_dev_info default_backing_dev_info; extern struct backing_dev_info noop_backing_dev_info; int writeback_in_progress(struct backing_dev_info *bdi); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 74f5207bd090..0e9310905413 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -156,10 +156,8 @@ DECLARE_EVENT_CLASS(writeback_work_class, __field(int, reason) ), TP_fast_assign( - struct device
*dev = bdi->dev; - if (!dev) - dev = default_backing_dev_info.dev; - strncpy(__entry->name, dev_name(dev), 32); + strncpy(__entry->name, + bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1725adb242e0..7690ec77c722 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -14,12 +14,6 @@ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -struct backing_dev_info default_backing_dev_info = { - .name = "default", - .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, -}; -EXPORT_SYMBOL_GPL(default_backing_dev_info); - struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, @@ -250,9 +244,6 @@ static int __init default_bdi_init(void) if (!bdi_wq) return -ENOMEM; - err = bdi_init(&default_backing_dev_info); - if (!err) - bdi_register(&default_backing_dev_info, NULL, "default"); err = bdi_init(&noop_backing_dev_info); return err; -- cgit v1.2.3 From 85cdf36e11557dc367c1361e4b7bb2c4619cae91 Mon Sep 17 00:00:00 2001 From: Rickard Strandqvist Date: Thu, 1 Jan 2015 18:04:52 +0100 Subject: power: ab8500_fg.c: Remove unused function Remove the function ab8500_fg_reinit() that is not used anywhere. This was partially found by using a static code analysis program called cppcheck. Signed-off-by: Rickard Strandqvist Acked-by: Arnd Bergmann Signed-off-by: Sebastian Reichel --- drivers/power/ab8500_fg.c | 14 -------------- include/linux/mfd/abx500/ab8500-bm.h | 1 - 2 files changed, 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/ab8500_fg.c b/drivers/power/ab8500_fg.c index 69b80bcaa9e7..c908658aa31a 100644 --- a/drivers/power/ab8500_fg.c +++ b/drivers/power/ab8500_fg.c @@ -2435,20 +2435,6 @@ static void ab8500_fg_reinit_work(struct work_struct *work) } } -/** - * ab8500_fg_reinit() - forces FG algorithm to reinitialize with current values - * - * This function can be used to force the FG algorithm to recalculate a new - * voltage based battery capacity. - */ -void ab8500_fg_reinit(void) -{ - struct ab8500_fg *di = ab8500_fg_get(); - /* User won't be notified if a null pointer returned. */ - if (di != NULL) - queue_delayed_work(di->fg_wq, &di->fg_reinit_work, 0); -} - /* Exposure to the sysfs interface */ struct ab8500_fg_sysfs_entry { diff --git a/include/linux/mfd/abx500/ab8500-bm.h b/include/linux/mfd/abx500/ab8500-bm.h index cc892a8d8d6e..12a5b396921e 100644 --- a/include/linux/mfd/abx500/ab8500-bm.h +++ b/include/linux/mfd/abx500/ab8500-bm.h @@ -461,7 +461,6 @@ struct ab8500_fg; #ifdef CONFIG_AB8500_BM extern struct abx500_bm_data ab8500_bm_data; -void ab8500_fg_reinit(void); void ab8500_charger_usb_state_changed(u8 bm_usb_state, u16 mA); struct ab8500_btemp *ab8500_btemp_get(void); int ab8500_btemp_get_batctrl_temp(struct ab8500_btemp *btemp); -- cgit v1.2.3 From 9c45101e88b2bf2ce36b8833fcfa784a9149aa74 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 19 Nov 2014 09:21:58 +0100 Subject: quota: Cleanup flags definitions Currently all quota flags were defined just in kernel-private headers. Export the flags that are readable / writeable from userspace to the userspace header include/uapi/linux/quota.h.
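As a hedged illustration of what moving these definitions to uapi buys, userspace can now test the exported bits directly instead of hard-coding magic values (a minimal sketch, not part of the patch; it assumes a quota-enabled filesystem backed by the device named in argv[1] and libc headers that have picked up the new DQF_* names, and it omits error handling):

#include <stdio.h>
#include <sys/quota.h>	/* quotactl(), struct if_dqinfo, DQF_* via the uapi header */

int main(int argc, char **argv)
{
	struct if_dqinfo info;

	/* argv[1] is the block device of the quota-enabled filesystem */
	if (quotactl(QCMD(Q_GETINFO, USRQUOTA), argv[1], 0, (void *)&info))
		return 1;

	printf("root squash: %s\n",
	       (info.dqi_flags & DQF_ROOT_SQUASH) ? "on" : "off");
	return 0;
}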
Signed-off-by: Jan Kara --- fs/quota/dquot.c | 2 +- include/linux/dqblk_v1.h | 3 --- include/linux/quota.h | 14 ++++++++------ include/uapi/linux/quota.h | 14 +++++++++++++- 4 files changed, 22 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 8f0acef3d184..f8be368b9086 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1248,7 +1248,7 @@ static int ignore_hardlimit(struct dquot *dquot) return capable(CAP_SYS_RESOURCE) && (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || - !(info->dqi_flags & V1_DQF_RSQUASH)); + !(info->dqi_flags & DQF_ROOT_SQUASH)); } /* needs dq_data_lock */ diff --git a/include/linux/dqblk_v1.h b/include/linux/dqblk_v1.h index 3713a7232dd8..c0d4d1e2a45c 100644 --- a/include/linux/dqblk_v1.h +++ b/include/linux/dqblk_v1.h @@ -5,9 +5,6 @@ #ifndef _LINUX_DQBLK_V1_H #define _LINUX_DQBLK_V1_H -/* Root squash turned on */ -#define V1_DQF_RSQUASH 1 - /* Numbers of blocks needed for updates */ #define V1_INIT_ALLOC 1 #define V1_INIT_REWRITE 1 diff --git a/include/linux/quota.h b/include/linux/quota.h index 50978b781a19..0c42113607ce 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -223,12 +223,14 @@ struct mem_dqinfo { struct super_block; -#define DQF_MASK 0xffff /* Mask for format specific flags */ -#define DQF_GETINFO_MASK 0x1ffff /* Mask for flags passed to userspace */ -#define DQF_SETINFO_MASK 0xffff /* Mask for flags modifiable from userspace */ -#define DQF_SYS_FILE_B 16 -#define DQF_SYS_FILE (1 << DQF_SYS_FILE_B) /* Quota file stored as system file */ -#define DQF_INFO_DIRTY_B 31 +/* Mask for flags passed to userspace */ +#define DQF_GETINFO_MASK (DQF_ROOT_SQUASH | DQF_SYS_FILE) +/* Mask for flags modifiable from userspace */ +#define DQF_SETINFO_MASK DQF_ROOT_SQUASH + +enum { + DQF_INFO_DIRTY_B = DQF_PRIVATE, +}; #define DQF_INFO_DIRTY (1 << DQF_INFO_DIRTY_B) /* Is info dirty? */ extern void mark_info_dirty(struct super_block *sb, int type); diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h index 3b6cfbeb086d..1f49b8341c99 100644 --- a/include/uapi/linux/quota.h +++ b/include/uapi/linux/quota.h @@ -126,10 +126,22 @@ struct if_dqblk { #define IIF_FLAGS 4 #define IIF_ALL (IIF_BGRACE | IIF_IGRACE | IIF_FLAGS) +enum { + DQF_ROOT_SQUASH_B = 0, + DQF_SYS_FILE_B = 16, + /* Kernel internal flags invisible to userspace */ + DQF_PRIVATE +}; + +/* Root squash enabled (for v1 quota format) */ +#define DQF_ROOT_SQUASH (1 << DQF_ROOT_SQUASH_B) +/* Quota stored in a system file */ +#define DQF_SYS_FILE (1 << DQF_SYS_FILE_B) + struct if_dqinfo { __u64 dqi_bgrace; __u64 dqi_igrace; - __u32 dqi_flags; + __u32 dqi_flags; /* DQF_* */ __u32 dqi_valid; }; -- cgit v1.2.3 From c1155c64e603378dccfc21ee0612cf60dd11725b Mon Sep 17 00:00:00 2001 From: Jonghwa Lee Date: Fri, 19 Dec 2014 17:55:13 +0900 Subject: power: charger-manager: Use alarmtimer for battery monitoring in suspend. To guarantee proper charging and management of batteries even in suspend, charger-manager has used an rtc device through the rtc framework interface. However, it is better to use alarmtimer for cleaner and more appropriate operation. This patch makes the driver use alarmtimer for the polling work in suspend and removes all deprecated code related to using the rtc interface.
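In outline, the suspend-time wakeup now follows the stock alarmtimer pattern instead of open-coded rtc alarm handling (a condensed sketch of the flow this patch introduces, using the names from the diff below; ms_to_ktime() is used here for brevity where the patch spells out ktime_set()):

static enum alarmtimer_restart cm_timer_func(struct alarm *alarm, ktime_t now)
{
	/* one-shot wakeup: just record that the alarm fired */
	cm_timer_set = false;
	return ALARMTIMER_NORESTART;
}

/* probe: */
alarm_init(cm_timer, ALARM_BOOTTIME, cm_timer_func);
/* suspend_prepare: arm a wakeup 'wakeup_ms' from now on the boottime clock */
alarm_start(cm_timer, ktime_add(ktime_get_boottime(), ms_to_ktime(wakeup_ms)));
/* suspend_complete: */
alarm_cancel(cm_timer);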
Signed-off-by: Jonghwa Lee Signed-off-by: Sebastian Reichel --- drivers/power/Kconfig | 2 +- drivers/power/charger-manager.c | 289 ++++++++++------------------------ include/linux/power/charger-manager.h | 32 +--- 3 files changed, 84 insertions(+), 239 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index e2569a538501..a79f16afb588 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -315,7 +315,7 @@ config CHARGER_GPIO config CHARGER_MANAGER bool "Battery charger manager for multiple chargers" - depends on REGULATOR && RTC_CLASS + depends on REGULATOR select EXTCON help Say Y to enable charger-manager support, which allows multiple diff --git a/drivers/power/charger-manager.c b/drivers/power/charger-manager.c index 649052e1f2d9..14b0d85318eb 100644 --- a/drivers/power/charger-manager.c +++ b/drivers/power/charger-manager.c @@ -69,16 +69,10 @@ static LIST_HEAD(cm_list); static DEFINE_MUTEX(cm_list_mtx); /* About in-suspend (suspend-again) monitoring */ -static struct rtc_device *rtc_dev; -/* - * Backup RTC alarm - * Save the wakeup alarm before entering suspend-to-RAM - */ -static struct rtc_wkalrm rtc_wkalarm_save; -/* Backup RTC alarm time in terms of seconds since 01-01-1970 00:00:00 */ -static unsigned long rtc_wkalarm_save_time; +static struct alarm *cm_timer; + static bool cm_suspended; -static bool cm_rtc_set; +static bool cm_timer_set; static unsigned long cm_suspend_duration_ms; /* About normal (not suspended) monitoring */ @@ -87,9 +81,6 @@ static unsigned long next_polling; /* Next appointed polling time */ static struct workqueue_struct *cm_wq; /* init at driver add */ static struct delayed_work cm_monitor_work; /* init at driver add */ -/* Global charger-manager description */ -static struct charger_global_desc *g_desc; /* init with setup_charger_manager */ - /** * is_batt_present - See if the battery presents in place. * @cm: the Charger Manager representing the battery. @@ -1047,10 +1038,13 @@ static bool cm_setup_timer(void) { struct charger_manager *cm; unsigned int wakeup_ms = UINT_MAX; - bool ret = false; + int timer_req = 0; - mutex_lock(&cm_list_mtx); + if (time_after(next_polling, jiffies)) + CM_MIN_VALID(wakeup_ms, + jiffies_to_msecs(next_polling - jiffies)); + mutex_lock(&cm_list_mtx); list_for_each_entry(cm, &cm_list, entry) { unsigned int fbchk_ms = 0; @@ -1070,162 +1064,38 @@ static bool cm_setup_timer(void) /* Skip if polling is not required for this CM */ if (!is_polling_required(cm) && !cm->emergency_stop) continue; + timer_req++; if (cm->desc->polling_interval_ms == 0) continue; CM_MIN_VALID(wakeup_ms, cm->desc->polling_interval_ms); } - mutex_unlock(&cm_list_mtx); - if (wakeup_ms < UINT_MAX && wakeup_ms > 0) { - pr_info("Charger Manager wakeup timer: %u ms\n", wakeup_ms); - if (rtc_dev) { - struct rtc_wkalrm tmp; - unsigned long time, now; - unsigned long add = DIV_ROUND_UP(wakeup_ms, 1000); - - /* - * Set alarm with the polling interval (wakeup_ms) - * except when rtc_wkalarm_save comes first. - * However, the alarm time should be NOW + - * CM_RTC_SMALL or later. - */ - tmp.enabled = 1; - rtc_read_time(rtc_dev, &tmp.time); - rtc_tm_to_time(&tmp.time, &now); - if (add < CM_RTC_SMALL) - add = CM_RTC_SMALL; - time = now + add; + if (timer_req && cm_timer) { + ktime_t now, add; - ret = true; + /* + * Set alarm with the polling interval (wakeup_ms) + * The alarm time should be NOW + CM_RTC_SMALL or later. 
+ */ + if (wakeup_ms == UINT_MAX || + wakeup_ms < CM_RTC_SMALL * MSEC_PER_SEC) + wakeup_ms = 2 * CM_RTC_SMALL * MSEC_PER_SEC; - if (rtc_wkalarm_save.enabled && - rtc_wkalarm_save_time && - rtc_wkalarm_save_time < time) { - if (rtc_wkalarm_save_time < now + CM_RTC_SMALL) - time = now + CM_RTC_SMALL; - else - time = rtc_wkalarm_save_time; + pr_info("Charger Manager wakeup timer: %u ms\n", wakeup_ms); - /* The timer is not appointed by CM */ - ret = false; - } + now = ktime_get_boottime(); + add = ktime_set(wakeup_ms / MSEC_PER_SEC, + (wakeup_ms % MSEC_PER_SEC) * NSEC_PER_MSEC); + alarm_start(cm_timer, ktime_add(now, add)); - pr_info("Waking up after %lu secs\n", time - now); + cm_suspend_duration_ms = wakeup_ms; - rtc_time_to_tm(time, &tmp.time); - rtc_set_alarm(rtc_dev, &tmp); - cm_suspend_duration_ms += wakeup_ms; - return ret; - } + return true; } - - if (rtc_dev) - rtc_set_alarm(rtc_dev, &rtc_wkalarm_save); return false; } -static void _cm_fbchk_in_suspend(struct charger_manager *cm) -{ - unsigned long jiffy_now = jiffies; - - if (!cm->fullbatt_vchk_jiffies_at) - return; - - if (g_desc && g_desc->assume_timer_stops_in_suspend) - jiffy_now += msecs_to_jiffies(cm_suspend_duration_ms); - - /* Execute now if it's going to be executed not too long after */ - jiffy_now += CM_JIFFIES_SMALL; - - if (time_after_eq(jiffy_now, cm->fullbatt_vchk_jiffies_at)) - fullbatt_vchk(&cm->fullbatt_vchk_work.work); -} - -/** - * cm_suspend_again - Determine whether suspend again or not - * - * Returns true if the system should be suspended again - * Returns false if the system should be woken up - */ -bool cm_suspend_again(void) -{ - struct charger_manager *cm; - bool ret = false; - - if (!g_desc || !g_desc->rtc_only_wakeup || !g_desc->rtc_only_wakeup() || - !cm_rtc_set) - return false; - - if (cm_monitor()) - goto out; - - ret = true; - mutex_lock(&cm_list_mtx); - list_for_each_entry(cm, &cm_list, entry) { - _cm_fbchk_in_suspend(cm); - - if (cm->status_save_ext_pwr_inserted != is_ext_pwr_online(cm) || - cm->status_save_batt != is_batt_present(cm)) { - ret = false; - break; - } - } - mutex_unlock(&cm_list_mtx); - - cm_rtc_set = cm_setup_timer(); -out: - /* It's about the time when the non-CM appointed timer goes off */ - if (rtc_wkalarm_save.enabled) { - unsigned long now; - struct rtc_time tmp; - - rtc_read_time(rtc_dev, &tmp); - rtc_tm_to_time(&tmp, &now); - - if (rtc_wkalarm_save_time && - now + CM_RTC_SMALL >= rtc_wkalarm_save_time) - return false; - } - return ret; -} -EXPORT_SYMBOL_GPL(cm_suspend_again); - -/** - * setup_charger_manager - initialize charger_global_desc data - * @gd: pointer to instance of charger_global_desc - */ -int setup_charger_manager(struct charger_global_desc *gd) -{ - if (!gd) - return -EINVAL; - - if (rtc_dev) - rtc_class_close(rtc_dev); - rtc_dev = NULL; - g_desc = NULL; - - if (!gd->rtc_only_wakeup) { - pr_err("The callback rtc_only_wakeup is not given\n"); - return -EINVAL; - } - - if (gd->rtc_name) { - rtc_dev = rtc_class_open(gd->rtc_name); - if (IS_ERR_OR_NULL(rtc_dev)) { - rtc_dev = NULL; - /* Retry at probe. RTC may be not registered yet */ - } - } else { - pr_warn("No wakeup timer is given for charger manager. 
" - "In-suspend monitoring won't work.\n"); - } - - g_desc = gd; - return 0; -} -EXPORT_SYMBOL_GPL(setup_charger_manager); - /** * charger_extcon_work - enable/diable charger according to the state * of charger cable @@ -1719,6 +1589,12 @@ static inline struct charger_desc *cm_get_drv_data(struct platform_device *pdev) return dev_get_platdata(&pdev->dev); } +static enum alarmtimer_restart cm_timer_func(struct alarm *alarm, ktime_t now) +{ + cm_timer_set = false; + return ALARMTIMER_NORESTART; +} + static int charger_manager_probe(struct platform_device *pdev) { struct charger_desc *desc = cm_get_drv_data(pdev); @@ -1728,16 +1604,6 @@ static int charger_manager_probe(struct platform_device *pdev) union power_supply_propval val; struct power_supply *fuel_gauge; - if (g_desc && !rtc_dev && g_desc->rtc_name) { - rtc_dev = rtc_class_open(g_desc->rtc_name); - if (IS_ERR_OR_NULL(rtc_dev)) { - rtc_dev = NULL; - dev_err(&pdev->dev, "Cannot get RTC %s\n", - g_desc->rtc_name); - return -ENODEV; - } - } - if (IS_ERR(desc)) { dev_err(&pdev->dev, "No platform data (desc) found\n"); return -ENODEV; @@ -1752,6 +1618,12 @@ static int charger_manager_probe(struct platform_device *pdev) cm->dev = &pdev->dev; cm->desc = desc; + /* Initialize alarm timer */ + if (alarmtimer_get_rtcdev()) { + cm_timer = devm_kzalloc(cm->dev, sizeof(*cm_timer), GFP_KERNEL); + alarm_init(cm_timer, ALARM_BOOTTIME, cm_timer_func); + } + /* * The following two do not need to be errors. * Users may intentionally ignore those two features. @@ -1993,38 +1865,41 @@ static int cm_suspend_noirq(struct device *dev) return ret; } +static bool cm_need_to_awake(void) +{ + struct charger_manager *cm; + + if (cm_timer) + return false; + + mutex_lock(&cm_list_mtx); + list_for_each_entry(cm, &cm_list, entry) { + if (is_charging(cm)) { + mutex_unlock(&cm_list_mtx); + return true; + } + } + mutex_unlock(&cm_list_mtx); + + return false; +} + static int cm_suspend_prepare(struct device *dev) { struct charger_manager *cm = dev_get_drvdata(dev); - if (!cm_suspended) { - if (rtc_dev) { - struct rtc_time tmp; - unsigned long now; - - rtc_read_alarm(rtc_dev, &rtc_wkalarm_save); - rtc_read_time(rtc_dev, &tmp); + if (cm_need_to_awake()) + return -EBUSY; - if (rtc_wkalarm_save.enabled) { - rtc_tm_to_time(&rtc_wkalarm_save.time, - &rtc_wkalarm_save_time); - rtc_tm_to_time(&tmp, &now); - if (now > rtc_wkalarm_save_time) - rtc_wkalarm_save_time = 0; - } else { - rtc_wkalarm_save_time = 0; - } - } + if (!cm_suspended) cm_suspended = true; - } - cancel_delayed_work(&cm->fullbatt_vchk_work); - cm->status_save_ext_pwr_inserted = is_ext_pwr_online(cm); - cm->status_save_batt = is_batt_present(cm); + cm_timer_set = cm_setup_timer(); - if (!cm_rtc_set) { - cm_suspend_duration_ms = 0; - cm_rtc_set = cm_setup_timer(); + if (cm_timer_set) { + cancel_work_sync(&setup_polling); + cancel_delayed_work_sync(&cm_monitor_work); + cancel_delayed_work(&cm->fullbatt_vchk_work); } return 0; @@ -2034,18 +1909,21 @@ static void cm_suspend_complete(struct device *dev) { struct charger_manager *cm = dev_get_drvdata(dev); - if (cm_suspended) { - if (rtc_dev) { - struct rtc_wkalrm tmp; - - rtc_read_alarm(rtc_dev, &tmp); - rtc_wkalarm_save.pending = tmp.pending; - rtc_set_alarm(rtc_dev, &rtc_wkalarm_save); - } + if (cm_suspended) cm_suspended = false; - cm_rtc_set = false; + + if (cm_timer_set) { + ktime_t remain; + + alarm_cancel(cm_timer); + cm_timer_set = false; + remain = alarm_expires_remaining(cm_timer); + cm_suspend_duration_ms -= ktime_to_ms(remain); + 
schedule_work(&setup_polling); } + _cm_monitor(cm); + /* Re-enqueue delayed work (fullbatt_vchk_work) */ if (cm->fullbatt_vchk_jiffies_at) { unsigned long delay = 0; @@ -2060,21 +1938,18 @@ static void cm_suspend_complete(struct device *dev) } /* - * Account for cm_suspend_duration_ms if - * assume_timer_stops_in_suspend is active + * Account for cm_suspend_duration_ms with assuming that + * timer stops in suspend. */ - if (g_desc && g_desc->assume_timer_stops_in_suspend) { - if (delay > cm_suspend_duration_ms) - delay -= cm_suspend_duration_ms; - else - delay = 0; - } + if (delay > cm_suspend_duration_ms) + delay -= cm_suspend_duration_ms; + else + delay = 0; queue_delayed_work(cm_wq, &cm->fullbatt_vchk_work, msecs_to_jiffies(delay)); } device_set_wakeup_capable(cm->dev, false); - uevent_notify(cm, NULL); } static const struct dev_pm_ops charger_manager_pm = { diff --git a/include/linux/power/charger-manager.h b/include/linux/power/charger-manager.h index e97fc656a058..416ebeb6ee1e 100644 --- a/include/linux/power/charger-manager.h +++ b/include/linux/power/charger-manager.h @@ -17,6 +17,7 @@ #include #include +#include enum data_source { CM_BATTERY_PRESENT, @@ -44,29 +45,6 @@ enum cm_event_types { CM_EVENT_OTHERS, }; -/** - * struct charger_global_desc - * @rtc_name: the name of RTC used to wake up the system from suspend. - * @rtc_only_wakeup: - * If the system is woken up by waekup-sources other than the RTC or - * callbacks, Charger Manager should recognize with - * rtc_only_wakeup() returning false. - * If the RTC given to CM is the only wakeup reason, - * rtc_only_wakeup should return true. - * @assume_timer_stops_in_suspend: - * Assume that the jiffy timer stops in suspend-to-RAM. - * When enabled, CM does not rely on jiffies value in - * suspend_again and assumes that jiffies value does not - * change during suspend. - */ -struct charger_global_desc { - char *rtc_name; - - bool (*rtc_only_wakeup)(void); - - bool assume_timer_stops_in_suspend; -}; - /** * struct charger_cable * @extcon_name: the name of extcon device. @@ -266,22 +244,14 @@ struct charger_manager { char psy_name_buf[PSY_NAME_MAX + 1]; struct power_supply charger_psy; - bool status_save_ext_pwr_inserted; - bool status_save_batt; - u64 charging_start_time; u64 charging_end_time; }; #ifdef CONFIG_CHARGER_MANAGER -extern int setup_charger_manager(struct charger_global_desc *gd); -extern bool cm_suspend_again(void); extern void cm_notify_event(struct power_supply *psy, enum cm_event_types type, char *msg); #else -static inline int setup_charger_manager(struct charger_global_desc *gd) -{ return 0; } -static inline bool cm_suspend_again(void) { return false; } static inline void cm_notify_event(struct power_supply *psy, enum cm_event_types type, char *msg) { } #endif -- cgit v1.2.3 From d5db139ab3764640e0882a1746e7b9fdee33fd87 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 22 Jan 2015 11:13:14 +1030 Subject: module: make module_refcount() a signed integer. James Bottomley points out that it will be -1 during unload. It's only used for diagnostics, so let's not hide that as it could be a clue as to what's gone wrong. 
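To see why the sign matters, compare what a diagnostic caller observes in the two cases (an illustrative sketch, not code from this patch):

int refs = module_refcount(mod);

if (refs < 0)
	/* MODULE_REF_BASE has been dropped: unload in progress */
	pr_info("%s: unloading\n", mod->name);
else
	pr_info("%s: %d users\n", mod->name, refs);

With the old unsigned return type the unload case showed up as a huge positive count instead of -1, hiding the clue.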
Cc: Jason Wessel Acked-and-documention-added-by: James Bottomley Reviewed-by: Masami Hiramatsu Signed-off-by: Rusty Russell --- include/linux/module.h | 2 +- kernel/debug/kdb/kdb_main.c | 2 +- kernel/module.c | 17 +++++++++++++---- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index ebfb0e153c6a..b653d7c0a05a 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -444,7 +444,7 @@ extern void __module_put_and_exit(struct module *mod, long code) #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code) #ifdef CONFIG_MODULE_UNLOAD -unsigned long module_refcount(struct module *mod); +int module_refcount(struct module *mod); void __symbol_put(const char *symbol); #define symbol_put(x) __symbol_put(VMLINUX_SYMBOL_STR(x)) void symbol_put_addr(void *addr); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 379650b984f8..2934889f2cce 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1979,7 +1979,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD - kdb_printf("%4ld ", module_refcount(mod)); + kdb_printf("%4d ", module_refcount(mod)); #endif if (mod->state == MODULE_STATE_GOING) kdb_printf(" (Unloading)"); diff --git a/kernel/module.c b/kernel/module.c index ed4ec9c30bd2..d856e96a3cce 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -772,9 +772,18 @@ static int try_stop_module(struct module *mod, int flags, int *forced) return 0; } -unsigned long module_refcount(struct module *mod) +/** + * module_refcount - return the refcount or -1 if unloading + * + * @mod: the module we're checking + * + * Returns: + * -1 if the module is in the process of unloading + * otherwise the number of references in the kernel to the module + */ +int module_refcount(struct module *mod) { - return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE; + return atomic_read(&mod->refcnt) - MODULE_REF_BASE; } EXPORT_SYMBOL(module_refcount); @@ -856,7 +865,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) struct module_use *use; int printed_something = 0; - seq_printf(m, " %lu ", module_refcount(mod)); + seq_printf(m, " %i ", module_refcount(mod)); /* * Always include a trailing , so userspace can differentiate @@ -908,7 +917,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); + return sprintf(buffer, "%i\n", module_refcount(mk->mod)); } static struct module_attribute modinfo_refcnt = -- cgit v1.2.3 From 8116bf4cb62d337c953cfa5369ef4cf83e73140c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 21 Jan 2015 20:44:01 -0500 Subject: locks: update comments that refer to inode->i_flock Signed-off-by: Jeff Layton --- fs/locks.c | 2 +- include/linux/fs.h | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index 2fc36b3772a0..4d0d41163a50 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2212,7 +2212,7 @@ again: */ /* * we need that spin_lock here - it prevents reordering between - * update of inode->i_flock and check for it done in close(). + * update of i_flctx->flc_posix and check for it done in close(). * rcu_read_lock() wouldn't do. 
*/ spin_lock(&current->files->file_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index f87cb2f03103..ddd2fa7cefd3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -925,12 +925,11 @@ int locks_in_grace(struct net *); * FIXME: should we create a separate "struct lock_request" to help distinguish * these two uses? * - * The i_flock list is ordered by: + * The various i_flctx lists are ordered by: * - * 1) lock type -- FL_LEASEs first, then FL_FLOCK, and finally FL_POSIX - * 2) lock owner - * 3) lock range start - * 4) lock range end + * 1) lock owner + * 2) lock range start + * 3) lock range end * * Obviously, the last two criteria only matter for POSIX locks. */ @@ -1992,8 +1991,9 @@ static inline int break_lease(struct inode *inode, unsigned int mode) { /* * Since this check is lockless, we must ensure that any refcounts - * taken are done before checking inode->i_flock. Otherwise, we could - * end up racing with tasks trying to set a new lease on this file. + * taken are done before checking i_flctx->flc_lease. Otherwise, we + * could end up racing with tasks trying to set a new lease on this + * file. */ smp_mb(); if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) @@ -2005,8 +2005,9 @@ static inline int break_deleg(struct inode *inode, unsigned int mode) { /* * Since this check is lockless, we must ensure that any refcounts - * taken are done before checking inode->i_flock. Otherwise, we could - * end up racing with tasks trying to set a new lease on this file. + * taken are done before checking i_flctx->flc_lease. Otherwise, we + * could end up racing with tasks trying to set a new lease on this + * file. */ smp_mb(); if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) -- cgit v1.2.3 From 30b8b0066cafef274fc92462578ee346211ce7cb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Jan 2015 21:22:39 +0000 Subject: init: Get rid of x86isms The UP local APIC support can be set up from an early initcall. No need for horrible hackery in the init code.
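For reference, any architecture with a similar need can now opt in the same way x86 does below: select the config symbol and provide the hook that the generic smp_init() calls on !SMP kernels (a hypothetical sketch; 'foo' is a placeholder architecture, not real code):

# arch/foo/Kconfig
config UP_LATE_INIT
	def_bool y
	depends on FOO_NEEDS_LATE_UP_SETUP

/* arch/foo/kernel/setup.c */
void __init up_late_init(void)
{
	foo_init_percpu_irq_controller();	/* placeholder arch setup */
}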
Signed-off-by: Thomas Gleixner Cc: Jiang Liu Cc: Joerg Roedel Cc: Tony Luck Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20150115211703.827943883@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 4 ++++ arch/x86/kernel/apic/apic.c | 7 +++++++ include/linux/smp.h | 7 +++++++ init/main.c | 13 ------------- 4 files changed, 18 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba397bde7948..ffcc3ca5862a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -855,6 +855,10 @@ config SCHED_MC source "kernel/Kconfig.preempt" +config UP_LATE_INIT + def_bool y + depends on X86_UP_APIC + config X86_UP_APIC bool "Local APIC support on uniprocessors" depends on X86_32 && !SMP && !X86_32_NON_STANDARD && !PCI_MSI diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c681e9ba9e47..19f1bc714ee6 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2267,6 +2267,13 @@ int __init APIC_init_uniprocessor(void) return 0; } +#ifdef CONFIG_UP_LATE_INIT +void __init up_late_init(void) +{ + APIC_init_uniprocessor(); +} +#endif + /* * Power management */ diff --git a/include/linux/smp.h b/include/linux/smp.h index 93dff5fff524..be91db2a7017 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -151,6 +151,13 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, static inline void kick_all_cpus_sync(void) { } static inline void wake_up_all_idle_cpus(void) { } +#ifdef CONFIG_UP_LATE_INIT +extern void __init up_late_init(void); +static inline void smp_init(void) { up_late_init(); } +#else +static inline void smp_init(void) { } +#endif + #endif /* !SMP */ /* diff --git a/init/main.c b/init/main.c index 61b993767db5..179ada15d08a 100644 --- a/init/main.c +++ b/init/main.c @@ -87,10 +87,6 @@ #include #include -#ifdef CONFIG_X86_LOCAL_APIC -#include -#endif - static int kernel_init(void *); extern void init_IRQ(void); @@ -351,15 +347,6 @@ __setup("rdinit=", rdinit_setup); #ifndef CONFIG_SMP static const unsigned int setup_max_cpus = NR_CPUS; -#ifdef CONFIG_X86_LOCAL_APIC -static void __init smp_init(void) -{ - APIC_init_uniprocessor(); -} -#else -#define smp_init() do { } while (0) -#endif - static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } #endif -- cgit v1.2.3 From 1f94a94f67e1083e19fb7b436dd7ca7a4ba03f2b Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Fri, 9 Jan 2015 20:34:39 -0600 Subject: PCI: Add generic config accessors Many PCI controllers' configuration space accesses are memory-mapped and vary only in address calculation and access checks. There are 2 main access methods: a decoded address space such as ECAM or a single address and data register similar to x86. This implementation can support both cases as well as be used in cases that need additional pre- or post-access handling. Add a new pci_ops member, map_bus, which can do access checks and any necessary setup. It returns the address to use for the configuration space access. The access types supported are 32-bit only accesses or correct byte, word, or dword sized accesses. 
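To make the intended use concrete, an ECAM-style host controller driver reduces to an address calculation plus the shared accessors (a sketch against the API added below; 'myhost' and its fields are invented for illustration):

static void __iomem *myhost_map_bus(struct pci_bus *bus, unsigned int devfn,
				    int where)
{
	struct myhost *priv = bus->sysdata;	/* hypothetical driver state */

	/* standard ECAM offset: bus << 20 | devfn << 12 | register */
	return priv->cfg_base + (bus->number << 20) + (devfn << 12) + where;
}

static struct pci_ops myhost_pci_ops = {
	.map_bus = myhost_map_bus,
	.read = pci_generic_config_read,
	.write = pci_generic_config_write,
};

A controller with a single address/data register pair would instead do its index write in map_bus and plug in the 32-bit-only pci_generic_config_read32/write32 variants.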
Tested-by: Thierry Reding Signed-off-by: Rob Herring Signed-off-by: Bjorn Helgaas Reviewed-by: Thierry Reding --- drivers/pci/access.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 11 +++++++ 2 files changed, 98 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/access.c b/drivers/pci/access.c index 49dd766852ba..d9b64a175990 100644 --- a/drivers/pci/access.c +++ b/drivers/pci/access.c @@ -67,6 +67,93 @@ EXPORT_SYMBOL(pci_bus_write_config_byte); EXPORT_SYMBOL(pci_bus_write_config_word); EXPORT_SYMBOL(pci_bus_write_config_dword); +int pci_generic_config_read(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 *val) +{ + void __iomem *addr; + + addr = bus->ops->map_bus(bus, devfn, where); + if (!addr) { + *val = ~0; + return PCIBIOS_DEVICE_NOT_FOUND; + } + + if (size == 1) + *val = readb(addr); + else if (size == 2) + *val = readw(addr); + else + *val = readl(addr); + + return PCIBIOS_SUCCESSFUL; +} +EXPORT_SYMBOL_GPL(pci_generic_config_read); + +int pci_generic_config_write(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 val) +{ + void __iomem *addr; + + addr = bus->ops->map_bus(bus, devfn, where); + if (!addr) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (size == 1) + writeb(val, addr); + else if (size == 2) + writew(val, addr); + else + writel(val, addr); + + return PCIBIOS_SUCCESSFUL; +} +EXPORT_SYMBOL_GPL(pci_generic_config_write); + +int pci_generic_config_read32(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 *val) +{ + void __iomem *addr; + + addr = bus->ops->map_bus(bus, devfn, where & ~0x3); + if (!addr) { + *val = ~0; + return PCIBIOS_DEVICE_NOT_FOUND; + } + + *val = readl(addr); + + if (size <= 2) + *val = (*val >> (8 * (where & 3))) & ((1 << (size * 8)) - 1); + + return PCIBIOS_SUCCESSFUL; +} +EXPORT_SYMBOL_GPL(pci_generic_config_read32); + +int pci_generic_config_write32(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 val) +{ + void __iomem *addr; + u32 mask, tmp; + + addr = bus->ops->map_bus(bus, devfn, where & ~0x3); + if (!addr) + return PCIBIOS_DEVICE_NOT_FOUND; + + if (size == 4) { + writel(val, addr); + return PCIBIOS_SUCCESSFUL; + } else { + mask = ~(((1 << (size * 8)) - 1) << ((where & 0x3) * 8)); + } + + tmp = readl(addr) & mask; + tmp |= val << ((where & 0x3) * 8); + writel(tmp, addr); + + return PCIBIOS_SUCCESSFUL; +} +EXPORT_SYMBOL_GPL(pci_generic_config_write32); + /** * pci_bus_set_ops - Set raw operations of pci bus * @bus: pci bus struct diff --git a/include/linux/pci.h b/include/linux/pci.h index 360a966a97a5..e7fd51900182 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -560,6 +560,7 @@ static inline int pcibios_err_to_errno(int err) /* Low-level architecture-dependent routines */ struct pci_ops { + void __iomem *(*map_bus)(struct pci_bus *bus, unsigned int devfn, int where); int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val); int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val); }; @@ -857,6 +858,16 @@ int pci_bus_write_config_word(struct pci_bus *bus, unsigned int devfn, int where, u16 val); int pci_bus_write_config_dword(struct pci_bus *bus, unsigned int devfn, int where, u32 val); + +int pci_generic_config_read(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 *val); +int pci_generic_config_write(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 val); +int pci_generic_config_read32(struct pci_bus *bus, unsigned int devfn, + 
int where, int size, u32 *val); +int pci_generic_config_write32(struct pci_bus *bus, unsigned int devfn, + int where, int size, u32 val); + struct pci_ops *pci_bus_set_ops(struct pci_bus *bus, struct pci_ops *ops); static inline int pci_read_config_byte(const struct pci_dev *dev, int where, u8 *val) -- cgit v1.2.3 From 41fbf3b39d5eca01527338b4d0ee15ee1ae1023c Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Wed, 17 Dec 2014 13:11:35 +0800 Subject: ktime.h: Introduce ktime_ms_delta This patch adds a reusable time difference function which returns the difference in milliseconds, as often used in some driver code, e.g. mtd/test, media/rc, etc. Signed-off-by: Chunyan Zhang Acked-by: Arnd Bergmann Cc: zhang.lyra@gmail.com Cc: davem@davemloft.net Cc: john.stultz@linaro.org Cc: dborkman@redhat.com Link: http://lkml.kernel.org/r/1418793095-18780-1-git-send-email-zhang.chunyan@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/ktime.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index c9d645ad98ff..891ea92a68b0 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -186,6 +186,11 @@ static inline s64 ktime_us_delta(const ktime_t later, const ktime_t earlier) return ktime_to_us(ktime_sub(later, earlier)); } +static inline s64 ktime_ms_delta(const ktime_t later, const ktime_t earlier) +{ + return ktime_to_ms(ktime_sub(later, earlier)); +} + static inline ktime_t ktime_add_us(const ktime_t kt, const u64 usec) { return ktime_add_ns(kt, usec * NSEC_PER_USEC); -- cgit v1.2.3 From 9bc7491906b4113b4c5ae442157c7dfc4e10cd14 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Jan 2015 21:24:10 +0100 Subject: hrtimer: Prevent stale expiry time in hrtimer_interrupt() hrtimer_interrupt() has the following subtle issue: hrtimer_interrupt() lock(cpu_base); expires_next = KTIME_MAX; expire_timers(CLOCK_MONOTONIC); expires = get_next_timer(CLOCK_MONOTONIC); if (expires < expires_next) expires_next = expires; expire_timers(CLOCK_REALTIME); unlock(cpu_base); wakeup() hrtimer_start(CLOCK_MONOTONIC, newtimer); lock(cpu_base); expires = get_next_timer(CLOCK_REALTIME); if (expires < expires_next) expires_next = expires; So because we already evaluated the next expiring timer of CLOCK_MONOTONIC we ignore that the expiry time of newtimer might be earlier than the overall next expiry time in hrtimer_interrupt(). To solve this, remove the caching of the next expiry value from hrtimer_interrupt() and reevaluate all active clock bases for the next expiry value. To avoid more code duplication, create a shared evaluation function and use it for hrtimer_get_next_event(), hrtimer_force_reprogram() and hrtimer_interrupt(). There is another subtlety in this mechanism: While hrtimer_interrupt() is running, we want to avoid touching the hardware device because we will reprogram it anyway at the end of hrtimer_interrupt(). This works nicely for hrtimers which get rearmed via the HRTIMER_RESTART mechanism, because we drop out when the callback on that CPU is running. But that fails if a new timer gets enqueued as in the example above. This has another implication: While hrtimer_interrupt() is running we refuse remote enqueueing of timers - see hrtimer_interrupt() and hrtimer_check_target(). hrtimer_interrupt() tries to prevent this by setting cpu_base->expires to KTIME_MAX, but that fails if a new timer gets queued. Prevent both the hardware access and the remote enqueue explicitly.
We can loosen the restriction on the remote enqueue now due to reevaluation of the next expiry value, but that needs a separate patch. Folded in a fix from Vignesh Radhakrishnan. Reported-and-tested-by: Stanislav Fomichev Based-on-patch-by: Stanislav Fomichev Signed-off-by: Thomas Gleixner Cc: vigneshr@codeaurora.org Cc: john.stultz@linaro.org Cc: viresh.kumar@linaro.org Cc: fweisbec@gmail.com Cc: cl@linux.com Cc: stuart.w.hayes@gmail.com Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1501202049190.5526@nanos Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 + kernel/time/hrtimer.c | 108 ++++++++++++++++++++++-------------------------- 2 files changed, 52 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index a036d058a249..05f6df1fdf5b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -170,6 +170,7 @@ enum hrtimer_base_type { * @clock_was_set: Indicates that clock was set from irq context. * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() + * @in_hrtirq: hrtimer_interrupt() is currently executing * @hres_active: State of high resolution mode * @hang_detected: The last hrtimer interrupt detected a hang * @nr_events: Total number of hrtimer interrupt events @@ -185,6 +186,7 @@ struct hrtimer_cpu_base { unsigned int clock_was_set; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; + int in_hrtirq; int hres_active; int hang_detected; unsigned long nr_events; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 37e50aadd471..b663653a5d5b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -440,6 +440,37 @@ static inline void debug_deactivate(struct hrtimer *timer) trace_hrtimer_cancel(timer); } +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) +{ + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; + int i; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct timerqueue_node *next; + struct hrtimer *timer; + + next = timerqueue_getnext(&base->active); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + } + /* + * clock_was_set() might have changed base->offset of any of + * the clock bases so the result might be negative. Fix it up + * to prevent a false positive in clockevents_program_event(). + */ + if (expires_next.tv64 < 0) + expires_next.tv64 = 0; + return expires_next; +} +#endif + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -488,32 +519,7 @@ static inline int hrtimer_hres_active(void) static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { - int i; - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t expires, expires_next; - - expires_next.tv64 = KTIME_MAX; - - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - timer = container_of(next, struct hrtimer, node); - - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - /* - * clock_was_set() has changed base->offset so the - * result might be negative.
Fix it up to prevent a - * false positive in clockevents_program_event() - */ - if (expires.tv64 < 0) - expires.tv64 = 0; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; - } + ktime_t expires_next = __hrtimer_get_next_event(cpu_base); if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) return; @@ -586,6 +592,15 @@ static int hrtimer_reprogram(struct hrtimer *timer, if (expires.tv64 >= cpu_base->expires_next.tv64) return 0; + /* + * When the target cpu of the timer is currently executing + * hrtimer_interrupt(), then we do not touch the clock event + * device. hrtimer_interrupt() will reevaluate all clock bases + * before reprogramming the device. + */ + if (cpu_base->in_hrtirq) + return 0; + /* * If a hang was detected in the last timer interrupt then we * do not schedule a timer which is earlier than the expiry @@ -1104,29 +1119,14 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); ktime_t hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; + ktime_t mindelta = { .tv64 = KTIME_MAX }; unsigned long flags; - int i; raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) { - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { - struct hrtimer *timer; - struct timerqueue_node *next; - - next = timerqueue_getnext(&base->active); - if (!next) - continue; - - timer = container_of(next, struct hrtimer, node); - delta.tv64 = hrtimer_get_expires_tv64(timer); - delta = ktime_sub(delta, base->get_time()); - if (delta.tv64 < mindelta.tv64) - mindelta.tv64 = delta.tv64; - } - } + if (!hrtimer_hres_active()) + mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), + ktime_get()); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1253,7 +1253,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) raw_spin_lock(&cpu_base->lock); entry_time = now = hrtimer_update_base(cpu_base); retry: - expires_next.tv64 = KTIME_MAX; + cpu_base->in_hrtirq = 1; /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via @@ -1291,28 +1291,20 @@ retry: * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ - - if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (expires.tv64 < 0) - expires.tv64 = KTIME_MAX; - if (expires.tv64 < expires_next.tv64) - expires_next = expires; + if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) break; - } __run_hrtimer(timer, &basenow); } } - + /* Reevaluate the clock bases for the next expiry */ + expires_next = __hrtimer_get_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; + cpu_base->in_hrtirq = 0; raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ -- cgit v1.2.3 From 3c5199143bc4b35f472c5c2534026d74821e2044 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 22 Jan 2015 08:19:32 -0500 Subject: sunrpc/lockd: fix references to the BKL The BKL is completely out of the picture in the lockd and sunrpc code these days. Update the antiquated comments that refer to it. Signed-off-by: Jeff Layton Signed-off-by: J. 
Bruce Fields --- fs/lockd/svclock.c | 4 ++-- include/linux/sunrpc/svc.h | 2 +- net/sunrpc/svc.c | 4 ++-- net/sunrpc/svc_xprt.c | 3 +-- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 56598742dde4..5581e020644b 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -57,8 +57,8 @@ static DEFINE_SPINLOCK(nlm_blocked_lock); static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) { /* - * We can get away with a static buffer because we're only - * called with BKL held. + * We can get away with a static buffer because this is only called + * from lockd, which is single-threaded. */ static char buf[2*NLM_MAXCOOKIELEN+1]; unsigned int i, len = sizeof(buf); diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 6f22cfeef5e3..fae6fb947fc8 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -110,7 +110,7 @@ struct svc_serv { * We use sv_nrthreads as a reference count. svc_destroy() drops * this refcount, so we need to bump it up around operations that * change the number of threads. Horrible, but there it is. - * Should be called with the BKL held. + * Should be called with the "service mutex" held. */ static inline void svc_get(struct svc_serv *serv) { diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 91eaef1844c8..78974e4d9ad2 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -768,8 +768,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) EXPORT_SYMBOL_GPL(svc_set_num_threads); /* - * Called from a server thread as it's exiting. Caller must hold the BKL or - * the "service mutex", whichever is appropriate for the service. + * Called from a server thread as it's exiting. Caller must hold the "service + * mutex" for the service. */ void svc_exit_thread(struct svc_rqst *rqstp) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index c69358b3cf7f..163ac45c3639 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -42,7 +42,7 @@ static LIST_HEAD(svc_xprt_class_list); * svc_pool->sp_lock protects most of the fields of that pool. * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. * when both need to be taken (rare), svc_serv->sv_lock is first. - * BKL protects svc_serv->sv_nrthread. + * The "service mutex" protects svc_serv->sv_nrthread. * svc_sock->sk_lock protects the svc_sock->sk_deferred list * and the ->sk_info_authunix cache. * @@ -67,7 +67,6 @@ static LIST_HEAD(svc_xprt_class_list); * that no other thread will be using the transport or will * try to set XPT_DEAD. */ - int svc_reg_xprt_class(struct svc_xprt_class *xcl) { struct svc_xprt_class *cl; -- cgit v1.2.3 From c5ed1df781cb544d4e4d189bb5b6ec7336d8888c Mon Sep 17 00:00:00 2001 From: RafaÅ‚ MiÅ‚ecki Date: Mon, 19 Jan 2015 08:30:30 +0100 Subject: bcma: use standard bus scanning during early register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Starting with kernel 3.19-rc1 early registration of bcma on MIPS is done a bit later, with memory allocator available. This allows us to simplify code by using standard bus scanning method. 
Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- drivers/bcma/bcma_private.h | 7 +---- drivers/bcma/host_soc.c | 2 +- drivers/bcma/main.c | 33 +++++++---------------- drivers/bcma/scan.c | 62 +++---------------------------------------- include/linux/bcma/bcma_soc.h | 2 -- 5 files changed, 15 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/drivers/bcma/bcma_private.h b/drivers/bcma/bcma_private.h index 59422b5fa46c..3f314c98d089 100644 --- a/drivers/bcma/bcma_private.h +++ b/drivers/bcma/bcma_private.h @@ -28,9 +28,7 @@ void bcma_prepare_core(struct bcma_bus *bus, struct bcma_device *core); void bcma_init_bus(struct bcma_bus *bus); int bcma_bus_register(struct bcma_bus *bus); void bcma_bus_unregister(struct bcma_bus *bus); -int __init bcma_bus_early_register(struct bcma_bus *bus, - struct bcma_device *core_cc, - struct bcma_device *core_mips); +int __init bcma_bus_early_register(struct bcma_bus *bus); #ifdef CONFIG_PM int bcma_bus_suspend(struct bcma_bus *bus); int bcma_bus_resume(struct bcma_bus *bus); @@ -39,9 +37,6 @@ int bcma_bus_resume(struct bcma_bus *bus); /* scan.c */ void bcma_detect_chip(struct bcma_bus *bus); int bcma_bus_scan(struct bcma_bus *bus); -int __init bcma_bus_scan_early(struct bcma_bus *bus, - struct bcma_device_id *match, - struct bcma_device *core); /* sprom.c */ int bcma_sprom_get(struct bcma_bus *bus); diff --git a/drivers/bcma/host_soc.c b/drivers/bcma/host_soc.c index 335cbcfd945b..2dce34789329 100644 --- a/drivers/bcma/host_soc.c +++ b/drivers/bcma/host_soc.c @@ -193,7 +193,7 @@ int __init bcma_host_soc_init(struct bcma_soc *soc) int err; /* Scan bus and initialize it */ - err = bcma_bus_early_register(bus, &soc->core_cc, &soc->core_mips); + err = bcma_bus_early_register(bus); if (err) iounmap(bus->mmio); diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c index c166d444fef6..c3c5e0a2d5be 100644 --- a/drivers/bcma/main.c +++ b/drivers/bcma/main.c @@ -489,35 +489,20 @@ void bcma_bus_unregister(struct bcma_bus *bus) kfree(cores[0]); } -int __init bcma_bus_early_register(struct bcma_bus *bus, - struct bcma_device *core_cc, - struct bcma_device *core_mips) +/* + * This is a special version of bus registration function designed for SoCs. + * It scans bus and performs basic initialization of main cores only. + * Please note it requires memory allocation, however it won't try to sleep. + */ +int __init bcma_bus_early_register(struct bcma_bus *bus) { int err; struct bcma_device *core; - struct bcma_device_id match; - - match.manuf = BCMA_MANUF_BCM; - match.id = bcma_cc_core_id(bus); - match.class = BCMA_CL_SIM; - match.rev = BCMA_ANY_REV; - - /* Scan for chip common core */ - err = bcma_bus_scan_early(bus, &match, core_cc); - if (err) { - bcma_err(bus, "Failed to scan for common core: %d\n", err); - return -1; - } - - match.manuf = BCMA_MANUF_MIPS; - match.id = BCMA_CORE_MIPS_74K; - match.class = BCMA_CL_SIM; - match.rev = BCMA_ANY_REV; - /* Scan for mips core */ - err = bcma_bus_scan_early(bus, &match, core_mips); + /* Scan for devices (cores) */ + err = bcma_bus_scan(bus); if (err) { - bcma_err(bus, "Failed to scan for mips core: %d\n", err); + bcma_err(bus, "Failed to scan bus: %d\n", err); return -1; } diff --git a/drivers/bcma/scan.c b/drivers/bcma/scan.c index 5328ee5b4df0..df806b9c5490 100644 --- a/drivers/bcma/scan.c +++ b/drivers/bcma/scan.c @@ -461,6 +461,10 @@ int bcma_bus_scan(struct bcma_bus *bus) int err, core_num = 0; + /* Skip if bus was already scanned (e.g.
during early register) */ + if (bus->nr_cores) + return 0; + erombase = bcma_scan_read32(bus, 0, BCMA_CC_EROM); if (bus->hosttype == BCMA_HOSTTYPE_SOC) { eromptr = ioremap_nocache(erombase, BCMA_CORE_SIZE); @@ -519,61 +523,3 @@ out: return err; } - -int __init bcma_bus_scan_early(struct bcma_bus *bus, - struct bcma_device_id *match, - struct bcma_device *core) -{ - u32 erombase; - u32 __iomem *eromptr, *eromend; - - int err = -ENODEV; - int core_num = 0; - - erombase = bcma_scan_read32(bus, 0, BCMA_CC_EROM); - if (bus->hosttype == BCMA_HOSTTYPE_SOC) { - eromptr = ioremap_nocache(erombase, BCMA_CORE_SIZE); - if (!eromptr) - return -ENOMEM; - } else { - eromptr = bus->mmio; - } - - eromend = eromptr + BCMA_CORE_SIZE / sizeof(u32); - - bcma_scan_switch_core(bus, erombase); - - while (eromptr < eromend) { - memset(core, 0, sizeof(*core)); - INIT_LIST_HEAD(&core->list); - core->bus = bus; - - err = bcma_get_next_core(bus, &eromptr, match, core_num, core); - if (err == -ENODEV) { - core_num++; - continue; - } else if (err == -ENXIO) - continue; - else if (err == -ESPIPE) - break; - else if (err < 0) - goto out; - - core->core_index = core_num++; - bus->nr_cores++; - bcma_info(bus, "Core %d found: %s (manuf 0x%03X, id 0x%03X, rev 0x%02X, class 0x%X)\n", - core->core_index, bcma_device_name(&core->id), - core->id.manuf, core->id.id, core->id.rev, - core->id.class); - - list_add_tail(&core->list, &bus->cores); - err = 0; - break; - } - -out: - if (bus->hosttype == BCMA_HOSTTYPE_SOC) - iounmap(eromptr); - - return err; -} diff --git a/include/linux/bcma/bcma_soc.h b/include/linux/bcma/bcma_soc.h index f24d245f8394..1b5fc0c3b1b5 100644 --- a/include/linux/bcma/bcma_soc.h +++ b/include/linux/bcma/bcma_soc.h @@ -5,8 +5,6 @@ struct bcma_soc { struct bcma_bus bus; - struct bcma_device core_cc; - struct bcma_device core_mips; }; int __init bcma_host_soc_register(struct bcma_soc *soc); -- cgit v1.2.3 From a9aaf2915ee265735c28b764551d084e61a694e0 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 13 Jan 2015 11:34:00 +0530 Subject: cpufreq: stats: get rid of per-cpu cpufreq_stats_table All CPUs sharing a cpufreq policy share stats too. For this reason, add a stats pointer to struct cpufreq_policy and drop per-CPU variable cpufreq_stats_table used for accessing cpufreq stats so as to reduce code complexity. Reviewed-by: Prarit Bhargava Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq_stats.c | 62 +++++++++++++++++++---------------------- include/linux/cpufreq.h | 3 ++ 2 files changed, 32 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 6c234f548601..3792b2e2f4a8 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -31,8 +31,6 @@ struct cpufreq_stats { #endif }; -static DEFINE_PER_CPU(struct cpufreq_stats *, cpufreq_stats_table); - static int cpufreq_stats_update(struct cpufreq_stats *stat) { unsigned long long cur_time = get_jiffies_64(); @@ -48,20 +46,15 @@ static int cpufreq_stats_update(struct cpufreq_stats *stat) static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) { - struct cpufreq_stats *stat = per_cpu(cpufreq_stats_table, policy->cpu); - if (!stat) - return 0; - return sprintf(buf, "%d\n", - per_cpu(cpufreq_stats_table, stat->cpu)->total_trans); + return sprintf(buf, "%d\n", policy->stats->total_trans); } static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) { + struct cpufreq_stats *stat = policy->stats; ssize_t len = 0; int i; - struct cpufreq_stats *stat = per_cpu(cpufreq_stats_table, policy->cpu); - if (!stat) - return 0; + cpufreq_stats_update(stat); for (i = 0; i < stat->state_num; i++) { len += sprintf(buf + len, "%u %llu\n", stat->freq_table[i], @@ -74,12 +67,10 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) #ifdef CONFIG_CPU_FREQ_STAT_DETAILS static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { + struct cpufreq_stats *stat = policy->stats; ssize_t len = 0; int i, j; - struct cpufreq_stats *stat = per_cpu(cpufreq_stats_table, policy->cpu); - if (!stat) - return 0; cpufreq_stats_update(stat); len += snprintf(buf + len, PAGE_SIZE - len, " From : To\n"); len += snprintf(buf + len, PAGE_SIZE - len, " : "); @@ -145,8 +136,9 @@ static int freq_table_get_index(struct cpufreq_stats *stat, unsigned int freq) static void __cpufreq_stats_free_table(struct cpufreq_policy *policy) { - struct cpufreq_stats *stat = per_cpu(cpufreq_stats_table, policy->cpu); + struct cpufreq_stats *stat = policy->stats; + /* Already freed */ if (!stat) return; @@ -155,7 +147,7 @@ static void __cpufreq_stats_free_table(struct cpufreq_policy *policy) sysfs_remove_group(&policy->kobj, &stats_attr_group); kfree(stat->time_in_state); kfree(stat); - per_cpu(cpufreq_stats_table, policy->cpu) = NULL; + policy->stats = NULL; } static void cpufreq_stats_free_table(unsigned int cpu) @@ -184,7 +176,7 @@ static int __cpufreq_stats_create_table(struct cpufreq_policy *policy) return 0; /* stats already initialized */ - if (per_cpu(cpufreq_stats_table, cpu)) + if (policy->stats) return -EEXIST; stat = kzalloc(sizeof(*stat), GFP_KERNEL); @@ -196,7 +188,7 @@ static int __cpufreq_stats_create_table(struct cpufreq_policy *policy) goto error_out; stat->cpu = cpu; - per_cpu(cpufreq_stats_table, cpu) = stat; + policy->stats = stat; cpufreq_for_each_valid_entry(pos, table) count++; @@ -231,7 +223,7 @@ error_alloc: sysfs_remove_group(&policy->kobj, &stats_attr_group); error_out: kfree(stat); - per_cpu(cpufreq_stats_table, cpu) = NULL; + policy->stats = NULL; return ret; } @@ -254,15 +246,7 @@ static void cpufreq_stats_create_table(unsigned int cpu) static void cpufreq_stats_update_policy_cpu(struct cpufreq_policy *policy) { - struct cpufreq_stats *stat = per_cpu(cpufreq_stats_table, - policy->last_cpu); - - pr_debug("Updating stats_table for new_cpu %u 
from last_cpu %u\n", - policy->cpu, policy->last_cpu); - per_cpu(cpufreq_stats_table, policy->cpu) = per_cpu(cpufreq_stats_table, - policy->last_cpu); - per_cpu(cpufreq_stats_table, policy->last_cpu) = NULL; - stat->cpu = policy->cpu; + policy->stats->cpu = policy->cpu; } static int cpufreq_stat_notifier_policy(struct notifier_block *nb, @@ -288,27 +272,36 @@ static int cpufreq_stat_notifier_trans(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; + struct cpufreq_policy *policy = cpufreq_cpu_get(freq->cpu); struct cpufreq_stats *stat; int old_index, new_index; - if (val != CPUFREQ_POSTCHANGE) + if (!policy) { + pr_err("%s: No policy found\n", __func__); return 0; + } - stat = per_cpu(cpufreq_stats_table, freq->cpu); - if (!stat) - return 0; + if (val != CPUFREQ_POSTCHANGE) + goto put_policy; + + if (!policy->stats) { + pr_debug("%s: No stats found\n", __func__); + goto put_policy; + } + + stat = policy->stats; old_index = stat->last_index; new_index = freq_table_get_index(stat, freq->new); /* We can't do stat->time_in_state[-1]= .. */ if (old_index == -1 || new_index == -1) - return 0; + goto put_policy; cpufreq_stats_update(stat); if (old_index == new_index) - return 0; + goto put_policy; spin_lock(&cpufreq_stats_lock); stat->last_index = new_index; @@ -317,6 +310,9 @@ static int cpufreq_stat_notifier_trans(struct notifier_block *nb, #endif stat->total_trans++; spin_unlock(&cpufreq_stats_lock); + +put_policy: + cpufreq_cpu_put(policy); return 0; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 4d078cebafd2..60b7b496565d 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -113,6 +113,9 @@ struct cpufreq_policy { wait_queue_head_t transition_wait; struct task_struct *transition_task; /* Task which is doing the transition */ + /* cpufreq-stats */ + struct cpufreq_stats *stats; + /* For cpufreq driver's internal use */ void *driver_data; }; -- cgit v1.2.3 From 7c418ff099110d987846c8c670479a3b90ed1dcb Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 6 Jan 2015 21:09:08 +0530 Subject: cpufreq: Remove (now) unused 'last_cpu' from struct cpufreq_policy 'last_cpu' was used only from cpufreq-stats and isn't used anymore. Get rid of it. Reviewed-by: Prarit Bhargava Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq.c | 3 --- include/linux/cpufreq.h | 2 -- 2 files changed, 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 70b568a253e9..60ef37d569c5 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1086,10 +1086,7 @@ static int update_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu, } down_write(&policy->rwsem); - - policy->last_cpu = policy->cpu; policy->cpu = cpu; - up_write(&policy->rwsem); blocking_notifier_call_chain(&cpufreq_policy_notifier_list, diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 60b7b496565d..7e1a389b4e92 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -66,8 +66,6 @@ struct cpufreq_policy { unsigned int shared_type; /* ACPI: ANY or ALL affected CPUs should set cpufreq */ unsigned int cpu; /* cpu nr of CPU managing this policy */ - unsigned int last_cpu; /* cpu nr of previous CPU that managed - * this policy */ struct clk *clk; struct cpufreq_cpuinfo cpuinfo;/* see above */ -- cgit v1.2.3 From d9f354460db8b58a8395936d323b4ca6e8428b9d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 6 Jan 2015 21:09:10 +0530 Subject: cpufreq: remove CPUFREQ_UPDATE_POLICY_CPU notifications CPUFREQ_UPDATE_POLICY_CPU notifications were used only by cpufreq-stats, which doesn't use them anymore. Remove them. This also decrements the values of the other notification macros defined after CPUFREQ_UPDATE_POLICY_CPU by 1 to remove gaps. Hopefully all users are using the macros instead of direct numbers, so they won't break as the macro values change now. Reviewed-by: Prarit Bhargava Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 3 --- include/linux/cpufreq.h | 5 ++--- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 60ef37d569c5..ca69f42b8e1e 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1089,9 +1089,6 @@ static int update_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu, policy->cpu = cpu; up_write(&policy->rwsem); - blocking_notifier_call_chain(&cpufreq_policy_notifier_list, - CPUFREQ_UPDATE_POLICY_CPU, policy); - return 0; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 7e1a389b4e92..2ee4888c1f47 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -368,9 +368,8 @@ static inline void cpufreq_resume(void) {} #define CPUFREQ_INCOMPATIBLE (1) #define CPUFREQ_NOTIFY (2) #define CPUFREQ_START (3) -#define CPUFREQ_UPDATE_POLICY_CPU (4) -#define CPUFREQ_CREATE_POLICY (5) -#define CPUFREQ_REMOVE_POLICY (6) +#define CPUFREQ_CREATE_POLICY (4) +#define CPUFREQ_REMOVE_POLICY (5) #ifdef CONFIG_CPU_FREQ int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list); -- cgit v1.2.3 From 382548a62ade2c003c77a1055b6eb2a47ce30084 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 20 Jan 2015 11:33:09 +0100 Subject: PM / Domains: Remove pm_genpd_dev_need_restore() API There are currently no users of this API, so let's remove it. Additionally, if such a feature is needed in the future, a better option is likely to use pm_runtime_set_active|suspended() in some form. Signed-off-by: Ulf Hansson Acked-by: Geert Uytterhoeven Acked-by: Kevin Hilman Signed-off-by: Rafael J.
Wysocki --- drivers/base/power/domain.c | 20 -------------------- include/linux/pm_domain.h | 2 -- 2 files changed, 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 0d8780c04a5e..c5280f2b798b 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1558,26 +1558,6 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, return ret; } -/** - * pm_genpd_dev_need_restore - Set/unset the device's "need restore" flag. - * @dev: Device to set/unset the flag for. - * @val: The new value of the device's "need restore" flag. - */ -void pm_genpd_dev_need_restore(struct device *dev, bool val) -{ - struct pm_subsys_data *psd; - unsigned long flags; - - spin_lock_irqsave(&dev->power.lock, flags); - - psd = dev_to_psd(dev); - if (psd && psd->domain_data) - to_gpd_data(psd->domain_data)->need_restore = val ? 1 : 0; - - spin_unlock_irqrestore(&dev->power.lock, flags); -} -EXPORT_SYMBOL_GPL(pm_genpd_dev_need_restore); - /** * pm_genpd_add_subdomain - Add a subdomain to an I/O PM domain. * @genpd: Master PM domain to add the subdomain to. diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index a9edab2c787a..ed607760fc20 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -140,7 +140,6 @@ extern int __pm_genpd_name_add_device(const char *domain_name, extern int pm_genpd_remove_device(struct generic_pm_domain *genpd, struct device *dev); -extern void pm_genpd_dev_need_restore(struct device *dev, bool val); extern int pm_genpd_add_subdomain(struct generic_pm_domain *genpd, struct generic_pm_domain *new_subdomain); extern int pm_genpd_add_subdomain_names(const char *master_name, @@ -187,7 +186,6 @@ static inline int pm_genpd_remove_device(struct generic_pm_domain *genpd, { return -ENOSYS; } -static inline void pm_genpd_dev_need_restore(struct device *dev, bool val) {} static inline int pm_genpd_add_subdomain(struct generic_pm_domain *genpd, struct generic_pm_domain *new_sd) { -- cgit v1.2.3 From 8b618628b2bf83512fc8df5e8672619d65adfdfb Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 3 Dec 2014 14:43:06 -0500 Subject: ktime: Optimize ktime_divns for constant divisors At least on ARM, do_div() is optimized to turn constant divisors into an inline multiplication by the reciprocal value at compile time. However, this optimization is missed entirely whenever ktime_divns() is used and the slow out-of-line division code is used all the time. Let ktime_divns() use do_div() inline whenever the divisor is constant and small enough. This will make things like ktime_to_us() and ktime_to_ms() much faster.
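To make the win concrete, consider a caller with a constant divisor; this sketch mirrors the ktime_to_us() of that era (shown for illustration only):

	/*
	 * NSEC_PER_USEC is a compile-time constant (1000) that fits in
	 * 32 bits, so __builtin_constant_p(div) is true inside the new
	 * inline ktime_divns() and do_div() can be compiled into a
	 * multiply by the reciprocal instead of a call to the slow
	 * out-of-line __ktime_divns() loop.
	 */
	static inline s64 ktime_to_us(const ktime_t kt)
	{
		return ktime_divns(kt, NSEC_PER_USEC);
	}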
Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Nicolas Pitre Acked-by: Arnd Bergmann Signed-off-by: Nicolas Pitre Signed-off-by: John Stultz --- include/linux/ktime.h | 12 +++++++++++- kernel/time/hrtimer.c | 4 ++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index c9d645ad98ff..411dd8bfe539 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -166,7 +166,17 @@ static inline bool ktime_before(const ktime_t cmp1, const ktime_t cmp2) } #if BITS_PER_LONG < 64 -extern u64 ktime_divns(const ktime_t kt, s64 div); +extern u64 __ktime_divns(const ktime_t kt, s64 div); +static inline u64 ktime_divns(const ktime_t kt, s64 div) +{ + if (__builtin_constant_p(div) && !(div >> 32)) { + u64 ns = kt.tv64; + do_div(ns, div); + return ns; + } else { + return __ktime_divns(kt, div); + } +} #else /* BITS_PER_LONG < 64 */ # define ktime_divns(kt, div) (u64)((kt).tv64 / (div)) #endif diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 37e50aadd471..890535c41c2d 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -266,7 +266,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) /* * Divide a ktime value by a nanosecond value */ -u64 ktime_divns(const ktime_t kt, s64 div) +u64 __ktime_divns(const ktime_t kt, s64 div) { u64 dclc; int sft = 0; @@ -282,7 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div) return dclc; } -EXPORT_SYMBOL_GPL(ktime_divns); +EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* -- cgit v1.2.3 From d08c0cdd26d48751c15aa2b4479a410594fee9ac Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 8 Dec 2014 12:00:09 -0800 Subject: time: Expose getboottime64 for in-kernel uses Adds a timespec64 based getboottime64() implementation that can be used as we convert internal users of getboottime away from using timespecs. Cc: pang.xunlei Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: John Stultz --- include/linux/timekeeping.h | 16 ++++++++++++++-- kernel/time/timekeeping.c | 12 ++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 9b63d13ba82b..91480137aa39 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -33,6 +33,7 @@ extern time64_t ktime_get_real_seconds(void); extern int __getnstimeofday64(struct timespec64 *tv); extern void getnstimeofday64(struct timespec64 *tv); +extern void getboottime64(struct timespec64 *ts); #if BITS_PER_LONG == 64 /** @@ -72,6 +73,11 @@ static inline struct timespec get_monotonic_coarse(void) { return get_monotonic_coarse64(); } + +static inline void getboottime(struct timespec *ts) +{ + return getboottime64(ts); +} #else /** * Deprecated. Use do_settimeofday64(). 
@@ -129,9 +135,15 @@ static inline struct timespec get_monotonic_coarse(void) { return timespec64_to_timespec(get_monotonic_coarse64()); } -#endif -extern void getboottime(struct timespec *ts); +static inline void getboottime(struct timespec *ts) +{ + struct timespec64 ts64; + + getboottime64(&ts64); + *ts = timespec64_to_timespec(ts64); +} +#endif #define do_posix_clock_monotonic_gettime(ts) ktime_get_ts(ts) #define ktime_get_real_ts64(ts) getnstimeofday64(ts) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6a931852082f..b124af259800 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1659,24 +1659,24 @@ out: } /** - * getboottime - Return the real time of system boot. - * @ts: pointer to the timespec to be set + * getboottime64 - Return the real time of system boot. + * @ts: pointer to the timespec64 to be set * - * Returns the wall-time of boot in a timespec. + * Returns the wall-time of boot in a timespec64. * * This is based on the wall_to_monotonic offset and the total suspend * time. Calls to settimeofday will affect the value returned (which * basically means that however wrong your real time clock is at boot time, * you get the right time here). */ -void getboottime(struct timespec *ts) +void getboottime64(struct timespec64 *ts) { struct timekeeper *tk = &tk_core.timekeeper; ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); - *ts = ktime_to_timespec(t); + *ts = ktime_to_timespec64(t); } -EXPORT_SYMBOL_GPL(getboottime); +EXPORT_SYMBOL_GPL(getboottime64); unsigned long get_seconds(void) { -- cgit v1.2.3 From 2e0c78ee5ba4d777ecf22c8f40cc968b4308ca88 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 18 Dec 2014 18:04:34 -0800 Subject: time: Expose get_monotonic_boottime64 for in-kernel use As part of the 2038 conversion process, add a get_monotonic_boottime64 accessor so we can deprecate get_monotonic_boottime. Cc: pang.xunlei Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: John Stultz --- include/linux/timekeeping.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 91480137aa39..3eaae4754275 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -229,6 +229,11 @@ static inline void get_monotonic_boottime(struct timespec *ts) { *ts = ktime_to_timespec(ktime_get_boottime()); } +static inline void get_monotonic_boottime64(struct timespec64 *ts) +{ + *ts = ktime_to_timespec64(ktime_get_boottime()); +} + static inline void timekeeping_clocktai(struct timespec *ts) { *ts = ktime_to_timespec(ktime_get_clocktai()); -- cgit v1.2.3 From 9a4a445e30f0b601ca2d9433274047cbf48ebf9e Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Thu, 22 Jan 2015 02:31:55 +0000 Subject: rtc: Convert rtc_set_ntp_time() to use timespec64 rtc_set_ntp_time() uses timespec, which is y2038-unsafe, so modify it to use timespec64, which is y2038-safe, and replace rtc_time_to_tm() with rtc_time64_to_tm(). Also adjust all its call sites (only NTP uses it) accordingly.
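The underlying issue is the width of the seconds field; roughly, per the time.h/time64.h definitions of that era (reproduced here for reference only; on 64-bit builds timespec64 is simply an alias for timespec):

	/* On 32-bit systems __kernel_time_t is a 32-bit long, so
	 * tv_sec overflows in January 2038: */
	struct timespec {
		__kernel_time_t	tv_sec;		/* seconds */
		long		tv_nsec;	/* nanoseconds */
	};

	/* timespec64 keeps the seconds in 64 bits on every architecture,
	 * which is what makes converted interfaces like
	 * rtc_set_ntp_time() y2038-safe: */
	struct timespec64 {
		time64_t	tv_sec;		/* seconds */
		long		tv_nsec;	/* nanoseconds */
	};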
Cc: pang.xunlei Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Xunlei Pang Signed-off-by: John Stultz --- drivers/rtc/systohc.c | 6 +++--- include/linux/rtc.h | 2 +- kernel/time/ntp.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/rtc/systohc.c b/drivers/rtc/systohc.c index bf3e242ccc5c..eb71872d0361 100644 --- a/drivers/rtc/systohc.c +++ b/drivers/rtc/systohc.c @@ -20,16 +20,16 @@ * * If temporary failure is indicated the caller should try again 'soon' */ -int rtc_set_ntp_time(struct timespec now) +int rtc_set_ntp_time(struct timespec64 now) { struct rtc_device *rtc; struct rtc_time tm; int err = -ENODEV; if (now.tv_nsec < (NSEC_PER_SEC >> 1)) - rtc_time_to_tm(now.tv_sec, &tm); + rtc_time64_to_tm(now.tv_sec, &tm); else - rtc_time_to_tm(now.tv_sec + 1, &tm); + rtc_time64_to_tm(now.tv_sec + 1, &tm); rtc = rtc_class_open(CONFIG_RTC_HCTOSYS_DEVICE); if (rtc) { diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 6d6be09a2fe5..dcad7ee0d746 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -161,7 +161,7 @@ extern void devm_rtc_device_unregister(struct device *dev, extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_mmss(struct rtc_device *rtc, unsigned long secs); -extern int rtc_set_ntp_time(struct timespec now); +extern int rtc_set_ntp_time(struct timespec64 now); int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm); extern int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 87a346fd6d61..183dfe2191c6 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -488,13 +488,13 @@ static void sync_cmos_clock(struct work_struct *work) getnstimeofday64(&now); if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { - struct timespec adjust = timespec64_to_timespec(now); + struct timespec64 adjust = now; fail = -ENODEV; if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); #ifdef CONFIG_GENERIC_CMOS_UPDATE - fail = update_persistent_clock(adjust); + fail = update_persistent_clock(timespec64_to_timespec(adjust)); #endif #ifdef CONFIG_RTC_SYSTOHC if (fail == -ENODEV) -- cgit v1.2.3 From 872bf2fb69d90e3619befee842fc26db39d8e475 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:35 +0200 Subject: net/mlx4_core: Maintain a persistent memory for mlx4 device Maintain a persistent memory area that survives the reset flow and PCI errors. This comes as preparation for an upcoming series adding support for those flows. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S.
Miller --- drivers/infiniband/hw/mlx4/alias_GUID.c | 2 +- drivers/infiniband/hw/mlx4/mad.c | 3 +- drivers/infiniband/hw/mlx4/main.c | 17 ++-- drivers/infiniband/hw/mlx4/mr.c | 6 +- drivers/infiniband/hw/mlx4/sysfs.c | 6 +- drivers/net/ethernet/mellanox/mlx4/alloc.c | 15 +-- drivers/net/ethernet/mellanox/mlx4/catas.c | 13 +-- drivers/net/ethernet/mellanox/mlx4/cmd.c | 46 +++++---- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 4 +- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_main.c | 4 +- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 4 +- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 4 +- drivers/net/ethernet/mellanox/mlx4/eq.c | 42 ++++---- drivers/net/ethernet/mellanox/mlx4/icm.c | 11 ++- drivers/net/ethernet/mellanox/mlx4/main.c | 106 +++++++++++++-------- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 9 +- drivers/net/ethernet/mellanox/mlx4/mr.c | 8 +- drivers/net/ethernet/mellanox/mlx4/pd.c | 6 +- drivers/net/ethernet/mellanox/mlx4/port.c | 17 ++-- drivers/net/ethernet/mellanox/mlx4/reset.c | 23 +++-- .../net/ethernet/mellanox/mlx4/resource_tracker.c | 36 ++++--- include/linux/mlx4/device.h | 11 ++- 24 files changed, 234 insertions(+), 163 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 0eb141c41416..a31e031afd87 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -154,7 +154,7 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, continue; slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i ; - if (slave_id >= dev->dev->num_vfs + 1) + if (slave_id >= dev->dev->persist->num_vfs + 1) return; tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE]; form_cache_ag = get_cached_alias_guid(dev, port_num, diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 82a7dd87089b..c7619716c31d 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -1951,7 +1951,8 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, ctx->ib_dev = &dev->ib_dev; for (i = 0; - i < min(dev->dev->caps.sqp_demux, (u16)(dev->dev->num_vfs + 1)); + i < min(dev->dev->caps.sqp_demux, + (u16)(dev->dev->persist->num_vfs + 1)); i++) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev->dev, i); diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 57ecc5b204f3..b4fa6f658800 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -198,7 +198,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; - props->vendor_part_id = dev->dev->pdev->device; + props->vendor_part_id = dev->dev->persist->pdev->device; props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); @@ -1375,7 +1375,7 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr, { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); - return sprintf(buf, "MT%d\n", dev->dev->pdev->device); + return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device); } static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, @@ -1937,7 +1937,8 @@ static void init_pkeys(struct mlx4_ib_dev *ibdev) int i; if (mlx4_is_master(ibdev->dev)) { - for (slave = 0; slave <= 
ibdev->dev->num_vfs; ++slave) { + for (slave = 0; slave <= ibdev->dev->persist->num_vfs; + ++slave) { for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; @@ -1994,7 +1995,7 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { for (j = 0; j < eq_per_port; j++) { snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%s", - i, j, dev->pdev->bus->name); + i, j, dev->persist->pdev->bus->name); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(dev, name, NULL, &ibdev->eq_table[eq])) { @@ -2058,7 +2059,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); if (!ibdev) { - dev_err(&dev->pdev->dev, "Device struct alloc failed\n"); + dev_err(&dev->persist->pdev->dev, + "Device struct alloc failed\n"); return NULL; } @@ -2085,7 +2087,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->num_ports = num_ports; ibdev->ib_dev.phys_port_cnt = ibdev->num_ports; ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; - ibdev->ib_dev.dma_device = &dev->pdev->dev; + ibdev->ib_dev.dma_device = &dev->persist->pdev->dev; if (dev->caps.userspace_caps) ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; @@ -2236,7 +2238,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) sizeof(long), GFP_KERNEL); if (!ibdev->ib_uc_qpns_bitmap) { - dev_err(&dev->pdev->dev, "bit map alloc failed\n"); + dev_err(&dev->persist->pdev->dev, + "bit map alloc failed\n"); goto err_steer_qp_release; } diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index c36ccbd9a644..e0d271782d0a 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -401,7 +401,8 @@ struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device if (!mfrpl->ibfrpl.page_list) goto err_free; - mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->pdev->dev, + mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->persist-> + pdev->dev, size, &mfrpl->map, GFP_KERNEL); if (!mfrpl->mapped_page_list) @@ -423,7 +424,8 @@ void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list) struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list); int size = page_list->max_page_list_len * sizeof (u64); - dma_free_coherent(&dev->dev->pdev->dev, size, mfrpl->mapped_page_list, + dma_free_coherent(&dev->dev->persist->pdev->dev, size, + mfrpl->mapped_page_list, mfrpl->map); kfree(mfrpl->ibfrpl.page_list); kfree(mfrpl); diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c index cb4c66e723b5..d10c2b8a5dad 100644 --- a/drivers/infiniband/hw/mlx4/sysfs.c +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -375,7 +375,7 @@ static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max) char base_name[9]; /* pci_name format is: bus:dev:func -> xxxx:yy:zz.n */ - strlcpy(name, pci_name(dev->dev->pdev), max); + strlcpy(name, pci_name(dev->dev->persist->pdev), max); strncpy(base_name, name, 8); /*till xxxx:yy:*/ base_name[8] = '\0'; /* with no ARI only 3 last bits are used so when the fn is higher than 8 @@ -792,7 +792,7 @@ static int register_pkey_tree(struct mlx4_ib_dev *device) if (!mlx4_is_master(device->dev)) return 0; - for (i = 0; i <= device->dev->num_vfs; ++i) + for (i = 0; i <= device->dev->persist->num_vfs; ++i) register_one_pkey_tree(device, i); return 0; @@ -807,7 +807,7 @@ static void unregister_pkey_tree(struct mlx4_ib_dev 
*device) if (!mlx4_is_master(device->dev)) return; - for (slave = device->dev->num_vfs; slave >= 0; --slave) { + for (slave = device->dev->persist->num_vfs; slave >= 0; --slave) { list_for_each_entry_safe(p, t, &device->pkeys.pkey_port_list[slave], entry) { diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c index 963dd7e6d547..a716c26e0d99 100644 --- a/drivers/net/ethernet/mellanox/mlx4/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c @@ -592,7 +592,7 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, buf->nbufs = 1; buf->npages = 1; buf->page_shift = get_order(size) + PAGE_SHIFT; - buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev, + buf->direct.buf = dma_alloc_coherent(&dev->persist->pdev->dev, size, &t, gfp); if (!buf->direct.buf) return -ENOMEM; @@ -619,7 +619,8 @@ int mlx4_buf_alloc(struct mlx4_dev *dev, int size, int max_direct, for (i = 0; i < buf->nbufs; ++i) { buf->page_list[i].buf = - dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE, + dma_alloc_coherent(&dev->persist->pdev->dev, + PAGE_SIZE, &t, gfp); if (!buf->page_list[i].buf) goto err_free; @@ -657,7 +658,8 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) int i; if (buf->nbufs == 1) - dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf, + dma_free_coherent(&dev->persist->pdev->dev, size, + buf->direct.buf, buf->direct.map); else { if (BITS_PER_LONG == 64 && buf->direct.buf) @@ -665,7 +667,8 @@ void mlx4_buf_free(struct mlx4_dev *dev, int size, struct mlx4_buf *buf) for (i = 0; i < buf->nbufs; ++i) if (buf->page_list[i].buf) - dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + dma_free_coherent(&dev->persist->pdev->dev, + PAGE_SIZE, buf->page_list[i].buf, buf->page_list[i].map); kfree(buf->page_list); @@ -738,7 +741,7 @@ int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order, gfp_t gfp if (!mlx4_alloc_db_from_pgdir(pgdir, db, order)) goto out; - pgdir = mlx4_alloc_db_pgdir(&(dev->pdev->dev), gfp); + pgdir = mlx4_alloc_db_pgdir(&dev->persist->pdev->dev, gfp); if (!pgdir) { ret = -ENOMEM; goto out; @@ -775,7 +778,7 @@ void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db) set_bit(i, db->u.pgdir->bits[o]); if (bitmap_full(db->u.pgdir->order1, MLX4_DB_PER_PAGE / 2)) { - dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, db->u.pgdir->db_page, db->u.pgdir->db_dma); list_del(&db->u.pgdir->list); kfree(db->u.pgdir); diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 9c656fe4983d..1a102c9bac99 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -70,7 +70,7 @@ static void poll_catas(unsigned long dev_ptr) if (readl(priv->catas_err.map)) { /* If the device is off-line, we cannot try to recover it */ - if (pci_channel_offline(dev->pdev)) + if (pci_channel_offline(dev->persist->pdev)) mod_timer(&priv->catas_err.timer, round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); else { @@ -94,6 +94,7 @@ static void catas_reset(struct work_struct *work) { struct mlx4_priv *priv, *tmppriv; struct mlx4_dev *dev; + struct mlx4_dev_persistent *persist; LIST_HEAD(tlist); int ret; @@ -103,20 +104,20 @@ static void catas_reset(struct work_struct *work) spin_unlock_irq(&catas_lock); list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) { - struct pci_dev *pdev = priv->dev.pdev; + struct pci_dev *pdev = priv->dev.persist->pdev; /* If the device is off-line, we 
cannot reset it */ if (pci_channel_offline(pdev)) continue; - ret = mlx4_restart_one(priv->dev.pdev); + ret = mlx4_restart_one(priv->dev.persist->pdev); /* 'priv' now is not valid */ if (ret) pr_err("mlx4 %s: Reset failed (%d)\n", pci_name(pdev), ret); else { - dev = pci_get_drvdata(pdev); - mlx4_dbg(dev, "Reset succeeded\n"); + persist = pci_get_drvdata(pdev); + mlx4_dbg(persist->dev, "Reset succeeded\n"); } } } @@ -134,7 +135,7 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev) init_timer(&priv->catas_err.timer); priv->catas_err.map = NULL; - addr = pci_resource_start(dev->pdev, priv->fw.catas_bar) + + addr = pci_resource_start(dev->persist->pdev, priv->fw.catas_bar) + priv->fw.catas_offset; priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 5c93d1451c44..7cd90e6a4272 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -307,7 +307,7 @@ static int cmd_pending(struct mlx4_dev *dev) { u32 status; - if (pci_channel_offline(dev->pdev)) + if (pci_channel_offline(dev->persist->pdev)) return -EIO; status = readl(mlx4_priv(dev)->cmd.hcr + HCR_STATUS_OFFSET); @@ -328,7 +328,7 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, mutex_lock(&cmd->hcr_mutex); - if (pci_channel_offline(dev->pdev)) { + if (pci_channel_offline(dev->persist->pdev)) { /* * Device is going through error recovery * and cannot accept commands. @@ -342,7 +342,7 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, end += msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS); while (cmd_pending(dev)) { - if (pci_channel_offline(dev->pdev)) { + if (pci_channel_offline(dev->persist->pdev)) { /* * Device is going through error recovery * and cannot accept commands. @@ -464,7 +464,7 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, down(&priv->cmd.poll_sem); - if (pci_channel_offline(dev->pdev)) { + if (pci_channel_offline(dev->persist->pdev)) { /* * Device is going through error recovery * and cannot accept commands. @@ -487,7 +487,7 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, end = msecs_to_jiffies(timeout) + jiffies; while (cmd_pending(dev) && time_before(jiffies, end)) { - if (pci_channel_offline(dev->pdev)) { + if (pci_channel_offline(dev->persist->pdev)) { /* * Device is going through error recovery * and cannot accept commands. 
@@ -612,7 +612,7 @@ int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, int out_is_imm, u32 in_modifier, u8 op_modifier, u16 op, unsigned long timeout, int native) { - if (pci_channel_offline(dev->pdev)) + if (pci_channel_offline(dev->persist->pdev)) return -EIO; if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) { @@ -1997,11 +1997,12 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) if (mlx4_is_master(dev)) priv->mfunc.comm = - ioremap(pci_resource_start(dev->pdev, priv->fw.comm_bar) + + ioremap(pci_resource_start(dev->persist->pdev, + priv->fw.comm_bar) + priv->fw.comm_base, MLX4_COMM_PAGESIZE); else priv->mfunc.comm = - ioremap(pci_resource_start(dev->pdev, 2) + + ioremap(pci_resource_start(dev->persist->pdev, 2) + MLX4_SLAVE_COMM_BASE, MLX4_COMM_PAGESIZE); if (!priv->mfunc.comm) { mlx4_err(dev, "Couldn't map communication vector\n"); @@ -2107,9 +2108,9 @@ err_comm_admin: err_comm: iounmap(priv->mfunc.comm); err_vhcr: - dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, - priv->mfunc.vhcr, - priv->mfunc.vhcr_dma); + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, + priv->mfunc.vhcr, + priv->mfunc.vhcr_dma); priv->mfunc.vhcr = NULL; return -ENOMEM; } @@ -2130,8 +2131,8 @@ int mlx4_cmd_init(struct mlx4_dev *dev) } if (!mlx4_is_slave(dev) && !priv->cmd.hcr) { - priv->cmd.hcr = ioremap(pci_resource_start(dev->pdev, 0) + - MLX4_HCR_BASE, MLX4_HCR_SIZE); + priv->cmd.hcr = ioremap(pci_resource_start(dev->persist->pdev, + 0) + MLX4_HCR_BASE, MLX4_HCR_SIZE); if (!priv->cmd.hcr) { mlx4_err(dev, "Couldn't map command register\n"); goto err; @@ -2140,7 +2141,8 @@ int mlx4_cmd_init(struct mlx4_dev *dev) } if (mlx4_is_mfunc(dev) && !priv->mfunc.vhcr) { - priv->mfunc.vhcr = dma_alloc_coherent(&(dev->pdev->dev), PAGE_SIZE, + priv->mfunc.vhcr = dma_alloc_coherent(&dev->persist->pdev->dev, + PAGE_SIZE, &priv->mfunc.vhcr_dma, GFP_KERNEL); if (!priv->mfunc.vhcr) @@ -2150,7 +2152,8 @@ int mlx4_cmd_init(struct mlx4_dev *dev) } if (!priv->cmd.pool) { - priv->cmd.pool = pci_pool_create("mlx4_cmd", dev->pdev, + priv->cmd.pool = pci_pool_create("mlx4_cmd", + dev->persist->pdev, MLX4_MAILBOX_SIZE, MLX4_MAILBOX_SIZE, 0); if (!priv->cmd.pool) @@ -2202,7 +2205,7 @@ void mlx4_cmd_cleanup(struct mlx4_dev *dev, int cleanup_mask) } if (mlx4_is_mfunc(dev) && priv->mfunc.vhcr && (cleanup_mask & MLX4_CMD_CLEANUP_VHCR)) { - dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE, + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, priv->mfunc.vhcr, priv->mfunc.vhcr_dma); priv->mfunc.vhcr = NULL; } @@ -2306,8 +2309,9 @@ u32 mlx4_comm_get_version(void) static int mlx4_get_slave_indx(struct mlx4_dev *dev, int vf) { - if ((vf < 0) || (vf >= dev->num_vfs)) { - mlx4_err(dev, "Bad vf number:%d (number of activated vf: %d)\n", vf, dev->num_vfs); + if ((vf < 0) || (vf >= dev->persist->num_vfs)) { + mlx4_err(dev, "Bad vf number:%d (number of activated vf: %d)\n", + vf, dev->persist->num_vfs); return -EINVAL; } @@ -2316,7 +2320,7 @@ static int mlx4_get_slave_indx(struct mlx4_dev *dev, int vf) int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave) { - if (slave < 1 || slave > dev->num_vfs) { + if (slave < 1 || slave > dev->persist->num_vfs) { mlx4_err(dev, "Bad slave number:%d (number of activated slaves: %lu)\n", slave, dev->num_slaves); @@ -2388,7 +2392,7 @@ struct mlx4_slaves_pport mlx4_phys_to_slaves_pport(struct mlx4_dev *dev, if (port <= 0 || port > dev->caps.num_ports) return slaves_pport; - for (i = 0; i < dev->num_vfs + 1; i++) { + for (i = 0; i < dev->persist->num_vfs + 1; i++) { struct 
mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, i); if (test_bit(port - 1, actv_ports.ports)) @@ -2408,7 +2412,7 @@ struct mlx4_slaves_pport mlx4_phys_to_slaves_pport_actv( bitmap_zero(slaves_pport.slaves, MLX4_MFUNC_MAX); - for (i = 0; i < dev->num_vfs + 1; i++) { + for (i = 0; i < dev->persist->num_vfs + 1; i++) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, i); if (bitmap_equal(crit_ports->ports, actv_ports.ports, diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index 82322b1c8411..22da4d0d0f05 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -70,10 +70,10 @@ int mlx4_en_create_cq(struct mlx4_en_priv *priv, /* Allocate HW buffers on provided NUMA node. * dev->numa_node is used in mtt range allocation flow. */ - set_dev_node(&mdev->dev->pdev->dev, node); + set_dev_node(&mdev->dev->persist->pdev->dev, node); err = mlx4_alloc_hwq_res(mdev->dev, &cq->wqres, cq->buf_size, 2 * PAGE_SIZE); - set_dev_node(&mdev->dev->pdev->dev, mdev->dev->numa_node); + set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node); if (err) goto err_cq; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index 90e0f045a6bc..569eda9e83d6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -92,7 +92,7 @@ mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) (u16) (mdev->dev->caps.fw_ver >> 32), (u16) ((mdev->dev->caps.fw_ver >> 16) & 0xffff), (u16) (mdev->dev->caps.fw_ver & 0xffff)); - strlcpy(drvinfo->bus_info, pci_name(mdev->dev->pdev), + strlcpy(drvinfo->bus_info, pci_name(mdev->dev->persist->pdev), sizeof(drvinfo->bus_info)); drvinfo->n_stats = 0; drvinfo->regdump_len = 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c index 9f16f754137b..c643d2bbb7b9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c @@ -241,8 +241,8 @@ static void *mlx4_en_add(struct mlx4_dev *dev) spin_lock_init(&mdev->uar_lock); mdev->dev = dev; - mdev->dma_device = &(dev->pdev->dev); - mdev->pdev = dev->pdev; + mdev->dma_device = &dev->persist->pdev->dev; + mdev->pdev = dev->persist->pdev; mdev->device_up = false; mdev->LSO_support = !!(dev->caps.flags & (1 << 15)); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index d0d6dc1b8e46..43a3f9822f74 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2457,7 +2457,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, netif_set_real_num_tx_queues(dev, prof->tx_ring_num); netif_set_real_num_rx_queues(dev, prof->rx_ring_num); - SET_NETDEV_DEV(dev, &mdev->dev->pdev->dev); + SET_NETDEV_DEV(dev, &mdev->dev->persist->pdev->dev); dev->dev_port = port - 1; /* diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index a0474eb94aa3..2ba5d368edce 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -387,10 +387,10 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv, ring->rx_info, tmp); /* Allocate HW buffers on provided NUMA node */ - set_dev_node(&mdev->dev->pdev->dev, node); + set_dev_node(&mdev->dev->persist->pdev->dev, node); err = 
mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size, 2 * PAGE_SIZE); - set_dev_node(&mdev->dev->pdev->dev, mdev->dev->numa_node); + set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node); if (err) goto err_info; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 359bb1286eb5..55f9f5c5344e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -91,10 +91,10 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE); /* Allocate HW buffers on provided NUMA node */ - set_dev_node(&mdev->dev->pdev->dev, node); + set_dev_node(&mdev->dev->persist->pdev->dev, node); err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres, ring->buf_size, 2 * PAGE_SIZE); - set_dev_node(&mdev->dev->pdev->dev, mdev->dev->numa_node); + set_dev_node(&mdev->dev->persist->pdev->dev, mdev->dev->numa_node); if (err) { en_err(priv, "Failed allocating hwq resources\n"); goto err_bounce; diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index 3d275fbaf0eb..7538c9ce98a9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -237,7 +237,7 @@ int mlx4_gen_guid_change_eqe(struct mlx4_dev *dev, int slave, u8 port) struct mlx4_eqe eqe; /*don't send if we don't have the that slave */ - if (dev->num_vfs < slave) + if (dev->persist->num_vfs < slave) return 0; memset(&eqe, 0, sizeof eqe); @@ -255,7 +255,7 @@ int mlx4_gen_port_state_change_eqe(struct mlx4_dev *dev, int slave, u8 port, struct mlx4_eqe eqe; /*don't send if we don't have the that slave */ - if (dev->num_vfs < slave) + if (dev->persist->num_vfs < slave) return 0; memset(&eqe, 0, sizeof eqe); @@ -310,7 +310,7 @@ static void set_all_slave_state(struct mlx4_dev *dev, u8 port, int event) struct mlx4_slaves_pport slaves_pport = mlx4_phys_to_slaves_pport(dev, port); - for (i = 0; i < dev->num_vfs + 1; i++) + for (i = 0; i < dev->persist->num_vfs + 1; i++) if (test_bit(i, slaves_pport.slaves)) set_and_calc_slave_port_state(dev, i, port, event, &gen_event); @@ -560,7 +560,8 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) mlx4_priv(dev)->sense.do_sense_port[port] = 1; if (!mlx4_is_master(dev)) break; - for (i = 0; i < dev->num_vfs + 1; i++) { + for (i = 0; i < dev->persist->num_vfs + 1; + i++) { if (!test_bit(i, slaves_port.slaves)) continue; if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) { @@ -596,7 +597,9 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) if (!mlx4_is_master(dev)) break; if (dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) - for (i = 0; i < dev->num_vfs + 1; i++) { + for (i = 0; + i < dev->persist->num_vfs + 1; + i++) { if (!test_bit(i, slaves_port.slaves)) continue; if (i == mlx4_master_func_num(dev)) @@ -865,7 +868,7 @@ static void __iomem *mlx4_get_eq_uar(struct mlx4_dev *dev, struct mlx4_eq *eq) if (!priv->eq_table.uar_map[index]) { priv->eq_table.uar_map[index] = - ioremap(pci_resource_start(dev->pdev, 2) + + ioremap(pci_resource_start(dev->persist->pdev, 2) + ((eq->eqn / 4) << PAGE_SHIFT), PAGE_SIZE); if (!priv->eq_table.uar_map[index]) { @@ -928,8 +931,10 @@ static int mlx4_create_eq(struct mlx4_dev *dev, int nent, eq_context = mailbox->buf; for (i = 0; i < npages; ++i) { - eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev, - PAGE_SIZE, &t, GFP_KERNEL); + eq->page_list[i].buf = dma_alloc_coherent(&dev->persist-> + pdev->dev, + PAGE_SIZE, 
&t, + GFP_KERNEL); if (!eq->page_list[i].buf) goto err_out_free_pages; @@ -995,7 +1000,7 @@ err_out_free_eq: err_out_free_pages: for (i = 0; i < npages; ++i) if (eq->page_list[i].buf) - dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, eq->page_list[i].buf, eq->page_list[i].map); @@ -1044,9 +1049,9 @@ static void mlx4_free_eq(struct mlx4_dev *dev, mlx4_mtt_cleanup(dev, &eq->mtt); for (i = 0; i < npages; ++i) - dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, - eq->page_list[i].buf, - eq->page_list[i].map); + dma_free_coherent(&dev->persist->pdev->dev, PAGE_SIZE, + eq->page_list[i].buf, + eq->page_list[i].map); kfree(eq->page_list); mlx4_bitmap_free(&priv->eq_table.bitmap, eq->eqn, MLX4_USE_RR); @@ -1060,7 +1065,7 @@ static void mlx4_free_irqs(struct mlx4_dev *dev) int i, vec; if (eq_table->have_irq) - free_irq(dev->pdev->irq, dev); + free_irq(dev->persist->pdev->irq, dev); for (i = 0; i < dev->caps.num_comp_vectors + 1; ++i) if (eq_table->eq[i].have_irq) { @@ -1089,7 +1094,8 @@ static int mlx4_map_clr_int(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - priv->clr_base = ioremap(pci_resource_start(dev->pdev, priv->fw.clr_int_bar) + + priv->clr_base = ioremap(pci_resource_start(dev->persist->pdev, + priv->fw.clr_int_bar) + priv->fw.clr_int_base, MLX4_CLR_INT_SIZE); if (!priv->clr_base) { mlx4_err(dev, "Couldn't map interrupt clear register, aborting\n"); @@ -1212,13 +1218,13 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) i * MLX4_IRQNAME_SIZE, MLX4_IRQNAME_SIZE, "mlx4-comp-%d@pci:%s", i, - pci_name(dev->pdev)); + pci_name(dev->persist->pdev)); } else { snprintf(priv->eq_table.irq_names + i * MLX4_IRQNAME_SIZE, MLX4_IRQNAME_SIZE, "mlx4-async@pci:%s", - pci_name(dev->pdev)); + pci_name(dev->persist->pdev)); } eq_name = priv->eq_table.irq_names + @@ -1235,8 +1241,8 @@ int mlx4_init_eq_table(struct mlx4_dev *dev) snprintf(priv->eq_table.irq_names, MLX4_IRQNAME_SIZE, DRV_NAME "@pci:%s", - pci_name(dev->pdev)); - err = request_irq(dev->pdev->irq, mlx4_interrupt, + pci_name(dev->persist->pdev)); + err = request_irq(dev->persist->pdev->irq, mlx4_interrupt, IRQF_SHARED, priv->eq_table.irq_names, dev); if (err) goto err_out_async; diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c index 97c9b1db1d27..2a9dd460a95f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/icm.c +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c @@ -56,7 +56,7 @@ static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chu int i; if (chunk->nsg > 0) - pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages, + pci_unmap_sg(dev->persist->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); for (i = 0; i < chunk->npages; ++i) @@ -69,7 +69,8 @@ static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk * int i; for (i = 0; i < chunk->npages; ++i) - dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length, + dma_free_coherent(&dev->persist->pdev->dev, + chunk->mem[i].length, lowmem_page_address(sg_page(&chunk->mem[i])), sg_dma_address(&chunk->mem[i])); } @@ -173,7 +174,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, --cur_order; if (coherent) - ret = mlx4_alloc_icm_coherent(&dev->pdev->dev, + ret = mlx4_alloc_icm_coherent(&dev->persist->pdev->dev, &chunk->mem[chunk->npages], cur_order, gfp_mask); else @@ -193,7 +194,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, if (coherent) ++chunk->nsg; else if (chunk->npages == MLX4_ICM_CHUNK_LEN) { - 
chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); @@ -208,7 +209,7 @@ struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, } if (!coherent && chunk) { - chunk->nsg = pci_map_sg(dev->pdev, chunk->mem, + chunk->nsg = pci_map_sg(dev->persist->pdev, chunk->mem, chunk->npages, PCI_DMA_BIDIRECTIONAL); diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 03e9eb0dc761..abcee61f8a47 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -318,10 +318,11 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) return -ENODEV; } - if (dev_cap->uar_size > pci_resource_len(dev->pdev, 2)) { + if (dev_cap->uar_size > pci_resource_len(dev->persist->pdev, 2)) { mlx4_err(dev, "HCA reported UAR size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n", dev_cap->uar_size, - (unsigned long long) pci_resource_len(dev->pdev, 2)); + (unsigned long long) + pci_resource_len(dev->persist->pdev, 2)); return -ENODEV; } @@ -541,8 +542,10 @@ static int mlx4_get_pcie_dev_link_caps(struct mlx4_dev *dev, *speed = PCI_SPEED_UNKNOWN; *width = PCIE_LNK_WIDTH_UNKNOWN; - err1 = pcie_capability_read_dword(dev->pdev, PCI_EXP_LNKCAP, &lnkcap1); - err2 = pcie_capability_read_dword(dev->pdev, PCI_EXP_LNKCAP2, &lnkcap2); + err1 = pcie_capability_read_dword(dev->persist->pdev, PCI_EXP_LNKCAP, + &lnkcap1); + err2 = pcie_capability_read_dword(dev->persist->pdev, PCI_EXP_LNKCAP2, + &lnkcap2); if (!err2 && lnkcap2) { /* PCIe r3.0-compliant */ if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB) *speed = PCIE_SPEED_8_0GT; @@ -587,7 +590,7 @@ static void mlx4_check_pcie_caps(struct mlx4_dev *dev) return; } - err = pcie_get_minimum_link(dev->pdev, &speed, &width); + err = pcie_get_minimum_link(dev->persist->pdev, &speed, &width); if (err || speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN) { mlx4_warn(dev, @@ -837,10 +840,12 @@ static int mlx4_slave_cap(struct mlx4_dev *dev) if (dev->caps.uar_page_size * (dev->caps.num_uars - dev->caps.reserved_uars) > - pci_resource_len(dev->pdev, 2)) { + pci_resource_len(dev->persist->pdev, + 2)) { mlx4_err(dev, "HCA reported UAR region size of 0x%x bigger than PCI resource 2 size of 0x%llx, aborting\n", dev->caps.uar_page_size * dev->caps.num_uars, - (unsigned long long) pci_resource_len(dev->pdev, 2)); + (unsigned long long) + pci_resource_len(dev->persist->pdev, 2)); goto err_mem; } @@ -1492,9 +1497,9 @@ static int map_bf_area(struct mlx4_dev *dev) if (!dev->caps.bf_reg_size) return -ENXIO; - bf_start = pci_resource_start(dev->pdev, 2) + + bf_start = pci_resource_start(dev->persist->pdev, 2) + (dev->caps.num_uars << PAGE_SHIFT); - bf_len = pci_resource_len(dev->pdev, 2) - + bf_len = pci_resource_len(dev->persist->pdev, 2) - (dev->caps.num_uars << PAGE_SHIFT); priv->bf_mapping = io_mapping_create_wc(bf_start, bf_len); if (!priv->bf_mapping) @@ -1536,7 +1541,8 @@ static int map_internal_clock(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); priv->clock_mapping = - ioremap(pci_resource_start(dev->pdev, priv->fw.clock_bar) + + ioremap(pci_resource_start(dev->persist->pdev, + priv->fw.clock_bar) + priv->fw.clock_offset, MLX4_CLOCK_SIZE); if (!priv->clock_mapping) @@ -1705,7 +1711,8 @@ static void choose_steering_mode(struct mlx4_dev *dev, if (mlx4_log_num_mgm_entry_size <= 0 && dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_FS_EN && (!mlx4_is_mfunc(dev) || - 
(dev_cap->fs_max_num_qp_per_entry >= (dev->num_vfs + 1))) && + (dev_cap->fs_max_num_qp_per_entry >= + (dev->persist->num_vfs + 1))) && choose_log_fs_mgm_entry_size(dev_cap->fs_max_num_qp_per_entry) >= MLX4_MIN_MGM_LOG_ENTRY_SIZE) { dev->oper_log_mgm_entry_size = @@ -2288,7 +2295,8 @@ static void mlx4_enable_msi_x(struct mlx4_dev *dev) for (i = 0; i < nreq; ++i) entries[i].entry = i; - nreq = pci_enable_msix_range(dev->pdev, entries, 2, nreq); + nreq = pci_enable_msix_range(dev->persist->pdev, entries, 2, + nreq); if (nreq < 0) { kfree(entries); @@ -2316,7 +2324,7 @@ no_msi: dev->caps.comp_pool = 0; for (i = 0; i < 2; ++i) - priv->eq_table.eq[i].irq = dev->pdev->irq; + priv->eq_table.eq[i].irq = dev->persist->pdev->irq; } static int mlx4_init_port_info(struct mlx4_dev *dev, int port) @@ -2344,7 +2352,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port) info->port_attr.show = show_port_type; sysfs_attr_init(&info->port_attr.attr); - err = device_create_file(&dev->pdev->dev, &info->port_attr); + err = device_create_file(&dev->persist->pdev->dev, &info->port_attr); if (err) { mlx4_err(dev, "Failed to create file for port %d\n", port); info->port = -1; @@ -2361,10 +2369,12 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port) info->port_mtu_attr.show = show_port_ib_mtu; sysfs_attr_init(&info->port_mtu_attr.attr); - err = device_create_file(&dev->pdev->dev, &info->port_mtu_attr); + err = device_create_file(&dev->persist->pdev->dev, + &info->port_mtu_attr); if (err) { mlx4_err(dev, "Failed to create mtu file for port %d\n", port); - device_remove_file(&info->dev->pdev->dev, &info->port_attr); + device_remove_file(&info->dev->persist->pdev->dev, + &info->port_attr); info->port = -1; } @@ -2376,8 +2386,9 @@ static void mlx4_cleanup_port_info(struct mlx4_port_info *info) if (info->port < 0) return; - device_remove_file(&info->dev->pdev->dev, &info->port_attr); - device_remove_file(&info->dev->pdev->dev, &info->port_mtu_attr); + device_remove_file(&info->dev->persist->pdev->dev, &info->port_attr); + device_remove_file(&info->dev->persist->pdev->dev, + &info->port_mtu_attr); } static int mlx4_init_steering(struct mlx4_dev *dev) @@ -2444,10 +2455,11 @@ static int mlx4_get_ownership(struct mlx4_dev *dev) void __iomem *owner; u32 ret; - if (pci_channel_offline(dev->pdev)) + if (pci_channel_offline(dev->persist->pdev)) return -EIO; - owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, + owner = ioremap(pci_resource_start(dev->persist->pdev, 0) + + MLX4_OWNER_BASE, MLX4_OWNER_SIZE); if (!owner) { mlx4_err(dev, "Failed to obtain ownership bit\n"); @@ -2463,10 +2475,11 @@ static void mlx4_free_ownership(struct mlx4_dev *dev) { void __iomem *owner; - if (pci_channel_offline(dev->pdev)) + if (pci_channel_offline(dev->persist->pdev)) return; - owner = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_OWNER_BASE, + owner = ioremap(pci_resource_start(dev->persist->pdev, 0) + + MLX4_OWNER_BASE, MLX4_OWNER_SIZE); if (!owner) { mlx4_err(dev, "Failed to obtain ownership bit\n"); @@ -2514,13 +2527,13 @@ static u64 mlx4_enable_sriov(struct mlx4_dev *dev, struct pci_dev *pdev, dev_flags |= MLX4_FLAG_SRIOV | MLX4_FLAG_MASTER; dev_flags &= ~MLX4_FLAG_SLAVE; - dev->num_vfs = total_vfs; + dev->persist->num_vfs = total_vfs; } return dev_flags; disable_sriov: atomic_dec(&pf_loading); - dev->num_vfs = 0; + dev->persist->num_vfs = 0; kfree(dev->dev_vfs); return dev_flags & ~MLX4_FLAG_MASTER; } @@ -2607,7 +2620,7 @@ static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, 
existing_vfs = pci_num_vf(pdev); if (existing_vfs) dev->flags |= MLX4_FLAG_SRIOV; - dev->num_vfs = total_vfs; + dev->persist->num_vfs = total_vfs; } } @@ -2771,12 +2784,14 @@ slave_start: dev->caps.num_ports); goto err_close; } - memcpy(dev->nvfs, nvfs, sizeof(dev->nvfs)); + memcpy(dev->persist->nvfs, nvfs, sizeof(dev->persist->nvfs)); - for (i = 0; i < sizeof(dev->nvfs)/sizeof(dev->nvfs[0]); i++) { + for (i = 0; + i < sizeof(dev->persist->nvfs)/ + sizeof(dev->persist->nvfs[0]); i++) { unsigned j; - for (j = 0; j < dev->nvfs[i]; ++sum, ++j) { + for (j = 0; j < dev->persist->nvfs[i]; ++sum, ++j) { dev->dev_vfs[sum].min_port = i < 2 ? i + 1 : 1; dev->dev_vfs[sum].n_ports = i < 2 ? 1 : dev->caps.num_ports; @@ -2846,7 +2861,7 @@ slave_start: priv->removed = 0; - if (mlx4_is_master(dev) && dev->num_vfs) + if (mlx4_is_master(dev) && dev->persist->num_vfs) atomic_dec(&pf_loading); kfree(dev_cap); @@ -2908,7 +2923,7 @@ err_sriov: if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) pci_disable_sriov(pdev); - if (mlx4_is_master(dev) && dev->num_vfs) + if (mlx4_is_master(dev) && dev->persist->num_vfs) atomic_dec(&pf_loading); kfree(priv->dev.dev_vfs); @@ -3076,20 +3091,28 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) return -ENOMEM; dev = &priv->dev; - dev->pdev = pdev; - pci_set_drvdata(pdev, dev); + dev->persist = kzalloc(sizeof(*dev->persist), GFP_KERNEL); + if (!dev->persist) { + kfree(priv); + return -ENOMEM; + } + dev->persist->pdev = pdev; + dev->persist->dev = dev; + pci_set_drvdata(pdev, dev->persist); priv->pci_dev_data = id->driver_data; ret = __mlx4_init_one(pdev, id->driver_data, priv); - if (ret) + if (ret) { + kfree(dev->persist); kfree(priv); - + } return ret; } static void mlx4_unload_one(struct pci_dev *pdev) { - struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; struct mlx4_priv *priv = mlx4_priv(dev); int pci_dev_data; int p; @@ -3155,7 +3178,7 @@ static void mlx4_unload_one(struct pci_dev *pdev) mlx4_warn(dev, "Disabling SR-IOV\n"); pci_disable_sriov(pdev); dev->flags &= ~MLX4_FLAG_SRIOV; - dev->num_vfs = 0; + dev->persist->num_vfs = 0; } if (!mlx4_is_slave(dev)) @@ -3175,26 +3198,29 @@ static void mlx4_unload_one(struct pci_dev *pdev) static void mlx4_remove_one(struct pci_dev *pdev) { - struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; struct mlx4_priv *priv = mlx4_priv(dev); mlx4_unload_one(pdev); pci_release_regions(pdev); pci_disable_device(pdev); + kfree(dev->persist); kfree(priv); pci_set_drvdata(pdev, NULL); } int mlx4_restart_one(struct pci_dev *pdev) { - struct mlx4_dev *dev = pci_get_drvdata(pdev); + struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); + struct mlx4_dev *dev = persist->dev; struct mlx4_priv *priv = mlx4_priv(dev); int nvfs[MLX4_MAX_PORTS + 1] = {0, 0, 0}; int pci_dev_data, err, total_vfs; pci_dev_data = priv->pci_dev_data; - total_vfs = dev->num_vfs; - memcpy(nvfs, dev->nvfs, sizeof(dev->nvfs)); + total_vfs = dev->persist->num_vfs; + memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs)); mlx4_unload_one(pdev); err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index bdd4eea2247c..faa37ab75a9d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ 
-221,16 +221,17 @@ extern int mlx4_debug_level; #define mlx4_dbg(mdev, format, ...) \ do { \ if (mlx4_debug_level) \ - dev_printk(KERN_DEBUG, &(mdev)->pdev->dev, format, \ + dev_printk(KERN_DEBUG, \ + &(mdev)->persist->pdev->dev, format, \ ##__VA_ARGS__); \ } while (0) #define mlx4_err(mdev, format, ...) \ - dev_err(&(mdev)->pdev->dev, format, ##__VA_ARGS__) + dev_err(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__) #define mlx4_info(mdev, format, ...) \ - dev_info(&(mdev)->pdev->dev, format, ##__VA_ARGS__) + dev_info(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__) #define mlx4_warn(mdev, format, ...) \ - dev_warn(&(mdev)->pdev->dev, format, ##__VA_ARGS__) + dev_warn(&(mdev)->persist->pdev->dev, format, ##__VA_ARGS__) extern int mlx4_log_num_mgm_entry_size; extern int log_mtts_per_seg; diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index 7094a9c70fd5..8dbdf1d29357 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -708,13 +708,13 @@ static int mlx4_write_mtt_chunk(struct mlx4_dev *dev, struct mlx4_mtt *mtt, if (!mtts) return -ENOMEM; - dma_sync_single_for_cpu(&dev->pdev->dev, dma_handle, + dma_sync_single_for_cpu(&dev->persist->pdev->dev, dma_handle, npages * sizeof (u64), DMA_TO_DEVICE); for (i = 0; i < npages; ++i) mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); - dma_sync_single_for_device(&dev->pdev->dev, dma_handle, + dma_sync_single_for_device(&dev->persist->pdev->dev, dma_handle, npages * sizeof (u64), DMA_TO_DEVICE); return 0; @@ -1020,13 +1020,13 @@ int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list /* Make sure MPT status is visible before writing MTT entries */ wmb(); - dma_sync_single_for_cpu(&dev->pdev->dev, fmr->dma_handle, + dma_sync_single_for_cpu(&dev->persist->pdev->dev, fmr->dma_handle, npages * sizeof(u64), DMA_TO_DEVICE); for (i = 0; i < npages; ++i) fmr->mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); - dma_sync_single_for_device(&dev->pdev->dev, fmr->dma_handle, + dma_sync_single_for_device(&dev->persist->pdev->dev, fmr->dma_handle, npages * sizeof(u64), DMA_TO_DEVICE); fmr->mpt->key = cpu_to_be32(key); diff --git a/drivers/net/ethernet/mellanox/mlx4/pd.c b/drivers/net/ethernet/mellanox/mlx4/pd.c index 74216071201f..a42b4c0a9ed9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/pd.c +++ b/drivers/net/ethernet/mellanox/mlx4/pd.c @@ -151,11 +151,13 @@ int mlx4_uar_alloc(struct mlx4_dev *dev, struct mlx4_uar *uar) return -ENOMEM; if (mlx4_is_slave(dev)) - offset = uar->index % ((int) pci_resource_len(dev->pdev, 2) / + offset = uar->index % ((int)pci_resource_len(dev->persist->pdev, + 2) / dev->caps.uar_page_size); else offset = uar->index; - uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + offset; + uar->pfn = (pci_resource_start(dev->persist->pdev, 2) >> PAGE_SHIFT) + + offset; uar->map = NULL; return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c index 30eb1ead0fe6..9f268f05290a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/port.c +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -553,9 +553,9 @@ int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port) slaves_pport_actv = mlx4_phys_to_slaves_pport_actv( dev, &exclusive_ports); slave_gid -= bitmap_weight(slaves_pport_actv.slaves, - dev->num_vfs + 1); + dev->persist->num_vfs + 1); } - vfs = bitmap_weight(slaves_pport.slaves, dev->num_vfs + 1) - 1; + vfs = 
bitmap_weight(slaves_pport.slaves, dev->persist->num_vfs + 1) - 1; if (slave_gid <= ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) % vfs)) return ((MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / vfs) + 1; return (MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS) / vfs; @@ -590,10 +590,10 @@ int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port) slaves_pport_actv = mlx4_phys_to_slaves_pport_actv( dev, &exclusive_ports); slave_gid -= bitmap_weight(slaves_pport_actv.slaves, - dev->num_vfs + 1); + dev->persist->num_vfs + 1); } gids = MLX4_ROCE_MAX_GIDS - MLX4_ROCE_PF_GIDS; - vfs = bitmap_weight(slaves_pport.slaves, dev->num_vfs + 1) - 1; + vfs = bitmap_weight(slaves_pport.slaves, dev->persist->num_vfs + 1) - 1; if (slave_gid <= gids % vfs) return MLX4_ROCE_PF_GIDS + ((gids / vfs) + 1) * (slave_gid - 1); @@ -644,7 +644,7 @@ void mlx4_reset_roce_gids(struct mlx4_dev *dev, int slave) int num_eth_ports, err; int i; - if (slave < 0 || slave > dev->num_vfs) + if (slave < 0 || slave > dev->persist->num_vfs) return; actv_ports = mlx4_get_active_ports(dev, slave); @@ -1214,7 +1214,8 @@ int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, return -EINVAL; slaves_pport = mlx4_phys_to_slaves_pport(dev, port); - num_vfs = bitmap_weight(slaves_pport.slaves, dev->num_vfs + 1) - 1; + num_vfs = bitmap_weight(slaves_pport.slaves, + dev->persist->num_vfs + 1) - 1; for (i = 0; i < MLX4_ROCE_MAX_GIDS; i++) { if (!memcmp(priv->port[port].gid_table.roce_gids[i].raw, gid, @@ -1258,7 +1259,7 @@ int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, dev, &exclusive_ports); num_vfs_before += bitmap_weight( slaves_pport_actv.slaves, - dev->num_vfs + 1); + dev->persist->num_vfs + 1); } /* candidate_slave_gid isn't necessarily the correct slave, but @@ -1288,7 +1289,7 @@ int mlx4_get_slave_from_roce_gid(struct mlx4_dev *dev, int port, u8 *gid, dev, &exclusive_ports); slave_gid += bitmap_weight( slaves_pport_actv.slaves, - dev->num_vfs + 1); + dev->persist->num_vfs + 1); } } *slave_id = slave_gid; diff --git a/drivers/net/ethernet/mellanox/mlx4/reset.c b/drivers/net/ethernet/mellanox/mlx4/reset.c index ea1c6d092145..0076d88587ca 100644 --- a/drivers/net/ethernet/mellanox/mlx4/reset.c +++ b/drivers/net/ethernet/mellanox/mlx4/reset.c @@ -76,19 +76,21 @@ int mlx4_reset(struct mlx4_dev *dev) goto out; } - pcie_cap = pci_pcie_cap(dev->pdev); + pcie_cap = pci_pcie_cap(dev->persist->pdev); for (i = 0; i < 64; ++i) { if (i == 22 || i == 23) continue; - if (pci_read_config_dword(dev->pdev, i * 4, hca_header + i)) { + if (pci_read_config_dword(dev->persist->pdev, i * 4, + hca_header + i)) { err = -ENODEV; mlx4_err(dev, "Couldn't save HCA PCI header, aborting\n"); goto out; } } - reset = ioremap(pci_resource_start(dev->pdev, 0) + MLX4_RESET_BASE, + reset = ioremap(pci_resource_start(dev->persist->pdev, 0) + + MLX4_RESET_BASE, MLX4_RESET_SIZE); if (!reset) { err = -ENOMEM; @@ -122,8 +124,8 @@ int mlx4_reset(struct mlx4_dev *dev) end = jiffies + MLX4_RESET_TIMEOUT_JIFFIES; do { - if (!pci_read_config_word(dev->pdev, PCI_VENDOR_ID, &vendor) && - vendor != 0xffff) + if (!pci_read_config_word(dev->persist->pdev, PCI_VENDOR_ID, + &vendor) && vendor != 0xffff) break; msleep(1); @@ -138,14 +140,16 @@ int mlx4_reset(struct mlx4_dev *dev) /* Now restore the PCI headers */ if (pcie_cap) { devctl = hca_header[(pcie_cap + PCI_EXP_DEVCTL) / 4]; - if (pcie_capability_write_word(dev->pdev, PCI_EXP_DEVCTL, + if (pcie_capability_write_word(dev->persist->pdev, + PCI_EXP_DEVCTL, devctl)) { err = -ENODEV; mlx4_err(dev, 
"Couldn't restore HCA PCI Express Device Control register, aborting\n"); goto out; } linkctl = hca_header[(pcie_cap + PCI_EXP_LNKCTL) / 4]; - if (pcie_capability_write_word(dev->pdev, PCI_EXP_LNKCTL, + if (pcie_capability_write_word(dev->persist->pdev, + PCI_EXP_LNKCTL, linkctl)) { err = -ENODEV; mlx4_err(dev, "Couldn't restore HCA PCI Express Link control register, aborting\n"); @@ -157,7 +161,8 @@ int mlx4_reset(struct mlx4_dev *dev) if (i * 4 == PCI_COMMAND) continue; - if (pci_write_config_dword(dev->pdev, i * 4, hca_header[i])) { + if (pci_write_config_dword(dev->persist->pdev, i * 4, + hca_header[i])) { err = -ENODEV; mlx4_err(dev, "Couldn't restore HCA reg %x, aborting\n", i); @@ -165,7 +170,7 @@ int mlx4_reset(struct mlx4_dev *dev) } } - if (pci_write_config_dword(dev->pdev, PCI_COMMAND, + if (pci_write_config_dword(dev->persist->pdev, PCI_COMMAND, hca_header[PCI_COMMAND / 4])) { err = -ENODEV; mlx4_err(dev, "Couldn't restore HCA COMMAND, aborting\n"); diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 4efbd1eca611..3e93879bccce 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -309,12 +309,13 @@ static inline int mlx4_grant_resource(struct mlx4_dev *dev, int slave, int allocated, free, reserved, guaranteed, from_free; int from_rsvd; - if (slave > dev->num_vfs) + if (slave > dev->persist->num_vfs) return -EINVAL; spin_lock(&res_alloc->alloc_lock); allocated = (port > 0) ? - res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] : + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] : res_alloc->allocated[slave]; free = (port > 0) ? res_alloc->res_port_free[port - 1] : res_alloc->res_free; @@ -352,7 +353,8 @@ static inline int mlx4_grant_resource(struct mlx4_dev *dev, int slave, if (!err) { /* grant the request */ if (port > 0) { - res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] += count; + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] += count; res_alloc->res_port_free[port - 1] -= count; res_alloc->res_port_rsvd[port - 1] -= from_rsvd; } else { @@ -376,13 +378,14 @@ static inline void mlx4_release_resource(struct mlx4_dev *dev, int slave, &priv->mfunc.master.res_tracker.res_alloc[res_type]; int allocated, guaranteed, from_rsvd; - if (slave > dev->num_vfs) + if (slave > dev->persist->num_vfs) return; spin_lock(&res_alloc->alloc_lock); allocated = (port > 0) ? 
- res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] : + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] : res_alloc->allocated[slave]; guaranteed = res_alloc->guaranteed[slave]; @@ -397,7 +400,8 @@ static inline void mlx4_release_resource(struct mlx4_dev *dev, int slave, } if (port > 0) { - res_alloc->allocated[(port - 1) * (dev->num_vfs + 1) + slave] -= count; + res_alloc->allocated[(port - 1) * + (dev->persist->num_vfs + 1) + slave] -= count; res_alloc->res_port_free[port - 1] += count; res_alloc->res_port_rsvd[port - 1] += from_rsvd; } else { @@ -415,7 +419,8 @@ static inline void initialize_res_quotas(struct mlx4_dev *dev, enum mlx4_resource res_type, int vf, int num_instances) { - res_alloc->guaranteed[vf] = num_instances / (2 * (dev->num_vfs + 1)); + res_alloc->guaranteed[vf] = num_instances / + (2 * (dev->persist->num_vfs + 1)); res_alloc->quota[vf] = (num_instances / 2) + res_alloc->guaranteed[vf]; if (vf == mlx4_master_func_num(dev)) { res_alloc->res_free = num_instances; @@ -486,21 +491,26 @@ int mlx4_init_resource_tracker(struct mlx4_dev *dev) for (i = 0; i < MLX4_NUM_OF_RESOURCE_TYPE; i++) { struct resource_allocator *res_alloc = &priv->mfunc.master.res_tracker.res_alloc[i]; - res_alloc->quota = kmalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); - res_alloc->guaranteed = kmalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); + res_alloc->quota = kmalloc((dev->persist->num_vfs + 1) * + sizeof(int), GFP_KERNEL); + res_alloc->guaranteed = kmalloc((dev->persist->num_vfs + 1) * + sizeof(int), GFP_KERNEL); if (i == RES_MAC || i == RES_VLAN) res_alloc->allocated = kzalloc(MLX4_MAX_PORTS * - (dev->num_vfs + 1) * sizeof(int), - GFP_KERNEL); + (dev->persist->num_vfs + + 1) * + sizeof(int), GFP_KERNEL); else - res_alloc->allocated = kzalloc((dev->num_vfs + 1) * sizeof(int), GFP_KERNEL); + res_alloc->allocated = kzalloc((dev->persist-> + num_vfs + 1) * + sizeof(int), GFP_KERNEL); if (!res_alloc->quota || !res_alloc->guaranteed || !res_alloc->allocated) goto no_mem_err; spin_lock_init(&res_alloc->alloc_lock); - for (t = 0; t < dev->num_vfs + 1; t++) { + for (t = 0; t < dev->persist->num_vfs + 1; t++) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, t); switch (i) { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index f1e41b33462f..1069ce65e8b4 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -744,8 +744,15 @@ struct mlx4_vf_dev { u8 n_ports; }; -struct mlx4_dev { +struct mlx4_dev_persistent { struct pci_dev *pdev; + struct mlx4_dev *dev; + int nvfs[MLX4_MAX_PORTS + 1]; + int num_vfs; +}; + +struct mlx4_dev { + struct mlx4_dev_persistent *persist; unsigned long flags; unsigned long num_slaves; struct mlx4_caps caps; @@ -754,13 +761,11 @@ struct mlx4_dev { struct radix_tree_root qp_table_tree; u8 rev_id; char board_id[MLX4_BOARD_ID_LEN]; - int num_vfs; int numa_node; int oper_log_mgm_entry_size; u64 regid_promisc_array[MLX4_MAX_PORTS + 1]; u64 regid_allmulti_array[MLX4_MAX_PORTS + 1]; struct mlx4_vf_dev *dev_vfs; - int nvfs[MLX4_MAX_PORTS + 1]; }; struct mlx4_eqe { -- cgit v1.2.3 From dd0eefe3abbf47442db296bf68f27eb2860c1cdf Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:36 +0200 Subject: net/mlx4_core: Set device configuration data to be persistent across reset When an HCA enters an internal error state, this is detected by the driver. The driver then should reset the HCA and restart the software stack. 
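The mechanism relies on the split introduced above: a small mlx4_dev_persistent area that survives the reset, and the rest of the device state, which is wiped and rebuilt. As a rough standalone sketch of that pattern (plain C; the dev_persistent/dev_state names are invented here for illustration and are not the driver's own):

    #include <string.h>

    struct dev_persistent {             /* survives a chip reset */
            void *pdev;                 /* bus handle, allocated once */
            int num_vfs;                /* SRIOV config worth keeping */
    };

    struct dev_state {                  /* wiped and rebuilt on reset */
            struct dev_persistent *persist;
            int flags;
    };

    /* Analogous to the mlx4_clean_dev() added below: zero the volatile
     * state but re-attach the persistent area so a reload can still
     * find the device. */
    static void clean_dev(struct dev_state *dev)
    {
            struct dev_persistent *persist = dev->persist;

            memset(dev, 0, sizeof(*dev));
            dev->persist = persist;
    }

The same idea drives the port-type handling in this patch: mlx4_unload_one() copies the current port types into the persistent area, and restore_current_port_types() writes them back after mlx4_restart_one() reloads the stack.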
Keep port information and some SRIOV configuration in a persistent area so that it remains valid across reset. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/main.c | 45 +++++++++++++++++++++++++++++-- include/linux/mlx4/device.h | 2 ++ 2 files changed, 45 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index abcee61f8a47..2c5a555dff89 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3109,18 +3109,34 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) return ret; } +static void mlx4_clean_dev(struct mlx4_dev *dev) +{ + struct mlx4_dev_persistent *persist = dev->persist; + struct mlx4_priv *priv = mlx4_priv(dev); + + memset(priv, 0, sizeof(*priv)); + priv->dev.persist = persist; +} + static void mlx4_unload_one(struct pci_dev *pdev) { struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); struct mlx4_dev *dev = persist->dev; struct mlx4_priv *priv = mlx4_priv(dev); int pci_dev_data; - int p; + int p, i; int active_vfs = 0; if (priv->removed) return; + /* saving current ports type for further use */ + for (i = 0; i < dev->caps.num_ports; i++) { + dev->persist->curr_port_type[i] = dev->caps.port_type[i + 1]; + dev->persist->curr_port_poss_type[i] = dev->caps. + possible_type[i + 1]; + } + pci_dev_data = priv->pci_dev_data; /* Disabling SR-IOV is not allowed while there are active vf's */ @@ -3191,7 +3207,7 @@ static void mlx4_unload_one(struct pci_dev *pdev) kfree(dev->caps.qp1_proxy); kfree(dev->dev_vfs); - memset(priv, 0, sizeof(*priv)); + mlx4_clean_dev(dev); priv->pci_dev_data = pci_dev_data; priv->removed = 1; } @@ -3210,6 +3226,25 @@ static void mlx4_remove_one(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); } +static int restore_current_port_types(struct mlx4_dev *dev, + enum mlx4_port_type *types, + enum mlx4_port_type *poss_types) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int err, i; + + mlx4_stop_sense(dev); + + mutex_lock(&priv->port_mutex); + for (i = 0; i < dev->caps.num_ports; i++) + dev->caps.possible_type[i + 1] = poss_types[i]; + err = mlx4_change_port_types(dev, types); + mlx4_start_sense(dev); + mutex_unlock(&priv->port_mutex); + + return err; +} + int mlx4_restart_one(struct pci_dev *pdev) { struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); @@ -3230,6 +3265,12 @@ int mlx4_restart_one(struct pci_dev *pdev) return err; } + err = restore_current_port_types(dev, dev->persist->curr_port_type, + dev->persist->curr_port_poss_type); + if (err) + mlx4_err(dev, "could not restore original port types (%d)\n", + err); + return err; } diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 1069ce65e8b4..8c3837ac1a2d 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -749,6 +749,8 @@ struct mlx4_dev_persistent { struct mlx4_dev *dev; int nvfs[MLX4_MAX_PORTS + 1]; int num_vfs; + enum mlx4_port_type curr_port_type[MLX4_MAX_PORTS + 1]; + enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1]; }; struct mlx4_dev { -- cgit v1.2.3 From ad9a0bf08ffbf32b8f292c3bb78ca0f24bb8f6b2 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:37 +0200 Subject: net/mlx4_core: Refactor the catas flow to work per device Using a WQ per device instead of a single global WQ allows independent reset handling per device even when SRIOV is used.
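Before the diff, a minimal sketch of the per-device workqueue idiom the patch adopts may help; the my_dev naming and helper functions are invented here for illustration, while the workqueue calls themselves are the same ones the patch uses:

    #include <linux/errno.h>
    #include <linux/kernel.h>
    #include <linux/workqueue.h>

    struct my_dev {
            struct work_struct err_work;
            struct workqueue_struct *err_wq;    /* owned by this device */
    };

    static void reset_device(struct my_dev *dev)
    {
            /* device-specific reset/restart would go here */
    }

    static void err_handler(struct work_struct *work)
    {
            struct my_dev *dev = container_of(work, struct my_dev, err_work);

            /* the work item identifies its owner, so no global list or
             * lock is needed and devices cannot block each other */
            reset_device(dev);
    }

    static int my_err_init(struct my_dev *dev)
    {
            INIT_WORK(&dev->err_work, err_handler);
            dev->err_wq = create_singlethread_workqueue("my_dev_health");
            return dev->err_wq ? 0 : -ENOMEM;
    }

    static void my_err_fini(struct my_dev *dev)
    {
            if (dev->err_wq) {
                    destroy_workqueue(dev->err_wq);
                    dev->err_wq = NULL;
            }
    }

Compare with the mlx4_catas_init()/mlx4_catas_end() pair added below, which applies exactly this pattern to dev->persist.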
This comes as a pre-patch for supporting chip reset for both native and SRIOV. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/catas.c | 75 ++++++++++++++---------------- drivers/net/ethernet/mellanox/mlx4/main.c | 12 ++++- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 3 +- include/linux/mlx4/device.h | 2 + 4 files changed, 48 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 1a102c9bac99..5bb9aa6e281d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -40,10 +40,7 @@ enum { MLX4_CATAS_POLL_INTERVAL = 5 * HZ, }; -static DEFINE_SPINLOCK(catas_lock); -static LIST_HEAD(catas_list); -static struct work_struct catas_work; static int internal_err_reset = 1; module_param(internal_err_reset, int, 0644); @@ -77,13 +74,9 @@ static void poll_catas(unsigned long dev_ptr) dump_err_buf(dev); mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); - if (internal_err_reset) { - spin_lock(&catas_lock); - list_add(&priv->catas_err.list, &catas_list); - spin_unlock(&catas_lock); - - queue_work(mlx4_wq, &catas_work); - } + if (internal_err_reset) + queue_work(dev->persist->catas_wq, + &dev->persist->catas_work); } } else mod_timer(&priv->catas_err.timer, @@ -92,34 +85,23 @@ static void poll_catas(unsigned long dev_ptr) static void catas_reset(struct work_struct *work) { - struct mlx4_priv *priv, *tmppriv; - struct mlx4_dev *dev; - struct mlx4_dev_persistent *persist; - - LIST_HEAD(tlist); + struct mlx4_dev_persistent *persist = + container_of(work, struct mlx4_dev_persistent, + catas_work); + struct pci_dev *pdev = persist->pdev; int ret; - spin_lock_irq(&catas_lock); - list_splice_init(&catas_list, &tlist); - spin_unlock_irq(&catas_lock); - - list_for_each_entry_safe(priv, tmppriv, &tlist, catas_err.list) { - struct pci_dev *pdev = priv->dev.persist->pdev; - - /* If the device is off-line, we cannot reset it */ - if (pci_channel_offline(pdev)) - continue; + /* If the device is off-line, we cannot reset it */ + if (pci_channel_offline(pdev)) + return; - ret = mlx4_restart_one(priv->dev.persist->pdev); - /* 'priv' now is not valid */ - if (ret) - pr_err("mlx4 %s: Reset failed (%d)\n", - pci_name(pdev), ret); - else { - persist = pci_get_drvdata(pdev); - mlx4_dbg(persist->dev, "Reset succeeded\n"); - } - } + ret = mlx4_restart_one(pdev); + /* 'priv' now is not valid */ + if (ret) + pr_err("mlx4 %s: Reset failed (%d)\n", + pci_name(pdev), ret); + else + mlx4_dbg(persist->dev, "Reset succeeded\n"); } void mlx4_start_catas_poll(struct mlx4_dev *dev) @@ -158,15 +140,26 @@ void mlx4_stop_catas_poll(struct mlx4_dev *dev) del_timer_sync(&priv->catas_err.timer); - if (priv->catas_err.map) + if (priv->catas_err.map) { iounmap(priv->catas_err.map); + priv->catas_err.map = NULL; + } +} - spin_lock_irq(&catas_lock); - list_del(&priv->catas_err.list); - spin_unlock_irq(&catas_lock); +int mlx4_catas_init(struct mlx4_dev *dev) +{ + INIT_WORK(&dev->persist->catas_work, catas_reset); + dev->persist->catas_wq = create_singlethread_workqueue("mlx4_health"); + if (!dev->persist->catas_wq) + return -ENOMEM; + + return 0; } -void __init mlx4_catas_init(void) +void mlx4_catas_end(struct mlx4_dev *dev) { - INIT_WORK(&catas_work, catas_reset); + if (dev->persist->catas_wq) { + destroy_workqueue(dev->persist->catas_wq); + dev->persist->catas_wq = NULL; + } } diff --git 
a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 2c5a555dff89..a61694cc1476 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3064,11 +3064,19 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data, } } - err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + err = mlx4_catas_init(&priv->dev); if (err) goto err_release_regions; + + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + if (err) + goto err_catas; + return 0; +err_catas: + mlx4_catas_end(&priv->dev); + err_release_regions: pci_release_regions(pdev); @@ -3219,6 +3227,7 @@ static void mlx4_remove_one(struct pci_dev *pdev) struct mlx4_priv *priv = mlx4_priv(dev); mlx4_unload_one(pdev); + mlx4_catas_end(dev); pci_release_regions(pdev); pci_disable_device(pdev); kfree(dev->persist); @@ -3403,7 +3412,6 @@ static int __init mlx4_init(void) if (mlx4_verify_params()) return -EINVAL; - mlx4_catas_init(); mlx4_wq = create_singlethread_workqueue("mlx4"); if (!mlx4_wq) diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index faa37ab75a9d..d41af84f965b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -995,7 +995,8 @@ void __mlx4_xrcd_free(struct mlx4_dev *dev, u32 xrcdn); void mlx4_start_catas_poll(struct mlx4_dev *dev); void mlx4_stop_catas_poll(struct mlx4_dev *dev); -void mlx4_catas_init(void); +int mlx4_catas_init(struct mlx4_dev *dev); +void mlx4_catas_end(struct mlx4_dev *dev); int mlx4_restart_one(struct pci_dev *pdev); int mlx4_register_device(struct mlx4_dev *dev); void mlx4_unregister_device(struct mlx4_dev *dev); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 8c3837ac1a2d..da425d2f3708 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -751,6 +751,8 @@ struct mlx4_dev_persistent { int num_vfs; enum mlx4_port_type curr_port_type[MLX4_MAX_PORTS + 1]; enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1]; + struct work_struct catas_work; + struct workqueue_struct *catas_wq; }; struct mlx4_dev { -- cgit v1.2.3 From f6bc11e42646e661e699a5593cbd1e9dba7191d0 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:38 +0200 Subject: net/mlx4_core: Enhance the catas flow to support device reset This includes: - resetting the chip when a fatal error is detected (the current code does not do this). - exposing the ability to enter error state from outside the catas code by calling its functionality. (E.g. FW Command timeout, AER error). - managing a persistent device state. This is needed to sync between reset flow cases. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/catas.c | 122 ++++++++++++++++++++++------- drivers/net/ethernet/mellanox/mlx4/main.c | 6 ++ drivers/net/ethernet/mellanox/mlx4/mlx4.h | 2 +- include/linux/mlx4/device.h | 7 ++ 4 files changed, 108 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 5bb9aa6e281d..588d6b5e1211 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -48,6 +48,83 @@ MODULE_PARM_DESC(internal_err_reset, "Reset device on internal errors if non-zero" " (default 1, in SRIOV mode default is 0)"); +static int read_vendor_id(struct mlx4_dev *dev) +{ + u16 vendor_id = 0; + int ret; + + ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id); + if (ret) { + mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret); + return ret; + } + + if (vendor_id == 0xffff) { + mlx4_err(dev, "PCI can't be accessed to read vendor id\n"); + return -EINVAL; + } + + return 0; +} + +static int mlx4_reset_master(struct mlx4_dev *dev) +{ + int err = 0; + + if (!pci_channel_offline(dev->persist->pdev)) { + err = read_vendor_id(dev); + /* If PCI can't be accessed to read vendor ID we assume that its + * link was disabled and chip was already reset. + */ + if (err) + return 0; + + err = mlx4_reset(dev); + if (err) + mlx4_err(dev, "Fail to reset HCA\n"); + } + + return err; +} + +void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) +{ + int err; + struct mlx4_dev *dev; + + if (!internal_err_reset) + return; + + mutex_lock(&persist->device_state_mutex); + if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + goto out; + + dev = persist->dev; + mlx4_err(dev, "device is going to be reset\n"); + err = mlx4_reset_master(dev); + BUG_ON(err != 0); + + dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR; + mlx4_err(dev, "device was reset successfully\n"); + mutex_unlock(&persist->device_state_mutex); + + /* At that step HW was already reset, now notify clients */ + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); + return; + +out: + mutex_unlock(&persist->device_state_mutex); +} + +static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist) +{ + int err = 0; + + mlx4_enter_error_state(persist); + err = mlx4_restart_one(persist->pdev); + mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", err); +} + static void dump_err_buf(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -66,21 +143,22 @@ static void poll_catas(unsigned long dev_ptr) struct mlx4_priv *priv = mlx4_priv(dev); if (readl(priv->catas_err.map)) { - /* If the device is off-line, we cannot try to recover it */ - if (pci_channel_offline(dev->persist->pdev)) - mod_timer(&priv->catas_err.timer, - round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); - else { - dump_err_buf(dev); - mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); - - if (internal_err_reset) - queue_work(dev->persist->catas_wq, - &dev->persist->catas_work); - } - } else - mod_timer(&priv->catas_err.timer, - round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); + dump_err_buf(dev); + goto internal_err; + } + + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + mlx4_warn(dev, "Internal error mark was detected on device\n"); + goto internal_err; + } + + mod_timer(&priv->catas_err.timer, + round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL)); + return; + +internal_err: + if (internal_err_reset) + 
queue_work(dev->persist->catas_wq, &dev->persist->catas_work); } static void catas_reset(struct work_struct *work) @@ -88,20 +166,8 @@ static void catas_reset(struct work_struct *work) struct mlx4_dev_persistent *persist = container_of(work, struct mlx4_dev_persistent, catas_work); - struct pci_dev *pdev = persist->pdev; - int ret; - - /* If the device is off-line, we cannot reset it */ - if (pci_channel_offline(pdev)) - return; - ret = mlx4_restart_one(pdev); - /* 'priv' now is not valid */ - if (ret) - pr_err("mlx4 %s: Reset failed (%d)\n", - pci_name(pdev), ret); - else - mlx4_dbg(persist->dev, "Reset succeeded\n"); + mlx4_handle_error_state(persist); } void mlx4_start_catas_poll(struct mlx4_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index a61694cc1476..dc2d910fcc88 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2624,6 +2624,11 @@ static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, } } + /* on load remove any previous indication of internal error, + * device is up. + */ + dev->persist->state = MLX4_DEVICE_STATE_UP; + slave_start: err = mlx4_cmd_init(dev); if (err) { @@ -3108,6 +3113,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) dev->persist->dev = dev; pci_set_drvdata(pdev, dev->persist); priv->pci_dev_data = id->driver_data; + mutex_init(&dev->persist->device_state_mutex); ret = __mlx4_init_one(pdev, id->driver_data, priv); if (ret) { diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index d41af84f965b..aa1ecbc5a606 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -1178,7 +1178,7 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type); void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type); -void mlx4_handle_catas_err(struct mlx4_dev *dev); +void mlx4_enter_error_state(struct mlx4_dev_persistent *persist); int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port, enum mlx4_port_type *type); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index da425d2f3708..7d5d317cb7a6 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -411,6 +411,11 @@ enum { MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK = 1 << 4, }; +enum { + MLX4_DEVICE_STATE_UP = 1 << 0, + MLX4_DEVICE_STATE_INTERNAL_ERROR = 1 << 1, +}; + #define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK) @@ -753,6 +758,8 @@ struct mlx4_dev_persistent { enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1]; struct work_struct catas_work; struct workqueue_struct *catas_wq; + struct mutex device_state_mutex; /* protect HW state */ + u8 state; }; struct mlx4_dev { -- cgit v1.2.3 From f5aef5aa35063f2b45c3605871cd525d0cb7fb7a Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:39 +0200 Subject: net/mlx4_core: Activate reset flow upon fatal command cases We activate the reset flow upon fatal command errors, i.e. when the device enters an erroneous state and must be reset. The following cases are assumed to be fatal: a FW command timed out, FW returned an error on a closing command, or PCI is offline when a command is posted or pending. In those cases we place the device into an error state: the chip is reset, and pending commands are awakened and completed immediately. Subsequent commands will return immediately.
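Concretely, "awakened and completed immediately" means firing every outstanding command completion with an internal-error result so that no caller stays blocked on a dead chip. A minimal sketch of that step (cmd_ctx and wake_all_cmds are invented names for illustration; the driver's real version is the mlx4_cmd_wake_completions() added by this patch):

    #include <linux/completion.h>

    struct cmd_ctx {
            struct completion done;
            int result;
    };

    /* Complete every pending command with an error result; callers
     * blocked in wait_for_completion_timeout() wake up right away
     * instead of waiting out their timeouts against a dead device. */
    static void wake_all_cmds(struct cmd_ctx *ctx, int n, int err)
    {
            int i;

            for (i = 0; i < n; i++) {
                    ctx[i].result = err;    /* e.g. -EIO */
                    complete(&ctx[i].done);
            }
    }

For this to be safe at any point in time, each completion must be initialized up front and only re-armed per command, which is why the diff below switches init_completion() to reinit_completion() and initializes all contexts in mlx4_cmd_use_events().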
The return code in these fatal cases depends on the command. Commands which free and close resources will return success (because the chip was reset, so callers may safely free their kernel resources). Other commands will return -EIO. Since the device's state was marked as error, the catas poller will detect this and restart the device's software stack (as is done when a FW internal error is directly detected). The device state is protected by a persistent mutex that lives on its mlx4_dev; as a result, the hcr_mutex is no longer needed and is removed. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/catas.c | 11 +- drivers/net/ethernet/mellanox/mlx4/cmd.c | 163 +++++++++++++++++++++++++---- drivers/net/ethernet/mellanox/mlx4/mcg.c | 3 + drivers/net/ethernet/mellanox/mlx4/mlx4.h | 2 +- include/linux/mlx4/cmd.h | 1 + 5 files changed, 153 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 588d6b5e1211..63f14ffcc906 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -42,8 +42,8 @@ enum { -static int internal_err_reset = 1; -module_param(internal_err_reset, int, 0644); +int mlx4_internal_err_reset = 1; +module_param_named(internal_err_reset, mlx4_internal_err_reset, int, 0644); MODULE_PARM_DESC(internal_err_reset, "Reset device on internal errors if non-zero" " (default 1, in SRIOV mode default is 0)"); @@ -92,7 +92,7 @@ void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) int err; struct mlx4_dev *dev; - if (!internal_err_reset) + if (!mlx4_internal_err_reset) return; mutex_lock(&persist->device_state_mutex); @@ -110,6 +110,7 @@ void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) /* At that step HW was already reset, now notify clients */ mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0); + mlx4_cmd_wake_completions(dev); return; out: @@ -157,7 +158,7 @@ static void poll_catas(unsigned long dev_ptr) return; internal_err: - if (internal_err_reset) + if (mlx4_internal_err_reset) queue_work(dev->persist->catas_wq, &dev->persist->catas_work); } @@ -177,7 +178,7 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev) /*If we are in SRIOV the default of the module param must be 0*/ if (mlx4_is_mfunc(dev)) - internal_err_reset = 0; + mlx4_internal_err_reset = 0; INIT_LIST_HEAD(&priv->catas_err.list); init_timer(&priv->catas_err.timer); diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 7cd90e6a4272..3895b2b5fc92 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -182,6 +182,72 @@ static u8 mlx4_errno_to_status(int errno) } } +static int mlx4_internal_err_ret_value(struct mlx4_dev *dev, u16 op, + u8 op_modifier) +{ + switch (op) { + case MLX4_CMD_UNMAP_ICM: + case MLX4_CMD_UNMAP_ICM_AUX: + case MLX4_CMD_UNMAP_FA: + case MLX4_CMD_2RST_QP: + case MLX4_CMD_HW2SW_EQ: + case MLX4_CMD_HW2SW_CQ: + case MLX4_CMD_HW2SW_SRQ: + case MLX4_CMD_HW2SW_MPT: + case MLX4_CMD_CLOSE_HCA: + case MLX4_QP_FLOW_STEERING_DETACH: + case MLX4_CMD_FREE_RES: + case MLX4_CMD_CLOSE_PORT: + return CMD_STAT_OK; + + case MLX4_CMD_QP_ATTACH: + /* On Detach case return success */ + if (op_modifier == 0) + return CMD_STAT_OK; + return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + + default: + return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + } +} + +static
int mlx4_closing_cmd_fatal_error(u16 op, u8 fw_status) +{ + /* Any error during the closing commands below is considered fatal */ + if (op == MLX4_CMD_CLOSE_HCA || + op == MLX4_CMD_HW2SW_EQ || + op == MLX4_CMD_HW2SW_CQ || + op == MLX4_CMD_2RST_QP || + op == MLX4_CMD_HW2SW_SRQ || + op == MLX4_CMD_SYNC_TPT || + op == MLX4_CMD_UNMAP_ICM || + op == MLX4_CMD_UNMAP_ICM_AUX || + op == MLX4_CMD_UNMAP_FA) + return 1; + /* Error on MLX4_CMD_HW2SW_MPT is fatal except when fw status equals + * CMD_STAT_REG_BOUND. + * This status indicates that memory region has memory windows bound to it + * which may result from invalid user space usage and is not fatal. + */ + if (op == MLX4_CMD_HW2SW_MPT && fw_status != CMD_STAT_REG_BOUND) + return 1; + return 0; +} + +static int mlx4_cmd_reset_flow(struct mlx4_dev *dev, u16 op, u8 op_modifier, + int err) +{ + /* Only if reset flow is really active return code is based on + * command, otherwise current error code is returned. + */ + if (mlx4_internal_err_reset) { + mlx4_enter_error_state(dev->persist); + err = mlx4_internal_err_ret_value(dev, op, op_modifier); + } + + return err; +} + static int comm_pending(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -258,7 +324,7 @@ static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op, cmd->free_head = context->next; spin_unlock(&cmd->context_lock); - init_completion(&context->done); + reinit_completion(&context->done); mlx4_comm_cmd_post(dev, op, param); @@ -323,17 +389,21 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, { struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd; u32 __iomem *hcr = cmd->hcr; - int ret = -EAGAIN; + int ret = -EIO; unsigned long end; - mutex_lock(&cmd->hcr_mutex); - - if (pci_channel_offline(dev->persist->pdev)) { + mutex_lock(&dev->persist->device_state_mutex); + /* To avoid writing to unknown addresses after the device state was + * changed to internal error and the chip was reset, + * check the INTERNAL_ERROR flag which is updated under + * device_state_mutex lock. + */ + if (pci_channel_offline(dev->persist->pdev) || + (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)) { /* * Device is going through error recovery * and cannot accept commands. */ - ret = -EIO; goto out; } @@ -347,7 +417,6 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, * Device is going through error recovery * and cannot accept commands. */ - ret = -EIO; goto out; } @@ -391,7 +460,11 @@ static int mlx4_cmd_post(struct mlx4_dev *dev, u64 in_param, u64 out_param, ret = 0; out: - mutex_unlock(&cmd->hcr_mutex); + if (ret) + mlx4_warn(dev, "Could not post command 0x%x: ret=%d, in_param=0x%llx, in_mod=0x%x, op_mod=0x%x\n", + op, ret, in_param, in_modifier, op_modifier); + mutex_unlock(&dev->persist->device_state_mutex); + return ret; } @@ -464,12 +537,12 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, down(&priv->cmd.poll_sem); - if (pci_channel_offline(dev->persist->pdev)) { + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { /* * Device is going through error recovery * and cannot accept commands. */ - err = -EIO; + err = mlx4_internal_err_ret_value(dev, op, op_modifier); goto out; } @@ -483,7 +556,7 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, err = mlx4_cmd_post(dev, in_param, out_param ? 
*out_param : 0, in_modifier, op_modifier, op, CMD_POLL_TOKEN, 0); if (err) - goto out; + goto out_reset; end = msecs_to_jiffies(timeout) + jiffies; while (cmd_pending(dev) && time_before(jiffies, end)) { @@ -493,6 +566,11 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, * and cannot accept commands. */ err = -EIO; + goto out_reset; + } + + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = mlx4_internal_err_ret_value(dev, op, op_modifier); goto out; } @@ -502,8 +580,8 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, if (cmd_pending(dev)) { mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n", op); - err = -ETIMEDOUT; - goto out; + err = -EIO; + goto out_reset; } if (out_is_imm) @@ -515,10 +593,17 @@ static int mlx4_cmd_poll(struct mlx4_dev *dev, u64 in_param, u64 *out_param, stat = be32_to_cpu((__force __be32) __raw_readl(hcr + HCR_STATUS_OFFSET)) >> 24; err = mlx4_status_to_errno(stat); - if (err) + if (err) { mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", op, stat); + if (mlx4_closing_cmd_fatal_error(op, stat)) + goto out_reset; + goto out; + } +out_reset: + if (err) + err = mlx4_cmd_reset_flow(dev, op, op_modifier, err); out: up(&priv->cmd.poll_sem); return err; @@ -565,17 +650,19 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, goto out; } - init_completion(&context->done); + reinit_completion(&context->done); - mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, - in_modifier, op_modifier, op, context->token, 1); + err = mlx4_cmd_post(dev, in_param, out_param ? *out_param : 0, + in_modifier, op_modifier, op, context->token, 1); + if (err) + goto out_reset; if (!wait_for_completion_timeout(&context->done, msecs_to_jiffies(timeout))) { mlx4_warn(dev, "command 0x%x timed out (go bit not cleared)\n", op); - err = -EBUSY; - goto out; + err = -EIO; + goto out_reset; } err = context->result; @@ -592,12 +679,20 @@ static int mlx4_cmd_wait(struct mlx4_dev *dev, u64 in_param, u64 *out_param, else mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n", op, context->fw_status); + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + err = mlx4_internal_err_ret_value(dev, op, op_modifier); + else if (mlx4_closing_cmd_fatal_error(op, context->fw_status)) + goto out_reset; + goto out; } if (out_is_imm) *out_param = context->out_param; +out_reset: + if (err) + err = mlx4_cmd_reset_flow(dev, op, op_modifier, err); out: spin_lock(&cmd->context_lock); context->next = cmd->free_head; @@ -613,9 +708,12 @@ int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, u16 op, unsigned long timeout, int native) { if (pci_channel_offline(dev->persist->pdev)) - return -EIO; + return mlx4_cmd_reset_flow(dev, op, op_modifier, -EIO); if (!mlx4_is_mfunc(dev) || (native && mlx4_is_master(dev))) { + if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + return mlx4_internal_err_ret_value(dev, op, + op_modifier); if (mlx4_priv(dev)->cmd.use_events) return mlx4_cmd_wait(dev, in_param, out_param, out_is_imm, in_modifier, @@ -2121,7 +2219,6 @@ int mlx4_cmd_init(struct mlx4_dev *dev) int flags = 0; if (!priv->cmd.initialized) { - mutex_init(&priv->cmd.hcr_mutex); mutex_init(&priv->cmd.slave_cmd_mutex); sema_init(&priv->cmd.poll_sem, 1); priv->cmd.use_events = 0; @@ -2232,6 +2329,11 @@ int mlx4_cmd_use_events(struct mlx4_dev *dev) for (i = 0; i < priv->cmd.max_cmds; ++i) { priv->cmd.context[i].token = i; priv->cmd.context[i].next = i + 1; + /* To support fatal error 
flow, initialize all + * cmd contexts to allow simulating completions + * with complete() at any time. + */ + init_completion(&priv->cmd.context[i].done); } priv->cmd.context[priv->cmd.max_cmds - 1].next = -1; @@ -2329,6 +2431,25 @@ int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave) return slave - 1; } +void mlx4_cmd_wake_completions(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_cmd_context *context; + int i; + + spin_lock(&priv->cmd.context_lock); + if (priv->cmd.context) { + for (i = 0; i < priv->cmd.max_cmds; ++i) { + context = &priv->cmd.context[i]; + context->fw_status = CMD_STAT_INTERNAL_ERR; + context->result = + mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR); + complete(&context->done); + } + } + spin_unlock(&priv->cmd.context_lock); +} + struct mlx4_active_ports mlx4_get_active_ports(struct mlx4_dev *dev, int slave) { struct mlx4_active_ports actv_ports; diff --git a/drivers/net/ethernet/mellanox/mlx4/mcg.c b/drivers/net/ethernet/mellanox/mlx4/mcg.c index a3867e7ef885..d22d9283d2cd 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mcg.c +++ b/drivers/net/ethernet/mellanox/mlx4/mcg.c @@ -1318,6 +1318,9 @@ out: mutex_unlock(&priv->mcg_table.mutex); mlx4_free_cmd_mailbox(dev, mailbox); + if (err && dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) + /* In case device is under an error, return success as a closing command */ + err = 0; return err; } diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index aa1ecbc5a606..5c772ea4473b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -235,6 +235,7 @@ do { \ extern int mlx4_log_num_mgm_entry_size; extern int log_mtts_per_seg; +extern int mlx4_internal_err_reset; #define MLX4_MAX_NUM_SLAVES (MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF) #define ALL_SLAVES 0xff @@ -607,7 +608,6 @@ struct mlx4_mgm { struct mlx4_cmd { struct pci_pool *pool; void __iomem *hcr; - struct mutex hcr_mutex; struct mutex slave_cmd_mutex; struct semaphore poll_sem; struct semaphore event_sem; diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index 64d25941b329..e7543844cc7a 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -279,6 +279,7 @@ int mlx4_get_vf_config(struct mlx4_dev *dev, int port, int vf, struct ifla_vf_in int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_state); int mlx4_config_dev_retrieval(struct mlx4_dev *dev, struct mlx4_config_dev_params *params); +void mlx4_cmd_wake_completions(struct mlx4_dev *dev); /* * mlx4_get_slave_default_vlan - * return true if VST ( default vlan) -- cgit v1.2.3 From c69453e294c9f16da977b68e658a8028b854c209 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:40 +0200 Subject: net/mlx4_core: Manage interface state for Reset flow cases We need to manage the interface state to sync between the reset flow and other related cases such as remove_one. This has to be done to prevent certain races. For example, if the software stack is already down as a result of an unload call, remove_one should skip the unload phase. This patch implements the remove_one case; handling AER and other cases comes next. The interface can be up or down; upon remove_one, the state will include an extra bit indicating that the device is being cleaned up, forcing other tasks to finish before the final cleanup. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx4/catas.c | 13 +++++++++++-- drivers/net/ethernet/mellanox/mlx4/intf.c | 2 ++ drivers/net/ethernet/mellanox/mlx4/main.c | 13 ++++++++++++- include/linux/mlx4/device.h | 7 +++++++ 4 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 63f14ffcc906..3fcf3cfaedfc 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -122,8 +122,14 @@ static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist) int err = 0; mlx4_enter_error_state(persist); - err = mlx4_restart_one(persist->pdev); - mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", err); + mutex_lock(&persist->interface_state_mutex); + if (persist->interface_state & MLX4_INTERFACE_STATE_UP && + !(persist->interface_state & MLX4_INTERFACE_STATE_DELETION)) { + err = mlx4_restart_one(persist->pdev); + mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", + err); + } + mutex_unlock(&persist->interface_state_mutex); } static void dump_err_buf(struct mlx4_dev *dev) @@ -211,6 +217,9 @@ void mlx4_stop_catas_poll(struct mlx4_dev *dev) iounmap(priv->catas_err.map); priv->catas_err.map = NULL; } + + if (dev->persist->interface_state & MLX4_INTERFACE_STATE_DELETION) + flush_workqueue(dev->persist->catas_wq); } int mlx4_catas_init(struct mlx4_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c index 116895ac8b35..fba0b96a6f28 100644 --- a/drivers/net/ethernet/mellanox/mlx4/intf.c +++ b/drivers/net/ethernet/mellanox/mlx4/intf.c @@ -138,6 +138,7 @@ int mlx4_register_device(struct mlx4_dev *dev) mutex_lock(&intf_mutex); + dev->persist->interface_state |= MLX4_INTERFACE_STATE_UP; list_add_tail(&priv->dev_list, &dev_list); list_for_each_entry(intf, &intf_list, list) mlx4_add_device(intf, priv); @@ -162,6 +163,7 @@ void mlx4_unregister_device(struct mlx4_dev *dev) mlx4_remove_device(intf, priv); list_del(&priv->dev_list); + dev->persist->interface_state &= ~MLX4_INTERFACE_STATE_UP; mutex_unlock(&intf_mutex); } diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index dc2d910fcc88..d59cae5da3f0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3114,6 +3114,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id) pci_set_drvdata(pdev, dev->persist); priv->pci_dev_data = id->driver_data; mutex_init(&dev->persist->device_state_mutex); + mutex_init(&dev->persist->interface_state_mutex); ret = __mlx4_init_one(pdev, id->driver_data, priv); if (ret) { @@ -3232,7 +3233,17 @@ static void mlx4_remove_one(struct pci_dev *pdev) struct mlx4_dev *dev = persist->dev; struct mlx4_priv *priv = mlx4_priv(dev); - mlx4_unload_one(pdev); + mutex_lock(&persist->interface_state_mutex); + persist->interface_state |= MLX4_INTERFACE_STATE_DELETION; + mutex_unlock(&persist->interface_state_mutex); + + /* device marked to be under deletion running now without the lock + * letting other tasks to be terminated + */ + if (persist->interface_state & MLX4_INTERFACE_STATE_UP) + mlx4_unload_one(pdev); + else + mlx4_info(dev, "%s: interface is down\n", __func__); mlx4_catas_end(dev); pci_release_regions(pdev); pci_disable_device(pdev); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 7d5d317cb7a6..33f9ca71925c 100644 --- 
a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -416,6 +416,11 @@ enum { MLX4_DEVICE_STATE_INTERNAL_ERROR = 1 << 1, }; +enum { + MLX4_INTERFACE_STATE_UP = 1 << 0, + MLX4_INTERFACE_STATE_DELETION = 1 << 1, +}; + #define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \ MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK) @@ -760,6 +765,8 @@ struct mlx4_dev_persistent { struct workqueue_struct *catas_wq; struct mutex device_state_mutex; /* protect HW state */ u8 state; + struct mutex interface_state_mutex; /* protect SW state */ + u8 interface_state; }; struct mlx4_dev { -- cgit v1.2.3 From 55ad359225b2232b9b8f04a0dfa169bd3a7d86d2 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:42 +0200 Subject: net/mlx4_core: Enable device recovery flow with SRIOV In SRIOV, both the PF and the VF may attempt device recovery whenever they assume that the device is not functioning. When the PF driver resets the device, the VF should detect this and attempt to reinitialize itself. The VF must be able to reset itself under all circumstances, even if the PF is not responsive. The VF shall reset itself in the following cases: 1. Commands are not processed within a reasonable time over the communication channel. This is handled by considering the device state and choosing the correct return code based on the command, as was done in native mode; that part comes in the next patch. 2. The VF driver receives an internal error event reported by the PF on the communication channel. This occurs when the PF driver resets the device or when the VF is out of sync with the PF. Add 'VF reset' capability, which allows the VF to reinitialize itself even when the PF is not responsive. As the PF and VF may run their reset flows simultaneously, several cases must be handled: - Prevent freeing VF resources upon FLR when the PF is in its unloading stage. - Prevent the PF from handling VF commands before it has finished initializing its resources. - Upon VF startup, check that the comm channel is online before sending commands to the PF, to avoid timing out. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx4/catas.c | 120 +++++++++++++++++++++---- drivers/net/ethernet/mellanox/mlx4/cmd.c | 77 ++++++++++++---- drivers/net/ethernet/mellanox/mlx4/eq.c | 10 ++- drivers/net/ethernet/mellanox/mlx4/intf.c | 6 +- drivers/net/ethernet/mellanox/mlx4/main.c | 135 +++++++++++++++++++++++------ drivers/net/ethernet/mellanox/mlx4/mlx4.h | 7 +- include/linux/mlx4/cmd.h | 2 + include/linux/mlx4/device.h | 5 ++ 8 files changed, 292 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/catas.c b/drivers/net/ethernet/mellanox/mlx4/catas.c index 3fcf3cfaedfc..715de8affcc9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/catas.c +++ b/drivers/net/ethernet/mellanox/mlx4/catas.c @@ -45,8 +45,7 @@ enum { int mlx4_internal_err_reset = 1; module_param_named(internal_err_reset, mlx4_internal_err_reset, int, 0644); MODULE_PARM_DESC(internal_err_reset, - "Reset device on internal errors if non-zero" - " (default 1, in SRIOV mode default is 0)"); + "Reset device on internal errors if non-zero (default 1)"); static int read_vendor_id(struct mlx4_dev *dev) { @@ -71,6 +70,9 @@ static int mlx4_reset_master(struct mlx4_dev *dev) { int err = 0; + if (mlx4_is_master(dev)) + mlx4_report_internal_err_comm_event(dev); + if (!pci_channel_offline(dev->persist->pdev)) { err = read_vendor_id(dev); /* If PCI can't be accessed to read vendor ID we assume that its @@ -87,6 +89,81 @@ static int mlx4_reset_master(struct mlx4_dev *dev) return err; } +static int mlx4_reset_slave(struct mlx4_dev *dev) +{ +#define COM_CHAN_RST_REQ_OFFSET 0x10 +#define COM_CHAN_RST_ACK_OFFSET 0x08 + + u32 comm_flags; + u32 rst_req; + u32 rst_ack; + unsigned long end; + struct mlx4_priv *priv = mlx4_priv(dev); + + if (pci_channel_offline(dev->persist->pdev)) + return 0; + + comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_FLAGS)); + if (comm_flags == 0xffffffff) { + mlx4_err(dev, "VF reset is not needed\n"); + return 0; + } + + if (!(dev->caps.vf_caps & MLX4_VF_CAP_FLAG_RESET)) { + mlx4_err(dev, "VF reset is not supported\n"); + return -EOPNOTSUPP; + } + + rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> + COM_CHAN_RST_REQ_OFFSET; + rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> + COM_CHAN_RST_ACK_OFFSET; + if (rst_req != rst_ack) { + mlx4_err(dev, "Communication channel isn't sync, fail to send reset\n"); + return -EIO; + } + + rst_req ^= 1; + mlx4_warn(dev, "VF is sending reset request to Firmware\n"); + comm_flags = rst_req << COM_CHAN_RST_REQ_OFFSET; + __raw_writel((__force u32)cpu_to_be32(comm_flags), + (__iomem char *)priv->mfunc.comm + MLX4_COMM_CHAN_FLAGS); + /* Make sure that our comm channel write doesn't + * get mixed in with writes from another CPU. + */ + mmiowb(); + + end = msecs_to_jiffies(MLX4_COMM_TIME) + jiffies; + while (time_before(jiffies, end)) { + comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_FLAGS)); + rst_ack = (comm_flags & (u32)(1 << COM_CHAN_RST_ACK_OFFSET)) >> + COM_CHAN_RST_ACK_OFFSET; + + /* Reading rst_req again since the communication channel can + * be reset at any time by the PF and all its bits will be + * set to zero. 
+ */ + rst_req = (comm_flags & (u32)(1 << COM_CHAN_RST_REQ_OFFSET)) >> + COM_CHAN_RST_REQ_OFFSET; + + if (rst_ack == rst_req) { + mlx4_warn(dev, "VF Reset succeed\n"); + return 0; + } + cond_resched(); + } + mlx4_err(dev, "Fail to send reset over the communication channel\n"); + return -ETIMEDOUT; +} + +static int mlx4_comm_internal_err(u32 slave_read) +{ + return (u32)COMM_CHAN_EVENT_INTERNAL_ERR == + (slave_read & (u32)COMM_CHAN_EVENT_INTERNAL_ERR) ? 1 : 0; +} + void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) { int err; @@ -101,7 +178,10 @@ void mlx4_enter_error_state(struct mlx4_dev_persistent *persist) dev = persist->dev; mlx4_err(dev, "device is going to be reset\n"); - err = mlx4_reset_master(dev); + if (mlx4_is_slave(dev)) + err = mlx4_reset_slave(dev); + else + err = mlx4_reset_master(dev); BUG_ON(err != 0); dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR; @@ -148,8 +228,15 @@ static void poll_catas(unsigned long dev_ptr) { struct mlx4_dev *dev = (struct mlx4_dev *) dev_ptr; struct mlx4_priv *priv = mlx4_priv(dev); - - if (readl(priv->catas_err.map)) { + u32 slave_read; + + if (mlx4_is_slave(dev)) { + slave_read = swab32(readl(&priv->mfunc.comm->slave_read)); + if (mlx4_comm_internal_err(slave_read)) { + mlx4_warn(dev, "Internal error detected on the communication channel\n"); + goto internal_err; + } + } else if (readl(priv->catas_err.map)) { dump_err_buf(dev); goto internal_err; } @@ -182,22 +269,21 @@ void mlx4_start_catas_poll(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); phys_addr_t addr; - /*If we are in SRIOV the default of the module param must be 0*/ - if (mlx4_is_mfunc(dev)) - mlx4_internal_err_reset = 0; - INIT_LIST_HEAD(&priv->catas_err.list); init_timer(&priv->catas_err.timer); priv->catas_err.map = NULL; - addr = pci_resource_start(dev->persist->pdev, priv->fw.catas_bar) + - priv->fw.catas_offset; - - priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); - if (!priv->catas_err.map) { - mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", - (unsigned long long) addr); - return; + if (!mlx4_is_slave(dev)) { + addr = pci_resource_start(dev->persist->pdev, + priv->fw.catas_bar) + + priv->fw.catas_offset; + + priv->catas_err.map = ioremap(addr, priv->fw.catas_size * 4); + if (!priv->catas_err.map) { + mlx4_warn(dev, "Failed to map internal error buffer at 0x%llx\n", + (unsigned long long)addr); + return; + } } priv->catas_err.timer.data = (unsigned long) dev; diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 3895b2b5fc92..7652eed4bbc8 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -42,6 +42,7 @@ #include #include #include +#include #include @@ -729,7 +730,7 @@ int __mlx4_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param, EXPORT_SYMBOL_GPL(__mlx4_cmd); -static int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev) +int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev) { return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_ARM_COMM_CHANNEL, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); @@ -1945,8 +1946,11 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, break; case MLX4_COMM_CMD_VHCR_POST: if ((slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_EN) && - (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST)) + (slave_state[slave].last_cmd != MLX4_COMM_CMD_VHCR_POST)) { + mlx4_warn(dev, "slave:%d is out of sync, cmd=0x%x, last command=0x%x, reset is needed\n", + slave, cmd, slave_state[slave].last_cmd); 
goto reset_slave; + } mutex_lock(&priv->cmd.slave_cmd_mutex); if (mlx4_master_process_vhcr(dev, slave, NULL)) { @@ -1980,7 +1984,18 @@ static void mlx4_master_do_cmd(struct mlx4_dev *dev, int slave, u8 cmd, reset_slave: /* cleanup any slave resources */ - mlx4_delete_all_resources_for_slave(dev, slave); + if (dev->persist->interface_state & MLX4_INTERFACE_STATE_UP) + mlx4_delete_all_resources_for_slave(dev, slave); + + if (cmd != MLX4_COMM_CMD_RESET) { + mlx4_warn(dev, "Turn on internal error to force reset, slave=%d, cmd=0x%x\n", + slave, cmd); + /* Turn on internal error letting slave reset itself immediately, + * otherwise it might take until the command timeout passes + */ + reply |= ((u32)COMM_CHAN_EVENT_INTERNAL_ERR); + } + spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); if (!slave_state[slave].is_slave_going_down) slave_state[slave].last_cmd = MLX4_COMM_CMD_RESET; @@ -2056,17 +2071,28 @@ void mlx4_master_comm_channel(struct work_struct *work) static int sync_toggles(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); - int wr_toggle; - int rd_toggle; + u32 wr_toggle; + u32 rd_toggle; unsigned long end; - wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write)) >> 31; - end = jiffies + msecs_to_jiffies(5000); + wr_toggle = swab32(readl(&priv->mfunc.comm->slave_write)); + if (wr_toggle == 0xffffffff) + end = jiffies + msecs_to_jiffies(30000); + else + end = jiffies + msecs_to_jiffies(5000); while (time_before(jiffies, end)) { - rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)) >> 31; - if (rd_toggle == wr_toggle) { - priv->cmd.comm_toggle = rd_toggle; + rd_toggle = swab32(readl(&priv->mfunc.comm->slave_read)); + if (wr_toggle == 0xffffffff || rd_toggle == 0xffffffff) { + /* PCI might be offline */ + msleep(100); + wr_toggle = swab32(readl(&priv->mfunc.comm-> + slave_write)); + continue; + } + + if (rd_toggle >> 31 == wr_toggle >> 31) { + priv->cmd.comm_toggle = rd_toggle >> 31; return 0; } @@ -2172,13 +2198,6 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) if (mlx4_init_resource_tracker(dev)) goto err_thread; - err = mlx4_ARM_COMM_CHANNEL(dev); - if (err) { - mlx4_err(dev, " Failed to arm comm channel eq: %x\n", - err); - goto err_resource; - } - } else { err = sync_toggles(dev); if (err) { @@ -2188,8 +2207,6 @@ } return 0; -err_resource: - mlx4_free_resource_tracker(dev, RES_TR_FREE_ALL); err_thread: flush_workqueue(priv->mfunc.master.comm_wq); destroy_workqueue(priv->mfunc.master.comm_wq); @@ -2266,6 +2283,27 @@ err: return -ENOMEM; } +void mlx4_report_internal_err_comm_event(struct mlx4_dev *dev) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + int slave; + u32 slave_read; + + /* Report an internal error event to all + * communication channels. + */ + for (slave = 0; slave < dev->num_slaves; slave++) { + slave_read = swab32(readl(&priv->mfunc.comm[slave].slave_read)); + slave_read |= (u32)COMM_CHAN_EVENT_INTERNAL_ERR; + __raw_writel((__force u32)cpu_to_be32(slave_read), + &priv->mfunc.comm[slave].slave_read); + /* Make sure that our comm channel write doesn't + * get mixed in with writes from another CPU.
+ */ + mmiowb(); + } +} + void mlx4_multi_func_cleanup(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -2281,6 +2319,7 @@ void mlx4_multi_func_cleanup(struct mlx4_dev *dev) kfree(priv->mfunc.master.slave_state); kfree(priv->mfunc.master.vf_admin); kfree(priv->mfunc.master.vf_oper); + dev->num_slaves = 0; } iounmap(priv->mfunc.comm); diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index 7538c9ce98a9..2f2e6067426d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -429,8 +429,14 @@ void mlx4_master_handle_slave_flr(struct work_struct *work) if (MLX4_COMM_CMD_FLR == slave_state[i].last_cmd) { mlx4_dbg(dev, "mlx4_handle_slave_flr: clean slave: %d\n", i); - - mlx4_delete_all_resources_for_slave(dev, i); + /* In case of 'Reset flow' FLR can be generated for + * a slave before mlx4_load_one is done. + * make sure interface is up before trying to delete + * slave resources which weren't allocated yet. + */ + if (dev->persist->interface_state & + MLX4_INTERFACE_STATE_UP) + mlx4_delete_all_resources_for_slave(dev, i); /*return the slave to running mode*/ spin_lock_irqsave(&priv->mfunc.master.slave_state_lock, flags); slave_state[i].last_cmd = MLX4_COMM_CMD_RESET; diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c index fba0b96a6f28..68d2bad325d5 100644 --- a/drivers/net/ethernet/mellanox/mlx4/intf.c +++ b/drivers/net/ethernet/mellanox/mlx4/intf.c @@ -144,8 +144,7 @@ int mlx4_register_device(struct mlx4_dev *dev) mlx4_add_device(intf, priv); mutex_unlock(&intf_mutex); - if (!mlx4_is_slave(dev)) - mlx4_start_catas_poll(dev); + mlx4_start_catas_poll(dev); return 0; } @@ -155,8 +154,7 @@ void mlx4_unregister_device(struct mlx4_dev *dev) struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_interface *intf; - if (!mlx4_is_slave(dev)) - mlx4_stop_catas_poll(dev); + mlx4_stop_catas_poll(dev); mutex_lock(&intf_mutex); list_for_each_entry(intf, &intf_list, list) diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 6bb0fca137cd..1baf1f1e2866 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -108,6 +108,8 @@ MODULE_PARM_DESC(enable_64b_cqe_eqe, MLX4_FUNC_CAP_EQE_CQE_STRIDE | \ MLX4_FUNC_CAP_DMFS_A0_STATIC) +#define RESET_PERSIST_MASK_FLAGS (MLX4_FLAG_SRIOV) + static char mlx4_version[] = DRV_NAME ": Mellanox ConnectX core driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; @@ -1579,6 +1581,50 @@ static void mlx4_close_fw(struct mlx4_dev *dev) } } +static int mlx4_comm_check_offline(struct mlx4_dev *dev) +{ +#define COMM_CHAN_OFFLINE_OFFSET 0x09 + + u32 comm_flags; + u32 offline_bit; + unsigned long end; + struct mlx4_priv *priv = mlx4_priv(dev); + + end = msecs_to_jiffies(MLX4_COMM_OFFLINE_TIME_OUT) + jiffies; + while (time_before(jiffies, end)) { + comm_flags = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_FLAGS)); + offline_bit = (comm_flags & + (u32)(1 << COMM_CHAN_OFFLINE_OFFSET)); + if (!offline_bit) + return 0; + /* There are cases as part of AER/Reset flow that PF needs + * around 100 msec to load. We therefore sleep for 100 msec + * to allow other tasks to make use of that CPU during this + * time interval. 
+ */ + msleep(100); + } + mlx4_err(dev, "Communication channel is offline.\n"); + return -EIO; +} + +static void mlx4_reset_vf_support(struct mlx4_dev *dev) +{ +#define COMM_CHAN_RST_OFFSET 0x1e + + struct mlx4_priv *priv = mlx4_priv(dev); + u32 comm_rst; + u32 comm_caps; + + comm_caps = swab32(readl((__iomem char *)priv->mfunc.comm + + MLX4_COMM_CHAN_CAPS)); + comm_rst = (comm_caps & (u32)(1 << COMM_CHAN_RST_OFFSET)); + + if (comm_rst) + dev->caps.vf_caps |= MLX4_VF_CAP_FLAG_RESET; +} + static int mlx4_init_slave(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -1594,6 +1640,12 @@ static int mlx4_init_slave(struct mlx4_dev *dev) mutex_lock(&priv->cmd.slave_cmd_mutex); priv->cmd.max_cmds = 1; + if (mlx4_comm_check_offline(dev)) { + mlx4_err(dev, "PF is not responsive, skipping initialization\n"); + goto err_offline; + } + + mlx4_reset_vf_support(dev); mlx4_warn(dev, "Sending reset\n"); ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME); @@ -1637,6 +1689,7 @@ static int mlx4_init_slave(struct mlx4_dev *dev) err: mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0); +err_offline: mutex_unlock(&priv->cmd.slave_cmd_mutex); return -EIO; } @@ -2494,11 +2547,19 @@ static void mlx4_free_ownership(struct mlx4_dev *dev) !!((flags) & MLX4_FLAG_MASTER)) static u64 mlx4_enable_sriov(struct mlx4_dev *dev, struct pci_dev *pdev, - u8 total_vfs, int existing_vfs) + u8 total_vfs, int existing_vfs, int reset_flow) { u64 dev_flags = dev->flags; int err = 0; + if (reset_flow) { + dev->dev_vfs = kcalloc(total_vfs, sizeof(*dev->dev_vfs), + GFP_KERNEL); + if (!dev->dev_vfs) + goto free_mem; + return dev_flags; + } + atomic_inc(&pf_loading); if (dev->flags & MLX4_FLAG_SRIOV) { if (existing_vfs != total_vfs) { @@ -2533,6 +2594,7 @@ static u64 mlx4_enable_sriov(struct mlx4_dev *dev, struct pci_dev *pdev, disable_sriov: atomic_dec(&pf_loading); +free_mem: dev->persist->num_vfs = 0; kfree(dev->dev_vfs); return dev_flags & ~MLX4_FLAG_MASTER; @@ -2557,7 +2619,8 @@ static int mlx4_check_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap } static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, - int total_vfs, int *nvfs, struct mlx4_priv *priv) + int total_vfs, int *nvfs, struct mlx4_priv *priv, + int reset_flow) { struct mlx4_dev *dev; unsigned sum = 0; @@ -2679,8 +2742,10 @@ slave_start: goto err_fw; if (!(dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) { - u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs, - existing_vfs); + u64 dev_flags = mlx4_enable_sriov(dev, pdev, + total_vfs, + existing_vfs, + reset_flow); mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL); dev->flags = dev_flags; @@ -2722,7 +2787,7 @@ slave_start: if (dev->flags & MLX4_FLAG_SRIOV) { if (!existing_vfs) pci_disable_sriov(pdev); - if (mlx4_is_master(dev)) + if (mlx4_is_master(dev) && !reset_flow) atomic_dec(&pf_loading); dev->flags &= ~MLX4_FLAG_SRIOV; } @@ -2736,7 +2801,8 @@ slave_start: } if (mlx4_is_master(dev) && (dev_cap->flags2 & MLX4_DEV_CAP_FLAG2_SYS_EQS)) { - u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs, existing_vfs); + u64 dev_flags = mlx4_enable_sriov(dev, pdev, total_vfs, + existing_vfs, reset_flow); if ((dev->flags ^ dev_flags) & (MLX4_FLAG_MASTER | MLX4_FLAG_SLAVE)) { mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_VHCR); @@ -2848,6 +2914,17 @@ slave_start: goto err_steer; mlx4_init_quotas(dev); + /* When PF resources are ready arm its comm channel to enable + * getting commands + */ + if (mlx4_is_master(dev)) { + err = mlx4_ARM_COMM_CHANNEL(dev); + if (err) { + mlx4_err(dev, " 
Failed to arm comm channel eq: %x\n", + err); + goto err_steer; + } + } for (port = 1; port <= dev->caps.num_ports; port++) { err = mlx4_init_port_info(dev, port); @@ -2866,7 +2943,7 @@ slave_start: priv->removed = 0; - if (mlx4_is_master(dev) && dev->persist->num_vfs) + if (mlx4_is_master(dev) && dev->persist->num_vfs && !reset_flow) atomic_dec(&pf_loading); kfree(dev_cap); @@ -2925,10 +3002,12 @@ err_cmd: mlx4_cmd_cleanup(dev, MLX4_CMD_CLEANUP_ALL); err_sriov: - if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) + if (dev->flags & MLX4_FLAG_SRIOV && !existing_vfs) { pci_disable_sriov(pdev); + dev->flags &= ~MLX4_FLAG_SRIOV; + } - if (mlx4_is_master(dev) && dev->persist->num_vfs) + if (mlx4_is_master(dev) && dev->persist->num_vfs && !reset_flow) atomic_dec(&pf_loading); kfree(priv->dev.dev_vfs); @@ -3073,7 +3152,7 @@ static int __mlx4_init_one(struct pci_dev *pdev, int pci_dev_data, if (err) goto err_release_regions; - err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 0); if (err) goto err_catas; @@ -3131,9 +3210,11 @@ static void mlx4_clean_dev(struct mlx4_dev *dev) { struct mlx4_dev_persistent *persist = dev->persist; struct mlx4_priv *priv = mlx4_priv(dev); + unsigned long flags = (dev->flags & RESET_PERSIST_MASK_FLAGS); memset(priv, 0, sizeof(*priv)); priv->dev.persist = persist; + priv->dev.flags = flags; } static void mlx4_unload_one(struct pci_dev *pdev) @@ -3143,7 +3224,6 @@ static void mlx4_unload_one(struct pci_dev *pdev) struct mlx4_priv *priv = mlx4_priv(dev); int pci_dev_data; int p, i; - int active_vfs = 0; if (priv->removed) return; @@ -3157,14 +3237,6 @@ static void mlx4_unload_one(struct pci_dev *pdev) pci_dev_data = priv->pci_dev_data; - /* Disabling SR-IOV is not allowed while there are active vf's */ - if (mlx4_is_master(dev)) { - active_vfs = mlx4_how_many_lives_vf(dev); - if (active_vfs) { - pr_warn("Removing PF when there are active VF's !!\n"); - pr_warn("Will not disable SR-IOV.\n"); - } - } mlx4_stop_sense(dev); mlx4_unregister_device(dev); @@ -3208,12 +3280,6 @@ static void mlx4_unload_one(struct pci_dev *pdev) if (dev->flags & MLX4_FLAG_MSI_X) pci_disable_msix(pdev); - if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) { - mlx4_warn(dev, "Disabling SR-IOV\n"); - pci_disable_sriov(pdev); - dev->flags &= ~MLX4_FLAG_SRIOV; - dev->persist->num_vfs = 0; - } if (!mlx4_is_slave(dev)) mlx4_free_ownership(dev); @@ -3235,11 +3301,21 @@ static void mlx4_remove_one(struct pci_dev *pdev) struct mlx4_dev_persistent *persist = pci_get_drvdata(pdev); struct mlx4_dev *dev = persist->dev; struct mlx4_priv *priv = mlx4_priv(dev); + int active_vfs = 0; mutex_lock(&persist->interface_state_mutex); persist->interface_state |= MLX4_INTERFACE_STATE_DELETION; mutex_unlock(&persist->interface_state_mutex); + /* Disabling SR-IOV is not allowed while there are active vf's */ + if (mlx4_is_master(dev) && dev->flags & MLX4_FLAG_SRIOV) { + active_vfs = mlx4_how_many_lives_vf(dev); + if (active_vfs) { + pr_warn("Removing PF when there are active VF's !!\n"); + pr_warn("Will not disable SR-IOV.\n"); + } + } + /* device marked to be under deletion running now without the lock * letting other tasks to be terminated */ @@ -3248,6 +3324,11 @@ static void mlx4_remove_one(struct pci_dev *pdev) else mlx4_info(dev, "%s: interface is down\n", __func__); mlx4_catas_end(dev); + if (dev->flags & MLX4_FLAG_SRIOV && !active_vfs) { + mlx4_warn(dev, "Disabling SR-IOV\n"); + pci_disable_sriov(pdev); + } + pci_release_regions(pdev); 
pci_disable_device(pdev); kfree(dev->persist); @@ -3287,7 +3368,7 @@ int mlx4_restart_one(struct pci_dev *pdev) memcpy(nvfs, dev->persist->nvfs, sizeof(dev->persist->nvfs)); mlx4_unload_one(pdev); - err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv); + err = mlx4_load_one(pdev, pci_dev_data, total_vfs, nvfs, priv, 1); if (err) { mlx4_err(dev, "%s: ERROR: mlx4_load_one failed, pci_name=%s, err=%d\n", __func__, pci_name(pdev), err); @@ -3397,7 +3478,7 @@ static pci_ers_result_t mlx4_pci_slot_reset(struct pci_dev *pdev) mutex_lock(&persist->interface_state_mutex); if (!(persist->interface_state & MLX4_INTERFACE_STATE_UP)) { ret = mlx4_load_one(pdev, priv->pci_dev_data, total_vfs, nvfs, - priv); + priv, 1); if (ret) { mlx4_err(dev, "%s: mlx4_load_one failed, ret=%d\n", __func__, ret); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 5c772ea4473b..2a15b8248e77 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -85,7 +85,9 @@ enum { MLX4_CLR_INT_SIZE = 0x00008, MLX4_SLAVE_COMM_BASE = 0x0, MLX4_COMM_PAGESIZE = 0x1000, - MLX4_CLOCK_SIZE = 0x00008 + MLX4_CLOCK_SIZE = 0x00008, + MLX4_COMM_CHAN_CAPS = 0x8, + MLX4_COMM_CHAN_FLAGS = 0xc }; enum { @@ -120,6 +122,8 @@ enum mlx4_mpt_state { }; #define MLX4_COMM_TIME 10000 +#define MLX4_COMM_OFFLINE_TIME_OUT 30000 + enum { MLX4_COMM_CMD_RESET, MLX4_COMM_CMD_VHCR0, @@ -1162,6 +1166,7 @@ enum { int mlx4_cmd_init(struct mlx4_dev *dev); void mlx4_cmd_cleanup(struct mlx4_dev *dev, int cleanup_mask); int mlx4_multi_func_init(struct mlx4_dev *dev); +int mlx4_ARM_COMM_CHANNEL(struct mlx4_dev *dev); void mlx4_multi_func_cleanup(struct mlx4_dev *dev); void mlx4_cmd_event(struct mlx4_dev *dev, u16 token, u8 status, u64 out_param); int mlx4_cmd_use_events(struct mlx4_dev *dev); diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index e7543844cc7a..c989442ffc6a 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -280,6 +280,7 @@ int mlx4_set_vf_link_state(struct mlx4_dev *dev, int port, int vf, int link_stat int mlx4_config_dev_retrieval(struct mlx4_dev *dev, struct mlx4_config_dev_params *params); void mlx4_cmd_wake_completions(struct mlx4_dev *dev); +void mlx4_report_internal_err_comm_event(struct mlx4_dev *dev); /* * mlx4_get_slave_default_vlan - * return true if VST ( default vlan) @@ -289,5 +290,6 @@ bool mlx4_get_slave_default_vlan(struct mlx4_dev *dev, int port, int slave, u16 *vlan, u8 *qos); #define MLX4_COMM_GET_IF_REV(cmd_chan_ver) (u8)((cmd_chan_ver) >> 8) +#define COMM_CHAN_EVENT_INTERNAL_ERR (1 << 17) #endif /* MLX4_CMD_H */ diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 33f9ca71925c..5ef54e145e4d 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -208,6 +208,10 @@ enum { MLX4_QUERY_FUNC_FLAGS_A0_RES_QP = 1LL << 1 }; +enum { + MLX4_VF_CAP_FLAG_RESET = 1 << 0 +}; + /* bit enums for an 8-bit flags field indicating special use * QPs which require special handling in qp_reserve_range. 
* Currently, this only includes QPs used by the ETH interface, @@ -545,6 +549,7 @@ struct mlx4_caps { u8 alloc_res_qp_mask; u32 dmfs_high_rate_qpn_base; u32 dmfs_high_rate_qpn_range; + u32 vf_caps; }; struct mlx4_buf_list { -- cgit v1.2.3 From c2943f14534bdc4230f4da6dcd4ea03c5d8c8162 Mon Sep 17 00:00:00 2001 From: Harout Hedeshian Date: Tue, 20 Jan 2015 10:06:05 -0700 Subject: net: ipv6: Add sysctl entry to disable MTU updates from RA The kernel forcefully applies MTU values received in router advertisements, provided the new MTU is less than the current one. This behavior is undesirable when user space is managing the MTU. Instead, a sysctl flag 'accept_ra_mtu' is introduced so that user space can control whether or not RA-provided MTU updates should be applied. The default behavior is unchanged; user space must explicitly set this flag to 0 for RA MTUs to be ignored. Signed-off-by: Harout Hedeshian Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 2 +- 5 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 85b022179104..a5e4c813f17f 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1287,6 +1287,13 @@ accept_ra_rtr_pref - BOOLEAN Functional default: enabled if accept_ra is enabled. disabled if accept_ra is disabled. +accept_ra_mtu - BOOLEAN + Apply the MTU value specified in RA option 5 (RFC4861). If + disabled, the MTU specified in the RA will be ignored. + + Functional default: enabled if accept_ra is enabled. + disabled if accept_ra is disabled. + accept_redirects - BOOLEAN Accept Redirects.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index c694e7baa621..2805062c013f 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -52,6 +52,7 @@ struct ipv6_devconf { __s32 force_tllao; __s32 ndisc_notify; __s32 suppress_frag_ndisc; + __s32 accept_ra_mtu; void *sysctl; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 73cb02dc3065..437a6a4b125a 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -169,6 +169,7 @@ enum { DEVCONF_SUPPRESS_FRAG_NDISC, DEVCONF_ACCEPT_RA_FROM_LOCAL, DEVCONF_USE_OPTIMISTIC, + DEVCONF_ACCEPT_RA_MTU, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d6b4f5d08014..7dcc065e2160 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -201,6 +201,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .disable_ipv6 = 0, .accept_dad = 1, .suppress_frag_ndisc = 1, + .accept_ra_mtu = 1, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -238,6 +239,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .disable_ipv6 = 0, .accept_dad = 1, .suppress_frag_ndisc = 1, + .accept_ra_mtu = 1, }; /* Check if a valid qdisc is available */ @@ -4380,6 +4382,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify; array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; + array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; } static inline size_t inet6_ifla6_size(void) @@ -5258,6 +5261,13 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "accept_ra_mtu", + .data = &ipv6_devconf.accept_ra_mtu, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { /* sentinel */ } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 682866777d53..8a9d7c19e247 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1348,7 +1348,7 @@ skip_routeinfo: } } - if (ndopts.nd_opts_mtu) { + if (ndopts.nd_opts_mtu && in6_dev->cnf.accept_ra_mtu) { __be32 n; u32 mtu; -- cgit v1.2.3 From 607954b084d4ad5e6a2e0f795de7803d9c6ae37f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 21 Jan 2015 11:12:13 +0000 Subject: rhashtable: fix rht_for_each_entry_safe() endless loop "next" is not updated, causing an endless loop for buckets with more than one element. Fixes: 88d6ed15acff ("rhashtable: Convert bucket iterators to take table and index") Signed-off-by: Patrick McHardy Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index a2562ed53ea3..e0337844358e 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -260,7 +260,9 @@ void rhashtable_destroy(struct rhashtable *ht); next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ - pos = next) + pos = next, \ + next = !rht_is_a_nulls(pos) ? \ + rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** * rht_for_each_rcu_continue - continue iterating over rcu hash chain -- cgit v1.2.3 From d5fd120e7860c2b3d4c936a2ebadb6b244bec4c8 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 26 Jan 2015 20:59:31 +0100 Subject: i2c: Only include slave support if selected Make the slave support depend on CONFIG_I2C_SLAVE. 
Otherwise it gets included unconditionally, even when it is not needed. I2C bus drivers which implement slave support must select I2C_SLAVE. Signed-off-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/busses/Kconfig | 1 + drivers/i2c/i2c-core.c | 2 ++ include/linux/i2c.h | 6 ++++++ 3 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 31e8308ba899..ab838d9e28b6 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -881,6 +881,7 @@ config I2C_XLR config I2C_RCAR tristate "Renesas R-Car I2C Controller" depends on ARCH_SHMOBILE || COMPILE_TEST + select I2C_SLAVE help If you say yes to this option, support will be included for the R-Car I2C controller. diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 39d25a8cb1ad..e9eae57a2b50 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -2972,6 +2972,7 @@ trace: } EXPORT_SYMBOL(i2c_smbus_xfer); +#if IS_ENABLED(CONFIG_I2C_SLAVE) int i2c_slave_register(struct i2c_client *client, i2c_slave_cb_t slave_cb) { int ret; @@ -3019,6 +3020,7 @@ int i2c_slave_unregister(struct i2c_client *client) return ret; } EXPORT_SYMBOL_GPL(i2c_slave_unregister); +#endif MODULE_AUTHOR("Simon G. Vogl "); MODULE_DESCRIPTION("I2C-Bus main module"); diff --git a/include/linux/i2c.h b/include/linux/i2c.h index e3a1721c8354..7c7695940ddd 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -228,7 +228,9 @@ struct i2c_client { struct device dev; /* the device structure */ int irq; /* irq issued by device */ struct list_head detected; +#if IS_ENABLED(CONFIG_I2C_SLAVE) i2c_slave_cb_t slave_cb; /* callback for slave mode */ +#endif }; #define to_i2c_client(d) container_of(d, struct i2c_client, dev) @@ -253,6 +255,7 @@ static inline void i2c_set_clientdata(struct i2c_client *dev, void *data) /* I2C slave support */ +#if IS_ENABLED(CONFIG_I2C_SLAVE) enum i2c_slave_event { I2C_SLAVE_REQ_READ_START, I2C_SLAVE_REQ_READ_END, @@ -269,6 +272,7 @@ static inline int i2c_slave_event(struct i2c_client *client, { return client->slave_cb(client, event, val); } +#endif /** * struct i2c_board_info - template for device creation @@ -404,8 +408,10 @@ struct i2c_algorithm { /* To determine what the adapter supports */ u32 (*functionality) (struct i2c_adapter *); +#if IS_ENABLED(CONFIG_I2C_SLAVE) int (*reg_slave)(struct i2c_client *client); int (*unreg_slave)(struct i2c_client *client); +#endif }; /** -- cgit v1.2.3 From 9879de7373fcfb466ec198293b6ccc1ad7a42dd8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Jan 2015 12:58:32 -0800 Subject: mm: page_alloc: embed OOM killing naturally into allocation slowpath The OOM killing invocation does a lot of duplicative checks against the task's allocation context. Rework it to take advantage of the existing checks in the allocator slowpath. The OOM killer is invoked when the allocator is unable to reclaim any pages but the allocation has to keep looping. Instead of having a check for __GFP_NORETRY hidden in oom_gfp_allowed(), just move the OOM invocation to the true branch of should_alloc_retry(). The __GFP_FS check from oom_gfp_allowed() can then be moved into the OOM avoidance branch in __alloc_pages_may_oom(), along with the PF_DUMPCORE test. __alloc_pages_may_oom() can then signal to the caller whether the OOM killer was invoked, instead of requiring it to duplicate the order and high_zoneidx checks to guess this when deciding whether to continue. 
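To make the resulting control flow concrete, here is a minimal stand-alone C sketch of the reworked retry loop; all function and variable names are hypothetical stand-ins rather than the real mm/page_alloc.c symbols, and the stubs only model behavior, so this illustrates the shape of the change, not the actual kernel code.

/*
 * Toy model of the reworked allocator slowpath (hypothetical names).
 * The OOM killer is invoked from the retry branch once reclaim stops
 * making progress, instead of from a separate no-progress path.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct page;                            /* opaque; never dereferenced here */

static struct page *direct_reclaim(unsigned long *progress)
{
	*progress = 0;                  /* pretend reclaim freed nothing */
	return NULL;
}

static struct page *may_oom_kill(unsigned long *progress)
{
	*progress = 0;                  /* pretend the OOM killer declined */
	return NULL;
}

static bool should_retry(unsigned long pages_reclaimed)
{
	return pages_reclaimed < 8;     /* arbitrary bound for the sketch */
}

static struct page *alloc_slowpath(void)
{
	unsigned long progress, pages_reclaimed = 0;
	struct page *page;

retry:
	page = direct_reclaim(&progress);
	if (page)
		return page;

	pages_reclaimed += progress;
	if (should_retry(pages_reclaimed)) {
		if (!progress) {
			/*
			 * Reclaim freed nothing but the allocation must
			 * keep looping: start OOM killing tasks here.
			 */
			page = may_oom_kill(&progress);
			if (page)
				return page;
			if (!progress)
				return NULL;    /* nopage: nothing helped */
		}
		goto retry;             /* wait for writeback, then retry */
	}
	return NULL;                    /* nopage: retry budget exhausted */
}

int main(void)
{
	printf("allocated: %p\n", (void *)alloc_slowpath());
	return 0;
}

The point of the restructuring is visible even in this toy: the OOM decision is reached only from inside the single retry branch, so the caller no longer needs to duplicate checks to guess whether the OOM killer ran.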
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 5 ---- mm/page_alloc.c | 82 +++++++++++++++++++++++------------------------------ 2 files changed, 35 insertions(+), 52 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oom.h b/include/linux/oom.h index 853698c721f7..76200984d1e2 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -85,11 +85,6 @@ static inline void oom_killer_enable(void) oom_killer_disabled = false; } -static inline bool oom_gfp_allowed(gfp_t gfp_mask) -{ - return (gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY); -} - extern struct task_struct *find_lock_task_mm(struct task_struct *p); static inline bool task_will_free_mem(struct task_struct *task) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7633c503a116..8e20f9c2fa5a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2332,12 +2332,21 @@ static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int classzone_idx, int migratetype) + int classzone_idx, int migratetype, unsigned long *did_some_progress) { struct page *page; - /* Acquire the per-zone oom lock for each zone */ + *did_some_progress = 0; + + if (oom_killer_disabled) + return NULL; + + /* + * Acquire the per-zone oom lock for each zone. If that + * fails, somebody else is making progress for us. + */ if (!oom_zonelist_trylock(zonelist, gfp_mask)) { + *did_some_progress = 1; schedule_timeout_uninterruptible(1); return NULL; } @@ -2363,12 +2372,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; if (!(gfp_mask & __GFP_NOFAIL)) { + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; /* The OOM killer will not help higher order allocs */ if (order > PAGE_ALLOC_COSTLY_ORDER) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ if (high_zoneidx < ZONE_NORMAL) goto out; + /* The OOM killer does not compensate for light reclaim */ + if (!(gfp_mask & __GFP_FS)) + goto out; /* * GFP_THISNODE contains __GFP_NORETRY and we never hit this. * Sanity check for bare calls of __GFP_THISNODE, not real OOM. @@ -2381,7 +2396,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, } /* Exhausted what can be done so it's blamo time */ out_of_memory(zonelist, gfp_mask, order, nodemask, false); - + *did_some_progress = 1; out: oom_zonelist_unlock(zonelist, gfp_mask); return page; @@ -2658,7 +2673,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; -restart: +retry: if (!(gfp_mask & __GFP_NO_KSWAPD)) wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone, nodemask); @@ -2681,7 +2696,6 @@ restart: classzone_idx = zonelist_zone_idx(preferred_zoneref); } -rebalance: /* This is the last chance, in general, before the goto nopage. 
*/ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2788,54 +2802,28 @@ rebalance: if (page) goto got_pg; - /* - * If we failed to make any progress reclaiming, then we are - * running out of options and have to consider going OOM - */ - if (!did_some_progress) { - if (oom_gfp_allowed(gfp_mask)) { - if (oom_killer_disabled) - goto nopage; - /* Coredumps can quickly deplete all memory reserves */ - if ((current->flags & PF_DUMPCORE) && - !(gfp_mask & __GFP_NOFAIL)) - goto nopage; - page = __alloc_pages_may_oom(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, preferred_zone, - classzone_idx, migratetype); - if (page) - goto got_pg; - - if (!(gfp_mask & __GFP_NOFAIL)) { - /* - * The oom killer is not called for high-order - * allocations that may fail, so if no progress - * is being made, there are no other options and - * retrying is unlikely to help. - */ - if (order > PAGE_ALLOC_COSTLY_ORDER) - goto nopage; - /* - * The oom killer is not called for lowmem - * allocations to prevent needlessly killing - * innocent tasks. - */ - if (high_zoneidx < ZONE_NORMAL) - goto nopage; - } - - goto restart; - } - } - /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, did_some_progress, pages_reclaimed)) { + /* + * If we fail to make progress by freeing individual + * pages, but the allocation wants us to keep going, + * start OOM killing tasks. + */ + if (!did_some_progress) { + page = __alloc_pages_may_oom(gfp_mask, order, zonelist, + high_zoneidx, nodemask, + preferred_zone, classzone_idx, + migratetype,&did_some_progress); + if (page) + goto got_pg; + if (!did_some_progress) + goto nopage; + } /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); - goto rebalance; + goto retry; } else { /* * High-order allocations do not necessarily loop after -- cgit v1.2.3 From 07261edb971492c6b41b44d7b1b51f76807d30ad Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Mon, 26 Jan 2015 12:58:43 -0800 Subject: printk: add dummy routine for when CONFIG_PRINTK=n There are missing dummy routines for log_buf_addr_get() and log_buf_len_get() for when CONFIG_PRINTK is not set causing build failures. This patch adds these dummy routines at the appropriate location. 
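The stub pattern itself is worth seeing in isolation. Below is a minimal, compilable sketch; CONFIG_FOO and the foo_*() names are hypothetical stand-ins rather than the real printk symbols:

/*
 * Config-gated API with static inline fallbacks (CONFIG_FOO and the
 * foo_*() names are hypothetical). With the option off, callers still
 * compile and link, and simply observe an empty buffer.
 */
#include <stddef.h>

#ifdef CONFIG_FOO
/* Real prototypes; the implementations live elsewhere when built in. */
char *foo_buf_addr_get(void);
unsigned int foo_buf_len_get(void);
#else
/* Dummy routines for the CONFIG_FOO=n case. */
static inline char *foo_buf_addr_get(void)
{
	return NULL;
}

static inline unsigned int foo_buf_len_get(void)
{
	return 0;
}
#endif

Callers must then treat the NULL/0 fallbacks as "no buffer available" rather than as an error.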
Signed-off-by: Pranith Kumar Cc: Michael Ellerman Reviewed-by: Petr Mladek Acked-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/printk.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/printk.h b/include/linux/printk.h index c8f170324e64..4d5bf5726578 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -10,9 +10,6 @@ extern const char linux_banner[]; extern const char linux_proc_banner[]; -extern char *log_buf_addr_get(void); -extern u32 log_buf_len_get(void); - static inline int printk_get_level(const char *buffer) { if (buffer[0] == KERN_SOH_ASCII && buffer[1]) { @@ -163,6 +160,8 @@ extern int kptr_restrict; extern void wake_up_klogd(void); +char *log_buf_addr_get(void); +u32 log_buf_len_get(void); void log_buf_kexec_setup(void); void __init setup_log_buf(int early); void dump_stack_set_arch_desc(const char *fmt, ...); @@ -198,6 +197,16 @@ static inline void wake_up_klogd(void) { } +static inline char *log_buf_addr_get(void) +{ + return NULL; +} + +static inline u32 log_buf_len_get(void) +{ + return 0; +} + static inline void log_buf_kexec_setup(void) { } -- cgit v1.2.3 From 0b35fa7daefe9da6fdef91d95e07eebb714a8fcc Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Sun, 25 Jan 2015 23:33:30 +0100 Subject: NFC: st21nfcb: Remove useless include include/linux/platform_data/st21nfcb.h is PHY generic. There is no need to include linux/i2c.h. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- include/linux/platform_data/st21nfcb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/st21nfcb.h b/include/linux/platform_data/st21nfcb.h index c3b432f5b63e..05c54c958cc3 100644 --- a/include/linux/platform_data/st21nfcb.h +++ b/include/linux/platform_data/st21nfcb.h @@ -19,8 +19,6 @@ #ifndef _ST21NFCB_NCI_H_ #define _ST21NFCB_NCI_H_ -#include - #define ST21NFCB_NCI_DRIVER_NAME "st21nfcb_nci" struct st21nfcb_nfc_platform_data { -- cgit v1.2.3 From 6da7c85c75eed769423b428eb654eaaf89d273c1 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Sun, 25 Jan 2015 23:33:31 +0100 Subject: NFC: st21nfcb: Fix copy/paste error in comment include/linux/platform_data/st21nfcb.h is based on include/linux/platform_data/st21nfca.h. The endif comment is inaccurate for st21nfcb. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- include/linux/platform_data/st21nfcb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/st21nfcb.h b/include/linux/platform_data/st21nfcb.h index 05c54c958cc3..b023373d9874 100644 --- a/include/linux/platform_data/st21nfcb.h +++ b/include/linux/platform_data/st21nfcb.h @@ -26,4 +26,4 @@ struct st21nfcb_nfc_platform_data { unsigned int irq_polarity; }; -#endif /* _ST21NFCA_HCI_H_ */ +#endif /* _ST21NFCB_NCI_H_ */ -- cgit v1.2.3 From aae88261abd58fffef7ee0e00160ce4ea105b0f3 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 26 Jan 2015 22:05:38 -0800 Subject: net: phy: document has_fixups field has_fixups was introduced to help keep track of fixups/quirks running on a PHY device, but we did not update the comment above struct phy_device accordingly. Fixes: b0ae009f3dc14 ("net: phy: add "has_fixups" boolean property") Signed-off-by: Florian Fainelli Signed-off-by: David S.
Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9c189a1fa3a2..1b3690b597d5 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -327,6 +327,7 @@ struct phy_c45_device_ids { * c45_ids: 802.3-c45 Device Identifers if is_c45. * is_c45: Set to true if this phy uses clause 45 addressing. * is_internal: Set to true if this phy is internal to a MAC. + * has_fixups: Set to true if this phy has fixups/quirks. * state: state of the PHY for management purposes * dev_flags: Device-specific flags used by the PHY driver. * addr: Bus address of PHY -- cgit v1.2.3 From 8a477a6fb6a33651adda772360b85fd813569743 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 26 Jan 2015 22:05:39 -0800 Subject: net: phy: keep track of the PHY suspend state In order to avoid double calls to phydev->drv->suspend and resume, keep track of whether the PHY has already been suspended as a consequence of a successful call to phy_suspend(). We will use this in our MDIO bus suspend/resume hooks to avoid a double suspend call. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 22 ++++++++++++++++++---- include/linux/phy.h | 2 ++ 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 3fc91e89f5a5..bdfe51fc3a65 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -699,6 +699,7 @@ int phy_suspend(struct phy_device *phydev) { struct phy_driver *phydrv = to_phy_driver(phydev->dev.driver); struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL }; + int ret = 0; /* If the device has WOL enabled, we cannot suspend the PHY */ phy_ethtool_get_wol(phydev, &wol); @@ -706,18 +707,31 @@ int phy_suspend(struct phy_device *phydev) return -EBUSY; if (phydrv->suspend) - return phydrv->suspend(phydev); - return 0; + ret = phydrv->suspend(phydev); + + if (ret) + return ret; + + phydev->suspended = true; + + return ret; } EXPORT_SYMBOL(phy_suspend); int phy_resume(struct phy_device *phydev) { struct phy_driver *phydrv = to_phy_driver(phydev->dev.driver); + int ret = 0; if (phydrv->resume) - return phydrv->resume(phydev); - return 0; + ret = phydrv->resume(phydev); + + if (ret) + return ret; + + phydev->suspended = false; + + return ret; } EXPORT_SYMBOL(phy_resume); diff --git a/include/linux/phy.h b/include/linux/phy.h index 1b3690b597d5..685809835b5c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -328,6 +328,7 @@ struct phy_c45_device_ids { * is_c45: Set to true if this phy uses clause 45 addressing. * is_internal: Set to true if this phy is internal to a MAC. * has_fixups: Set to true if this phy has fixups/quirks. + * suspended: Set to true if this phy has been suspended successfully. * state: state of the PHY for management purposes * dev_flags: Device-specific flags used by the PHY driver. * addr: Bus address of PHY @@ -365,6 +366,7 @@ struct phy_device { bool is_c45; bool is_internal; bool has_fixups; + bool suspended; enum phy_state state; -- cgit v1.2.3 From e9fb8b7e4f6af17e3aa4835281e06fdc920341f9 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 6 Jan 2015 16:48:36 +0000 Subject: compat: Declare compat_sys_sigpending and compat_sys_sigprocmask prototypes __ARCH_WANT_SYS_SIGPENDING or __ARCH_WANT_SYS_SIGPROCMASK may be defined for compat support but the corresponding prototypes are missing from linux/compat.h.
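The opt-in guard convention can be sketched stand-alone (hypothetical names, not the actual linux/compat.h content): an architecture defines the __ARCH_WANT_* macro, typically via its asm/unistd.h, before the header is parsed, and only then does the prototype become visible.

/*
 * Hypothetical header sketch of an __ARCH_WANT_*-guarded prototype;
 * architectures that do not define the macro never see the declaration.
 */
#ifndef _EXAMPLE_COMPAT_H
#define _EXAMPLE_COMPAT_H

typedef unsigned int example_old_sigset_t;      /* stand-in compat type */

#ifdef __ARCH_WANT_SYS_EXAMPLE
long compat_sys_example(int how, example_old_sigset_t *nset,
			example_old_sigset_t *oset);
#endif

#endif /* _EXAMPLE_COMPAT_H */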
Signed-off-by: Catalin Marinas Acked-by: Andrew Morton Cc: Arnd Bergmann --- include/linux/compat.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 7450ca2ac1fc..ab25814690bc 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -689,6 +689,15 @@ asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd, asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, compat_stack_t __user *uoss_ptr); +#ifdef __ARCH_WANT_SYS_SIGPENDING +asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set); +#endif + +#ifdef __ARCH_WANT_SYS_SIGPROCMASK +asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *nset, + compat_old_sigset_t __user *oset); +#endif + int compat_restore_altstack(const compat_stack_t __user *uss); int __compat_save_altstack(compat_stack_t __user *, unsigned long); #define compat_save_altstack_ex(uss, sp) do { \ -- cgit v1.2.3 From 54e45c169dbce43cf46d00eb1521b655b6e4f9e9 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 6 Jan 2015 16:52:38 +0000 Subject: syscalls: Declare sys_*stat64 prototypes if __ARCH_WANT_(COMPAT_)STAT64 Currently, the sys_stat64, sys_fstat64 and sys_lstat64 prototypes are only declared if BITS_PER_LONG == 32. Following commit 0753f70f07fb (fs: Build sys_stat64() and friends if __ARCH_WANT_COMPAT_STAT64), the implementation of these functions is allowed on 64-bit systems for compat support. The patch changes the condition on the prototype declaration from BITS_PER_LONG == 32 to defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64). In addition, it moves the sys_fstatat64 prototype under the same #if block. Signed-off-by: Catalin Marinas Acked-by: Andrew Morton Cc: Arnd Bergmann --- include/linux/syscalls.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 85893d744901..76d1e38aabe1 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -410,12 +410,16 @@ asmlinkage long sys_newlstat(const char __user *filename, struct stat __user *statbuf); asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf); asmlinkage long sys_ustat(unsigned dev, struct ustat __user *ubuf); -#if BITS_PER_LONG == 32 +#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64) asmlinkage long sys_stat64(const char __user *filename, struct stat64 __user *statbuf); asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf); asmlinkage long sys_lstat64(const char __user *filename, struct stat64 __user *statbuf); +asmlinkage long sys_fstatat64(int dfd, const char __user *filename, + struct stat64 __user *statbuf, int flag); +#endif +#if BITS_PER_LONG == 32 asmlinkage long sys_truncate64(const char __user *path, loff_t length); asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length); #endif @@ -771,8 +775,6 @@ asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, umode_t mode); asmlinkage long sys_newfstatat(int dfd, const char __user *filename, struct stat __user *statbuf, int flag); -asmlinkage long sys_fstatat64(int dfd, const char __user *filename, - struct stat64 __user *statbuf, int flag); asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf, int bufsiz); asmlinkage long sys_utimensat(int dfd, const char __user *filename, -- cgit v1.2.3 From cfcf1682c4ca8f601a4702255958e0b1c9aa12cc Mon
Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 24 Jan 2015 19:52:05 +0200 Subject: cfg80211: Add new GCMP, CCMP-256, BIP-GMAC, BIP-CMAC-256 ciphers This makes cfg80211 aware of the GCMP, GCMP-256, CCMP-256, BIP-GMAC-128, BIP-GMAC-256, and BIP-CMAC-256 cipher suites. These new cipher suites were defined in IEEE Std 802.11ac-2013. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 18 ++++++++++++++++++ net/wireless/util.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 4f4eea8a6288..dbf417bf25bf 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1994,9 +1994,15 @@ enum ieee80211_key_len { WLAN_KEY_LEN_WEP40 = 5, WLAN_KEY_LEN_WEP104 = 13, WLAN_KEY_LEN_CCMP = 16, + WLAN_KEY_LEN_CCMP_256 = 32, WLAN_KEY_LEN_TKIP = 32, WLAN_KEY_LEN_AES_CMAC = 16, WLAN_KEY_LEN_SMS4 = 32, + WLAN_KEY_LEN_GCMP = 16, + WLAN_KEY_LEN_GCMP_256 = 32, + WLAN_KEY_LEN_BIP_CMAC_256 = 32, + WLAN_KEY_LEN_BIP_GMAC_128 = 16, + WLAN_KEY_LEN_BIP_GMAC_256 = 32, }; #define IEEE80211_WEP_IV_LEN 4 @@ -2004,9 +2010,16 @@ enum ieee80211_key_len { #define IEEE80211_CCMP_HDR_LEN 8 #define IEEE80211_CCMP_MIC_LEN 8 #define IEEE80211_CCMP_PN_LEN 6 +#define IEEE80211_CCMP_256_HDR_LEN 8 +#define IEEE80211_CCMP_256_MIC_LEN 16 +#define IEEE80211_CCMP_256_PN_LEN 6 #define IEEE80211_TKIP_IV_LEN 8 #define IEEE80211_TKIP_ICV_LEN 4 #define IEEE80211_CMAC_PN_LEN 6 +#define IEEE80211_GMAC_PN_LEN 6 +#define IEEE80211_GCMP_HDR_LEN 8 +#define IEEE80211_GCMP_MIC_LEN 16 +#define IEEE80211_GCMP_PN_LEN 6 /* Public action codes */ enum ieee80211_pub_actioncode { @@ -2230,6 +2243,11 @@ enum ieee80211_sa_query_action { #define WLAN_CIPHER_SUITE_WEP104 0x000FAC05 #define WLAN_CIPHER_SUITE_AES_CMAC 0x000FAC06 #define WLAN_CIPHER_SUITE_GCMP 0x000FAC08 +#define WLAN_CIPHER_SUITE_GCMP_256 0x000FAC09 +#define WLAN_CIPHER_SUITE_CCMP_256 0x000FAC0A +#define WLAN_CIPHER_SUITE_BIP_GMAC_128 0x000FAC0B +#define WLAN_CIPHER_SUITE_BIP_GMAC_256 0x000FAC0C +#define WLAN_CIPHER_SUITE_BIP_CMAC_256 0x000FAC0D #define WLAN_CIPHER_SUITE_SMS4 0x00147201 diff --git a/net/wireless/util.c b/net/wireless/util.c index 08f136ad2ea5..919fee807dd9 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -230,6 +230,9 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, switch (params->cipher) { case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: /* Disallow pairwise keys with non-zero index unless it's WEP * or a vendor specific cipher (because current deployments use * pairwise WEP keys with non-zero indices and for vendor @@ -240,6 +243,9 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, return -EINVAL; break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + case WLAN_CIPHER_SUITE_BIP_GMAC_128: + case WLAN_CIPHER_SUITE_BIP_GMAC_256: /* Disallow BIP (group-only) cipher as pairwise cipher */ if (pairwise) return -EINVAL; @@ -261,6 +267,18 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, if (params->key_len != WLAN_KEY_LEN_CCMP) return -EINVAL; break; + case WLAN_CIPHER_SUITE_CCMP_256: + if (params->key_len != WLAN_KEY_LEN_CCMP_256) + return -EINVAL; + break; + case WLAN_CIPHER_SUITE_GCMP: + if (params->key_len != WLAN_KEY_LEN_GCMP) + return -EINVAL; + break; + case 
WLAN_CIPHER_SUITE_GCMP_256: + if (params->key_len != WLAN_KEY_LEN_GCMP_256) + return -EINVAL; + break; case WLAN_CIPHER_SUITE_WEP104: if (params->key_len != WLAN_KEY_LEN_WEP104) return -EINVAL; @@ -269,6 +287,18 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, if (params->key_len != WLAN_KEY_LEN_AES_CMAC) return -EINVAL; break; + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + if (params->key_len != WLAN_KEY_LEN_BIP_CMAC_256) + return -EINVAL; + break; + case WLAN_CIPHER_SUITE_BIP_GMAC_128: + if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_128) + return -EINVAL; + break; + case WLAN_CIPHER_SUITE_BIP_GMAC_256: + if (params->key_len != WLAN_KEY_LEN_BIP_GMAC_256) + return -EINVAL; + break; default: /* * We don't know anything about this algorithm, @@ -288,7 +318,13 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, return -EINVAL; case WLAN_CIPHER_SUITE_TKIP: case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + case WLAN_CIPHER_SUITE_BIP_GMAC_128: + case WLAN_CIPHER_SUITE_BIP_GMAC_256: if (params->seq_len != 6) return -EINVAL; break; -- cgit v1.2.3 From 56c52da2d554f081e8fce58ecbcf6a40c605b95b Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 24 Jan 2015 19:52:08 +0200 Subject: mac80211: Add BIP-CMAC-256 cipher This allows mac80211 to configure BIP-CMAC-256 to the driver and also use a software implementation within mac80211 when the driver does not support this with hardware acceleration. Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 9 +++++ net/mac80211/aes_cmac.c | 34 +++++++++++++---- net/mac80211/aes_cmac.h | 5 ++- net/mac80211/cfg.c | 2 + net/mac80211/debugfs_key.c | 4 ++ net/mac80211/key.c | 14 ++++++- net/mac80211/main.c | 13 ++++--- net/mac80211/rx.c | 21 ++++++++--- net/mac80211/tx.c | 3 ++ net/mac80211/wpa.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++ net/mac80211/wpa.h | 4 ++ 11 files changed, 180 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index dbf417bf25bf..b9c7897dc566 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1017,6 +1017,15 @@ struct ieee80211_mmie { u8 mic[8]; } __packed; +/* Management MIC information element (IEEE 802.11w) for GMAC and CMAC-256 */ +struct ieee80211_mmie_16 { + u8 element_id; + u8 length; + __le16 key_id; + u8 sequence_number[6]; + u8 mic[16]; +} __packed; + struct ieee80211_vendor_ie { u8 element_id; u8 len; diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c index 9b9009f99551..4192806be3d3 100644 --- a/net/mac80211/aes_cmac.c +++ b/net/mac80211/aes_cmac.c @@ -18,8 +18,8 @@ #include "key.h" #include "aes_cmac.h" -#define AES_CMAC_KEY_LEN 16 #define CMAC_TLEN 8 /* CMAC TLen = 64 bits (8 octets) */ +#define CMAC_TLEN_256 16 /* CMAC TLen = 128 bits (16 octets) */ #define AAD_LEN 20 @@ -35,9 +35,9 @@ static void gf_mulx(u8 *pad) pad[AES_BLOCK_SIZE - 1] ^= 0x87; } - -static void aes_128_cmac_vector(struct crypto_cipher *tfm, size_t num_elem, - const u8 *addr[], const size_t *len, u8 *mac) +static void aes_cmac_vector(struct crypto_cipher *tfm, size_t num_elem, + const u8 *addr[], const size_t *len, u8 *mac, + size_t mac_len) { u8 cbc[AES_BLOCK_SIZE], pad[AES_BLOCK_SIZE]; const u8 *pos, *end; @@ -88,7 +88,7 @@ static void aes_128_cmac_vector(struct crypto_cipher *tfm, size_t num_elem,
for (i = 0; i < AES_BLOCK_SIZE; i++) pad[i] ^= cbc[i]; crypto_cipher_encrypt_one(tfm, pad, pad); - memcpy(mac, pad, CMAC_TLEN); + memcpy(mac, pad, mac_len); } @@ -107,17 +107,35 @@ void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad, addr[2] = zero; len[2] = CMAC_TLEN; - aes_128_cmac_vector(tfm, 3, addr, len, mic); + aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN); } +void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad, + const u8 *data, size_t data_len, u8 *mic) +{ + const u8 *addr[3]; + size_t len[3]; + u8 zero[CMAC_TLEN_256]; + + memset(zero, 0, CMAC_TLEN_256); + addr[0] = aad; + len[0] = AAD_LEN; + addr[1] = data; + len[1] = data_len - CMAC_TLEN_256; + addr[2] = zero; + len[2] = CMAC_TLEN_256; + + aes_cmac_vector(tfm, 3, addr, len, mic, CMAC_TLEN_256); +} -struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[]) +struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[], + size_t key_len) { struct crypto_cipher *tfm; tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); if (!IS_ERR(tfm)) - crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN); + crypto_cipher_setkey(tfm, key, key_len); return tfm; } diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h index 0ce6487af795..3702041f44fd 100644 --- a/net/mac80211/aes_cmac.h +++ b/net/mac80211/aes_cmac.h @@ -11,9 +11,12 @@ #include -struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[]); +struct crypto_cipher *ieee80211_aes_cmac_key_setup(const u8 key[], + size_t key_len); void ieee80211_aes_cmac(struct crypto_cipher *tfm, const u8 *aad, const u8 *data, size_t data_len, u8 *mic); +void ieee80211_aes_cmac_256(struct crypto_cipher *tfm, const u8 *aad, + const u8 *data, size_t data_len, u8 *mic); void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm); #endif /* AES_CMAC_H */ diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index ef84441c119c..b7e528bbecce 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -164,6 +164,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: break; @@ -362,6 +363,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, params.seq_len = 6; break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: pn64 = atomic64_read(&key->u.aes_cmac.tx_pn); seq[0] = pn64; seq[1] = pn64 >> 8; diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 64de07b16092..d1b60eb014a8 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -101,6 +101,7 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, (u8)(pn >> 16), (u8)(pn >> 8), (u8)pn); break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: pn = atomic64_read(&key->u.aes_cmac.tx_pn); len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", (u8)(pn >> 40), (u8)(pn >> 32), (u8)(pn >> 24), @@ -153,6 +154,7 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, len = p - buf; break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: rpn = key->u.aes_cmac.rx_pn; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", @@ -191,6 +193,7 @@ static ssize_t key_replays_read(struct file *file, char __user *userbuf, len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays); break; case 
WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.replays); break; @@ -214,6 +217,7 @@ static ssize_t key_icverrors_read(struct file *file, char __user *userbuf, switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.icverrors); break; diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 83c61085c3f0..7ceea9d9fcd2 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -165,6 +165,7 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) case WLAN_CIPHER_SUITE_CCMP: case WLAN_CIPHER_SUITE_CCMP_256: case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: /* all of these we can do in software - if driver can */ @@ -417,8 +418,12 @@ ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, } break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: key->conf.iv_len = 0; - key->conf.icv_len = sizeof(struct ieee80211_mmie); + if (cipher == WLAN_CIPHER_SUITE_AES_CMAC) + key->conf.icv_len = sizeof(struct ieee80211_mmie); + else + key->conf.icv_len = sizeof(struct ieee80211_mmie_16); if (seq) for (j = 0; j < IEEE80211_CMAC_PN_LEN; j++) key->u.aes_cmac.rx_pn[j] = @@ -428,7 +433,7 @@ ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, * it does not need to be initialized for every packet. */ key->u.aes_cmac.tfm = - ieee80211_aes_cmac_key_setup(key_data); + ieee80211_aes_cmac_key_setup(key_data, key_len); if (IS_ERR(key->u.aes_cmac.tfm)) { err = PTR_ERR(key->u.aes_cmac.tfm); kfree(key); @@ -481,6 +486,7 @@ static void ieee80211_key_free_common(struct ieee80211_key *key) ieee80211_aes_key_free(key->u.ccmp.tfm); break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm); break; case WLAN_CIPHER_SUITE_GCMP: @@ -804,6 +810,7 @@ void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf, seq->ccmp.pn[0] = pn64 >> 40; break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: pn64 = atomic64_read(&key->u.aes_cmac.tx_pn); seq->ccmp.pn[5] = pn64; seq->ccmp.pn[4] = pn64 >> 8; @@ -854,6 +861,7 @@ void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, memcpy(seq->ccmp.pn, pn, IEEE80211_CCMP_PN_LEN); break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (WARN_ON(tid != 0)) return; pn = key->u.aes_cmac.rx_pn; @@ -897,6 +905,7 @@ void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf, atomic64_set(&key->u.ccmp.tx_pn, pn64); break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: pn64 = (u64)seq->aes_cmac.pn[5] | ((u64)seq->aes_cmac.pn[4] << 8) | ((u64)seq->aes_cmac.pn[3] << 16) | @@ -948,6 +957,7 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, memcpy(pn, seq->ccmp.pn, IEEE80211_CCMP_PN_LEN); break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (WARN_ON(tid != 0)) return; pn = key->u.aes_cmac.rx_pn; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index a5ad2d5bb29b..053a17c5023a 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -671,7 +671,8 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) WLAN_CIPHER_SUITE_GCMP_256, /* keep last -- depends on hw flags! 
*/ - WLAN_CIPHER_SUITE_AES_CMAC + WLAN_CIPHER_SUITE_AES_CMAC, + WLAN_CIPHER_SUITE_BIP_CMAC_256, }; if (local->hw.flags & IEEE80211_HW_SW_CRYPTO_CONTROL || @@ -710,7 +711,7 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites); if (!have_mfp) - local->hw.wiphy->n_cipher_suites--; + local->hw.wiphy->n_cipher_suites -= 2; if (!have_wep) { local->hw.wiphy->cipher_suites += 2; @@ -736,9 +737,9 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) if (have_wep) n_suites += 2; - /* check if we have AES_CMAC */ + /* check if we have AES_CMAC, BIP-CMAC-256 */ if (have_mfp) - n_suites++; + n_suites += 2; suites = kmalloc(sizeof(u32) * n_suites, GFP_KERNEL); if (!suites) @@ -755,8 +756,10 @@ static int ieee80211_init_cipher_suites(struct ieee80211_local *local) suites[w++] = WLAN_CIPHER_SUITE_WEP104; } - if (have_mfp) + if (have_mfp) { suites[w++] = WLAN_CIPHER_SUITE_AES_CMAC; + suites[w++] = WLAN_CIPHER_SUITE_BIP_CMAC_256; + } for (r = 0; r < local->hw.n_cipher_schemes; r++) suites[w++] = cs[r].cipher; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index e8c6ba5ce70b..93ebc9525478 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -647,6 +647,7 @@ static int ieee80211_get_mmie_keyidx(struct sk_buff *skb) { struct ieee80211_mgmt *hdr = (struct ieee80211_mgmt *) skb->data; struct ieee80211_mmie *mmie; + struct ieee80211_mmie_16 *mmie16; if (skb->len < 24 + sizeof(*mmie) || !is_multicast_ether_addr(hdr->da)) return -1; @@ -656,11 +657,18 @@ static int ieee80211_get_mmie_keyidx(struct sk_buff *skb) mmie = (struct ieee80211_mmie *) (skb->data + skb->len - sizeof(*mmie)); - if (mmie->element_id != WLAN_EID_MMIE || - mmie->length != sizeof(*mmie) - 2) - return -1; - - return le16_to_cpu(mmie->key_id); + if (mmie->element_id == WLAN_EID_MMIE && + mmie->length == sizeof(*mmie) - 2) + return le16_to_cpu(mmie->key_id); + + mmie16 = (struct ieee80211_mmie_16 *) + (skb->data + skb->len - sizeof(*mmie16)); + if (skb->len >= 24 + sizeof(*mmie16) && + mmie16->element_id == WLAN_EID_MMIE && + mmie16->length == sizeof(*mmie16) - 2) + return le16_to_cpu(mmie16->key_id); + + return -1; } static int iwl80211_get_cs_keyid(const struct ieee80211_cipher_scheme *cs, @@ -1660,6 +1668,9 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) case WLAN_CIPHER_SUITE_AES_CMAC: result = ieee80211_crypto_aes_cmac_decrypt(rx); break; + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + result = ieee80211_crypto_aes_cmac_256_decrypt(rx); + break; case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: result = ieee80211_crypto_gcmp_decrypt(rx); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index be57e0afd019..909c27be1fdc 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -639,6 +639,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) ieee80211_is_mgmt(hdr->frame_control); break; case WLAN_CIPHER_SUITE_AES_CMAC: + case WLAN_CIPHER_SUITE_BIP_CMAC_256: if (!ieee80211_is_mgmt(hdr->frame_control)) tx->key = NULL; break; @@ -1021,6 +1022,8 @@ ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) tx, IEEE80211_CCMP_256_MIC_LEN); case WLAN_CIPHER_SUITE_AES_CMAC: return ieee80211_crypto_aes_cmac_encrypt(tx); + case WLAN_CIPHER_SUITE_BIP_CMAC_256: + return ieee80211_crypto_aes_cmac_256_encrypt(tx); case WLAN_CIPHER_SUITE_GCMP: case WLAN_CIPHER_SUITE_GCMP_256: return ieee80211_crypto_gcmp_encrypt(tx); diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index ae654de9782a..549af118de9f 100644 --- a/net/mac80211/wpa.c 
+++ b/net/mac80211/wpa.c @@ -955,6 +955,48 @@ ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) return TX_CONTINUE; } +ieee80211_tx_result +ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx) +{ + struct sk_buff *skb; + struct ieee80211_tx_info *info; + struct ieee80211_key *key = tx->key; + struct ieee80211_mmie_16 *mmie; + u8 aad[20]; + u64 pn64; + + if (WARN_ON(skb_queue_len(&tx->skbs) != 1)) + return TX_DROP; + + skb = skb_peek(&tx->skbs); + + info = IEEE80211_SKB_CB(skb); + + if (info->control.hw_key) + return TX_CONTINUE; + + if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) + return TX_DROP; + + mmie = (struct ieee80211_mmie_16 *)skb_put(skb, sizeof(*mmie)); + mmie->element_id = WLAN_EID_MMIE; + mmie->length = sizeof(*mmie) - 2; + mmie->key_id = cpu_to_le16(key->conf.keyidx); + + /* PN = PN + 1 */ + pn64 = atomic64_inc_return(&key->u.aes_cmac.tx_pn); + + bip_ipn_set64(mmie->sequence_number, pn64); + + bip_aad(skb, aad); + + /* MIC = AES-256-CMAC(IGTK, AAD || Management Frame Body || MMIE, 128) + */ + ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad, + skb->data + 24, skb->len - 24, mmie->mic); + + return TX_CONTINUE; +} ieee80211_rx_result ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx) @@ -1006,6 +1048,56 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx) return RX_CONTINUE; } +ieee80211_rx_result +ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx) +{ + struct sk_buff *skb = rx->skb; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_key *key = rx->key; + struct ieee80211_mmie_16 *mmie; + u8 aad[20], mic[16], ipn[6]; + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; + + if (!ieee80211_is_mgmt(hdr->frame_control)) + return RX_CONTINUE; + + /* management frames are already linear */ + + if (skb->len < 24 + sizeof(*mmie)) + return RX_DROP_UNUSABLE; + + mmie = (struct ieee80211_mmie_16 *) + (skb->data + skb->len - sizeof(*mmie)); + if (mmie->element_id != WLAN_EID_MMIE || + mmie->length != sizeof(*mmie) - 2) + return RX_DROP_UNUSABLE; /* Invalid MMIE */ + + bip_ipn_swap(ipn, mmie->sequence_number); + + if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) { + key->u.aes_cmac.replays++; + return RX_DROP_UNUSABLE; + } + + if (!(status->flag & RX_FLAG_DECRYPTED)) { + /* hardware didn't decrypt/verify MIC */ + bip_aad(skb, aad); + ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad, + skb->data + 24, skb->len - 24, mic); + if (memcmp(mic, mmie->mic, sizeof(mmie->mic)) != 0) { + key->u.aes_cmac.icverrors++; + return RX_DROP_UNUSABLE; + } + } + + memcpy(key->u.aes_cmac.rx_pn, ipn, 6); + + /* Remove MMIE */ + skb_trim(skb, skb->len - sizeof(*mmie)); + + return RX_CONTINUE; +} + ieee80211_tx_result ieee80211_crypto_hw_encrypt(struct ieee80211_tx_data *tx) { diff --git a/net/mac80211/wpa.h b/net/mac80211/wpa.h index 43e109f27a89..06b7f167a176 100644 --- a/net/mac80211/wpa.h +++ b/net/mac80211/wpa.h @@ -32,8 +32,12 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, ieee80211_tx_result ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx); +ieee80211_tx_result +ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx); ieee80211_rx_result ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx); +ieee80211_rx_result +ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx); ieee80211_tx_result ieee80211_crypto_hw_encrypt(struct ieee80211_tx_data *tx); ieee80211_rx_result -- cgit v1.2.3 From 05c80d75f10ad7d3f95444b65788d6a0bbb4380d Mon 
Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 19 Dec 2014 09:14:20 -0300 Subject: [media] hdmi: add new HDMI 2.0 defines Add new Video InfoFrame colorspace information introduced in HDMI 2.0 and new Audio Coding Extension Types, also from HDMI 2.0. HDMI_CONTENT_TYPE_NONE was renamed to _GRAPHICS since that's what it is called in CEA-861-F. Signed-off-by: Hans Verkuil Reviewed-by: Thierry Reding Acked-by: Thierry Reding Signed-off-by: Mauro Carvalho Chehab --- include/linux/hdmi.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index cbb5790a35cd..5afc0bff2bbe 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -52,12 +52,18 @@ enum hdmi_colorspace { HDMI_COLORSPACE_RGB, HDMI_COLORSPACE_YUV422, HDMI_COLORSPACE_YUV444, + HDMI_COLORSPACE_YUV420, + HDMI_COLORSPACE_RESERVED4, + HDMI_COLORSPACE_RESERVED5, + HDMI_COLORSPACE_RESERVED6, + HDMI_COLORSPACE_IDO_DEFINED, }; enum hdmi_scan_mode { HDMI_SCAN_MODE_NONE, HDMI_SCAN_MODE_OVERSCAN, HDMI_SCAN_MODE_UNDERSCAN, + HDMI_SCAN_MODE_RESERVED, }; enum hdmi_colorimetry { @@ -71,6 +77,7 @@ enum hdmi_picture_aspect { HDMI_PICTURE_ASPECT_NONE, HDMI_PICTURE_ASPECT_4_3, HDMI_PICTURE_ASPECT_16_9, + HDMI_PICTURE_ASPECT_RESERVED, }; enum hdmi_active_aspect { @@ -92,12 +99,18 @@ enum hdmi_extended_colorimetry { HDMI_EXTENDED_COLORIMETRY_S_YCC_601, HDMI_EXTENDED_COLORIMETRY_ADOBE_YCC_601, HDMI_EXTENDED_COLORIMETRY_ADOBE_RGB, + + /* The following EC values are only defined in CEA-861-F. */ + HDMI_EXTENDED_COLORIMETRY_BT2020_CONST_LUM, + HDMI_EXTENDED_COLORIMETRY_BT2020, + HDMI_EXTENDED_COLORIMETRY_RESERVED, }; enum hdmi_quantization_range { HDMI_QUANTIZATION_RANGE_DEFAULT, HDMI_QUANTIZATION_RANGE_LIMITED, HDMI_QUANTIZATION_RANGE_FULL, + HDMI_QUANTIZATION_RANGE_RESERVED, }; /* non-uniform picture scaling */ @@ -114,7 +127,7 @@ enum hdmi_ycc_quantization_range { }; enum hdmi_content_type { - HDMI_CONTENT_TYPE_NONE, + HDMI_CONTENT_TYPE_GRAPHICS, HDMI_CONTENT_TYPE_PHOTO, HDMI_CONTENT_TYPE_CINEMA, HDMI_CONTENT_TYPE_GAME, @@ -194,6 +207,7 @@ enum hdmi_audio_coding_type { HDMI_AUDIO_CODING_TYPE_MLP, HDMI_AUDIO_CODING_TYPE_DST, HDMI_AUDIO_CODING_TYPE_WMA_PRO, + HDMI_AUDIO_CODING_TYPE_CXT, }; enum hdmi_audio_sample_size { @@ -216,9 +230,23 @@ enum hdmi_audio_sample_frequency { enum hdmi_audio_coding_type_ext { HDMI_AUDIO_CODING_TYPE_EXT_STREAM, + + /* + * The next three CXT values are defined in CEA-861-E only. + * They do not exist in older versions, and in CEA-861-F they are + * defined as 'Not in use'. + */ HDMI_AUDIO_CODING_TYPE_EXT_HE_AAC, HDMI_AUDIO_CODING_TYPE_EXT_HE_AAC_V2, HDMI_AUDIO_CODING_TYPE_EXT_MPEG_SURROUND, + + /* The following CXT values are only defined in CEA-861-F. */ + HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_HE_AAC, + HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_HE_AAC_V2, + HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_AAC_LC, + HDMI_AUDIO_CODING_TYPE_EXT_DRA, + HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_HE_AAC_SURROUND, + HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_AAC_LC_SURROUND = 10, }; struct hdmi_audio_infoframe { -- cgit v1.2.3 From 2c676f378edb16cb68f7815581c8119fc43a4b85 Mon Sep 17 00:00:00 2001 From: Martin Bugge Date: Fri, 19 Dec 2014 09:14:21 -0300 Subject: [media] hdmi: added unpack and logging functions for InfoFrames When receiving video it is very useful to be able to unpack the InfoFrames. Logging is useful as well, both for transmitters and receivers. 
Especially when implementing the VIDIOC_LOG_STATUS ioctl (supported by many V4L2 drivers) for a receiver it is important to be able to easily log what the InfoFrame contains. This greatly simplifies debugging. Signed-off-by: Martin Bugge Signed-off-by: Hans Verkuil Acked-by: Thierry Reding Signed-off-by: Mauro Carvalho Chehab --- drivers/video/hdmi.c | 822 ++++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/hdmi.h | 4 + 2 files changed, 819 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/hdmi.c b/drivers/video/hdmi.c index 9e758a8f890d..a7c6ae4e10e5 100644 --- a/drivers/video/hdmi.c +++ b/drivers/video/hdmi.c @@ -27,10 +27,12 @@ #include #include #include +#include -static void hdmi_infoframe_checksum(void *buffer, size_t size) +#define hdmi_log(fmt, ...) dev_printk(level, dev, fmt, ##__VA_ARGS__) + +static u8 hdmi_infoframe_checksum(u8 *ptr, size_t size) { - u8 *ptr = buffer; u8 csum = 0; size_t i; @@ -38,7 +40,14 @@ static void hdmi_infoframe_checksum(void *buffer, size_t size) for (i = 0; i < size; i++) csum += ptr[i]; - ptr[3] = 256 - csum; + return 256 - csum; +} + +static void hdmi_infoframe_set_checksum(void *buffer, size_t size) +{ + u8 *ptr = buffer; + + ptr[3] = hdmi_infoframe_checksum(buffer, size); } /** @@ -136,7 +145,7 @@ ssize_t hdmi_avi_infoframe_pack(struct hdmi_avi_infoframe *frame, void *buffer, ptr[11] = frame->right_bar & 0xff; ptr[12] = (frame->right_bar >> 8) & 0xff; - hdmi_infoframe_checksum(buffer, length); + hdmi_infoframe_set_checksum(buffer, length); return length; } @@ -206,7 +215,7 @@ ssize_t hdmi_spd_infoframe_pack(struct hdmi_spd_infoframe *frame, void *buffer, ptr[24] = frame->sdi; - hdmi_infoframe_checksum(buffer, length); + hdmi_infoframe_set_checksum(buffer, length); return length; } @@ -281,7 +290,7 @@ ssize_t hdmi_audio_infoframe_pack(struct hdmi_audio_infoframe *frame, if (frame->downmix_inhibit) ptr[4] |= BIT(7); - hdmi_infoframe_checksum(buffer, length); + hdmi_infoframe_set_checksum(buffer, length); return length; } @@ -373,7 +382,7 @@ ssize_t hdmi_vendor_infoframe_pack(struct hdmi_vendor_infoframe *frame, ptr[9] = (frame->s3d_ext_data & 0xf) << 4; } - hdmi_infoframe_checksum(buffer, length); + hdmi_infoframe_set_checksum(buffer, length); return length; } @@ -434,3 +443,802 @@ hdmi_infoframe_pack(union hdmi_infoframe *frame, void *buffer, size_t size) return length; } EXPORT_SYMBOL(hdmi_infoframe_pack); + +static const char *hdmi_infoframe_type_get_name(enum hdmi_infoframe_type type) +{ + if (type < 0x80 || type > 0x9f) + return "Invalid"; + switch (type) { + case HDMI_INFOFRAME_TYPE_VENDOR: + return "Vendor"; + case HDMI_INFOFRAME_TYPE_AVI: + return "Auxiliary Video Information (AVI)"; + case HDMI_INFOFRAME_TYPE_SPD: + return "Source Product Description (SPD)"; + case HDMI_INFOFRAME_TYPE_AUDIO: + return "Audio"; + } + return "Reserved"; +} + +static void hdmi_infoframe_log_header(const char *level, + struct device *dev, + struct hdmi_any_infoframe *frame) +{ + hdmi_log("HDMI infoframe: %s, version %u, length %u\n", + hdmi_infoframe_type_get_name(frame->type), + frame->version, frame->length); +} + +static const char *hdmi_colorspace_get_name(enum hdmi_colorspace colorspace) +{ + switch (colorspace) { + case HDMI_COLORSPACE_RGB: + return "RGB"; + case HDMI_COLORSPACE_YUV422: + return "YCbCr 4:2:2"; + case HDMI_COLORSPACE_YUV444: + return "YCbCr 4:4:4"; + case HDMI_COLORSPACE_YUV420: + return "YCbCr 4:2:0"; + case HDMI_COLORSPACE_RESERVED4: + return "Reserved (4)"; + case HDMI_COLORSPACE_RESERVED5: 
+ return "Reserved (5)"; + case HDMI_COLORSPACE_RESERVED6: + return "Reserved (6)"; + case HDMI_COLORSPACE_IDO_DEFINED: + return "IDO Defined"; + } + return "Invalid"; +} + +static const char *hdmi_scan_mode_get_name(enum hdmi_scan_mode scan_mode) +{ + switch (scan_mode) { + case HDMI_SCAN_MODE_NONE: + return "No Data"; + case HDMI_SCAN_MODE_OVERSCAN: + return "Overscan"; + case HDMI_SCAN_MODE_UNDERSCAN: + return "Underscan"; + case HDMI_SCAN_MODE_RESERVED: + return "Reserved"; + } + return "Invalid"; +} + +static const char *hdmi_colorimetry_get_name(enum hdmi_colorimetry colorimetry) +{ + switch (colorimetry) { + case HDMI_COLORIMETRY_NONE: + return "No Data"; + case HDMI_COLORIMETRY_ITU_601: + return "ITU601"; + case HDMI_COLORIMETRY_ITU_709: + return "ITU709"; + case HDMI_COLORIMETRY_EXTENDED: + return "Extended"; + } + return "Invalid"; +} + +static const char * +hdmi_picture_aspect_get_name(enum hdmi_picture_aspect picture_aspect) +{ + switch (picture_aspect) { + case HDMI_PICTURE_ASPECT_NONE: + return "No Data"; + case HDMI_PICTURE_ASPECT_4_3: + return "4:3"; + case HDMI_PICTURE_ASPECT_16_9: + return "16:9"; + case HDMI_PICTURE_ASPECT_RESERVED: + return "Reserved"; + } + return "Invalid"; +} + +static const char * +hdmi_active_aspect_get_name(enum hdmi_active_aspect active_aspect) +{ + if (active_aspect < 0 || active_aspect > 0xf) + return "Invalid"; + + switch (active_aspect) { + case HDMI_ACTIVE_ASPECT_16_9_TOP: + return "16:9 Top"; + case HDMI_ACTIVE_ASPECT_14_9_TOP: + return "14:9 Top"; + case HDMI_ACTIVE_ASPECT_16_9_CENTER: + return "16:9 Center"; + case HDMI_ACTIVE_ASPECT_PICTURE: + return "Same as Picture"; + case HDMI_ACTIVE_ASPECT_4_3: + return "4:3"; + case HDMI_ACTIVE_ASPECT_16_9: + return "16:9"; + case HDMI_ACTIVE_ASPECT_14_9: + return "14:9"; + case HDMI_ACTIVE_ASPECT_4_3_SP_14_9: + return "4:3 SP 14:9"; + case HDMI_ACTIVE_ASPECT_16_9_SP_14_9: + return "16:9 SP 14:9"; + case HDMI_ACTIVE_ASPECT_16_9_SP_4_3: + return "16:9 SP 4:3"; + } + return "Reserved"; +} + +static const char * +hdmi_extended_colorimetry_get_name(enum hdmi_extended_colorimetry ext_col) +{ + switch (ext_col) { + case HDMI_EXTENDED_COLORIMETRY_XV_YCC_601: + return "xvYCC 601"; + case HDMI_EXTENDED_COLORIMETRY_XV_YCC_709: + return "xvYCC 709"; + case HDMI_EXTENDED_COLORIMETRY_S_YCC_601: + return "sYCC 601"; + case HDMI_EXTENDED_COLORIMETRY_ADOBE_YCC_601: + return "Adobe YCC 601"; + case HDMI_EXTENDED_COLORIMETRY_ADOBE_RGB: + return "Adobe RGB"; + case HDMI_EXTENDED_COLORIMETRY_BT2020_CONST_LUM: + return "BT.2020 Constant Luminance"; + case HDMI_EXTENDED_COLORIMETRY_BT2020: + return "BT.2020"; + case HDMI_EXTENDED_COLORIMETRY_RESERVED: + return "Reserved"; + } + return "Invalid"; +} + +static const char * +hdmi_quantization_range_get_name(enum hdmi_quantization_range qrange) +{ + switch (qrange) { + case HDMI_QUANTIZATION_RANGE_DEFAULT: + return "Default"; + case HDMI_QUANTIZATION_RANGE_LIMITED: + return "Limited"; + case HDMI_QUANTIZATION_RANGE_FULL: + return "Full"; + case HDMI_QUANTIZATION_RANGE_RESERVED: + return "Reserved"; + } + return "Invalid"; +} + +static const char *hdmi_nups_get_name(enum hdmi_nups nups) +{ + switch (nups) { + case HDMI_NUPS_UNKNOWN: + return "Unknown Non-uniform Scaling"; + case HDMI_NUPS_HORIZONTAL: + return "Horizontally Scaled"; + case HDMI_NUPS_VERTICAL: + return "Vertically Scaled"; + case HDMI_NUPS_BOTH: + return "Horizontally and Vertically Scaled"; + } + return "Invalid"; +} + +static const char * +hdmi_ycc_quantization_range_get_name(enum hdmi_ycc_quantization_range 
qrange) +{ + switch (qrange) { + case HDMI_YCC_QUANTIZATION_RANGE_LIMITED: + return "Limited"; + case HDMI_YCC_QUANTIZATION_RANGE_FULL: + return "Full"; + } + return "Invalid"; +} + +static const char * +hdmi_content_type_get_name(enum hdmi_content_type content_type) +{ + switch (content_type) { + case HDMI_CONTENT_TYPE_GRAPHICS: + return "Graphics"; + case HDMI_CONTENT_TYPE_PHOTO: + return "Photo"; + case HDMI_CONTENT_TYPE_CINEMA: + return "Cinema"; + case HDMI_CONTENT_TYPE_GAME: + return "Game"; + } + return "Invalid"; +} + +/** + * hdmi_avi_infoframe_log() - log info of HDMI AVI infoframe + * @level: logging level + * @dev: device + * @frame: HDMI AVI infoframe + */ +static void hdmi_avi_infoframe_log(const char *level, + struct device *dev, + struct hdmi_avi_infoframe *frame) +{ + hdmi_infoframe_log_header(level, dev, + (struct hdmi_any_infoframe *)frame); + + hdmi_log(" colorspace: %s\n", + hdmi_colorspace_get_name(frame->colorspace)); + hdmi_log(" scan mode: %s\n", + hdmi_scan_mode_get_name(frame->scan_mode)); + hdmi_log(" colorimetry: %s\n", + hdmi_colorimetry_get_name(frame->colorimetry)); + hdmi_log(" picture aspect: %s\n", + hdmi_picture_aspect_get_name(frame->picture_aspect)); + hdmi_log(" active aspect: %s\n", + hdmi_active_aspect_get_name(frame->active_aspect)); + hdmi_log(" itc: %s\n", frame->itc ? "IT Content" : "No Data"); + hdmi_log(" extended colorimetry: %s\n", + hdmi_extended_colorimetry_get_name(frame->extended_colorimetry)); + hdmi_log(" quantization range: %s\n", + hdmi_quantization_range_get_name(frame->quantization_range)); + hdmi_log(" nups: %s\n", hdmi_nups_get_name(frame->nups)); + hdmi_log(" video code: %u\n", frame->video_code); + hdmi_log(" ycc quantization range: %s\n", + hdmi_ycc_quantization_range_get_name(frame->ycc_quantization_range)); + hdmi_log(" hdmi content type: %s\n", + hdmi_content_type_get_name(frame->content_type)); + hdmi_log(" pixel repeat: %u\n", frame->pixel_repeat); + hdmi_log(" bar top %u, bottom %u, left %u, right %u\n", + frame->top_bar, frame->bottom_bar, + frame->left_bar, frame->right_bar); +} + +static const char *hdmi_spd_sdi_get_name(enum hdmi_spd_sdi sdi) +{ + if (sdi < 0 || sdi > 0xff) + return "Invalid"; + switch (sdi) { + case HDMI_SPD_SDI_UNKNOWN: + return "Unknown"; + case HDMI_SPD_SDI_DSTB: + return "Digital STB"; + case HDMI_SPD_SDI_DVDP: + return "DVD Player"; + case HDMI_SPD_SDI_DVHS: + return "D-VHS"; + case HDMI_SPD_SDI_HDDVR: + return "HDD Videorecorder"; + case HDMI_SPD_SDI_DVC: + return "DVC"; + case HDMI_SPD_SDI_DSC: + return "DSC"; + case HDMI_SPD_SDI_VCD: + return "Video CD"; + case HDMI_SPD_SDI_GAME: + return "Game"; + case HDMI_SPD_SDI_PC: + return "PC General"; + case HDMI_SPD_SDI_BD: + return "Blu-Ray Disc (BD)"; + case HDMI_SPD_SDI_SACD: + return "Super Audio CD"; + case HDMI_SPD_SDI_HDDVD: + return "HD DVD"; + case HDMI_SPD_SDI_PMP: + return "PMP"; + } + return "Reserved"; +} + +/** + * hdmi_spd_infoframe_log() - log info of HDMI SPD infoframe + * @level: logging level + * @dev: device + * @frame: HDMI SPD infoframe + */ +static void hdmi_spd_infoframe_log(const char *level, + struct device *dev, + struct hdmi_spd_infoframe *frame) +{ + u8 buf[17]; + + hdmi_infoframe_log_header(level, dev, + (struct hdmi_any_infoframe *)frame); + + memset(buf, 0, sizeof(buf)); + + strncpy(buf, frame->vendor, 8); + hdmi_log(" vendor: %s\n", buf); + strncpy(buf, frame->product, 16); + hdmi_log(" product: %s\n", buf); + hdmi_log(" source device information: %s (0x%x)\n", + hdmi_spd_sdi_get_name(frame->sdi), frame->sdi); +} + 
+static const char * +hdmi_audio_coding_type_get_name(enum hdmi_audio_coding_type coding_type) +{ + switch (coding_type) { + case HDMI_AUDIO_CODING_TYPE_STREAM: + return "Refer to Stream Header"; + case HDMI_AUDIO_CODING_TYPE_PCM: + return "PCM"; + case HDMI_AUDIO_CODING_TYPE_AC3: + return "AC-3"; + case HDMI_AUDIO_CODING_TYPE_MPEG1: + return "MPEG1"; + case HDMI_AUDIO_CODING_TYPE_MP3: + return "MP3"; + case HDMI_AUDIO_CODING_TYPE_MPEG2: + return "MPEG2"; + case HDMI_AUDIO_CODING_TYPE_AAC_LC: + return "AAC"; + case HDMI_AUDIO_CODING_TYPE_DTS: + return "DTS"; + case HDMI_AUDIO_CODING_TYPE_ATRAC: + return "ATRAC"; + case HDMI_AUDIO_CODING_TYPE_DSD: + return "One Bit Audio"; + case HDMI_AUDIO_CODING_TYPE_EAC3: + return "Dolby Digital +"; + case HDMI_AUDIO_CODING_TYPE_DTS_HD: + return "DTS-HD"; + case HDMI_AUDIO_CODING_TYPE_MLP: + return "MAT (MLP)"; + case HDMI_AUDIO_CODING_TYPE_DST: + return "DST"; + case HDMI_AUDIO_CODING_TYPE_WMA_PRO: + return "WMA PRO"; + case HDMI_AUDIO_CODING_TYPE_CXT: + return "Refer to CXT"; + } + return "Invalid"; +} + +static const char * +hdmi_audio_sample_size_get_name(enum hdmi_audio_sample_size sample_size) +{ + switch (sample_size) { + case HDMI_AUDIO_SAMPLE_SIZE_STREAM: + return "Refer to Stream Header"; + case HDMI_AUDIO_SAMPLE_SIZE_16: + return "16 bit"; + case HDMI_AUDIO_SAMPLE_SIZE_20: + return "20 bit"; + case HDMI_AUDIO_SAMPLE_SIZE_24: + return "24 bit"; + } + return "Invalid"; +} + +static const char * +hdmi_audio_sample_frequency_get_name(enum hdmi_audio_sample_frequency freq) +{ + switch (freq) { + case HDMI_AUDIO_SAMPLE_FREQUENCY_STREAM: + return "Refer to Stream Header"; + case HDMI_AUDIO_SAMPLE_FREQUENCY_32000: + return "32 kHz"; + case HDMI_AUDIO_SAMPLE_FREQUENCY_44100: + return "44.1 kHz (CD)"; + case HDMI_AUDIO_SAMPLE_FREQUENCY_48000: + return "48 kHz"; + case HDMI_AUDIO_SAMPLE_FREQUENCY_88200: + return "88.2 kHz"; + case HDMI_AUDIO_SAMPLE_FREQUENCY_96000: + return "96 kHz"; + case HDMI_AUDIO_SAMPLE_FREQUENCY_176400: + return "176.4 kHz"; + case HDMI_AUDIO_SAMPLE_FREQUENCY_192000: + return "192 kHz"; + } + return "Invalid"; +} + +static const char * +hdmi_audio_coding_type_ext_get_name(enum hdmi_audio_coding_type_ext ctx) +{ + if (ctx < 0 || ctx > 0x1f) + return "Invalid"; + + switch (ctx) { + case HDMI_AUDIO_CODING_TYPE_EXT_STREAM: + return "Refer to CT"; + case HDMI_AUDIO_CODING_TYPE_EXT_HE_AAC: + return "HE AAC"; + case HDMI_AUDIO_CODING_TYPE_EXT_HE_AAC_V2: + return "HE AAC v2"; + case HDMI_AUDIO_CODING_TYPE_EXT_MPEG_SURROUND: + return "MPEG SURROUND"; + case HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_HE_AAC: + return "MPEG-4 HE AAC"; + case HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_HE_AAC_V2: + return "MPEG-4 HE AAC v2"; + case HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_AAC_LC: + return "MPEG-4 AAC LC"; + case HDMI_AUDIO_CODING_TYPE_EXT_DRA: + return "DRA"; + case HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_HE_AAC_SURROUND: + return "MPEG-4 HE AAC + MPEG Surround"; + case HDMI_AUDIO_CODING_TYPE_EXT_MPEG4_AAC_LC_SURROUND: + return "MPEG-4 AAC LC + MPEG Surround"; + } + return "Reserved"; +} + +/** + * hdmi_audio_infoframe_log() - log info of HDMI AUDIO infoframe + * @level: logging level + * @dev: device + * @frame: HDMI AUDIO infoframe + */ +static void hdmi_audio_infoframe_log(const char *level, + struct device *dev, + struct hdmi_audio_infoframe *frame) +{ + hdmi_infoframe_log_header(level, dev, + (struct hdmi_any_infoframe *)frame); + + if (frame->channels) + hdmi_log(" channels: %u\n", frame->channels - 1); + else + hdmi_log(" channels: Refer to stream header\n"); + hdmi_log(" 
coding type: %s\n", + hdmi_audio_coding_type_get_name(frame->coding_type)); + hdmi_log(" sample size: %s\n", + hdmi_audio_sample_size_get_name(frame->sample_size)); + hdmi_log(" sample frequency: %s\n", + hdmi_audio_sample_frequency_get_name(frame->sample_frequency)); + hdmi_log(" coding type ext: %s\n", + hdmi_audio_coding_type_ext_get_name(frame->coding_type_ext)); + hdmi_log(" channel allocation: 0x%x\n", + frame->channel_allocation); + hdmi_log(" level shift value: %u dB\n", + frame->level_shift_value); + hdmi_log(" downmix inhibit: %s\n", + frame->downmix_inhibit ? "Yes" : "No"); +} + +static const char * +hdmi_3d_structure_get_name(enum hdmi_3d_structure s3d_struct) +{ + if (s3d_struct < 0 || s3d_struct > 0xf) + return "Invalid"; + + switch (s3d_struct) { + case HDMI_3D_STRUCTURE_FRAME_PACKING: + return "Frame Packing"; + case HDMI_3D_STRUCTURE_FIELD_ALTERNATIVE: + return "Field Alternative"; + case HDMI_3D_STRUCTURE_LINE_ALTERNATIVE: + return "Line Alternative"; + case HDMI_3D_STRUCTURE_SIDE_BY_SIDE_FULL: + return "Side-by-side (Full)"; + case HDMI_3D_STRUCTURE_L_DEPTH: + return "L + Depth"; + case HDMI_3D_STRUCTURE_L_DEPTH_GFX_GFX_DEPTH: + return "L + Depth + Graphics + Graphics-depth"; + case HDMI_3D_STRUCTURE_TOP_AND_BOTTOM: + return "Top-and-Bottom"; + case HDMI_3D_STRUCTURE_SIDE_BY_SIDE_HALF: + return "Side-by-side (Half)"; + default: + break; + } + return "Reserved"; +} + +/** + * hdmi_vendor_infoframe_log() - log info of HDMI VENDOR infoframe + * @level: logging level + * @dev: device + * @frame: HDMI VENDOR infoframe + */ +static void +hdmi_vendor_any_infoframe_log(const char *level, + struct device *dev, + union hdmi_vendor_any_infoframe *frame) +{ + struct hdmi_vendor_infoframe *hvf = &frame->hdmi; + + hdmi_infoframe_log_header(level, dev, + (struct hdmi_any_infoframe *)frame); + + if (frame->any.oui != HDMI_IEEE_OUI) { + hdmi_log(" not a HDMI vendor infoframe\n"); + return; + } + if (hvf->vic == 0 && hvf->s3d_struct == HDMI_3D_STRUCTURE_INVALID) { + hdmi_log(" empty frame\n"); + return; + } + + if (hvf->vic) + hdmi_log(" HDMI VIC: %u\n", hvf->vic); + if (hvf->s3d_struct != HDMI_3D_STRUCTURE_INVALID) { + hdmi_log(" 3D structure: %s\n", + hdmi_3d_structure_get_name(hvf->s3d_struct)); + if (hvf->s3d_struct >= HDMI_3D_STRUCTURE_SIDE_BY_SIDE_HALF) + hdmi_log(" 3D extension data: %d\n", + hvf->s3d_ext_data); + } +} + +/** + * hdmi_infoframe_log() - log info of HDMI infoframe + * @level: logging level + * @dev: device + * @frame: HDMI infoframe + */ +void hdmi_infoframe_log(const char *level, + struct device *dev, + union hdmi_infoframe *frame) +{ + switch (frame->any.type) { + case HDMI_INFOFRAME_TYPE_AVI: + hdmi_avi_infoframe_log(level, dev, &frame->avi); + break; + case HDMI_INFOFRAME_TYPE_SPD: + hdmi_spd_infoframe_log(level, dev, &frame->spd); + break; + case HDMI_INFOFRAME_TYPE_AUDIO: + hdmi_audio_infoframe_log(level, dev, &frame->audio); + break; + case HDMI_INFOFRAME_TYPE_VENDOR: + hdmi_vendor_any_infoframe_log(level, dev, &frame->vendor); + break; + } +} +EXPORT_SYMBOL(hdmi_infoframe_log); + +/** + * hdmi_avi_infoframe_unpack() - unpack binary buffer to a HDMI AVI infoframe + * @buffer: source buffer + * @frame: HDMI AVI infoframe + * + * Unpacks the information contained in binary @buffer into a structured + * @frame of the HDMI Auxiliary Video (AVI) information frame. + * Also verifies the checksum as required by section 5.3.5 of the HDMI 1.4 + * specification. + * + * Returns 0 on success or a negative error code on failure. 
+ */ +static int hdmi_avi_infoframe_unpack(struct hdmi_avi_infoframe *frame, + void *buffer) +{ + u8 *ptr = buffer; + int ret; + + if (ptr[0] != HDMI_INFOFRAME_TYPE_AVI || + ptr[1] != 2 || + ptr[2] != HDMI_AVI_INFOFRAME_SIZE) + return -EINVAL; + + if (hdmi_infoframe_checksum(buffer, HDMI_INFOFRAME_SIZE(AVI)) != 0) + return -EINVAL; + + ret = hdmi_avi_infoframe_init(frame); + if (ret) + return ret; + + ptr += HDMI_INFOFRAME_HEADER_SIZE; + + frame->colorspace = (ptr[0] >> 5) & 0x3; + if (ptr[0] & 0x10) + frame->active_aspect = ptr[1] & 0xf; + if (ptr[0] & 0x8) { + frame->top_bar = (ptr[5] << 8) + ptr[6]; + frame->bottom_bar = (ptr[7] << 8) + ptr[8]; + } + if (ptr[0] & 0x4) { + frame->left_bar = (ptr[9] << 8) + ptr[10]; + frame->right_bar = (ptr[11] << 8) + ptr[12]; + } + frame->scan_mode = ptr[0] & 0x3; + + frame->colorimetry = (ptr[1] >> 6) & 0x3; + frame->picture_aspect = (ptr[1] >> 4) & 0x3; + frame->active_aspect = ptr[1] & 0xf; + + frame->itc = ptr[2] & 0x80 ? true : false; + frame->extended_colorimetry = (ptr[2] >> 4) & 0x7; + frame->quantization_range = (ptr[2] >> 2) & 0x3; + frame->nups = ptr[2] & 0x3; + + frame->video_code = ptr[3] & 0x7f; + frame->ycc_quantization_range = (ptr[4] >> 6) & 0x3; + frame->content_type = (ptr[4] >> 4) & 0x3; + + frame->pixel_repeat = ptr[4] & 0xf; + + return 0; +} + +/** + * hdmi_spd_infoframe_unpack() - unpack binary buffer to a HDMI SPD infoframe + * @buffer: source buffer + * @frame: HDMI SPD infoframe + * + * Unpacks the information contained in binary @buffer into a structured + * @frame of the HDMI Source Product Description (SPD) information frame. + * Also verifies the checksum as required by section 5.3.5 of the HDMI 1.4 + * specification. + * + * Returns 0 on success or a negative error code on failure. + */ +static int hdmi_spd_infoframe_unpack(struct hdmi_spd_infoframe *frame, + void *buffer) +{ + u8 *ptr = buffer; + int ret; + + if (ptr[0] != HDMI_INFOFRAME_TYPE_SPD || + ptr[1] != 1 || + ptr[2] != HDMI_SPD_INFOFRAME_SIZE) { + return -EINVAL; + } + + if (hdmi_infoframe_checksum(buffer, HDMI_INFOFRAME_SIZE(SPD)) != 0) + return -EINVAL; + + ptr += HDMI_INFOFRAME_HEADER_SIZE; + + ret = hdmi_spd_infoframe_init(frame, ptr, ptr + 8); + if (ret) + return ret; + + frame->sdi = ptr[24]; + + return 0; +} + +/** + * hdmi_audio_infoframe_unpack() - unpack binary buffer to a HDMI AUDIO infoframe + * @buffer: source buffer + * @frame: HDMI Audio infoframe + * + * Unpacks the information contained in binary @buffer into a structured + * @frame of the HDMI Audio information frame. + * Also verifies the checksum as required by section 5.3.5 of the HDMI 1.4 + * specification. + * + * Returns 0 on success or a negative error code on failure. + */ +static int hdmi_audio_infoframe_unpack(struct hdmi_audio_infoframe *frame, + void *buffer) +{ + u8 *ptr = buffer; + int ret; + + if (ptr[0] != HDMI_INFOFRAME_TYPE_AUDIO || + ptr[1] != 1 || + ptr[2] != HDMI_AUDIO_INFOFRAME_SIZE) { + return -EINVAL; + } + + if (hdmi_infoframe_checksum(buffer, HDMI_INFOFRAME_SIZE(AUDIO)) != 0) + return -EINVAL; + + ret = hdmi_audio_infoframe_init(frame); + if (ret) + return ret; + + ptr += HDMI_INFOFRAME_HEADER_SIZE; + + frame->channels = ptr[0] & 0x7; + frame->coding_type = (ptr[0] >> 4) & 0xf; + frame->sample_size = ptr[1] & 0x3; + frame->sample_frequency = (ptr[1] >> 2) & 0x7; + frame->coding_type_ext = ptr[2] & 0x1f; + frame->channel_allocation = ptr[3]; + frame->level_shift_value = (ptr[4] >> 3) & 0xf; + frame->downmix_inhibit = ptr[4] & 0x80 ? 
true : false; + + return 0; +} + +/** + * hdmi_vendor_infoframe_unpack() - unpack binary buffer to a HDMI vendor infoframe + * @buffer: source buffer + * @frame: HDMI Vendor infoframe + * + * Unpacks the information contained in binary @buffer into a structured + * @frame of the HDMI Vendor information frame. + * Also verifies the checksum as required by section 5.3.5 of the HDMI 1.4 + * specification. + * + * Returns 0 on success or a negative error code on failure. + */ +static int +hdmi_vendor_any_infoframe_unpack(union hdmi_vendor_any_infoframe *frame, + void *buffer) +{ + u8 *ptr = buffer; + size_t length; + int ret; + u8 hdmi_video_format; + struct hdmi_vendor_infoframe *hvf = &frame->hdmi; + + if (ptr[0] != HDMI_INFOFRAME_TYPE_VENDOR || + ptr[1] != 1 || + (ptr[2] != 5 && ptr[2] != 6)) + return -EINVAL; + + length = ptr[2]; + + if (hdmi_infoframe_checksum(buffer, + HDMI_INFOFRAME_HEADER_SIZE + length) != 0) + return -EINVAL; + + ptr += HDMI_INFOFRAME_HEADER_SIZE; + + /* HDMI OUI */ + if ((ptr[0] != 0x03) || + (ptr[1] != 0x0c) || + (ptr[2] != 0x00)) + return -EINVAL; + + hdmi_video_format = ptr[3] >> 5; + + if (hdmi_video_format > 0x2) + return -EINVAL; + + ret = hdmi_vendor_infoframe_init(hvf); + if (ret) + return ret; + + hvf->length = length; + + if (hdmi_video_format == 0x1) { + hvf->vic = ptr[4]; + } else if (hdmi_video_format == 0x2) { + hvf->s3d_struct = ptr[4] >> 4; + if (hvf->s3d_struct >= HDMI_3D_STRUCTURE_SIDE_BY_SIDE_HALF) { + if (length == 6) + hvf->s3d_ext_data = ptr[5] >> 4; + else + return -EINVAL; + } + } + + return 0; +} + +/** + * hdmi_infoframe_unpack() - unpack binary buffer to a HDMI infoframe + * @buffer: source buffer + * @frame: HDMI infoframe + * + * Unpacks the information contained in binary buffer @buffer into a structured + * @frame of a HDMI infoframe. + * Also verifies the checksum as required by section 5.3.5 of the HDMI 1.4 + * specification. + * + * Returns 0 on success or a negative error code on failure. 
+ */ +int hdmi_infoframe_unpack(union hdmi_infoframe *frame, void *buffer) +{ + int ret; + u8 *ptr = buffer; + + switch (ptr[0]) { + case HDMI_INFOFRAME_TYPE_AVI: + ret = hdmi_avi_infoframe_unpack(&frame->avi, buffer); + break; + case HDMI_INFOFRAME_TYPE_SPD: + ret = hdmi_spd_infoframe_unpack(&frame->spd, buffer); + break; + case HDMI_INFOFRAME_TYPE_AUDIO: + ret = hdmi_audio_infoframe_unpack(&frame->audio, buffer); + break; + case HDMI_INFOFRAME_TYPE_VENDOR: + ret = hdmi_vendor_any_infoframe_unpack(&frame->vendor, buffer); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} +EXPORT_SYMBOL(hdmi_infoframe_unpack); diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index 5afc0bff2bbe..2ff34315a1bb 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -25,6 +25,7 @@ #define __LINUX_HDMI_H_ #include +#include enum hdmi_infoframe_type { HDMI_INFOFRAME_TYPE_VENDOR = 0x81, @@ -327,5 +328,8 @@ union hdmi_infoframe { ssize_t hdmi_infoframe_pack(union hdmi_infoframe *frame, void *buffer, size_t size); +int hdmi_infoframe_unpack(union hdmi_infoframe *frame, void *buffer); +void hdmi_infoframe_log(const char *level, struct device *dev, + union hdmi_infoframe *frame); #endif /* _DRM_HDMI_H */ -- cgit v1.2.3 From dc189053e1a5ae606c56e432dae1afc28261a819 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 19 Dec 2014 09:14:22 -0300 Subject: [media] hdmi: rename HDMI_AUDIO_CODING_TYPE_EXT_STREAM to _EXT_CT As per the suggestion of Thierry Reding rename HDMI_AUDIO_CODING_TYPE_EXT_STREAM to HDMI_AUDIO_CODING_TYPE_EXT_CT to be consistent with the CEA-861 spec. Signed-off-by: Hans Verkuil Acked-by: Thierry Reding Signed-off-by: Mauro Carvalho Chehab --- drivers/video/hdmi.c | 2 +- include/linux/hdmi.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/hdmi.c b/drivers/video/hdmi.c index a7c6ae4e10e5..162689227a23 100644 --- a/drivers/video/hdmi.c +++ b/drivers/video/hdmi.c @@ -842,7 +842,7 @@ hdmi_audio_coding_type_ext_get_name(enum hdmi_audio_coding_type_ext ctx) return "Invalid"; switch (ctx) { - case HDMI_AUDIO_CODING_TYPE_EXT_STREAM: + case HDMI_AUDIO_CODING_TYPE_EXT_CT: return "Refer to CT"; case HDMI_AUDIO_CODING_TYPE_EXT_HE_AAC: return "HE AAC"; diff --git a/include/linux/hdmi.h b/include/linux/hdmi.h index 2ff34315a1bb..e9744202fa29 100644 --- a/include/linux/hdmi.h +++ b/include/linux/hdmi.h @@ -230,7 +230,8 @@ enum hdmi_audio_sample_frequency { }; enum hdmi_audio_coding_type_ext { - HDMI_AUDIO_CODING_TYPE_EXT_STREAM, + /* Refer to Audio Coding Type (CT) field in Data Byte 1 */ + HDMI_AUDIO_CODING_TYPE_EXT_CT, /* * The next three CXT values are defined in CEA-861-E only. -- cgit v1.2.3 From f27b37f5993a080700ccecdce9960d1563eccd36 Mon Sep 17 00:00:00 2001 From: Bintian Wang Date: Tue, 27 Jan 2015 20:50:29 +0800 Subject: regmap: correct the description of structure element in reg_field Fix incorrect description of structure element "msb", which is described as "reg". Signed-off-by: Bintian Wang Signed-off-by: Mark Brown --- include/linux/regmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 4419b99d8d6e..116655d92269 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -468,7 +468,7 @@ bool regmap_reg_in_ranges(unsigned int reg, * * @reg: Offset of the register within the regmap bank * @lsb: lsb of the register field. - * @reg: msb of the register field. 
+ * @msb: msb of the register field. * @id_size: port size if it has some ports * @id_offset: address offset for each ports */ -- cgit v1.2.3 From 2130fb97fecf9a51bb4a21da220cff3f72496a94 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Tue, 27 Jan 2015 01:18:19 +0100 Subject: NFC: st21nfca: Adding support for secure element st21nfca has one physical SWP line and can support up to two secure elements (UICC & eSE) thanks to an external switch managed with a gpio. The platform integrator needs to specify, through two initialization properties, uicc-present and ese-present, whether the platform is supposed to have a UICC and/or an eSE. Of course, if the platform does not have an external switch, only one kind of secure element can be supported. Those parameters are the platform integrator's responsibility. During initialization, the white_list will be set according to those parameters. The discovery_se function will assume a secure element is physically present according to the uicc-present and ese-present values and will add it to the secure element list. On eSE activation, the ATR is retrieved to calculate a command exchange timeout based on the first ATR(TB) value. The se_io function allows transferring data over SWP. Two kinds of events may occur after data is sent: - ST21NFCA_EVT_TRANSMIT_DATA when receiving an APDU answer - ST21NFCA_EVT_WTX_REQUEST when the secure element needs more time than expected to compute a command. If this timeout expires, a first recovery attempt consists of sending a proprietary software reset command. If that attempt also fails, a second recovery attempt consists of sending a proprietary hardware reset command. This function is only relevant for eSE-like secure elements. This patch also changes the way a pipe is referenced. There can be different pipes connected to the same gate with different destination hosts (e.g. CONNECTIVITY). In order to keep the host information, every pipe is referenced with a (gate, host) tuple. In order to reduce changes, the way a gate is addressed on the Terminal Host is kept unchanged. This works because we consider the APDU reader gate to be present only on the eSE slot; the connectivity gate, on the other hand, cannot give a reliable value and will return the latest stored pipe value. A sketch of the timeout derivation follows below.
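
For illustration, the command exchange timeout derivation described above can be sketched as a small standalone program. This is a minimal demo under stated assumptions, not driver code: the helper name bwi_to_timeout_ms() and the main() harness are invented for clarity, but the arithmetic mirrors the ST21NFCA_BWI_TO_TIMEOUT() macro in the patch (WT = 2^BWI/10 seconds, doubled for safety margin, i.e. (1 << bwi) * 200 msecs).

    #include <stdio.h>

    /*
     * Hypothetical demo helper: for T=1, BWI is encoded in the upper
     * nibble of the first TB byte of the ATR. WT = 2^BWI/10 s, i.e.
     * 2^BWI * 100 ms; the driver doubles this for safety margin.
     */
    static unsigned int bwi_to_timeout_ms(unsigned char tb)
    {
            unsigned char bwi = tb >> 4;    /* extract BWI from TB */

            return (1u << bwi) * 200;       /* 2 * 2^BWI * 100 ms */
    }

    int main(void)
    {
            /* the default BWI of 4 gives a 3200 ms exchange timeout */
            printf("timeout: %u ms\n", bwi_to_timeout_ms(4 << 4));
            return 0;
    }

With the default BWI of 4 this prints 3200 ms, which matches ST21NFCA_BWI_TO_TIMEOUT(ST21NFCA_ATR_DEFAULT_BWI) in the patch; a larger BWI advertised by the eSE doubles the timeout per step.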
Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- drivers/nfc/st21nfca/Makefile | 2 +- drivers/nfc/st21nfca/i2c.c | 17 +- drivers/nfc/st21nfca/st21nfca.c | 134 +++++++++-- drivers/nfc/st21nfca/st21nfca.h | 21 +- drivers/nfc/st21nfca/st21nfca_se.c | 390 +++++++++++++++++++++++++++++++++ drivers/nfc/st21nfca/st21nfca_se.h | 63 ++++++ include/linux/platform_data/st21nfca.h | 2 + 7 files changed, 608 insertions(+), 21 deletions(-) create mode 100644 drivers/nfc/st21nfca/st21nfca_se.c create mode 100644 drivers/nfc/st21nfca/st21nfca_se.h (limited to 'include/linux') diff --git a/drivers/nfc/st21nfca/Makefile b/drivers/nfc/st21nfca/Makefile index 7d688f97aa27..97edab4bbdf8 100644 --- a/drivers/nfc/st21nfca/Makefile +++ b/drivers/nfc/st21nfca/Makefile @@ -2,7 +2,7 @@ # Makefile for ST21NFCA HCI based NFC driver # -st21nfca_hci-objs = st21nfca.o st21nfca_dep.o +st21nfca_hci-objs = st21nfca.o st21nfca_dep.o st21nfca_se.o obj-$(CONFIG_NFC_ST21NFCA) += st21nfca_hci.o st21nfca_i2c-objs = i2c.o diff --git a/drivers/nfc/st21nfca/i2c.c b/drivers/nfc/st21nfca/i2c.c index 82b82dbd2997..a32143951616 100644 --- a/drivers/nfc/st21nfca/i2c.c +++ b/drivers/nfc/st21nfca/i2c.c @@ -74,6 +74,8 @@ struct st21nfca_i2c_phy { unsigned int gpio_ena; unsigned int irq_polarity; + struct st21nfca_se_status se_status; + struct sk_buff *pending_skb; int current_read_len; /* @@ -537,6 +539,11 @@ static int st21nfca_hci_i2c_of_request_resources(struct i2c_client *client) phy->irq_polarity = irq_get_trigger_type(client->irq); + phy->se_status.is_ese_present = + of_property_read_bool(pp, "ese-present"); + phy->se_status.is_uicc_present = + of_property_read_bool(pp, "uicc-present"); + return 0; } #else @@ -571,6 +578,9 @@ static int st21nfca_hci_i2c_request_resources(struct i2c_client *client) } } + phy->se_status.is_ese_present = pdata->is_ese_present; + phy->se_status.is_uicc_present = pdata->is_uicc_present; + return 0; } @@ -638,8 +648,11 @@ static int st21nfca_hci_i2c_probe(struct i2c_client *client, } return st21nfca_hci_probe(phy, &i2c_phy_ops, LLC_SHDLC_NAME, - ST21NFCA_FRAME_HEADROOM, ST21NFCA_FRAME_TAILROOM, - ST21NFCA_HCI_LLC_MAX_PAYLOAD, &phy->hdev); + ST21NFCA_FRAME_HEADROOM, + ST21NFCA_FRAME_TAILROOM, + ST21NFCA_HCI_LLC_MAX_PAYLOAD, + &phy->hdev, + &phy->se_status); } static int st21nfca_hci_i2c_remove(struct i2c_client *client) diff --git a/drivers/nfc/st21nfca/st21nfca.c b/drivers/nfc/st21nfca/st21nfca.c index 7cb7ce47d2af..24d3d240d5f4 100644 --- a/drivers/nfc/st21nfca/st21nfca.c +++ b/drivers/nfc/st21nfca/st21nfca.c @@ -23,6 +23,7 @@ #include "st21nfca.h" #include "st21nfca_dep.h" +#include "st21nfca_se.h" #define DRIVER_DESC "HCI NFC driver for ST21NFCA" @@ -62,7 +63,6 @@ #define ST21NFCA_RF_CARD_F_DATARATE 0x08 #define ST21NFCA_RF_CARD_F_DATARATE_212_424 0x01 -#define ST21NFCA_DEVICE_MGNT_GATE 0x01 #define ST21NFCA_DEVICE_MGNT_PIPE 0x02 #define ST21NFCA_DM_GETINFO 0x13 @@ -78,6 +78,11 @@ #define ST21NFCA_NFC_MODE 0x03 /* NFC_MODE parameter*/ +#define ST21NFCA_EVT_HOT_PLUG 0x03 +#define ST21NFCA_EVT_HOT_PLUG_IS_INHIBITED(x) (x->data[0] & 0x80) + +#define ST21NFCA_SE_TO_PIPES 2000 + static DECLARE_BITMAP(dev_mask, ST21NFCA_NUM_DEVICES); static struct nfc_hci_gate st21nfca_gates[] = { @@ -92,6 +97,10 @@ static struct nfc_hci_gate st21nfca_gates[] = { {ST21NFCA_RF_READER_14443_3_A_GATE, NFC_HCI_INVALID_PIPE}, {ST21NFCA_RF_READER_ISO15693_GATE, NFC_HCI_INVALID_PIPE}, {ST21NFCA_RF_CARD_F_GATE, NFC_HCI_INVALID_PIPE}, + + /* Secure element pipes are created by secure element host */ + 
{ST21NFCA_CONNECTIVITY_GATE, NFC_HCI_DO_NOT_CREATE_PIPE}, + {ST21NFCA_APDU_READER_GATE, NFC_HCI_DO_NOT_CREATE_PIPE}, }; struct st21nfca_pipe_info { @@ -136,7 +145,8 @@ static int st21nfca_hci_load_session(struct nfc_hci_dev *hdev) * Pipe can be closed and need to be open. */ r = nfc_hci_connect_gate(hdev, NFC_HCI_HOST_CONTROLLER_ID, - ST21NFCA_DEVICE_MGNT_GATE, ST21NFCA_DEVICE_MGNT_PIPE); + ST21NFCA_DEVICE_MGNT_GATE, + ST21NFCA_DEVICE_MGNT_PIPE); if (r < 0) goto free_info; @@ -167,17 +177,28 @@ static int st21nfca_hci_load_session(struct nfc_hci_dev *hdev) * - destination gid (1byte) */ info = (struct st21nfca_pipe_info *) skb_pipe_info->data; + if (info->dst_gate_id == ST21NFCA_APDU_READER_GATE && + info->src_host_id != ST21NFCA_ESE_HOST_ID) { + pr_err("Unexpected apdu_reader pipe on host %x\n", + info->src_host_id); + continue; + } + for (j = 0; (j < ARRAY_SIZE(st21nfca_gates)) && - (st21nfca_gates[j].gate != info->dst_gate_id); - j++) + (st21nfca_gates[j].gate != info->dst_gate_id) ; j++) ; if (j < ARRAY_SIZE(st21nfca_gates) && st21nfca_gates[j].gate == info->dst_gate_id && ST21NFCA_DM_IS_PIPE_OPEN(info->pipe_state)) { st21nfca_gates[j].pipe = pipe_info[2]; + hdev->gate2pipe[st21nfca_gates[j].gate] = - st21nfca_gates[j].pipe; + st21nfca_gates[j].pipe; + hdev->pipes[st21nfca_gates[j].pipe].gate = + st21nfca_gates[j].gate; + hdev->pipes[st21nfca_gates[j].pipe].dest_host = + info->src_host_id; } } @@ -187,7 +208,7 @@ static int st21nfca_hci_load_session(struct nfc_hci_dev *hdev) */ if (skb_pipe_list->len + 3 < ARRAY_SIZE(st21nfca_gates)) { for (i = skb_pipe_list->len + 3; - i < ARRAY_SIZE(st21nfca_gates); i++) { + i < ARRAY_SIZE(st21nfca_gates) - 2; i++) { r = nfc_hci_connect_gate(hdev, NFC_HCI_HOST_CONTROLLER_ID, st21nfca_gates[i].gate, @@ -244,16 +265,33 @@ out: static int st21nfca_hci_ready(struct nfc_hci_dev *hdev) { + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); struct sk_buff *skb; u8 param; + u8 white_list[2]; + int wl_size = 0; int r; - param = NFC_HCI_UICC_HOST_ID; - r = nfc_hci_set_param(hdev, NFC_HCI_ADMIN_GATE, - NFC_HCI_ADMIN_WHITELIST, ¶m, 1); - if (r < 0) - return r; + if (info->se_status->is_ese_present && + info->se_status->is_uicc_present) { + white_list[wl_size++] = NFC_HCI_UICC_HOST_ID; + white_list[wl_size++] = ST21NFCA_ESE_HOST_ID; + } else if (!info->se_status->is_ese_present && + info->se_status->is_uicc_present) { + white_list[wl_size++] = NFC_HCI_UICC_HOST_ID; + } else if (info->se_status->is_ese_present && + !info->se_status->is_uicc_present) { + white_list[wl_size++] = ST21NFCA_ESE_HOST_ID; + } + + if (wl_size) { + r = nfc_hci_set_param(hdev, NFC_HCI_ADMIN_GATE, + NFC_HCI_ADMIN_WHITELIST, + (u8 *) &white_list, wl_size); + if (r < 0) + return r; + } /* Set NFC_MODE in device management gate to enable */ r = nfc_hci_get_param(hdev, ST21NFCA_DEVICE_MGNT_GATE, @@ -821,19 +859,79 @@ static int st21nfca_hci_check_presence(struct nfc_hci_dev *hdev, } } +static void st21nfca_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, + struct sk_buff *skb) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + u8 gate = hdev->pipes[pipe].gate; + + pr_debug("cmd: %x\n", cmd); + + switch (cmd) { + case NFC_HCI_ANY_OPEN_PIPE: + if (gate != ST21NFCA_APDU_READER_GATE && + hdev->pipes[pipe].dest_host != NFC_HCI_UICC_HOST_ID) + info->se_info.count_pipes++; + + if (info->se_info.count_pipes == info->se_info.expected_pipes) { + del_timer_sync(&info->se_info.se_active_timer); + info->se_info.se_active = false; + info->se_info.count_pipes = 0; + 
complete(&info->se_info.req_completion); + } + break; + } +} + +static int st21nfca_admin_event_received(struct nfc_hci_dev *hdev, u8 event, + struct sk_buff *skb) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + pr_debug("admin event: %x\n", event); + + switch (event) { + case ST21NFCA_EVT_HOT_PLUG: + if (info->se_info.se_active) { + if (!ST21NFCA_EVT_HOT_PLUG_IS_INHIBITED(skb)) { + del_timer_sync(&info->se_info.se_active_timer); + info->se_info.se_active = false; + complete(&info->se_info.req_completion); + } else { + mod_timer(&info->se_info.se_active_timer, + jiffies + + msecs_to_jiffies(ST21NFCA_SE_TO_PIPES)); + } + } + break; + } + kfree_skb(skb); + return 0; +} + /* * Returns: * <= 0: driver handled the event, skb consumed * 1: driver does not handle the event, please do standard processing */ -static int st21nfca_hci_event_received(struct nfc_hci_dev *hdev, u8 gate, +static int st21nfca_hci_event_received(struct nfc_hci_dev *hdev, u8 pipe, u8 event, struct sk_buff *skb) { + u8 gate = hdev->pipes[pipe].gate; + u8 host = hdev->pipes[pipe].dest_host; + pr_debug("hci event: %d gate: %x\n", event, gate); switch (gate) { + case NFC_HCI_ADMIN_GATE: + return st21nfca_admin_event_received(hdev, event, skb); case ST21NFCA_RF_CARD_F_GATE: return st21nfca_dep_event_received(hdev, event, skb); + case ST21NFCA_CONNECTIVITY_GATE: + return st21nfca_connectivity_event_received(hdev, host, + event, skb); + case ST21NFCA_APDU_READER_GATE: + return st21nfca_apdu_reader_event_received(hdev, event, skb); default: return 1; } @@ -855,11 +953,17 @@ static struct nfc_hci_ops st21nfca_hci_ops = { .tm_send = st21nfca_hci_tm_send, .check_presence = st21nfca_hci_check_presence, .event_received = st21nfca_hci_event_received, + .cmd_received = st21nfca_hci_cmd_received, + .discover_se = st21nfca_hci_discover_se, + .enable_se = st21nfca_hci_enable_se, + .disable_se = st21nfca_hci_disable_se, + .se_io = st21nfca_hci_se_io, }; int st21nfca_hci_probe(void *phy_id, struct nfc_phy_ops *phy_ops, char *llc_name, int phy_headroom, int phy_tailroom, - int phy_payload, struct nfc_hci_dev **hdev) + int phy_payload, struct nfc_hci_dev **hdev, + struct st21nfca_se_status *se_status) { struct st21nfca_hci_info *info; int r = 0; @@ -919,6 +1023,8 @@ int st21nfca_hci_probe(void *phy_id, struct nfc_phy_ops *phy_ops, goto err_alloc_hdev; } + info->se_status = se_status; + nfc_hci_set_clientdata(info->hdev, info); r = nfc_hci_register_device(info->hdev); @@ -927,6 +1033,7 @@ int st21nfca_hci_probe(void *phy_id, struct nfc_phy_ops *phy_ops, *hdev = info->hdev; st21nfca_dep_init(info->hdev); + st21nfca_se_init(info->hdev); return 0; @@ -945,6 +1052,7 @@ void st21nfca_hci_remove(struct nfc_hci_dev *hdev) struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); st21nfca_dep_deinit(hdev); + st21nfca_se_deinit(hdev); nfc_hci_unregister_device(hdev); nfc_hci_free_device(hdev); kfree(info); diff --git a/drivers/nfc/st21nfca/st21nfca.h b/drivers/nfc/st21nfca/st21nfca.h index 7c2a85292230..15a78d330a9f 100644 --- a/drivers/nfc/st21nfca/st21nfca.h +++ b/drivers/nfc/st21nfca/st21nfca.h @@ -20,6 +20,7 @@ #include #include "st21nfca_dep.h" +#include "st21nfca_se.h" #define HCI_MODE 0 @@ -51,9 +52,15 @@ #define ST21NFCA_NUM_DEVICES 256 +struct st21nfca_se_status { + bool is_ese_present; + bool is_uicc_present; +}; + int st21nfca_hci_probe(void *phy_id, struct nfc_phy_ops *phy_ops, char *llc_name, int phy_headroom, int phy_tailroom, - int phy_payload, struct nfc_hci_dev **hdev); + int phy_payload, struct nfc_hci_dev 
**hdev, + struct st21nfca_se_status *se_status); void st21nfca_hci_remove(struct nfc_hci_dev *hdev); enum st21nfca_state { @@ -66,6 +73,7 @@ struct st21nfca_hci_info { void *phy_id; struct nfc_hci_dev *hdev; + struct st21nfca_se_status *se_status; enum st21nfca_state state; @@ -76,13 +84,16 @@ struct st21nfca_hci_info { void *async_cb_context; struct st21nfca_dep_info dep_info; + struct st21nfca_se_info se_info; }; /* Reader RF commands */ -#define ST21NFCA_WR_XCHG_DATA 0x10 - -#define ST21NFCA_RF_READER_F_GATE 0x14 +#define ST21NFCA_WR_XCHG_DATA 0x10 -#define ST21NFCA_RF_CARD_F_GATE 0x24 +#define ST21NFCA_DEVICE_MGNT_GATE 0x01 +#define ST21NFCA_RF_READER_F_GATE 0x14 +#define ST21NFCA_RF_CARD_F_GATE 0x24 +#define ST21NFCA_APDU_READER_GATE 0xf0 +#define ST21NFCA_CONNECTIVITY_GATE 0x41 #endif /* __LOCAL_ST21NFCA_H_ */ diff --git a/drivers/nfc/st21nfca/st21nfca_se.c b/drivers/nfc/st21nfca/st21nfca_se.c new file mode 100644 index 000000000000..9b93d3904ab5 --- /dev/null +++ b/drivers/nfc/st21nfca/st21nfca_se.c @@ -0,0 +1,390 @@ +/* + * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include + +#include "st21nfca.h" +#include "st21nfca_se.h" + +#define ST21NFCA_EVT_UICC_ACTIVATE 0x10 +#define ST21NFCA_EVT_UICC_DEACTIVATE 0x13 +#define ST21NFCA_EVT_SE_HARD_RESET 0x20 +#define ST21NFCA_EVT_SE_SOFT_RESET 0x11 +#define ST21NFCA_EVT_SE_END_OF_APDU_TRANSFER 0x21 +#define ST21NFCA_EVT_SE_ACTIVATE 0x22 +#define ST21NFCA_EVT_SE_DEACTIVATE 0x23 + +#define ST21NFCA_EVT_TRANSMIT_DATA 0x10 +#define ST21NFCA_EVT_WTX_REQUEST 0x11 + +#define ST21NFCA_EVT_CONNECTIVITY 0x10 +#define ST21NFCA_EVT_TRANSACTION 0x12 + +#define ST21NFCA_ESE_HOST_ID 0xc0 + +#define ST21NFCA_SE_TO_HOT_PLUG 1000 +/* Connectivity pipe only */ +#define ST21NFCA_SE_COUNT_PIPE_UICC 0x01 +/* Connectivity + APDU Reader pipe */ +#define ST21NFCA_SE_COUNT_PIPE_EMBEDDED 0x02 + +#define ST21NFCA_SE_MODE_OFF 0x00 +#define ST21NFCA_SE_MODE_ON 0x01 + +#define ST21NFCA_PARAM_ATR 0x01 +#define ST21NFCA_ATR_DEFAULT_BWI 0x04 + +/* + * WT = 2^BWI/10[s], convert into msecs and add a secure + * room by increasing by 2 this timeout + */ +#define ST21NFCA_BWI_TO_TIMEOUT(x) ((1 << x) * 200) +#define ST21NFCA_ATR_GET_Y_FROM_TD(x) (x >> 4) + +/* If TA is present bit 0 is set */ +#define ST21NFCA_ATR_TA_PRESENT(x) (x & 0x01) +/* If TB is present bit 1 is set */ +#define ST21NFCA_ATR_TB_PRESENT(x) (x & 0x02) + +static u8 st21nfca_se_get_bwi(struct nfc_hci_dev *hdev) +{ + int i; + u8 td; + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + /* Bits 8 to 5 of the first TB for T=1 encode BWI from zero to nine */ + for (i = 1; i < ST21NFCA_ESE_MAX_LENGTH; i++) { + td = ST21NFCA_ATR_GET_Y_FROM_TD(info->se_info.atr[i]); + if (ST21NFCA_ATR_TA_PRESENT(td)) + i++; + if (ST21NFCA_ATR_TB_PRESENT(td)) { + i++; + return info->se_info.atr[i] >> 4; + } + } + return ST21NFCA_ATR_DEFAULT_BWI; +} + +static void st21nfca_se_get_atr(struct nfc_hci_dev *hdev) +{ + int r; + 
struct sk_buff *skb; + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + r = nfc_hci_get_param(hdev, ST21NFCA_APDU_READER_GATE, + ST21NFCA_PARAM_ATR, &skb); + if (r < 0) + return; + + if (skb->len <= ST21NFCA_ESE_MAX_LENGTH) { + memcpy(info->se_info.atr, skb->data, skb->len); + info->se_info.wt_timeout = + ST21NFCA_BWI_TO_TIMEOUT(st21nfca_se_get_bwi(hdev)); + } + kfree_skb(skb); +} + +static int st21nfca_hci_control_se(struct nfc_hci_dev *hdev, u32 se_idx, + u8 state) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + int r; + struct sk_buff *sk_host_list; + u8 se_event, host_id; + + switch (se_idx) { + case NFC_HCI_UICC_HOST_ID: + se_event = (state == ST21NFCA_SE_MODE_ON ? + ST21NFCA_EVT_UICC_ACTIVATE : + ST21NFCA_EVT_UICC_DEACTIVATE); + + info->se_info.count_pipes = 0; + info->se_info.expected_pipes = ST21NFCA_SE_COUNT_PIPE_UICC; + break; + case ST21NFCA_ESE_HOST_ID: + se_event = (state == ST21NFCA_SE_MODE_ON ? + ST21NFCA_EVT_SE_ACTIVATE : + ST21NFCA_EVT_SE_DEACTIVATE); + + info->se_info.count_pipes = 0; + info->se_info.expected_pipes = ST21NFCA_SE_COUNT_PIPE_EMBEDDED; + break; + default: + return -EINVAL; + } + + /* + * Wait for an EVT_HOT_PLUG in order to + * retrieve a relevant host list. + */ + reinit_completion(&info->se_info.req_completion); + r = nfc_hci_send_event(hdev, ST21NFCA_DEVICE_MGNT_GATE, se_event, + NULL, 0); + if (r < 0) + return r; + + mod_timer(&info->se_info.se_active_timer, jiffies + + msecs_to_jiffies(ST21NFCA_SE_TO_HOT_PLUG)); + info->se_info.se_active = true; + + /* Ignore return value and check in any case the host_list */ + wait_for_completion_interruptible(&info->se_info.req_completion); + + r = nfc_hci_get_param(hdev, NFC_HCI_ADMIN_GATE, + NFC_HCI_ADMIN_HOST_LIST, + &sk_host_list); + if (r < 0) + return r; + + host_id = sk_host_list->data[sk_host_list->len - 1]; + kfree_skb(sk_host_list); + + if (state == ST21NFCA_SE_MODE_ON && host_id == se_idx) + return se_idx; + else if (state == ST21NFCA_SE_MODE_OFF && host_id != se_idx) + return se_idx; + + return -1; +} + +int st21nfca_hci_discover_se(struct nfc_hci_dev *hdev) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + int se_count = 0; + + if (info->se_status->is_uicc_present) { + nfc_add_se(hdev->ndev, NFC_HCI_UICC_HOST_ID, NFC_SE_UICC); + se_count++; + } + + if (info->se_status->is_ese_present) { + nfc_add_se(hdev->ndev, ST21NFCA_ESE_HOST_ID, NFC_SE_EMBEDDED); + se_count++; + } + + return !se_count; +} +EXPORT_SYMBOL(st21nfca_hci_discover_se); + +int st21nfca_hci_enable_se(struct nfc_hci_dev *hdev, u32 se_idx) +{ + int r; + + /* + * According to upper layer, se_idx == NFC_SE_UICC when + * info->se_status->is_uicc_enable is true should never happen. + * Same for eSE. + */ + r = st21nfca_hci_control_se(hdev, se_idx, ST21NFCA_SE_MODE_ON); + + if (r == ST21NFCA_ESE_HOST_ID) { + st21nfca_se_get_atr(hdev); + r = nfc_hci_send_event(hdev, ST21NFCA_APDU_READER_GATE, + ST21NFCA_EVT_SE_SOFT_RESET, NULL, 0); + if (r < 0) + return r; + } else if (r < 0) { + /* + * The activation tentative failed, the secure element + * is not connected. Remove from the list. + */ + nfc_remove_se(hdev->ndev, se_idx); + return r; + } + + return 0; +} +EXPORT_SYMBOL(st21nfca_hci_enable_se); + +int st21nfca_hci_disable_se(struct nfc_hci_dev *hdev, u32 se_idx) +{ + int r; + + /* + * According to upper layer, se_idx == NFC_SE_UICC when + * info->se_status->is_uicc_enable is true should never happen + * Same for eSE. 
+ */ + r = st21nfca_hci_control_se(hdev, se_idx, ST21NFCA_SE_MODE_OFF); + if (r < 0) + return r; + + return 0; +} +EXPORT_SYMBOL(st21nfca_hci_disable_se); + +int st21nfca_hci_se_io(struct nfc_hci_dev *hdev, u32 se_idx, + u8 *apdu, size_t apdu_length, + se_io_cb_t cb, void *cb_context) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + pr_debug("se_io %x\n", se_idx); + + switch (se_idx) { + case ST21NFCA_ESE_HOST_ID: + info->se_info.cb = cb; + info->se_info.cb_context = cb_context; + mod_timer(&info->se_info.bwi_timer, jiffies + + msecs_to_jiffies(info->se_info.wt_timeout)); + info->se_info.bwi_active = true; + return nfc_hci_send_event(hdev, ST21NFCA_APDU_READER_GATE, + ST21NFCA_EVT_TRANSMIT_DATA, + apdu, apdu_length); + default: + return -ENODEV; + } +} +EXPORT_SYMBOL(st21nfca_hci_se_io); + +static void st21nfca_se_wt_timeout(unsigned long data) +{ + /* + * No answer from the secure element + * within the defined timeout. + * Let's send a reset request as recovery procedure. + * According to the situation, we first try to send a software reset + * to the secure element. If the next command is still not + * answering in time, we send to the CLF a secure element hardware + * reset request. + */ + /* hardware reset managed through VCC_UICC_OUT power supply */ + u8 param = 0x01; + struct st21nfca_hci_info *info = (struct st21nfca_hci_info *) data; + + pr_debug("\n"); + + info->se_info.bwi_active = false; + + if (!info->se_info.xch_error) { + info->se_info.xch_error = true; + nfc_hci_send_event(info->hdev, ST21NFCA_APDU_READER_GATE, + ST21NFCA_EVT_SE_SOFT_RESET, NULL, 0); + } else { + info->se_info.xch_error = false; + nfc_hci_send_event(info->hdev, ST21NFCA_DEVICE_MGNT_GATE, + ST21NFCA_EVT_SE_HARD_RESET, ¶m, 1); + } + info->se_info.cb(info->se_info.cb_context, NULL, 0, -ETIME); +} + +static void st21nfca_se_activation_timeout(unsigned long data) +{ + struct st21nfca_hci_info *info = (struct st21nfca_hci_info *) data; + + pr_debug("\n"); + + info->se_info.se_active = false; + + complete(&info->se_info.req_completion); +} + +/* + * Returns: + * <= 0: driver handled the event, skb consumed + * 1: driver does not handle the event, please do standard processing + */ +int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, + u8 event, struct sk_buff *skb) +{ + int r = 0; + + pr_debug("connectivity gate event: %x\n", event); + + switch (event) { + case ST21NFCA_EVT_CONNECTIVITY: + break; + case ST21NFCA_EVT_TRANSACTION: + break; + default: + return 1; + } + kfree_skb(skb); + return r; +} +EXPORT_SYMBOL(st21nfca_connectivity_event_received); + +int st21nfca_apdu_reader_event_received(struct nfc_hci_dev *hdev, + u8 event, struct sk_buff *skb) +{ + int r = 0; + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + pr_debug("apdu reader gate event: %x\n", event); + + switch (event) { + case ST21NFCA_EVT_TRANSMIT_DATA: + del_timer_sync(&info->se_info.bwi_timer); + info->se_info.bwi_active = false; + r = nfc_hci_send_event(hdev, ST21NFCA_DEVICE_MGNT_GATE, + ST21NFCA_EVT_SE_END_OF_APDU_TRANSFER, NULL, 0); + if (r < 0) + goto exit; + + info->se_info.cb(info->se_info.cb_context, + skb->data, skb->len, 0); + break; + case ST21NFCA_EVT_WTX_REQUEST: + mod_timer(&info->se_info.bwi_timer, jiffies + + msecs_to_jiffies(info->se_info.wt_timeout)); + break; + } + +exit: + kfree_skb(skb); + return r; +} +EXPORT_SYMBOL(st21nfca_apdu_reader_event_received); + +void st21nfca_se_init(struct nfc_hci_dev *hdev) +{ + struct st21nfca_hci_info *info = 
nfc_hci_get_clientdata(hdev); + + init_completion(&info->se_info.req_completion); + /* initialize timers */ + init_timer(&info->se_info.bwi_timer); + info->se_info.bwi_timer.data = (unsigned long)info; + info->se_info.bwi_timer.function = st21nfca_se_wt_timeout; + info->se_info.bwi_active = false; + + init_timer(&info->se_info.se_active_timer); + info->se_info.se_active_timer.data = (unsigned long)info; + info->se_info.se_active_timer.function = st21nfca_se_activation_timeout; + info->se_info.se_active = false; + + info->se_info.count_pipes = 0; + info->se_info.expected_pipes = 0; + + info->se_info.xch_error = false; + + info->se_info.wt_timeout = + ST21NFCA_BWI_TO_TIMEOUT(ST21NFCA_ATR_DEFAULT_BWI); +} +EXPORT_SYMBOL(st21nfca_se_init); + +void st21nfca_se_deinit(struct nfc_hci_dev *hdev) +{ + struct st21nfca_hci_info *info = nfc_hci_get_clientdata(hdev); + + if (info->se_info.bwi_active) + del_timer_sync(&info->se_info.bwi_timer); + if (info->se_info.se_active) + del_timer_sync(&info->se_info.se_active_timer); + + info->se_info.bwi_active = false; + info->se_info.se_active = false; +} +EXPORT_SYMBOL(st21nfca_se_deinit); diff --git a/drivers/nfc/st21nfca/st21nfca_se.h b/drivers/nfc/st21nfca/st21nfca_se.h new file mode 100644 index 000000000000..b172cfcaeb90 --- /dev/null +++ b/drivers/nfc/st21nfca/st21nfca_se.h @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef __ST21NFCA_SE_H +#define __ST21NFCA_SE_H + +#include +#include + +/* + * ref ISO7816-3 chap 8.1. the initial character TS is followed by a + * sequence of at most 32 characters. 
+ */ +#define ST21NFCA_ESE_MAX_LENGTH 33 +#define ST21NFCA_ESE_HOST_ID 0xc0 + +struct st21nfca_se_info { + u8 atr[ST21NFCA_ESE_MAX_LENGTH]; + struct completion req_completion; + + struct timer_list bwi_timer; + int wt_timeout; /* in msecs */ + bool bwi_active; + + struct timer_list se_active_timer; + bool se_active; + int expected_pipes; + int count_pipes; + + bool xch_error; + + se_io_cb_t cb; + void *cb_context; +}; + +int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, + u8 event, struct sk_buff *skb); +int st21nfca_apdu_reader_event_received(struct nfc_hci_dev *hdev, + u8 event, struct sk_buff *skb); + +int st21nfca_hci_discover_se(struct nfc_hci_dev *hdev); +int st21nfca_hci_enable_se(struct nfc_hci_dev *hdev, u32 se_idx); +int st21nfca_hci_disable_se(struct nfc_hci_dev *hdev, u32 se_idx); +int st21nfca_hci_se_io(struct nfc_hci_dev *hdev, u32 se_idx, + u8 *apdu, size_t apdu_length, + se_io_cb_t cb, void *cb_context); + +void st21nfca_se_init(struct nfc_hci_dev *hdev); +void st21nfca_se_deinit(struct nfc_hci_dev *hdev); +#endif /* __ST21NFCA_SE_H */ diff --git a/include/linux/platform_data/st21nfca.h b/include/linux/platform_data/st21nfca.h index 5087fff96d86..cc2bdafb0c69 100644 --- a/include/linux/platform_data/st21nfca.h +++ b/include/linux/platform_data/st21nfca.h @@ -26,6 +26,8 @@ struct st21nfca_nfc_platform_data { unsigned int gpio_ena; unsigned int irq_polarity; + bool is_ese_present; + bool is_uicc_present; }; #endif /* _ST21NFCA_HCI_H_ */ -- cgit v1.2.3 From be6a6b43b597a37d96dbf74985f72045ccef0940 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 27 Jan 2015 15:57:59 +0200 Subject: net/mlx4_core: Add bad-cable event support If the firmware can detect a bad cable, allow it to generate an event, and print the problem in the log. Signed-off-by: Jack Morgenstein Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/eq.c | 22 ++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx4/fw.c | 9 ++++++++- include/linux/mlx4/device.h | 14 +++++++++++++- 3 files changed, 43 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index 2f2e6067426d..4df006d8afa4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -88,6 +88,8 @@ static u64 get_async_ev_mask(struct mlx4_dev *dev) u64 async_ev_mask = MLX4_ASYNC_EVENT_MASK; if (dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV) async_ev_mask |= (1ull << MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT); + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT) + async_ev_mask |= (1ull << MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT); return async_ev_mask; } @@ -736,6 +738,26 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) (unsigned long) eqe); break; + case MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT: + switch (eqe->subtype) { + case MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_BAD_CABLE: + mlx4_warn(dev, "Bad cable detected on port %u\n", + eqe->event.bad_cable.port); + break; + case MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_UNSUPPORTED_CABLE: + mlx4_warn(dev, "Unsupported cable detected\n"); + break; + default: + mlx4_dbg(dev, + "Unhandled recoverable error event detected: %02x(%02x) on EQ %d at index %u. owner=%x, nent=0x%x, ownership=%s\n", + eqe->type, eqe->subtype, eq->eqn, + eq->cons_index, eqe->owner, eq->nent, + !!(eqe->owner & 0x80) ^ + !!(eq->cons_index & eq->nent) ? 
"HW" : "SW"); + break; + } + break; + case MLX4_EVENT_TYPE_EEC_CATAS_ERROR: case MLX4_EVENT_TYPE_ECC_DETECT: default: diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 982861d1df44..2eadc2882e4f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -145,7 +145,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [16] = "CONFIG DEV support", [17] = "Asymmetric EQs support", [18] = "More than 80 VFs support", - [19] = "Performance optimized for limited rule configuration flow steering support" + [19] = "Performance optimized for limited rule configuration flow steering support", + [20] = "Recoverable error events support" }; int i; @@ -859,6 +860,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETH_BACKPL_OFFSET); if (field32 & (1 << 0)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ETH_BACKPL_AN_REP; + if (field32 & (1 << 7)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT; MLX4_GET(field, outbox, QUERY_DEV_CAP_FW_REASSIGN_MAC); if (field & 1<<6) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_REASSIGN_MAC_EN; @@ -1562,6 +1565,7 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) #define INIT_HCA_VXLAN_OFFSET 0x0c #define INIT_HCA_CACHELINE_SZ_OFFSET 0x0e #define INIT_HCA_FLAGS_OFFSET 0x014 +#define INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET 0x018 #define INIT_HCA_QPC_OFFSET 0x020 #define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10) #define INIT_HCA_LOG_QP_OFFSET (INIT_HCA_QPC_OFFSET + 0x17) @@ -1668,6 +1672,9 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) dev->caps.userspace_caps |= MLX4_USER_DEV_CAP_LARGE_CQE; } + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT) + *(inbox + INIT_HCA_RECOVERABLE_ERROR_EVENT_OFFSET / 4) |= cpu_to_be32(1 << 31); + /* QPC/EEC/CQC/EQC/RDMARC attributes */ MLX4_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 5ef54e145e4d..c95d659a39f2 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -200,7 +200,8 @@ enum { MLX4_DEV_CAP_FLAG2_CONFIG_DEV = 1LL << 16, MLX4_DEV_CAP_FLAG2_SYS_EQS = 1LL << 17, MLX4_DEV_CAP_FLAG2_80_VFS = 1LL << 18, - MLX4_DEV_CAP_FLAG2_FS_A0 = 1LL << 19 + MLX4_DEV_CAP_FLAG2_FS_A0 = 1LL << 19, + MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT = 1LL << 20 }; enum { @@ -280,6 +281,7 @@ enum mlx4_event { MLX4_EVENT_TYPE_FATAL_WARNING = 0x1b, MLX4_EVENT_TYPE_FLR_EVENT = 0x1c, MLX4_EVENT_TYPE_PORT_MNG_CHG_EVENT = 0x1d, + MLX4_EVENT_TYPE_RECOVERABLE_ERROR_EVENT = 0x3e, MLX4_EVENT_TYPE_NONE = 0xff, }; @@ -288,6 +290,11 @@ enum { MLX4_PORT_CHANGE_SUBTYPE_ACTIVE = 4 }; +enum { + MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_BAD_CABLE = 1, + MLX4_RECOVERABLE_ERROR_EVENT_SUBTYPE_UNSUPPORTED_CABLE = 2, +}; + enum { MLX4_FATAL_WARNING_SUBTYPE_WARMING = 0, }; @@ -860,6 +867,11 @@ struct mlx4_eqe { } __packed tbl_change_info; } params; } __packed port_mgmt_change; + struct { + u8 reserved[3]; + u8 port; + u32 reserved1[5]; + } __packed bad_cable; } event; u8 slave_id; u8 reserved3[2]; -- cgit v1.2.3 From 5a03108689c6f3e448a920b42af04e6d28401f80 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 27 Jan 2015 15:58:02 +0200 Subject: net/mlx4_core: Adjust command timeouts to conform to the firmware spec The firmware spec states that the timeout for all commands should be 60 seconds. 
In the past, the spec indicated that there were several classes of timeout (short, medium, and long). The driver has these different timeout classes. We leave the class differentiation in the driver as-is (to protect against any future spec changes), but set the timeout for all classes to be 60 seconds. In addition, we fix a few commands which had hard-coded numeric timeouts specified. Signed-off-by: Jack Morgenstein Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/fw.c | 19 ++++++++++--------- drivers/net/ethernet/mellanox/mlx4/mr.c | 4 ++-- include/linux/mlx4/cmd.h | 6 +++--- 3 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 2aa7c232d0b6..0c90d1072e47 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -1774,8 +1774,8 @@ int mlx4_INIT_HCA(struct mlx4_dev *dev, struct mlx4_init_hca_param *param) MLX4_PUT(inbox, parser_params, INIT_HCA_VXLAN_OFFSET); } - err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, 10000, - MLX4_CMD_NATIVE); + err = mlx4_cmd(dev, mailbox->dma, 0, 0, MLX4_CMD_INIT_HCA, + MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); if (err) mlx4_err(dev, "INIT_HCA returns %d\n", err); @@ -2029,7 +2029,7 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, if (dev->caps.port_mask[port] != MLX4_PORT_TYPE_IB) { if (priv->mfunc.master.init_port_ref[port] == 1) { err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, - 1000, MLX4_CMD_NATIVE); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) return err; } @@ -2040,7 +2040,7 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, if (!priv->mfunc.master.qp0_state[port].qp0_active && priv->mfunc.master.qp0_state[port].port_active) { err = mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, - 1000, MLX4_CMD_NATIVE); + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) return err; priv->mfunc.master.slave_state[slave].init_port_mask &= ~(1 << port); @@ -2055,15 +2055,15 @@ int mlx4_CLOSE_PORT_wrapper(struct mlx4_dev *dev, int slave, int mlx4_CLOSE_PORT(struct mlx4_dev *dev, int port) { - return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, 1000, - MLX4_CMD_WRAPPED); + return mlx4_cmd(dev, 0, port, 0, MLX4_CMD_CLOSE_PORT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); } EXPORT_SYMBOL_GPL(mlx4_CLOSE_PORT); int mlx4_CLOSE_HCA(struct mlx4_dev *dev, int panic) { - return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, 1000, - MLX4_CMD_NATIVE); + return mlx4_cmd(dev, 0, 0, panic, MLX4_CMD_CLOSE_HCA, + MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); } struct mlx4_config_dev { @@ -2202,7 +2202,8 @@ int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages) int mlx4_NOP(struct mlx4_dev *dev) { /* Input modifier of 0x1f means "finish as soon as possible." 
*/ - return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, 100, MLX4_CMD_NATIVE); + return mlx4_cmd(dev, 0, 0x1f, 0, MLX4_CMD_NOP, MLX4_CMD_TIME_CLASS_A, + MLX4_CMD_NATIVE); } int mlx4_get_phys_port_id(struct mlx4_dev *dev) diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index 8dbdf1d29357..d21e884a0838 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -1155,7 +1155,7 @@ EXPORT_SYMBOL_GPL(mlx4_fmr_free); int mlx4_SYNC_TPT(struct mlx4_dev *dev) { - return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, 1000, - MLX4_CMD_NATIVE); + return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, + MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); } EXPORT_SYMBOL_GPL(mlx4_SYNC_TPT); diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index c989442ffc6a..ae95adc78509 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -165,9 +165,9 @@ enum { }; enum { - MLX4_CMD_TIME_CLASS_A = 10000, - MLX4_CMD_TIME_CLASS_B = 10000, - MLX4_CMD_TIME_CLASS_C = 10000, + MLX4_CMD_TIME_CLASS_A = 60000, + MLX4_CMD_TIME_CLASS_B = 60000, + MLX4_CMD_TIME_CLASS_C = 60000, }; enum { -- cgit v1.2.3 From 14bf61ffe6ac54afcd1e888a4407fe16054483db Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 9 Oct 2014 16:03:13 +0200 Subject: quota: Switch ->get_dqblk() and ->set_dqblk() to use bytes as space units Currently ->get_dqblk() and ->set_dqblk() use struct fs_disk_quota which tracks space limits and usage in 512-byte blocks. However VFS quotas track usage in bytes (as some filesystems require that) and we need to somehow pass this information. Up to now it wasn't a problem because we didn't do any unit conversion (thus VFS quota routines happily stuck the number of bytes into the d_bcount field of struct fs_disk_quota). Only if you tried to use Q_XGETQUOTA or Q_XSETQLIM for VFS quotas (or Q_GETQUOTA / Q_SETQUOTA for XFS quotas) did you get bogus results. Hardly anyone tried this, but reportedly some Samba users hit the problem in practice. So if we want the interfaces to be compatible, we need to fix this. We bite the bullet and define another quota structure used for passing information from/to ->get_dqblk()/->set_dqblk(). It's somewhat sad that we need more conversion routines in fs/quota/quota.c, and the additional copying of the quota structure slows down retrieval of quota information by about 2%, but it seems cleaner than overloading e.g. the units of d_bcount to bytes.
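As an aside for readers (not part of the patch), the conversions being centralized are simple shifts. The standalone userspace sketch below mirrors the quota_bbtob()/quota_btobb() helpers that the patch adds to fs/quota/quota.c; the sample numbers are made up for illustration:

#include <stdio.h>
#include <stdint.h>

#define XFS_BB_SHIFT 9	/* fs_disk_quota counts 512-byte basic blocks */

/* basic blocks -> bytes, mirroring quota_bbtob() from the patch */
static uint64_t bbtob(uint64_t blocks)
{
	return blocks << XFS_BB_SHIFT;
}

/* bytes -> basic blocks, rounded up, mirroring quota_btobb() */
static uint64_t btobb(uint64_t bytes)
{
	return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT;
}

int main(void)
{
	uint64_t used_bytes = 1536 * 1024;	/* 1.5 MiB of usage */

	/* Correct: convert to blocks before filling a block-based field. */
	printf("d_bcount = %llu blocks (%llu bytes)\n",
	       (unsigned long long)btobb(used_bytes),
	       (unsigned long long)bbtob(btobb(used_bytes)));

	/* The old bug: bytes stuffed straight into a block-based field
	 * look 512 times too large to Q_XGETQUOTA callers. */
	printf("bogus d_bcount = %llu\n", (unsigned long long)used_bytes);
	return 0;
}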
CC: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/gfs2/quota.c | 49 +++++++------- fs/quota/dquot.c | 83 ++++++++++++------------ fs/quota/quota.c | 162 +++++++++++++++++++++++++++++++++++++++-------- fs/xfs/xfs_qm.h | 4 +- fs/xfs/xfs_qm_syscalls.c | 156 +++++++++++++++++++-------------------------- fs/xfs/xfs_quotaops.c | 8 +-- include/linux/quota.h | 47 +++++++++++++- include/linux/quotaops.h | 4 +- 8 files changed, 318 insertions(+), 195 deletions(-) (limited to 'include/linux') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c8b148bbdc8b..3e193cb36996 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -667,7 +667,7 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, s64 change, struct gfs2_quota_data *qd, - struct fs_disk_quota *fdq) + struct qc_dqblk *fdq) { struct inode *inode = &ip->i_inode; struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -697,16 +697,16 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, be64_add_cpu(&q.qu_value, change); qd->qd_qb.qb_value = q.qu_value; if (fdq) { - if (fdq->d_fieldmask & FS_DQ_BSOFT) { - q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); + if (fdq->d_fieldmask & QC_SPC_SOFT) { + q.qu_warn = cpu_to_be64(fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_warn = q.qu_warn; } - if (fdq->d_fieldmask & FS_DQ_BHARD) { - q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); + if (fdq->d_fieldmask & QC_SPC_HARD) { + q.qu_limit = cpu_to_be64(fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_limit = q.qu_limit; } - if (fdq->d_fieldmask & FS_DQ_BCOUNT) { - q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); + if (fdq->d_fieldmask & QC_SPACE) { + q.qu_value = cpu_to_be64(fdq->d_space >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_value = q.qu_value; } } @@ -1497,7 +1497,7 @@ static int gfs2_quota_get_xstate(struct super_block *sb, } static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, - struct fs_disk_quota *fdq) + struct qc_dqblk *fdq) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_lvb *qlvb; @@ -1505,7 +1505,7 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, struct gfs2_holder q_gh; int error; - memset(fdq, 0, sizeof(struct fs_disk_quota)); + memset(fdq, 0, sizeof(*fdq)); if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return -ESRCH; /* Crazy XFS error code */ @@ -1522,12 +1522,9 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, goto out; qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; - fdq->d_version = FS_DQUOT_VERSION; - fdq->d_flags = (qid.type == USRQUOTA) ? 
FS_USER_QUOTA : FS_GROUP_QUOTA; - fdq->d_id = from_kqid_munged(current_user_ns(), qid); - fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift; - fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift; - fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift; + fdq->d_spc_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_sb.sb_bsize_shift; + fdq->d_spc_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_sb.sb_bsize_shift; + fdq->d_space = be64_to_cpu(qlvb->qb_value) << sdp->sd_sb.sb_bsize_shift; gfs2_glock_dq_uninit(&q_gh); out: @@ -1536,10 +1533,10 @@ out: } /* GFS2 only supports a subset of the XFS fields */ -#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT) +#define GFS2_FIELDMASK (QC_SPC_SOFT|QC_SPC_HARD|QC_SPACE) static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, - struct fs_disk_quota *fdq) + struct qc_dqblk *fdq) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); @@ -1583,17 +1580,17 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, goto out_i; /* If nothing has changed, this is a no-op */ - if ((fdq->d_fieldmask & FS_DQ_BSOFT) && - ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) - fdq->d_fieldmask ^= FS_DQ_BSOFT; + if ((fdq->d_fieldmask & QC_SPC_SOFT) && + ((fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) + fdq->d_fieldmask ^= QC_SPC_SOFT; - if ((fdq->d_fieldmask & FS_DQ_BHARD) && - ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) - fdq->d_fieldmask ^= FS_DQ_BHARD; + if ((fdq->d_fieldmask & QC_SPC_HARD) && + ((fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) + fdq->d_fieldmask ^= QC_SPC_HARD; - if ((fdq->d_fieldmask & FS_DQ_BCOUNT) && - ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value))) - fdq->d_fieldmask ^= FS_DQ_BCOUNT; + if ((fdq->d_fieldmask & QC_SPACE) && + ((fdq->d_space >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_value))) + fdq->d_fieldmask ^= QC_SPACE; if (fdq->d_fieldmask == 0) goto out_i; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 8f0acef3d184..69df5b239844 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2396,30 +2396,25 @@ static inline qsize_t stoqb(qsize_t space) } /* Generic routine for getting common part of quota structure */ -static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) +static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; memset(di, 0, sizeof(*di)); - di->d_version = FS_DQUOT_VERSION; - di->d_flags = dquot->dq_id.type == USRQUOTA ? 
- FS_USER_QUOTA : FS_GROUP_QUOTA; - di->d_id = from_kqid_munged(current_user_ns(), dquot->dq_id); - spin_lock(&dq_data_lock); - di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit); - di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit); + di->d_spc_hardlimit = dm->dqb_bhardlimit; + di->d_spc_softlimit = dm->dqb_bsoftlimit; di->d_ino_hardlimit = dm->dqb_ihardlimit; di->d_ino_softlimit = dm->dqb_isoftlimit; - di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace; - di->d_icount = dm->dqb_curinodes; - di->d_btimer = dm->dqb_btime; - di->d_itimer = dm->dqb_itime; + di->d_space = dm->dqb_curspace + dm->dqb_rsvspace; + di->d_ino_count = dm->dqb_curinodes; + di->d_spc_timer = dm->dqb_btime; + di->d_ino_timer = dm->dqb_itime; spin_unlock(&dq_data_lock); } int dquot_get_dqblk(struct super_block *sb, struct kqid qid, - struct fs_disk_quota *di) + struct qc_dqblk *di) { struct dquot *dquot; @@ -2433,70 +2428,70 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid, } EXPORT_SYMBOL(dquot_get_dqblk); -#define VFS_FS_DQ_MASK \ - (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \ - FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \ - FS_DQ_BTIMER | FS_DQ_ITIMER) +#define VFS_QC_MASK \ + (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \ + QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \ + QC_SPC_TIMER | QC_INO_TIMER) /* Generic routine for setting common part of quota structure */ -static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) +static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; int check_blim = 0, check_ilim = 0; struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; - if (di->d_fieldmask & ~VFS_FS_DQ_MASK) + if (di->d_fieldmask & ~VFS_QC_MASK) return -EINVAL; - if (((di->d_fieldmask & FS_DQ_BSOFT) && - (di->d_blk_softlimit > dqi->dqi_maxblimit)) || - ((di->d_fieldmask & FS_DQ_BHARD) && - (di->d_blk_hardlimit > dqi->dqi_maxblimit)) || - ((di->d_fieldmask & FS_DQ_ISOFT) && + if (((di->d_fieldmask & QC_SPC_SOFT) && + stoqb(di->d_spc_softlimit) > dqi->dqi_maxblimit) || + ((di->d_fieldmask & QC_SPC_HARD) && + stoqb(di->d_spc_hardlimit) > dqi->dqi_maxblimit) || + ((di->d_fieldmask & QC_INO_SOFT) && (di->d_ino_softlimit > dqi->dqi_maxilimit)) || - ((di->d_fieldmask & FS_DQ_IHARD) && + ((di->d_fieldmask & QC_INO_HARD) && (di->d_ino_hardlimit > dqi->dqi_maxilimit))) return -ERANGE; spin_lock(&dq_data_lock); - if (di->d_fieldmask & FS_DQ_BCOUNT) { - dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace; + if (di->d_fieldmask & QC_SPACE) { + dm->dqb_curspace = di->d_space - dm->dqb_rsvspace; check_blim = 1; set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_BSOFT) - dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit); - if (di->d_fieldmask & FS_DQ_BHARD) - dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit); - if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) { + if (di->d_fieldmask & QC_SPC_SOFT) + dm->dqb_bsoftlimit = di->d_spc_softlimit; + if (di->d_fieldmask & QC_SPC_HARD) + dm->dqb_bhardlimit = di->d_spc_hardlimit; + if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) { check_blim = 1; set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_ICOUNT) { - dm->dqb_curinodes = di->d_icount; + if (di->d_fieldmask & QC_INO_COUNT) { + dm->dqb_curinodes = di->d_ino_count; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_ISOFT) + if (di->d_fieldmask & QC_INO_SOFT) dm->dqb_isoftlimit = di->d_ino_softlimit; - if (di->d_fieldmask & 
FS_DQ_IHARD) + if (di->d_fieldmask & QC_INO_HARD) dm->dqb_ihardlimit = di->d_ino_hardlimit; - if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) { + if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) { check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_BTIMER) { - dm->dqb_btime = di->d_btimer; + if (di->d_fieldmask & QC_SPC_TIMER) { + dm->dqb_btime = di->d_spc_timer; check_blim = 1; set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); } - if (di->d_fieldmask & FS_DQ_ITIMER) { - dm->dqb_itime = di->d_itimer; + if (di->d_fieldmask & QC_INO_TIMER) { + dm->dqb_itime = di->d_ino_timer; check_ilim = 1; set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); } @@ -2506,7 +2501,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) dm->dqb_curspace < dm->dqb_bsoftlimit) { dm->dqb_btime = 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); - } else if (!(di->d_fieldmask & FS_DQ_BTIMER)) + } else if (!(di->d_fieldmask & QC_SPC_TIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; } @@ -2515,7 +2510,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) dm->dqb_curinodes < dm->dqb_isoftlimit) { dm->dqb_itime = 0; clear_bit(DQ_INODES_B, &dquot->dq_flags); - } else if (!(di->d_fieldmask & FS_DQ_ITIMER)) + } else if (!(di->d_fieldmask & QC_INO_TIMER)) /* Set grace only if user hasn't provided his own... */ dm->dqb_itime = get_seconds() + dqi->dqi_igrace; } @@ -2531,7 +2526,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) } int dquot_set_dqblk(struct super_block *sb, struct kqid qid, - struct fs_disk_quota *di) + struct qc_dqblk *di) { struct dquot *dquot; int rc; diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 2aa4151f99d2..6f3856328eea 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -118,17 +118,27 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr) return sb->s_qcop->set_info(sb, type, &info); } -static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src) +static inline qsize_t qbtos(qsize_t blocks) +{ + return blocks << QIF_DQBLKSIZE_BITS; +} + +static inline qsize_t stoqb(qsize_t space) +{ + return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; +} + +static void copy_to_if_dqblk(struct if_dqblk *dst, struct qc_dqblk *src) { memset(dst, 0, sizeof(*dst)); - dst->dqb_bhardlimit = src->d_blk_hardlimit; - dst->dqb_bsoftlimit = src->d_blk_softlimit; - dst->dqb_curspace = src->d_bcount; + dst->dqb_bhardlimit = stoqb(src->d_spc_hardlimit); + dst->dqb_bsoftlimit = stoqb(src->d_spc_softlimit); + dst->dqb_curspace = src->d_space; dst->dqb_ihardlimit = src->d_ino_hardlimit; dst->dqb_isoftlimit = src->d_ino_softlimit; - dst->dqb_curinodes = src->d_icount; - dst->dqb_btime = src->d_btimer; - dst->dqb_itime = src->d_itimer; + dst->dqb_curinodes = src->d_ino_count; + dst->dqb_btime = src->d_spc_timer; + dst->dqb_itime = src->d_ino_timer; dst->dqb_valid = QIF_ALL; } @@ -136,7 +146,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct kqid qid; - struct fs_disk_quota fdq; + struct qc_dqblk fdq; struct if_dqblk idq; int ret; @@ -154,36 +164,36 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id, return 0; } -static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src) +static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src) { - dst->d_blk_hardlimit = src->dqb_bhardlimit; - 
dst->d_blk_softlimit = src->dqb_bsoftlimit; - dst->d_bcount = src->dqb_curspace; + dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit); + dst->d_spc_softlimit = qbtos(src->dqb_bsoftlimit); + dst->d_space = src->dqb_curspace; dst->d_ino_hardlimit = src->dqb_ihardlimit; dst->d_ino_softlimit = src->dqb_isoftlimit; - dst->d_icount = src->dqb_curinodes; - dst->d_btimer = src->dqb_btime; - dst->d_itimer = src->dqb_itime; + dst->d_ino_count = src->dqb_curinodes; + dst->d_spc_timer = src->dqb_btime; + dst->d_ino_timer = src->dqb_itime; dst->d_fieldmask = 0; if (src->dqb_valid & QIF_BLIMITS) - dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD; + dst->d_fieldmask |= QC_SPC_SOFT | QC_SPC_HARD; if (src->dqb_valid & QIF_SPACE) - dst->d_fieldmask |= FS_DQ_BCOUNT; + dst->d_fieldmask |= QC_SPACE; if (src->dqb_valid & QIF_ILIMITS) - dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD; + dst->d_fieldmask |= QC_INO_SOFT | QC_INO_HARD; if (src->dqb_valid & QIF_INODES) - dst->d_fieldmask |= FS_DQ_ICOUNT; + dst->d_fieldmask |= QC_INO_COUNT; if (src->dqb_valid & QIF_BTIME) - dst->d_fieldmask |= FS_DQ_BTIMER; + dst->d_fieldmask |= QC_SPC_TIMER; if (src->dqb_valid & QIF_ITIME) - dst->d_fieldmask |= FS_DQ_ITIMER; + dst->d_fieldmask |= QC_INO_TIMER; } static int quota_setquota(struct super_block *sb, int type, qid_t id, void __user *addr) { - struct fs_disk_quota fdq; + struct qc_dqblk fdq; struct if_dqblk idq; struct kqid qid; @@ -247,10 +257,78 @@ static int quota_getxstatev(struct super_block *sb, void __user *addr) return ret; } +/* + * XFS defines BBTOB and BTOBB macros inside fs/xfs/ and we cannot move them + * out of there as xfsprogs rely on definitions being in that header file. So + * just define same functions here for quota purposes. + */ +#define XFS_BB_SHIFT 9 + +static inline u64 quota_bbtob(u64 blocks) +{ + return blocks << XFS_BB_SHIFT; +} + +static inline u64 quota_btobb(u64 bytes) +{ + return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT; +} + +static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src) +{ + dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit); + dst->d_spc_softlimit = quota_bbtob(src->d_blk_softlimit); + dst->d_ino_hardlimit = src->d_ino_hardlimit; + dst->d_ino_softlimit = src->d_ino_softlimit; + dst->d_space = quota_bbtob(src->d_bcount); + dst->d_ino_count = src->d_icount; + dst->d_ino_timer = src->d_itimer; + dst->d_spc_timer = src->d_btimer; + dst->d_ino_warns = src->d_iwarns; + dst->d_spc_warns = src->d_bwarns; + dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit); + dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit); + dst->d_rt_space = quota_bbtob(src->d_rtbcount); + dst->d_rt_spc_timer = src->d_rtbtimer; + dst->d_rt_spc_warns = src->d_rtbwarns; + dst->d_fieldmask = 0; + if (src->d_fieldmask & FS_DQ_ISOFT) + dst->d_fieldmask |= QC_INO_SOFT; + if (src->d_fieldmask & FS_DQ_IHARD) + dst->d_fieldmask |= QC_INO_HARD; + if (src->d_fieldmask & FS_DQ_BSOFT) + dst->d_fieldmask |= QC_SPC_SOFT; + if (src->d_fieldmask & FS_DQ_BHARD) + dst->d_fieldmask |= QC_SPC_HARD; + if (src->d_fieldmask & FS_DQ_RTBSOFT) + dst->d_fieldmask |= QC_RT_SPC_SOFT; + if (src->d_fieldmask & FS_DQ_RTBHARD) + dst->d_fieldmask |= QC_RT_SPC_HARD; + if (src->d_fieldmask & FS_DQ_BTIMER) + dst->d_fieldmask |= QC_SPC_TIMER; + if (src->d_fieldmask & FS_DQ_ITIMER) + dst->d_fieldmask |= QC_INO_TIMER; + if (src->d_fieldmask & FS_DQ_RTBTIMER) + dst->d_fieldmask |= QC_RT_SPC_TIMER; + if (src->d_fieldmask & FS_DQ_BWARNS) + dst->d_fieldmask |= QC_SPC_WARNS; + if (src->d_fieldmask & 
FS_DQ_IWARNS) + dst->d_fieldmask |= QC_INO_WARNS; + if (src->d_fieldmask & FS_DQ_RTBWARNS) + dst->d_fieldmask |= QC_RT_SPC_WARNS; + if (src->d_fieldmask & FS_DQ_BCOUNT) + dst->d_fieldmask |= QC_SPACE; + if (src->d_fieldmask & FS_DQ_ICOUNT) + dst->d_fieldmask |= QC_INO_COUNT; + if (src->d_fieldmask & FS_DQ_RTBCOUNT) + dst->d_fieldmask |= QC_RT_SPACE; +} + static int quota_setxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct fs_disk_quota fdq; + struct qc_dqblk qdq; struct kqid qid; if (copy_from_user(&fdq, addr, sizeof(fdq))) @@ -260,13 +338,44 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, qid = make_kqid(current_user_ns(), type, id); if (!qid_valid(qid)) return -EINVAL; - return sb->s_qcop->set_dqblk(sb, qid, &fdq); + copy_from_xfs_dqblk(&qdq, &fdq); + return sb->s_qcop->set_dqblk(sb, qid, &qdq); +} + +static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src, + int type, qid_t id) +{ + memset(dst, 0, sizeof(*dst)); + dst->d_version = FS_DQUOT_VERSION; + dst->d_id = id; + if (type == USRQUOTA) + dst->d_flags = FS_USER_QUOTA; + else if (type == PRJQUOTA) + dst->d_flags = FS_PROJ_QUOTA; + else + dst->d_flags = FS_GROUP_QUOTA; + dst->d_blk_hardlimit = quota_btobb(src->d_spc_hardlimit); + dst->d_blk_softlimit = quota_btobb(src->d_spc_softlimit); + dst->d_ino_hardlimit = src->d_ino_hardlimit; + dst->d_ino_softlimit = src->d_ino_softlimit; + dst->d_bcount = quota_btobb(src->d_space); + dst->d_icount = src->d_ino_count; + dst->d_itimer = src->d_ino_timer; + dst->d_btimer = src->d_spc_timer; + dst->d_iwarns = src->d_ino_warns; + dst->d_bwarns = src->d_spc_warns; + dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit); + dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit); + dst->d_rtbcount = quota_btobb(src->d_rt_space); + dst->d_rtbtimer = src->d_rt_spc_timer; + dst->d_rtbwarns = src->d_rt_spc_warns; } static int quota_getxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { struct fs_disk_quota fdq; + struct qc_dqblk qdq; struct kqid qid; int ret; @@ -275,8 +384,11 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id, qid = make_kqid(current_user_ns(), type, id); if (!qid_valid(qid)) return -EINVAL; - ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); - if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) + ret = sb->s_qcop->get_dqblk(sb, qid, &qdq); + if (ret) + return ret; + copy_to_xfs_dqblk(&fdq, &qdq, type, id); + if (copy_to_user(addr, &fdq, sizeof(fdq))) return -EFAULT; return ret; } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 3a07a937e232..41f6c0b9d51c 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -166,9 +166,9 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, - uint, struct fs_disk_quota *); + uint, struct qc_dqblk *); extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, - struct fs_disk_quota *); + struct qc_dqblk *); extern int xfs_qm_scall_getqstat(struct xfs_mount *, struct fs_quota_stat *); extern int xfs_qm_scall_getqstatv(struct xfs_mount *, diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 74fca68e43b6..cb6168ec92c9 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -39,7 +39,6 @@ STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 
uint); STATIC uint xfs_qm_export_flags(uint); -STATIC uint xfs_qm_export_qtype_flags(uint); /* * Turn off quota accounting and/or enforcement for all udquots and/or @@ -573,8 +572,8 @@ xfs_qm_scall_getqstatv( return 0; } -#define XFS_DQ_MASK \ - (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK) +#define XFS_QC_MASK \ + (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK) /* * Adjust quota limits, and start/stop timers accordingly. @@ -584,7 +583,7 @@ xfs_qm_scall_setqlim( struct xfs_mount *mp, xfs_dqid_t id, uint type, - fs_disk_quota_t *newlim) + struct qc_dqblk *newlim) { struct xfs_quotainfo *q = mp->m_quotainfo; struct xfs_disk_dquot *ddq; @@ -593,9 +592,9 @@ xfs_qm_scall_setqlim( int error; xfs_qcnt_t hard, soft; - if (newlim->d_fieldmask & ~XFS_DQ_MASK) + if (newlim->d_fieldmask & ~XFS_QC_MASK) return -EINVAL; - if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) + if ((newlim->d_fieldmask & XFS_QC_MASK) == 0) return 0; /* @@ -633,11 +632,11 @@ xfs_qm_scall_setqlim( /* * Make sure that hardlimits are >= soft limits before changing. */ - hard = (newlim->d_fieldmask & FS_DQ_BHARD) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) : + hard = (newlim->d_fieldmask & QC_SPC_HARD) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) : be64_to_cpu(ddq->d_blk_hardlimit); - soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) : + soft = (newlim->d_fieldmask & QC_SPC_SOFT) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) : be64_to_cpu(ddq->d_blk_softlimit); if (hard == 0 || hard >= soft) { ddq->d_blk_hardlimit = cpu_to_be64(hard); @@ -650,11 +649,11 @@ xfs_qm_scall_setqlim( } else { xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); } - hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : + hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) : be64_to_cpu(ddq->d_rtb_hardlimit); - soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) : + soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ? + (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) : be64_to_cpu(ddq->d_rtb_softlimit); if (hard == 0 || hard >= soft) { ddq->d_rtb_hardlimit = cpu_to_be64(hard); @@ -667,10 +666,10 @@ xfs_qm_scall_setqlim( xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); } - hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? + hard = (newlim->d_fieldmask & QC_INO_HARD) ? (xfs_qcnt_t) newlim->d_ino_hardlimit : be64_to_cpu(ddq->d_ino_hardlimit); - soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ? + soft = (newlim->d_fieldmask & QC_INO_SOFT) ? 
(xfs_qcnt_t) newlim->d_ino_softlimit : be64_to_cpu(ddq->d_ino_softlimit); if (hard == 0 || hard >= soft) { @@ -687,12 +686,12 @@ xfs_qm_scall_setqlim( /* * Update warnings counter(s) if requested */ - if (newlim->d_fieldmask & FS_DQ_BWARNS) - ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns); - if (newlim->d_fieldmask & FS_DQ_IWARNS) - ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns); - if (newlim->d_fieldmask & FS_DQ_RTBWARNS) - ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns); + if (newlim->d_fieldmask & QC_SPC_WARNS) + ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns); + if (newlim->d_fieldmask & QC_INO_WARNS) + ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns); + if (newlim->d_fieldmask & QC_RT_SPC_WARNS) + ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns); if (id == 0) { /* @@ -702,24 +701,24 @@ xfs_qm_scall_setqlim( * soft and hard limit values (already done, above), and * for warnings. */ - if (newlim->d_fieldmask & FS_DQ_BTIMER) { - q->qi_btimelimit = newlim->d_btimer; - ddq->d_btimer = cpu_to_be32(newlim->d_btimer); + if (newlim->d_fieldmask & QC_SPC_TIMER) { + q->qi_btimelimit = newlim->d_spc_timer; + ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer); } - if (newlim->d_fieldmask & FS_DQ_ITIMER) { - q->qi_itimelimit = newlim->d_itimer; - ddq->d_itimer = cpu_to_be32(newlim->d_itimer); + if (newlim->d_fieldmask & QC_INO_TIMER) { + q->qi_itimelimit = newlim->d_ino_timer; + ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer); } - if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { - q->qi_rtbtimelimit = newlim->d_rtbtimer; - ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); + if (newlim->d_fieldmask & QC_RT_SPC_TIMER) { + q->qi_rtbtimelimit = newlim->d_rt_spc_timer; + ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer); } - if (newlim->d_fieldmask & FS_DQ_BWARNS) - q->qi_bwarnlimit = newlim->d_bwarns; - if (newlim->d_fieldmask & FS_DQ_IWARNS) - q->qi_iwarnlimit = newlim->d_iwarns; - if (newlim->d_fieldmask & FS_DQ_RTBWARNS) - q->qi_rtbwarnlimit = newlim->d_rtbwarns; + if (newlim->d_fieldmask & QC_SPC_WARNS) + q->qi_bwarnlimit = newlim->d_spc_warns; + if (newlim->d_fieldmask & QC_INO_WARNS) + q->qi_iwarnlimit = newlim->d_ino_warns; + if (newlim->d_fieldmask & QC_RT_SPC_WARNS) + q->qi_rtbwarnlimit = newlim->d_rt_spc_warns; } else { /* * If the user is now over quota, start the timelimit. 
@@ -824,7 +823,7 @@ xfs_qm_scall_getquota( struct xfs_mount *mp, xfs_dqid_t id, uint type, - struct fs_disk_quota *dst) + struct qc_dqblk *dst) { struct xfs_dquot *dqp; int error; @@ -848,28 +847,25 @@ xfs_qm_scall_getquota( } memset(dst, 0, sizeof(*dst)); - dst->d_version = FS_DQUOT_VERSION; - dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags); - dst->d_id = be32_to_cpu(dqp->q_core.d_id); - dst->d_blk_hardlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); - dst->d_blk_softlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); + dst->d_spc_hardlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); + dst->d_spc_softlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); - dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount); - dst->d_icount = dqp->q_res_icount; - dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer); - dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer); - dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns); - dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns); - dst->d_rtb_hardlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); - dst->d_rtb_softlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); - dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount); - dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer); - dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns); + dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount); + dst->d_ino_count = dqp->q_res_icount; + dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer); + dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer); + dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns); + dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns); + dst->d_rt_spc_hardlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); + dst->d_rt_spc_softlimit = + XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); + dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount); + dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer); + dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns); /* * Internally, we don't reset all the timers when quota enforcement @@ -882,23 +878,23 @@ xfs_qm_scall_getquota( dqp->q_core.d_flags == XFS_DQ_GROUP) || (!XFS_IS_PQUOTA_ENFORCED(mp) && dqp->q_core.d_flags == XFS_DQ_PROJ)) { - dst->d_btimer = 0; - dst->d_itimer = 0; - dst->d_rtbtimer = 0; + dst->d_spc_timer = 0; + dst->d_ino_timer = 0; + dst->d_rt_spc_timer = 0; } #ifdef DEBUG - if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || - (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) || - (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) && - dst->d_id != 0) { - if ((dst->d_bcount > dst->d_blk_softlimit) && - (dst->d_blk_softlimit > 0)) { - ASSERT(dst->d_btimer != 0); + if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || + (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || + (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && + id != 0) { + if ((dst->d_space > dst->d_spc_softlimit) && + (dst->d_spc_softlimit > 0)) { + ASSERT(dst->d_spc_timer != 0); } - if ((dst->d_icount > dst->d_ino_softlimit) && + if ((dst->d_ino_count > dst->d_ino_softlimit) && (dst->d_ino_softlimit > 0)) { - ASSERT(dst->d_itimer != 0); + ASSERT(dst->d_ino_timer != 0); } } #endif @@ -907,26 +903,6 @@ out_put: return error; } -STATIC uint -xfs_qm_export_qtype_flags( - uint flags) -{ - /* 
- * Can't be more than one, or none. - */ - ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) != - (FS_PROJ_QUOTA | FS_USER_QUOTA)); - ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) != - (FS_PROJ_QUOTA | FS_GROUP_QUOTA)); - ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) != - (FS_USER_QUOTA | FS_GROUP_QUOTA)); - ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0); - - return (flags & XFS_DQ_USER) ? - FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? - FS_PROJ_QUOTA : FS_GROUP_QUOTA; -} - STATIC uint xfs_qm_export_flags( uint flags) diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 7542bbeca6a1..801a84c1cdc3 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -131,7 +131,7 @@ STATIC int xfs_fs_get_dqblk( struct super_block *sb, struct kqid qid, - struct fs_disk_quota *fdq) + struct qc_dqblk *qdq) { struct xfs_mount *mp = XFS_M(sb); @@ -141,14 +141,14 @@ xfs_fs_get_dqblk( return -ESRCH; return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), - xfs_quota_type(qid.type), fdq); + xfs_quota_type(qid.type), qdq); } STATIC int xfs_fs_set_dqblk( struct super_block *sb, struct kqid qid, - struct fs_disk_quota *fdq) + struct qc_dqblk *qdq) { struct xfs_mount *mp = XFS_M(sb); @@ -160,7 +160,7 @@ xfs_fs_set_dqblk( return -ESRCH; return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), - xfs_quota_type(qid.type), fdq); + xfs_quota_type(qid.type), qdq); } const struct quotactl_ops xfs_quotactl_operations = { diff --git a/include/linux/quota.h b/include/linux/quota.h index 50978b781a19..097d7eb2441e 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -321,6 +321,49 @@ struct dquot_operations { struct path; +/* Structure for communicating via ->get_dqblk() & ->set_dqblk() */ +struct qc_dqblk { + int d_fieldmask; /* mask of fields to change in ->set_dqblk() */ + u64 d_spc_hardlimit; /* absolute limit on used space */ + u64 d_spc_softlimit; /* preferred limit on used space */ + u64 d_ino_hardlimit; /* maximum # allocated inodes */ + u64 d_ino_softlimit; /* preferred inode limit */ + u64 d_space; /* Space owned by the user */ + u64 d_ino_count; /* # inodes owned by the user */ + s64 d_ino_timer; /* zero if within inode limits */ + /* if not, we refuse service */ + s64 d_spc_timer; /* similar to above; for space */ + int d_ino_warns; /* # warnings issued wrt num inodes */ + int d_spc_warns; /* # warnings issued wrt used space */ + u64 d_rt_spc_hardlimit; /* absolute limit on realtime space */ + u64 d_rt_spc_softlimit; /* preferred limit on RT space */ + u64 d_rt_space; /* realtime space owned */ + s64 d_rt_spc_timer; /* similar to above; for RT space */ + int d_rt_spc_warns; /* # warnings issued wrt RT space */ +}; + +/* Field specifiers for ->set_dqblk() in struct qc_dqblk */ +#define QC_INO_SOFT (1<<0) +#define QC_INO_HARD (1<<1) +#define QC_SPC_SOFT (1<<2) +#define QC_SPC_HARD (1<<3) +#define QC_RT_SPC_SOFT (1<<4) +#define QC_RT_SPC_HARD (1<<5) +#define QC_LIMIT_MASK (QC_INO_SOFT | QC_INO_HARD | QC_SPC_SOFT | QC_SPC_HARD | \ + QC_RT_SPC_SOFT | QC_RT_SPC_HARD) +#define QC_SPC_TIMER (1<<6) +#define QC_INO_TIMER (1<<7) +#define QC_RT_SPC_TIMER (1<<8) +#define QC_TIMER_MASK (QC_SPC_TIMER | QC_INO_TIMER | QC_RT_SPC_TIMER) +#define QC_SPC_WARNS (1<<9) +#define QC_INO_WARNS (1<<10) +#define QC_RT_SPC_WARNS (1<<11) +#define QC_WARNS_MASK (QC_SPC_WARNS | QC_INO_WARNS | QC_RT_SPC_WARNS) +#define QC_SPACE (1<<12) +#define QC_INO_COUNT (1<<13) +#define QC_RT_SPACE (1<<14) +#define QC_ACCT_MASK (QC_SPACE | QC_INO_COUNT | QC_RT_SPACE) + /* Operations 
handling requests from userspace */ struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, struct path *); @@ -329,8 +372,8 @@ struct quotactl_ops { int (*quota_sync)(struct super_block *, int); int (*get_info)(struct super_block *, int, struct if_dqinfo *); int (*set_info)(struct super_block *, int, struct if_dqinfo *); - int (*get_dqblk)(struct super_block *, struct kqid, struct fs_disk_quota *); - int (*set_dqblk)(struct super_block *, struct kqid, struct fs_disk_quota *); + int (*get_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); + int (*set_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); int (*get_xstate)(struct super_block *, struct fs_quota_stat *); int (*set_xstate)(struct super_block *, unsigned int, int); int (*get_xstatev)(struct super_block *, struct fs_quota_statv *); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index f23538a6e411..29e3455f7d41 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -98,9 +98,9 @@ int dquot_quota_sync(struct super_block *sb, int type); int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii); int dquot_get_dqblk(struct super_block *sb, struct kqid id, - struct fs_disk_quota *di); + struct qc_dqblk *di); int dquot_set_dqblk(struct super_block *sb, struct kqid id, - struct fs_disk_quota *di); + struct qc_dqblk *di); int __dquot_transfer(struct inode *inode, struct dquot **transfer_to); int dquot_transfer(struct inode *inode, struct iattr *iattr); -- cgit v1.2.3 From 3aa8793f751d4cfcaca886e75ab30dfb00cf1d88 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Fri, 28 Nov 2014 14:38:36 +0100 Subject: mmc: core: Initial support for MMC power sequences System on chip designs may specify a specific MMC power sequence. To successfully detect an (e)MMC/SD/SDIO card, that power sequence must be followed while initializing the card. To be able to handle these SoC-specific power sequences, let's add an MMC power sequence interface. It provides the following functions to help the mmc core deal with these power sequences. mmc_pwrseq_alloc() - Invoked from mmc_of_parse(), to initialize data. mmc_pwrseq_pre_power_on() - Invoked at the beginning of mmc_power_up(). mmc_pwrseq_post_power_on() - Invoked at the end of mmc_power_up(). mmc_pwrseq_power_off() - Invoked from mmc_power_off(). mmc_pwrseq_free() - Invoked from mmc_free_host(), to free data. Each MMC power sequence provider is responsible for implementing a set of callbacks. These callbacks mirror the functions above. This patch adds the skeleton; following patches will extend the core of the MMC power sequence and add support for a specific simple MMC power sequence. Do note that, since mmc_pwrseq_alloc() is invoked from mmc_of_parse(), host drivers need to make use of this API to enable the support for MMC power sequences. Moreover, the MMC power sequence support depends on CONFIG_OF.
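To make the callback interface concrete, here is a minimal provider sketch. The struct mmc_pwrseq, struct mmc_pwrseq_ops, and host->pwrseq definitions come from this patch; the "example" provider itself and its reset-line comments are hypothetical:

#include <linux/mmc/host.h>

#include "pwrseq.h"

static void example_pwrseq_pre_power_on(struct mmc_host *host)
{
	/* e.g. assert the card's reset line before power is applied */
}

static void example_pwrseq_post_power_on(struct mmc_host *host)
{
	/* e.g. release the reset line once VCC is stable */
}

static void example_pwrseq_power_off(struct mmc_host *host)
{
	/* e.g. force the card into a known state before cutting power */
}

static struct mmc_pwrseq_ops example_pwrseq_ops = {
	.pre_power_on	= example_pwrseq_pre_power_on,
	.post_power_on	= example_pwrseq_post_power_on,
	.power_off	= example_pwrseq_power_off,
};

static struct mmc_pwrseq example_pwrseq = {
	.ops = &example_pwrseq_ops,
};

/*
 * A host driver would then point host->pwrseq at example_pwrseq; in the
 * follow-up patches this assignment is made on its behalf from
 * mmc_pwrseq_alloc(), based on device tree data.
 */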
Signed-off-by: Ulf Hansson Tested-by: Javier Martinez Canillas Reviewed-by: Javier Martinez Canillas --- drivers/mmc/core/Makefile | 2 +- drivers/mmc/core/core.c | 7 +++++++ drivers/mmc/core/host.c | 4 +++- drivers/mmc/core/pwrseq.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/mmc/core/pwrseq.h | 40 +++++++++++++++++++++++++++++++++++++ include/linux/mmc/host.h | 2 ++ 6 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 drivers/mmc/core/pwrseq.c create mode 100644 drivers/mmc/core/pwrseq.h (limited to 'include/linux') diff --git a/drivers/mmc/core/Makefile b/drivers/mmc/core/Makefile index 38ed210ce2f3..ccdd35f78256 100644 --- a/drivers/mmc/core/Makefile +++ b/drivers/mmc/core/Makefile @@ -8,5 +8,5 @@ mmc_core-y := core.o bus.o host.o \ sdio.o sdio_ops.o sdio_bus.o \ sdio_cis.o sdio_io.o sdio_irq.o \ quirks.o slot-gpio.o - +mmc_core-$(CONFIG_OF) += pwrseq.o mmc_core-$(CONFIG_DEBUG_FS) += debugfs.o diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index d5c176e87951..1be7055548cb 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -40,6 +40,7 @@ #include "bus.h" #include "host.h" #include "sdio_bus.h" +#include "pwrseq.h" #include "mmc_ops.h" #include "sd_ops.h" @@ -1615,6 +1616,8 @@ void mmc_power_up(struct mmc_host *host, u32 ocr) mmc_host_clk_hold(host); + mmc_pwrseq_pre_power_on(host); + host->ios.vdd = fls(ocr) - 1; host->ios.power_mode = MMC_POWER_UP; /* Set initial state and call mmc_set_ios */ @@ -1645,6 +1648,8 @@ void mmc_power_up(struct mmc_host *host, u32 ocr) */ mmc_delay(10); + mmc_pwrseq_post_power_on(host); + mmc_host_clk_release(host); } @@ -1655,6 +1660,8 @@ void mmc_power_off(struct mmc_host *host) mmc_host_clk_hold(host); + mmc_pwrseq_power_off(host); + host->ios.clock = 0; host->ios.vdd = 0; diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 07636449b4de..8be0df758e68 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -30,6 +30,7 @@ #include "core.h" #include "host.h" #include "slot-gpio.h" +#include "pwrseq.h" #define cls_dev_to_mmc_host(d) container_of(d, struct mmc_host, class_dev) @@ -448,7 +449,7 @@ int mmc_of_parse(struct mmc_host *host) host->dsr_req = 0; } - return 0; + return mmc_pwrseq_alloc(host); } EXPORT_SYMBOL(mmc_of_parse); @@ -588,6 +589,7 @@ EXPORT_SYMBOL(mmc_remove_host); */ void mmc_free_host(struct mmc_host *host) { + mmc_pwrseq_free(host); put_device(&host->class_dev); } diff --git a/drivers/mmc/core/pwrseq.c b/drivers/mmc/core/pwrseq.c new file mode 100644 index 000000000000..bd087723929f --- /dev/null +++ b/drivers/mmc/core/pwrseq.c @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2014 Linaro Ltd + * + * Author: Ulf Hansson + * + * License terms: GNU General Public License (GPL) version 2 + * + * MMC power sequence management + */ +#include + +#include "pwrseq.h" + + +int mmc_pwrseq_alloc(struct mmc_host *host) +{ + return 0; +} + +void mmc_pwrseq_pre_power_on(struct mmc_host *host) +{ + struct mmc_pwrseq *pwrseq = host->pwrseq; + + if (pwrseq && pwrseq->ops && pwrseq->ops->pre_power_on) + pwrseq->ops->pre_power_on(host); +} + +void mmc_pwrseq_post_power_on(struct mmc_host *host) +{ + struct mmc_pwrseq *pwrseq = host->pwrseq; + + if (pwrseq && pwrseq->ops && pwrseq->ops->post_power_on) + pwrseq->ops->post_power_on(host); +} + +void mmc_pwrseq_power_off(struct mmc_host *host) +{ + struct mmc_pwrseq *pwrseq = host->pwrseq; + + if (pwrseq && pwrseq->ops && pwrseq->ops->power_off) + pwrseq->ops->power_off(host); +} + +void mmc_pwrseq_free(struct mmc_host *host) 
+{ + struct mmc_pwrseq *pwrseq = host->pwrseq; + + if (pwrseq && pwrseq->ops && pwrseq->ops->free) + pwrseq->ops->free(host); +} diff --git a/drivers/mmc/core/pwrseq.h b/drivers/mmc/core/pwrseq.h new file mode 100644 index 000000000000..12aaf2b4745b --- /dev/null +++ b/drivers/mmc/core/pwrseq.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2014 Linaro Ltd + * + * Author: Ulf Hansson + * + * License terms: GNU General Public License (GPL) version 2 + */ +#ifndef _MMC_CORE_PWRSEQ_H +#define _MMC_CORE_PWRSEQ_H + +struct mmc_pwrseq_ops { + void (*pre_power_on)(struct mmc_host *host); + void (*post_power_on)(struct mmc_host *host); + void (*power_off)(struct mmc_host *host); + void (*free)(struct mmc_host *host); +}; + +struct mmc_pwrseq { + struct mmc_pwrseq_ops *ops; +}; + +#ifdef CONFIG_OF + +int mmc_pwrseq_alloc(struct mmc_host *host); +void mmc_pwrseq_pre_power_on(struct mmc_host *host); +void mmc_pwrseq_post_power_on(struct mmc_host *host); +void mmc_pwrseq_power_off(struct mmc_host *host); +void mmc_pwrseq_free(struct mmc_host *host); + +#else + +static inline int mmc_pwrseq_alloc(struct mmc_host *host) { return 0; } +static inline void mmc_pwrseq_pre_power_on(struct mmc_host *host) {} +static inline void mmc_pwrseq_post_power_on(struct mmc_host *host) {} +static inline void mmc_pwrseq_power_off(struct mmc_host *host) {} +static inline void mmc_pwrseq_free(struct mmc_host *host) {} + +#endif + +#endif diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index b6bf718c3498..0c8cbe5d1550 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -195,6 +195,7 @@ struct mmc_context_info { }; struct regulator; +struct mmc_pwrseq; struct mmc_supply { struct regulator *vmmc; /* Card power supply */ @@ -206,6 +207,7 @@ struct mmc_host { struct device class_dev; int index; const struct mmc_host_ops *ops; + struct mmc_pwrseq *pwrseq; unsigned int f_min; unsigned int f_max; unsigned int f_init; -- cgit v1.2.3 From c3c87e770458aa004bd7ed3f29945ff436fd6511 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Jan 2015 11:19:48 +0100 Subject: perf: Tighten (and fix) the grouping condition The fix from 9fc81d87420d ("perf: Fix events installation during moving group") was incomplete in that it failed to recognise that creating a group with events for different CPUs is semantically broken -- they cannot be co-scheduled. Furthermore, it leads to real breakage where, when we create an event for CPU Y and then migrate it to form a group on CPU X, the code gets confused where the counter is programmed -- triggered in practice as well by me via the perf fuzzer. Fix this by tightening the rules for creating groups. Only allow grouping of counters that can be co-scheduled in the same context. This means for the same task and/or the same cpu. 
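As a userspace illustration of the tightened rule (a hypothetical sketch, with error handling omitted and sufficient perf privileges assumed), a sibling opened for a different CPU than its group leader is rejected, since the two events could never be co-scheduled:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

static int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
			       int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	int leader, sibling;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;

	leader = sys_perf_event_open(&attr, -1, 0, -1, 0);	/* CPU 0 */
	/* Grouping with an event for a different CPU is not allowed. */
	sibling = sys_perf_event_open(&attr, -1, 1, leader, 0);	/* fails */
	printf("leader fd = %d, sibling fd = %d\n", leader, sibling);
	return 0;
}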
Fixes: 9fc81d87420d ("perf: Fix events installation during moving group") Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150123125834.090683288@infradead.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 ------ kernel/events/core.c | 15 +++++++++++++-- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4f7a61ca4b39..664de5a4ec46 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -450,11 +450,6 @@ struct perf_event { #endif /* CONFIG_PERF_EVENTS */ }; -enum perf_event_context_type { - task_context, - cpu_context, -}; - /** * struct perf_event_context - event context structure * * @@ -462,7 +457,6 @@ enum perf_event_context_type { */ struct perf_event_context { struct pmu *pmu; - enum perf_event_context_type type; /* * Protect the states of the events in the list, * nr_active, and the list: diff --git a/kernel/events/core.c b/kernel/events/core.c index 882f835a0d85..19efcf13375a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6776,7 +6776,6 @@ skip_type: __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); - cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; __perf_cpu_hrtimer_init(cpuctx, cpu); @@ -7420,7 +7419,19 @@ SYSCALL_DEFINE5(perf_event_open, * task or CPU context: */ if (move_group) { - if (group_leader->ctx->type != ctx->type) + /* + * Make sure we're both on the same task, or both + * per-cpu events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. + */ + if (group_leader->cpu != event->cpu) + goto err_context; } else { if (group_leader->ctx != ctx) -- cgit v1.2.3 From 667a0a06c99d5291433b869ed35dabdd95ba1453 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 18 Dec 2014 14:48:15 +0000 Subject: mm: provide a find_special_page vma operation The optional find_special_page VMA operation is used to look up the pages backing a VMA. This is useful in cases where the normal mechanisms for finding the page don't work. This is only called if the PTE is special. One use case is a Xen PV guest mapping foreign pages into userspace. In a Xen PV guest, the PTEs contain MFNs so get_user_pages() (for example) must do an MFN to PFN (M2P) lookup before it can get the page. For foreign pages (those owned by another guest) the M2P lookup returns the PFN as seen by the foreign guest (which would be completely the wrong page for the local guest). This cannot be fixed by improving the M2P lookup, since one MFN may be mapped onto two or more pages so getting the right page is impossible given just the MFN.
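As a hedged sketch of the consumer side (hypothetical driver code; my_priv and the array lookup are illustrative, not from this patch), a driver that maps foreign frames can resolve the backing page itself:

/* Illustrative only: a per-VMA private structure recording the pages
 * that were mapped in, indexed by page offset within the VMA. */
struct my_priv {
	struct page **pages;
};

static struct page *my_find_special_page(struct vm_area_struct *vma,
					 unsigned long addr)
{
	struct my_priv *priv = vma->vm_private_data;

	return priv->pages[(addr - vma->vm_start) >> PAGE_SHIFT];
}

static const struct vm_operations_struct my_vm_ops = {
	.find_special_page = my_find_special_page,
	/* .open/.close/.fault as usual for the driver */
};

vm_normal_page() then calls back into the driver for special PTEs instead of trusting pte_page().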
Signed-off-by: David Vrabel Acked-by: Andrew Morton --- include/linux/mm.h | 8 ++++++++ mm/memory.c | 2 ++ 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 80fc92a49649..9269af7349fe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -290,6 +290,14 @@ struct vm_operations_struct { /* called by sys_remap_file_pages() to populate non-linear mapping */ int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr, unsigned long size, pgoff_t pgoff); + + /* + * Called by vm_normal_page() for special PTEs to find the + * page for @addr. This is useful if the default behavior + * (using pte_page()) would not find the correct page. + */ + struct page *(*find_special_page)(struct vm_area_struct *vma, + unsigned long addr); }; struct mmu_gather; diff --git a/mm/memory.c b/mm/memory.c index 54f3a9b00956..dc2e01a315e2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -754,6 +754,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, if (HAVE_PTE_SPECIAL) { if (likely(!pte_special(pte))) goto check_pfn; + if (vma->vm_ops && vma->vm_ops->find_special_page) + return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; if (!is_zero_pfn(pfn)) -- cgit v1.2.3 From d8ac3dd41aea245f65465449efc35dd3ac71e91d Mon Sep 17 00:00:00 2001 From: Jennifer Herbert Date: Mon, 5 Jan 2015 13:24:09 +0000 Subject: mm: add 'foreign' alias for the 'pinned' page flag The foreign page flag will be used by Xen guests to mark pages that have grant mappings of frames from other (foreign) guests. The foreign flag is an alias for the existing (Xen-specific) pinned flag. This is safe because pinned is only used on pages used for page tables and these cannot also be foreign. Signed-off-by: Jennifer Herbert Acked-by: Andrew Morton Signed-off-by: David Vrabel --- include/linux/page-flags.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e1f5fcd79792..5ed7bdaf22d5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -121,8 +121,12 @@ enum pageflags { PG_fscache = PG_private_2, /* page backed by cache */ /* XEN */ + /* Pinned in Xen as a read-only pagetable page. */ PG_pinned = PG_owner_priv_1, + /* Pinned as part of domain save (see xen_mm_pin_all()). */ PG_savepinned = PG_dirty, + /* Has a grant mapping of another (foreign) domain's page. */ + PG_foreign = PG_owner_priv_1, /* SLOB */ PG_slob_free = PG_private, @@ -215,6 +219,7 @@ __PAGEFLAG(Slab, slab) PAGEFLAG(Checked, checked) /* Used by some filesystems */ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ PAGEFLAG(SavePinned, savepinned); /* Xen */ +PAGEFLAG(Foreign, foreign); /* Xen */ PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) __SETPAGEFLAG(SwapBacked, swapbacked) -- cgit v1.2.3 From 6ea22486ba46bcb665de36514094d74575cd1330 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Wed, 28 Jan 2015 12:48:53 +0000 Subject: tracing: Add array printing helper If a trace event contains an array, there is currently no standard way to format this for text output. Drivers are currently hacking around this by a) local hacks that use the trace_seq functionality directly, or b) just not printing that information.
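As a hedged sketch of the usage the new __print_array() helper enables (a hypothetical event; the name thermal_sample and its fields are illustrative), a tracepoint can then print an array without open-coding anything:

/* Hypothetical TRACE_EVENT using __print_array(); sizeof(int) == 4
 * satisfies the helper's 1/2/4/8 element-size restriction. */
TRACE_EVENT(thermal_sample,
	TP_PROTO(const int *temps, int len),
	TP_ARGS(temps, len),
	TP_STRUCT__entry(
		__dynamic_array(int, temps, len)
		__field(int, len)
	),
	TP_fast_assign(
		memcpy(__get_dynamic_array(temps), temps, len * sizeof(int));
		__entry->len = len;
	),
	TP_printk("temps=%s",
		  __print_array(__get_dynamic_array(temps), __entry->len,
				sizeof(int)))
);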
For fixed size arrays, formatting of the elements can be open-coded, but this gets cumbersome for arrays of non-trivial size. Both workarounds result in non-standard content of the event format description delivered to userspace, so userland tools need to be taught to understand and parse each array printing method individually. This patch implements a __print_array() helper that tracepoint implementations can use instead of reinventing it. A simple C-style syntax is used to delimit the array and its elements {like,this}. So that the helpers can be used with large static arrays as well as dynamic arrays, they take a pointer and element count; for dynamic arrays they can be combined with __get_dynamic_array(). Link: http://lkml.kernel.org/r/1422449335-8289-2-git-send-email-javi.merino@arm.com Cc: Ingo Molnar Signed-off-by: Dave Martin Signed-off-by: Javi Merino Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 4 ++++ include/trace/ftrace.h | 9 +++++++++ kernel/trace/trace_output.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 0bebb5c348b8..5aa4a9269547 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -44,6 +44,10 @@ const char *ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, const char *ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int len); +const char *ftrace_print_array_seq(struct trace_seq *p, + const void *buf, int buf_len, + size_t el_size); + struct trace_iterator; struct trace_event; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 139b5067345b..304901fc5f34 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -263,6 +263,14 @@ #undef __print_hex #define __print_hex(buf, buf_len) ftrace_print_hex_seq(p, buf, buf_len) +#undef __print_array +#define __print_array(array, count, el_size) \ + ({ \ + BUILD_BUG_ON(el_size != 1 && el_size != 2 && \ + el_size != 4 && el_size != 8); \ + ftrace_print_array_seq(p, array, count, el_size); \ + }) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ static notrace enum print_line_t \ @@ -674,6 +682,7 @@ static inline void ftrace_test_probe_##call(void) \ #undef __get_dynamic_array_len #undef __get_str #undef __get_bitmask #undef __print_array #undef TP_printk #define TP_printk(fmt, args...)
"\"" fmt "\", " __stringify(args) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index b77b9a697619..692bf7184c8c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -177,6 +177,50 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) } EXPORT_SYMBOL(ftrace_print_hex_seq); +const char * +ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len, + size_t el_size) +{ + const char *ret = trace_seq_buffer_ptr(p); + const char *prefix = ""; + void *ptr = (void *)buf; + + trace_seq_putc(p, '{'); + + while (ptr < buf + buf_len) { + switch (el_size) { + case 1: + trace_seq_printf(p, "%s0x%x", prefix, + *(u8 *)ptr); + break; + case 2: + trace_seq_printf(p, "%s0x%x", prefix, + *(u16 *)ptr); + break; + case 4: + trace_seq_printf(p, "%s0x%x", prefix, + *(u32 *)ptr); + break; + case 8: + trace_seq_printf(p, "%s0x%llx", prefix, + *(u64 *)ptr); + break; + default: + trace_seq_printf(p, "BAD SIZE:%zu 0x%x", el_size, + *(u8 *)ptr); + el_size = 1; + } + prefix = ","; + ptr += el_size; + } + + trace_seq_putc(p, '}'); + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_array_seq); + int ftrace_raw_output_prep(struct trace_iterator *iter, struct trace_event *trace_event) { -- cgit v1.2.3 From 8c7dd8bce05345ca5fe249b64782e8feeb3b9259 Mon Sep 17 00:00:00 2001 From: James Ban Date: Wed, 28 Jan 2015 09:28:08 +0900 Subject: regulator: da9211: Add gpio control for enable/disable of buck This is a patch for adding gpio control about enable/disable of buck. Signed-off-by: James Ban Signed-off-by: Mark Brown --- Documentation/devicetree/bindings/regulator/da9211.txt | 7 ++++++- drivers/regulator/da9211-regulator.c | 12 ++++++++++++ include/linux/regulator/da9211.h | 1 + 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/regulator/da9211.txt b/Documentation/devicetree/bindings/regulator/da9211.txt index 240019a82f9a..eb618907c7de 100644 --- a/Documentation/devicetree/bindings/regulator/da9211.txt +++ b/Documentation/devicetree/bindings/regulator/da9211.txt @@ -11,6 +11,7 @@ Required properties: BUCKA and BUCKB. Optional properties: +- enable-gpios: platform gpio for control of BUCKA/BUCKB. 
- Any optional property defined in regulator.txt Example 1) DA9211 @@ -27,6 +28,7 @@ Example 1) DA9211 regulator-max-microvolt = <1570000>; regulator-min-microamp = <2000000>; regulator-max-microamp = <5000000>; + enable-gpios = <&gpio 27 0>; }; BUCKB { regulator-name = "VBUCKB"; @@ -34,11 +36,12 @@ Example 1) DA9211 regulator-max-microvolt = <1570000>; regulator-min-microamp = <2000000>; regulator-max-microamp = <5000000>; + enable-gpios = <&gpio 17 0>; }; }; }; -Example 2) DA92113 +Example 2) DA9213 pmic: da9213@68 { compatible = "dlg,da9213"; reg = <0x68>; @@ -51,6 +54,7 @@ Example 2) DA92113 regulator-max-microvolt = <1570000>; regulator-min-microamp = <3000000>; regulator-max-microamp = <6000000>; + enable-gpios = <&gpio 27 0>; }; BUCKB { regulator-name = "VBUCKB"; @@ -58,6 +62,7 @@ Example 2) DA92113 regulator-max-microvolt = <1570000>; regulator-min-microamp = <3000000>; regulator-max-microamp = <6000000>; + enable-gpios = <&gpio 17 0>; }; }; }; diff --git a/drivers/regulator/da9211-regulator.c b/drivers/regulator/da9211-regulator.c index 8e6957c63a69..01343419555e 100644 --- a/drivers/regulator/da9211-regulator.c +++ b/drivers/regulator/da9211-regulator.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include "da9211-regulator.h" @@ -277,6 +278,9 @@ static struct da9211_pdata *da9211_parse_regulators_dt( pdata->init_data[n] = da9211_matches[i].init_data; pdata->reg_node[n] = da9211_matches[i].of_node; + pdata->gpio_ren[n] = + of_get_named_gpio(da9211_matches[i].of_node, + "enable-gpios", 0); n++; } @@ -366,6 +370,14 @@ static int da9211_regulator_init(struct da9211 *chip) config.regmap = chip->regmap; config.of_node = chip->pdata->reg_node[i]; + if (gpio_is_valid(chip->pdata->gpio_ren[i])) { + config.ena_gpio = chip->pdata->gpio_ren[i]; + config.ena_gpio_initialized = true; + } else { + config.ena_gpio = -EINVAL; + config.ena_gpio_initialized = false; + } + chip->rdev[i] = devm_regulator_register(chip->dev, &da9211_regulators[i], &config); if (IS_ERR(chip->rdev[i])) { diff --git a/include/linux/regulator/da9211.h b/include/linux/regulator/da9211.h index d1d9d3849bdb..5dd65acc2a69 100644 --- a/include/linux/regulator/da9211.h +++ b/include/linux/regulator/da9211.h @@ -32,6 +32,7 @@ struct da9211_pdata { * 2 : 2 phase 2 buck */ int num_buck; + int gpio_ren[DA9211_MAX_REGULATORS]; struct device_node *reg_node[DA9211_MAX_REGULATORS]; struct regulator_init_data *init_data[DA9211_MAX_REGULATORS]; }; -- cgit v1.2.3 From 018d5ef2048fcab339467bcbebccf588c9bd2531 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 29 Jan 2015 08:30:29 +0900 Subject: ata: ahci_platform: fix owner module reference mismatch for scsi host The owner module reference of the ahci platform's scsi_host is initialized to libahci_platform's one, because these drivers use a scsi_host_template defined in libahci_platform. So these drivers can be unloaded even if the scsi device is being accessed. This fixes it by pushing the scsi_host_template from libahci_platform to all leaf drivers. The scsi_host_template is passed through a new argument of ahci_platform_init_host(). Signed-off-by: Akinobu Mita Signed-off-by: Tejun Heo Cc: Hans de Goede Cc: Christoph Hellwig Cc: "James E.J. 
Bottomley" Cc: linux-ide@vger.kernel.org Cc: linux-scsi@vger.kernel.org --- drivers/ata/ahci.h | 4 ++++ drivers/ata/ahci_da850.c | 11 +++++++++-- drivers/ata/ahci_imx.c | 11 +++++++++-- drivers/ata/ahci_mvebu.c | 11 +++++++++-- drivers/ata/ahci_platform.c | 11 +++++++++-- drivers/ata/ahci_st.c | 11 +++++++++-- drivers/ata/ahci_sunxi.c | 11 +++++++++-- drivers/ata/ahci_tegra.c | 11 +++++++++-- drivers/ata/ahci_xgene.c | 11 +++++++++-- drivers/ata/libahci_platform.c | 10 ++++------ include/linux/ahci_platform.h | 4 +++- include/linux/libata.h | 6 ++++++ 12 files changed, 89 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h index 275358ae0b3f..71262e08648e 100644 --- a/drivers/ata/ahci.h +++ b/drivers/ata/ahci.h @@ -354,6 +354,10 @@ extern int ahci_ignore_sss; extern struct device_attribute *ahci_shost_attrs[]; extern struct device_attribute *ahci_sdev_attrs[]; +/* + * This must be instantiated by the edge drivers. Read the comments + * for ATA_BASE_SHT + */ #define AHCI_SHT(drv_name) \ ATA_NCQ_SHT(drv_name), \ .can_queue = AHCI_MAX_CMDS - 1, \ diff --git a/drivers/ata/ahci_da850.c b/drivers/ata/ahci_da850.c index ce8a7a6d6c7f..267a3d3e79f4 100644 --- a/drivers/ata/ahci_da850.c +++ b/drivers/ata/ahci_da850.c @@ -16,6 +16,8 @@ #include #include "ahci.h" +#define DRV_NAME "ahci_da850" + /* SATA PHY Control Register offset from AHCI base */ #define SATA_P0PHYCR_REG 0x178 @@ -59,6 +61,10 @@ static const struct ata_port_info ahci_da850_port_info = { .port_ops = &ahci_platform_ops, }; +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int ahci_da850_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -85,7 +91,8 @@ static int ahci_da850_probe(struct platform_device *pdev) da850_sata_init(dev, pwrdn_reg, hpriv->mmio); - rc = ahci_platform_init_host(pdev, hpriv, &ahci_da850_port_info); + rc = ahci_platform_init_host(pdev, hpriv, &ahci_da850_port_info, + &ahci_platform_sht); if (rc) goto disable_resources; @@ -102,7 +109,7 @@ static struct platform_driver ahci_da850_driver = { .probe = ahci_da850_probe, .remove = ata_platform_remove_one, .driver = { - .name = "ahci_da850", + .name = DRV_NAME, .pm = &ahci_da850_pm_ops, }, }; diff --git a/drivers/ata/ahci_imx.c b/drivers/ata/ahci_imx.c index 41632e57d46f..3f3a7db208ae 100644 --- a/drivers/ata/ahci_imx.c +++ b/drivers/ata/ahci_imx.c @@ -28,6 +28,8 @@ #include #include "ahci.h" +#define DRV_NAME "ahci-imx" + enum { /* Timer 1-ms Register */ IMX_TIMER1MS = 0x00e0, @@ -520,6 +522,10 @@ static u32 imx_ahci_parse_props(struct device *dev, return reg_value; } +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int imx_ahci_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -616,7 +622,8 @@ static int imx_ahci_probe(struct platform_device *pdev) reg_val = clk_get_rate(imxpriv->ahb_clk) / 1000; writel(reg_val, hpriv->mmio + IMX_TIMER1MS); - ret = ahci_platform_init_host(pdev, hpriv, &ahci_imx_port_info); + ret = ahci_platform_init_host(pdev, hpriv, &ahci_imx_port_info, + &ahci_platform_sht); if (ret) goto disable_sata; @@ -674,7 +681,7 @@ static struct platform_driver imx_ahci_driver = { .probe = imx_ahci_probe, .remove = ata_platform_remove_one, .driver = { - .name = "ahci-imx", + .name = DRV_NAME, .of_match_table = imx_ahci_of_match, .pm = &ahci_imx_pm_ops, }, diff --git a/drivers/ata/ahci_mvebu.c b/drivers/ata/ahci_mvebu.c index 64bb08432b69..23716dd8a7ec 100644 --- 
a/drivers/ata/ahci_mvebu.c +++ b/drivers/ata/ahci_mvebu.c @@ -19,6 +19,8 @@ #include #include "ahci.h" +#define DRV_NAME "ahci-mvebu" + #define AHCI_VENDOR_SPECIFIC_0_ADDR 0xa0 #define AHCI_VENDOR_SPECIFIC_0_DATA 0xa4 @@ -67,6 +69,10 @@ static const struct ata_port_info ahci_mvebu_port_info = { .port_ops = &ahci_platform_ops, }; +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int ahci_mvebu_probe(struct platform_device *pdev) { struct ahci_host_priv *hpriv; @@ -88,7 +94,8 @@ static int ahci_mvebu_probe(struct platform_device *pdev) ahci_mvebu_mbus_config(hpriv, dram); ahci_mvebu_regret_option(hpriv); - rc = ahci_platform_init_host(pdev, hpriv, &ahci_mvebu_port_info); + rc = ahci_platform_init_host(pdev, hpriv, &ahci_mvebu_port_info, + &ahci_platform_sht); if (rc) goto disable_resources; @@ -114,7 +121,7 @@ static struct platform_driver ahci_mvebu_driver = { .probe = ahci_mvebu_probe, .remove = ata_platform_remove_one, .driver = { - .name = "ahci-mvebu", + .name = DRV_NAME, .of_match_table = ahci_mvebu_of_match, }, }; diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c index 18d539837045..78d6ae0b90c4 100644 --- a/drivers/ata/ahci_platform.c +++ b/drivers/ata/ahci_platform.c @@ -22,6 +22,8 @@ #include #include "ahci.h" +#define DRV_NAME "ahci" + static const struct ata_port_info ahci_port_info = { .flags = AHCI_FLAG_COMMON, .pio_mask = ATA_PIO4, @@ -29,6 +31,10 @@ static const struct ata_port_info ahci_port_info = { .port_ops = &ahci_platform_ops, }; +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int ahci_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -46,7 +52,8 @@ static int ahci_probe(struct platform_device *pdev) if (of_device_is_compatible(dev->of_node, "hisilicon,hisi-ahci")) hpriv->flags |= AHCI_HFLAG_NO_FBS | AHCI_HFLAG_NO_NCQ; - rc = ahci_platform_init_host(pdev, hpriv, &ahci_port_info); + rc = ahci_platform_init_host(pdev, hpriv, &ahci_port_info, + &ahci_platform_sht); if (rc) goto disable_resources; @@ -75,7 +82,7 @@ static struct platform_driver ahci_driver = { .probe = ahci_probe, .remove = ata_platform_remove_one, .driver = { - .name = "ahci", + .name = DRV_NAME, .of_match_table = ahci_of_match, .pm = &ahci_pm_ops, }, diff --git a/drivers/ata/ahci_st.c b/drivers/ata/ahci_st.c index 2f9e8317cc16..bc971af262e7 100644 --- a/drivers/ata/ahci_st.c +++ b/drivers/ata/ahci_st.c @@ -23,6 +23,8 @@ #include "ahci.h" +#define DRV_NAME "st_ahci" + #define ST_AHCI_OOBR 0xbc #define ST_AHCI_OOBR_WE BIT(31) #define ST_AHCI_OOBR_CWMIN_SHIFT 24 @@ -140,6 +142,10 @@ static const struct ata_port_info st_ahci_port_info = { .port_ops = &st_ahci_port_ops, }; +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int st_ahci_probe(struct platform_device *pdev) { struct st_ahci_drv_data *drv_data; @@ -166,7 +172,8 @@ static int st_ahci_probe(struct platform_device *pdev) if (err) return err; - err = ahci_platform_init_host(pdev, hpriv, &st_ahci_port_info); + err = ahci_platform_init_host(pdev, hpriv, &st_ahci_port_info, + &ahci_platform_sht); if (err) { ahci_platform_disable_resources(hpriv); return err; @@ -229,7 +236,7 @@ MODULE_DEVICE_TABLE(of, st_ahci_match); static struct platform_driver st_ahci_driver = { .driver = { - .name = "st_ahci", + .name = DRV_NAME, .pm = &st_ahci_pm_ops, .of_match_table = of_match_ptr(st_ahci_match), }, diff --git a/drivers/ata/ahci_sunxi.c b/drivers/ata/ahci_sunxi.c index e2e0da539a2f..b26437430163 
100644 --- a/drivers/ata/ahci_sunxi.c +++ b/drivers/ata/ahci_sunxi.c @@ -27,6 +27,8 @@ #include #include "ahci.h" +#define DRV_NAME "ahci-sunxi" + /* Insmod parameters */ static bool enable_pmp; module_param(enable_pmp, bool, 0); @@ -169,6 +171,10 @@ static const struct ata_port_info ahci_sunxi_port_info = { .port_ops = &ahci_platform_ops, }; +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int ahci_sunxi_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -200,7 +206,8 @@ static int ahci_sunxi_probe(struct platform_device *pdev) if (!enable_pmp) hpriv->flags |= AHCI_HFLAG_NO_PMP; - rc = ahci_platform_init_host(pdev, hpriv, &ahci_sunxi_port_info); + rc = ahci_platform_init_host(pdev, hpriv, &ahci_sunxi_port_info, + &ahci_platform_sht); if (rc) goto disable_resources; @@ -251,7 +258,7 @@ static struct platform_driver ahci_sunxi_driver = { .probe = ahci_sunxi_probe, .remove = ata_platform_remove_one, .driver = { - .name = "ahci-sunxi", + .name = DRV_NAME, .of_match_table = ahci_sunxi_of_match, .pm = &ahci_sunxi_pm_ops, }, diff --git a/drivers/ata/ahci_tegra.c b/drivers/ata/ahci_tegra.c index 032904402c95..3a62eb246d80 100644 --- a/drivers/ata/ahci_tegra.c +++ b/drivers/ata/ahci_tegra.c @@ -31,6 +31,8 @@ #include "ahci.h" +#define DRV_NAME "tegra-ahci" + #define SATA_CONFIGURATION_0 0x180 #define SATA_CONFIGURATION_EN_FPCI BIT(0) @@ -289,6 +291,10 @@ static const struct of_device_id tegra_ahci_of_match[] = { }; MODULE_DEVICE_TABLE(of, tegra_ahci_of_match); +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int tegra_ahci_probe(struct platform_device *pdev) { struct ahci_host_priv *hpriv; @@ -354,7 +360,8 @@ static int tegra_ahci_probe(struct platform_device *pdev) if (ret) return ret; - ret = ahci_platform_init_host(pdev, hpriv, &ahci_tegra_port_info); + ret = ahci_platform_init_host(pdev, hpriv, &ahci_tegra_port_info, + &ahci_platform_sht); if (ret) goto deinit_controller; @@ -370,7 +377,7 @@ static struct platform_driver tegra_ahci_driver = { .probe = tegra_ahci_probe, .remove = ata_platform_remove_one, .driver = { - .name = "tegra-ahci", + .name = DRV_NAME, .of_match_table = tegra_ahci_of_match, }, /* LP0 suspend support not implemented */ diff --git a/drivers/ata/ahci_xgene.c b/drivers/ata/ahci_xgene.c index 7f6887535c1e..e3b8750e8e9d 100644 --- a/drivers/ata/ahci_xgene.c +++ b/drivers/ata/ahci_xgene.c @@ -30,6 +30,8 @@ #include #include "ahci.h" +#define DRV_NAME "xgene-ahci" + /* Max # of disk per a controller */ #define MAX_AHCI_CHN_PERCTR 2 @@ -621,6 +623,10 @@ static int xgene_ahci_mux_select(struct xgene_ahci_context *ctx) return val & CFG_SATA_ENET_SELECT_MASK ? 
-1 : 0; } +static struct scsi_host_template ahci_platform_sht = { + AHCI_SHT(DRV_NAME), +}; + static int xgene_ahci_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -698,7 +704,8 @@ static int xgene_ahci_probe(struct platform_device *pdev) skip_clk_phy: hpriv->flags = AHCI_HFLAG_NO_PMP | AHCI_HFLAG_NO_NCQ; - rc = ahci_platform_init_host(pdev, hpriv, &xgene_ahci_port_info); + rc = ahci_platform_init_host(pdev, hpriv, &xgene_ahci_port_info, + &ahci_platform_sht); if (rc) goto disable_resources; @@ -720,7 +727,7 @@ static struct platform_driver xgene_ahci_driver = { .probe = xgene_ahci_probe, .remove = ata_platform_remove_one, .driver = { - .name = "xgene-ahci", + .name = DRV_NAME, .of_match_table = xgene_ahci_of_match, }, }; diff --git a/drivers/ata/libahci_platform.c b/drivers/ata/libahci_platform.c index 504d534ccbfe..077c7a261354 100644 --- a/drivers/ata/libahci_platform.c +++ b/drivers/ata/libahci_platform.c @@ -35,10 +35,6 @@ struct ata_port_operations ahci_platform_ops = { }; EXPORT_SYMBOL_GPL(ahci_platform_ops); -static struct scsi_host_template ahci_platform_sht = { - AHCI_SHT("ahci_platform"), -}; - /** * ahci_platform_enable_phys - Enable PHYs * @hpriv: host private area to store config values @@ -494,6 +490,7 @@ EXPORT_SYMBOL_GPL(ahci_platform_get_resources); * @pdev: platform device pointer for the host * @hpriv: ahci-host private data for the host * @pi_template: template for the ata_port_info to use + * @sht: scsi_host_template to use when registering * * This function does all the usual steps needed to bring up an * ahci-platform host, note any necessary resources (ie clks, phys, etc.) @@ -504,7 +501,8 @@ EXPORT_SYMBOL_GPL(ahci_platform_get_resources); */ int ahci_platform_init_host(struct platform_device *pdev, struct ahci_host_priv *hpriv, - const struct ata_port_info *pi_template) + const struct ata_port_info *pi_template, + struct scsi_host_template *sht) { struct device *dev = &pdev->dev; struct ata_port_info pi = *pi_template; @@ -588,7 +586,7 @@ int ahci_platform_init_host(struct platform_device *pdev, ahci_init_controller(host); ahci_print_info(host, "platform"); - return ahci_host_activate(host, irq, &ahci_platform_sht); + return ahci_host_activate(host, irq, sht); } EXPORT_SYMBOL_GPL(ahci_platform_init_host); diff --git a/include/linux/ahci_platform.h b/include/linux/ahci_platform.h index f65b33809170..a270f25ee7c7 100644 --- a/include/linux/ahci_platform.h +++ b/include/linux/ahci_platform.h @@ -21,6 +21,7 @@ struct device; struct ata_port_info; struct ahci_host_priv; struct platform_device; +struct scsi_host_template; int ahci_platform_enable_clks(struct ahci_host_priv *hpriv); void ahci_platform_disable_clks(struct ahci_host_priv *hpriv); @@ -32,7 +33,8 @@ struct ahci_host_priv *ahci_platform_get_resources( struct platform_device *pdev); int ahci_platform_init_host(struct platform_device *pdev, struct ahci_host_priv *hpriv, - const struct ata_port_info *pi_template); + const struct ata_port_info *pi_template, + struct scsi_host_template *sht); int ahci_platform_suspend_host(struct device *dev); int ahci_platform_resume_host(struct device *dev); diff --git a/include/linux/libata.h b/include/linux/libata.h index 2d182413b1db..11beb4196c32 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1338,6 +1338,12 @@ extern const struct ata_port_operations ata_base_port_ops; extern const struct ata_port_operations sata_port_ops; extern struct device_attribute *ata_common_sdev_attrs[]; +/* + * All sht initializers (BASE, PIO, BMDMA, 
NCQ) must be instantiated + by the edge drivers. Because the 'module' field of sht must be the + edge driver's module reference, otherwise the driver can be unloaded + even if the scsi_device is being accessed. + */ #define ATA_BASE_SHT(drv_name) \ .module = THIS_MODULE, \ .name = drv_name, \ -- cgit v1.2.3 From 17263905399471016cda6c1975044d14291c5ba5 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 29 Jan 2015 08:30:30 +0900 Subject: ata: pata_platform: fix owner module reference mismatch for scsi host The owner module reference of the pata_of_platform's scsi_host is initialized to that of pata_platform, because the pata_of_platform driver uses a scsi_host_template defined in pata_platform. So this driver can be unloaded even if the scsi device is being accessed. This fixes it by propagating the scsi_host_template to the pata_of_platform driver. The scsi_host_template is passed through a new argument of __pata_platform_probe(). Signed-off-by: Akinobu Mita Signed-off-by: Tejun Heo Cc: Hans de Goede Cc: Christoph Hellwig Cc: "James E.J. Bottomley" Cc: linux-ide@vger.kernel.org Cc: linux-scsi@vger.kernel.org --- drivers/ata/pata_of_platform.c | 10 ++++++++-- drivers/ata/pata_platform.c | 8 +++++--- include/linux/ata_platform.h | 5 ++++- 3 files changed, 17 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/pata_of_platform.c b/drivers/ata/pata_of_platform.c index dcc408abe171..b6b7af894d9d 100644 --- a/drivers/ata/pata_of_platform.c +++ b/drivers/ata/pata_of_platform.c @@ -16,6 +16,12 @@ #include #include +#define DRV_NAME "pata_of_platform" + +static struct scsi_host_template pata_platform_sht = { + ATA_PIO_SHT(DRV_NAME), +}; + static int pata_of_platform_probe(struct platform_device *ofdev) { int ret; @@ -63,7 +69,7 @@ static int pata_of_platform_probe(struct platform_device *ofdev) pio_mask |= (1 << pio_mode) - 1; return __pata_platform_probe(&ofdev->dev, &io_res, &ctl_res, irq_res, - reg_shift, pio_mask); + reg_shift, pio_mask, &pata_platform_sht); } static struct of_device_id pata_of_platform_match[] = { @@ -74,7 +80,7 @@ MODULE_DEVICE_TABLE(of, pata_of_platform_match); static struct platform_driver pata_of_platform_driver = { .driver = { - .name = "pata_of_platform", + .name = DRV_NAME, .of_match_table = pata_of_platform_match, }, .probe = pata_of_platform_probe, diff --git a/drivers/ata/pata_platform.c b/drivers/ata/pata_platform.c index 1eedfe46d7c8..c503ded87bb8 100644 --- a/drivers/ata/pata_platform.c +++ b/drivers/ata/pata_platform.c @@ -78,6 +78,7 @@ static void pata_platform_setup_port(struct ata_ioports *ioaddr, * @irq_res: Resource representing IRQ and its flags * @ioport_shift: I/O port shift * @__pio_mask: PIO mask + * @sht: scsi_host_template to use when registering * * Register a platform bus IDE interface. Such interfaces are PIO and we * assume do not support IRQ sharing. @@ -99,7 +100,8 @@ static void pata_platform_setup_port(struct ata_ioports *ioaddr, */ int __pata_platform_probe(struct device *dev, struct resource *io_res, struct resource *ctl_res, struct resource *irq_res, - unsigned int ioport_shift, int __pio_mask) + unsigned int ioport_shift, int __pio_mask, + struct scsi_host_template *sht) { struct ata_host *host; struct ata_port *ap; @@ -170,7 +172,7 @@ int __pata_platform_probe(struct device *dev, struct resource *io_res, /* activate */ return ata_host_activate(host, irq, irq ?
ata_sff_interrupt : NULL, - irq_flags, &pata_platform_sht); + irq_flags, sht); } EXPORT_SYMBOL_GPL(__pata_platform_probe); @@ -216,7 +218,7 @@ static int pata_platform_probe(struct platform_device *pdev) return __pata_platform_probe(&pdev->dev, io_res, ctl_res, irq_res, pp_info ? pp_info->ioport_shift : 0, - pio_mask); + pio_mask, &pata_platform_sht); } static struct platform_driver pata_platform_driver = { diff --git a/include/linux/ata_platform.h b/include/linux/ata_platform.h index 5c618a084225..619d9e78e644 100644 --- a/include/linux/ata_platform.h +++ b/include/linux/ata_platform.h @@ -10,12 +10,15 @@ struct pata_platform_info { unsigned int ioport_shift; }; +struct scsi_host_template; + extern int __pata_platform_probe(struct device *dev, struct resource *io_res, struct resource *ctl_res, struct resource *irq_res, unsigned int ioport_shift, - int __pio_mask); + int __pio_mask, + struct scsi_host_template *sht); /* * Marvell SATA private data -- cgit v1.2.3 From 61f552141c9c0e88b3fdc7046265781ffd8fa68a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 18 Jan 2015 16:45:42 +0100 Subject: ftrace: let notrace function attribute disable hotpatching if necessary gcc supports an s390 specific function attribute called "hotpatch". It can be used to specify the number of halfwords that shall be added before and after a function and which shall be filled with nops for runtime patching. s390 will use the hotpatch attribute for function tracing, therefore make sure that the notrace function attribute either disables the mcount call or, in the hotpatch case, the nop generation. Acked-by: Steven Rostedt Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- include/linux/compiler.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index d5ad7b1118fc..1ef679f4b88e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -54,7 +54,11 @@ extern void __chk_io_ptr(const volatile void __iomem *); #include #endif +#ifdef CC_USING_HOTPATCH +#define notrace __attribute__((hotpatch(0,0))) +#else #define notrace __attribute__((no_instrument_function)) +#endif /* Intel compiler defines __GNUC__. So we will overwrite implementations * coming from above header files here -- cgit v1.2.3 From 3c313161353ad527de1a6ecde0f0d41700858fd6 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Sat, 24 Jan 2015 18:47:20 +0100 Subject: bcma: detect SPROM revision 11 Extracting values from it is still unsupported, but at least we'll display some meaningful error now.
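Returning briefly to the compiler.h change in the notrace commit above: as a hedged sketch (the function name is illustrative), the attribute keeps a function untraced under either expansion:

/* With CC_USING_HOTPATCH (s390, new enough gcc) notrace expands to
 * __attribute__((hotpatch(0,0))): no nop pad is emitted, so the
 * function can never be redirected to ftrace at runtime. Elsewhere it
 * still expands to no_instrument_function, suppressing the mcount
 * call. Either way the function stays untraced. */
static notrace void my_uninstrumented_helper(void)
{
}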
Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- drivers/bcma/sprom.c | 3 ++- include/linux/ssb/ssb_regs.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/bcma/sprom.c b/drivers/bcma/sprom.c index efb037f9c98a..206edd3ba668 100644 --- a/drivers/bcma/sprom.c +++ b/drivers/bcma/sprom.c @@ -579,7 +579,8 @@ int bcma_sprom_get(struct bcma_bus *bus) u16 offset = BCMA_CC_SPROM; u16 *sprom; size_t sprom_sizes[] = { SSB_SPROMSIZE_WORDS_R4, - SSB_SPROMSIZE_WORDS_R10, }; + SSB_SPROMSIZE_WORDS_R10, + SSB_SPROMSIZE_WORDS_R11, }; int i, err = 0; if (!bus->drv_cc.core) diff --git a/include/linux/ssb/ssb_regs.h b/include/linux/ssb/ssb_regs.h index f7b9100686c3..c0f707ac192b 100644 --- a/include/linux/ssb/ssb_regs.h +++ b/include/linux/ssb/ssb_regs.h @@ -173,6 +173,7 @@ #define SSB_SPROMSIZE_BYTES_R123 (SSB_SPROMSIZE_WORDS_R123 * sizeof(u16)) #define SSB_SPROMSIZE_BYTES_R4 (SSB_SPROMSIZE_WORDS_R4 * sizeof(u16)) #define SSB_SPROMSIZE_WORDS_R10 230 +#define SSB_SPROMSIZE_WORDS_R11 234 #define SSB_SPROM_BASE1 0x1000 #define SSB_SPROM_BASE31 0x0800 #define SSB_SPROM_REVISION 0x007E -- cgit v1.2.3 From b504075f5903b969a54ef3a6ae994c0872edb259 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Sun, 25 Jan 2015 11:11:14 +0100 Subject: bcma: add early_init function for PCIe core and move some fixes into it There are some PCIe core fixes that need to be applied before accessing SPROM, otherwise reading it may fail. Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- drivers/bcma/driver_pci.c | 66 ++++++++++++++++++++++++------------ drivers/bcma/main.c | 7 ++++ include/linux/bcma/bcma_driver_pci.h | 2 ++ 3 files changed, 53 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/bcma/driver_pci.c b/drivers/bcma/driver_pci.c index b85a505603e6..786666488a2d 100644 --- a/drivers/bcma/driver_pci.c +++ b/drivers/bcma/driver_pci.c @@ -144,6 +144,47 @@ static u16 bcma_pcie_mdio_writeread(struct bcma_drv_pci *pc, u16 device, return bcma_pcie_mdio_read(pc, device, address); } +/************************************************** + * Early init. + **************************************************/ + +static void bcma_core_pci_fixcfg(struct bcma_drv_pci *pc) +{ + struct bcma_device *core = pc->core; + u16 val16, core_index; + uint regoff; + + regoff = BCMA_CORE_PCI_SPROM(BCMA_CORE_PCI_SPROM_PI_OFFSET); + core_index = (u16)core->core_index; + + val16 = pcicore_read16(pc, regoff); + if (((val16 & BCMA_CORE_PCI_SPROM_PI_MASK) >> BCMA_CORE_PCI_SPROM_PI_SHIFT) + != core_index) { + val16 = (core_index << BCMA_CORE_PCI_SPROM_PI_SHIFT) | + (val16 & ~BCMA_CORE_PCI_SPROM_PI_MASK); + pcicore_write16(pc, regoff, val16); + } +} + +/* + * Apply some early fixes required before accessing SPROM. + * See also si_pci_fixcfg. + */ +void bcma_core_pci_early_init(struct bcma_drv_pci *pc) +{ + if (pc->early_setup_done) + return; + + pc->hostmode = bcma_core_pci_is_in_hostmode(pc); + if (pc->hostmode) + goto out; + + bcma_core_pci_fixcfg(pc); + +out: + pc->early_setup_done = true; +} + +/************************************************** + * Workarounds.
**************************************************/ @@ -175,24 +216,6 @@ static void bcma_pcicore_serdes_workaround(struct bcma_drv_pci *pc) tmp & ~BCMA_CORE_PCI_PLL_CTRL_FREQDET_EN); } -static void bcma_core_pci_fixcfg(struct bcma_drv_pci *pc) -{ - struct bcma_device *core = pc->core; - u16 val16, core_index; - uint regoff; - - regoff = BCMA_CORE_PCI_SPROM(BCMA_CORE_PCI_SPROM_PI_OFFSET); - core_index = (u16)core->core_index; - - val16 = pcicore_read16(pc, regoff); - if (((val16 & BCMA_CORE_PCI_SPROM_PI_MASK) >> BCMA_CORE_PCI_SPROM_PI_SHIFT) - != core_index) { - val16 = (core_index << BCMA_CORE_PCI_SPROM_PI_SHIFT) | - (val16 & ~BCMA_CORE_PCI_SPROM_PI_MASK); - pcicore_write16(pc, regoff, val16); - } -} - /* Fix MISC config to allow coming out of L2/L3-Ready state w/o PRST */ /* Needs to happen when coming out of 'standby'/'hibernate' */ static void bcma_core_pci_config_fixup(struct bcma_drv_pci *pc) @@ -216,7 +239,6 @@ static void bcma_core_pci_config_fixup(struct bcma_drv_pci *pc) static void bcma_core_pci_clientmode_init(struct bcma_drv_pci *pc) { - bcma_core_pci_fixcfg(pc); bcma_pcicore_serdes_workaround(pc); bcma_core_pci_config_fixup(pc); } @@ -226,11 +248,11 @@ void bcma_core_pci_init(struct bcma_drv_pci *pc) if (pc->setup_done) return; - pc->hostmode = bcma_core_pci_is_in_hostmode(pc); + bcma_core_pci_early_init(pc); + if (pc->hostmode) bcma_core_pci_hostmode_init(pc); - - if (!pc->hostmode) + else bcma_core_pci_clientmode_init(pc); } diff --git a/drivers/bcma/main.c b/drivers/bcma/main.c index 73b2ee3de972..38bde6eab8a4 100644 --- a/drivers/bcma/main.c +++ b/drivers/bcma/main.c @@ -402,6 +402,13 @@ int bcma_bus_register(struct bcma_bus *bus) bcma_core_chipcommon_early_init(&bus->drv_cc); } + /* Early init PCIE core */ + core = bcma_find_core(bus, BCMA_CORE_PCIE); + if (core) { + bus->drv_pci[0].core = core; + bcma_core_pci_early_init(&bus->drv_pci[0]); + } + /* Cores providing flash access go before SPROM init */ list_for_each_entry(core, &bus->cores, list) { if (bcma_is_core_needed_early(core->id.id)) diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h index 0333e605ea0d..3f809ae372c4 100644 --- a/include/linux/bcma/bcma_driver_pci.h +++ b/include/linux/bcma/bcma_driver_pci.h @@ -223,6 +223,7 @@ struct bcma_drv_pci_host { struct bcma_drv_pci { struct bcma_device *core; + u8 early_setup_done:1; u8 setup_done:1; u8 hostmode:1; @@ -237,6 +238,7 @@ struct bcma_drv_pci { #define pcicore_write16(pc, offset, val) bcma_write16((pc)->core, offset, val) #define pcicore_write32(pc, offset, val) bcma_write32((pc)->core, offset, val) +extern void bcma_core_pci_early_init(struct bcma_drv_pci *pc); extern void bcma_core_pci_init(struct bcma_drv_pci *pc); extern int bcma_core_pci_irq_ctl(struct bcma_drv_pci *pc, struct bcma_device *core, bool enable); -- cgit v1.2.3 From 8be08a39d498d5d93ff5149276e34ccb4ec3757f Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Sun, 25 Jan 2015 13:41:19 +0100 Subject: bcma: implement host code support for PCIe Gen 2 devices This is still incomplete, so we don't add PCI IDs of new devices yet. The purpose of this patch is to allow testing and adjusting the rest of the code.
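A hedged sketch of how the new flag might eventually be set by the host probe code (the PCI device ID 0x43b1 and the driver_data value are placeholders; this patch deliberately adds no IDs):

/* Hypothetical table entry: driver_data marks the device as Gen 2. */
static const struct pci_device_id bcma_pci_bridge_tbl[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_BROADCOM, 0x43b1), .driver_data = 1 },
	{ 0, },
};

/* ...and in bcma_host_pci_probe(), before scanning the bus: */
bus->host_is_pcie2 = !!id->driver_data;

bcma_host_pci_switch_core() then selects BCMA_PCIE2_BAR0_WIN2 instead of BCMA_PCI_BAR0_WIN2, as the diff below shows.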
Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- drivers/bcma/host_pci.c | 6 ++++-- include/linux/bcma/bcma.h | 1 + include/linux/bcma/bcma_regs.h | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c index cd9161a8b3a1..53c6a8a58859 100644 --- a/drivers/bcma/host_pci.c +++ b/drivers/bcma/host_pci.c @@ -13,10 +13,12 @@ static void bcma_host_pci_switch_core(struct bcma_device *core) { + int win2 = core->bus->host_is_pcie2 ? + BCMA_PCIE2_BAR0_WIN2 : BCMA_PCI_BAR0_WIN2; + pci_write_config_dword(core->bus->host_pci, BCMA_PCI_BAR0_WIN, core->addr); - pci_write_config_dword(core->bus->host_pci, BCMA_PCI_BAR0_WIN2, - core->wrap); + pci_write_config_dword(core->bus->host_pci, win2, core->wrap); core->bus->mapped_core = core; bcma_debug(core->bus, "Switched to core: 0x%X\n", core->id.id); } diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index eb1c6a47b67f..994739da827f 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -318,6 +318,7 @@ struct bcma_bus { const struct bcma_host_ops *ops; enum bcma_hosttype hosttype; + bool host_is_pcie2; /* Used for BCMA_HOSTTYPE_PCI only */ union { /* Pointer to the PCI bus (only for BCMA_HOSTTYPE_PCI) */ struct pci_dev *host_pci; diff --git a/include/linux/bcma/bcma_regs.h b/include/linux/bcma/bcma_regs.h index e64ae7bf80a1..ebd5c1fcdea4 100644 --- a/include/linux/bcma/bcma_regs.h +++ b/include/linux/bcma/bcma_regs.h @@ -64,6 +64,8 @@ #define BCMA_PCI_GPIO_XTAL 0x40 /* PCI config space GPIO 14 for Xtal powerup */ #define BCMA_PCI_GPIO_PLL 0x80 /* PCI config space GPIO 15 for PLL powerdown */ +#define BCMA_PCIE2_BAR0_WIN2 0x70 + /* SiliconBackplane Address Map. * All regions may not exist on all chips. */ -- cgit v1.2.3 From 0501be6429e4eb02f417ad83eacd84b8c57b0283 Mon Sep 17 00:00:00 2001 From: Alexey Skidanov Date: Thu, 29 Jan 2015 10:49:43 +0200 Subject: mmc: Resolve BKOPS compatibility issue This patch fixes a compatibility issue with the BKOPS_EN field of EXT_CSD. In eMMC-5.1, BKOPS_EN was changed, and now it has two operational bits: Bit 0 - MANUAL_EN Bit 1 - AUTO_EN In previous eMMC revisions, only Bit 0 was supported.
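As a hedged sketch of the decode (the MANUAL mask matches the patch below; the AUTO mask value 0x02 simply follows from "Bit 1" in the text and is not defined by this patch):

/* Inside mmc_decode_ext_csd(), roughly: */
u8 bkops_en = ext_csd[EXT_CSD_BKOPS_EN];
bool man_bkops_en  = bkops_en & EXT_CSD_MANUAL_BKOPS_MASK;	/* Bit 0 */
bool auto_bkops_en = bkops_en & 0x02;				/* Bit 1: AUTO_EN */

Only the manual bit gates mmc_start_bkops(), which is exactly what the core.c hunk below switches to.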
Signed-off-by: Alexey Skidanov Signed-off-by: Ulf Hansson --- drivers/mmc/core/core.c | 2 +- drivers/mmc/core/mmc.c | 8 +++++--- include/linux/mmc/card.h | 2 +- include/linux/mmc/mmc.h | 5 +++++ 4 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 1be7055548cb..0dc64e6e00d4 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -275,7 +275,7 @@ void mmc_start_bkops(struct mmc_card *card, bool from_exception) BUG_ON(!card); - if (!card->ext_csd.bkops_en || mmc_card_doing_bkops(card)) + if (!card->ext_csd.man_bkops_en || mmc_card_doing_bkops(card)) return; err = mmc_read_bkops_status(card); diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 1fc48a280659..1d41e8541f38 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -483,11 +483,13 @@ static int mmc_decode_ext_csd(struct mmc_card *card, u8 *ext_csd) /* check whether the eMMC card supports BKOPS */ if (ext_csd[EXT_CSD_BKOPS_SUPPORT] & 0x1) { card->ext_csd.bkops = 1; - card->ext_csd.bkops_en = ext_csd[EXT_CSD_BKOPS_EN]; + card->ext_csd.man_bkops_en = + (ext_csd[EXT_CSD_BKOPS_EN] & + EXT_CSD_MANUAL_BKOPS_MASK); card->ext_csd.raw_bkops_status = ext_csd[EXT_CSD_BKOPS_STATUS]; - if (!card->ext_csd.bkops_en) - pr_info("%s: BKOPS_EN bit is not set\n", + if (!card->ext_csd.man_bkops_en) + pr_info("%s: MAN_BKOPS_EN bit is not set\n", mmc_hostname(card->host)); } diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 4d69c00497bd..a6cf4c063e4e 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -83,7 +83,7 @@ struct mmc_ext_csd { bool hpi; /* HPI support bit */ unsigned int hpi_cmd; /* cmd used as HPI */ bool bkops; /* background support bit */ - bool bkops_en; /* background enable bit */ + bool man_bkops_en; /* manual bkops enable bit */ unsigned int data_sector_size; /* 512 bytes or 4KB */ unsigned int data_tag_unit_size; /* DATA TAG UNIT size */ unsigned int boot_ro_lock; /* ro lock support */ diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h index fb97b5cc91cd..124f562118b8 100644 --- a/include/linux/mmc/mmc.h +++ b/include/linux/mmc/mmc.h @@ -427,6 +427,11 @@ struct _mmc_csd { */ #define EXT_CSD_BKOPS_LEVEL_2 0x2 +/* + * BKOPS modes + */ +#define EXT_CSD_MANUAL_BKOPS_MASK 0x01 + /* * MMC_SWITCH access modes */ -- cgit v1.2.3 From 33692f27597fcab536d7cbbcc8f52905133e4aa7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 29 Jan 2015 10:51:32 -0800 Subject: vm: add VM_FAULT_SIGSEGV handling support The core VM already knows about VM_FAULT_SIGBUS, but cannot return a "you should SIGSEGV" error, because the SIGSEGV case was generally handled by the caller - usually the architecture fault handler. That results in lots of duplication - all the architecture fault handlers end up doing very similar "look up vma, check permissions, do retries etc" - but it generally works. However, there are cases where the VM actually wants to SIGSEGV, and applications _expect_ SIGSEGV. In particular, when accessing the stack guard page, libsigsegv expects a SIGSEGV. And it usually got one, because the stack growth is handled by that duplicated architecture fault handler. However, when the generic VM layer started propagating the error return from the stack expansion in commit fee7e49d4514 ("mm: propagate error from stack expansion even for guard page"), that now exposed the existing VM_FAULT_SIGBUS result to user space. And user space really expected SIGSEGV, not SIGBUS. 
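A hedged userspace sketch of that expectation (raise() stands in for a real guard-page hit, which would also need a sigaltstack to be catchable):

#include <signal.h>
#include <unistd.h>

static void handler(int sig)
{
	/* libsigsegv-style users assume sig == SIGSEGV here. */
	_exit(sig == SIGSEGV ? 0 : 1);
}

int main(void)
{
	signal(SIGSEGV, handler);
	signal(SIGBUS, handler);
	raise(SIGSEGV);	/* stand-in for faulting on the stack guard page */
	return 1;
}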
To fix that case, we need to add a VM_FAULT_SIGSEGV, and teach all those duplicate architecture fault handlers about it. They all already have the code to handle SIGSEGV, so it's about just tying that new return value to the existing code, but it's all a bit annoying. This is the mindless minimal patch to do this. A more extensive patch would be to try to gather up the mostly shared fault handling logic into one generic helper routine, and long-term we really should do that cleanup. Just from this patch, you can generally see that most architectures just copied (directly or indirectly) the old x86 way of doing things, but in the meantime that original x86 model has been improved to hold the VM semaphore for shorter times etc and to handle VM_FAULT_RETRY and other "newer" things, so it would be a good idea to bring all those improvements to the generic case and teach other architectures about them too. Reported-and-tested-by: Takashi Iwai Tested-by: Jan Engelhardt Acked-by: Heiko Carstens # "s390 still compiles and boots" Cc: linux-arch@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- arch/alpha/mm/fault.c | 2 ++ arch/arc/mm/fault.c | 2 ++ arch/avr32/mm/fault.c | 2 ++ arch/cris/mm/fault.c | 2 ++ arch/frv/mm/fault.c | 2 ++ arch/ia64/mm/fault.c | 2 ++ arch/m32r/mm/fault.c | 2 ++ arch/m68k/mm/fault.c | 2 ++ arch/metag/mm/fault.c | 2 ++ arch/microblaze/mm/fault.c | 2 ++ arch/mips/mm/fault.c | 2 ++ arch/mn10300/mm/fault.c | 2 ++ arch/nios2/mm/fault.c | 2 ++ arch/openrisc/mm/fault.c | 2 ++ arch/parisc/mm/fault.c | 2 ++ arch/powerpc/mm/copro_fault.c | 2 +- arch/powerpc/mm/fault.c | 2 ++ arch/s390/mm/fault.c | 6 ++++++ arch/score/mm/fault.c | 2 ++ arch/sh/mm/fault.c | 2 ++ arch/sparc/mm/fault_32.c | 2 ++ arch/sparc/mm/fault_64.c | 2 ++ arch/tile/mm/fault.c | 2 ++ arch/um/kernel/trap.c | 2 ++ arch/x86/mm/fault.c | 2 ++ arch/xtensa/mm/fault.c | 2 ++ drivers/staging/lustre/lustre/llite/vvp_io.c | 2 +- include/linux/mm.h | 6 ++++-- mm/gup.c | 4 ++-- mm/ksm.c | 2 +- 30 files changed, 63 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 98838a05ba6d..9d0ac091a52a 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c @@ -156,6 +156,8 @@ retry: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 6f7e3a68803a..0f8df3b5b1b3 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -161,6 +161,8 @@ good_area: if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index 0eca93327195..d223a8b57c1e 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c @@ -142,6 +142,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 1790f22e71a2..2686a7aa8ec8 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -176,6 +176,8 @@ retry: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus;
BUG(); diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 9a66372fc7c7..ec4917ddf678 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c @@ -168,6 +168,8 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 7225dad87094..ba5ba7accd0d 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -172,6 +172,8 @@ retry: */ if (fault & VM_FAULT_OOM) { goto out_of_memory; + } else if (fault & VM_FAULT_SIGSEGV) { + goto bad_area; } else if (fault & VM_FAULT_SIGBUS) { signal = SIGBUS; goto bad_area; diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index e9c6a8014bd6..e3d4d4890104 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c @@ -200,6 +200,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 2bd7487440c4..b2f04aee46ec 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -145,6 +145,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto map_err; else if (fault & VM_FAULT_SIGBUS) goto bus_err; BUG(); diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c index 332680e5ebf2..2de5dc695a87 100644 --- a/arch/metag/mm/fault.c +++ b/arch/metag/mm/fault.c @@ -141,6 +141,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index fa4cf52aa7a6..d46a5ebb7570 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c @@ -224,6 +224,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index becc42bb1849..70ab5d664332 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c @@ -158,6 +158,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index 3516cbdf1ee9..0c2cc5d39c8e 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -262,6 +262,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index 15a0bb5fc06d..34429d5a0ccd 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c @@ -135,6 +135,8 @@ survive: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/openrisc/mm/fault.c 
b/arch/openrisc/mm/fault.c index 0703acf7d327..230ac20ae794 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c @@ -171,6 +171,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 3ca9c1131cfe..e5120e653240 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -256,6 +256,8 @@ good_area: */ if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto bad_area; BUG(); diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 5a236f082c78..1b5305d4bdab 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -76,7 +76,7 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, if (*flt & VM_FAULT_OOM) { ret = -ENOMEM; goto out_unlock; - } else if (*flt & VM_FAULT_SIGBUS) { + } else if (*flt & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { ret = -EFAULT; goto out_unlock; } diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index eb79907f34fa..6154b0a2b063 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -437,6 +437,8 @@ good_area: */ fault = handle_mm_fault(mm, vma, address, flags); if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { + if (fault & VM_FAULT_SIGSEGV) + goto bad_area; rc = mm_fault_error(regs, address, fault); if (rc >= MM_FAULT_RETURN) goto bail; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 811937bb90be..9065d5aa3932 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -374,6 +374,12 @@ static noinline void do_fault_error(struct pt_regs *regs, int fault) do_no_context(regs); else pagefault_out_of_memory(); + } else if (fault & VM_FAULT_SIGSEGV) { + /* Kernel mode? Handle exceptions or die */ + if (!user_mode(regs)) + do_no_context(regs); + else + do_sigsegv(regs, SEGV_MAPERR); } else if (fault & VM_FAULT_SIGBUS) { /* Kernel mode? 
Handle exceptions or die */ if (!user_mode(regs)) diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 52238983527d..6860beb2a280 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c @@ -114,6 +114,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 541dc6101508..a58fec9b55e0 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -353,6 +353,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, } else { if (fault & VM_FAULT_SIGBUS) do_sigbus(regs, error_code, address); + else if (fault & VM_FAULT_SIGSEGV) + bad_area(regs, error_code, address); else BUG(); } diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 908e8c17c902..70d817154fe8 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -249,6 +249,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index 18fcd7167095..479823249429 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c @@ -446,6 +446,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 565e25a98334..0f61a73534e6 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c @@ -442,6 +442,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 5678c3571e7c..209617302df8 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -80,6 +80,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) { goto out_of_memory; + } else if (fault & VM_FAULT_SIGSEGV) { + goto out; } else if (fault & VM_FAULT_SIGBUS) { err = -EACCES; goto out; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 38dcec403b46..e3ff27a5b634 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -898,6 +898,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); + else if (fault & VM_FAULT_SIGSEGV) + bad_area_nosemaphore(regs, error_code, address); else BUG(); } diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index b57c4f91f487..9e3571a6535c 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -117,6 +117,8 @@ good_area: if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; else if (fault & VM_FAULT_SIGBUS) goto do_sigbus; BUG(); diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index 930f6010203e..65d610abe06e 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -632,7 +632,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) return 0; 
} - if (cfio->fault.ft_flags & VM_FAULT_SIGBUS) { + if (cfio->fault.ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address); return -EFAULT; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 80fc92a49649..dd5ea3016fc4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1070,6 +1070,7 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ #define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */ #define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */ +#define VM_FAULT_SIGSEGV 0x0040 #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ @@ -1078,8 +1079,9 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ -#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ - VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE) +#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ + VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ + VM_FAULT_FALLBACK) /* Encode hstate index for a hwpoisoned large page */ #define VM_FAULT_SET_HINDEX(x) ((x) << 12) diff --git a/mm/gup.c b/mm/gup.c index a900759cc807..8dd50ce6326f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -296,7 +296,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, return -ENOMEM; if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT; - if (ret & VM_FAULT_SIGBUS) + if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) return -EFAULT; BUG(); } @@ -571,7 +571,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return -ENOMEM; if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) return -EHWPOISON; - if (ret & VM_FAULT_SIGBUS) + if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) return -EFAULT; BUG(); } diff --git a/mm/ksm.c b/mm/ksm.c index d247efab5073..15647fb0394f 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -376,7 +376,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) else ret = VM_FAULT_WRITE; put_page(page); - } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* * We must loop because handle_mm_fault() may back out if there's * any difficulty e.g. if pte accessed bit gets updated concurrently. -- cgit v1.2.3 From 7866a621043fbaca3d7389e9b9f69dd1a2e5a855 Mon Sep 17 00:00:00 2001 From: Salam Noureddine Date: Tue, 27 Jan 2015 11:35:48 -0800 Subject: dev: add per net_device packet type chains When many pf_packet listeners are created on a lot of interfaces the current implementation using global packet type lists scales poorly. This patch adds per net_device packet type lists to fix this problem. The patch was originally written by Eric Biederman for linux-2.6.29. Tested on linux-3.16. Signed-off-by: "Eric W. Biederman" Signed-off-by: Salam Noureddine Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 2 + net/core/dev.c | 132 +++++++++++++++++++++++++++++----------------- 2 files changed, 86 insertions(+), 48 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 642d426a668f..3d37c6eb1732 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1514,6 +1514,8 @@ struct net_device { struct list_head napi_list; struct list_head unreg_list; struct list_head close_list; + struct list_head ptype_all; + struct list_head ptype_specific; struct { struct list_head upper; diff --git a/net/core/dev.c b/net/core/dev.c index 7f028d441e98..1d564d68e31a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -371,9 +371,10 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { if (pt->type == htons(ETH_P_ALL)) - return &ptype_all; + return pt->dev ? &pt->dev->ptype_all : &ptype_all; else - return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; + return pt->dev ? &pt->dev->ptype_specific : + &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } /** @@ -1734,6 +1735,23 @@ static inline int deliver_skb(struct sk_buff *skb, return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } +static inline void deliver_ptype_list_skb(struct sk_buff *skb, + struct packet_type **pt, + struct net_device *dev, __be16 type, + struct list_head *ptype_list) +{ + struct packet_type *ptype, *pt_prev = *pt; + + list_for_each_entry_rcu(ptype, ptype_list, list) { + if (ptype->type != type) + continue; + if (pt_prev) + deliver_skb(skb, pt_prev, dev); + pt_prev = ptype; + } + *pt = pt_prev; +} + static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) { if (!ptype->af_packet_priv || !skb->sk) @@ -1757,45 +1775,54 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) struct packet_type *ptype; struct sk_buff *skb2 = NULL; struct packet_type *pt_prev = NULL; + struct list_head *ptype_list = &ptype_all; rcu_read_lock(); - list_for_each_entry_rcu(ptype, &ptype_all, list) { +again: + list_for_each_entry_rcu(ptype, ptype_list, list) { /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ - if ((ptype->dev == dev || !ptype->dev) && - (!skb_loop_sk(ptype, skb))) { - if (pt_prev) { - deliver_skb(skb2, pt_prev, skb->dev); - pt_prev = ptype; - continue; - } + if (skb_loop_sk(ptype, skb)) + continue; - skb2 = skb_clone(skb, GFP_ATOMIC); - if (!skb2) - break; + if (pt_prev) { + deliver_skb(skb2, pt_prev, skb->dev); + pt_prev = ptype; + continue; + } - net_timestamp_set(skb2); + /* need to clone skb, done only once */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + goto out_unlock; - /* skb->nh should be correctly - set by sender, so that the second statement is - just protection against buggy protocols. - */ - skb_reset_mac_header(skb2); - - if (skb_network_header(skb2) < skb2->data || - skb_network_header(skb2) > skb_tail_pointer(skb2)) { - net_crit_ratelimited("protocol %04x is buggy, dev %s\n", - ntohs(skb2->protocol), - dev->name); - skb_reset_network_header(skb2); - } + net_timestamp_set(skb2); - skb2->transport_header = skb2->network_header; - skb2->pkt_type = PACKET_OUTGOING; - pt_prev = ptype; + /* skb->nh should be correctly + * set by sender, so that the second statement is + * just protection against buggy protocols. 
+ */ + skb_reset_mac_header(skb2); + + if (skb_network_header(skb2) < skb2->data || + skb_network_header(skb2) > skb_tail_pointer(skb2)) { + net_crit_ratelimited("protocol %04x is buggy, dev %s\n", + ntohs(skb2->protocol), + dev->name); + skb_reset_network_header(skb2); } + + skb2->transport_header = skb2->network_header; + skb2->pkt_type = PACKET_OUTGOING; + pt_prev = ptype; + } + + if (ptype_list == &ptype_all) { + ptype_list = &dev->ptype_all; + goto again; } +out_unlock: if (pt_prev) pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); rcu_read_unlock(); @@ -2617,7 +2644,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, unsigned int len; int rc; - if (!list_empty(&ptype_all)) + if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) dev_queue_xmit_nit(skb, dev); len = skb->len; @@ -3615,7 +3642,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct net_device *orig_dev; - struct net_device *null_or_dev; bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; @@ -3658,11 +3684,15 @@ another_round: goto skip_taps; list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (!ptype->dev || ptype->dev == skb->dev) { - if (pt_prev) - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = ptype; - } + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; + } + + list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = ptype; } skip_taps: @@ -3718,19 +3748,21 @@ ncls: skb->vlan_tci = 0; } + type = skb->protocol; + /* deliver only exact match when indicated */ - null_or_dev = deliver_exact ? skb->dev : NULL; + if (likely(!deliver_exact)) { + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &ptype_base[ntohs(type) & + PTYPE_HASH_MASK]); + } - type = skb->protocol; - list_for_each_entry_rcu(ptype, - &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { - if (ptype->type == type && - (ptype->dev == null_or_dev || ptype->dev == skb->dev || - ptype->dev == orig_dev)) { - if (pt_prev) - ret = deliver_skb(skb, pt_prev, orig_dev); - pt_prev = ptype; - } + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &orig_dev->ptype_specific); + + if (unlikely(skb->dev != orig_dev)) { + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &skb->dev->ptype_specific); } if (pt_prev) { @@ -6579,6 +6611,8 @@ void netdev_run_todo(void) /* paranoia */ BUG_ON(netdev_refcnt_read(dev)); + BUG_ON(!list_empty(&dev->ptype_all)); + BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); WARN_ON(rcu_access_pointer(dev->ip6_ptr)); WARN_ON(dev->dn_ptr); @@ -6761,6 +6795,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->adj_list.lower); INIT_LIST_HEAD(&dev->all_adj_list.upper); INIT_LIST_HEAD(&dev->all_adj_list.lower); + INIT_LIST_HEAD(&dev->ptype_all); + INIT_LIST_HEAD(&dev->ptype_specific); dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); -- cgit v1.2.3 From f262f28c147051e7aa6daaf4fb5996833ffadff4 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Mon, 26 Jan 2015 13:16:27 +0900 Subject: PM / devfreq: event: Add devfreq_event class This patch adds a new class in devfreq, devfreq_event, which provides raw data (e.g., memory bus utilization, GPU utilization) for devfreq governors. 
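Concretely, the intended data flow can be sketched as follows (illustrative only: the helper name and the frequency scaling are invented here, while the devfreq_event_* calls and the load/total counters are the ones this patch introduces):

	/* hypothetical governor-side consumer of a devfreq-event device */
	static unsigned long choose_target_freq(struct devfreq_event_dev *edev,
						unsigned long max_freq)
	{
		struct devfreq_event_data edata;

		if (devfreq_event_get_event(edev, &edata) < 0 ||
		    !edata.total_count)
			return max_freq;	/* be conservative on error */

		/* utilization = load_count / total_count, scaled to a
		 * frequency (overflow ignored for the sketch) */
		return max_freq * edata.load_count / edata.total_count;
	}

In those terms, the two roles are: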
- devfreq_event device : Provides raw data for a governor of a devfreq device - devfreq device : Monitors device state and changes frequency/voltage of the device using the raw data from its devfreq_event device. A devfreq device determines performance states (normally the frequency and voltage values) based on the results of its designated devfreq governor: e.g., ondemand, performance, powersave. In order to give such results required by a devfreq device, the devfreq governor requires data that indicates the performance requirement given to the devfreq device. The conventional (previous) implementation of the devfreq subsystem required a devfreq device driver to implement its own mechanism to acquire the performance requirement for its governor. However, there were issues with such requirements: 1. Although the performance requirement of such devices is usually acquired from common devices (PMU/PPMU), we do not have any abstract structure to represent them properly. 2. Such performance requirement devices (PMU/PPMU) are actual hardware pieces that may be represented by Device Tree directly, while the devfreq device itself is a virtual entity that is not considered suitable for Device Tree representation according to Device Tree folks. In order to address such issues, a devfreq_event device (introduced by this patch) provides a template for device drivers representing a performance monitoring unit, which supplies the basic or raw data for the performance requirement, which, in turn, is required by devfreq governors. The following description explains the features of the two kinds of devfreq classes: - devfreq class (existing) : A devfreq consumer device uses raw data from its devfreq_event device to determine the proper current system state and changes voltage/frequency dynamically using various governors. - devfreq_event class (new) : Provides measured raw data to a devfreq device for its governor. Cc: MyungJoo Ham Cc: Kyungmin Park Signed-off-by: Chanwoo Choi [Commit message rewritten & conflict resolved by MyungJoo] Signed-off-by: MyungJoo Ham --- drivers/devfreq/Kconfig | 2 + drivers/devfreq/Makefile | 3 + drivers/devfreq/devfreq-event.c | 494 ++++++++++++++++++++++++++++++++++++ drivers/devfreq/event/Kconfig | 25 ++ drivers/devfreq/event/Makefile | 2 + drivers/devfreq/event/exynos-ppmu.c | 374 +++++++++++++++++++++++++++ drivers/devfreq/event/exynos-ppmu.h | 93 +++++++ include/linux/devfreq-event.h | 196 ++++++++++++++ 8 files changed, 1189 insertions(+) create mode 100644 drivers/devfreq/devfreq-event.c create mode 100644 drivers/devfreq/event/Kconfig create mode 100644 drivers/devfreq/event/Makefile create mode 100644 drivers/devfreq/event/exynos-ppmu.c create mode 100644 drivers/devfreq/event/exynos-ppmu.h create mode 100644 include/linux/devfreq-event.h (limited to 'include/linux') diff --git a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig index 4aab799712bb..51dccb3620ea 100644 --- a/drivers/devfreq/Kconfig +++ b/drivers/devfreq/Kconfig @@ -97,4 +97,6 @@ config ARM_TEGRA_DEVFREQ It reads ACTMON counters of memory controllers and adjusts the operating frequencies and voltages with OPP support.
+source "drivers/devfreq/event/Kconfig" + endif # PM_DEVFREQ diff --git a/drivers/devfreq/Makefile b/drivers/devfreq/Makefile index 0ea991f3a483..9256c35bcce9 100644 --- a/drivers/devfreq/Makefile +++ b/drivers/devfreq/Makefile @@ -8,3 +8,6 @@ obj-$(CONFIG_DEVFREQ_GOV_USERSPACE) += governor_userspace.o obj-$(CONFIG_ARM_EXYNOS4_BUS_DEVFREQ) += exynos/ obj-$(CONFIG_ARM_EXYNOS5_BUS_DEVFREQ) += exynos/ obj-$(CONFIG_ARM_TEGRA_DEVFREQ) += tegra-devfreq.o + +# DEVFREQ Event Drivers +obj-$(CONFIG_PM_DEVFREQ_EVENT) += event/ diff --git a/drivers/devfreq/devfreq-event.c b/drivers/devfreq/devfreq-event.c new file mode 100644 index 000000000000..f304a0289eda --- /dev/null +++ b/drivers/devfreq/devfreq-event.c @@ -0,0 +1,494 @@ +/* + * devfreq-event: a framework to provide raw data and events of devfreq devices + * + * Copyright (C) 2015 Samsung Electronics + * Author: Chanwoo Choi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This driver is based on drivers/devfreq/devfreq.c. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct class *devfreq_event_class; + +/* The list of all devfreq event list */ +static LIST_HEAD(devfreq_event_list); +static DEFINE_MUTEX(devfreq_event_list_lock); + +#define to_devfreq_event(DEV) container_of(DEV, struct devfreq_event_dev, dev) + +/** + * devfreq_event_enable_edev() - Enable the devfreq-event dev and increase + * the enable_count of devfreq-event dev. + * @edev : the devfreq-event device + * + * Note that this function increase the enable_count and enable the + * devfreq-event device. The devfreq-event device should be enabled before + * using it by devfreq device. + */ +int devfreq_event_enable_edev(struct devfreq_event_dev *edev) +{ + int ret = 0; + + if (!edev || !edev->desc) + return -EINVAL; + + mutex_lock(&edev->lock); + if (edev->desc->ops && edev->desc->ops->enable + && edev->enable_count == 0) { + ret = edev->desc->ops->enable(edev); + if (ret < 0) + goto err; + } + edev->enable_count++; +err: + mutex_unlock(&edev->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(devfreq_event_enable_edev); + +/** + * devfreq_event_disable_edev() - Disable the devfreq-event dev and decrease + * the enable_count of the devfreq-event dev. + * @edev : the devfreq-event device + * + * Note that this function decrease the enable_count and disable the + * devfreq-event device. After the devfreq-event device is disabled, + * devfreq device can't use the devfreq-event device for get/set/reset + * operations. + */ +int devfreq_event_disable_edev(struct devfreq_event_dev *edev) +{ + int ret = 0; + + if (!edev || !edev->desc) + return -EINVAL; + + mutex_lock(&edev->lock); + if (edev->enable_count <= 0) { + dev_warn(&edev->dev, "unbalanced enable_count\n"); + ret = -EIO; + goto err; + } + + if (edev->desc->ops && edev->desc->ops->disable + && edev->enable_count == 1) { + ret = edev->desc->ops->disable(edev); + if (ret < 0) + goto err; + } + edev->enable_count--; +err: + mutex_unlock(&edev->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(devfreq_event_disable_edev); + +/** + * devfreq_event_is_enabled() - Check whether devfreq-event dev is enabled or + * not. + * @edev : the devfreq-event device + * + * Note that this function check whether devfreq-event dev is enabled or not. + * If return true, the devfreq-event dev is enabeld. If return false, the + * devfreq-event dev is disabled. 
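 *
 * A usage sketch (editorial, not part of this file): the refcount lets
 * several users share one event device, so each caller simply pairs the
 * two calls; do_monitoring() below is a placeholder:
 *
 *	ret = devfreq_event_enable_edev(edev);
 *	if (ret < 0)
 *		return ret;
 *	do_monitoring(edev);
 *	devfreq_event_disable_edev(edev);
 *
 * Only the first enable and the last disable reach ops->enable() and
 * ops->disable(); intermediate callers just move the count.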
+ */ +bool devfreq_event_is_enabled(struct devfreq_event_dev *edev) +{ + bool enabled = false; + + if (!edev || !edev->desc) + return enabled; + + mutex_lock(&edev->lock); + + if (edev->enable_count > 0) + enabled = true; + + mutex_unlock(&edev->lock); + + return enabled; +} +EXPORT_SYMBOL_GPL(devfreq_event_is_enabled); + +/** + * devfreq_event_set_event() - Set event to devfreq-event dev to start. + * @edev : the devfreq-event device + * + * Note that this function set the event to the devfreq-event device to start + * for getting the event data which could be various event type. + */ +int devfreq_event_set_event(struct devfreq_event_dev *edev) +{ + int ret; + + if (!edev || !edev->desc) + return -EINVAL; + + if (!edev->desc->ops || !edev->desc->ops->set_event) + return -EINVAL; + + if (!devfreq_event_is_enabled(edev)) + return -EPERM; + + mutex_lock(&edev->lock); + ret = edev->desc->ops->set_event(edev); + mutex_unlock(&edev->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(devfreq_event_set_event); + +/** + * devfreq_event_get_event() - Get {load|total}_count from devfreq-event dev. + * @edev : the devfreq-event device + * @edata : the calculated data of devfreq-event device + * + * Note that this function get the calculated event data from devfreq-event dev + * after stoping the progress of whole sequence of devfreq-event dev. + */ +int devfreq_event_get_event(struct devfreq_event_dev *edev, + struct devfreq_event_data *edata) +{ + int ret; + + if (!edev || !edev->desc) + return -EINVAL; + + if (!edev->desc->ops || !edev->desc->ops->get_event) + return -EINVAL; + + if (!devfreq_event_is_enabled(edev)) + return -EINVAL; + + edata->total_count = edata->load_count = 0; + + mutex_lock(&edev->lock); + ret = edev->desc->ops->get_event(edev, edata); + if (ret < 0) + edata->total_count = edata->load_count = 0; + mutex_unlock(&edev->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(devfreq_event_get_event); + +/** + * devfreq_event_reset_event() - Reset all opeations of devfreq-event dev. + * @edev : the devfreq-event device + * + * Note that this function stop all operations of devfreq-event dev and reset + * the current event data to make the devfreq-event device into initial state. + */ +int devfreq_event_reset_event(struct devfreq_event_dev *edev) +{ + int ret = 0; + + if (!edev || !edev->desc) + return -EINVAL; + + if (!devfreq_event_is_enabled(edev)) + return -EPERM; + + mutex_lock(&edev->lock); + if (edev->desc->ops && edev->desc->ops->reset) + ret = edev->desc->ops->reset(edev); + mutex_unlock(&edev->lock); + + return ret; +} +EXPORT_SYMBOL_GPL(devfreq_event_reset_event); + +/** + * devfreq_event_get_edev_by_phandle() - Get the devfreq-event dev from + * devicetree. + * @dev : the pointer to the given device + * @index : the index into list of devfreq-event device + * + * Note that this function return the pointer of devfreq-event device. 
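 *
 * A sketch of the expected wiring (editorial; the node and phandle names
 * are made up, while "devfreq-events" is the property this function
 * actually parses):
 *
 *	bus@0 {
 *		devfreq-events = <&ppmu_dmc0>;
 *	};
 *
 *	edev = devfreq_event_get_edev_by_phandle(dev, 0);
 *	if (IS_ERR(edev))
 *		return PTR_ERR(edev);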
+ */ +struct devfreq_event_dev *devfreq_event_get_edev_by_phandle(struct device *dev, + int index) +{ + struct device_node *node; + struct devfreq_event_dev *edev; + + if (!dev->of_node) { + dev_err(dev, "device does not have a device node entry\n"); + return ERR_PTR(-EINVAL); + } + + node = of_parse_phandle(dev->of_node, "devfreq-events", index); + if (!node) { + dev_err(dev, "failed to get phandle in %s node\n", + dev->of_node->full_name); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&devfreq_event_list_lock); + list_for_each_entry(edev, &devfreq_event_list, node) { + if (!strcmp(edev->desc->name, node->name)) + goto out; + } + edev = NULL; +out: + mutex_unlock(&devfreq_event_list_lock); + + if (!edev) { + dev_err(dev, "unable to get devfreq-event device : %s\n", + node->name); + of_node_put(node); + return ERR_PTR(-ENODEV); + } + + of_node_put(node); + + return edev; +} +EXPORT_SYMBOL_GPL(devfreq_event_get_edev_by_phandle); + +/** + * devfreq_event_get_edev_count() - Get the count of devfreq-event dev + * @dev : the pointer to the given device + * + * Note that this function return the count of devfreq-event devices. + */ +int devfreq_event_get_edev_count(struct device *dev) +{ + int count; + + if (!dev->of_node) { + dev_err(dev, "device does not have a device node entry\n"); + return -EINVAL; + } + + count = of_property_count_elems_of_size(dev->of_node, "devfreq-events", + sizeof(u32)); + if (count < 0 ) { + dev_err(dev, + "failed to get the count of devfreq-event in %s node\n", + dev->of_node->full_name); + return count; + } + + return count; +} +EXPORT_SYMBOL_GPL(devfreq_event_get_edev_count); + +static void devfreq_event_release_edev(struct device *dev) +{ + struct devfreq_event_dev *edev = to_devfreq_event(dev); + + kfree(edev); +} + +/** + * devfreq_event_add_edev() - Add new devfreq-event device. + * @dev : the device owning the devfreq-event device being created + * @desc : the devfreq-event device's decriptor which include essential + * data for devfreq-event device. + * + * Note that this function add new devfreq-event device to devfreq-event class + * list and register the device of the devfreq-event device. + */ +struct devfreq_event_dev *devfreq_event_add_edev(struct device *dev, + struct devfreq_event_desc *desc) +{ + struct devfreq_event_dev *edev; + static atomic_t event_no = ATOMIC_INIT(0); + int ret; + + if (!dev || !desc) + return ERR_PTR(-EINVAL); + + if (!desc->name || !desc->ops) + return ERR_PTR(-EINVAL); + + if (!desc->ops->set_event || !desc->ops->get_event) + return ERR_PTR(-EINVAL); + + edev = kzalloc(sizeof(struct devfreq_event_dev), GFP_KERNEL); + if (!edev) + return ERR_PTR(-ENOMEM); + + mutex_init(&edev->lock); + edev->desc = desc; + edev->enable_count = 0; + edev->dev.parent = dev; + edev->dev.class = devfreq_event_class; + edev->dev.release = devfreq_event_release_edev; + + dev_set_name(&edev->dev, "event.%d", atomic_inc_return(&event_no) - 1); + ret = device_register(&edev->dev); + if (ret < 0) { + put_device(&edev->dev); + return ERR_PTR(ret); + } + dev_set_drvdata(&edev->dev, edev); + + INIT_LIST_HEAD(&edev->node); + + mutex_lock(&devfreq_event_list_lock); + list_add(&edev->node, &devfreq_event_list); + mutex_unlock(&devfreq_event_list_lock); + + return edev; +} +EXPORT_SYMBOL_GPL(devfreq_event_add_edev); + +/** + * devfreq_event_remove_edev() - Remove the devfreq-event device registered. + * @dev : the devfreq-event device + * + * Note that this function remove the registered devfreq-event device. 
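 *
 * Editorial sketch of the pairing in a provider driver (function names
 * hypothetical; the devm_* wrappers further below make the remove step
 * automatic):
 *
 *	probe:	edev = devfreq_event_add_edev(dev, desc);
 *		if (IS_ERR(edev))
 *			return PTR_ERR(edev);
 *	remove:	devfreq_event_remove_edev(edev);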
+ */ +int devfreq_event_remove_edev(struct devfreq_event_dev *edev) +{ + if (!edev) + return -EINVAL; + + WARN_ON(edev->enable_count); + + mutex_lock(&devfreq_event_list_lock); + list_del(&edev->node); + mutex_unlock(&devfreq_event_list_lock); + + device_unregister(&edev->dev); + + return 0; +} +EXPORT_SYMBOL_GPL(devfreq_event_remove_edev); + +static int devm_devfreq_event_match(struct device *dev, void *res, void *data) +{ + struct devfreq_event_dev **r = res; + + if (WARN_ON(!r || !*r)) + return 0; + + return *r == data; +} + +static void devm_devfreq_event_release(struct device *dev, void *res) +{ + devfreq_event_remove_edev(*(struct devfreq_event_dev **)res); +} + +/** + * devm_devfreq_event_add_edev() - Resource-managed devfreq_event_add_edev() + * @dev : the device owning the devfreq-event device being created + * @desc : the devfreq-event device's decriptor which include essential + * data for devfreq-event device. + * + * Note that this function manages automatically the memory of devfreq-event + * device using device resource management and simplify the free operation + * for memory of devfreq-event device. + */ +struct devfreq_event_dev *devm_devfreq_event_add_edev(struct device *dev, + struct devfreq_event_desc *desc) +{ + struct devfreq_event_dev **ptr, *edev; + + ptr = devres_alloc(devm_devfreq_event_release, sizeof(*ptr), GFP_KERNEL); + if (!ptr) + return ERR_PTR(-ENOMEM); + + edev = devfreq_event_add_edev(dev, desc); + if (IS_ERR(edev)) { + devres_free(ptr); + return ERR_PTR(-ENOMEM); + } + + *ptr = edev; + devres_add(dev, ptr); + + return edev; +} +EXPORT_SYMBOL_GPL(devm_devfreq_event_add_edev); + +/** + * devm_devfreq_event_remove_edev()- Resource-managed devfreq_event_remove_edev() + * @dev : the device owning the devfreq-event device being created + * @edev : the devfreq-event device + * + * Note that this function manages automatically the memory of devfreq-event + * device using device resource management. + */ +void devm_devfreq_event_remove_edev(struct device *dev, + struct devfreq_event_dev *edev) +{ + WARN_ON(devres_release(dev, devm_devfreq_event_release, + devm_devfreq_event_match, edev)); +} +EXPORT_SYMBOL_GPL(devm_devfreq_event_remove_edev); + +/* + * Device attributes for devfreq-event class. 
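 *
 * Editorial note on the resulting layout: with the class created as
 * "devfreq-event" and devices named "event.%d" (see
 * devfreq_event_add_edev() above), these attributes surface as e.g.
 *
 *	/sys/class/devfreq-event/event.0/name
 *	/sys/class/devfreq-event/event.0/enable_count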
+ */ +static ssize_t name_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct devfreq_event_dev *edev = to_devfreq_event(dev); + + if (!edev || !edev->desc) + return -EINVAL; + + return sprintf(buf, "%s\n", edev->desc->name); +} +static DEVICE_ATTR_RO(name); + +static ssize_t enable_count_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct devfreq_event_dev *edev = to_devfreq_event(dev); + + if (!edev || !edev->desc) + return -EINVAL; + + return sprintf(buf, "%d\n", edev->enable_count); +} +static DEVICE_ATTR_RO(enable_count); + +static struct attribute *devfreq_event_attrs[] = { + &dev_attr_name.attr, + &dev_attr_enable_count.attr, + NULL, +}; +ATTRIBUTE_GROUPS(devfreq_event); + +static int __init devfreq_event_init(void) +{ + devfreq_event_class = class_create(THIS_MODULE, "devfreq-event"); + if (IS_ERR(devfreq_event_class)) { + pr_err("%s: couldn't create class\n", __FILE__); + return PTR_ERR(devfreq_event_class); + } + + devfreq_event_class->dev_groups = devfreq_event_groups; + + return 0; +} +subsys_initcall(devfreq_event_init); + +static void __exit devfreq_event_exit(void) +{ + class_destroy(devfreq_event_class); +} +module_exit(devfreq_event_exit); + +MODULE_AUTHOR("Chanwoo Choi "); +MODULE_DESCRIPTION("DEVFREQ-Event class support"); +MODULE_LICENSE("GPL"); diff --git a/drivers/devfreq/event/Kconfig b/drivers/devfreq/event/Kconfig new file mode 100644 index 000000000000..a11720affc31 --- /dev/null +++ b/drivers/devfreq/event/Kconfig @@ -0,0 +1,25 @@ +menuconfig PM_DEVFREQ_EVENT + bool "DEVFREQ-Event device Support" + help + The devfreq-event device provide the raw data and events which + indicate the current state of devfreq-event device. The provided + data from devfreq-event device is used to monitor the state of + device and determine the suitable size of resource to reduce the + wasted resource. + + The devfreq-event device can support the various type of events + (e.g., raw data, utilization, latency, bandwidth). The events + may be used by devfreq governor and other subsystem. + +if PM_DEVFREQ_EVENT + +config DEVFREQ_EVENT_EXYNOS_PPMU + bool "EXYNOS PPMU (Platform Performance Monitoring Unit) DEVFREQ event Driver" + depends on ARCH_EXYNOS + select PM_OPP + help + This add the devfreq-event driver for Exynos SoC. It provides PPMU + (Platform Performance Monitoring Unit) counters to estimate the + utilization of each module. + +endif # PM_DEVFREQ_EVENT diff --git a/drivers/devfreq/event/Makefile b/drivers/devfreq/event/Makefile new file mode 100644 index 000000000000..be146ead79cf --- /dev/null +++ b/drivers/devfreq/event/Makefile @@ -0,0 +1,2 @@ +# Exynos DEVFREQ Event Drivers +obj-$(CONFIG_DEVFREQ_EVENT_EXYNOS_PPMU) += exynos-ppmu.o diff --git a/drivers/devfreq/event/exynos-ppmu.c b/drivers/devfreq/event/exynos-ppmu.c new file mode 100644 index 000000000000..135be0aada9d --- /dev/null +++ b/drivers/devfreq/event/exynos-ppmu.c @@ -0,0 +1,374 @@ +/* + * exynos_ppmu.c - EXYNOS PPMU (Platform Performance Monitoring Unit) support + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. + * Author : Chanwoo Choi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * This driver is based on drivers/devfreq/exynos/exynos_ppmu.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "exynos-ppmu.h" + +struct exynos_ppmu_data { + void __iomem *base; + struct clk *clk; +}; + +struct exynos_ppmu { + struct devfreq_event_dev **edev; + struct devfreq_event_desc *desc; + unsigned int num_events; + + struct device *dev; + struct mutex lock; + + struct exynos_ppmu_data ppmu; +}; + +#define PPMU_EVENT(name) \ + { "ppmu-event0-"#name, PPMU_PMNCNT0 }, \ + { "ppmu-event1-"#name, PPMU_PMNCNT1 }, \ + { "ppmu-event2-"#name, PPMU_PMNCNT2 }, \ + { "ppmu-event3-"#name, PPMU_PMNCNT3 } + +struct __exynos_ppmu_events { + char *name; + int id; +} ppmu_events[] = { + /* For Exynos3250, Exynos4 and Exynos5260 */ + PPMU_EVENT(g3d), + PPMU_EVENT(fsys), + + /* For Exynos4 SoCs and Exynos3250 */ + PPMU_EVENT(dmc0), + PPMU_EVENT(dmc1), + PPMU_EVENT(cpu), + PPMU_EVENT(rightbus), + PPMU_EVENT(leftbus), + PPMU_EVENT(lcd0), + PPMU_EVENT(camif), + + /* Only for Exynos3250 and Exynos5260 */ + PPMU_EVENT(mfc), + + /* Only for Exynos4 SoCs */ + PPMU_EVENT(mfc-left), + PPMU_EVENT(mfc-right), + + /* Only for Exynos5260 SoCs */ + PPMU_EVENT(drex0-s0), + PPMU_EVENT(drex0-s1), + PPMU_EVENT(drex1-s0), + PPMU_EVENT(drex1-s1), + PPMU_EVENT(eagle), + PPMU_EVENT(kfc), + PPMU_EVENT(isp), + PPMU_EVENT(fimc), + PPMU_EVENT(gscl), + PPMU_EVENT(mscl), + PPMU_EVENT(fimd0x), + PPMU_EVENT(fimd1x), + { /* sentinel */ }, +}; + +static int exynos_ppmu_find_ppmu_id(struct devfreq_event_dev *edev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ppmu_events); i++) + if (!strcmp(edev->desc->name, ppmu_events[i].name)) + return ppmu_events[i].id; + + return -EINVAL; +} + +static int exynos_ppmu_disable(struct devfreq_event_dev *edev) +{ + struct exynos_ppmu *info = devfreq_event_get_drvdata(edev); + u32 pmnc; + + /* Disable all counters */ + __raw_writel(PPMU_CCNT_MASK | + PPMU_PMCNT0_MASK | + PPMU_PMCNT1_MASK | + PPMU_PMCNT2_MASK | + PPMU_PMCNT3_MASK, + info->ppmu.base + PPMU_CNTENC); + + /* Disable PPMU */ + pmnc = __raw_readl(info->ppmu.base + PPMU_PMNC); + pmnc &= ~PPMU_PMNC_ENABLE_MASK; + __raw_writel(pmnc, info->ppmu.base + PPMU_PMNC); + + return 0; +} + +static int exynos_ppmu_set_event(struct devfreq_event_dev *edev) +{ + struct exynos_ppmu *info = devfreq_event_get_drvdata(edev); + int id = exynos_ppmu_find_ppmu_id(edev); + u32 pmnc, cntens; + + if (id < 0) + return id; + + /* Enable specific counter */ + cntens = __raw_readl(info->ppmu.base + PPMU_CNTENS); + cntens |= (PPMU_CCNT_MASK | (PPMU_ENABLE << id)); + __raw_writel(cntens, info->ppmu.base + PPMU_CNTENS); + + /* Set the event of Read/Write data count */ + __raw_writel(PPMU_RO_DATA_CNT | PPMU_WO_DATA_CNT, + info->ppmu.base + PPMU_BEVTxSEL(id)); + + /* Reset cycle counter/performance counter and enable PPMU */ + pmnc = __raw_readl(info->ppmu.base + PPMU_PMNC); + pmnc &= ~(PPMU_PMNC_ENABLE_MASK + | PPMU_PMNC_COUNTER_RESET_MASK + | PPMU_PMNC_CC_RESET_MASK); + pmnc |= (PPMU_ENABLE << PPMU_PMNC_ENABLE_SHIFT); + pmnc |= (PPMU_ENABLE << PPMU_PMNC_COUNTER_RESET_SHIFT); + pmnc |= (PPMU_ENABLE << PPMU_PMNC_CC_RESET_SHIFT); + __raw_writel(pmnc, info->ppmu.base + PPMU_PMNC); + + return 0; +} + +static int exynos_ppmu_get_event(struct devfreq_event_dev *edev, + struct devfreq_event_data *edata) +{ + struct exynos_ppmu *info = devfreq_event_get_drvdata(edev); + int id = exynos_ppmu_find_ppmu_id(edev); + u32 pmnc, cntenc; + + if (id < 0) + return -EINVAL; + + /* Disable PPMU */ + pmnc = 
__raw_readl(info->ppmu.base + PPMU_PMNC); + pmnc &= ~PPMU_PMNC_ENABLE_MASK; + __raw_writel(pmnc, info->ppmu.base + PPMU_PMNC); + + /* Read cycle count */ + edata->total_count = __raw_readl(info->ppmu.base + PPMU_CCNT); + + /* Read performance count */ + switch (id) { + case PPMU_PMNCNT0: + case PPMU_PMNCNT1: + case PPMU_PMNCNT2: + edata->load_count + = __raw_readl(info->ppmu.base + PPMU_PMNCT(id)); + break; + case PPMU_PMNCNT3: + edata->load_count = + ((__raw_readl(info->ppmu.base + PPMU_PMCNT3_HIGH) << 8) + | __raw_readl(info->ppmu.base + PPMU_PMCNT3_LOW)); + break; + default: + return -EINVAL; + } + + /* Disable specific counter */ + cntenc = __raw_readl(info->ppmu.base + PPMU_CNTENC); + cntenc |= (PPMU_CCNT_MASK | (PPMU_ENABLE << id)); + __raw_writel(cntenc, info->ppmu.base + PPMU_CNTENC); + + dev_dbg(&edev->dev, "%s (event: %ld/%ld)\n", edev->desc->name, + edata->load_count, edata->total_count); + + return 0; +} + +static struct devfreq_event_ops exynos_ppmu_ops = { + .disable = exynos_ppmu_disable, + .set_event = exynos_ppmu_set_event, + .get_event = exynos_ppmu_get_event, +}; + +static int of_get_devfreq_events(struct device_node *np, + struct exynos_ppmu *info) +{ + struct devfreq_event_desc *desc; + struct device *dev = info->dev; + struct device_node *events_np, *node; + int i, j, count; + + events_np = of_get_child_by_name(np, "events"); + if (!events_np) { + dev_err(dev, + "failed to get child node of devfreq-event devices\n"); + return -EINVAL; + } + + count = of_get_child_count(events_np); + desc = devm_kzalloc(dev, sizeof(*desc) * count, GFP_KERNEL); + if (!desc) + return -ENOMEM; + info->num_events = count; + + j = 0; + for_each_child_of_node(events_np, node) { + for (i = 0; i < ARRAY_SIZE(ppmu_events); i++) { + if (!ppmu_events[i].name) + continue; + + if (!of_node_cmp(node->name, ppmu_events[i].name)) + break; + } + + if (i == ARRAY_SIZE(ppmu_events)) { + dev_warn(dev, + "don't know how to configure events : %s\n", + node->name); + continue; + } + + desc[j].ops = &exynos_ppmu_ops; + desc[j].driver_data = info; + + of_property_read_string(node, "event-name", &desc[j].name); + + j++; + + of_node_put(node); + } + info->desc = desc; + + of_node_put(events_np); + + return 0; +} + +static int exynos_ppmu_parse_dt(struct exynos_ppmu *info) +{ + struct device *dev = info->dev; + struct device_node *np = dev->of_node; + int ret = 0; + + if (!np) { + dev_err(dev, "failed to find devicetree node\n"); + return -EINVAL; + } + + /* Maps the memory mapped IO to control PPMU register */ + info->ppmu.base = of_iomap(np, 0); + if (IS_ERR_OR_NULL(info->ppmu.base)) { + dev_err(dev, "failed to map memory region\n"); + return -ENOMEM; + } + + info->ppmu.clk = devm_clk_get(dev, "ppmu"); + if (IS_ERR(info->ppmu.clk)) { + info->ppmu.clk = NULL; + dev_warn(dev, "cannot get PPMU clock\n"); + } + + ret = of_get_devfreq_events(np, info); + if (ret < 0) { + dev_err(dev, "failed to parse exynos ppmu dt node\n"); + goto err; + } + + return 0; + +err: + iounmap(info->ppmu.base); + + return ret; +} + +static int exynos_ppmu_probe(struct platform_device *pdev) +{ + struct exynos_ppmu *info; + struct devfreq_event_dev **edev; + struct devfreq_event_desc *desc; + int i, ret = 0, size; + + info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + mutex_init(&info->lock); + info->dev = &pdev->dev; + + /* Parse dt data to get resource */ + ret = exynos_ppmu_parse_dt(info); + if (ret < 0) { + dev_err(&pdev->dev, + "failed to parse devicetree for resource\n"); + return ret; + } 
+ desc = info->desc; + + size = sizeof(struct devfreq_event_dev *) * info->num_events; + info->edev = devm_kzalloc(&pdev->dev, size, GFP_KERNEL); + if (!info->edev) { + dev_err(&pdev->dev, + "failed to allocate memory devfreq-event devices\n"); + return -ENOMEM; + } + edev = info->edev; + platform_set_drvdata(pdev, info); + + for (i = 0; i < info->num_events; i++) { + edev[i] = devm_devfreq_event_add_edev(&pdev->dev, &desc[i]); + if (IS_ERR(edev)) { + ret = PTR_ERR(edev); + dev_err(&pdev->dev, + "failed to add devfreq-event device\n"); + goto err; + } + } + + clk_prepare_enable(info->ppmu.clk); + + return 0; +err: + iounmap(info->ppmu.base); + + return ret; +} + +static int exynos_ppmu_remove(struct platform_device *pdev) +{ + struct exynos_ppmu *info = platform_get_drvdata(pdev); + + clk_disable_unprepare(info->ppmu.clk); + iounmap(info->ppmu.base); + + return 0; +} + +static struct of_device_id exynos_ppmu_id_match[] = { + { .compatible = "samsung,exynos-ppmu", }, + { /* sentinel */ }, +}; + +static struct platform_driver exynos_ppmu_driver = { + .probe = exynos_ppmu_probe, + .remove = exynos_ppmu_remove, + .driver = { + .name = "exynos-ppmu", + .of_match_table = exynos_ppmu_id_match, + }, +}; +module_platform_driver(exynos_ppmu_driver); + +MODULE_DESCRIPTION("Exynos PPMU(Platform Performance Monitoring Unit) driver"); +MODULE_AUTHOR("Chanwoo Choi "); +MODULE_LICENSE("GPL"); diff --git a/drivers/devfreq/event/exynos-ppmu.h b/drivers/devfreq/event/exynos-ppmu.h new file mode 100644 index 000000000000..4e831d48c138 --- /dev/null +++ b/drivers/devfreq/event/exynos-ppmu.h @@ -0,0 +1,93 @@ +/* + * exynos_ppmu.h - EXYNOS PPMU header file + * + * Copyright (c) 2015 Samsung Electronics Co., Ltd. + * Author : Chanwoo Choi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef __EXYNOS_PPMU_H__ +#define __EXYNOS_PPMU_H__ + +enum ppmu_state { + PPMU_DISABLE = 0, + PPMU_ENABLE, +}; + +enum ppmu_counter { + PPMU_PMNCNT0 = 0, + PPMU_PMNCNT1, + PPMU_PMNCNT2, + PPMU_PMNCNT3, + + PPMU_PMNCNT_MAX, +}; + +enum ppmu_event_type { + PPMU_RO_BUSY_CYCLE_CNT = 0x0, + PPMU_WO_BUSY_CYCLE_CNT = 0x1, + PPMU_RW_BUSY_CYCLE_CNT = 0x2, + PPMU_RO_REQUEST_CNT = 0x3, + PPMU_WO_REQUEST_CNT = 0x4, + PPMU_RO_DATA_CNT = 0x5, + PPMU_WO_DATA_CNT = 0x6, + PPMU_RO_LATENCY = 0x12, + PPMU_WO_LATENCY = 0x16, +}; + +enum ppmu_reg { + /* PPC control register */ + PPMU_PMNC = 0x00, + PPMU_CNTENS = 0x10, + PPMU_CNTENC = 0x20, + PPMU_INTENS = 0x30, + PPMU_INTENC = 0x40, + PPMU_FLAG = 0x50, + + /* Cycle Counter and Performance Event Counter Register */ + PPMU_CCNT = 0x100, + PPMU_PMCNT0 = 0x110, + PPMU_PMCNT1 = 0x120, + PPMU_PMCNT2 = 0x130, + PPMU_PMCNT3_HIGH = 0x140, + PPMU_PMCNT3_LOW = 0x150, + + /* Bus Event Generator */ + PPMU_BEVT0SEL = 0x1000, + PPMU_BEVT1SEL = 0x1100, + PPMU_BEVT2SEL = 0x1200, + PPMU_BEVT3SEL = 0x1300, + PPMU_COUNTER_RESET = 0x1810, + PPMU_READ_OVERFLOW_CNT = 0x1810, + PPMU_READ_UNDERFLOW_CNT = 0x1814, + PPMU_WRITE_OVERFLOW_CNT = 0x1850, + PPMU_WRITE_UNDERFLOW_CNT = 0x1854, + PPMU_READ_PENDING_CNT = 0x1880, + PPMU_WRITE_PENDING_CNT = 0x1884 +}; + +/* PMNC register */ +#define PPMU_PMNC_CC_RESET_SHIFT 2 +#define PPMU_PMNC_COUNTER_RESET_SHIFT 1 +#define PPMU_PMNC_ENABLE_SHIFT 0 +#define PPMU_PMNC_START_MODE_MASK BIT(16) +#define PPMU_PMNC_CC_DIVIDER_MASK BIT(3) +#define PPMU_PMNC_CC_RESET_MASK BIT(2) +#define PPMU_PMNC_COUNTER_RESET_MASK BIT(1) +#define PPMU_PMNC_ENABLE_MASK BIT(0) + +/* CNTENS/CNTENC/INTENS/INTENC/FLAG register */ +#define PPMU_CCNT_MASK BIT(31) +#define PPMU_PMCNT3_MASK BIT(3) +#define PPMU_PMCNT2_MASK BIT(2) +#define PPMU_PMCNT1_MASK BIT(1) +#define PPMU_PMCNT0_MASK BIT(0) + +/* PPMU_PMNCTx/PPMU_BETxSEL registers */ +#define PPMU_PMNCT(x) (PPMU_PMCNT0 + (0x10 * x)) +#define PPMU_BEVTxSEL(x) (PPMU_BEVT0SEL + (0x100 * x)) + +#endif /* __EXYNOS_PPMU_H__ */ diff --git a/include/linux/devfreq-event.h b/include/linux/devfreq-event.h new file mode 100644 index 000000000000..602fbbfcfeed --- /dev/null +++ b/include/linux/devfreq-event.h @@ -0,0 +1,196 @@ +/* + * devfreq-event: a framework to provide raw data and events of devfreq devices + * + * Copyright (C) 2014 Samsung Electronics + * Author: Chanwoo Choi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_DEVFREQ_EVENT_H__ +#define __LINUX_DEVFREQ_EVENT_H__ + +#include + +/** + * struct devfreq_event_dev - the devfreq-event device + * + * @node : Contain the devfreq-event device that have been registered. + * @dev : the device registered by devfreq-event class. dev.parent is + * the device using devfreq-event. + * @lock : a mutex to protect accessing devfreq-event. + * @enable_count: the number of enable function have been called. + * @desc : the description for devfreq-event device. + * + * This structure contains devfreq-event device information. + */ +struct devfreq_event_dev { + struct list_head node; + + struct device dev; + struct mutex lock; + u32 enable_count; + + const struct devfreq_event_desc *desc; +}; + +/** + * struct devfreq_event_data - the devfreq-event data + * + * @load_count : load count of devfreq-event device for the given period. + * @total_count : total count of devfreq-event device for the given period. 
+ * each count may represent a clock cycle, a time unit + * (ns/us/...), or anything the device driver wants. + * Generally, utilization is load_count / total_count. + * + * This structure contains the data of devfreq-event device for polling period. + */ +struct devfreq_event_data { + unsigned long load_count; + unsigned long total_count; +}; + +/** + * struct devfreq_event_ops - the operations of devfreq-event device + * + * @enable : Enable the devfreq-event device. + * @disable : Disable the devfreq-event device. + * @reset : Reset all setting of the devfreq-event device. + * @set_event : Set the specific event type for the devfreq-event device. + * @get_event : Get the result of the devfreq-event devie with specific + * event type. + * + * This structure contains devfreq-event device operations which can be + * implemented by devfreq-event device drivers. + */ +struct devfreq_event_ops { + /* Optional functions */ + int (*enable)(struct devfreq_event_dev *edev); + int (*disable)(struct devfreq_event_dev *edev); + int (*reset)(struct devfreq_event_dev *edev); + + /* Mandatory functions */ + int (*set_event)(struct devfreq_event_dev *edev); + int (*get_event)(struct devfreq_event_dev *edev, + struct devfreq_event_data *edata); +}; + +/** + * struct devfreq_event_desc - the descriptor of devfreq-event device + * + * @name : the name of devfreq-event device. + * @driver_data : the private data for devfreq-event driver. + * @ops : the operation to control devfreq-event device. + * + * Each devfreq-event device is described with a this structure. + * This structure contains the various data for devfreq-event device. + */ +struct devfreq_event_desc { + const char *name; + void *driver_data; + + struct devfreq_event_ops *ops; +}; + +#if defined(CONFIG_PM_DEVFREQ_EVENT) +extern int devfreq_event_enable_edev(struct devfreq_event_dev *edev); +extern int devfreq_event_disable_edev(struct devfreq_event_dev *edev); +extern bool devfreq_event_is_enabled(struct devfreq_event_dev *edev); +extern int devfreq_event_set_event(struct devfreq_event_dev *edev); +extern int devfreq_event_get_event(struct devfreq_event_dev *edev, + struct devfreq_event_data *edata); +extern int devfreq_event_reset_event(struct devfreq_event_dev *edev); +extern struct devfreq_event_dev *devfreq_event_get_edev_by_phandle( + struct device *dev, int index); +extern int devfreq_event_get_edev_count(struct device *dev); +extern struct devfreq_event_dev *devfreq_event_add_edev(struct device *dev, + struct devfreq_event_desc *desc); +extern int devfreq_event_remove_edev(struct devfreq_event_dev *edev); +extern struct devfreq_event_dev *devm_devfreq_event_add_edev(struct device *dev, + struct devfreq_event_desc *desc); +extern void devm_devfreq_event_remove_edev(struct device *dev, + struct devfreq_event_dev *edev); +static inline void *devfreq_event_get_drvdata(struct devfreq_event_dev *edev) +{ + return edev->desc->driver_data; +} +#else +static inline int devfreq_event_enable_edev(struct devfreq_event_dev *edev) +{ + return -EINVAL; +} + +static inline int devfreq_event_disable_edev(struct devfreq_event_dev *edev) +{ + return -EINVAL; +} + +static inline bool devfreq_event_is_enabled(struct devfreq_event_dev *edev) +{ + return false; +} + +static inline int devfreq_event_set_event(struct devfreq_event_dev *edev) +{ + return -EINVAL; +} + +static inline int devfreq_event_get_event(struct devfreq_event_dev *edev, + struct devfreq_event_data *edata) +{ + return -EINVAL; +} + +static inline int devfreq_event_reset_event(struct 
devfreq_event_dev *edev) +{ + return -EINVAL; +} + +static inline void *devfreq_event_get_drvdata(struct devfreq_event_dev *edev) +{ + return ERR_PTR(-EINVAL); +} + +static inline struct devfreq_event_dev *devfreq_event_get_edev_by_phandle( + struct device *dev, int index) +{ + return ERR_PTR(-EINVAL); +} + +static inline int devfreq_event_get_edev_count(struct device *dev) +{ + return -EINVAL; +} + +static inline struct devfreq_event_dev *devfreq_event_add_edev(struct device *dev, + struct devfreq_event_desc *desc) +{ + return ERR_PTR(-EINVAL); +} + +static inline int devfreq_event_remove_edev(struct devfreq_event_dev *edev) +{ + return -EINVAL; +} + +static inline struct devfreq_event_dev *devm_devfreq_event_add_edev( + struct device *dev, + struct devfreq_event_desc *desc) +{ + return ERR_PTR(-EINVAL); +} + +static inline void devm_devfreq_event_remove_edev(struct device *dev, + struct devfreq_event_dev *edev) +{ +} + +static inline void *devfreq_event_get_drvdata(struct devfreq_event_dev *edev) +{ + return NULL; +} +#endif /* CONFIG_PM_DEVFREQ_EVENT */ + +#endif /* __LINUX_DEVFREQ_EVENT_H__ */ -- cgit v1.2.3 From 38e478c4489a845a5e8baf7849c286af5fed5b66 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Oct 2014 15:56:21 +0200 Subject: quota: Split ->set_xstate callback into two Split ->set_xstate callback into two callbacks - one for turning quotas on (->quota_enable) and one for turning quotas off (->quota_disable). That way we don't have to pass quotactl command into the callback which seems cleaner. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/quota/quota.c | 20 +++++++++++++---- fs/xfs/xfs_quotaops.c | 59 ++++++++++++++++++++++++++++++++------------------- include/linux/quota.h | 3 ++- 3 files changed, 55 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 6f3856328eea..e2ae2b99e555 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -208,15 +208,26 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id, return sb->s_qcop->set_dqblk(sb, qid, &fdq); } -static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) +static int quota_enable(struct super_block *sb, void __user *addr) { __u32 flags; if (copy_from_user(&flags, addr, sizeof(flags))) return -EFAULT; - if (!sb->s_qcop->set_xstate) + if (!sb->s_qcop->quota_enable) return -ENOSYS; - return sb->s_qcop->set_xstate(sb, flags, cmd); + return sb->s_qcop->quota_enable(sb, flags); +} + +static int quota_disable(struct super_block *sb, void __user *addr) +{ + __u32 flags; + + if (copy_from_user(&flags, addr, sizeof(flags))) + return -EFAULT; + if (!sb->s_qcop->quota_disable) + return -ENOSYS; + return sb->s_qcop->quota_disable(sb, flags); } static int quota_getxstate(struct super_block *sb, void __user *addr) @@ -447,8 +458,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, return -ENOSYS; return sb->s_qcop->quota_sync(sb, type); case Q_XQUOTAON: + return quota_enable(sb, addr); case Q_XQUOTAOFF: - return quota_setxstate(sb, cmd, addr); + return quota_disable(sb, addr); case Q_XQUOTARM: return quota_rmxquota(sb, addr); case Q_XGETQSTAT: diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index a226203fa46a..6923905ab33d 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -64,19 +64,10 @@ xfs_fs_get_xstatev( return xfs_qm_scall_getqstatv(mp, fqs); } -STATIC int -xfs_fs_set_xstate( - struct super_block *sb, - unsigned int uflags, - int op) +static unsigned int 
+xfs_quota_flags(unsigned int uflags) { - struct xfs_mount *mp = XFS_M(sb); - unsigned int flags = 0; - - if (sb->s_flags & MS_RDONLY) - return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; + unsigned int flags = 0; if (uflags & FS_QUOTA_UDQ_ACCT) flags |= XFS_UQUOTA_ACCT; @@ -91,16 +82,39 @@ xfs_fs_set_xstate( if (uflags & FS_QUOTA_PDQ_ENFD) flags |= XFS_PQUOTA_ENFD; - switch (op) { - case Q_XQUOTAON: - return xfs_qm_scall_quotaon(mp, flags); - case Q_XQUOTAOFF: - if (!XFS_IS_QUOTA_ON(mp)) - return -EINVAL; - return xfs_qm_scall_quotaoff(mp, flags); - } + return flags; +} + +STATIC int +xfs_quota_enable( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + + return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags)); +} + +STATIC int +xfs_quota_disable( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -EINVAL; - return -EINVAL; + return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags)); } STATIC int @@ -166,7 +180,8 @@ xfs_fs_set_dqblk( const struct quotactl_ops xfs_quotactl_operations = { .get_xstatev = xfs_fs_get_xstatev, .get_xstate = xfs_fs_get_xstate, - .set_xstate = xfs_fs_set_xstate, + .quota_enable = xfs_quota_enable, + .quota_disable = xfs_quota_disable, .rm_xquota = xfs_fs_rm_xquota, .get_dqblk = xfs_fs_get_dqblk, .set_dqblk = xfs_fs_set_dqblk, diff --git a/include/linux/quota.h b/include/linux/quota.h index 227f37f463c9..4da497b807c4 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -371,13 +371,14 @@ struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, struct path *); int (*quota_on_meta)(struct super_block *, int, int); int (*quota_off)(struct super_block *, int); + int (*quota_enable)(struct super_block *, unsigned int); + int (*quota_disable)(struct super_block *, unsigned int); int (*quota_sync)(struct super_block *, int); int (*get_info)(struct super_block *, int, struct if_dqinfo *); int (*set_info)(struct super_block *, int, struct if_dqinfo *); int (*get_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); int (*set_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); int (*get_xstate)(struct super_block *, struct fs_quota_stat *); - int (*set_xstate)(struct super_block *, unsigned int, int); int (*get_xstatev)(struct super_block *, struct fs_quota_statv *); int (*rm_xquota)(struct super_block *, unsigned int); }; -- cgit v1.2.3 From d3b863248577504f6eecca2a464d6ddf86b71584 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Oct 2014 16:07:12 +0200 Subject: quota: Wire up ->quota_{enable,disable} callbacks into Q_QUOTA{ON,OFF} Make Q_QUOTAON / Q_QUOTAOFF quotactl call ->quota_enable / ->quota_disable callback when provided. To match current behavior of ocfs2 & ext4 we make these quotactls turn on / off quota enforcement for appropriate quota type. 
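For illustration, the userspace side that now reaches these callbacks is the plain quotactl(2) path (a sketch only; "/dev/sda1" is a stand-in, and on a filesystem providing ->quota_enable the id and path arguments are never consulted, since the new quota_quotaon() returns before touching them):

	#include <sys/quota.h>

	/* enable user-quota enforcement; with this patch it ends up in
	 * sb->s_qcop->quota_enable(sb, FS_QUOTA_UDQ_ENFD) */
	if (quotactl(QCMD(Q_QUOTAON, USRQUOTA), "/dev/sda1", 0, NULL) < 0)
		perror("quotactl");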
Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/quota/quota.c | 31 +++++++++++++++++++++++++++---- include/linux/quotaops.h | 2 ++ 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index e2ae2b99e555..ce78a70a596f 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -66,18 +66,43 @@ static int quota_sync_all(int type) return ret; } +unsigned int qtype_enforce_flag(int type) +{ + switch (type) { + case USRQUOTA: + return FS_QUOTA_UDQ_ENFD; + case GRPQUOTA: + return FS_QUOTA_GDQ_ENFD; + case PRJQUOTA: + return FS_QUOTA_PDQ_ENFD; + } + return 0; +} + static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, struct path *path) { - if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta) + if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta && + !sb->s_qcop->quota_enable) return -ENOSYS; if (sb->s_qcop->quota_on_meta) return sb->s_qcop->quota_on_meta(sb, type, id); + if (sb->s_qcop->quota_enable) + return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type)); if (IS_ERR(path)) return PTR_ERR(path); return sb->s_qcop->quota_on(sb, type, id, path); } +static int quota_quotaoff(struct super_block *sb, int type) +{ + if (!sb->s_qcop->quota_off && !sb->s_qcop->quota_disable) + return -ENOSYS; + if (sb->s_qcop->quota_disable) + return sb->s_qcop->quota_disable(sb, qtype_enforce_flag(type)); + return sb->s_qcop->quota_off(sb, type); +} + static int quota_getfmt(struct super_block *sb, int type, void __user *addr) { __u32 fmt; @@ -440,9 +465,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, case Q_QUOTAON: return quota_quotaon(sb, type, cmd, id, path); case Q_QUOTAOFF: - if (!sb->s_qcop->quota_off) - return -ENOSYS; - return sb->s_qcop->quota_off(sb, type); + return quota_quotaoff(sb, type); case Q_GETFMT: return quota_getfmt(sb, type, addr); case Q_GETINFO: diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 29e3455f7d41..ff0b665591d0 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -386,4 +386,6 @@ static inline void dquot_release_reservation_block(struct inode *inode, __dquot_free_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_RESERVE); } +unsigned int qtype_enforce_flag(int type); + #endif /* _LINUX_QUOTAOPS_ */ -- cgit v1.2.3 From 3e2af67e66ff025796af1a8a1fcbb4236304f90c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 6 Oct 2014 18:40:51 +0200 Subject: quota: Add ->quota_{enable,disable} callbacks for VFS quotas Add functions which translate ->quota_enable / ->quota_disable calls into appropriate changes in VFS quota. This will enable filesystems supporting VFS quota files in system inodes to be controlled via Q_XQUOTA[ON|OFF] quotactls for better userspace compatibility. Also provide a vector for quotactl using these functions which can be used by filesystems with quota files stored in hidden system files. 
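The enable path below is built around a walk-forward/roll-back idiom that is easy to miss in the full listing; stripped to a skeleton (a sketch, with enable_one()/disable_one() standing in for the dquot_enable()/dquot_disable() calls in the real function):

	for (type = 0; type < MAXQUOTAS; type++) {
		if (!(flags & qtype_enforce_flag(type)))
			continue;
		ret = enable_one(type);
		if (ret < 0)
			goto out_err;
	}
	return 0;
out_err:
	/* undo only the types we already switched on */
	for (type--; type >= 0; type--)
		if (flags & qtype_enforce_flag(type))
			disable_one(type);
	return ret;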
Signed-off-by: Jan Kara --- fs/quota/dquot.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/quotaops.h | 1 + 2 files changed, 92 insertions(+) (limited to 'include/linux') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 29eb9dc5728a..b47d0c17ea6f 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2385,6 +2385,86 @@ out: } EXPORT_SYMBOL(dquot_quota_on_mount); +static int dquot_quota_enable(struct super_block *sb, unsigned int flags) +{ + int ret; + int type; + struct quota_info *dqopt = sb_dqopt(sb); + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) + return -ENOSYS; + /* Accounting cannot be turned on while fs is mounted */ + flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT); + if (!flags) + return -EINVAL; + for (type = 0; type < MAXQUOTAS; type++) { + if (!(flags & qtype_enforce_flag(type))) + continue; + /* Can't enforce without accounting */ + if (!sb_has_quota_usage_enabled(sb, type)) + return -EINVAL; + ret = dquot_enable(dqopt->files[type], type, + dqopt->info[type].dqi_fmt_id, + DQUOT_LIMITS_ENABLED); + if (ret < 0) + goto out_err; + } + return 0; +out_err: + /* Backout enforcement enablement we already did */ + for (type--; type >= 0; type--) { + if (flags & qtype_enforce_flag(type)) + dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); + } + /* Error code translation for better compatibility with XFS */ + if (ret == -EBUSY) + ret = -EEXIST; + return ret; +} + +static int dquot_quota_disable(struct super_block *sb, unsigned int flags) +{ + int ret; + int type; + struct quota_info *dqopt = sb_dqopt(sb); + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) + return -ENOSYS; + /* + * We don't support turning off accounting via quotactl. In principle + * quota infrastructure can do this but filesystems don't expect + * userspace to be able to do it. + */ + if (flags & + (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT)) + return -EOPNOTSUPP; + + /* Filter out limits not enabled */ + for (type = 0; type < MAXQUOTAS; type++) + if (!sb_has_quota_limits_enabled(sb, type)) + flags &= ~qtype_enforce_flag(type); + /* Nothing left? 
*/ + if (!flags) + return -EEXIST; + for (type = 0; type < MAXQUOTAS; type++) { + if (flags & qtype_enforce_flag(type)) { + ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); + if (ret < 0) + goto out_err; + } + } + return 0; +out_err: + /* Backout enforcement disabling we already did */ + for (type--; type >= 0; type--) { + if (flags & qtype_enforce_flag(type)) + dquot_enable(dqopt->files[type], type, + dqopt->info[type].dqi_fmt_id, + DQUOT_LIMITS_ENABLED); + } + return ret; +} + static inline qsize_t qbtos(qsize_t blocks) { return blocks << QIF_DQBLKSIZE_BITS; @@ -2614,6 +2694,17 @@ const struct quotactl_ops dquot_quotactl_ops = { }; EXPORT_SYMBOL(dquot_quotactl_ops); +const struct quotactl_ops dquot_quotactl_sysfile_ops = { + .quota_enable = dquot_quota_enable, + .quota_disable = dquot_quota_disable, + .quota_sync = dquot_quota_sync, + .get_info = dquot_get_dqinfo, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk +}; +EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); + static int do_proc_dqstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index ff0b665591d0..df73258cca47 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -166,6 +166,7 @@ static inline bool sb_has_quota_active(struct super_block *sb, int type) */ extern const struct dquot_operations dquot_operations; extern const struct quotactl_ops dquot_quotactl_ops; +extern const struct quotactl_ops dquot_quotactl_sysfile_ops; #else -- cgit v1.2.3 From aaa3daed156ff3c6acb28c8b18028f8b57d6c91b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Oct 2014 18:35:31 +0200 Subject: quota: Remove quota_on_meta callback There are no more users for the quota_on_meta callback. Just remove it. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/quota/quota.c | 5 +---- include/linux/quota.h | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/quota/quota.c b/fs/quota/quota.c index ce78a70a596f..d14a799c7785 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -82,11 +82,8 @@ unsigned int qtype_enforce_flag(int type) static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, struct path *path) { - if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta && - !sb->s_qcop->quota_enable) + if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable) return -ENOSYS; - if (sb->s_qcop->quota_on_meta) - return sb->s_qcop->quota_on_meta(sb, type, id); if (sb->s_qcop->quota_enable) return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type)); if (IS_ERR(path)) return PTR_ERR(path); diff --git a/include/linux/quota.h b/include/linux/quota.h index 4da497b807c4..0938159d65c8 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -369,7 +369,6 @@ struct qc_dqblk { /* Operations handling requests from userspace */ struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, struct path *); - int (*quota_on_meta)(struct super_block *, int, int); int (*quota_off)(struct super_block *, int); int (*quota_enable)(struct super_block *, unsigned int); int (*quota_disable)(struct super_block *, unsigned int); -- cgit v1.2.3 From b10a08194c2b615955dfab2300331a90ae9344c7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 9 Oct 2014 16:54:13 +0200 Subject: quota: Store maximum space limit in bytes Currently the maximum space limit a quota format supports is expressed in blocks; however, since we store space limits in bytes, this is somewhat confusing.
So store the maximum limit in bytes as well. Also rename the field to match the new unit and related inode field to match the new naming scheme. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/ocfs2/quota_local.c | 4 ++-- fs/quota/dquot.c | 18 ++++-------------- fs/quota/quota_v1.c | 4 ++-- fs/quota/quota_v2.c | 10 +++++----- include/linux/quota.h | 4 ++-- 5 files changed, 15 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 55f832f553cb..89c0b2620814 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -701,8 +701,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) /* We don't need the lock and we have to acquire quota file locks * which will later depend on this lock */ mutex_unlock(&sb_dqopt(sb)->dqio_mutex); - info->dqi_maxblimit = 0x7fffffffffffffffLL; - info->dqi_maxilimit = 0x7fffffffffffffffLL; + info->dqi_max_spc_limit = 0x7fffffffffffffffLL; + info->dqi_max_ino_limit = 0x7fffffffffffffffLL; oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); if (!oinfo) { mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b47d0c17ea6f..0ccd4ba3a246 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2465,16 +2465,6 @@ out_err: return ret; } -static inline qsize_t qbtos(qsize_t blocks) -{ - return blocks << QIF_DQBLKSIZE_BITS; -} - -static inline qsize_t stoqb(qsize_t space) -{ - return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; -} - /* Generic routine for getting common part of quota structure */ static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) { @@ -2524,13 +2514,13 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) return -EINVAL; if (((di->d_fieldmask & QC_SPC_SOFT) && - stoqb(di->d_spc_softlimit) > dqi->dqi_maxblimit) || + di->d_spc_softlimit > dqi->dqi_max_spc_limit) || ((di->d_fieldmask & QC_SPC_HARD) && - stoqb(di->d_spc_hardlimit) > dqi->dqi_maxblimit) || + di->d_spc_hardlimit > dqi->dqi_max_spc_limit) || ((di->d_fieldmask & QC_INO_SOFT) && - (di->d_ino_softlimit > dqi->dqi_maxilimit)) || + (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) || ((di->d_fieldmask & QC_INO_HARD) && - (di->d_ino_hardlimit > dqi->dqi_maxilimit))) + (di->d_ino_hardlimit > dqi->dqi_max_ino_limit))) return -ERANGE; spin_lock(&dq_data_lock); diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index 469c6848b322..8fe79beced5c 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -169,8 +169,8 @@ static int v1_read_file_info(struct super_block *sb, int type) } ret = 0; /* limits are stored as unsigned 32-bit data */ - dqopt->info[type].dqi_maxblimit = 0xffffffff; - dqopt->info[type].dqi_maxilimit = 0xffffffff; + dqopt->info[type].dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS; + dqopt->info[type].dqi_max_ino_limit = 0xffffffff; dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? 
dqblk.dqb_itime : MAX_IQ_TIME; dqopt->info[type].dqi_bgrace = diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 54cac436ac72..9cb10d7197f7 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -117,12 +117,12 @@ static int v2_read_file_info(struct super_block *sb, int type) qinfo = info->dqi_priv; if (version == 0) { /* limits are stored as unsigned 32-bit data */ - info->dqi_maxblimit = 0xffffffff; - info->dqi_maxilimit = 0xffffffff; + info->dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS; + info->dqi_max_ino_limit = 0xffffffff; } else { - /* used space is stored as unsigned 64-bit value */ - info->dqi_maxblimit = 0xffffffffffffffffULL; /* 2^64-1 */ - info->dqi_maxilimit = 0xffffffffffffffffULL; + /* used space is stored as unsigned 64-bit value in bytes */ + info->dqi_max_spc_limit = 0xffffffffffffffffULL; /* 2^64-1 */ + info->dqi_max_ino_limit = 0xffffffffffffffffULL; } info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); diff --git a/include/linux/quota.h b/include/linux/quota.h index 0938159d65c8..d534e8ed308a 100644 --- a/include/linux/quota.h +++ b/include/linux/quota.h @@ -216,8 +216,8 @@ struct mem_dqinfo { unsigned long dqi_flags; unsigned int dqi_bgrace; unsigned int dqi_igrace; - qsize_t dqi_maxblimit; - qsize_t dqi_maxilimit; + qsize_t dqi_max_spc_limit; + qsize_t dqi_max_ino_limit; void *dqi_priv; }; -- cgit v1.2.3 From d4bcef3fbe887ff93b58da4fcf6df1eee416e8fa Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 29 Jan 2015 20:37:07 +0900 Subject: net: Fix vlan_get_protocol for stacked vlan vlan_get_protocol() could not get network protocol if a skb has a 802.1ad vlan tag or multiple vlans, which caused incorrect checksum calculation in several drivers. Fix vlan_get_protocol() to retrieve network protocol instead of incorrect vlan protocol. As the logic is the same as skb_network_protocol(), create a common helper function __vlan_get_protocol() and call it from existing functions. Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 60 +++++++++++++++++++++++++++++++++++++------------ net/core/dev.c | 31 +------------------------ 2 files changed, 47 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 515a35e2a48a..960e666c51e4 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -472,27 +472,59 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query + * @type: first vlan protocol + * @depth: buffer to store length of eth and vlan tags in bytes * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. 
*/ -static inline __be16 vlan_get_protocol(const struct sk_buff *skb) +static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, + int *depth) { - __be16 protocol = 0; - - if (vlan_tx_tag_present(skb) || - skb->protocol != cpu_to_be16(ETH_P_8021Q)) - protocol = skb->protocol; - else { - __be16 proto, *protop; - protop = skb_header_pointer(skb, offsetof(struct vlan_ethhdr, - h_vlan_encapsulated_proto), - sizeof(proto), &proto); - if (likely(protop)) - protocol = *protop; + unsigned int vlan_depth = skb->mac_len; + + /* if type is 802.1Q/AD then the header should already be + * present at mac_len - VLAN_HLEN (if mac_len > 0), or at + * ETH_HLEN otherwise + */ + if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { + if (vlan_depth) { + if (WARN_ON(vlan_depth < VLAN_HLEN)) + return 0; + vlan_depth -= VLAN_HLEN; + } else { + vlan_depth = ETH_HLEN; + } + do { + struct vlan_hdr *vh; + + if (unlikely(!pskb_may_pull(skb, + vlan_depth + VLAN_HLEN))) + return 0; + + vh = (struct vlan_hdr *)(skb->data + vlan_depth); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } while (type == htons(ETH_P_8021Q) || + type == htons(ETH_P_8021AD)); } - return protocol; + if (depth) + *depth = vlan_depth; + + return type; +} + +/** + * vlan_get_protocol - get protocol EtherType. + * @skb: skbuff to query + * + * Returns the EtherType of the packet, regardless of whether it is + * vlan encapsulated (normal or hardware accelerated) or not. + */ +static inline __be16 vlan_get_protocol(struct sk_buff *skb) +{ + return __vlan_get_protocol(skb, skb->protocol, NULL); } static inline void vlan_set_encap_proto(struct sk_buff *skb, diff --git a/net/core/dev.c b/net/core/dev.c index 171420e75b03..c87a2264a02b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2352,7 +2352,6 @@ EXPORT_SYMBOL(skb_checksum_help); __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { - unsigned int vlan_depth = skb->mac_len; __be16 type = skb->protocol; /* Tunnel gso handlers can set protocol to ethernet. */ @@ -2366,35 +2365,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) type = eth->h_proto; } - /* if skb->protocol is 802.1Q/AD then the header should already be - * present at mac_len - VLAN_HLEN (if mac_len > 0), or at - * ETH_HLEN otherwise - */ - if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { - if (vlan_depth) { - if (WARN_ON(vlan_depth < VLAN_HLEN)) - return 0; - vlan_depth -= VLAN_HLEN; - } else { - vlan_depth = ETH_HLEN; - } - do { - struct vlan_hdr *vh; - - if (unlikely(!pskb_may_pull(skb, - vlan_depth + VLAN_HLEN))) - return 0; - - vh = (struct vlan_hdr *)(skb->data + vlan_depth); - type = vh->h_vlan_encapsulated_proto; - vlan_depth += VLAN_HLEN; - } while (type == htons(ETH_P_8021Q) || - type == htons(ETH_P_8021AD)); - } - - *depth = vlan_depth; - - return type; + return __vlan_get_protocol(skb, type, depth); } /** -- cgit v1.2.3 From 448c7f3830ca283e485aa943279acea6bde8b270 Mon Sep 17 00:00:00 2001 From: Henrik Rydberg Date: Sun, 1 Feb 2015 11:25:14 -0800 Subject: Input: MT - add support for balanced slot assignment Some devices are not fast enough to differentiate between a fast-moving contact and a new contact. This problem cannot be fully resolved because information is truly missing, but it is possible to safe-guard against obvious mistakes by restricting movement with a maximum displacement. 
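A hedged usage sketch before the details (the driver name, touch bound and dmax value below are invented for illustration; input_mt_assign_slots() with its new fifth argument is the API this patch introduces, and the other input-MT calls are existing kernel APIs):

#include <linux/input/mt.h>

#define FOO_MAX_TOUCHES	5	/* hypothetical device limit; n <= this */
#define FOO_DMAX	64	/* hypothetical per-frame movement bound */

static void foo_report_contacts(struct input_dev *dev,
				const struct input_mt_pos *pos, int n)
{
	int slots[FOO_MAX_TOUCHES];
	int i;

	/* Contacts that moved farther than FOO_DMAX since the last frame
	 * are treated as new and assigned to unused slots. */
	input_mt_assign_slots(dev, slots, pos, n, FOO_DMAX);

	for (i = 0; i < n; i++) {
		input_mt_slot(dev, slots[i]);
		input_mt_report_slot_state(dev, MT_TOOL_FINGER, true);
		input_report_abs(dev, ABS_MT_POSITION_X, pos[i].x);
		input_report_abs(dev, ABS_MT_POSITION_Y, pos[i].y);
	}
	input_mt_sync_frame(dev);
	input_sync(dev);
}
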
The new problem formulation for dmax > 0 cannot benefit from the speedup for positive definite matrices, but since the convergence is faster, the result is about the same. For a handful of contacts, the latency difference is truly negligible. Suggested-by: Benjamin Tissoires Tested-by: Benjamin Tissoires Signed-off-by: Henrik Rydberg Signed-off-by: Dmitry Torokhov --- drivers/input/input-mt.c | 31 ++++++++++++++++++++----------- drivers/input/mouse/alps.c | 2 +- drivers/input/mouse/bcm5974.c | 2 +- drivers/input/mouse/cypress_ps2.c | 2 +- drivers/input/mouse/synaptics.c | 2 +- drivers/input/touchscreen/pixcir_i2c_ts.c | 2 +- include/linux/input/mt.h | 3 ++- 7 files changed, 27 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/input-mt.c b/drivers/input/input-mt.c index fbe29fcb15c5..cb150a1dbaff 100644 --- a/drivers/input/input-mt.c +++ b/drivers/input/input-mt.c @@ -293,7 +293,7 @@ void input_mt_sync_frame(struct input_dev *dev) } EXPORT_SYMBOL(input_mt_sync_frame); -static int adjust_dual(int *begin, int step, int *end, int eq) +static int adjust_dual(int *begin, int step, int *end, int eq, int mu) { int f, *p, s, c; @@ -311,9 +311,10 @@ static int adjust_dual(int *begin, int step, int *end, int eq) s = *p; c = (f + s + 1) / 2; - if (c == 0 || (c > 0 && !eq)) + if (c == 0 || (c > mu && (!eq || mu > 0))) return 0; - if (s < 0) + /* Improve convergence for positive matrices by penalizing overcovers */ + if (s < 0 && mu <= 0) c *= 2; for (p = begin; p != end; p += step) @@ -322,23 +323,24 @@ static int adjust_dual(int *begin, int step, int *end, int eq) return (c < s && s <= 0) || (f >= 0 && f < c); } -static void find_reduced_matrix(int *w, int nr, int nc, int nrc) +static void find_reduced_matrix(int *w, int nr, int nc, int nrc, int mu) { int i, k, sum; for (k = 0; k < nrc; k++) { for (i = 0; i < nr; i++) - adjust_dual(w + i, nr, w + i + nrc, nr <= nc); + adjust_dual(w + i, nr, w + i + nrc, nr <= nc, mu); sum = 0; for (i = 0; i < nrc; i += nr) - sum += adjust_dual(w + i, 1, w + i + nr, nc <= nr); + sum += adjust_dual(w + i, 1, w + i + nr, nc <= nr, mu); if (!sum) break; } } static int input_mt_set_matrix(struct input_mt *mt, - const struct input_mt_pos *pos, int num_pos) + const struct input_mt_pos *pos, int num_pos, + int mu) { const struct input_mt_pos *p; struct input_mt_slot *s; @@ -352,7 +354,7 @@ static int input_mt_set_matrix(struct input_mt *mt, y = input_mt_get_value(s, ABS_MT_POSITION_Y); for (p = pos; p != pos + num_pos; p++) { int dx = x - p->x, dy = y - p->y; - *w++ = dx * dx + dy * dy; + *w++ = dx * dx + dy * dy - mu; } } @@ -393,17 +395,24 @@ static void input_mt_set_slots(struct input_mt *mt, * @slots: the slot assignment to be filled * @pos: the position array to match * @num_pos: number of positions + * @dmax: maximum ABS_MT_POSITION displacement (zero for infinite) * * Performs a best match against the current contacts and returns * the slot assignment list. New contacts are assigned to unused * slots. * + * The assignments are balanced so that all coordinate displacements are + * below the euclidian distance dmax. If no such assignment can be found, + * some contacts are assigned to unused slots. + * * Returns zero on success, or negative error in case of failure. 
*/ int input_mt_assign_slots(struct input_dev *dev, int *slots, - const struct input_mt_pos *pos, int num_pos) + const struct input_mt_pos *pos, int num_pos, + int dmax) { struct input_mt *mt = dev->mt; + int mu = 2 * dmax * dmax; int nrc; if (!mt || !mt->red) @@ -413,8 +422,8 @@ int input_mt_assign_slots(struct input_dev *dev, int *slots, if (num_pos < 1) return 0; - nrc = input_mt_set_matrix(mt, pos, num_pos); - find_reduced_matrix(mt->red, num_pos, nrc / num_pos, nrc); + nrc = input_mt_set_matrix(mt, pos, num_pos, mu); + find_reduced_matrix(mt->red, num_pos, nrc / num_pos, nrc, mu); input_mt_set_slots(mt, slots, num_pos); return 0; diff --git a/drivers/input/mouse/alps.c b/drivers/input/mouse/alps.c index f719f28d370c..f205b8be2ce4 100644 --- a/drivers/input/mouse/alps.c +++ b/drivers/input/mouse/alps.c @@ -435,7 +435,7 @@ static void alps_report_mt_data(struct psmouse *psmouse, int n) struct alps_fields *f = &priv->f; int i, slot[MAX_TOUCHES]; - input_mt_assign_slots(dev, slot, f->mt, n); + input_mt_assign_slots(dev, slot, f->mt, n, 0); for (i = 0; i < n; i++) alps_set_slot(dev, slot[i], f->mt[i].x, f->mt[i].y); diff --git a/drivers/input/mouse/bcm5974.c b/drivers/input/mouse/bcm5974.c index c329cdb0b91a..b10709f04615 100644 --- a/drivers/input/mouse/bcm5974.c +++ b/drivers/input/mouse/bcm5974.c @@ -564,7 +564,7 @@ static int report_tp_state(struct bcm5974 *dev, int size) dev->index[n++] = &f[i]; } - input_mt_assign_slots(input, dev->slots, dev->pos, n); + input_mt_assign_slots(input, dev->slots, dev->pos, n, 0); for (i = 0; i < n; i++) report_finger_data(input, dev->slots[i], diff --git a/drivers/input/mouse/cypress_ps2.c b/drivers/input/mouse/cypress_ps2.c index 8af34ffe208b..9118a1861a45 100644 --- a/drivers/input/mouse/cypress_ps2.c +++ b/drivers/input/mouse/cypress_ps2.c @@ -538,7 +538,7 @@ static void cypress_process_packet(struct psmouse *psmouse, bool zero_pkt) pos[i].y = contact->y; } - input_mt_assign_slots(input, slots, pos, n); + input_mt_assign_slots(input, slots, pos, n, 0); for (i = 0; i < n; i++) { contact = &report_data.contacts[i]; diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index f89de8971bf8..a3692e3b7cab 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -809,7 +809,7 @@ static void synaptics_report_mt_data(struct psmouse *psmouse, pos[i].y = synaptics_invert_y(hw[i]->y); } - input_mt_assign_slots(dev, slot, pos, nsemi); + input_mt_assign_slots(dev, slot, pos, nsemi, 0); for (i = 0; i < nsemi; i++) { input_mt_slot(dev, slot[i]); diff --git a/drivers/input/touchscreen/pixcir_i2c_ts.c b/drivers/input/touchscreen/pixcir_i2c_ts.c index 4fb5537fdd42..2c2107147319 100644 --- a/drivers/input/touchscreen/pixcir_i2c_ts.c +++ b/drivers/input/touchscreen/pixcir_i2c_ts.c @@ -126,7 +126,7 @@ static void pixcir_ts_report(struct pixcir_i2c_ts_data *ts, pos[i].y = touch->y; } - input_mt_assign_slots(ts->input, slots, pos, n); + input_mt_assign_slots(ts->input, slots, pos, n, 0); } for (i = 0; i < n; i++) { diff --git a/include/linux/input/mt.h b/include/linux/input/mt.h index f583ff639776..d7188de4db96 100644 --- a/include/linux/input/mt.h +++ b/include/linux/input/mt.h @@ -119,7 +119,8 @@ struct input_mt_pos { }; int input_mt_assign_slots(struct input_dev *dev, int *slots, - const struct input_mt_pos *pos, int num_pos); + const struct input_mt_pos *pos, int num_pos, + int dmax); int input_mt_get_slot_by_key(struct input_dev *dev, int key); -- cgit v1.2.3 From 00845eb968ead28007338b2bb852b8beef816583 Mon Sep 17 
00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Feb 2015 12:23:32 -0800 Subject: sched: don't cause task state changes in nested sleep debugging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 8eb23b9f35aa ("sched: Debug nested sleeps") added code to report on nested sleep conditions, which we generally want to avoid because the inner sleeping operation can re-set the thread state to TASK_RUNNING, but that will then cause the outer sleep loop not actually sleep when it calls schedule. However, that's actually valid traditional behavior, with the inner sleep being some fairly rare case (like taking a sleeping lock that normally doesn't actually need to sleep). And the debug code would actually change the state of the task to TASK_RUNNING internally, which makes that kind of traditional and working code not work at all, because now the nested sleep doesn't just sometimes cause the outer one to not block, but will cause it to happen every time. In particular, it will cause the cardbus kernel daemon (pccardd) to basically busy-loop doing scheduling, converting a laptop into a heater, as reported by Bruno Prémont. But there may be other legacy uses of that nested sleep model in other drivers that are also likely to never get converted to the new model. This fixes both cases: - don't set TASK_RUNNING when the nested condition happens (note: even if WARN_ONCE() only _warns_ once, the return value isn't whether the warning happened, but whether the condition for the warning was true. So despite the warning only happening once, the "if (WARN_ON(..))" would trigger for every nested sleep. - in the cases where we knowingly disable the warning by using "sched_annotate_sleep()", don't change the task state (that is used for all core scheduling decisions), instead use '->task_state_change' that is used for the debugging decision itself. (Credit for the second part of the fix goes to Oleg Nesterov: "Can't we avoid this subtle change in behaviour DEBUG_ATOMIC_SLEEP adds?" with the suggested change to use 'task_state_change' as part of the test) Reported-and-bisected-by: Bruno Prémont Tested-by: Rafael J Wysocki Acked-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner , Cc: Ilya Dryomov , Cc: Mike Galbraith Cc: Ingo Molnar Cc: Peter Hurley , Cc: Davidlohr Bueso , Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 +- kernel/sched/core.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5449d2f4a1ef..64ce58bee6f5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -176,7 +176,7 @@ extern int _cond_resched(void); */ # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) -# define sched_annotate_sleep() __set_current_state(TASK_RUNNING) +# define sched_annotate_sleep() (current->task_state_change = 0) #else static inline void ___might_sleep(const char *file, int line, int preempt_offset) { } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c0accc00566e..e628cb11b560 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7292,13 +7292,12 @@ void __might_sleep(const char *file, int line, int preempt_offset) * since we will exit with TASK_RUNNING make sure we enter with it, * otherwise we will destroy state. 
*/ - if (WARN_ONCE(current->state != TASK_RUNNING, + WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, "do not call blocking ops when !TASK_RUNNING; " "state=%lx set at [<%p>] %pS\n", current->state, (void *)current->task_state_change, - (void *)current->task_state_change)) - __set_current_state(TASK_RUNNING); + (void *)current->task_state_change); ___might_sleep(file, line, preempt_offset); } -- cgit v1.2.3 From fd979c0132074856975a6e79bc2226b99435ec5b Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Fri, 30 Jan 2015 13:45:57 -0800 Subject: perf: provide sysfs_show for struct perf_pmu_events_attr (struct perf_pmu_events_attr) is defined in include/linux/perf_event.h, but the only "show" for it is in x86 and contains x86-specific stuff. Make a generic one for those of us who are just using the event_str. Signed-off-by: Cody P Schafer Signed-off-by: Sukadev Bhattiprolu Acked-by: Jiri Olsa Signed-off-by: Michael Ellerman --- include/linux/perf_event.h | 3 +++ kernel/events/core.c | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 486e84ccb1f9..58f59bdb590b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -897,6 +897,9 @@ struct perf_pmu_events_attr { const char *event_str; }; +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page); + #define PMU_EVENT_ATTR(_name, _var, _id, _show) \ static struct perf_pmu_events_attr _var = { \ .attr = __ATTR(_name, 0444, _show, NULL), \ diff --git a/kernel/events/core.c b/kernel/events/core.c index 4c1ee7f2bebc..934687f8d51b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8276,6 +8276,18 @@ void __init perf_event_init(void) != 1024); } +ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_attr, attr); + + if (pmu_attr->event_str) + return sprintf(page, "%s\n", pmu_attr->event_str); + + return 0; +} + static int __init perf_event_sysfs_init(void) { struct pmu *pmu; -- cgit v1.2.3 From f0405b816149665393cc62b9e5082fc2d79714df Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Fri, 30 Jan 2015 13:45:58 -0800 Subject: perf: add PMU_EVENT_ATTR_STRING() helper Helper for constructing static struct perf_pmu_events_attr instances. Signed-off-by: Cody P Schafer Signed-off-by: Sukadev Bhattiprolu Acked-by: Jiri Olsa Signed-off-by: Michael Ellerman --- include/linux/perf_event.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 58f59bdb590b..1d3631448b91 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -906,6 +906,13 @@ static struct perf_pmu_events_attr _var = { \ .id = _id, \ }; +#define PMU_EVENT_ATTR_STRING(_name, _var, _str) \ +static struct perf_pmu_events_attr _var = { \ + .attr = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \ + .id = 0, \ + .event_str = _str, \ +}; + #define PMU_FORMAT_ATTR(_name, _format) \ static ssize_t \ _name##_show(struct device *dev, \ -- cgit v1.2.3 From aafb3e98b27977148c8c86499684f8f5c3decfbb Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 29 Jan 2015 22:40:11 -0800 Subject: netdev: introduce new NETIF_F_HW_SWITCH_OFFLOAD feature flag for switch device offloads This is a high-level feature flag for all switch ASIC offloads; switch drivers set this flag on switch ports, as in the sketch below. 
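A minimal driver-side sketch (the helper below is hypothetical; only the NETIF_F_HW_SWITCH_OFFLOAD flag itself comes from this patch):

#include <linux/netdevice.h>

/* Hypothetical switch driver: advertise the offload flag on the
 * net_device registered for one front-panel port of the ASIC. */
static void example_switch_port_setup_features(struct net_device *dev)
{
	dev->features |= NETIF_F_HW_SWITCH_OFFLOAD;
}
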
Logical devices like bridges, bonds and vxlans can inherit this flag from their slaves/ports. The patch also adds the flag to NETIF_F_ONE_FOR_ALL, so that it gets propagated to the upper devices (bridges and bonds). Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 8e30685affeb..7d59dc6ab789 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -66,6 +66,7 @@ enum { NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */ NETIF_F_HW_L2FW_DOFFLOAD_BIT, /* Allow L2 Forwarding in Hardware */ NETIF_F_BUSY_POLL_BIT, /* Busy poll */ + NETIF_F_HW_SWITCH_OFFLOAD_BIT, /* HW switch offload */ /* * Add your fresh new feature above and remember to update @@ -124,6 +125,7 @@ enum { #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) #define NETIF_F_HW_L2FW_DOFFLOAD __NETIF_F(HW_L2FW_DOFFLOAD) #define NETIF_F_BUSY_POLL __NETIF_F(BUSY_POLL) +#define NETIF_F_HW_SWITCH_OFFLOAD __NETIF_F(HW_SWITCH_OFFLOAD) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ @@ -159,7 +161,9 @@ enum { */ #define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \ NETIF_F_SG | NETIF_F_HIGHDMA | \ - NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED) + NETIF_F_FRAGLIST | NETIF_F_VLAN_CHALLENGED | \ + NETIF_F_HW_SWITCH_OFFLOAD) + /* * If one device doesn't support one of these features, then disable it * for all in netdev_increment_features. -- cgit v1.2.3 From add511b38266aa10c1079f9248854e6a415c4dc2 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 29 Jan 2015 22:40:12 -0800 Subject: bridge: add flags argument to ndo_bridge_setlink and ndo_bridge_dellink Bridge flags are needed inside the ndo_bridge_setlink/dellink handlers to avoid another call to parse IFLA_AF_SPEC inside these handlers. This is used later in this series. Signed-off-by: Roopa Prabhu Signed-off-by: David S. 
Miller --- drivers/net/ethernet/emulex/benet/be_main.c | 3 ++- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/rocker/rocker.c | 2 +- include/linux/netdevice.h | 6 ++++-- net/bridge/br_netlink.c | 4 ++-- net/bridge/br_private.h | 4 ++-- net/core/rtnetlink.c | 10 ++++++---- 7 files changed, 18 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 598c5070c629..efed92c7b731 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -4327,7 +4327,8 @@ fw_exit: return status; } -static int be_ndo_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh) +static int be_ndo_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh, + u16 flags) { struct be_adapter *adapter = netdev_priv(dev); struct nlattr *attr, *br_spec; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 7bb421bfd84e..e4086fea4be2 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -7786,7 +7786,7 @@ static int ixgbe_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], } static int ixgbe_ndo_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh) + struct nlmsghdr *nlh, u16 flags) { struct ixgbe_adapter *adapter = netdev_priv(dev); struct nlattr *attr, *br_spec; diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 11f4ffcc113d..f0d607ca5e73 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -3722,7 +3722,7 @@ skip: } static int rocker_port_bridge_setlink(struct net_device *dev, - struct nlmsghdr *nlh) + struct nlmsghdr *nlh, u16 flags) { struct rocker_port *rocker_port = netdev_priv(dev); struct nlattr *protinfo; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3d37c6eb1732..16251e96e6aa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1154,13 +1154,15 @@ struct net_device_ops { int idx); int (*ndo_bridge_setlink)(struct net_device *dev, - struct nlmsghdr *nlh); + struct nlmsghdr *nlh, + u16 flags); int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask); int (*ndo_bridge_dellink)(struct net_device *dev, - struct nlmsghdr *nlh); + struct nlmsghdr *nlh, + u16 flags); int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); int (*ndo_get_phys_port_id)(struct net_device *dev, diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e08b260f33fe..088e80203845 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -494,7 +494,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) } /* Change state and parameters on port. 
*/ -int br_setlink(struct net_device *dev, struct nlmsghdr *nlh) +int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { struct nlattr *protinfo; struct nlattr *afspec; @@ -550,7 +550,7 @@ out: } /* Delete port information */ -int br_dellink(struct net_device *dev, struct nlmsghdr *nlh) +int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) { struct nlattr *afspec; struct net_bridge_port *p; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e8e3f3681680..de0919975a25 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -819,8 +819,8 @@ extern struct rtnl_link_ops br_link_ops; int br_netlink_init(void); void br_netlink_fini(void); void br_ifinfo_notify(int event, struct net_bridge_port *port); -int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg); -int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg); +int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); +int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags); int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fedd7ab4085a..673cb4c6f391 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2991,7 +2991,7 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh); + err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags); if (err) goto out; @@ -3002,7 +3002,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) if (!dev->netdev_ops->ndo_bridge_setlink) err = -EOPNOTSUPP; else - err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh); + err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, + flags); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; @@ -3064,7 +3065,7 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) goto out; } - err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh); + err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (err) goto out; @@ -3075,7 +3076,8 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) if (!dev->netdev_ops->ndo_bridge_dellink) err = -EOPNOTSUPP; else - err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh); + err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh, + flags); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; -- cgit v1.2.3 From 15d0f5ea348b9c4e6d41df294dde38a56a39c7bf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 2 Feb 2015 10:07:59 -0700 Subject: Make super_blocks and sb_lock static The only user outside of fs/super.c is gone now Signed-off-by: Al Viro Acked-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/super.c | 4 ++-- include/linux/fs.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 3b4dadafdd60..05a021638b11 100644 --- a/fs/super.c +++ b/fs/super.c @@ -36,8 +36,8 @@ #include "internal.h" -LIST_HEAD(super_blocks); -DEFINE_SPINLOCK(sb_lock); +static LIST_HEAD(super_blocks); +static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_writers", diff --git a/include/linux/fs.h b/include/linux/fs.h index 65d02de342e1..2f717baefdf8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1183,8 +1183,6 @@ struct mm_struct; #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ -extern 
struct list_head super_blocks; -extern spinlock_t sb_lock; /* Possible states of 'frozen' field */ enum { -- cgit v1.2.3 From 6cae0a4648c0db2a74efb816cd2ce84390c90480 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 16 Aug 2014 13:31:51 +0200 Subject: nfs: add LAYOUT_TYPE_MAX enum value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gives us a nice upper bound for later use in nfsd. Signed-off-by: Christoph Hellwig --- include/linux/nfs4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 022b761dbf0a..8a3589c2542c 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -516,6 +516,7 @@ enum pnfs_layouttype { LAYOUT_NFSV4_1_FILES = 1, LAYOUT_OSD2_OBJECTS = 2, LAYOUT_BLOCK_VOLUME = 3, + LAYOUT_TYPE_MAX }; /* used for both layout return and recall */ -- cgit v1.2.3 From 11afe9f76e121e960445deee5b7f26f0787a1990 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 21 Jan 2015 19:17:03 +0100 Subject: fs: add FL_LAYOUT lease type This (ab-)uses the file locking code to allow filesystems to recall outstanding pNFS layouts on a file. This new lease type is similar but not quite the same as FL_DELEG. An FL_LAYOUT lease can always be granted, and a per-filesystem lock (XFS iolock for the initial implementation) ensures no FL_LAYOUT leases are granted when we would need to recall them. Also included are changes that allow multiple outstanding read leases of different types on the same file as long as they have a different owner. This wasn't a problem until now as nfsd never set FL_LEASE leases, and no one else used FL_DELEG leases, but given that nfsd will also issue FL_LAYOUT leases we will have to handle it now. Signed-off-by: Christoph Hellwig --- fs/locks.c | 14 ++++++++++---- include/linux/fs.h | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/locks.c b/fs/locks.c index 22ac7694cc84..4753218f308e 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -137,7 +137,7 @@ #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) -#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) +#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) static bool lease_breaking(struct file_lock *fl) @@ -1371,6 +1371,8 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose) static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) { + if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) + return false; if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) return false; return locks_conflict(breaker, lease); @@ -1594,11 +1596,14 @@ int fcntl_getlease(struct file *filp) * conflict with the lease we're trying to set. 
*/ static int -check_conflicting_open(const struct dentry *dentry, const long arg) +check_conflicting_open(const struct dentry *dentry, const long arg, int flags) { int ret = 0; struct inode *inode = dentry->d_inode; + if (flags & FL_LAYOUT) + return 0; + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) return -EAGAIN; @@ -1647,7 +1652,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr spin_lock(&ctx->flc_lock); time_out_leases(inode, &dispose); - error = check_conflicting_open(dentry, arg); + error = check_conflicting_open(dentry, arg, lease->fl_flags); if (error) goto out; @@ -1703,7 +1708,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr * precedes these checks. */ smp_mb(); - error = check_conflicting_open(dentry, arg); + error = check_conflicting_open(dentry, arg, lease->fl_flags); if (error) { locks_unlink_lock_ctx(lease, &ctx->flc_lease_cnt); goto out; @@ -1787,6 +1792,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp, WARN_ON_ONCE(1); return -ENOLCK; } + return generic_add_lease(filp, arg, flp, priv); default: return -EINVAL; diff --git a/include/linux/fs.h b/include/linux/fs.h index ddd2fa7cefd3..84740145f835 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -875,6 +875,7 @@ static inline struct file *get_file(struct file *f) #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ +#define FL_LAYOUT 2048 /* outstanding pNFS layout */ /* * Special return value from posix_lock_file() and vfs_lock_file() for @@ -2037,6 +2038,16 @@ static inline int break_deleg_wait(struct inode **delegated_inode) return ret; } +static inline int break_layout(struct inode *inode, bool wait) +{ + smp_mb(); + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) + return __break_lease(inode, + wait ? O_WRONLY : O_WRONLY | O_NONBLOCK, + FL_LAYOUT); + return 0; +} + #else /* !CONFIG_FILE_LOCKING */ static inline int locks_mandatory_locked(struct file *file) { @@ -2092,6 +2103,11 @@ static inline int break_deleg_wait(struct inode **delegated_inode) return 0; } +static inline int break_layout(struct inode *inode, bool wait) +{ + return 0; +} + #endif /* CONFIG_FILE_LOCKING */ /* fs/open.c */ -- cgit v1.2.3 From 9cf514ccfacb301f3b1b4509a8ce25dffad55880 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 5 May 2014 13:11:59 +0200 Subject: nfsd: implement pNFS operations Add support for the GETDEVICEINFO, LAYOUTGET, LAYOUTCOMMIT and LAYOUTRETURN NFSv4.1 operations, as well as backing code to manage outstanding layouts and devices. Layout management is very straightforward, with an nfs4_layout_stateid structure that extends nfs4_stid to manage layout stateids as the top-level structure. It is linked into the nfs4_file and nfs4_client structures like the other stateids, and contains a linked list of layouts that hang off the stateid. The actual layout operations are implemented in layout drivers that are not part of this commit, but will be added later. The worst part of this commit is the management of the pNFS device IDs, which suffers from a specification that is not sanely implementable due to the fact that the device-IDs are global and not bound to an export, and have a small enough size so that we can't store the fsid portion of a file handle, and must never be reused. 
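For orientation, a sketch of the device-ID layout this scheme implies (the struct itself lives in fs/nfsd/pnfs.h, which falls outside the 'include/linux' filter of this log; the field names match the nfsd4_set_deviceid() and devid_hashfn() code below):

struct nfsd4_deviceid {
	u64	fsid_idx;	/* index into the global fsid hash */
	u32	generation;	/* bumped when the device changes */
	u32	pad;		/* reserved, e.g. for multiple devices per export */
};
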
As we still need to perform all export authentication and validation checks on a device ID passed to GETDEVICEINFO, we are caught between a rock and a hard place. To work around this issue we add a new hash that maps from a 64-bit integer to an fsid so that we can look up the export to authenticate against it, a 32-bit integer as a generation that we can bump when changing the device, and a currently unused 32-bit integer that could be used in the future to handle more than a single device per export. Entries in this hash table are never deleted as we can't reuse the ids anyway, and they would have a severe lifetime problem as Linux export structures are temporary structures that can go away under load. Parts of the XDR data, structures and marshaling/unmarshaling code, as well as many concepts are derived from the old pNFS server implementation from Andy Adamson, Benny Halevy, Dean Hildebrand, Marc Eshel, Fred Isaman, Mike Sager, Ricardo Labiaga and many others. Signed-off-by: Christoph Hellwig --- fs/nfsd/Kconfig | 10 + fs/nfsd/Makefile | 1 + fs/nfsd/export.c | 8 + fs/nfsd/export.h | 2 + fs/nfsd/nfs4layouts.c | 487 +++++++++++++++++++++++++++++++++++++++ fs/nfsd/nfs4proc.c | 302 ++++++++++++++++++++++++ fs/nfsd/nfs4state.c | 16 +- fs/nfsd/nfs4xdr.c | 312 +++++++++++++++++++++++++ fs/nfsd/nfsctl.c | 9 +- fs/nfsd/nfsd.h | 16 +- fs/nfsd/pnfs.h | 80 +++++++ fs/nfsd/state.h | 21 ++ fs/nfsd/xdr4.h | 59 +++++ include/linux/nfs4.h | 1 + include/uapi/linux/nfsd/debug.h | 1 + include/uapi/linux/nfsd/export.h | 4 +- 16 files changed, 1324 insertions(+), 5 deletions(-) create mode 100644 fs/nfsd/nfs4layouts.c create mode 100644 fs/nfsd/pnfs.h (limited to 'include/linux') diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 73395156bdb4..683bf718aead 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -82,6 +82,16 @@ config NFSD_V4 If unsure, say N. +config NFSD_PNFS + bool "NFSv4.1 server support for Parallel NFS (pNFS)" + depends on NFSD_V4 + help + This option enables support for the parallel NFS features of the + minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS + server. + + If unsure, say N. 
+ config NFSD_V4_SECURITY_LABEL bool "Provide Security Label support for NFSv4 server" depends on NFSD_V4 && SECURITY diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index af32ef06b4fe..5806270a8567 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -12,3 +12,4 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ nfs4acl.o nfs4callback.o nfs4recover.o +nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 30a739d896ff..c3e3b6e55ae2 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -20,6 +20,7 @@ #include "nfsd.h" #include "nfsfh.h" #include "netns.h" +#include "pnfs.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT @@ -545,6 +546,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) exp.ex_client = dom; exp.cd = cd; + exp.ex_devid_map = NULL; /* expiry */ err = -EINVAL; @@ -621,6 +623,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if (!gid_valid(exp.ex_anon_gid)) goto out4; err = 0; + + nfsd4_setup_layout_type(&exp); } expp = svc_export_lookup(&exp); @@ -703,6 +707,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_fslocs.locations = NULL; new->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = 0; + new->ex_layout_type = 0; new->ex_uuid = NULL; new->cd = item->cd; } @@ -717,6 +722,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) new->ex_anon_uid = item->ex_anon_uid; new->ex_anon_gid = item->ex_anon_gid; new->ex_fsid = item->ex_fsid; + new->ex_devid_map = item->ex_devid_map; + item->ex_devid_map = NULL; new->ex_uuid = item->ex_uuid; item->ex_uuid = NULL; new->ex_fslocs.locations = item->ex_fslocs.locations; @@ -725,6 +732,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem) item->ex_fslocs.locations_count = 0; new->ex_fslocs.migrated = item->ex_fslocs.migrated; item->ex_fslocs.migrated = 0; + new->ex_layout_type = item->ex_layout_type; new->ex_nflavors = item->ex_nflavors; for (i = 0; i < MAX_SECINFO_LIST; i++) { new->ex_flavors[i] = item->ex_flavors[i]; diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 04dc8c167b0c..1f52bfcc436f 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -56,6 +56,8 @@ struct svc_export { struct nfsd4_fs_locations ex_fslocs; uint32_t ex_nflavors; struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; + enum pnfs_layouttype ex_layout_type; + struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; }; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c new file mode 100644 index 000000000000..8273270418b1 --- /dev/null +++ b/fs/nfsd/nfs4layouts.c @@ -0,0 +1,487 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. 
+ */ +#include +#include + +#include "pnfs.h" +#include "netns.h" + +#define NFSDDBG_FACILITY NFSDDBG_PNFS + +struct nfs4_layout { + struct list_head lo_perstate; + struct nfs4_layout_stateid *lo_state; + struct nfsd4_layout_seg lo_seg; +}; + +static struct kmem_cache *nfs4_layout_cache; +static struct kmem_cache *nfs4_layout_stateid_cache; + +const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { +}; + +/* pNFS device ID to export fsid mapping */ +#define DEVID_HASH_BITS 8 +#define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS) +#define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1) +static u64 nfsd_devid_seq = 1; +static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE]; +static DEFINE_SPINLOCK(nfsd_devid_lock); + +static inline u32 devid_hashfn(u64 idx) +{ + return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK; +} + +static void +nfsd4_alloc_devid_map(const struct svc_fh *fhp) +{ + const struct knfsd_fh *fh = &fhp->fh_handle; + size_t fsid_len = key_len(fh->fh_fsid_type); + struct nfsd4_deviceid_map *map, *old; + int i; + + map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL); + if (!map) + return; + + map->fsid_type = fh->fh_fsid_type; + memcpy(&map->fsid, fh->fh_fsid, fsid_len); + + spin_lock(&nfsd_devid_lock); + if (fhp->fh_export->ex_devid_map) + goto out_unlock; + + for (i = 0; i < DEVID_HASH_SIZE; i++) { + list_for_each_entry(old, &nfsd_devid_hash[i], hash) { + if (old->fsid_type != fh->fh_fsid_type) + continue; + if (memcmp(old->fsid, fh->fh_fsid, + key_len(old->fsid_type))) + continue; + + fhp->fh_export->ex_devid_map = old; + goto out_unlock; + } + } + + map->idx = nfsd_devid_seq++; + list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]); + fhp->fh_export->ex_devid_map = map; + map = NULL; + +out_unlock: + spin_unlock(&nfsd_devid_lock); + kfree(map); +} + +struct nfsd4_deviceid_map * +nfsd4_find_devid_map(int idx) +{ + struct nfsd4_deviceid_map *map, *ret = NULL; + + rcu_read_lock(); + list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash) + if (map->idx == idx) + ret = map; + rcu_read_unlock(); + + return ret; +} + +int +nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, + u32 device_generation) +{ + if (!fhp->fh_export->ex_devid_map) { + nfsd4_alloc_devid_map(fhp); + if (!fhp->fh_export->ex_devid_map) + return -ENOMEM; + } + + id->fsid_idx = fhp->fh_export->ex_devid_map->idx; + id->generation = device_generation; + id->pad = 0; + return 0; +} + +void nfsd4_setup_layout_type(struct svc_export *exp) +{ + if (exp->ex_flags & NFSEXP_NOPNFS) + return; +} + +static void +nfsd4_free_layout_stateid(struct nfs4_stid *stid) +{ + struct nfs4_layout_stateid *ls = layoutstateid(stid); + struct nfs4_client *clp = ls->ls_stid.sc_client; + struct nfs4_file *fp = ls->ls_stid.sc_file; + + spin_lock(&clp->cl_lock); + list_del_init(&ls->ls_perclnt); + spin_unlock(&clp->cl_lock); + + spin_lock(&fp->fi_lock); + list_del_init(&ls->ls_perfile); + spin_unlock(&fp->fi_lock); + + kmem_cache_free(nfs4_layout_stateid_cache, ls); +} + +static struct nfs4_layout_stateid * +nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, + struct nfs4_stid *parent, u32 layout_type) +{ + struct nfs4_client *clp = cstate->clp; + struct nfs4_file *fp = parent->sc_file; + struct nfs4_layout_stateid *ls; + struct nfs4_stid *stp; + + stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache); + if (!stp) + return NULL; + stp->sc_free = nfsd4_free_layout_stateid; + get_nfs4_file(fp); + stp->sc_file = fp; + + ls = layoutstateid(stp); + 
INIT_LIST_HEAD(&ls->ls_perclnt); + INIT_LIST_HEAD(&ls->ls_perfile); + spin_lock_init(&ls->ls_lock); + INIT_LIST_HEAD(&ls->ls_layouts); + ls->ls_layout_type = layout_type; + + spin_lock(&clp->cl_lock); + stp->sc_type = NFS4_LAYOUT_STID; + list_add(&ls->ls_perclnt, &clp->cl_lo_states); + spin_unlock(&clp->cl_lock); + + spin_lock(&fp->fi_lock); + list_add(&ls->ls_perfile, &fp->fi_lo_states); + spin_unlock(&fp->fi_lock); + + return ls; +} + +__be32 +nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + bool create, u32 layout_type, struct nfs4_layout_stateid **lsp) +{ + struct nfs4_layout_stateid *ls; + struct nfs4_stid *stid; + unsigned char typemask = NFS4_LAYOUT_STID; + __be32 status; + + if (create) + typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID); + + status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid, + net_generic(SVC_NET(rqstp), nfsd_net_id)); + if (status) + goto out; + + if (!fh_match(&cstate->current_fh.fh_handle, + &stid->sc_file->fi_fhandle)) { + status = nfserr_bad_stateid; + goto out_put_stid; + } + + if (stid->sc_type != NFS4_LAYOUT_STID) { + ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type); + nfs4_put_stid(stid); + + status = nfserr_jukebox; + if (!ls) + goto out; + } else { + ls = container_of(stid, struct nfs4_layout_stateid, ls_stid); + + status = nfserr_bad_stateid; + if (stateid->si_generation > stid->sc_stateid.si_generation) + goto out_put_stid; + if (layout_type != ls->ls_layout_type) + goto out_put_stid; + } + + *lsp = ls; + return 0; + +out_put_stid: + nfs4_put_stid(stid); +out: + return status; +} + +static inline u64 +layout_end(struct nfsd4_layout_seg *seg) +{ + u64 end = seg->offset + seg->length; + return end >= seg->offset ? 
end : NFS4_MAX_UINT64; +} + +static void +layout_update_len(struct nfsd4_layout_seg *lo, u64 end) +{ + if (end == NFS4_MAX_UINT64) + lo->length = NFS4_MAX_UINT64; + else + lo->length = end - lo->offset; +} + +static bool +layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s) +{ + if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode) + return false; + if (layout_end(&lo->lo_seg) <= s->offset) + return false; + if (layout_end(s) <= lo->lo_seg.offset) + return false; + return true; +} + +static bool +layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new) +{ + if (lo->iomode != new->iomode) + return false; + if (layout_end(new) < lo->offset) + return false; + if (layout_end(lo) < new->offset) + return false; + + lo->offset = min(lo->offset, new->offset); + layout_update_len(lo, max(layout_end(lo), layout_end(new))); + return true; +} + +__be32 +nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) +{ + struct nfsd4_layout_seg *seg = &lgp->lg_seg; + struct nfs4_layout *lp, *new = NULL; + + spin_lock(&ls->ls_lock); + list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { + if (layouts_try_merge(&lp->lo_seg, seg)) + goto done; + } + spin_unlock(&ls->ls_lock); + + new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL); + if (!new) + return nfserr_jukebox; + memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg)); + new->lo_state = ls; + + spin_lock(&ls->ls_lock); + list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) { + if (layouts_try_merge(&lp->lo_seg, seg)) + goto done; + } + + atomic_inc(&ls->ls_stid.sc_count); + list_add_tail(&new->lo_perstate, &ls->ls_layouts); + new = NULL; +done: + update_stateid(&ls->ls_stid.sc_stateid); + memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t)); + spin_unlock(&ls->ls_lock); + if (new) + kmem_cache_free(nfs4_layout_cache, new); + return nfs_ok; +} + +static void +nfsd4_free_layouts(struct list_head *reaplist) +{ + while (!list_empty(reaplist)) { + struct nfs4_layout *lp = list_first_entry(reaplist, + struct nfs4_layout, lo_perstate); + + list_del(&lp->lo_perstate); + nfs4_put_stid(&lp->lo_state->ls_stid); + kmem_cache_free(nfs4_layout_cache, lp); + } +} + +static void +nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg, + struct list_head *reaplist) +{ + struct nfsd4_layout_seg *lo = &lp->lo_seg; + u64 end = layout_end(lo); + + if (seg->offset <= lo->offset) { + if (layout_end(seg) >= end) { + list_move_tail(&lp->lo_perstate, reaplist); + return; + } + end = seg->offset; + } else { + /* retain the whole layout segment on a split. 
*/ + if (layout_end(seg) < end) { + dprintk("%s: split not supported\n", __func__); + return; + } + + lo->offset = layout_end(seg); + } + + layout_update_len(lo, end); +} + +__be32 +nfsd4_return_file_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct nfs4_layout_stateid *ls; + struct nfs4_layout *lp, *n; + LIST_HEAD(reaplist); + __be32 nfserr; + int found = 0; + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid, + false, lrp->lr_layout_type, + &ls); + if (nfserr) + return nfserr; + + spin_lock(&ls->ls_lock); + list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) { + if (layouts_overlapping(lp, &lrp->lr_seg)) { + nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist); + found++; + } + } + if (!list_empty(&ls->ls_layouts)) { + if (found) { + update_stateid(&ls->ls_stid.sc_stateid); + memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid, + sizeof(stateid_t)); + } + lrp->lrs_present = 1; + } else { + nfs4_unhash_stid(&ls->ls_stid); + lrp->lrs_present = 0; + } + spin_unlock(&ls->ls_lock); + + nfs4_put_stid(&ls->ls_stid); + nfsd4_free_layouts(&reaplist); + return nfs_ok; +} + +__be32 +nfsd4_return_client_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct nfs4_layout_stateid *ls, *n; + struct nfs4_client *clp = cstate->clp; + struct nfs4_layout *lp, *t; + LIST_HEAD(reaplist); + + lrp->lrs_present = 0; + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) { + if (lrp->lr_return_type == RETURN_FSID && + !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle, + &cstate->current_fh.fh_handle)) + continue; + + spin_lock(&ls->ls_lock); + list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) { + if (lrp->lr_seg.iomode == IOMODE_ANY || + lrp->lr_seg.iomode == lp->lo_seg.iomode) + list_move_tail(&lp->lo_perstate, &reaplist); + } + spin_unlock(&ls->ls_lock); + } + spin_unlock(&clp->cl_lock); + + nfsd4_free_layouts(&reaplist); + return 0; +} + +static void +nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls, + struct list_head *reaplist) +{ + spin_lock(&ls->ls_lock); + list_splice_init(&ls->ls_layouts, reaplist); + spin_unlock(&ls->ls_lock); +} + +void +nfsd4_return_all_client_layouts(struct nfs4_client *clp) +{ + struct nfs4_layout_stateid *ls, *n; + LIST_HEAD(reaplist); + + spin_lock(&clp->cl_lock); + list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) + nfsd4_return_all_layouts(ls, &reaplist); + spin_unlock(&clp->cl_lock); + + nfsd4_free_layouts(&reaplist); +} + +void +nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) +{ + struct nfs4_layout_stateid *ls, *n; + LIST_HEAD(reaplist); + + spin_lock(&fp->fi_lock); + list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) { + if (ls->ls_stid.sc_client == clp) + nfsd4_return_all_layouts(ls, &reaplist); + } + spin_unlock(&fp->fi_lock); + + nfsd4_free_layouts(&reaplist); +} + +int +nfsd4_init_pnfs(void) +{ + int i; + + for (i = 0; i < DEVID_HASH_SIZE; i++) + INIT_LIST_HEAD(&nfsd_devid_hash[i]); + + nfs4_layout_cache = kmem_cache_create("nfs4_layout", + sizeof(struct nfs4_layout), 0, 0, NULL); + if (!nfs4_layout_cache) + return -ENOMEM; + + nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid", + sizeof(struct nfs4_layout_stateid), 0, 0, NULL); + if (!nfs4_layout_stateid_cache) { + kmem_cache_destroy(nfs4_layout_cache); + return -ENOMEM; + } + return 0; +} + +void +nfsd4_exit_pnfs(void) +{ + int i; + + 
kmem_cache_destroy(nfs4_layout_cache); + kmem_cache_destroy(nfs4_layout_stateid_cache); + + for (i = 0; i < DEVID_HASH_SIZE; i++) { + struct nfsd4_deviceid_map *map, *n; + + list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash) + kfree(map); + } +} diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ac71d13c69ef..2b91443497cc 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -43,6 +43,7 @@ #include "current_stateid.h" #include "netns.h" #include "acl.h" +#include "pnfs.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include @@ -1178,6 +1179,252 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status == nfserr_same ? nfs_ok : status; } +#ifdef CONFIG_NFSD_PNFS +static const struct nfsd4_layout_ops * +nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) +{ + if (!exp->ex_layout_type) { + dprintk("%s: export does not support pNFS\n", __func__); + return NULL; + } + + if (exp->ex_layout_type != layout_type) { + dprintk("%s: layout type %d not supported\n", + __func__, layout_type); + return NULL; + } + + return nfsd4_layout_ops[layout_type]; +} + +static __be32 +nfsd4_getdeviceinfo(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_getdeviceinfo *gdp) +{ + const struct nfsd4_layout_ops *ops; + struct nfsd4_deviceid_map *map; + struct svc_export *exp; + __be32 nfserr; + + dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n", + __func__, + gdp->gd_layout_type, + gdp->gd_devid.fsid_idx, gdp->gd_devid.generation, + gdp->gd_maxcount); + + map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx); + if (!map) { + dprintk("%s: couldn't find device ID to export mapping!\n", + __func__); + return nfserr_noent; + } + + exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid); + if (IS_ERR(exp)) { + dprintk("%s: could not find device id\n", __func__); + return nfserr_noent; + } + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(exp, gdp->gd_layout_type); + if (!ops) + goto out; + + nfserr = nfs_ok; + if (gdp->gd_maxcount != 0) + nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp); + + gdp->gd_notify_types &= ops->notify_types; + exp_put(exp); +out: + return nfserr; +} + +static __be32 +nfsd4_layoutget(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutget *lgp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + const struct nfsd4_layout_ops *ops; + struct nfs4_layout_stateid *ls; + __be32 nfserr; + int accmode; + + switch (lgp->lg_seg.iomode) { + case IOMODE_READ: + accmode = NFSD_MAY_READ; + break; + case IOMODE_RW: + accmode = NFSD_MAY_READ | NFSD_MAY_WRITE; + break; + default: + dprintk("%s: invalid iomode %d\n", + __func__, lgp->lg_seg.iomode); + nfserr = nfserr_badiomode; + goto out; + } + + nfserr = fh_verify(rqstp, current_fh, 0, accmode); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type); + if (!ops) + goto out; + + /* + * Verify minlength and range as per RFC5661: + * o If loga_length is less than loga_minlength, + * the metadata server MUST return NFS4ERR_INVAL. + * o If the sum of loga_offset and loga_minlength exceeds + * NFS4_UINT64_MAX, and loga_minlength is not + * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result. + * o If the sum of loga_offset and loga_length exceeds + * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX, + * the error NFS4ERR_INVAL MUST result. 
+ */ + nfserr = nfserr_inval; + if (lgp->lg_seg.length < lgp->lg_minlength || + (lgp->lg_minlength != NFS4_MAX_UINT64 && + lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) || + (lgp->lg_seg.length != NFS4_MAX_UINT64 && + lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset)) + goto out; + if (lgp->lg_seg.length == 0) + goto out; + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid, + true, lgp->lg_layout_type, &ls); + if (nfserr) + goto out; + + nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode, + current_fh, lgp); + if (nfserr) + goto out_put_stid; + + nfserr = nfsd4_insert_layout(lgp, ls); + +out_put_stid: + nfs4_put_stid(&ls->ls_stid); +out: + return nfserr; +} + +static __be32 +nfsd4_layoutcommit(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutcommit *lcp) +{ + const struct nfsd4_layout_seg *seg = &lcp->lc_seg; + struct svc_fh *current_fh = &cstate->current_fh; + const struct nfsd4_layout_ops *ops; + loff_t new_size = lcp->lc_last_wr + 1; + struct inode *inode; + struct nfs4_layout_stateid *ls; + __be32 nfserr; + + nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type); + if (!ops) + goto out; + inode = current_fh->fh_dentry->d_inode; + + nfserr = nfserr_inval; + if (new_size <= seg->offset) { + dprintk("pnfsd: last write before layout segment\n"); + goto out; + } + if (new_size > seg->offset + seg->length) { + dprintk("pnfsd: last write beyond layout segment\n"); + goto out; + } + if (!lcp->lc_newoffset && new_size > i_size_read(inode)) { + dprintk("pnfsd: layoutcommit beyond EOF\n"); + goto out; + } + + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid, + false, lcp->lc_layout_type, + &ls); + if (nfserr) { + /* fixup error code as per RFC5661 */ + if (nfserr == nfserr_bad_stateid) + nfserr = nfserr_badlayout; + goto out; + } + + nfserr = ops->proc_layoutcommit(inode, lcp); + if (nfserr) + goto out_put_stid; + + if (new_size > i_size_read(inode)) { + lcp->lc_size_chg = 1; + lcp->lc_newsize = new_size; + } else { + lcp->lc_size_chg = 0; + } + +out_put_stid: + nfs4_put_stid(&ls->ls_stid); +out: + return nfserr; +} + +static __be32 +nfsd4_layoutreturn(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp) +{ + struct svc_fh *current_fh = &cstate->current_fh; + __be32 nfserr; + + nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP); + if (nfserr) + goto out; + + nfserr = nfserr_layoutunavailable; + if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type)) + goto out; + + switch (lrp->lr_seg.iomode) { + case IOMODE_READ: + case IOMODE_RW: + case IOMODE_ANY: + break; + default: + dprintk("%s: invalid iomode %d\n", __func__, + lrp->lr_seg.iomode); + nfserr = nfserr_inval; + goto out; + } + + switch (lrp->lr_return_type) { + case RETURN_FILE: + nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp); + break; + case RETURN_FSID: + case RETURN_ALL: + nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp); + break; + default: + dprintk("%s: invalid return_type %d\n", __func__, + lrp->lr_return_type); + nfserr = nfserr_inval; + break; + } +out: + return nfserr; +} +#endif /* CONFIG_NFSD_PNFS */ + /* * NULL call. 
*/ @@ -1679,6 +1926,36 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd op_encode_channel_attrs_maxsz) * sizeof(__be32); } +#ifdef CONFIG_NFSD_PNFS +/* + * At this stage we don't really know what layout driver will handle the request, + * so we need to define an arbitrary upper bound here. + */ +#define MAX_LAYOUT_SIZE 128 +static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* logr_return_on_close */ + + op_encode_stateid_maxsz + + 1 /* nr of layouts */ + + MAX_LAYOUT_SIZE) * sizeof(__be32); +} + +static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* locr_newsize */ + + 2 /* ns_size */) * sizeof(__be32); +} + +static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) +{ + return (op_encode_hdr_size + + 1 /* lrs_stateid */ + + op_encode_stateid_maxsz) * sizeof(__be32); +} +#endif /* CONFIG_NFSD_PNFS */ + static struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = (nfsd4op_func)nfsd4_access, @@ -1966,6 +2243,31 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = { + .op_func = (nfsd4op_func)nfsd4_getdeviceinfo, + .op_flags = ALLOWED_WITHOUT_FH, + .op_name = "OP_GETDEVICEINFO", + }, + [OP_LAYOUTGET] = { + .op_func = (nfsd4op_func)nfsd4_layoutget, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTGET", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize, + }, + [OP_LAYOUTCOMMIT] = { + .op_func = (nfsd4op_func)nfsd4_layoutcommit, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTCOMMIT", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize, + }, + [OP_LAYOUTRETURN] = { + .op_func = (nfsd4op_func)nfsd4_layoutreturn, + .op_flags = OP_MODIFIES_SOMETHING, + .op_name = "OP_LAYOUTRETURN", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize, + }, +#endif /* CONFIG_NFSD_PNFS */ /* NFSv4.2 operations */ [OP_ALLOCATE] = { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index eefd29ec43f2..c89f79dc69e2 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -48,6 +48,7 @@ #include "current_stateid.h" #include "netns.h" +#include "pnfs.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -1539,6 +1540,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) INIT_LIST_HEAD(&clp->cl_lru); INIT_LIST_HEAD(&clp->cl_callbacks); INIT_LIST_HEAD(&clp->cl_revoked); +#ifdef CONFIG_NFSD_PNFS + INIT_LIST_HEAD(&clp->cl_lo_states); +#endif spin_lock_init(&clp->cl_lock); rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; @@ -1643,6 +1647,7 @@ __destroy_client(struct nfs4_client *clp) nfs4_get_stateowner(&oo->oo_owner); release_openowner(oo); } + nfsd4_return_all_client_layouts(clp); nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); @@ -2126,8 +2131,11 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp, static void nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) { - /* pNFS is not supported */ +#ifdef CONFIG_NFSD_PNFS + new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS; +#else new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; +#endif /* Referrals are supported, Migration is not. 
*/ new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; @@ -3055,6 +3063,9 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, fp->fi_share_deny = 0; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); +#ifdef CONFIG_NFSD_PNFS + INIT_LIST_HEAD(&fp->fi_lo_states); +#endif hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); } @@ -4841,6 +4852,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + nfsd4_return_all_file_layouts(stp->st_stateowner->so_client, + stp->st_stid.sc_file); + nfsd4_close_open_stateid(stp); /* put reference from nfs4_preprocess_seqid_op */ diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 974533e5a427..df5e66caf100 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -47,6 +47,7 @@ #include "state.h" #include "cache.h" #include "netns.h" +#include "pnfs.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include @@ -1522,6 +1523,127 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str DECODE_TAIL; } +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, + struct nfsd4_getdeviceinfo *gdev) +{ + DECODE_HEAD; + u32 num, i; + + READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4); + COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid)); + gdev->gd_layout_type = be32_to_cpup(p++); + gdev->gd_maxcount = be32_to_cpup(p++); + num = be32_to_cpup(p++); + if (num) { + READ_BUF(4 * num); + gdev->gd_notify_types = be32_to_cpup(p++); + for (i = 1; i < num; i++) { + if (be32_to_cpup(p++)) { + status = nfserr_inval; + goto out; + } + } + } + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutget *lgp) +{ + DECODE_HEAD; + + READ_BUF(36); + lgp->lg_signal = be32_to_cpup(p++); + lgp->lg_layout_type = be32_to_cpup(p++); + lgp->lg_seg.iomode = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &lgp->lg_seg.offset); + p = xdr_decode_hyper(p, &lgp->lg_seg.length); + p = xdr_decode_hyper(p, &lgp->lg_minlength); + nfsd4_decode_stateid(argp, &lgp->lg_sid); + READ_BUF(4); + lgp->lg_maxcount = be32_to_cpup(p++); + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutcommit *lcp) +{ + DECODE_HEAD; + u32 timechange; + + READ_BUF(20); + p = xdr_decode_hyper(p, &lcp->lc_seg.offset); + p = xdr_decode_hyper(p, &lcp->lc_seg.length); + lcp->lc_reclaim = be32_to_cpup(p++); + nfsd4_decode_stateid(argp, &lcp->lc_sid); + READ_BUF(4); + lcp->lc_newoffset = be32_to_cpup(p++); + if (lcp->lc_newoffset) { + READ_BUF(8); + p = xdr_decode_hyper(p, &lcp->lc_last_wr); + } else + lcp->lc_last_wr = 0; + READ_BUF(4); + timechange = be32_to_cpup(p++); + if (timechange) { + status = nfsd4_decode_time(argp, &lcp->lc_mtime); + if (status) + return status; + } else { + lcp->lc_mtime.tv_nsec = UTIME_NOW; + } + READ_BUF(8); + lcp->lc_layout_type = be32_to_cpup(p++); + + /* + * Save the layout update in XDR format and let the layout driver deal + * with it later. 
+ */ + lcp->lc_up_len = be32_to_cpup(p++); + if (lcp->lc_up_len > 0) { + READ_BUF(lcp->lc_up_len); + READMEM(lcp->lc_up_layout, lcp->lc_up_len); + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp, + struct nfsd4_layoutreturn *lrp) +{ + DECODE_HEAD; + + READ_BUF(16); + lrp->lr_reclaim = be32_to_cpup(p++); + lrp->lr_layout_type = be32_to_cpup(p++); + lrp->lr_seg.iomode = be32_to_cpup(p++); + lrp->lr_return_type = be32_to_cpup(p++); + if (lrp->lr_return_type == RETURN_FILE) { + READ_BUF(16); + p = xdr_decode_hyper(p, &lrp->lr_seg.offset); + p = xdr_decode_hyper(p, &lrp->lr_seg.length); + nfsd4_decode_stateid(argp, &lrp->lr_sid); + READ_BUF(4); + lrp->lrf_body_len = be32_to_cpup(p++); + if (lrp->lrf_body_len > 0) { + READ_BUF(lrp->lrf_body_len); + READMEM(lrp->lrf_body, lrp->lrf_body_len); + } + } else { + lrp->lr_seg.offset = 0; + lrp->lr_seg.length = NFS4_MAX_UINT64; + } + + DECODE_TAIL; +} +#endif /* CONFIG_NFSD_PNFS */ + static __be32 nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, struct nfsd4_fallocate *fallocate) @@ -1616,11 +1738,19 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo, + [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit, + [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget, + [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn, +#else [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, +#endif [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, @@ -2548,6 +2678,30 @@ out_acl: get_parent_attributes(exp, &stat); p = xdr_encode_hyper(p, stat.ino); } +#ifdef CONFIG_NFSD_PNFS + if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) || + (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) { + if (exp->ex_layout_type) { + p = xdr_reserve_space(xdr, 8); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(exp->ex_layout_type); + } else { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(0); + } + } + + if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + *p++ = cpu_to_be32(stat.blksize); + } +#endif /* CONFIG_NFSD_PNFS */ if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { status = nfsd4_encode_security_label(xdr, rqstp, context, contextlen); @@ -3824,6 +3978,156 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, return nfserr; } +#ifdef CONFIG_NFSD_PNFS +static __be32 +nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_getdeviceinfo *gdev) +{ + struct xdr_stream *xdr = &resp->xdr; + const struct nfsd4_layout_ops *ops = + nfsd4_layout_ops[gdev->gd_layout_type]; + u32 starting_len = xdr->buf->len, needed_len; + __be32 *p; + + dprintk("%s: err %d\n", __func__, nfserr); + if (nfserr) + goto out; + + nfserr = nfserr_resource; + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + + *p++ = cpu_to_be32(gdev->gd_layout_type); + + /* If maxcount is 0 
then just update notifications */ + if (gdev->gd_maxcount != 0) { + nfserr = ops->encode_getdeviceinfo(xdr, gdev); + if (nfserr) { + /* + * We don't bother to burden the layout drivers with + * enforcing gd_maxcount, just tell the client to + * come back with a bigger buffer if it's not enough. + */ + if (xdr->buf->len + 4 > gdev->gd_maxcount) + goto toosmall; + goto out; + } + } + + nfserr = nfserr_resource; + if (gdev->gd_notify_types) { + p = xdr_reserve_space(xdr, 4 + 4); + if (!p) + goto out; + *p++ = cpu_to_be32(1); /* bitmap length */ + *p++ = cpu_to_be32(gdev->gd_notify_types); + } else { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out; + *p++ = 0; + } + + nfserr = 0; +out: + kfree(gdev->gd_device); + dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr)); + return nfserr; + +toosmall: + dprintk("%s: maxcount too small\n", __func__); + needed_len = xdr->buf->len + 4 /* notifications */; + xdr_truncate_encode(xdr, starting_len); + p = xdr_reserve_space(xdr, 4); + if (!p) { + nfserr = nfserr_resource; + } else { + *p++ = cpu_to_be32(needed_len); + nfserr = nfserr_toosmall; + } + goto out; +} + +static __be32 +nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutget *lgp) +{ + struct xdr_stream *xdr = &resp->xdr; + const struct nfsd4_layout_ops *ops = + nfsd4_layout_ops[lgp->lg_layout_type]; + __be32 *p; + + dprintk("%s: err %d\n", __func__, nfserr); + if (nfserr) + goto out; + + nfserr = nfserr_resource; + p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t)); + if (!p) + goto out; + + *p++ = cpu_to_be32(1); /* we always set return-on-close */ + *p++ = cpu_to_be32(lgp->lg_sid.si_generation); + p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque, + sizeof(stateid_opaque_t)); + + *p++ = cpu_to_be32(1); /* we always return a single layout */ + p = xdr_encode_hyper(p, lgp->lg_seg.offset); + p = xdr_encode_hyper(p, lgp->lg_seg.length); + *p++ = cpu_to_be32(lgp->lg_seg.iomode); + *p++ = cpu_to_be32(lgp->lg_layout_type); + + nfserr = ops->encode_layoutget(xdr, lgp); +out: + kfree(lgp->lg_content); + return nfserr; +} + +static __be32 +nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutcommit *lcp) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(lcp->lc_size_chg); + if (lcp->lc_size_chg) { + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + p = xdr_encode_hyper(p, lcp->lc_newsize); + } + + return nfs_ok; +} + +static __be32 +nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_layoutreturn *lrp) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(lrp->lrs_present); + if (lrp->lrs_present) + nfsd4_encode_stateid(xdr, &lrp->lr_sid); + return nfs_ok; +} +#endif /* CONFIG_NFSD_PNFS */ + static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_seek *seek) @@ -3900,11 +4204,19 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, +#ifdef CONFIG_NFSD_PNFS + [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo, + [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTCOMMIT] = 
(nfsd4_enc)nfsd4_encode_layoutcommit, + [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget, + [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn, +#else [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, +#endif [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 19ace74d35f6..aa47d75ddb26 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -21,6 +21,7 @@ #include "cache.h" #include "state.h" #include "netns.h" +#include "pnfs.h" /* * We have a single directory with several nodes in it. @@ -1258,9 +1259,12 @@ static int __init init_nfsd(void) retval = nfsd4_init_slabs(); if (retval) goto out_unregister_pernet; - retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ + retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; + retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ + if (retval) + goto out_exit_pnfs; nfsd_stat_init(); /* Statistics */ retval = nfsd_reply_cache_init(); if (retval) @@ -1282,6 +1286,8 @@ out_free_lockd: out_free_stat: nfsd_stat_shutdown(); nfsd_fault_inject_cleanup(); +out_exit_pnfs: + nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); out_unregister_pernet: @@ -1299,6 +1305,7 @@ static void __exit exit_nfsd(void) nfsd_stat_shutdown(); nfsd_lockd_shutdown(); nfsd4_free_slabs(); + nfsd4_exit_pnfs(); nfsd_fault_inject_cleanup(); unregister_filesystem(&nfsd_fs_type); unregister_pernet_subsys(&nfsd_net_ops); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 33a46a8dfaf7..565c4da1a9eb 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -325,15 +325,27 @@ void nfsd_lockd_shutdown(void); #define NFSD4_SUPPORTED_ATTRS_WORD2 0 +/* 4.1 */ +#ifdef CONFIG_NFSD_PNFS +#define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES +#define PNFSD_SUPPORTED_ATTRS_WORD2 \ +(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES) +#else +#define PNFSD_SUPPORTED_ATTRS_WORD1 0 +#define PNFSD_SUPPORTED_ATTRS_WORD2 0 +#endif /* CONFIG_NFSD_PNFS */ + #define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ NFSD4_SUPPORTED_ATTRS_WORD0 #define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ - NFSD4_SUPPORTED_ATTRS_WORD1 + (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1) #define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ - (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) + (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \ + FATTR4_WORD2_SUPPATTR_EXCLCREAT) +/* 4.2 */ #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL #else diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h new file mode 100644 index 000000000000..a9616a4e13cd --- /dev/null +++ b/fs/nfsd/pnfs.h @@ -0,0 +1,80 @@ +#ifndef _FS_NFSD_PNFS_H +#define _FS_NFSD_PNFS_H 1 + +#include +#include + +#include "state.h" +#include "xdr4.h" + +struct xdr_stream; + +struct nfsd4_deviceid_map { + struct list_head hash; + u64 idx; + int fsid_type; + u32 fsid[]; +}; + +struct nfsd4_layout_ops { + u32 notify_types; + + __be32 (*proc_getdeviceinfo)(struct super_block *sb, + struct nfsd4_getdeviceinfo *gdevp); + __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, + struct nfsd4_getdeviceinfo *gdevp); + + __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp, + struct nfsd4_layoutget *lgp); + 
__be32 (*encode_layoutget)(struct xdr_stream *, + struct nfsd4_layoutget *lgp); + + __be32 (*proc_layoutcommit)(struct inode *inode, + struct nfsd4_layoutcommit *lcp); +}; + +extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; + +__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + bool create, u32 layout_type, struct nfs4_layout_stateid **lsp); +__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp, + struct nfs4_layout_stateid *ls); +__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp); +__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, + struct nfsd4_layoutreturn *lrp); +int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, + u32 device_generation); +struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx); + +#ifdef CONFIG_NFSD_PNFS +void nfsd4_setup_layout_type(struct svc_export *exp); +void nfsd4_return_all_client_layouts(struct nfs4_client *); +void nfsd4_return_all_file_layouts(struct nfs4_client *clp, + struct nfs4_file *fp); +int nfsd4_init_pnfs(void); +void nfsd4_exit_pnfs(void); +#else +static inline void nfsd4_setup_layout_type(struct svc_export *exp) +{ +} + +static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp) +{ +} +static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp, + struct nfs4_file *fp) +{ +} +static inline void nfsd4_exit_pnfs(void) +{ +} +static inline int nfsd4_init_pnfs(void) +{ + return 0; +} +#endif /* CONFIG_NFSD_PNFS */ +#endif /* _FS_NFSD_PNFS_H */ diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 38ebb1268b59..5f66b7fd0297 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -92,6 +92,7 @@ struct nfs4_stid { /* For a deleg stateid kept around only to process free_stateid's: */ #define NFS4_REVOKED_DELEG_STID 16 #define NFS4_CLOSED_DELEG_STID 32 +#define NFS4_LAYOUT_STID 64 unsigned char sc_type; stateid_t sc_stateid; struct nfs4_client *sc_client; @@ -297,6 +298,9 @@ struct nfs4_client { struct list_head cl_delegations; struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */ struct list_head cl_lru; /* tail queue */ +#ifdef CONFIG_NFSD_PNFS + struct list_head cl_lo_states; /* outstanding layout states */ +#endif struct xdr_netobj cl_name; /* id generated by client */ nfs4_verifier cl_verifier; /* generated by client */ time_t cl_time; /* time of last lease renewal */ @@ -496,6 +500,9 @@ struct nfs4_file { int fi_delegees; struct knfsd_fh fi_fhandle; bool fi_had_conflict; +#ifdef CONFIG_NFSD_PNFS + struct list_head fi_lo_states; +#endif }; /* @@ -528,6 +535,20 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) return container_of(s, struct nfs4_ol_stateid, st_stid); } +struct nfs4_layout_stateid { + struct nfs4_stid ls_stid; + struct list_head ls_perclnt; + struct list_head ls_perfile; + spinlock_t ls_lock; + struct list_head ls_layouts; + u32 ls_layout_type; +}; + +static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s) +{ + return container_of(s, struct nfs4_layout_stateid, ls_stid); +} + /* flags for preprocess_seqid_op() */ #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 90a5925bd6ab..0bda93e58e1b 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -428,6 +428,61 @@ struct nfsd4_reclaim_complete { u32 rca_one_fs; }; +struct nfsd4_deviceid { + u64 fsid_idx; + u32 
generation; + u32 pad; +}; + +struct nfsd4_layout_seg { + u32 iomode; + u64 offset; + u64 length; +}; + +struct nfsd4_getdeviceinfo { + struct nfsd4_deviceid gd_devid; /* request */ + u32 gd_layout_type; /* request */ + u32 gd_maxcount; /* request */ + u32 gd_notify_types;/* request - response */ + void *gd_device; /* response */ +}; + +struct nfsd4_layoutget { + u64 lg_minlength; /* request */ + u32 lg_signal; /* request */ + u32 lg_layout_type; /* request */ + u32 lg_maxcount; /* request */ + stateid_t lg_sid; /* request/response */ + struct nfsd4_layout_seg lg_seg; /* request/response */ + void *lg_content; /* response */ +}; + +struct nfsd4_layoutcommit { + stateid_t lc_sid; /* request */ + struct nfsd4_layout_seg lc_seg; /* request */ + u32 lc_reclaim; /* request */ + u32 lc_newoffset; /* request */ + u64 lc_last_wr; /* request */ + struct timespec lc_mtime; /* request */ + u32 lc_layout_type; /* request */ + u32 lc_up_len; /* layout length */ + void *lc_up_layout; /* decoded by callback */ + u32 lc_size_chg; /* boolean for response */ + u64 lc_newsize; /* response */ +}; + +struct nfsd4_layoutreturn { + u32 lr_return_type; /* request */ + u32 lr_layout_type; /* request */ + struct nfsd4_layout_seg lr_seg; /* request */ + u32 lr_reclaim; /* request */ + u32 lrf_body_len; /* request */ + void *lrf_body; /* request */ + stateid_t lr_sid; /* request/response */ + u32 lrs_present; /* response */ +}; + struct nfsd4_fallocate { /* request */ stateid_t falloc_stateid; @@ -491,6 +546,10 @@ struct nfsd4_op { struct nfsd4_reclaim_complete reclaim_complete; struct nfsd4_test_stateid test_stateid; struct nfsd4_free_stateid free_stateid; + struct nfsd4_getdeviceinfo getdeviceinfo; + struct nfsd4_layoutget layoutget; + struct nfsd4_layoutcommit layoutcommit; + struct nfsd4_layoutreturn layoutreturn; /* NFSv4.2 */ struct nfsd4_fallocate allocate; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 8a3589c2542c..bc10d687f2ce 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -411,6 +411,7 @@ enum lock_type4 { #define FATTR4_WORD1_TIME_MODIFY_SET (1UL << 22) #define FATTR4_WORD1_MOUNTED_ON_FILEID (1UL << 23) #define FATTR4_WORD1_FS_LAYOUT_TYPES (1UL << 30) +#define FATTR4_WORD2_LAYOUT_TYPES (1UL << 0) #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1) #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) diff --git a/include/uapi/linux/nfsd/debug.h b/include/uapi/linux/nfsd/debug.h index 1fdc95bb2375..0bf130a1c58d 100644 --- a/include/uapi/linux/nfsd/debug.h +++ b/include/uapi/linux/nfsd/debug.h @@ -32,6 +32,7 @@ #define NFSDDBG_REPCACHE 0x0080 #define NFSDDBG_XDR 0x0100 #define NFSDDBG_LOCKD 0x0200 +#define NFSDDBG_PNFS 0x0400 #define NFSDDBG_ALL 0x7FFF #define NFSDDBG_NOCHANGE 0xFFFF diff --git a/include/uapi/linux/nfsd/export.h b/include/uapi/linux/nfsd/export.h index 584b6ef3a5e8..4742f2cb42f2 100644 --- a/include/uapi/linux/nfsd/export.h +++ b/include/uapi/linux/nfsd/export.h @@ -47,8 +47,10 @@ * exported filesystem. */ #define NFSEXP_V4ROOT 0x10000 +#define NFSEXP_NOPNFS 0x20000 + /* All flags that we claim to support. (Note we don't support NOACL.) 
*/ -#define NFSEXP_ALLFLAGS 0x1FE7F +#define NFSEXP_ALLFLAGS 0x3FE7F /* The flags that may vary depending on security flavor: */ #define NFSEXP_SECINFO_FLAGS (NFSEXP_READONLY | NFSEXP_ROOTSQUASH \ -- cgit v1.2.3 From 366e41d9774d7010cb63112b6db2fce6dc7809c0 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 31 Jan 2015 10:40:13 -0500 Subject: ipv6: pull cork initialization into its own function. Pull IPv6 cork initialization into its own function that can be re-used. IPv6-specific cork data did not have an explicit data structure. This patch creates one so that just the IPv6 cork data can be passed as arguments. Also, since IPv6 tries to save the flow label into the inet_cork_full structure, pass the full cork. Adjust ip6_cork_release() to take cork data structures. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- include/linux/ipv6.h | 12 ++-- net/ipv6/ip6_output.c | 158 ++++++++++++++++++++++++++++---------------------- 2 files changed, 96 insertions(+), 74 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 2805062c013f..4d5169f5d7d1 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -125,6 +125,12 @@ struct ipv6_mc_socklist; struct ipv6_ac_socklist; struct ipv6_fl_socklist; +struct inet6_cork { + struct ipv6_txoptions *opt; + u8 hop_limit; + u8 tclass; +}; + /** * struct ipv6_pinfo - ipv6 private area * @@ -217,11 +223,7 @@ struct ipv6_pinfo { struct ipv6_txoptions *opt; struct sk_buff *pktoptions; struct sk_buff *rxpmtu; - struct { - struct ipv6_txoptions *opt; - u8 hop_limit; - u8 tclass; - } cork; + struct inet6_cork cork; }; /* WARNING: don't change the layout of the members in {raw,udp,tcp}6_sock! */ diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ce69a12ae48c..f9f08c43c6e1 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1135,6 +1135,74 @@ static void ip6_append_data_mtu(unsigned int *mtu, } } +static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, + struct inet6_cork *v6_cork, + int hlimit, int tclass, struct ipv6_txoptions *opt, + struct rt6_info *rt, struct flowi6 *fl6) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + unsigned int mtu; + + /* + * setup for corking + */ + if (opt) { + if (WARN_ON(v6_cork->opt)) + return -EINVAL; + + v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation); + if (unlikely(v6_cork->opt == NULL)) + return -ENOBUFS; + + v6_cork->opt->tot_len = opt->tot_len; + v6_cork->opt->opt_flen = opt->opt_flen; + v6_cork->opt->opt_nflen = opt->opt_nflen; + + v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt, + sk->sk_allocation); + if (opt->dst0opt && !v6_cork->opt->dst0opt) + return -ENOBUFS; + + v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt, + sk->sk_allocation); + if (opt->dst1opt && !v6_cork->opt->dst1opt) + return -ENOBUFS; + + v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt, + sk->sk_allocation); + if (opt->hopopt && !v6_cork->opt->hopopt) + return -ENOBUFS; + + v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt, + sk->sk_allocation); + if (opt->srcrt && !v6_cork->opt->srcrt) + return -ENOBUFS; + + /* need source address above miyazawa*/ + } + dst_hold(&rt->dst); + cork->base.dst = &rt->dst; + cork->fl.u.ip6 = *fl6; + v6_cork->hop_limit = hlimit; + v6_cork->tclass = tclass; + if (rt->dst.flags & DST_XFRM_TUNNEL) + mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? + rt->dst.dev->mtu : dst_mtu(&rt->dst); + else + mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
+ rt->dst.dev->mtu : dst_mtu(rt->dst.path); + if (np->frag_size < mtu) { + if (np->frag_size) + mtu = np->frag_size; + } + cork->base.fragsize = mtu; + if (dst_allfrag(rt->dst.path)) + cork->base.flags |= IPCORK_ALLFRAG; + cork->base.length = 0; + + return 0; +} + int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, @@ -1162,59 +1230,10 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, /* * setup for corking */ - if (opt) { - if (WARN_ON(np->cork.opt)) - return -EINVAL; - - np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation); - if (unlikely(np->cork.opt == NULL)) - return -ENOBUFS; - - np->cork.opt->tot_len = opt->tot_len; - np->cork.opt->opt_flen = opt->opt_flen; - np->cork.opt->opt_nflen = opt->opt_nflen; - - np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt, - sk->sk_allocation); - if (opt->dst0opt && !np->cork.opt->dst0opt) - return -ENOBUFS; - - np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt, - sk->sk_allocation); - if (opt->dst1opt && !np->cork.opt->dst1opt) - return -ENOBUFS; - - np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt, - sk->sk_allocation); - if (opt->hopopt && !np->cork.opt->hopopt) - return -ENOBUFS; - - np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt, - sk->sk_allocation); - if (opt->srcrt && !np->cork.opt->srcrt) - return -ENOBUFS; - - /* need source address above miyazawa*/ - } - dst_hold(&rt->dst); - cork->dst = &rt->dst; - inet->cork.fl.u.ip6 = *fl6; - np->cork.hop_limit = hlimit; - np->cork.tclass = tclass; - if (rt->dst.flags & DST_XFRM_TUNNEL) - mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(&rt->dst); - else - mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? - rt->dst.dev->mtu : dst_mtu(rt->dst.path); - if (np->frag_size < mtu) { - if (np->frag_size) - mtu = np->frag_size; - } - cork->fragsize = mtu; - if (dst_allfrag(rt->dst.path)) - cork->flags |= IPCORK_ALLFRAG; - cork->length = 0; + err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit, + tclass, opt, rt, fl6); + if (err) + return err; exthdrlen = (opt ? 
opt->opt_flen : 0); length += exthdrlen; transhdrlen += exthdrlen; @@ -1226,8 +1245,8 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, transhdrlen = 0; exthdrlen = 0; dst_exthdrlen = 0; - mtu = cork->fragsize; } + mtu = cork->fragsize; orig_mtu = mtu; hh_len = LL_RESERVED_SPACE(rt->dst.dev); @@ -1503,23 +1522,24 @@ error: } EXPORT_SYMBOL_GPL(ip6_append_data); -static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np) +static void ip6_cork_release(struct inet_cork_full *cork, + struct inet6_cork *v6_cork) { - if (np->cork.opt) { - kfree(np->cork.opt->dst0opt); - kfree(np->cork.opt->dst1opt); - kfree(np->cork.opt->hopopt); - kfree(np->cork.opt->srcrt); - kfree(np->cork.opt); - np->cork.opt = NULL; + if (v6_cork->opt) { + kfree(v6_cork->opt->dst0opt); + kfree(v6_cork->opt->dst1opt); + kfree(v6_cork->opt->hopopt); + kfree(v6_cork->opt->srcrt); + kfree(v6_cork->opt); + v6_cork->opt = NULL; } - if (inet->cork.base.dst) { - dst_release(inet->cork.base.dst); - inet->cork.base.dst = NULL; - inet->cork.base.flags &= ~IPCORK_ALLFRAG; + if (cork->base.dst) { + dst_release(cork->base.dst); + cork->base.dst = NULL; + cork->base.flags &= ~IPCORK_ALLFRAG; } - memset(&inet->cork.fl, 0, sizeof(inet->cork.fl)); + memset(&cork->fl, 0, sizeof(cork->fl)); } int ip6_push_pending_frames(struct sock *sk) @@ -1599,7 +1619,7 @@ int ip6_push_pending_frames(struct sock *sk) } out: - ip6_cork_release(inet, np); + ip6_cork_release(&inet->cork, &np->cork); return err; error: IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); @@ -1618,6 +1638,6 @@ void ip6_flush_pending_frames(struct sock *sk) kfree_skb(skb); } - ip6_cork_release(inet_sk(sk), inet6_sk(sk)); + ip6_cork_release(&inet_sk(sk)->cork, &inet6_sk(sk)->cork); } EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); -- cgit v1.2.3 From 5a2e87b16875f9b83b7e9494cf1fce8e17dc764a Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 2 Feb 2015 15:18:42 +0200 Subject: net/mlx4_core: Fix kernel Oops (mem corruption) when working with more than 80 VFs Commit de966c592802 (net/mlx4_core: Support more than 64 VFs) was meant to allow up to 126 VFs. However, because MLX4_MFUNC_MAX was left too low, memory corruptions (and Oopses) resulted when more than 80 VFs were requested. In addition, the number of slaves was left too high. This commit fixes these issues. Fixes: de966c592802 ("net/mlx4_core: Support more than 64 VFs") Signed-off-by: Jack Morgenstein Signed-off-by: Amir Vadai Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 3 ++- include/linux/mlx4/device.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index bdd4eea2247c..210691c89b6c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -235,7 +235,8 @@ do { \ extern int mlx4_log_num_mgm_entry_size; extern int log_mtts_per_seg; -#define MLX4_MAX_NUM_SLAVES (MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF) +#define MLX4_MAX_NUM_SLAVES (min(MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF, \ + MLX4_MFUNC_MAX)) #define ALL_SLAVES 0xff struct mlx4_bitmap { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 25c791e295fd..5f3a9aa7225d 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -97,7 +97,7 @@ enum { MLX4_MAX_NUM_PF = 16, MLX4_MAX_NUM_VF = 126, MLX4_MAX_NUM_VF_P_PORT = 64, - MLX4_MFUNC_MAX = 80, + MLX4_MFUNC_MAX = 128, MLX4_MAX_EQ_NUM = 1024, MLX4_MFUNC_EQ_NUM = 4, MLX4_MFUNC_MAX_EQES = 8, -- cgit v1.2.3 From 3e87523897e18a3e17fc8955ed795188be737ff1 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 2 Feb 2015 09:39:02 -0500 Subject: sched/wait: Remove might_sleep() from wait_event_cmd() The patch e22b886a8a43 ("sched/wait: Add might_sleep() checks") introduced a bug in the raid5 subsystem. The function raid5_quiesce() (and resize_stripes()) uses the 'cmd' part to release and acquire a spinlock (so we call the sleep primitives in atomic context), and therefore we cannot do the might_sleep() check. Remove it. Fixes: e22b886a8a43 ("sched/wait: Add might_sleep() checks") Signed-off-by: Mikulas Patocka Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/alpine.LRH.2.02.1502020935580.13510@file01.intranet.prod.int.rdu2.redhat.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 2232ed16635a..37423e0e1379 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -363,7 +363,6 @@ do { \ */ #define wait_event_cmd(wq, condition, cmd1, cmd2) \ do { \ - might_sleep(); \ if (condition) \ break; \ __wait_event_cmd(wq, condition, cmd1, cmd2); \ -- cgit v1.2.3 From f0b66a2cf68ed3613fe72fe01ed309f998e2bbb3 Mon Sep 17 00:00:00 2001 From: Kevin Hao Date: Tue, 3 Feb 2015 09:29:52 -0600 Subject: PCI: Add pci_device_to_OF_node() stub for !CONFIG_OF Add a stub for pci_device_to_OF_node() so drivers don't need to use #ifdef CONFIG_OF around calls to it. 
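[Editor's note: a minimal sketch, not part of the patch, of the calling pattern this stub enables. The driver name, probe function and property below are hypothetical; with the stub in place the same code compiles with and without CONFIG_OF, and simply sees a NULL node in the latter case:

	#include <linux/pci.h>
	#include <linux/of.h>

	static int foo_probe(struct pci_dev *pdev, const struct pci_device_id *id)
	{
		/* NULL on !CONFIG_OF kernels, so no #ifdef is needed here. */
		struct device_node *np = pci_device_to_OF_node(pdev);
		u32 rate = 100000000;	/* fallback when there is no DT node */

		if (np)
			of_property_read_u32(np, "foo,clock-rate", &rate);

		return foo_hw_init(pdev, rate);	/* foo_hw_init() is a hypothetical helper */
	}
]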
Signed-off-by: Kevin Hao Signed-off-by: Bjorn Helgaas --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 360a966a97a5..fbb5795a8d86 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1847,6 +1847,8 @@ static inline void pci_set_of_node(struct pci_dev *dev) { } static inline void pci_release_of_node(struct pci_dev *dev) { } static inline void pci_set_bus_of_node(struct pci_bus *bus) { } static inline void pci_release_bus_of_node(struct pci_bus *bus) { } +static inline struct device_node * +pci_device_to_OF_node(const struct pci_dev *pdev) { return NULL; } #endif /* CONFIG_OF */ #ifdef CONFIG_EEH -- cgit v1.2.3 From 344d635b9a41b19837ccf8083a99ea688027019c Mon Sep 17 00:00:00 2001 From: Brad Griffis Date: Tue, 3 Feb 2015 11:44:12 -0800 Subject: Input: ti_am335x_tsc - remove udelay in interrupt handler The TSC interrupt handler had a udelay to avoid reporting false pen-up interrupts to user space. This patch implements the workaround suggested in Advisory 1.0.31 of the silicon errata for am335x, thus eliminating the udelay and the touchscreen lag. This also improves touchscreen performance and eliminates the sudden jump of the cursor at touch release. The IDLECONFIG and CHARGECONFIG registers are to be configured with the same values in order to eliminate false pen-up events. This workaround may result in a false pen-down being detected, hence a considerable charge step delay needs to be added. The charge delay is set to 0xB000 (in terms of ADC clock cycles) by default. TSC steps are disabled at the end of every sampling cycle and the EOS bit is set. Once the EOS bit is set, the TSC steps need to be re-enabled to begin the next sampling cycle. Signed-off-by: Brad Griffis [vigneshr@ti.com: Ported the patch from v3.12 to v3.19rc1] Signed-off-by: Vignesh R Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/ti_am335x_tsc.c | 67 ++++++++++++++----------------- include/linux/mfd/ti_am335x_tscadc.h | 3 +- 2 files changed, 33 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/touchscreen/ti_am335x_tsc.c b/drivers/input/touchscreen/ti_am335x_tsc.c index dfbb9fe6a270..0625c102a1d0 100644 --- a/drivers/input/touchscreen/ti_am335x_tsc.c +++ b/drivers/input/touchscreen/ti_am335x_tsc.c @@ -173,11 +173,9 @@ static void titsc_step_config(struct titsc *ts_dev) titsc_writel(ts_dev, REG_STEPDELAY(i), STEPCONFIG_OPENDLY); } - /* Charge step configuration */ - config = ts_dev->bit_xp | ts_dev->bit_yn | - STEPCHARGE_RFP_XPUL | STEPCHARGE_RFM_XNUR | - STEPCHARGE_INM_AN1 | STEPCHARGE_INP(ts_dev->inp_yp); + /* Make CHARGECONFIG same as IDLECONFIG */ + config = titsc_readl(ts_dev, REG_IDLECONFIG); titsc_writel(ts_dev, REG_CHARGECONFIG, config); titsc_writel(ts_dev, REG_CHARGEDELAY, CHARGEDLY_OPENDLY); @@ -261,12 +259,34 @@ static irqreturn_t titsc_irq(int irq, void *dev) { struct titsc *ts_dev = dev; struct input_dev *input_dev = ts_dev->input; - unsigned int status, irqclr = 0; + unsigned int fsm, status, irqclr = 0; unsigned int x = 0, y = 0; unsigned int z1, z2, z; - unsigned int fsm; - status = titsc_readl(ts_dev, REG_IRQSTATUS); + status = titsc_readl(ts_dev, REG_RAWIRQSTATUS); + if (status & IRQENB_HW_PEN) { + ts_dev->pen_down = true; + titsc_writel(ts_dev, REG_IRQWAKEUP, 0x00); + titsc_writel(ts_dev, REG_IRQCLR, IRQENB_HW_PEN); + irqclr |= IRQENB_HW_PEN; + } + + if (status & IRQENB_PENUP) { + fsm = titsc_readl(ts_dev, REG_ADCFSM); + if (fsm == ADCFSM_STEPID) { + ts_dev->pen_down = false; +
input_report_key(input_dev, BTN_TOUCH, 0); + input_report_abs(input_dev, ABS_PRESSURE, 0); + input_sync(input_dev); + } else { + ts_dev->pen_down = true; + } + irqclr |= IRQENB_PENUP; + } + + if (status & IRQENB_EOS) + irqclr |= IRQENB_EOS; + /* * ADC and touchscreen share the IRQ line. * FIFO1 interrupts are used by ADC. Handle FIFO0 IRQs here only @@ -297,37 +317,11 @@ static irqreturn_t titsc_irq(int irq, void *dev) } irqclr |= IRQENB_FIFO0THRES; } - - /* - * Time for sequencer to settle, to read - * correct state of the sequencer. - */ - udelay(SEQ_SETTLE); - - status = titsc_readl(ts_dev, REG_RAWIRQSTATUS); - if (status & IRQENB_PENUP) { - /* Pen up event */ - fsm = titsc_readl(ts_dev, REG_ADCFSM); - if (fsm == ADCFSM_STEPID) { - ts_dev->pen_down = false; - input_report_key(input_dev, BTN_TOUCH, 0); - input_report_abs(input_dev, ABS_PRESSURE, 0); - input_sync(input_dev); - } else { - ts_dev->pen_down = true; - } - irqclr |= IRQENB_PENUP; - } - - if (status & IRQENB_HW_PEN) { - - titsc_writel(ts_dev, REG_IRQWAKEUP, 0x00); - titsc_writel(ts_dev, REG_IRQCLR, IRQENB_HW_PEN); - } - if (irqclr) { titsc_writel(ts_dev, REG_IRQSTATUS, irqclr); - am335x_tsc_se_set_cache(ts_dev->mfd_tscadc, ts_dev->step_mask); + if (status & IRQENB_EOS) + am335x_tsc_se_set_cache(ts_dev->mfd_tscadc, + ts_dev->step_mask); return IRQ_HANDLED; } return IRQ_NONE; @@ -417,6 +411,7 @@ static int titsc_probe(struct platform_device *pdev) } titsc_writel(ts_dev, REG_IRQENABLE, IRQENB_FIFO0THRES); + titsc_writel(ts_dev, REG_IRQENABLE, IRQENB_EOS); err = titsc_config_wires(ts_dev); if (err) { dev_err(&pdev->dev, "wrong i/p wire configuration\n"); diff --git a/include/linux/mfd/ti_am335x_tscadc.h b/include/linux/mfd/ti_am335x_tscadc.h index e2e70053470e..3f4e994ace2b 100644 --- a/include/linux/mfd/ti_am335x_tscadc.h +++ b/include/linux/mfd/ti_am335x_tscadc.h @@ -52,6 +52,7 @@ /* IRQ enable */ #define IRQENB_HW_PEN BIT(0) +#define IRQENB_EOS BIT(1) #define IRQENB_FIFO0THRES BIT(2) #define IRQENB_FIFO0OVRRUN BIT(3) #define IRQENB_FIFO0UNDRFLW BIT(4) @@ -107,7 +108,7 @@ /* Charge delay */ #define CHARGEDLY_OPEN_MASK (0x3FFFF << 0) #define CHARGEDLY_OPEN(val) ((val) << 0) -#define CHARGEDLY_OPENDLY CHARGEDLY_OPEN(1) +#define CHARGEDLY_OPENDLY CHARGEDLY_OPEN(0x400) /* Control register */ #define CNTRLREG_TSCSSENB BIT(0) -- cgit v1.2.3 From a49170b552423a3e85fc4f0d778c707402ee4863 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 2 Feb 2015 10:42:58 +0800 Subject: ACPI: Return translation offset when parsing ACPI address space resources Change the functions acpi_dev_resource_address_space() and acpi_dev_resource_ext_address_space() to return the address space translation offset. It's based on a patch from Yinghai Lu . Signed-off-by: Jiang Liu Signed-off-by: Rafael J. Wysocki --- drivers/acpi/resource.c | 58 +++++++++++++++++++++++------------------- drivers/pnp/pnpacpi/rsparser.c | 29 +++++++++++---------- include/linux/acpi.h | 9 +++++-- 3 files changed, 54 insertions(+), 42 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 57891a621b96..c902c8eece81 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -184,13 +184,14 @@ bool acpi_dev_resource_io(struct acpi_resource *ares, struct resource *res) } EXPORT_SYMBOL_GPL(acpi_dev_resource_io); -static bool acpi_decode_space(struct resource *res, +static bool acpi_decode_space(struct resource_win *win, struct acpi_resource_address *addr, struct acpi_address64_attribute *attr) { u8 iodec = attr->granularity == 0xfff ?
ACPI_DECODE_10 : ACPI_DECODE_16; bool wp = addr->info.mem.write_protect; u64 len = attr->address_length; + struct resource *res = &win->res; /* * Filter out invalid descriptor according to ACPI Spec 5.0, section @@ -218,6 +219,8 @@ static bool acpi_decode_space(struct resource *res, return false; } + win->offset = attr->translation_offset; + if (addr->producer_consumer == ACPI_PRODUCER) res->flags |= IORESOURCE_WINDOW; @@ -230,27 +233,28 @@ static bool acpi_decode_space(struct resource *res, /** * acpi_dev_resource_address_space - Extract ACPI address space information. * @ares: Input ACPI resource object. - * @res: Output generic resource object. + * @win: Output generic resource object. * * Check if the given ACPI resource object represents an address space resource * and if that's the case, use the information in it to populate the generic - * resource object pointed to by @res. + * resource object pointed to by @win. * * Return: - * 1) false with res->flags setting to zero: not the expected resource type - * 2) false with IORESOURCE_DISABLED in res->flags: valid unassigned resource + * 1) false with win->res.flags setting to zero: not the expected resource type + * 2) false with IORESOURCE_DISABLED in win->res.flags: valid unassigned + * resource * 3) true: valid assigned resource */ bool acpi_dev_resource_address_space(struct acpi_resource *ares, - struct resource *res) + struct resource_win *win) { struct acpi_resource_address64 addr; - res->flags = 0; + win->res.flags = 0; if (ACPI_FAILURE(acpi_resource_to_address64(ares, &addr))) return false; - return acpi_decode_space(res, (struct acpi_resource_address *)&addr, + return acpi_decode_space(win, (struct acpi_resource_address *)&addr, &addr.address); } EXPORT_SYMBOL_GPL(acpi_dev_resource_address_space); @@ -258,29 +262,30 @@ EXPORT_SYMBOL_GPL(acpi_dev_resource_address_space); /** * acpi_dev_resource_ext_address_space - Extract ACPI address space information. * @ares: Input ACPI resource object. - * @res: Output generic resource object. + * @win: Output generic resource object. * * Check if the given ACPI resource object represents an extended address space * resource and if that's the case, use the information in it to populate the - * generic resource object pointed to by @res. + * generic resource object pointed to by @win. 
* * Return: - * 1) false with res->flags setting to zero: not the expected resource type - * 2) false with IORESOURCE_DISABLED in res->flags: valid unassigned resource + * 1) false with win->res.flags setting to zero: not the expected resource type + * 2) false with IORESOURCE_DISABLED in win->res.flags: valid unassigned + * resource * 3) true: valid assigned resource */ bool acpi_dev_resource_ext_address_space(struct acpi_resource *ares, - struct resource *res) + struct resource_win *win) { struct acpi_resource_extended_address64 *ext_addr; - res->flags = 0; + win->res.flags = 0; if (ares->type != ACPI_RESOURCE_TYPE_EXTENDED_ADDRESS64) return false; ext_addr = &ares->data.ext_address64; - return acpi_decode_space(res, (struct acpi_resource_address *)ext_addr, + return acpi_decode_space(win, (struct acpi_resource_address *)ext_addr, &ext_addr->address); } EXPORT_SYMBOL_GPL(acpi_dev_resource_ext_address_space); @@ -441,7 +446,7 @@ struct res_proc_context { int error; }; -static acpi_status acpi_dev_new_resource_entry(struct resource *r, +static acpi_status acpi_dev_new_resource_entry(struct resource_win *win, struct res_proc_context *c) { struct resource_list_entry *rentry; @@ -451,7 +456,7 @@ static acpi_status acpi_dev_new_resource_entry(struct resource *r, c->error = -ENOMEM; return AE_NO_MEMORY; } - rentry->res = *r; + rentry->res = win->res; list_add_tail(&rentry->node, c->list); c->count++; return AE_OK; @@ -461,7 +466,8 @@ static acpi_status acpi_dev_process_resource(struct acpi_resource *ares, void *context) { struct res_proc_context *c = context; - struct resource r; + struct resource_win win; + struct resource *res = &win.res; int i; if (c->preproc) { @@ -476,18 +482,18 @@ static acpi_status acpi_dev_process_resource(struct acpi_resource *ares, } } - memset(&r, 0, sizeof(r)); + memset(&win, 0, sizeof(win)); - if (acpi_dev_resource_memory(ares, &r) - || acpi_dev_resource_io(ares, &r) - || acpi_dev_resource_address_space(ares, &r) - || acpi_dev_resource_ext_address_space(ares, &r)) - return acpi_dev_new_resource_entry(&r, c); + if (acpi_dev_resource_memory(ares, res) + || acpi_dev_resource_io(ares, res) + || acpi_dev_resource_address_space(ares, &win) + || acpi_dev_resource_ext_address_space(ares, &win)) + return acpi_dev_new_resource_entry(&win, c); - for (i = 0; acpi_dev_resource_interrupt(ares, i, &r); i++) { + for (i = 0; acpi_dev_resource_interrupt(ares, i, res); i++) { acpi_status status; - status = acpi_dev_new_resource_entry(&r, c); + status = acpi_dev_new_resource_entry(&win, c); if (ACPI_FAILURE(status)) return status; } diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c index 2d9bc789af0f..ff0356fb378f 100644 --- a/drivers/pnp/pnpacpi/rsparser.c +++ b/drivers/pnp/pnpacpi/rsparser.c @@ -180,20 +180,21 @@ static acpi_status pnpacpi_allocated_resource(struct acpi_resource *res, struct pnp_dev *dev = data; struct acpi_resource_dma *dma; struct acpi_resource_vendor_typed *vendor_typed; - struct resource r = {0}; + struct resource_win win = {{0}, 0}; + struct resource *r = &win.res; int i, flags; - if (acpi_dev_resource_address_space(res, &r) - || acpi_dev_resource_ext_address_space(res, &r)) { - pnp_add_resource(dev, &r); + if (acpi_dev_resource_address_space(res, &win) + || acpi_dev_resource_ext_address_space(res, &win)) { + pnp_add_resource(dev, &win.res); return AE_OK; } - r.flags = 0; - if (acpi_dev_resource_interrupt(res, 0, &r)) { - pnpacpi_add_irqresource(dev, &r); - for (i = 1; acpi_dev_resource_interrupt(res, i, &r); i++) - 
pnpacpi_add_irqresource(dev, &r); + r->flags = 0; + if (acpi_dev_resource_interrupt(res, 0, r)) { + pnpacpi_add_irqresource(dev, r); + for (i = 1; acpi_dev_resource_interrupt(res, i, r); i++) + pnpacpi_add_irqresource(dev, r); if (i > 1) { /* @@ -209,7 +210,7 @@ static acpi_status pnpacpi_allocated_resource(struct acpi_resource *res, } } return AE_OK; - } else if (r.flags & IORESOURCE_DISABLED) { + } else if (r->flags & IORESOURCE_DISABLED) { pnp_add_irq_resource(dev, 0, IORESOURCE_DISABLED); return AE_OK; } @@ -218,13 +219,13 @@ case ACPI_RESOURCE_TYPE_MEMORY24: case ACPI_RESOURCE_TYPE_MEMORY32: case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: - if (acpi_dev_resource_memory(res, &r)) - pnp_add_resource(dev, &r); + if (acpi_dev_resource_memory(res, r)) + pnp_add_resource(dev, r); break; case ACPI_RESOURCE_TYPE_IO: case ACPI_RESOURCE_TYPE_FIXED_IO: - if (acpi_dev_resource_io(res, &r)) - pnp_add_resource(dev, &r); + if (acpi_dev_resource_io(res, r)) + pnp_add_resource(dev, r); break; case ACPI_RESOURCE_TYPE_DMA: dma = &res->data.dma; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index d459cd17b477..be9eaee8f4ae 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -285,12 +285,17 @@ extern int pnpacpi_disabled; #define PXM_INVAL (-1) +struct resource_win { + struct resource res; + resource_size_t offset; +}; + bool acpi_dev_resource_memory(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_io(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_address_space(struct acpi_resource *ares, - struct resource *res); + struct resource_win *win); bool acpi_dev_resource_ext_address_space(struct acpi_resource *ares, - struct resource *res); + struct resource_win *win); unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable); bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index, struct resource *res); -- cgit v1.2.3 From 93286f4798590e711aa395503401f8632fb74f9a Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 2 Feb 2015 10:43:00 +0800 Subject: ACPI: Add field offset to struct resource_list_entry Add a field 'offset' to struct resource_list_entry to hold the address space translation offset, so that the structure can be used to represent bridge resources. Signed-off-by: Jiang Liu Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/resource.c | 1 + include/linux/acpi.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 4dc8cfb2e94e..1c3abae6f2fa 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -472,6 +472,7 @@ static acpi_status acpi_dev_new_resource_entry(struct resource_win *win, return AE_NO_MEMORY; } rentry->res = win->res; + rentry->offset = win->offset; list_add_tail(&rentry->node, c->list); c->count++; return AE_OK; diff --git a/include/linux/acpi.h b/include/linux/acpi.h index be9eaee8f4ae..21dac3cb62d2 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -303,6 +303,7 @@ bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index, struct resource_list_entry { struct list_head node; struct resource res; + resource_size_t offset; }; void acpi_dev_free_resource_list(struct list_head *list); -- cgit v1.2.3 From 62d1141ff34e35de496ba06491c8e854b23b3f3e Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Mon, 2 Feb 2015 10:43:01 +0800 Subject: ACPI: Introduce helper function acpi_dev_filter_resource_type() Introduce the helper function acpi_dev_filter_resource_type(), which may be used by acpi_dev_get_resources() to filter out resources based on resource type. Signed-off-by: Jiang Liu Signed-off-by: Rafael J. Wysocki --- drivers/acpi/resource.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 8 +++++++ 2 files changed, 63 insertions(+) (limited to 'include/linux') diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 1c3abae6f2fa..3ea0d17eb951 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -569,3 +569,58 @@ int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, return c.count; } EXPORT_SYMBOL_GPL(acpi_dev_get_resources); + +/** + * acpi_dev_filter_resource_type - Filter ACPI resource according to resource + * types + * @ares: Input ACPI resource object. + * @types: Valid resource types of IORESOURCE_XXX + * + * This is a helper function to support acpi_dev_get_resources(), which filters + * ACPI resource objects according to resource types. + */ +int acpi_dev_filter_resource_type(struct acpi_resource *ares, + unsigned long types) +{ + unsigned long type = 0; + + switch (ares->type) { + case ACPI_RESOURCE_TYPE_MEMORY24: + case ACPI_RESOURCE_TYPE_MEMORY32: + case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: + type = IORESOURCE_MEM; + break; + case ACPI_RESOURCE_TYPE_IO: + case ACPI_RESOURCE_TYPE_FIXED_IO: + type = IORESOURCE_IO; + break; + case ACPI_RESOURCE_TYPE_IRQ: + case ACPI_RESOURCE_TYPE_EXTENDED_IRQ: + type = IORESOURCE_IRQ; + break; + case ACPI_RESOURCE_TYPE_DMA: + case ACPI_RESOURCE_TYPE_FIXED_DMA: + type = IORESOURCE_DMA; + break; + case ACPI_RESOURCE_TYPE_GENERIC_REGISTER: + type = IORESOURCE_REG; + break; + case ACPI_RESOURCE_TYPE_ADDRESS16: + case ACPI_RESOURCE_TYPE_ADDRESS32: + case ACPI_RESOURCE_TYPE_ADDRESS64: + case ACPI_RESOURCE_TYPE_EXTENDED_ADDRESS64: + if (ares->data.address.resource_type == ACPI_MEMORY_RANGE) + type = IORESOURCE_MEM; + else if (ares->data.address.resource_type == ACPI_IO_RANGE) + type = IORESOURCE_IO; + else if (ares->data.address.resource_type == + ACPI_BUS_NUMBER_RANGE) + type = IORESOURCE_BUS; + break; + default: + break; + } + + return (type & types) ?
0 : 1; +} +EXPORT_SYMBOL_GPL(acpi_dev_filter_resource_type); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 21dac3cb62d2..e818decb631f 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -310,6 +310,14 @@ void acpi_dev_free_resource_list(struct list_head *list); int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, int (*preproc)(struct acpi_resource *, void *), void *preproc_data); +int acpi_dev_filter_resource_type(struct acpi_resource *ares, + unsigned long types); + +static inline int acpi_dev_filter_resource_type_cb(struct acpi_resource *ares, + void *arg) +{ + return acpi_dev_filter_resource_type(ares, (unsigned long)arg); +} int acpi_check_resource_conflict(const struct resource *res); -- cgit v1.2.3 From c1dbe2fbb33ef425a81e1a7cffd17c113c87cdbc Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 27 Jan 2015 21:13:39 +0100 Subject: PM / Domains: Remove reference counting for the generic_pm_domain_data The reference counting was needed when genpd supported PM domain device callbacks. Since this option has been removed, let's also remove the reference counting of the struct generic_pm_domain_data. Signed-off-by: Ulf Hansson Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 10 ++-------- include/linux/pm_domain.h | 1 - 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index f9e7df554b2f..351df5bbd9c9 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -1456,7 +1456,6 @@ int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, gpd_data = gpd_data_new; dev->power.subsys_data->domain_data = &gpd_data->base; } - gpd_data->refcount++; if (td) gpd_data->td = *td; @@ -1504,7 +1503,6 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, { struct generic_pm_domain_data *gpd_data; struct pm_domain_data *pdd; - bool remove = false; int ret = 0; dev_dbg(dev, "%s()\n", __func__); @@ -1533,10 +1531,7 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, pdd = dev->power.subsys_data->domain_data; list_del_init(&pdd->list_node); gpd_data = to_gpd_data(pdd); - if (--gpd_data->refcount == 0) { - dev->power.subsys_data->domain_data = NULL; - remove = true; - } + dev->power.subsys_data->domain_data = NULL; spin_unlock_irq(&dev->power.lock); @@ -1547,8 +1542,7 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, genpd_release_lock(genpd); dev_pm_put_subsys_data(dev); - if (remove) - genpd_free_dev_data(dev, gpd_data); + genpd_free_dev_data(dev, gpd_data); return 0; diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index ed607760fc20..e160a0bba28d 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -114,7 +114,6 @@ struct generic_pm_domain_data { struct gpd_timing_data td; struct notifier_block nb; struct mutex lock; - unsigned int refcount; int need_restore; }; -- cgit v1.2.3 From c0356db7d1b66840882744cbd9d9c5960b2d88c7 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 27 Jan 2015 21:13:42 +0100 Subject: PM / Domains: Eliminate the mutex for the generic_pm_domain_data While adding devices to their PM domains, dev_pm_qos_add_notifier() was invoked during allocation of the generic_pm_domain_data for the device. Since the generic_pm_domain_data's device pointer will be assigned after allocation, the ->genpd_dev_pm_qos_notifier() callback could be called prior to having a valid pointer to the device.
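[Editor's note: a simplified sketch, not from the patch, of the race described above. The notifier used to be registered inside genpd_alloc_dev_data(), before gpd_data->base.dev was assigned, so a QoS update on another CPU could invoke the callback while the device pointer was still NULL; the interleaving is roughly:

	/* CPU0: __pm_genpd_add_device()          CPU1: PM QoS update            */
	gpd_data = genpd_alloc_dev_data(dev);
		/* dev_pm_qos_add_notifier() ran in here */
	                                           genpd_dev_pm_qos_notifier(...)
	                                             dev = gpd_data->base.dev;
	                                             /* dev is still NULL here */
	gpd_data->base.dev = dev;                  /* assigned too late */

This is why the old callback took gpd_data->lock and returned NOTIFY_DONE on a NULL dev; re-ordering the registration, as done in the diff below, removes that window and with it the need for the mutex.]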
A similar scenario existed while removing a device from a genpd. To cope with these scenarios, a mutex was used to protect the pointer to the device. By re-ordering the sequence for when dev_pm_qos_add|remove_notifier() are invoked, we make sure the ->genpd_dev_pm_qos_notifier() callback is always called with a valid device pointer available. In this way, we eliminate the need for protecting the pointer and thus we can remove the mutex from the struct generic_pm_domain_data. Signed-off-by: Ulf Hansson Signed-off-by: Rafael J. Wysocki --- drivers/base/power/domain.c | 37 ++++++++++++++----------------------- include/linux/pm_domain.h | 1 - 2 files changed, 14 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index 88198ba919d9..1f026c18bc5c 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -344,14 +344,7 @@ static int genpd_dev_pm_qos_notifier(struct notifier_block *nb, struct device *dev; gpd_data = container_of(nb, struct generic_pm_domain_data, nb); - - mutex_lock(&gpd_data->lock); dev = gpd_data->base.dev; - if (!dev) { - mutex_unlock(&gpd_data->lock); - return NOTIFY_DONE; - } - mutex_unlock(&gpd_data->lock); for (;;) { struct generic_pm_domain *genpd; @@ -1392,16 +1385,12 @@ static struct generic_pm_domain_data *genpd_alloc_dev_data(struct device *dev) if (!gpd_data) return NULL; - mutex_init(&gpd_data->lock); - gpd_data->nb.notifier_call = genpd_dev_pm_qos_notifier; - dev_pm_qos_add_notifier(dev, &gpd_data->nb); return gpd_data; } static void genpd_free_dev_data(struct device *dev, struct generic_pm_domain_data *gpd_data) { - dev_pm_qos_remove_notifier(dev, &gpd_data->nb); kfree(gpd_data); } @@ -1414,7 +1403,7 @@ static void genpd_free_dev_data(struct device *dev, int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, struct gpd_timing_data *td) { - struct generic_pm_domain_data *gpd_data_new, *gpd_data = NULL; + struct generic_pm_domain_data *gpd_data; int ret = 0; dev_dbg(dev, "%s()\n", __func__); @@ -1422,8 +1411,8 @@ int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(dev)) return -EINVAL; - gpd_data_new = genpd_alloc_dev_data(dev); - if (!gpd_data_new) + gpd_data = genpd_alloc_dev_data(dev); + if (!gpd_data) return -ENOMEM; genpd_acquire_lock(genpd); @@ -1445,7 +1434,6 @@ int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, goto out; } - gpd_data = gpd_data_new; dev->power.subsys_data->domain_data = &gpd_data->base; if (td) @@ -1461,19 +1449,20 @@ int __pm_genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, genpd->device_count++; genpd->max_off_time_changed = true; - mutex_lock(&gpd_data->lock); gpd_data->base.dev = dev; list_add_tail(&gpd_data->base.list_node, &genpd->dev_list); gpd_data->need_restore = -1; gpd_data->td.constraint_changed = true; gpd_data->td.effective_constraint_ns = -1; - mutex_unlock(&gpd_data->lock); + gpd_data->nb.notifier_call = genpd_dev_pm_qos_notifier; out: genpd_release_lock(genpd); - if (gpd_data != gpd_data_new) - genpd_free_dev_data(dev, gpd_data_new); + if (ret) + genpd_free_dev_data(dev, gpd_data); + else + dev_pm_qos_add_notifier(dev, &gpd_data->nb); return ret; } @@ -1509,6 +1498,11 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, || pd_to_genpd(dev->pm_domain) != genpd) return -EINVAL; + /* The above validation also means we have existing domain_data.
*/ + pdd = dev->power.subsys_data->domain_data; + gpd_data = to_gpd_data(pdd); + dev_pm_qos_remove_notifier(dev, &gpd_data->nb); + genpd_acquire_lock(genpd); if (genpd->prepared_count > 0) { @@ -1525,16 +1519,12 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, spin_lock_irq(&dev->power.lock); dev->pm_domain = NULL; - pdd = dev->power.subsys_data->domain_data; list_del_init(&pdd->list_node); - gpd_data = to_gpd_data(pdd); dev->power.subsys_data->domain_data = NULL; spin_unlock_irq(&dev->power.lock); - mutex_lock(&gpd_data->lock); pdd->dev = NULL; - mutex_unlock(&gpd_data->lock); genpd_release_lock(genpd); @@ -1545,6 +1535,7 @@ int pm_genpd_remove_device(struct generic_pm_domain *genpd, out: genpd_release_lock(genpd); + dev_pm_qos_add_notifier(dev, &gpd_data->nb); return ret; } diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index e160a0bba28d..080e778118ba 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -113,7 +113,6 @@ struct generic_pm_domain_data { struct pm_domain_data base; struct gpd_timing_data td; struct notifier_block nb; - struct mutex lock; int need_restore; }; -- cgit v1.2.3 From 1e95e3b2da424db68d0a465273f1901a990c6277 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 29 Jan 2015 18:39:05 +0100 Subject: PM: Convert dev_pm_put_subsys_data() into a void function Clients using the dev_pm_put_subsys_data() API aren't interested in its return value. They care only about decreasing a reference to the device's pm_subsys_data. So, let's convert the API to a void function, which anyway seems like a reasonable thing to do. Signed-off-by: Ulf Hansson Acked-by: Geert Uytterhoeven Signed-off-by: Rafael J. Wysocki --- drivers/base/power/common.c | 14 ++++---------- include/linux/pm.h | 2 +- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/power/common.c b/drivers/base/power/common.c index a1ee51d43da1..f32b802b98f4 100644 --- a/drivers/base/power/common.c +++ b/drivers/base/power/common.c @@ -56,13 +56,11 @@ EXPORT_SYMBOL_GPL(dev_pm_get_subsys_data); * @dev: Device to handle. * * If the reference counter of power.subsys_data is zero after dropping the - * reference, power.subsys_data is removed. Return 1 if that happens or 0 - * otherwise. + * reference, power.subsys_data is removed.
*/ -int dev_pm_put_subsys_data(struct device *dev) +void dev_pm_put_subsys_data(struct device *dev) { struct pm_subsys_data *psd; - int ret = 1; spin_lock_irq(&dev->power.lock); @@ -70,18 +68,14 @@ int dev_pm_put_subsys_data(struct device *dev) if (!psd) goto out; - if (--psd->refcount == 0) { + if (--psd->refcount == 0) dev->power.subsys_data = NULL; - } else { + else psd = NULL; - ret = 0; - } out: spin_unlock_irq(&dev->power.lock); kfree(psd); - - return ret; } EXPORT_SYMBOL_GPL(dev_pm_put_subsys_data); diff --git a/include/linux/pm.h b/include/linux/pm.h index 8b5976364619..e2f1be6dd9dd 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -597,7 +597,7 @@ struct dev_pm_info { extern void update_pm_runtime_accounting(struct device *dev); extern int dev_pm_get_subsys_data(struct device *dev); -extern int dev_pm_put_subsys_data(struct device *dev); +extern void dev_pm_put_subsys_data(struct device *dev); /* * Power domains provide callbacks that are executed during system suspend, -- cgit v1.2.3 From 851c63e3b381fdbf5aca1a797f37d8606b5588d2 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Mon, 5 Jan 2015 15:43:39 +0000 Subject: of: Fix brace position for struct of_device_id definition Currently it is not easy to grep for the definition of struct of_device_id. This is trivially fixed by moving the brace to the right place. Signed-off-by: Daniel Thompson Cc: Grant Likely Cc: Rob Herring Signed-off-by: Rob Herring --- include/linux/mod_devicetable.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 745def862580..bbf85d612be5 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -220,8 +220,7 @@ struct serio_device_id { /* * Struct used for matching a device */ -struct of_device_id -{ +struct of_device_id { char name[32]; char type[32]; char compatible[128]; -- cgit v1.2.3 From 4c946d9c11d173c2ea6b9081b248f8072e6b46f1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 27 Nov 2014 19:52:04 -0500 Subject: vmci: propagate msghdr all way down to __qp_memcpy_to_queue() Switch from passing msg->iov_iter.iov to passing msg itself Signed-off-by: Al Viro --- drivers/misc/vmw_vmci/vmci_queue_pair.c | 16 ++++++++-------- include/linux/vmw_vmci_api.h | 2 +- net/vmw_vsock/vmci_transport.c | 3 +-- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c index 7aaaf51e1596..35f19a683822 100644 --- a/drivers/misc/vmw_vmci/vmci_queue_pair.c +++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c @@ -370,12 +370,12 @@ static int __qp_memcpy_to_queue(struct vmci_queue *queue, to_copy = size - bytes_copied; if (is_iovec) { - struct iovec *iov = (struct iovec *)src; + struct msghdr *msg = (struct msghdr *)src; int err; /* The iovec will track bytes_copied internally. 
*/ - err = memcpy_fromiovec((u8 *)va + page_offset, - iov, to_copy); + err = memcpy_from_msg((u8 *)va + page_offset, + msg, to_copy); if (err != 0) { if (kernel_if->host) kunmap(kernel_if->u.h.page[page_index]); @@ -580,7 +580,7 @@ static int qp_memcpy_from_queue(void *dest, */ static int qp_memcpy_to_queue_iov(struct vmci_queue *queue, u64 queue_offset, - const void *src, + const void *msg, size_t src_offset, size_t size) { @@ -588,7 +588,7 @@ static int qp_memcpy_to_queue_iov(struct vmci_queue *queue, * We ignore src_offset because src is really a struct iovec * and will * maintain offset internally. */ - return __qp_memcpy_to_queue(queue, queue_offset, src, size, true); + return __qp_memcpy_to_queue(queue, queue_offset, msg, size, true); } /* @@ -3223,13 +3223,13 @@ EXPORT_SYMBOL_GPL(vmci_qpair_peek); * of bytes enqueued or < 0 on error. */ ssize_t vmci_qpair_enquev(struct vmci_qp *qpair, - void *iov, + struct msghdr *msg, size_t iov_size, int buf_type) { ssize_t result; - if (!qpair || !iov) + if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); @@ -3238,7 +3238,7 @@ ssize_t vmci_qpair_enquev(struct vmci_qp *qpair, result = qp_enqueue_locked(qpair->produce_q, qpair->consume_q, qpair->produce_q_size, - iov, iov_size, + msg, iov_size, qp_memcpy_to_queue_iov); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && diff --git a/include/linux/vmw_vmci_api.h b/include/linux/vmw_vmci_api.h index 5691f752ce8f..63df3a2a8ce5 100644 --- a/include/linux/vmw_vmci_api.h +++ b/include/linux/vmw_vmci_api.h @@ -74,7 +74,7 @@ ssize_t vmci_qpair_dequeue(struct vmci_qp *qpair, ssize_t vmci_qpair_peek(struct vmci_qp *qpair, void *buf, size_t buf_size, int mode); ssize_t vmci_qpair_enquev(struct vmci_qp *qpair, - void *iov, size_t iov_size, int mode); + struct msghdr *msg, size_t iov_size, int mode); ssize_t vmci_qpair_dequev(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, int mode); ssize_t vmci_qpair_peekv(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 02d2e5229240..7f3255084a6c 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1850,8 +1850,7 @@ static ssize_t vmci_transport_stream_enqueue( struct msghdr *msg, size_t len) { - /* XXX: stripping const */ - return vmci_qpair_enquev(vmci_trans(vsk)->qpair, (struct iovec *)msg->msg_iter.iov, len, 0); + return vmci_qpair_enquev(vmci_trans(vsk)->qpair, msg, len, 0); } static s64 vmci_transport_stream_has_data(struct vsock_sock *vsk) -- cgit v1.2.3 From af2b040e470b470bfc881981db3c796072853eae Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 27 Nov 2014 21:44:24 -0500 Subject: rxrpc: switch rxrpc_send_data() to iov_iter primitives Convert skb_add_data() to iov_iter; this allows us to get rid of the explicit messing with the iovec in its only caller - skb_add_data() will keep advancing ->msg_iter for us, so there's no need to simulate that manually.
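For illustration, a minimal sketch (not part of this patch) of how a ->sendmsg() path consumes msg->msg_iter after this conversion; the example_send_data() wrapper and its buffer handling are hypothetical, while skb_add_data(), iov_iter_count() and msg_iter are the interfaces changed below:

static int example_send_data(struct sk_buff *skb, struct msghdr *msg,
			     size_t len)
{
	/* never ask for more than the iterator actually holds */
	if (len > iov_iter_count(&msg->msg_iter))
		len = iov_iter_count(&msg->msg_iter);

	while (len) {
		int copy = min_t(int, skb_tailroom(skb), len);

		if (copy <= 0)
			return -ENOBUFS;	/* sketch: no room left */
		/*
		 * skb_add_data() copies straight from the iterator and
		 * advances it as a side effect, so none of the old
		 * iov/segment/from bookkeeping is needed here.
		 */
		if (skb_add_data(skb, &msg->msg_iter, copy) < 0)
			return -EFAULT;
		len -= copy;
	}
	return 0;
}
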
Signed-off-by: Al Viro --- include/linux/skbuff.h | 11 +++++------ net/rxrpc/ar-output.c | 43 ++++++++++--------------------------------- 2 files changed, 15 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 85ab7d72b54c..9a8bafee1b67 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2484,19 +2484,18 @@ static inline int skb_put_padto(struct sk_buff *skb, unsigned int len) } static inline int skb_add_data(struct sk_buff *skb, - char __user *from, int copy) + struct iov_iter *from, int copy) { const int off = skb->len; if (skb->ip_summed == CHECKSUM_NONE) { - int err = 0; - __wsum csum = csum_and_copy_from_user(from, skb_put(skb, copy), - copy, 0, &err); - if (!err) { + __wsum csum = 0; + if (csum_and_copy_from_iter(skb_put(skb, copy), copy, + &csum, from) == copy) { skb->csum = csum_block_add(skb->csum, csum, off); return 0; } - } else if (!copy_from_user(skb_put(skb, copy), from, copy)) + } else if (copy_from_iter(skb_put(skb, copy), copy, from) == copy) return 0; __skb_trim(skb, off); diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index e1a9373e5979..963a5b91f3e8 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -529,13 +529,11 @@ static int rxrpc_send_data(struct kiocb *iocb, struct msghdr *msg, size_t len) { struct rxrpc_skb_priv *sp; - unsigned char __user *from; struct sk_buff *skb; - const struct iovec *iov; struct sock *sk = &rx->sk; long timeo; bool more; - int ret, ioc, segment, copied; + int ret, copied; timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -545,25 +543,17 @@ static int rxrpc_send_data(struct kiocb *iocb, if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) return -EPIPE; - iov = msg->msg_iter.iov; - ioc = msg->msg_iter.nr_segs - 1; - from = iov->iov_base; - segment = iov->iov_len; - iov++; more = msg->msg_flags & MSG_MORE; skb = call->tx_pending; call->tx_pending = NULL; copied = 0; - do { + if (len > iov_iter_count(&msg->msg_iter)) + len = iov_iter_count(&msg->msg_iter); + while (len) { int copy; - if (segment > len) - segment = len; - - _debug("SEGMENT %d @%p", segment, from); - if (!skb) { size_t size, chunk, max, space; @@ -631,13 +621,13 @@ static int rxrpc_send_data(struct kiocb *iocb, /* append next segment of data to the current buffer */ copy = skb_tailroom(skb); ASSERTCMP(copy, >, 0); - if (copy > segment) - copy = segment; + if (copy > len) + copy = len; if (copy > sp->remain) copy = sp->remain; _debug("add"); - ret = skb_add_data(skb, from, copy); + ret = skb_add_data(skb, &msg->msg_iter, copy); _debug("added"); if (ret < 0) goto efault; @@ -646,18 +636,6 @@ static int rxrpc_send_data(struct kiocb *iocb, copied += copy; len -= copy; - segment -= copy; - from += copy; - while (segment == 0 && ioc > 0) { - from = iov->iov_base; - segment = iov->iov_len; - iov++; - ioc--; - } - if (len == 0) { - segment = 0; - ioc = 0; - } /* check for the far side aborting the call or a network error * occurring */ @@ -665,7 +643,7 @@ static int rxrpc_send_data(struct kiocb *iocb, goto call_aborted; /* add the packet to the send queue if it's now full */ - if (sp->remain <= 0 || (segment == 0 && !more)) { + if (sp->remain <= 0 || (!len && !more)) { struct rxrpc_connection *conn = call->conn; uint32_t seq; size_t pad; @@ -711,11 +689,10 @@ static int rxrpc_send_data(struct kiocb *iocb, memcpy(skb->head, &sp->hdr, sizeof(struct rxrpc_header)); - rxrpc_queue_packet(call, skb, segment == 0 && !more); + rxrpc_queue_packet(call, skb, 
!iov_iter_count(&msg->msg_iter) && !more); skb = NULL; } - - } while (segment > 0); + } success: ret = copied; -- cgit v1.2.3 From 21226abb4e9f14d88238964d89b279e461ddc30c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 28 Nov 2014 15:48:29 -0500 Subject: net: switch memcpy_fromiovec()/memcpy_fromiovecend() users to copy_from_iter() That takes care of the majority of ->sendmsg() instances - most of them via memcpy_to_msg() or assorted getfrag() callbacks. One place where we still keep memcpy_fromiovecend() is tipc - there we potentially read the same data over and over; separate patch, that... Signed-off-by: Al Viro --- include/linux/skbuff.h | 3 +-- include/net/udplite.h | 3 +-- net/ipv4/ip_output.c | 6 ++---- net/ipv4/ping.c | 14 +++++++------- net/ipv4/raw.c | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv6/raw.c | 2 +- 7 files changed, 14 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9a8bafee1b67..b349c96dc80a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2692,8 +2692,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) { - /* XXX: stripping const */ - return memcpy_fromiovec(data, (struct iovec *)msg->msg_iter.iov, len); + return copy_from_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT; } static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) diff --git a/include/net/udplite.h b/include/net/udplite.h index ae7c8d1fbcad..80761938b9a7 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -20,8 +20,7 @@ static __inline__ int udplite_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct msghdr *msg = from; - /* XXX: stripping const */ - return memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len); + return copy_from_iter(to, len, &msg->msg_iter) != len ? 
-EFAULT : 0; } /* Designate sk as UDP-Lite socket */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b50861b22b6b..f998bc87ae38 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -755,13 +755,11 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk struct msghdr *msg = from; if (skb->ip_summed == CHECKSUM_PARTIAL) { - /* XXX: stripping const */ - if (memcpy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len) < 0) + if (copy_from_iter(to, len, &msg->msg_iter) != len) return -EFAULT; } else { __wsum csum = 0; - /* XXX: stripping const */ - if (csum_partial_copy_fromiovecend(to, (struct iovec *)msg->msg_iter.iov, offset, len, &csum) < 0) + if (csum_and_copy_from_iter(to, len, &csum, &msg->msg_iter) != len) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, odd); } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 9e15ba701401..e9f66e1cda50 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -599,18 +599,18 @@ int ping_getfrag(void *from, char *to, struct pingfakehdr *pfh = (struct pingfakehdr *)from; if (offset == 0) { - if (fraglen < sizeof(struct icmphdr)) + fraglen -= sizeof(struct icmphdr); + if (fraglen < 0) BUG(); - if (csum_partial_copy_fromiovecend(to + sizeof(struct icmphdr), - pfh->msg->msg_iter.iov, 0, fraglen - sizeof(struct icmphdr), - &pfh->wcheck)) + if (csum_and_copy_from_iter(to + sizeof(struct icmphdr), + fraglen, &pfh->wcheck, + &pfh->msg->msg_iter) != fraglen) return -EFAULT; } else if (offset < sizeof(struct icmphdr)) { BUG(); } else { - if (csum_partial_copy_fromiovecend - (to, pfh->msg->msg_iter.iov, offset - sizeof(struct icmphdr), - fraglen, &pfh->wcheck)) + if (csum_and_copy_from_iter(to, fraglen, &pfh->wcheck, + &pfh->msg->msg_iter) != fraglen) return -EFAULT; } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 2c9d252072a2..f027a708b7e0 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -382,7 +382,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->transport_header = skb->network_header; err = -EFAULT; - if (memcpy_fromiovecend((void *)iph, msg->msg_iter.iov, 0, length)) + if (memcpy_from_msg(iph, msg, length)) goto error_free; iphlen = iph->ihl * 4; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 93c74829cbce..71fb37c70581 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4368,7 +4368,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto err_free; - if (copy_from_iter(skb_put(skb, size), size, &msg->msg_iter) != size) + if (memcpy_from_msg(skb_put(skb, size), msg, size)) goto err_free; TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 0dbb328fa688..dae7f1a1e464 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -648,7 +648,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->ip_summed = CHECKSUM_NONE; skb->transport_header = skb->network_header; - err = memcpy_fromiovecend((void *)iph, msg->msg_iter.iov, 0, length); + err = memcpy_from_msg(iph, msg, length); if (err) goto error_fault; -- cgit v1.2.3 From 31a25fae85956e3a9c778141d29e5e803fb0b124 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 28 Nov 2014 15:53:57 -0500 Subject: net: bury net/core/iovec.c - nothing in there is used anymore Signed-off-by: Al Viro --- include/linux/socket.h | 7 --- net/core/Makefile | 2 +- net/core/iovec.c | 137 ------------------------------------------------- 3 files changed, 1 
insertion(+), 145 deletions(-) delete mode 100644 net/core/iovec.c (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 6e49a14365dc..5c19cba34dce 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -318,13 +318,6 @@ struct ucred { /* IPX options */ #define IPX_TYPE 1 -extern int csum_partial_copy_fromiovecend(unsigned char *kdata, - struct iovec *iov, - int offset, - unsigned int len, __wsum *csump); -extern unsigned long iov_pages(const struct iovec *iov, int offset, - unsigned long nr_segs); - extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); diff --git a/net/core/Makefile b/net/core/Makefile index 235e6c50708d..fec0856dd6c0 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -2,7 +2,7 @@ # Makefile for the Linux networking core. # -obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \ +obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o diff --git a/net/core/iovec.c b/net/core/iovec.c deleted file mode 100644 index dcbe98b3726a..000000000000 --- a/net/core/iovec.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * iovec manipulation routines. - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Fixes: - * Andrew Lunn : Errors in iovec copying. - * Pedro Roque : Added memcpy_fromiovecend and - * csum_..._fromiovecend. - * Andi Kleen : fixed error handling for 2.1 - * Alexey Kuznetsov: 2.1 optimisations - * Andi Kleen : Fix csum*fromiovecend for IPv6. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * And now for the all-in-one: copy and checksum from a user iovec - * directly to a datagram - * Calls to csum_partial but the last must be in 32 bit chunks - * - * ip_build_xmit must ensure that when fragmenting only the last - * call to this function will be unaligned also. - */ -int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, - int offset, unsigned int len, __wsum *csump) -{ - __wsum csum = *csump; - int partial_cnt = 0, err = 0; - - /* Skip over the finished iovecs */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - } - - while (len > 0) { - u8 __user *base = iov->iov_base + offset; - int copy = min_t(unsigned int, len, iov->iov_len - offset); - - offset = 0; - - /* There is a remnant from previous iov. */ - if (partial_cnt) { - int par_len = 4 - partial_cnt; - - /* iov component is too short ... 
*/ - if (par_len > copy) { - if (copy_from_user(kdata, base, copy)) - goto out_fault; - kdata += copy; - base += copy; - partial_cnt += copy; - len -= copy; - iov++; - if (len) - continue; - *csump = csum_partial(kdata - partial_cnt, - partial_cnt, csum); - goto out; - } - if (copy_from_user(kdata, base, par_len)) - goto out_fault; - csum = csum_partial(kdata - partial_cnt, 4, csum); - kdata += par_len; - base += par_len; - copy -= par_len; - len -= par_len; - partial_cnt = 0; - } - - if (len > copy) { - partial_cnt = copy % 4; - if (partial_cnt) { - copy -= partial_cnt; - if (copy_from_user(kdata + copy, base + copy, - partial_cnt)) - goto out_fault; - } - } - - if (copy) { - csum = csum_and_copy_from_user(base, kdata, copy, - csum, &err); - if (err) - goto out; - } - len -= copy + partial_cnt; - kdata += copy + partial_cnt; - iov++; - } - *csump = csum; -out: - return err; - -out_fault: - err = -EFAULT; - goto out; -} -EXPORT_SYMBOL(csum_partial_copy_fromiovecend); - -unsigned long iov_pages(const struct iovec *iov, int offset, - unsigned long nr_segs) -{ - unsigned long seg, base; - int pages = 0, len, size; - - while (nr_segs && (offset >= iov->iov_len)) { - offset -= iov->iov_len; - ++iov; - --nr_segs; - } - - for (seg = 0; seg < nr_segs; seg++) { - base = (unsigned long)iov[seg].iov_base + offset; - len = iov[seg].iov_len - offset; - size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT; - pages += size; - offset = 0; - } - - return pages; -} -EXPORT_SYMBOL(iov_pages); -- cgit v1.2.3 From aad9a1cec7dcd1d45809b64643fce37061b17788 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2014 14:49:01 -0500 Subject: vhost: switch vhost get_indirect() to iov_iter, kill memcpy_fromiovec() Cc: Michael S. Tsirkin Cc: kvm@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Al Viro --- drivers/vhost/vhost.c | 6 ++++-- include/linux/uio.h | 1 - lib/iovec.c | 25 ------------------------- 3 files changed, 4 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index cb807d0ea498..2ee28266fd07 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1125,6 +1125,7 @@ static int get_indirect(struct vhost_virtqueue *vq, struct vring_desc desc; unsigned int i = 0, count, found = 0; u32 len = vhost32_to_cpu(vq, indirect->len); + struct iov_iter from; int ret; /* Sanity check */ @@ -1142,6 +1143,7 @@ static int get_indirect(struct vhost_virtqueue *vq, vq_err(vq, "Translation failure %d in indirect.\n", ret); return ret; } + iov_iter_init(&from, READ, vq->indirect, ret, len); /* We will use the result as an address to read from, so most * architectures only need a compiler barrier here. 
*/ @@ -1164,8 +1166,8 @@ static int get_indirect(struct vhost_virtqueue *vq, i, count); return -EINVAL; } - if (unlikely(memcpy_fromiovec((unsigned char *)&desc, - vq->indirect, sizeof desc))) { + if (unlikely(copy_from_iter(&desc, sizeof(desc), &from) != + sizeof(desc))) { vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n", i, (size_t)vhost64_to_cpu(vq, indirect->addr) + i * sizeof desc); return -EINVAL; diff --git a/include/linux/uio.h b/include/linux/uio.h index 1c5e453f7ea9..af3439f4ebf2 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -135,7 +135,6 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); -int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, int offset, int len); int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata, diff --git a/lib/iovec.c b/lib/iovec.c index 2d99cb4a5006..4a90875c64ae 100644 --- a/lib/iovec.c +++ b/lib/iovec.c @@ -2,31 +2,6 @@ #include #include -/* - * Copy iovec to kernel. Returns -EFAULT on error. - * - * Note: this modifies the original iovec. - */ - -int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) -{ - while (len > 0) { - if (iov->iov_len) { - int copy = min_t(unsigned int, len, iov->iov_len); - if (copy_from_user(kdata, iov->iov_base, copy)) - return -EFAULT; - len -= copy; - kdata += copy; - iov->iov_base += copy; - iov->iov_len -= copy; - } - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_fromiovec); - /* * Copy kernel to iovec. Returns -EFAULT on error. */ -- cgit v1.2.3 From ba7438aed924133df54a60e4cd5499d359bcf2a8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2014 15:51:28 -0500 Subject: vhost: don't bother copying iovecs in handle_rx(), kill memcpy_toiovecend() Cc: Michael S. Tsirkin Cc: kvm@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Al Viro --- drivers/vhost/net.c | 82 +++++++++++++++-------------------------------------- include/linux/uio.h | 3 -- lib/iovec.c | 26 ----------------- 3 files changed, 23 insertions(+), 88 deletions(-) (limited to 'include/linux') diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index d86cc9bb9ea4..e022cc40303d 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -84,10 +84,6 @@ struct vhost_net_ubuf_ref { struct vhost_net_virtqueue { struct vhost_virtqueue vq; - /* hdr is used to store the virtio header. - * Since each iovec has >= 1 byte length, we never need more than - * header length entries to store the header. */ - struct iovec hdr[sizeof(struct virtio_net_hdr_mrg_rxbuf)]; size_t vhost_hlen; size_t sock_hlen; /* vhost zerocopy support fields below: */ @@ -235,44 +231,6 @@ static bool vhost_sock_zcopy(struct socket *sock) sock_flag(sock->sk, SOCK_ZEROCOPY); } -/* Pop first len bytes from iovec. Return number of segments used. */ -static int move_iovec_hdr(struct iovec *from, struct iovec *to, - size_t len, int iov_count) -{ - int seg = 0; - size_t size; - - while (len && seg < iov_count) { - size = min(from->iov_len, len); - to->iov_base = from->iov_base; - to->iov_len = size; - from->iov_len -= size; - from->iov_base += size; - len -= size; - ++from; - ++to; - ++seg; - } - return seg; -} -/* Copy iovec entries for len bytes from iovec. 
*/ -static void copy_iovec_hdr(const struct iovec *from, struct iovec *to, - size_t len, int iovcount) -{ - int seg = 0; - size_t size; - - while (len && seg < iovcount) { - size = min(from->iov_len, len); - to->iov_base = from->iov_base; - to->iov_len = size; - len -= size; - ++from; - ++to; - ++seg; - } -} - /* In case of DMA done not in order in lower device driver for some reason. * upend_idx is used to track end of used idx, done_idx is used to track head * of used idx. Once lower device DMA done contiguously, we will signal KVM @@ -570,9 +528,9 @@ static void handle_rx(struct vhost_net *net) .msg_controllen = 0, .msg_flags = MSG_DONTWAIT, }; - struct virtio_net_hdr_mrg_rxbuf hdr = { - .hdr.flags = 0, - .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE + struct virtio_net_hdr hdr = { + .flags = 0, + .gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t total_len = 0; int err, mergeable; @@ -580,6 +538,7 @@ static void handle_rx(struct vhost_net *net) size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; struct socket *sock; + struct iov_iter fixup; mutex_lock(&vq->mutex); sock = vq->private_data; @@ -624,14 +583,19 @@ static void handle_rx(struct vhost_net *net) break; } /* We don't need to be notified again. */ - if (unlikely((vhost_hlen))) - /* Skip header. TODO: support TSO. */ - move_iovec_hdr(vq->iov, nvq->hdr, vhost_hlen, in); - else - /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF: - * needed because recvmsg can modify msg_iov. */ - copy_iovec_hdr(vq->iov, nvq->hdr, sock_hlen, in); - iov_iter_init(&msg.msg_iter, READ, vq->iov, in, sock_len); + iov_iter_init(&msg.msg_iter, READ, vq->iov, in, vhost_len); + fixup = msg.msg_iter; + if (unlikely((vhost_hlen))) { + /* We will supply the header ourselves + * TODO: support TSO. + */ + iov_iter_advance(&msg.msg_iter, vhost_hlen); + } else { + /* It'll come from socket; we'll need to patch + * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF + */ + iov_iter_advance(&fixup, sizeof(hdr)); + } err = sock->ops->recvmsg(NULL, sock, &msg, sock_len, MSG_DONTWAIT | MSG_TRUNC); /* Userspace might have consumed the packet meanwhile: @@ -643,18 +607,18 @@ static void handle_rx(struct vhost_net *net) vhost_discard_vq_desc(vq, headcount); continue; } + /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ if (unlikely(vhost_hlen) && - memcpy_toiovecend(nvq->hdr, (unsigned char *)&hdr, 0, - vhost_hlen)) { + copy_to_iter(&hdr, sizeof(hdr), &fixup) != sizeof(hdr)) { vq_err(vq, "Unable to write vnet_hdr at addr %p\n", vq->iov->iov_base); break; } - /* TODO: Should check and handle checksum. */ + /* Supply (or replace) ->num_buffers if VIRTIO_NET_F_MRG_RXBUF + * TODO: Should check and handle checksum. 
+ */ if (likely(mergeable) && - memcpy_toiovecend(nvq->hdr, (unsigned char *)&headcount, - offsetof(typeof(hdr), num_buffers), - sizeof hdr.num_buffers)) { + copy_to_iter(&headcount, 2, &fixup) != 2) { vq_err(vq, "Failed num_buffers write"); vhost_discard_vq_desc(vq, headcount); break; diff --git a/include/linux/uio.h b/include/linux/uio.h index af3439f4ebf2..02bd8a92038a 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -137,7 +137,4 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct io int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, int offset, int len); -int memcpy_toiovecend(const struct iovec *v, unsigned char *kdata, - int offset, int len); - #endif diff --git a/lib/iovec.c b/lib/iovec.c index 4a90875c64ae..d8f17a9b1ccf 100644 --- a/lib/iovec.c +++ b/lib/iovec.c @@ -2,32 +2,6 @@ #include #include -/* - * Copy kernel to iovec. Returns -EFAULT on error. - */ - -int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, - int offset, int len) -{ - int copy; - for (; len > 0; ++iov) { - /* Skip over the finished iovecs */ - if (unlikely(offset >= iov->iov_len)) { - offset -= iov->iov_len; - continue; - } - copy = min_t(unsigned int, iov->iov_len - offset, len); - if (copy_to_user(iov->iov_base + offset, kdata, copy)) - return -EFAULT; - offset = 0; - kdata += copy; - len -= copy; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_toiovecend); - /* * Copy iovec to kernel. Returns -EFAULT on error. */ -- cgit v1.2.3 From 57dd8a0735aabff4862025cf64ad94da3d80e620 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 10 Dec 2014 16:03:43 -0500 Subject: vhost: vhost_scsi_handle_vq() should just use copy_from_user() it has just verified that it asks no more than the length of the first segment of iovec. And with that the last user of stuff in lib/iovec.c is gone. RIP. Cc: Michael S. Tsirkin Cc: Nicholas A. 
Bellinger Cc: kvm@vger.kernel.org Cc: virtualization@lists.linux-foundation.org Signed-off-by: Al Viro --- drivers/vhost/scsi.c | 2 +- include/linux/uio.h | 2 -- lib/Makefile | 2 +- lib/iovec.c | 36 ------------------------------------ 4 files changed, 2 insertions(+), 40 deletions(-) delete mode 100644 lib/iovec.c (limited to 'include/linux') diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index d695b1673ae5..dc78d87e0fc2 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1079,7 +1079,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) req_size, vq->iov[0].iov_len); break; } - ret = memcpy_fromiovecend(req, &vq->iov[0], 0, req_size); + ret = copy_from_user(req, vq->iov[0].iov_base, req_size); if (unlikely(ret)) { vq_err(vq, "Faulted on virtio_scsi_cmd_req\n"); break; diff --git a/include/linux/uio.h b/include/linux/uio.h index 02bd8a92038a..3e0cb4ea3905 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -135,6 +135,4 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); -int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, - int offset, int len); #endif diff --git a/lib/Makefile b/lib/Makefile index 3c3b30b9e020..1071d06398c3 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -24,7 +24,7 @@ obj-y += lockref.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ - gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \ + gcd.o lcm.o list_sort.o uuid.o flex_array.o clz_ctz.o \ bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o obj-y += string_helpers.o diff --git a/lib/iovec.c b/lib/iovec.c deleted file mode 100644 index d8f17a9b1ccf..000000000000 --- a/lib/iovec.c +++ /dev/null @@ -1,36 +0,0 @@ -#include -#include -#include - -/* - * Copy iovec to kernel. Returns -EFAULT on error. - */ - -int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov, - int offset, int len) -{ - /* No data? Done! */ - if (len == 0) - return 0; - - /* Skip over the finished iovecs */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - } - - while (len > 0) { - u8 __user *base = iov->iov_base + offset; - int copy = min_t(unsigned int, len, iov->iov_len - offset); - - offset = 0; - if (copy_from_user(kdata, base, copy)) - return -EFAULT; - len -= copy; - kdata += copy; - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_fromiovecend); -- cgit v1.2.3 From 44fc0e5eec00db5fba748803c95920098089c4cc Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Fri, 30 Jan 2015 13:14:36 +0200 Subject: sched/wait: Introduce wait_on_bit_timeout() Add a new wait_on_bit_timeout() helper, basically the same as wait_on_bit() except that it also takes a 'timeout' parameter. All the building blocks like bit_wait_timeout() and out_of_line_wait_on_bit_timeout() are already in place so the addition is rather simple. 
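A hedged usage sketch (not taken from the patch): waiting up to one second for a driver-defined busy bit to clear. The flag word, the EXAMPLE_BUSY bit and the caller are hypothetical; wait_on_bit_timeout() and its return convention are as documented in the diff below.

#define EXAMPLE_BUSY	0	/* hypothetical flag bit */

static int example_wait_for_idle(unsigned long *flags)
{
	/*
	 * Zero means the bit cleared in time; non-zero means the
	 * timeout elapsed or the sleep was interrupted by a signal.
	 */
	if (wait_on_bit_timeout(flags, EXAMPLE_BUSY, TASK_INTERRUPTIBLE,
				msecs_to_jiffies(1000)))
		return -ETIMEDOUT;
	return 0;
}
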
Signed-off-by: Johan Hedberg Signed-off-by: Peter Zijlstra (Intel) Cc: davem@davemloft.net Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1422616476-2917-2-git-send-email-johan.hedberg@gmail.com Signed-off-by: Ingo Molnar --- include/linux/wait.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 37423e0e1379..537d58eea8a0 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -989,6 +989,32 @@ wait_on_bit_io(void *word, int bit, unsigned mode) mode); } +/** + * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses + * @word: the word being waited on, a kernel virtual address + * @bit: the bit of the word being waited on + * @mode: the task state to sleep in + * @timeout: timeout, in jiffies + * + * Use the standard hashed waitqueue table to wait for a bit + * to be cleared. This is similar to wait_on_bit(), except also takes a + * timeout parameter. + * + * Returned value will be zero if the bit was cleared before the + * @timeout elapsed, or non-zero if the @timeout elapsed or process + * received a signal and the mode permitted wakeup on that signal. + */ +static inline int +wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout) +{ + might_sleep(); + if (!test_bit(bit, word)) + return 0; + return out_of_line_wait_on_bit_timeout(word, bit, + bit_wait_timeout, + mode, timeout); +} + /** * wait_on_bit_action - wait for a bit to be cleared + * @word: the word being waited on, a kernel virtual address -- cgit v1.2.3 From 2fde4f94e0a9531251e706fa57131b51b0df042e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 7 Jan 2015 15:01:54 +0000 Subject: perf: Decouple unthrottling and rotating Currently the adjustments made as part of perf_event_task_tick() use the percpu rotation lists to iterate over any active PMU contexts, but these are not used by the context rotation code, having been replaced by separate (per-context) hrtimer callbacks. However, some manipulation of the rotation lists (i.e. removal of contexts) has remained in perf_rotate_context(). This leads to the following issues: * Contexts are not always removed from the rotation lists. Removal of PMUs which have been placed in rotation lists, but have not been removed by a hrtimer callback can result in corruption of the rotation lists (when memory backing the context is freed). This has been observed to result in hangs when PMU drivers built as modules are inserted and removed around the creation of events for said PMUs. * Contexts which do not require rotation may be removed from the rotation lists as a result of a hrtimer, and will not be considered by the unthrottling code in perf_event_task_tick. This patch fixes the issue by updating the rotation list when events are scheduled in/out, ensuring that each rotation list stays in sync with the HW state. As each event holds a refcount on the module of its PMU, this ensures that when a PMU module is unloaded none of its CPU contexts can be in a rotation list. By maintaining a list of perf_event_contexts rather than perf_event_cpu_contexts, we don't need separate paths to handle the cpu and task contexts, which also makes the code a little simpler. As the rotation_list variables are not used for rotation, these are renamed to active_ctx_list, which better matches their current function. perf_pmu_rotate_{start,stop} are renamed to perf_pmu_ctx_{activate,deactivate}.
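The heart of the fix, condensed into a sketch (distilled from the diff below, with error paths and locking elided): a context joins the per-cpu active list when its first event schedules in and leaves it when its last event schedules out, so list membership always mirrors the hardware state.

/* sketch only: the real code lives in event_sched_in()/_out() below */
static void example_sched_in(struct perf_event_context *ctx)
{
	if (!ctx->nr_active++)		/* 0 -> 1: first active event */
		perf_event_ctx_activate(ctx);
}

static void example_sched_out(struct perf_event_context *ctx)
{
	if (!--ctx->nr_active)		/* 1 -> 0: last active event */
		perf_event_ctx_deactivate(ctx);
}
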
Reported-by: Johannes Jensen Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Will Deacon Cc: Arnaldo Carvalho de Melo Cc: Fengguang Wu Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150129134511.GR17721@leverpostej Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- kernel/events/core.c | 81 +++++++++++++++++----------------------------- 2 files changed, 30 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 216653466a67..5cad0e6f3552 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -469,6 +469,7 @@ struct perf_event_context { */ struct mutex mutex; + struct list_head active_ctx_list; struct list_head pinned_groups; struct list_head flexible_groups; struct list_head event_list; @@ -519,7 +520,6 @@ struct perf_cpu_context { int exclusive; struct hrtimer hrtimer; ktime_t hrtimer_interval; - struct list_head rotation_list; struct pmu *unique_pmu; struct perf_cgroup *cgrp; }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 37cc20e8aa3b..7f2fbb8b5069 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -872,22 +872,32 @@ void perf_pmu_enable(struct pmu *pmu) pmu->pmu_enable(pmu); } -static DEFINE_PER_CPU(struct list_head, rotation_list); +static DEFINE_PER_CPU(struct list_head, active_ctx_list); /* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. + * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and + * perf_event_task_tick() are fully serialized because they're strictly cpu + * affine and perf_event_ctx{activate,deactivate} are called with IRQs + * disabled, while perf_event_task_tick is called from IRQ context. 
*/ -static void perf_pmu_rotate_start(struct pmu *pmu) +static void perf_event_ctx_activate(struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - struct list_head *head = this_cpu_ptr(&rotation_list); + struct list_head *head = this_cpu_ptr(&active_ctx_list); WARN_ON(!irqs_disabled()); - if (list_empty(&cpuctx->rotation_list)) - list_add(&cpuctx->rotation_list, head); + WARN_ON(!list_empty(&ctx->active_ctx_list)); + + list_add(&ctx->active_ctx_list, head); +} + +static void perf_event_ctx_deactivate(struct perf_event_context *ctx) +{ + WARN_ON(!irqs_disabled()); + + WARN_ON(list_empty(&ctx->active_ctx_list)); + + list_del_init(&ctx->active_ctx_list); } static void get_ctx(struct perf_event_context *ctx) @@ -1233,8 +1243,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_branch_stack++; list_add_rcu(&event->event_entry, &ctx->event_list); - if (!ctx->nr_events) - perf_pmu_rotate_start(ctx->pmu); ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; @@ -1561,7 +1569,8 @@ event_sched_out(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu--; - ctx->nr_active--; + if (!--ctx->nr_active) + perf_event_ctx_deactivate(ctx); if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq--; if (event->attr.exclusive || !cpuctx->active_oncpu) @@ -1885,7 +1894,8 @@ event_sched_in(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu++; - ctx->nr_active++; + if (!ctx->nr_active++) + perf_event_ctx_activate(ctx); if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq++; @@ -2742,12 +2752,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); - - /* - * Since these rotations are per-cpu, we need to ensure the - * cpu-context we got scheduled on is actually rotating. - */ - perf_pmu_rotate_start(ctx->pmu); } /* @@ -3035,25 +3039,18 @@ static void rotate_ctx(struct perf_event_context *ctx) list_rotate_left(&ctx->flexible_groups); } -/* - * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized - * because they're strictly cpu affine and rotate_start is called with IRQs - * disabled, while rotate_context is called from IRQ context. 
- */ static int perf_rotate_context(struct perf_cpu_context *cpuctx) { struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1; + int rotate = 0; if (cpuctx->ctx.nr_events) { - remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; } ctx = cpuctx->task_ctx; if (ctx && ctx->nr_events) { - remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; } @@ -3077,8 +3074,6 @@ static int perf_rotate_context(struct perf_cpu_context *cpuctx) perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); done: - if (remove) - list_del_init(&cpuctx->rotation_list); return rotate; } @@ -3096,9 +3091,8 @@ bool perf_event_can_stop_tick(void) void perf_event_task_tick(void) { - struct list_head *head = this_cpu_ptr(&rotation_list); - struct perf_cpu_context *cpuctx, *tmp; - struct perf_event_context *ctx; + struct list_head *head = this_cpu_ptr(&active_ctx_list); + struct perf_event_context *ctx, *tmp; int throttled; WARN_ON(!irqs_disabled()); @@ -3106,14 +3100,8 @@ void perf_event_task_tick(void) __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); - list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { - ctx = &cpuctx->ctx; + list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) perf_adjust_freq_unthr_context(ctx, throttled); - - ctx = cpuctx->task_ctx; - if (ctx) - perf_adjust_freq_unthr_context(ctx, throttled); - } } static int event_enable_on_exec(struct perf_event *event, @@ -3272,6 +3260,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->active_ctx_list); INIT_LIST_HEAD(&ctx->pinned_groups); INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); @@ -6954,7 +6943,6 @@ skip_type: __perf_cpu_hrtimer_init(cpuctx, cpu); - INIT_LIST_HEAD(&cpuctx->rotation_list); cpuctx->unique_pmu = pmu; } @@ -8384,7 +8372,7 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); - INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); + INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); } } @@ -8405,22 +8393,11 @@ static void perf_event_init_cpu(int cpu) } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC -static void perf_pmu_rotate_stop(struct pmu *pmu) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - WARN_ON(!irqs_disabled()); - - list_del_init(&cpuctx->rotation_list); -} - static void __perf_event_exit_context(void *__info) { struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; - perf_pmu_rotate_stop(ctx->pmu); - rcu_read_lock(); list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) __perf_remove_from_context(&re); -- cgit v1.2.3 From 12cf89b550d13eb7cb86ef182bd6c04345a33a1f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 3 Feb 2015 16:45:18 -0600 Subject: livepatch: rename config to CONFIG_LIVEPATCH Rename CONFIG_LIVE_PATCHING to CONFIG_LIVEPATCH to make the naming of the config and the code more consistent. 
Signed-off-by: Josh Poimboeuf Reviewed-by: Jingoo Han Signed-off-by: Jiri Kosina --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/livepatch.h | 4 ++-- arch/x86/kernel/Makefile | 2 +- include/linux/livepatch.h | 4 ++-- kernel/livepatch/Kconfig | 6 +++--- kernel/livepatch/Makefile | 2 +- samples/Kconfig | 4 ++-- samples/livepatch/Makefile | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 29b095231276..11970b076862 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,7 +17,7 @@ config X86_64 depends on 64BIT select X86_DEV_DMA_OPS select ARCH_USE_CMPXCHG_LOCKREF - select HAVE_LIVE_PATCHING + select HAVE_LIVEPATCH ### Arch settings config X86 diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 26e58134c8cb..a455a53d789a 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -24,7 +24,7 @@ #include #include -#ifdef CONFIG_LIVE_PATCHING +#ifdef CONFIG_LIVEPATCH static inline int klp_check_compiler_support(void) { #ifndef CC_USING_FENTRY @@ -40,7 +40,7 @@ static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } #else -#error Live patching support is disabled; check CONFIG_LIVE_PATCHING +#error Live patching support is disabled; check CONFIG_LIVEPATCH #endif #endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 316b34e74c15..732223496968 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -63,7 +63,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_LIVE_PATCHING) += livepatch.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index f14c6fb262b4..95023fd8b00d 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -24,7 +24,7 @@ #include #include -#if IS_ENABLED(CONFIG_LIVE_PATCHING) +#if IS_ENABLED(CONFIG_LIVEPATCH) #include @@ -128,6 +128,6 @@ extern int klp_unregister_patch(struct klp_patch *); extern int klp_enable_patch(struct klp_patch *); extern int klp_disable_patch(struct klp_patch *); -#endif /* CONFIG_LIVE_PATCHING */ +#endif /* CONFIG_LIVEPATCH */ #endif /* _LINUX_LIVEPATCH_H_ */ diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 347ee2221137..045022557936 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -1,15 +1,15 @@ -config HAVE_LIVE_PATCHING +config HAVE_LIVEPATCH bool help Arch supports kernel live patching -config LIVE_PATCHING +config LIVEPATCH bool "Kernel Live Patching" depends on DYNAMIC_FTRACE_WITH_REGS depends on MODULES depends on SYSFS depends on KALLSYMS_ALL - depends on HAVE_LIVE_PATCHING + depends on HAVE_LIVEPATCH help Say Y here if you want to support kernel live patching. 
This option has no runtime impact until a kernel "patch" diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index 7c1f00861428..e8780c0901d9 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,3 @@ -obj-$(CONFIG_LIVE_PATCHING) += livepatch.o +obj-$(CONFIG_LIVEPATCH) += livepatch.o livepatch-objs := core.o diff --git a/samples/Kconfig b/samples/Kconfig index 0aed20df5f0b..224ebb46bed5 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -63,9 +63,9 @@ config SAMPLE_RPMSG_CLIENT to communicate with an AMP-configured remote processor over the rpmsg bus. -config SAMPLE_LIVE_PATCHING +config SAMPLE_LIVEPATCH tristate "Build live patching sample -- loadable modules only" - depends on LIVE_PATCHING && m + depends on LIVEPATCH && m help Builds a sample live patch that replaces the procfs handler for /proc/cmdline to print "this has been live patched". diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile index 7f1cdc131a02..10319d7ea0b1 100644 --- a/samples/livepatch/Makefile +++ b/samples/livepatch/Makefile @@ -1 +1 @@ -obj-$(CONFIG_SAMPLE_LIVE_PATCHING) += livepatch-sample.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o -- cgit v1.2.3 From 509102760da3a21831e763560ba4715760e3fbda Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 4 Feb 2015 11:45:28 +0100 Subject: regulator: Fix build breakage on !REGULATOR Add missing stubs for regulator_suspend_prepare() and regulator_suspend_finish() to fix exynos_defconfig build without REGULATOR: arch/arm/mach-exynos/built-in.o: In function `exynos_suspend_finish': arch/arm/mach-exynos/suspend.c:537: undefined reference to `regulator_suspend_finish' arch/arm/mach-exynos/built-in.o: In function `exynos_suspend_prepare': arch/arm/mach-exynos/suspend.c:520: undefined reference to `regulator_suspend_prepare' make: *** [vmlinux] Error 1 Signed-off-by: Krzysztof Kozlowski Reported-by: Joerg Roedel Reported-by: Marek Szyprowski Signed-off-by: Mark Brown --- include/linux/regulator/machine.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index 0b08d05d470b..b07562e082c4 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -191,15 +191,22 @@ struct regulator_init_data { void *driver_data; /* core does not touch this */ }; -int regulator_suspend_prepare(suspend_state_t state); -int regulator_suspend_finish(void); - #ifdef CONFIG_REGULATOR void regulator_has_full_constraints(void); +int regulator_suspend_prepare(suspend_state_t state); +int regulator_suspend_finish(void); #else static inline void regulator_has_full_constraints(void) { } +static inline int regulator_suspend_prepare(suspend_state_t state) +{ + return 0; +} +static inline int regulator_suspend_finish(void) +{ + return 0; +} #endif #endif -- cgit v1.2.3 From 9064bf3c3cdf92f6b9ac6634ff570dedf0035992 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 3 Feb 2015 17:03:35 -0600 Subject: spi: spi-pxa2xx: only include mach/dma.h for legacy DMA Move the include of mach/dma.h to the legacy PXA DMA code where it is used. This enables building spi-pxa2xx on ARM64. 
Signed-off-by: Rob Herring Signed-off-by: Mark Brown --- drivers/spi/spi-pxa2xx-pxadma.c | 1 + include/linux/spi/pxa2xx_spi.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/spi/spi-pxa2xx-pxadma.c b/drivers/spi/spi-pxa2xx-pxadma.c index 6c82d496b58e..3fecaaa5b0a5 100644 --- a/drivers/spi/spi-pxa2xx-pxadma.c +++ b/drivers/spi/spi-pxa2xx-pxadma.c @@ -25,6 +25,7 @@ #include #include +#include #include "spi-pxa2xx.h" #define DMA_INT_MASK (DCSR_ENDINTR | DCSR_STARTINTR | DCSR_BUSERR) diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h index d5a316550177..46d8fa942631 100644 --- a/include/linux/spi/pxa2xx_spi.h +++ b/include/linux/spi/pxa2xx_spi.h @@ -57,7 +57,6 @@ struct pxa2xx_spi_chip { #if defined(CONFIG_ARCH_PXA) || defined(CONFIG_ARCH_MMP) #include -#include extern void pxa2xx_set_spi_info(unsigned id, struct pxa2xx_spi_master *info); -- cgit v1.2.3 From 2bd82484bb4c5db1d5dc983ac7c409b2782e0154 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 3 Feb 2015 23:48:24 -0800 Subject: xps: fix xps for stacked devices A typical qdisc setup is the following : bond0 : bonding device, using HTB hierarchy eth1/eth2 : slaves, multiqueue NIC, using MQ + FQ qdisc XPS allows to spread packets on specific tx queues, based on the cpu doing the send. Problem is that dequeues from bond0 qdisc can happen on random cpus, due to the fact that qdisc_run() can dequeue a batch of packets. CPUA -> queue packet P1 on bond0 qdisc, P1->ooo_okay=1 CPUA -> queue packet P2 on bond0 qdisc, P2->ooo_okay=0 CPUB -> dequeue packet P1 from bond0 enqueue packet on eth1/eth2 CPUC -> dequeue packet P2 from bond0 enqueue packet on eth1/eth2 using sk cache (ooo_okay is 0) get_xps_queue() then might select wrong queue for P1, since current cpu might be different than CPUA. P2 might be sent on the old queue (stored in sk->sk_tx_queue_mapping), if CPUC runs a bit faster (or CPUB spins a bit on qdisc lock) Effect of this bug is TCP reorders, and more generally not optimal TX queue placement. (A victim bulk flow can be migrated to the wrong TX queue for a while) To fix this, we have to record sender cpu number the first time dev_queue_xmit() is called for one tx skb. We can union napi_id (used on receive path) and sender_cpu, granted we clear sender_cpu in skb_scrub_packet() (credit to Willem for this union idea) Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Nandita Dukkipati Cc: Yuchung Cheng Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 7 +++++-- net/core/flow_dissector.c | 7 ++++++- net/core/skbuff.c | 4 ++++ 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 85ab7d72b54c..2748ff639144 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -626,8 +626,11 @@ struct sk_buff { __u32 hash; __be16 vlan_proto; __u16 vlan_tci; -#ifdef CONFIG_NET_RX_BUSY_POLL - unsigned int napi_id; +#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) + union { + unsigned int napi_id; + unsigned int sender_cpu; + }; #endif #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index beb83d1ac1c6..2c35c02a931e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -422,7 +422,7 @@ static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) dev_maps = rcu_dereference(dev->xps_maps); if (dev_maps) { map = rcu_dereference( - dev_maps->cpu_map[raw_smp_processor_id()]); + dev_maps->cpu_map[skb->sender_cpu - 1]); if (map) { if (map->len == 1) queue_index = map->queues[0]; @@ -468,6 +468,11 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, { int queue_index = 0; +#ifdef CONFIG_XPS + if (skb->sender_cpu == 0) + skb->sender_cpu = raw_smp_processor_id() + 1; +#endif + if (dev->real_num_tx_queues != 1) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a5bff2767f15..88c613eab142 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -825,6 +825,9 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #ifdef CONFIG_NET_RX_BUSY_POLL CHECK_SKB_FIELD(napi_id); #endif +#ifdef CONFIG_XPS + CHECK_SKB_FIELD(sender_cpu); +#endif #ifdef CONFIG_NET_SCHED CHECK_SKB_FIELD(tc_index); #ifdef CONFIG_NET_CLS_ACT @@ -4169,6 +4172,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->ignore_df = 0; skb_dst_drop(skb); skb->mark = 0; + skb->sender_cpu = 0; skb_init_secmark(skb); secpath_reset(skb); nf_reset(skb); -- cgit v1.2.3 From dcdc8994697faa789669c3fdaca1a8bc27a8f356 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 2 Feb 2015 16:07:34 -0800 Subject: net: add skb functions to process remote checksum offload This patch adds skb_remcsum_process and skb_gro_remcsum_process to perform the appropriate adjustments to the skb when receiving remote checksum offload. Updated vxlan and gue to use these functions. Tested: Ran TCP_RR and TCP_STREAM netperf for VXLAN and GUE, did not see any change in performance. Signed-off-by: Tom Herbert Signed-off-by: David S. 
Miller --- drivers/net/vxlan.c | 18 ++---------------- include/linux/netdevice.h | 15 +++++++++++++++ include/linux/skbuff.h | 21 +++++++++++++++++++++ net/ipv4/fou.c | 18 ++---------------- 4 files changed, 40 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 31bac2a21ce3..c184717e8b28 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -558,7 +558,6 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, u32 data) { size_t start, offset, plen; - __wsum delta; if (skb->remcsum_offload) return vh; @@ -580,12 +579,7 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, return NULL; } - delta = remcsum_adjust((void *)vh + hdrlen, - NAPI_GRO_CB(skb)->csum, start, offset); - - /* Adjust skb->csum since we changed the packet */ - skb->csum = csum_add(skb->csum, delta); - NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); + skb_gro_remcsum_process(skb, (void *)vh + hdrlen, start, offset); skb->remcsum_offload = 1; @@ -1159,7 +1153,6 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, size_t hdrlen, u32 data) { size_t start, offset, plen; - __wsum delta; if (skb->remcsum_offload) { /* Already processed in GRO path */ @@ -1179,14 +1172,7 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, vh = (struct vxlanhdr *)(udp_hdr(skb) + 1); - if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) - __skb_checksum_complete(skb); - - delta = remcsum_adjust((void *)vh + hdrlen, - skb->csum, start, offset); - - /* Adjust skb->csum since we changed the packet */ - skb->csum = csum_add(skb->csum, delta); + skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset); return vh; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 16251e96e6aa..1347ac50d2af 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2318,6 +2318,21 @@ do { \ compute_pseudo(skb, proto)); \ } while (0) +static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, + int start, int offset) +{ + __wsum delta; + + BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); + + delta = remcsum_adjust(ptr, NAPI_GRO_CB(skb)->csum, start, offset); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); + NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); +} + + static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2748ff639144..5405dfe02572 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3099,6 +3099,27 @@ do { \ compute_pseudo(skb, proto)); \ } while (0) +/* Update the skb and packet to reflect the remote checksum offload operation. + * When called, ptr indicates the starting point for skb->csum when + * ip_summed is CHECKSUM_COMPLETE. If we need to compute the complete checksum + * here, skb_postpull_rcsum is done so that skb->csum starts at ptr.
+ */ +static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr, + int start, int offset) +{ + __wsum delta; + + if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) { + __skb_checksum_complete(skb); + skb_postpull_rcsum(skb, skb->data, ptr - (void *)skb->data); + } + + delta = remcsum_adjust(ptr, skb->csum, start, offset); + + /* Adjust skb->csum since we changed the packet */ + skb->csum = csum_add(skb->csum, delta); +} + #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) void nf_conntrack_destroy(struct nf_conntrack *nfct); static inline void nf_conntrack_put(struct nf_conntrack *nfct) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 3bc0cf07661c..92ddea1e6457 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -70,7 +70,6 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); - __wsum delta; if (skb->remcsum_offload) { /* Already processed in GRO path */ @@ -82,14 +81,7 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, return NULL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; - if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) - __skb_checksum_complete(skb); - - delta = remcsum_adjust((void *)guehdr + hdrlen, - skb->csum, start, offset); - - /* Adjust skb->csum since we changed the packet */ - skb->csum = csum_add(skb->csum, delta); + skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset); return guehdr; } @@ -228,7 +220,6 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); - __wsum delta; if (skb->remcsum_offload) return guehdr; @@ -243,12 +234,7 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, return NULL; } - delta = remcsum_adjust((void *)guehdr + hdrlen, - NAPI_GRO_CB(skb)->csum, start, offset); - - /* Adjust skb->csum since we changed the packet */ - skb->csum = csum_add(skb->csum, delta); + skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset); skb->remcsum_offload = 1; -- cgit v1.2.3 From 61bd3857ff2c7daf756d49b41e6277bbdaa8f789 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 3 Feb 2015 16:48:29 +0200 Subject: net/core: Add event for a change in slave state Add an event which provides an indication of a change in the state of a bonding slave. The event handler should cast the pointer to the appropriate type (struct netdev_bonding_info) in order to get the full info about the slave. Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: David S.
Miller --- include/linux/netdevice.h | 15 +++++++++++++++ net/core/dev.c | 20 ++++++++++++++++++++ net/core/rtnetlink.c | 1 + 3 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1347ac50d2af..ce784d5018e0 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -51,6 +51,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -2056,6 +2057,7 @@ struct pcpu_sw_netstats { #define NETDEV_RESEND_IGMP 0x0016 #define NETDEV_PRECHANGEMTU 0x0017 /* notify before mtu change happened */ #define NETDEV_CHANGEINFODATA 0x0018 +#define NETDEV_BONDING_INFO 0x0019 int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); @@ -3494,6 +3496,19 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features); +struct netdev_bonding_info { + ifslave slave; + ifbond master; +}; + +struct netdev_notifier_bonding_info { + struct netdev_notifier_info info; /* must be first */ + struct netdev_bonding_info bonding_info; +}; + +void netdev_bonding_info_change(struct net_device *dev, + struct netdev_bonding_info *bonding_info); + static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features) { diff --git a/net/core/dev.c b/net/core/dev.c index 1d564d68e31a..ede0b161b115 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5355,6 +5355,26 @@ void netdev_upper_dev_unlink(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_dev_unlink); +/** + * netdev_bonding_info_change - Dispatch event about slave change + * @dev: device + * @netdev_bonding_info: info to dispatch + * + * Send NETDEV_BONDING_INFO to netdev notifiers with info. + * The caller must hold the RTNL lock. + */ +void netdev_bonding_info_change(struct net_device *dev, + struct netdev_bonding_info *bonding_info) +{ + struct netdev_notifier_bonding_info info; + + memcpy(&info.bonding_info, bonding_info, + sizeof(struct netdev_bonding_info)); + call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, + &info.info); +} +EXPORT_SYMBOL(netdev_bonding_info_change); + void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 673cb4c6f391..4cd5e350d129 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3180,6 +3180,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_UNREGISTER_FINAL: case NETDEV_RELEASE: case NETDEV_JOIN: + case NETDEV_BONDING_INFO: break; default: rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); -- cgit v1.2.3 From 59e14e325066be49b49b6c2503337c69a9ee29fc Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 3 Feb 2015 16:48:32 +0200 Subject: net/mlx4_core: Port aggregation low level interface Implement the hardware interface required for port aggregation. 1. Disable RX port check on receive - don't perform a validity check that matches the QP's port to the port where the packet was received. 2. Virtual to physical port remap - configure virtual to physical port mapping. This provides port remap capability for virtual functions. Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: David S.
Miller --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 9 +++++ drivers/net/ethernet/mellanox/mlx4/fw.c | 56 +++++++++++++++++++++++++++++--- include/linux/mlx4/cmd.h | 7 ++++ include/linux/mlx4/device.h | 10 +++++- include/linux/mlx4/qp.h | 1 + 5 files changed, 77 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 154effbfd8be..a681d7c0bb9f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1583,6 +1583,15 @@ static struct mlx4_cmd_info cmd_info[] = { .verify = NULL, .wrapper = mlx4_CMD_EPERM_wrapper }, + { + .opcode = MLX4_CMD_VIRT_PORT_MAP, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper + }, }; static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave, diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index dbabfae3a3de..4b08a393ebcb 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -142,7 +142,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [17] = "Asymmetric EQs support", [18] = "More than 80 VFs support", [19] = "Performance optimized for limited rule configuration flow steering support", - [20] = "Recoverable error events support" + [20] = "Recoverable error events support", + [21] = "Port Remap support" }; int i; @@ -863,6 +864,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE; MLX4_GET(dev_cap->bmme_flags, outbox, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + if (dev_cap->bmme_flags & MLX4_FLAG_PORT_REMAP) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_REMAP; MLX4_GET(field, outbox, QUERY_DEV_CAP_CONFIG_DEV_OFFSET); if (field & 0x20) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_CONFIG_DEV; @@ -1120,9 +1123,10 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, field &= 0x7f; MLX4_PUT(outbox->buf, field, QUERY_DEV_CAP_BF_OFFSET); - /* For guests, disable mw type 2 */ + /* For guests, disable mw type 2 and port remap*/ MLX4_GET(bmme_flags, outbox->buf, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); bmme_flags &= ~MLX4_BMME_FLAG_TYPE_2_WIN; + bmme_flags &= ~MLX4_FLAG_PORT_REMAP; MLX4_PUT(outbox->buf, bmme_flags, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); /* turn off device-managed steering capability if not enabled */ @@ -2100,13 +2104,16 @@ struct mlx4_config_dev { __be32 rsvd1[3]; __be16 vxlan_udp_dport; __be16 rsvd2; - __be32 rsvd3[27]; - __be16 rsvd4; - u8 rsvd5; + __be32 rsvd3; + __be32 roce_flags; + __be32 rsvd4[25]; + __be16 rsvd5; + u8 rsvd6; u8 rx_checksum_val; }; #define MLX4_VXLAN_UDP_DPORT (1 << 0) +#define MLX4_DISABLE_RX_PORT BIT(18) static int mlx4_CONFIG_DEV_set(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev) { @@ -2209,6 +2216,45 @@ int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port) } EXPORT_SYMBOL_GPL(mlx4_config_vxlan_port); +#define CONFIG_DISABLE_RX_PORT BIT(15) +int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis) +{ + struct mlx4_config_dev config_dev; + + memset(&config_dev, 0, sizeof(config_dev)); + config_dev.update_flags = cpu_to_be32(MLX4_DISABLE_RX_PORT); + if (dis) + config_dev.roce_flags = + cpu_to_be32(CONFIG_DISABLE_RX_PORT); + + return mlx4_CONFIG_DEV_set(dev, &config_dev); +} + +int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2) +{ + struct 
mlx4_cmd_mailbox *mailbox; + struct { + __be32 v_port1; + __be32 v_port2; + } *v2p; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return -ENOMEM; + + v2p = mailbox->buf; + v2p->v_port1 = cpu_to_be32(port1); + v2p->v_port2 = cpu_to_be32(port2); + + err = mlx4_cmd(dev, mailbox->dma, 0, + MLX4_SET_PORT_VIRT2PHY, MLX4_CMD_VIRT_PORT_MAP, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + int mlx4_SET_ICM_SIZE(struct mlx4_dev *dev, u64 icm_size, u64 *aux_pages) { diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index ae95adc78509..7b6d4e9ff603 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -71,6 +71,7 @@ enum { /*master notify fw on finish for slave's flr*/ MLX4_CMD_INFORM_FLR_DONE = 0x5b, + MLX4_CMD_VIRT_PORT_MAP = 0x5c, MLX4_CMD_GET_OP_REQ = 0x59, /* TPT commands */ @@ -170,6 +171,12 @@ enum { MLX4_CMD_TIME_CLASS_C = 60000, }; +enum { + /* virtual to physical port mapping opcode modifiers */ + MLX4_GET_PORT_VIRT2PHY = 0x0, + MLX4_SET_PORT_VIRT2PHY = 0x1, +}; + enum { MLX4_MAILBOX_SIZE = 4096, MLX4_ACCESS_MEM_ALIGN = 256, diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index c95d659a39f2..d9afd99dde39 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -201,7 +201,8 @@ enum { MLX4_DEV_CAP_FLAG2_SYS_EQS = 1LL << 17, MLX4_DEV_CAP_FLAG2_80_VFS = 1LL << 18, MLX4_DEV_CAP_FLAG2_FS_A0 = 1LL << 19, - MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT = 1LL << 20 + MLX4_DEV_CAP_FLAG2_RECOVERABLE_ERROR_EVENT = 1LL << 20, + MLX4_DEV_CAP_FLAG2_PORT_REMAP = 1LL << 21 }; enum { @@ -253,9 +254,14 @@ enum { MLX4_BMME_FLAG_TYPE_2_WIN = 1 << 9, MLX4_BMME_FLAG_RESERVED_LKEY = 1 << 10, MLX4_BMME_FLAG_FAST_REG_WR = 1 << 11, + MLX4_BMME_FLAG_PORT_REMAP = 1 << 24, MLX4_BMME_FLAG_VSD_INIT2RTR = 1 << 28, }; +enum { + MLX4_FLAG_PORT_REMAP = MLX4_BMME_FLAG_PORT_REMAP +}; + enum mlx4_event { MLX4_EVENT_TYPE_COMP = 0x00, MLX4_EVENT_TYPE_PATH_MIG = 0x01, @@ -1378,6 +1384,8 @@ int mlx4_phys_to_slave_port(struct mlx4_dev *dev, int slave, int port); int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port); int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port); +int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis); +int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2); int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 467ccdf94c98..2bbc62aa818a 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -96,6 +96,7 @@ enum { MLX4_QP_BIT_RRE = 1 << 15, MLX4_QP_BIT_RWE = 1 << 14, MLX4_QP_BIT_RAE = 1 << 13, + MLX4_QP_BIT_FPP = 1 << 3, MLX4_QP_BIT_RIC = 1 << 4, }; -- cgit v1.2.3 From 53f33ae295a5098f12218da1400f55ad7df7447c Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 3 Feb 2015 16:48:33 +0200 Subject: net/mlx4_core: Port aggregation upper layer interface Supply interface functions to bond and unbond ports for mlx4 internal interfaces. An example of such an interface is the one registered by the mlx4 IB driver under RoCE. There are: 1. Functions to go in/out to/from bonded mode. 2. A function to remap virtual ports to physical ports. The bond_mutex prevents simultaneous access to data that keeps the status of the device in bonded mode.
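As a hedged illustration of how a consumer such as the mlx4 IB driver might use these functions; the callbacks, protocol value and error handling here are assumptions, while mlx4_bond(), mlx4_unbond(), mlx4_port_map_set(), struct mlx4_port_map and MLX4_INTFF_BONDING come from this patch:

static struct mlx4_interface my_bonding_aware_intf = {
	.add      = my_add,			/* hypothetical callbacks */
	.remove   = my_remove,
	.protocol = MLX4_PROT_IB_IPV6,		/* assumed protocol value */
	.flags    = MLX4_INTFF_BONDING,		/* ask the core to re-create us on bond changes */
};

static int my_enter_bonded_mode(struct mlx4_dev *dev)
{
	/* route all traffic through physical port 1 while bonded */
	struct mlx4_port_map v2p = { .port1 = 1, .port2 = 1 };
	int err;

	err = mlx4_bond(dev);	/* disables the RX port check and sets MLX4_FLAG_BONDED */
	if (err)
		return err;

	err = mlx4_port_map_set(dev, &v2p);
	if (err)
		mlx4_unbond(dev);	/* roll back on failure */
	return err;
}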
An upper mlx4 interface marks itself to the mlx4 core module as subject to such bonding by setting the MLX4_INTFF_BONDING flag, as in the sketch above. An interface which goes to/from bonded mode is re-created. The mlx4 Ethernet driver does not set this flag when registering the interface; the IB driver does. Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_resources.c | 8 +- drivers/net/ethernet/mellanox/mlx4/intf.c | 54 +++++++++++++ drivers/net/ethernet/mellanox/mlx4/main.c | 89 ++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx4/mlx4.h | 3 + drivers/net/ethernet/mellanox/mlx4/qp.c | 2 + .../net/ethernet/mellanox/mlx4/resource_tracker.c | 3 + include/linux/mlx4/device.h | 1 + include/linux/mlx4/driver.h | 19 +++++ 8 files changed, 177 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_resources.c b/drivers/net/ethernet/mellanox/mlx4/en_resources.c index f1a5500ff72d..34f2fdf4fe5d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_resources.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_resources.c @@ -50,10 +50,14 @@ void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, context->mtu_msgmax = 0xff; if (!is_tx && !rss) context->rq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4); - if (is_tx) + if (is_tx) { context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4); - else + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP) + context->params2 |= MLX4_QP_BIT_FPP; + + } else { context->sq_size_stride = ilog2(TXBB_SIZE) - 4; + } context->usr_page = cpu_to_be32(mdev->priv_uar.index); context->local_qpn = cpu_to_be32(qpn); context->pri_path.ackto = 1 & 0x07; diff --git a/drivers/net/ethernet/mellanox/mlx4/intf.c b/drivers/net/ethernet/mellanox/mlx4/intf.c index 68d2bad325d5..6fce58718837 100644 --- a/drivers/net/ethernet/mellanox/mlx4/intf.c +++ b/drivers/net/ethernet/mellanox/mlx4/intf.c @@ -33,11 +33,13 @@ #include #include +#include #include "mlx4.h" struct mlx4_device_context { struct list_head list; + struct list_head bond_list; struct mlx4_interface *intf; void *context; }; @@ -115,6 +117,58 @@ void mlx4_unregister_interface(struct mlx4_interface *intf) } EXPORT_SYMBOL_GPL(mlx4_unregister_interface); +int mlx4_do_bond(struct mlx4_dev *dev, bool enable) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + struct mlx4_device_context *dev_ctx = NULL, *temp_dev_ctx; + unsigned long flags; + int ret; + LIST_HEAD(bond_list); + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP)) + return -ENOTSUPP; + + ret = mlx4_disable_rx_port_check(dev, enable); + if (ret) { + mlx4_err(dev, "Fail to %s rx port check\n", + enable ?
"enable" : "disable"); + return ret; + } + if (enable) { + dev->flags |= MLX4_FLAG_BONDED; + } else { + ret = mlx4_virt2phy_port_map(dev, 1, 2); + if (ret) { + mlx4_err(dev, "Fail to reset port map\n"); + return ret; + } + dev->flags &= ~MLX4_FLAG_BONDED; + } + + spin_lock_irqsave(&priv->ctx_lock, flags); + list_for_each_entry_safe(dev_ctx, temp_dev_ctx, &priv->ctx_list, list) { + if (dev_ctx->intf->flags & MLX4_INTFF_BONDING) { + list_add_tail(&dev_ctx->bond_list, &bond_list); + list_del(&dev_ctx->list); + } + } + spin_unlock_irqrestore(&priv->ctx_lock, flags); + + list_for_each_entry(dev_ctx, &bond_list, bond_list) { + dev_ctx->intf->remove(dev, dev_ctx->context); + dev_ctx->context = dev_ctx->intf->add(dev); + + spin_lock_irqsave(&priv->ctx_lock, flags); + list_add_tail(&dev_ctx->list, &priv->ctx_list); + spin_unlock_irqrestore(&priv->ctx_lock, flags); + + mlx4_dbg(dev, "Inrerface for protocol %d restarted with when bonded mode is %s\n", + dev_ctx->intf->protocol, enable ? + "enabled" : "disabled"); + } + return 0; +} + void mlx4_dispatch_event(struct mlx4_dev *dev, enum mlx4_dev_event type, unsigned long param) { diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index cc9f48439244..f3245fe0f442 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -1160,6 +1160,91 @@ err_set_port: return err ? err : count; } +int mlx4_bond(struct mlx4_dev *dev) +{ + int ret = 0; + struct mlx4_priv *priv = mlx4_priv(dev); + + mutex_lock(&priv->bond_mutex); + + if (!mlx4_is_bonded(dev)) + ret = mlx4_do_bond(dev, true); + else + ret = 0; + + mutex_unlock(&priv->bond_mutex); + if (ret) + mlx4_err(dev, "Failed to bond device: %d\n", ret); + else + mlx4_dbg(dev, "Device is bonded\n"); + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_bond); + +int mlx4_unbond(struct mlx4_dev *dev) +{ + int ret = 0; + struct mlx4_priv *priv = mlx4_priv(dev); + + mutex_lock(&priv->bond_mutex); + + if (mlx4_is_bonded(dev)) + ret = mlx4_do_bond(dev, false); + + mutex_unlock(&priv->bond_mutex); + if (ret) + mlx4_err(dev, "Failed to unbond device: %d\n", ret); + else + mlx4_dbg(dev, "Device is unbonded\n"); + return ret; +} +EXPORT_SYMBOL_GPL(mlx4_unbond); + + +int mlx4_port_map_set(struct mlx4_dev *dev, struct mlx4_port_map *v2p) +{ + u8 port1 = v2p->port1; + u8 port2 = v2p->port2; + struct mlx4_priv *priv = mlx4_priv(dev); + int err; + + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP)) + return -ENOTSUPP; + + mutex_lock(&priv->bond_mutex); + + /* zero means keep current mapping for this port */ + if (port1 == 0) + port1 = priv->v2p.port1; + if (port2 == 0) + port2 = priv->v2p.port2; + + if ((port1 < 1) || (port1 > MLX4_MAX_PORTS) || + (port2 < 1) || (port2 > MLX4_MAX_PORTS) || + (port1 == 2 && port2 == 1)) { + /* besides boundary checks cross mapping makes + * no sense and therefore not allowed */ + err = -EINVAL; + } else if ((port1 == priv->v2p.port1) && + (port2 == priv->v2p.port2)) { + err = 0; + } else { + err = mlx4_virt2phy_port_map(dev, port1, port2); + if (!err) { + mlx4_dbg(dev, "port map changed: [%d][%d]\n", + port1, port2); + priv->v2p.port1 = port1; + priv->v2p.port2 = port2; + } else { + mlx4_err(dev, "Failed to change port mape: %d\n", err); + } + } + + mutex_unlock(&priv->bond_mutex); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_port_map_set); + static int mlx4_load_fw(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); @@ -2638,6 +2723,7 @@ static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data, 
spin_lock_init(&priv->ctx_lock); mutex_init(&priv->port_mutex); + mutex_init(&priv->bond_mutex); INIT_LIST_HEAD(&priv->pgdir_list); mutex_init(&priv->pgdir_mutex); @@ -2934,6 +3020,9 @@ slave_start: goto err_port; } + priv->v2p.port1 = 1; + priv->v2p.port2 = 2; + err = mlx4_register_device(dev); if (err) goto err_port; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 148dc0945aab..803f17653da7 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -885,6 +885,8 @@ struct mlx4_priv { int reserved_mtts; int fs_hash_mode; u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS]; + struct mlx4_port_map v2p; /* cached port mapping configuration */ + struct mutex bond_mutex; /* for bond mode */ __be64 slave_node_guids[MLX4_MFUNC_MAX]; atomic_t opreq_count; @@ -1364,6 +1366,7 @@ int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port); /* Returns the VF index of slave */ int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave); int mlx4_config_mad_demux(struct mlx4_dev *dev); +int mlx4_do_bond(struct mlx4_dev *dev, bool enable); enum mlx4_zone_flags { MLX4_ZONE_ALLOW_ALLOC_FROM_LOWER_PRIO = 1UL << 0, diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 1586ecce13c7..2bb8553bd905 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -882,6 +882,8 @@ int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt, for (i = 0; i < ARRAY_SIZE(states) - 1; i++) { context->flags &= cpu_to_be32(~(0xf << 28)); context->flags |= cpu_to_be32(states[i + 1] << 28); + if (states[i + 1] != MLX4_QP_STATE_RTR) + context->params2 &= ~MLX4_QP_BIT_FPP; err = mlx4_qp_modify(dev, mtt, states[i], states[i + 1], context, 0, 0, qp); if (err) { diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 79feeb6b0d87..c5f3dfca226b 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -2944,6 +2944,9 @@ static int verify_qp_parameters(struct mlx4_dev *dev, qp_type = (be32_to_cpu(qp_ctx->flags) >> 16) & 0xff; optpar = be32_to_cpu(*(__be32 *) inbox->buf); + if (slave != mlx4_master_func_num(dev)) + qp_ctx->params2 &= ~MLX4_QP_BIT_FPP; + switch (qp_type) { case MLX4_QP_ST_RC: case MLX4_QP_ST_XRC: diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index d9afd99dde39..977b0b164431 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -70,6 +70,7 @@ enum { MLX4_FLAG_SLAVE = 1 << 3, MLX4_FLAG_SRIOV = 1 << 4, MLX4_FLAG_OLD_REG_MAC = 1 << 6, + MLX4_FLAG_BONDED = 1 << 7 }; enum { diff --git a/include/linux/mlx4/driver.h b/include/linux/mlx4/driver.h index 022055c8fb26..9553a73d2049 100644 --- a/include/linux/mlx4/driver.h +++ b/include/linux/mlx4/driver.h @@ -49,6 +49,10 @@ enum mlx4_dev_event { MLX4_DEV_EVENT_SLAVE_SHUTDOWN, }; +enum { + MLX4_INTFF_BONDING = 1 << 0 +}; + struct mlx4_interface { void * (*add) (struct mlx4_dev *dev); void (*remove)(struct mlx4_dev *dev, void *context); @@ -57,11 +61,26 @@ struct mlx4_interface { void * (*get_dev)(struct mlx4_dev *dev, void *context, u8 port); struct list_head list; enum mlx4_protocol protocol; + int flags; }; int mlx4_register_interface(struct mlx4_interface *intf); void mlx4_unregister_interface(struct mlx4_interface *intf); +int mlx4_bond(struct mlx4_dev *dev); +int 
mlx4_unbond(struct mlx4_dev *dev); +static inline int mlx4_is_bonded(struct mlx4_dev *dev) +{ + return !!(dev->flags & MLX4_FLAG_BONDED); +} + +struct mlx4_port_map { + u8 port1; + u8 port2; +}; + +int mlx4_port_map_set(struct mlx4_dev *dev, struct mlx4_port_map *v2p); + void *mlx4_get_protocol_dev(struct mlx4_dev *dev, enum mlx4_protocol proto, int port); static inline u64 mlx4_mac_to_u64(u8 *addr) -- cgit v1.2.3 From f2dba9c6ff0d9a515b4c3f1b037cd65c8b2a868c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 4 Feb 2015 07:33:23 +1100 Subject: rhashtable: Introduce rhashtable_walk_* Some existing rhashtable users get too intimate with it by walking the buckets directly. This prevents us from easily changing the internals of rhashtable. This patch adds the helpers rhashtable_walk_init/exit/start/next/stop which will replace these custom walkers. They are meant to be usable both for procfs seq_file walks and for walking via a netlink dump. The iterator structure should fit inside a netlink dump cb structure, with at least one element to spare. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 35 ++++++++++ lib/rhashtable.c | 163 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 198 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index e0337844358e..58851275fed9 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -18,6 +18,7 @@ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H +#include #include #include #include @@ -111,6 +112,7 @@ struct rhashtable_params { * @p: Configuration parameters * @run_work: Deferred worker to expand/shrink asynchronously * @mutex: Mutex to protect current/future table swapping + * @walkers: List of active walkers * @being_destroyed: True if table is set up for destruction */ struct rhashtable { @@ -121,9 +123,36 @@ struct rhashtable { struct rhashtable_params p; struct work_struct run_work; struct mutex mutex; + struct list_head walkers; bool being_destroyed; }; +/** + * struct rhashtable_walker - Hash table walker + * @list: List entry on list of walkers + * @resize: Resize event occurred + */ +struct rhashtable_walker { + struct list_head list; + bool resize; +}; + +/** + * struct rhashtable_iter - Hash table iterator, fits into netlink cb + * @ht: Table to iterate through + * @p: Current pointer + * @walker: Associated rhashtable walker + * @slot: Current slot + * @skip: Number of entries to skip in slot + */ +struct rhashtable_iter { + struct rhashtable *ht; + struct rhash_head *p; + struct rhashtable_walker *walker; + unsigned int slot; + unsigned int skip; +}; + static inline unsigned long rht_marker(const struct rhashtable *ht, u32 hash) { return NULLS_MARKER(ht->p.nulls_base + hash); @@ -179,6 +208,12 @@ bool rhashtable_lookup_compare_insert(struct rhashtable *ht, bool (*compare)(void *, void *), void *arg); +int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter); +void rhashtable_walk_exit(struct rhashtable_iter *iter); +int rhashtable_walk_start(struct rhashtable_iter *iter) __acquires(RCU); +void *rhashtable_walk_next(struct rhashtable_iter *iter); +void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU); + void rhashtable_destroy(struct rhashtable *ht); #define rht_dereference(p, ht) \ diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 904b419b72f5..057919164e23 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -484,6 +484,7 @@ static void
rht_deferred_worker(struct work_struct *work) { struct rhashtable *ht; struct bucket_table *tbl; + struct rhashtable_walker *walker; ht = container_of(work, struct rhashtable, run_work); mutex_lock(&ht->mutex); @@ -492,6 +493,9 @@ static void rht_deferred_worker(struct work_struct *work) tbl = rht_dereference(ht->tbl, ht); + list_for_each_entry(walker, &ht->walkers, list) + walker->resize = true; + if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) rhashtable_expand(ht); else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size)) @@ -822,6 +826,164 @@ exit: } EXPORT_SYMBOL_GPL(rhashtable_lookup_compare_insert); +/** + * rhashtable_walk_init - Initialise an iterator + * @ht: Table to walk over + * @iter: Hash table iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may sleep so you must not call it from interrupt + * context or with spin locks held. + * + * You must call rhashtable_walk_exit if this function returns + * successfully. + */ +int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter) +{ + iter->ht = ht; + iter->p = NULL; + iter->slot = 0; + iter->skip = 0; + + iter->walker = kmalloc(sizeof(*iter->walker), GFP_KERNEL); + if (!iter->walker) + return -ENOMEM; + + mutex_lock(&ht->mutex); + list_add(&iter->walker->list, &ht->walkers); + mutex_unlock(&ht->mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_init); + +/** + * rhashtable_walk_exit - Free an iterator + * @iter: Hash table iterator + * + * This function frees resources allocated by rhashtable_walk_init. + */ +void rhashtable_walk_exit(struct rhashtable_iter *iter) +{ + mutex_lock(&iter->ht->mutex); + list_del(&iter->walker->list); + mutex_unlock(&iter->ht->mutex); + kfree(iter->walker); +} +EXPORT_SYMBOL_GPL(rhashtable_walk_exit); + +/** + * rhashtable_walk_start - Start a hash table walk + * @iter: Hash table iterator + * + * Start a hash table walk. Note that we take the RCU lock in all + * cases including when we return an error. So you must always call + * rhashtable_walk_stop to clean up. + * + * Returns zero if successful. + * + * Returns -EAGAIN if a resize event occurred. Note that the iterator + * will rewind back to the beginning and you may use it immediately + * by calling rhashtable_walk_next. + */ +int rhashtable_walk_start(struct rhashtable_iter *iter) +{ + rcu_read_lock(); + + if (iter->walker->resize) { + iter->slot = 0; + iter->skip = 0; + iter->walker->resize = false; + return -EAGAIN; + } + + return 0; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_start); + +/** + * rhashtable_walk_next - Return the next object and advance the iterator + * @iter: Hash table iterator + * + * Note that you must call rhashtable_walk_stop when you are finished + * with the walk. + * + * Returns the next object or NULL when the end of the table is reached. + * + * Returns -EAGAIN if a resize event occurred. Note that the iterator + * will rewind back to the beginning and you may continue to use it.
+ */ +void *rhashtable_walk_next(struct rhashtable_iter *iter) +{ + const struct bucket_table *tbl; + struct rhashtable *ht = iter->ht; + struct rhash_head *p = iter->p; + void *obj = NULL; + + tbl = rht_dereference_rcu(ht->tbl, ht); + + if (p) { + p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot); + goto next; + } + + for (; iter->slot < tbl->size; iter->slot++) { + int skip = iter->skip; + + rht_for_each_rcu(p, tbl, iter->slot) { + if (!skip) + break; + skip--; + } + +next: + if (!rht_is_a_nulls(p)) { + iter->skip++; + iter->p = p; + obj = rht_obj(ht, p); + goto out; + } + + iter->skip = 0; + } + + iter->p = NULL; + +out: + if (iter->walker->resize) { + iter->p = NULL; + iter->slot = 0; + iter->skip = 0; + iter->walker->resize = false; + return ERR_PTR(-EAGAIN); + } + + return obj; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_next); + +/** + * rhashtable_walk_stop - Finish a hash table walk + * @iter: Hash table iterator + * + * Finish a hash table walk. + */ +void rhashtable_walk_stop(struct rhashtable_iter *iter) +{ + rcu_read_unlock(); + iter->p = NULL; +} +EXPORT_SYMBOL_GPL(rhashtable_walk_stop); + static size_t rounded_hashtable_size(struct rhashtable_params *params) { return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), @@ -894,6 +1056,7 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) memset(ht, 0, sizeof(*ht)); mutex_init(&ht->mutex); memcpy(&ht->p, params, sizeof(*params)); + INIT_LIST_HEAD(&ht->walkers); if (params->locks_mul) ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); -- cgit v1.2.3 From 7fbc1067f06098c6b674e672fbb17e758fcc9402 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 28 Oct 2013 10:32:35 +0100 Subject: exportfs: add methods for block layout exports Add three methods to allow exporting pnfs block layout volumes: - get_uuid: get a filesystem unique signature exposed to clients - map_blocks: map and if necessary allocate blocks for a layout - commit_blocks: commit blocks in a layout once the client is done with them For now we stick the external pnfs block layout interfaces into s_export_op to avoid mixing them up with the internal interface between the NFS server and the layout drivers. Once we've fully internalized the latter interface we can reconsider whether these methods should stay in s_export_ops. Signed-off-by: Christoph Hellwig --- include/linux/exportfs.h | 23 +++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 41b223a59a63..fa05e04c5531 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -4,6 +4,7 @@ #include struct dentry; +struct iattr; struct inode; struct super_block; struct vfsmount; @@ -180,6 +181,21 @@ struct fid { * get_name is not (which is possibly inconsistent) */ +/* types of block ranges for multipage write mappings.
*/ +#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */ +#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */ +#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */ +#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */ + +#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */ + +struct iomap { + sector_t blkno; /* first sector of mapping */ + loff_t offset; /* file offset of mapping, bytes */ + u64 length; /* length of mapping, bytes */ + int type; /* type of mapping */ +}; + struct export_operations { int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent); @@ -191,6 +207,13 @@ struct export_operations { struct dentry *child); struct dentry * (*get_parent)(struct dentry *child); int (*commit_metadata)(struct inode *inode); + + int (*get_uuid)(struct super_block *sb, u8 *buf, u32 *len, u64 *offset); + int (*map_blocks)(struct inode *inode, loff_t offset, + u64 len, struct iomap *iomap, + bool write, u32 *device_generation); + int (*commit_blocks)(struct inode *inode, struct iomap *iomaps, + int nr_iomaps, struct iattr *iattr); }; extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid, -- cgit v1.2.3 From 90e97820619dc912b52cc9d103272819d8b51259 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 5 Feb 2015 13:44:43 +0800 Subject: resources: Move struct resource_list_entry from ACPI into resource core Currently ACPI, PCI and pnp all implement the same resource list management with different data structures. We need to convert from one data structure to another when passing resources from one subsystem to another. So move struct resource_list_entry from ACPI into the resource core and rename it as resource_entry, so that it can be reused by different subsystems, avoiding the data structure conversion. Introduce a dedicated header file, resource_ext.h, instead of embedding it into ioport.h, to avoid header file inclusion order issues. Signed-off-by: Jiang Liu Acked-by: Vinod Koul Signed-off-by: Rafael J.
Wysocki --- drivers/acpi/acpi_lpss.c | 8 ++--- drivers/acpi/acpi_platform.c | 4 +-- drivers/acpi/resource.c | 17 ++++------ drivers/dma/acpi-dma.c | 10 +++--- include/linux/acpi.h | 12 +------ include/linux/resource_ext.h | 77 ++++++++++++++++++++++++++++++++++++++++++++ kernel/resource.c | 25 ++++++++++++++ 7 files changed, 120 insertions(+), 33 deletions(-) create mode 100644 include/linux/resource_ext.h (limited to 'include/linux') diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c index 4f3febf8a589..dfd1b8095dad 100644 --- a/drivers/acpi/acpi_lpss.c +++ b/drivers/acpi/acpi_lpss.c @@ -313,7 +313,7 @@ static int acpi_lpss_create_device(struct acpi_device *adev, { struct lpss_device_desc *dev_desc; struct lpss_private_data *pdata; - struct resource_list_entry *rentry; + struct resource_entry *rentry; struct list_head resource_list; struct platform_device *pdev; int ret; @@ -333,12 +333,12 @@ static int acpi_lpss_create_device(struct acpi_device *adev, goto err_out; list_for_each_entry(rentry, &resource_list, node) - if (resource_type(&rentry->res) == IORESOURCE_MEM) { + if (resource_type(rentry->res) == IORESOURCE_MEM) { if (dev_desc->prv_size_override) pdata->mmio_size = dev_desc->prv_size_override; else - pdata->mmio_size = resource_size(&rentry->res); - pdata->mmio_base = ioremap(rentry->res.start, + pdata->mmio_size = resource_size(rentry->res); + pdata->mmio_base = ioremap(rentry->res->start, pdata->mmio_size); break; } diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index 6ba8beb6b9d2..1284138e42ab 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -45,7 +45,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) struct platform_device *pdev = NULL; struct acpi_device *acpi_parent; struct platform_device_info pdevinfo; - struct resource_list_entry *rentry; + struct resource_entry *rentry; struct list_head resource_list; struct resource *resources = NULL; int count; @@ -71,7 +71,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) } count = 0; list_for_each_entry(rentry, &resource_list, node) - resources[count++] = rentry->res; + resources[count++] = *rentry->res; acpi_dev_free_resource_list(&resource_list); } diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index 3ea0d17eb951..4752b9939987 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -444,12 +444,7 @@ EXPORT_SYMBOL_GPL(acpi_dev_resource_interrupt); */ void acpi_dev_free_resource_list(struct list_head *list) { - struct resource_list_entry *rentry, *re; - - list_for_each_entry_safe(rentry, re, list, node) { - list_del(&rentry->node); - kfree(rentry); - } + resource_list_free(list); } EXPORT_SYMBOL_GPL(acpi_dev_free_resource_list); @@ -464,16 +459,16 @@ struct res_proc_context { static acpi_status acpi_dev_new_resource_entry(struct resource_win *win, struct res_proc_context *c) { - struct resource_list_entry *rentry; + struct resource_entry *rentry; - rentry = kmalloc(sizeof(*rentry), GFP_KERNEL); + rentry = resource_list_create_entry(NULL, 0); if (!rentry) { c->error = -ENOMEM; return AE_NO_MEMORY; } - rentry->res = win->res; + *rentry->res = win->res; rentry->offset = win->offset; - list_add_tail(&rentry->node, c->list); + resource_list_add_tail(rentry, c->list); c->count++; return AE_OK; } @@ -534,7 +529,7 @@ static acpi_status acpi_dev_process_resource(struct acpi_resource *ares, * returned as the final error code. 
* * The resultant struct resource objects are put on the list pointed to by - * @list, that must be empty initially, as members of struct resource_list_entry + * @list, that must be empty initially, as members of struct resource_entry * objects. Callers of this routine should use %acpi_dev_free_resource_list() to * free that list. * diff --git a/drivers/dma/acpi-dma.c b/drivers/dma/acpi-dma.c index de361a156b34..5a635646e05c 100644 --- a/drivers/dma/acpi-dma.c +++ b/drivers/dma/acpi-dma.c @@ -43,7 +43,7 @@ static int acpi_dma_parse_resource_group(const struct acpi_csrt_group *grp, { const struct acpi_csrt_shared_info *si; struct list_head resource_list; - struct resource_list_entry *rentry; + struct resource_entry *rentry; resource_size_t mem = 0, irq = 0; int ret; @@ -56,10 +56,10 @@ static int acpi_dma_parse_resource_group(const struct acpi_csrt_group *grp, return 0; list_for_each_entry(rentry, &resource_list, node) { - if (resource_type(&rentry->res) == IORESOURCE_MEM) - mem = rentry->res.start; - else if (resource_type(&rentry->res) == IORESOURCE_IRQ) - irq = rentry->res.start; + if (resource_type(rentry->res) == IORESOURCE_MEM) + mem = rentry->res->start; + else if (resource_type(rentry->res) == IORESOURCE_IRQ) + irq = rentry->res->start; } acpi_dev_free_resource_list(&resource_list); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e818decb631f..e53822148b6a 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -27,6 +27,7 @@ #include #include /* for struct resource */ +#include #include #include @@ -285,11 +286,6 @@ extern int pnpacpi_disabled; #define PXM_INVAL (-1) -struct resource_win { - struct resource res; - resource_size_t offset; -}; - bool acpi_dev_resource_memory(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_io(struct acpi_resource *ares, struct resource *res); bool acpi_dev_resource_address_space(struct acpi_resource *ares, @@ -300,12 +296,6 @@ unsigned long acpi_dev_irq_flags(u8 triggering, u8 polarity, u8 shareable); bool acpi_dev_resource_interrupt(struct acpi_resource *ares, int index, struct resource *res); -struct resource_list_entry { - struct list_head node; - struct resource res; - resource_size_t offset; -}; - void acpi_dev_free_resource_list(struct list_head *list); int acpi_dev_get_resources(struct acpi_device *adev, struct list_head *list, int (*preproc)(struct acpi_resource *, void *), diff --git a/include/linux/resource_ext.h b/include/linux/resource_ext.h new file mode 100644 index 000000000000..e2bf63d881d4 --- /dev/null +++ b/include/linux/resource_ext.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2015, Intel Corporation + * Author: Jiang Liu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ +#ifndef _LINUX_RESOURCE_EXT_H +#define _LINUX_RESOURCE_EXT_H +#include +#include +#include +#include + +/* Represent resource window for bridge devices */ +struct resource_win { + struct resource res; /* In master (CPU) address space */ + resource_size_t offset; /* Translation offset for bridge */ +}; + +/* + * Common resource list management data structure and interfaces to support + * ACPI, PNP and PCI host bridge etc. + */ +struct resource_entry { + struct list_head node; + struct resource *res; /* In master (CPU) address space */ + resource_size_t offset; /* Translation offset for bridge */ + struct resource __res; /* Default storage for res */ +}; + +extern struct resource_entry * +resource_list_create_entry(struct resource *res, size_t extra_size); +extern void resource_list_free(struct list_head *head); + +static inline void resource_list_add(struct resource_entry *entry, + struct list_head *head) +{ + list_add(&entry->node, head); +} + +static inline void resource_list_add_tail(struct resource_entry *entry, + struct list_head *head) +{ + list_add_tail(&entry->node, head); +} + +static inline void resource_list_del(struct resource_entry *entry) +{ + list_del(&entry->node); +} + +static inline void resource_list_free_entry(struct resource_entry *entry) +{ + kfree(entry); +} + +static inline void +resource_list_destroy_entry(struct resource_entry *entry) +{ + resource_list_del(entry); + resource_list_free_entry(entry); +} + +#define resource_list_for_each_entry(entry, list) \ + list_for_each_entry((entry), (list), node) + +#define resource_list_for_each_entry_safe(entry, tmp, list) \ + list_for_each_entry_safe((entry), (tmp), (list), node) + +#endif /* _LINUX_RESOURCE_EXT_H */ diff --git a/kernel/resource.c b/kernel/resource.c index 0bcebffc4e77..19f2357dfda3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -1529,6 +1530,30 @@ int iomem_is_exclusive(u64 addr) return err; } +struct resource_entry *resource_list_create_entry(struct resource *res, + size_t extra_size) +{ + struct resource_entry *entry; + + entry = kzalloc(sizeof(*entry) + extra_size, GFP_KERNEL); + if (entry) { + INIT_LIST_HEAD(&entry->node); + entry->res = res ? res : &entry->__res; + } + + return entry; +} +EXPORT_SYMBOL(resource_list_create_entry); + +void resource_list_free(struct list_head *head) +{ + struct resource_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, head, node) + resource_list_destroy_entry(entry); +} +EXPORT_SYMBOL(resource_list_free); + static int __init strict_iomem(char *str) { if (strstr(str, "relaxed")) -- cgit v1.2.3 From 14d76b68f2819a1d0b50236a7e9e9f2ea69869d9 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 5 Feb 2015 13:44:44 +0800 Subject: PCI: Use common resource list management code instead of private implementation Use common resource list management data structure and interfaces instead of private implementation. Signed-off-by: Jiang Liu Acked-by: Will Deacon Acked-by: Bjorn Helgaas Signed-off-by: Rafael J. 
Wysocki --- arch/arm/kernel/bios32.c | 5 ++--- arch/x86/pci/bus_numa.c | 4 ++-- drivers/pci/bus.c | 18 ++++++------------ drivers/pci/host-bridge.c | 8 ++++---- drivers/pci/host/pci-host-generic.c | 4 ++-- drivers/pci/host/pci-xgene.c | 4 ++-- drivers/pci/host/pcie-xilinx.c | 4 ++-- drivers/pci/probe.c | 10 +++++----- include/linux/pci.h | 9 ++------- 9 files changed, 27 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c index a4effd6d8f2f..016991792b0b 100644 --- a/arch/arm/kernel/bios32.c +++ b/arch/arm/kernel/bios32.c @@ -422,17 +422,16 @@ static int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin) static int pcibios_init_resources(int busnr, struct pci_sys_data *sys) { int ret; - struct pci_host_bridge_window *window; + struct resource_entry *window; if (list_empty(&sys->resources)) { pci_add_resource_offset(&sys->resources, &iomem_resource, sys->mem_offset); } - list_for_each_entry(window, &sys->resources, list) { + resource_list_for_each_entry(window, &sys->resources) if (resource_type(window->res) == IORESOURCE_IO) return 0; - } sys->io_res.start = (busnr * SZ_64K) ? : pcibios_min_io; sys->io_res.end = (busnr + 1) * SZ_64K - 1; diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c index f3a2cfc14125..7bcf06a7cd12 100644 --- a/arch/x86/pci/bus_numa.c +++ b/arch/x86/pci/bus_numa.c @@ -31,7 +31,7 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources) { struct pci_root_info *info = x86_find_pci_root_info(bus); struct pci_root_res *root_res; - struct pci_host_bridge_window *window; + struct resource_entry *window; bool found = false; if (!info) @@ -41,7 +41,7 @@ void x86_pci_root_bus_resources(int bus, struct list_head *resources) bus); /* already added by acpi ? 
*/ - list_for_each_entry(window, resources, list) + resource_list_for_each_entry(window, resources) if (window->res->flags & IORESOURCE_BUS) { found = true; break; diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 8fb16188cd82..90fa3a78fb7c 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -20,17 +20,16 @@ void pci_add_resource_offset(struct list_head *resources, struct resource *res, resource_size_t offset) { - struct pci_host_bridge_window *window; + struct resource_entry *entry; - window = kzalloc(sizeof(struct pci_host_bridge_window), GFP_KERNEL); - if (!window) { + entry = resource_list_create_entry(res, 0); + if (!entry) { printk(KERN_ERR "PCI: can't add host bridge window %pR\n", res); return; } - window->res = res; - window->offset = offset; - list_add_tail(&window->list, resources); + entry->offset = offset; + resource_list_add_tail(entry, resources); } EXPORT_SYMBOL(pci_add_resource_offset); @@ -42,12 +41,7 @@ EXPORT_SYMBOL(pci_add_resource); void pci_free_resource_list(struct list_head *resources) { - struct pci_host_bridge_window *window, *tmp; - - list_for_each_entry_safe(window, tmp, resources, list) { - list_del(&window->list); - kfree(window); - } + resource_list_free(resources); } EXPORT_SYMBOL(pci_free_resource_list); diff --git a/drivers/pci/host-bridge.c b/drivers/pci/host-bridge.c index 0e5f3c95af5b..39b2dbe585aa 100644 --- a/drivers/pci/host-bridge.c +++ b/drivers/pci/host-bridge.c @@ -35,10 +35,10 @@ void pcibios_resource_to_bus(struct pci_bus *bus, struct pci_bus_region *region, struct resource *res) { struct pci_host_bridge *bridge = find_pci_host_bridge(bus); - struct pci_host_bridge_window *window; + struct resource_entry *window; resource_size_t offset = 0; - list_for_each_entry(window, &bridge->windows, list) { + resource_list_for_each_entry(window, &bridge->windows) { if (resource_contains(window->res, res)) { offset = window->offset; break; @@ -60,10 +60,10 @@ void pcibios_bus_to_resource(struct pci_bus *bus, struct resource *res, struct pci_bus_region *region) { struct pci_host_bridge *bridge = find_pci_host_bridge(bus); - struct pci_host_bridge_window *window; + struct resource_entry *window; resource_size_t offset = 0; - list_for_each_entry(window, &bridge->windows, list) { + resource_list_for_each_entry(window, &bridge->windows) { struct pci_bus_region bus_region; if (resource_type(res) != resource_type(window->res)) diff --git a/drivers/pci/host/pci-host-generic.c b/drivers/pci/host/pci-host-generic.c index 6eb1aa75bd37..aee3c620ecf9 100644 --- a/drivers/pci/host/pci-host-generic.c +++ b/drivers/pci/host/pci-host-generic.c @@ -149,14 +149,14 @@ static int gen_pci_parse_request_of_pci_ranges(struct gen_pci *pci) struct device *dev = pci->host.dev.parent; struct device_node *np = dev->of_node; resource_size_t iobase; - struct pci_host_bridge_window *win; + struct resource_entry *win; err = of_pci_get_host_bridge_resources(np, 0, 0xff, &pci->resources, &iobase); if (err) return err; - list_for_each_entry(win, &pci->resources, list) { + resource_list_for_each_entry(win, &pci->resources) { struct resource *parent, *res = win->res; switch (resource_type(res)) { diff --git a/drivers/pci/host/pci-xgene.c b/drivers/pci/host/pci-xgene.c index b1d0596457c5..a704257bab7f 100644 --- a/drivers/pci/host/pci-xgene.c +++ b/drivers/pci/host/pci-xgene.c @@ -401,11 +401,11 @@ static int xgene_pcie_map_ranges(struct xgene_pcie_port *port, struct list_head *res, resource_size_t io_base) { - struct pci_host_bridge_window *window; + struct resource_entry *window; 
struct device *dev = port->dev; int ret; - list_for_each_entry(window, res, list) { + resource_list_for_each_entry(window, res) { struct resource *res = window->res; u64 restype = resource_type(res); diff --git a/drivers/pci/host/pcie-xilinx.c b/drivers/pci/host/pcie-xilinx.c index ef3ebaf9a738..601261df7663 100644 --- a/drivers/pci/host/pcie-xilinx.c +++ b/drivers/pci/host/pcie-xilinx.c @@ -737,7 +737,7 @@ static int xilinx_pcie_parse_and_add_res(struct xilinx_pcie_port *port) resource_size_t offset; struct of_pci_range_parser parser; struct of_pci_range range; - struct pci_host_bridge_window *win; + struct resource_entry *win; int err = 0, mem_resno = 0; /* Get the ranges */ @@ -807,7 +807,7 @@ static int xilinx_pcie_parse_and_add_res(struct xilinx_pcie_port *port) free_resources: release_child_resources(&iomem_resource); - list_for_each_entry(win, &port->resources, list) + resource_list_for_each_entry(win, &port->resources) devm_kfree(dev, win->res); pci_free_resource_list(&port->resources); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 23212f8ae09b..8d2f400e96cb 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1895,7 +1895,7 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus, int error; struct pci_host_bridge *bridge; struct pci_bus *b, *b2; - struct pci_host_bridge_window *window, *n; + struct resource_entry *window, *n; struct resource *res; resource_size_t offset; char bus_addr[64]; @@ -1959,8 +1959,8 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus, printk(KERN_INFO "PCI host bridge to bus %s\n", dev_name(&b->dev)); /* Add initial resources to the bus */ - list_for_each_entry_safe(window, n, resources, list) { - list_move_tail(&window->list, &bridge->windows); + resource_list_for_each_entry_safe(window, n, resources) { + list_move_tail(&window->node, &bridge->windows); res = window->res; offset = window->offset; if (res->flags & IORESOURCE_BUS) @@ -2060,12 +2060,12 @@ void pci_bus_release_busn_res(struct pci_bus *b) struct pci_bus *pci_scan_root_bus(struct device *parent, int bus, struct pci_ops *ops, void *sysdata, struct list_head *resources) { - struct pci_host_bridge_window *window; + struct resource_entry *window; bool found = false; struct pci_bus *b; int max; - list_for_each_entry(window, resources, list) + resource_list_for_each_entry(window, resources) if (window->res->flags & IORESOURCE_BUS) { found = true; break; diff --git a/include/linux/pci.h b/include/linux/pci.h index 9603094ed59b..faa60fa26314 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -397,16 +398,10 @@ static inline int pci_channel_offline(struct pci_dev *pdev) return (pdev->error_state != pci_channel_io_normal); } -struct pci_host_bridge_window { - struct list_head list; - struct resource *res; /* host bridge aperture (CPU address) */ - resource_size_t offset; /* bus address + offset = CPU address */ -}; - struct pci_host_bridge { struct device dev; struct pci_bus *bus; /* root bus */ - struct list_head windows; /* pci_host_bridge_windows */ + struct list_head windows; /* resource_entry */ void (*release_fn)(struct pci_host_bridge *); void *release_data; }; -- cgit v1.2.3 From ecf5636dcd59cd5508641f995cc4c2bafedbb995 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 5 Feb 2015 13:44:48 +0800 Subject: ACPI: Add interfaces to parse IOAPIC ID for IOAPIC hotplug We need to parse APIC ID for IOAPIC registration for IOAPIC hotplug. 
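A hedged sketch of how a hotplug path might consume the acpi_get_ioapic_id() helper added below; the caller itself is hypothetical, while acpi_register_ioapic() is the existing registration interface:

static int my_hotplug_add_ioapic(acpi_handle handle, u32 gsi_base)
{
	u64 phys_addr;
	int id;

	/* resolve the IOAPIC ID and MMIO address from _MAT or the MADT */
	id = acpi_get_ioapic_id(handle, gsi_base, &phys_addr);
	if (id < 0)
		return -ENODEV;

	return acpi_register_ioapic(handle, phys_addr, gsi_base);
}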
ACPI _MAT method and MADT table are used to figure out IOAPIC ID, just like parsing CPU APIC ID for CPU hotplug. [ tglx: Fixed docbook comment ] Signed-off-by: Yinghai Lu Signed-off-by: Jiang Liu Cc: Konrad Rzeszutek Wilk Cc: Tony Luck Cc: Joerg Roedel Cc: Greg Kroah-Hartman Cc: Benjamin Herrenschmidt Cc: Rafael J. Wysocki Cc: Bjorn Helgaas Cc: Randy Dunlap Cc: Borislav Petkov Cc: Len Brown Link: http://lkml.kernel.org/r/1414387308-27148-8-git-send-email-jiang.liu@linux.intel.com Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki --- drivers/acpi/processor_core.c | 123 ++++++++++++++++++++++++++++++++++++++---- include/linux/acpi.h | 4 ++ 2 files changed, 118 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 02e48394276c..7962651cdbd4 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -4,6 +4,10 @@ * * Alex Chiang * - Unified x86/ia64 implementations + * + * I/O APIC hotplug support + * Yinghai Lu + * Jiang Liu */ #include #include @@ -12,6 +16,21 @@ #define _COMPONENT ACPI_PROCESSOR_COMPONENT ACPI_MODULE_NAME("processor_core"); +static struct acpi_table_madt *get_madt_table(void) +{ + static struct acpi_table_madt *madt; + static int read_madt; + + if (!read_madt) { + if (ACPI_FAILURE(acpi_get_table(ACPI_SIG_MADT, 0, + (struct acpi_table_header **)&madt))) + madt = NULL; + read_madt++; + } + + return madt; +} + static int map_lapic_id(struct acpi_subtable_header *entry, u32 acpi_id, int *apic_id) { @@ -67,17 +86,10 @@ static int map_lsapic_id(struct acpi_subtable_header *entry, static int map_madt_entry(int type, u32 acpi_id) { unsigned long madt_end, entry; - static struct acpi_table_madt *madt; - static int read_madt; int phys_id = -1; /* CPU hardware ID */ + struct acpi_table_madt *madt; - if (!read_madt) { - if (ACPI_FAILURE(acpi_get_table(ACPI_SIG_MADT, 0, - (struct acpi_table_header **)&madt))) - madt = NULL; - read_madt++; - } - + madt = get_madt_table(); if (!madt) return phys_id; @@ -203,3 +215,96 @@ int acpi_get_cpuid(acpi_handle handle, int type, u32 acpi_id) return acpi_map_cpuid(phys_id, acpi_id); } EXPORT_SYMBOL_GPL(acpi_get_cpuid); + +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC +static int get_ioapic_id(struct acpi_subtable_header *entry, u32 gsi_base, + u64 *phys_addr, int *ioapic_id) +{ + struct acpi_madt_io_apic *ioapic = (struct acpi_madt_io_apic *)entry; + + if (ioapic->global_irq_base != gsi_base) + return 0; + + *phys_addr = ioapic->address; + *ioapic_id = ioapic->id; + return 1; +} + +static int parse_madt_ioapic_entry(u32 gsi_base, u64 *phys_addr) +{ + struct acpi_subtable_header *hdr; + unsigned long madt_end, entry; + struct acpi_table_madt *madt; + int apic_id = -1; + + madt = get_madt_table(); + if (!madt) + return apic_id; + + entry = (unsigned long)madt; + madt_end = entry + madt->header.length; + + /* Parse all entries looking for a match. 
*/ + entry += sizeof(struct acpi_table_madt); + while (entry + sizeof(struct acpi_subtable_header) < madt_end) { + hdr = (struct acpi_subtable_header *)entry; + if (hdr->type == ACPI_MADT_TYPE_IO_APIC && + get_ioapic_id(hdr, gsi_base, phys_addr, &apic_id)) + break; + else + entry += hdr->length; + } + + return apic_id; +} + +static int parse_mat_ioapic_entry(acpi_handle handle, u32 gsi_base, + u64 *phys_addr) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_subtable_header *header; + union acpi_object *obj; + int apic_id = -1; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) + goto exit; + + if (!buffer.length || !buffer.pointer) + goto exit; + + obj = buffer.pointer; + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length < sizeof(struct acpi_subtable_header)) + goto exit; + + header = (struct acpi_subtable_header *)obj->buffer.pointer; + if (header->type == ACPI_MADT_TYPE_IO_APIC) + get_ioapic_id(header, gsi_base, phys_addr, &apic_id); + +exit: + kfree(buffer.pointer); + return apic_id; +} + +/** + * acpi_get_ioapic_id - Get IOAPIC ID and physical address matching @gsi_base + * @handle: ACPI object for IOAPIC device + * @gsi_base: GSI base to match with + * @phys_addr: Pointer to store physical address of matching IOAPIC record + * + * Walk resources returned by ACPI_MAT method, then ACPI MADT table, to search + * for an ACPI IOAPIC record matching @gsi_base. + * Return IOAPIC id and store physical address in @phys_addr if found a match, + * otherwise return <0. + */ +int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr) +{ + int apic_id; + + apic_id = parse_mat_ioapic_entry(handle, gsi_base, phys_addr); + if (apic_id == -1) + apic_id = parse_madt_ioapic_entry(gsi_base, phys_addr); + + return apic_id; +} +#endif /* CONFIG_ACPI_HOTPLUG_IOAPIC */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index e53822148b6a..24c7aa8b1d20 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -152,6 +152,10 @@ int acpi_map_cpu(acpi_handle handle, int physid, int *pcpu); int acpi_unmap_cpu(int cpu); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ +#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC +int acpi_get_ioapic_id(acpi_handle handle, u32 gsi_base, u64 *phys_addr); +#endif + int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base); int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base); int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base); -- cgit v1.2.3 From a05d59a5673339ef6936d6940cdf68172ce75b9f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 6 Feb 2015 14:30:50 -0500 Subject: tracing: Add condition check to RCU lockdep checks The trace_tlb_flush() tracepoint can be called when a CPU is going offline. When a CPU is offline, RCU is no longer watching that CPU and since the tracepoint is protected by RCU, it must not be called. To prevent the tlb_flush tracepoint from being called when the CPU is offline, it was converted to a TRACE_EVENT_CONDITION where the condition checks if the CPU is online before calling the tracepoint. Unfortunately, this was not enough to stop lockdep from complaining about it. Even though the RCU protected code of the tracepoint will never be called, the condition is hidden within the tracepoint, and even though the condition prevents RCU code from being called, the lockdep checks are outside the tracepoint (this is to test tracepoints even when they are not enabled). 
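As an aside, the hazard is easy to model in plain C: a debug-only validity check that ignores the guard condition fires spuriously, and the fix is to gate the check on the same condition as the guarded call. Everything below is a standalone sketch with made-up names (DEBUG_CHECKS, assert_safe_context), not kernel code:

#include <stdbool.h>
#include <stdio.h>

#define DEBUG_CHECKS 1		/* stands in for CONFIG_LOCKDEP */

static bool cpu_is_online;	/* stands in for cpu_online(cpu) */

static void assert_safe_context(void)
{
	/* stands in for the lockdep check done by rcu_dereference_sched() */
	if (!cpu_is_online)
		fprintf(stderr, "false positive: checked while offline\n");
}

static void trace_event(bool cond)
{
	/* Buggy form: if (DEBUG_CHECKS) assert_safe_context();
	 * Fixed form, mirroring the tracepoint.h hunk below: the
	 * validation honors the same condition as the real call. */
	if (DEBUG_CHECKS && cond)
		assert_safe_context();

	if (cond)
		printf("event fired\n");	/* the real, guarded work */
}

int main(void)
{
	cpu_is_online = false;
	trace_event(cpu_is_online);	/* condition false: no check, no event */
	cpu_is_online = true;
	trace_event(cpu_is_online);	/* condition true: checked and fired */
	return 0;
}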
Even though tracepoints should be checked to be RCU safe when they are not enabled, the condition should still be considered when checking RCU. Link: http://lkml.kernel.org/r/CA+icZUUGiGDoL5NU8RuxKzFjoLjEKRtUWx=JB8B9a0EQv-eGzQ@mail.gmail.com Fixes: 3a630178fd5f "tracing: generate RCU warnings even when tracepoints are disabled" Cc: stable@vger.kernel.org # 3.18+ Acked-by: Dave Hansen Reported-by: Sedat Dilek Tested-by: Sedat Dilek Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index e08e21e5f601..c72851328ca9 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -173,7 +173,7 @@ extern void syscall_unregfunc(void); TP_PROTO(data_proto), \ TP_ARGS(data_args), \ TP_CONDITION(cond),,); \ - if (IS_ENABLED(CONFIG_LOCKDEP)) { \ + if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ rcu_read_lock_sched_notrace(); \ rcu_dereference_sched(__tracepoint_##name.funcs);\ rcu_read_unlock_sched_notrace(); \ -- cgit v1.2.3 From a9b2c06dbef48ed31cff1764c5ce824829106f4f Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 6 Feb 2015 16:04:39 -0500 Subject: tcp: mitigate ACK loops for connections as tcp_request_sock In the SYN_RECV state, where the TCP connection is represented by tcp_request_sock, we now rate-limit SYNACKs in response to a client's retransmitted SYNs: we do not send a SYNACK in response to client SYN if it has been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we last sent a SYNACK in response to a client's retransmitted SYN. This allows the vast majority of legitimate client connections to proceed unimpeded, even for the most aggressive platforms, iOS and MacOS, which actually retransmit SYNs at 1-second intervals several times in a row. They use SYN RTO timeouts following the progression: 1,1,1,1,1,2,4,8,16,32. Reported-by: Avery Fay Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/net/tcp.h | 1 + net/ipv4/tcp_minisocks.c | 6 +++++- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 67309ece0772..bcc828d3b9b9 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -115,6 +115,7 @@ struct tcp_request_sock { u32 rcv_isn; u32 snt_isn; u32 snt_synack; /* synack sent time */ + u32 last_oow_ack_time; /* last SYNACK */ u32 rcv_nxt; /* the ack # by SYNACK. For * FastOpen it's the seq# * after data-in-SYN. diff --git a/include/net/tcp.h b/include/net/tcp.h index b81f45c67b2e..da4196fb78db 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1145,6 +1145,7 @@ static inline void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = tcp_time_stamp; + tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; ireq->tstamp_ok = rx_opt->tstamp_ok; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index bc9216dc9de1..131aa4950d1c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -605,7 +605,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * Reset timer after retransmitting SYNACK, similar to * the idea of fast retransmit in recovery.
*/ - if (!inet_rtx_syn_ack(sk, req)) + if (!tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDSYNRECV, + &tcp_rsk(req)->last_oow_ack_time) && + + !inet_rtx_syn_ack(sk, req)) req->expires = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX) + jiffies; return NULL; -- cgit v1.2.3 From f2b2c582e82429270d5818fbabe653f4359d7024 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 6 Feb 2015 16:04:40 -0500 Subject: tcp: mitigate ACK loops for connections as tcp_sock Ensure that in state ESTABLISHED, where the connection is represented by a tcp_sock, we rate limit dupacks in response to incoming packets (a) with TCP timestamps that fail PAWS checks, or (b) with sequence numbers or ACK numbers that are out of the acceptable window. We do not send a dupack in response to out-of-window packets if it has been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we last sent a dupack in response to an out-of-window packet. There is already a similar (although global) rate-limiting mechanism for "challenge ACKs". When deciding whether to send a challenge ACK, we first consult the new per-connection rate limit, and then the global rate limit. Reported-by: Avery Fay Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp_input.c | 29 ++++++++++++++++++++++------- net/ipv4/tcp_minisocks.c | 1 + 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index bcc828d3b9b9..66d85a80a1ec 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -153,6 +153,7 @@ struct tcp_sock { u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ + u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ u32 tsoffset; /* timestamp offset */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9401aa43b814..8fdd27b17306 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3322,13 +3322,22 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 } /* RFC 5961 7 [ACK Throttling] */ -static void tcp_send_challenge_ack(struct sock *sk) +static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) { /* unprotected vars, we dont care of overwrites */ static u32 challenge_timestamp; static unsigned int challenge_count; - u32 now = jiffies / HZ; + struct tcp_sock *tp = tcp_sk(sk); + u32 now; + + /* First check our per-socket dupack rate limit. */ + if (tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDCHALLENGE, + &tp->last_oow_ack_time)) + return; + + /* Then check the host-wide RFC 5961 rate limit.
*/ + now = jiffies / HZ; if (now != challenge_timestamp) { challenge_timestamp = now; challenge_count = 0; @@ -3424,7 +3433,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (before(ack, prior_snd_una)) { /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - tp->max_window)) { - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, skb); return -1; } goto old_ack; @@ -4993,7 +5002,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); - tcp_send_dupack(sk, skb); + if (!tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDPAWS, + &tp->last_oow_ack_time)) + tcp_send_dupack(sk, skb); goto discard; } /* Reset is accepted even if it did not pass PAWS. */ @@ -5010,7 +5022,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (!th->rst) { if (th->syn) goto syn_challenge; - tcp_send_dupack(sk, skb); + if (!tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDSEQ, + &tp->last_oow_ack_time)) + tcp_send_dupack(sk, skb); } goto discard; } @@ -5026,7 +5041,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) tcp_reset(sk); else - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, skb); goto discard; } @@ -5040,7 +5055,7 @@ syn_challenge: if (syn_inerr) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, skb); goto discard; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 131aa4950d1c..98a840561ec8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -467,6 +467,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_enable_early_retrans(newtp); newtp->tlp_high_seq = 0; newtp->lsndtime = treq->snt_synack; + newtp->last_oow_ack_time = 0; newtp->total_retrans = req->num_retrans; /* So many TCP implementations out there (incorrectly) count the -- cgit v1.2.3 From 4fb17a6091674f469e8ac85dc770fbf9a9ba7cc8 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 6 Feb 2015 16:04:41 -0500 Subject: tcp: mitigate ACK loops for connections as tcp_timewait_sock Ensure that in state FIN_WAIT2 or TIME_WAIT, where the connection is represented by a tcp_timewait_sock, we rate limit dupacks in response to incoming packets (a) with TCP timestamps that fail PAWS checks, or (b) with sequence numbers that are out of the acceptable window. We do not send a dupack in response to out-of-window packets if it has been less than sysctl_tcp_invalid_ratelimit (default 500ms) since we last sent a dupack in response to an out-of-window packet. Reported-by: Avery Fay Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 4 ++++ net/ipv4/tcp_minisocks.c | 29 ++++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 66d85a80a1ec..1a7adb411647 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -342,6 +342,10 @@ struct tcp_timewait_sock { u32 tw_rcv_wnd; u32 tw_ts_offset; u32 tw_ts_recent; + + /* The time we sent the last out-of-window ACK: */ + u32 tw_last_oow_ack_time; + long tw_ts_recent_stamp; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 98a840561ec8..dd11ac7798c6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -58,6 +58,25 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) return seq == e_win && seq == end_seq; } +static enum tcp_tw_status +tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, + const struct sk_buff *skb, int mib_idx) +{ + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + + if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx, + &tcptw->tw_last_oow_ack_time)) { + /* Send ACK. Note, we do not put the bucket, + * it will be released by caller. + */ + return TCP_TW_ACK; + } + + /* We are rate-limiting, so just release the tw sock and drop skb. */ + inet_twsk_put(tw); + return TCP_TW_SUCCESS; +} + /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN @@ -116,7 +135,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt, tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) - return TCP_TW_ACK; + return tcp_timewait_check_oow_rate_limit( + tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2); if (th->rst) goto kill; @@ -250,10 +270,8 @@ kill: inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, TCP_TIMEWAIT_LEN); - /* Send ACK. Note, we do not put the bucket, - * it will be released by caller. - */ - return TCP_TW_ACK; + return tcp_timewait_check_oow_rate_limit( + tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } inet_twsk_put(tw); return TCP_TW_SUCCESS; @@ -289,6 +307,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; + tcptw->tw_last_oow_ack_time = 0; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { -- cgit v1.2.3 From 096a4cfa5807aa89c78ce12309c0b1c10cf88184 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 6 Feb 2015 18:54:19 +0100 Subject: net: fix a typo in skb_checksum_validate_zero_check Remove trailing underscore. Signed-off-by: Sabrina Dubroca Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 111e665455c3..1bb36edb66b9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3072,7 +3072,7 @@ static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto) #define skb_checksum_validate_zero_check(skb, proto, check, \ compute_pseudo) \ - __skb_checksum_validate_(skb, proto, true, true, check, compute_pseudo) + __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo) #define skb_checksum_simple_validate(skb) \ __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo) -- cgit v1.2.3 From 567e4b79731c352a17d73c483959f795d3593e03 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Feb 2015 12:59:01 -0800 Subject: net: rfs: add hash collision detection Receive Flow Steering is a nice solution but suffers from hash collisions when a mix of connected and unconnected traffic is received on the host, once the flow hash table is populated. Also, clearing the flow in inet_release() makes RFS not very good for short-lived flows, as many packets can follow close() (FIN, ACK packets, ...). This patch extends the information stored in the global hash table to include not only the cpu number, but also the upper part of the hash value. I use a 32bit value, and dynamically split it in two parts. For hosts with fewer than 64 possible cpus, this gives 6 bits for the cpu number, and 26 (32-6) bits for the upper part of the hash. Since hash bucket selection uses the low order bits of the hash, we have a full hash match if /proc/sys/net/core/rps_sock_flow_entries is big enough. If the hash found in the flow table does not match, we fall back to RPS (if it is enabled for the rxqueue). This means that a packet for a non-connected flow can avoid the IPI to an unrelated/victim CPU. This also means we no longer have to clear the table at socket close time, and this helps short-lived flow performance. Signed-off-by: Eric Dumazet Acked-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/tun.c | 5 +---- include/linux/netdevice.h | 34 ++++++++++++++++---------------- include/net/sock.h | 24 +---------------------- net/core/dev.c | 48 ++++++++++++++++++++++++++-------------------- net/core/sysctl_net_core.c | 2 +- net/ipv4/af_inet.c | 2 -- 6 files changed, 47 insertions(+), 68 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index ad7d3d5f3ee5..857dca47bf80 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -256,7 +256,6 @@ static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e) { tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n", e->rxhash, e->queue_index); - sock_rps_reset_flow_hash(e->rps_rxhash); hlist_del_rcu(&e->hash_link); kfree_rcu(e, rcu); --tun->flow_count; @@ -373,10 +372,8 @@ unlock: */ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) { - if (unlikely(e->rps_rxhash != hash)) { - sock_rps_reset_flow_hash(e->rps_rxhash); + if (unlikely(e->rps_rxhash != hash)) e->rps_rxhash = hash; - } } /* We try to identify a flow through its rxhash first.
The reason that diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce784d5018e0..ab3b7cef4638 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -644,39 +644,39 @@ struct rps_dev_flow_table { /* * The rps_sock_flow_table contains mappings of flows to the last CPU * on which they were processed by the application (set in recvmsg). + * Each entry is a 32bit value. Upper part is the high order bits + * of flow hash, lower part is cpu number. + * rps_cpu_mask is used to partition the space, depending on number of + * possible cpus : rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1 + * For example, if 64 cpus are possible, rps_cpu_mask = 0x3f, + * meaning we use 32-6=26 bits for the hash. */ struct rps_sock_flow_table { - unsigned int mask; - u16 ents[0]; + u32 mask; + u32 ents[0]; }; -#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_sock_flow_table) + \ - ((_num) * sizeof(u16))) +#define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) #define RPS_NO_CPU 0xffff +extern u32 rps_cpu_mask; +extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; + static inline void rps_record_sock_flow(struct rps_sock_flow_table *table, u32 hash) { if (table && hash) { - unsigned int cpu, index = hash & table->mask; + unsigned int index = hash & table->mask; + u32 val = hash & ~rps_cpu_mask; /* We only give a hint, preemption can change cpu under us */ - cpu = raw_smp_processor_id(); + val |= raw_smp_processor_id(); - if (table->ents[index] != cpu) - table->ents[index] = cpu; + if (table->ents[index] != val) + table->ents[index] = val; } } -static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table, - u32 hash) -{ - if (table && hash) - table->ents[hash & table->mask] = RPS_NO_CPU; -} - -extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; - #ifdef CONFIG_RFS_ACCEL bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); diff --git a/include/net/sock.h b/include/net/sock.h index d28b8fededd6..e13824570b0f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -857,18 +857,6 @@ static inline void sock_rps_record_flow_hash(__u32 hash) #endif } -static inline void sock_rps_reset_flow_hash(__u32 hash) -{ -#ifdef CONFIG_RPS - struct rps_sock_flow_table *sock_flow_table; - - rcu_read_lock(); - sock_flow_table = rcu_dereference(rps_sock_flow_table); - rps_reset_sock_flow(sock_flow_table, hash); - rcu_read_unlock(); -#endif -} - static inline void sock_rps_record_flow(const struct sock *sk) { #ifdef CONFIG_RPS @@ -876,28 +864,18 @@ static inline void sock_rps_record_flow(const struct sock *sk) #endif } -static inline void sock_rps_reset_flow(const struct sock *sk) -{ -#ifdef CONFIG_RPS - sock_rps_reset_flow_hash(sk->sk_rxhash); -#endif -} - static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_RPS - if (unlikely(sk->sk_rxhash != skb->hash)) { - sock_rps_reset_flow(sk); + if (unlikely(sk->sk_rxhash != skb->hash)) sk->sk_rxhash = skb->hash; - } #endif } static inline void sock_rps_reset_rxhash(struct sock *sk) { #ifdef CONFIG_RPS - sock_rps_reset_flow(sk); sk->sk_rxhash = 0; #endif } diff --git a/net/core/dev.c b/net/core/dev.c index a3a96ffc67f4..8be38675e1a8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3030,6 +3030,8 @@ static inline void ____napi_schedule(struct softnet_data *sd, /* One global table that all flow-based protocols share. 
*/ struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); +u32 rps_cpu_mask __read_mostly; +EXPORT_SYMBOL(rps_cpu_mask); struct static_key rps_needed __read_mostly; @@ -3086,16 +3088,17 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { - struct netdev_rx_queue *rxqueue; - struct rps_map *map; + const struct rps_sock_flow_table *sock_flow_table; + struct netdev_rx_queue *rxqueue = dev->_rx; struct rps_dev_flow_table *flow_table; - struct rps_sock_flow_table *sock_flow_table; + struct rps_map *map; int cpu = -1; - u16 tcpu; + u32 tcpu; u32 hash; if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->real_num_rx_queues)) { WARN_ONCE(dev->real_num_rx_queues > 1, "%s received packet on queue %u, but number " @@ -3103,39 +3106,40 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, dev->name, index, dev->real_num_rx_queues); goto done; } - rxqueue = dev->_rx + index; - } else - rxqueue = dev->_rx; + rxqueue += index; + } + /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ + + flow_table = rcu_dereference(rxqueue->rps_flow_table); map = rcu_dereference(rxqueue->rps_map); - if (map) { - if (map->len == 1 && - !rcu_access_pointer(rxqueue->rps_flow_table)) { - tcpu = map->cpus[0]; - if (cpu_online(tcpu)) - cpu = tcpu; - goto done; - } - } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) { + if (!flow_table && !map) goto done; - } skb_reset_network_header(skb); hash = skb_get_hash(skb); if (!hash) goto done; - flow_table = rcu_dereference(rxqueue->rps_flow_table); sock_flow_table = rcu_dereference(rps_sock_flow_table); if (flow_table && sock_flow_table) { - u16 next_cpu; struct rps_dev_flow *rflow; + u32 next_cpu; + u32 ident; + + /* First check into global flow table if there is a match */ + ident = sock_flow_table->ents[hash & sock_flow_table->mask]; + if ((ident ^ hash) & ~rps_cpu_mask) + goto try_rps; + next_cpu = ident & rps_cpu_mask; + + /* OK, now we know there is a match, + * we can look at the local (per receive queue) flow table + */ rflow = &flow_table->flows[hash & flow_table->mask]; tcpu = rflow->cpu; - next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask]; - /* * If the desired CPU (where last recvmsg was done) is * different from current CPU (one in the rx-queue flow @@ -3162,6 +3166,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } } +try_rps: + if (map) { tcpu = map->cpus[reciprocal_scale(hash, map->len)]; if (cpu_online(tcpu)) { diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index fde21d19e61b..7a31be5e361f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -65,7 +65,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, mutex_unlock(&sock_flow_mutex); return -ENOMEM; } - + rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a44773c8346c..d2e49baaff63 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -395,8 +395,6 @@ int inet_release(struct socket *sock) if (sk) { long timeout; - sock_rps_reset_flow(sk); - /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); -- cgit v1.2.3 From 93c1af6ca94c1e763efba76a127b5c135e3d23a6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Feb 2015 
20:39:13 -0800 Subject: net:rfs: adjust table size checking Make sure root user does not try something stupid. Also make sure mask field in struct rps_sock_flow_table does not share a cache line with the potentially often dirtied flow table. Signed-off-by: Eric Dumazet Fixes: 567e4b79731c ("net: rfs: add hash collision detection") Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 ++- net/core/sysctl_net_core.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ab3b7cef4638..d115256ed5a2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -653,7 +653,8 @@ struct rps_dev_flow_table { */ struct rps_sock_flow_table { u32 mask; - u32 ents[0]; + + u32 ents[0] ____cacheline_aligned_in_smp; }; #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7a31be5e361f..eaa51ddf2368 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -52,7 +52,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, if (write) { if (size) { - if (size > 1<<30) { + if (size > 1<<29) { /* Enforce limit to prevent overflow */ mutex_unlock(&sock_flow_mutex); return -EINVAL; -- cgit v1.2.3 From 35f05dabf95ac3ebc4c15bafd6833f7a3046e66f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 8 Feb 2015 11:49:34 +0200 Subject: IB/mlx4: Reset flow support for IB kernel ULPs The driver exposes interfaces that directly relate to HW state. Upon fatal error, consumers of these interfaces (ULPs) that rely on completion of all their posted work-request could hang, thereby introducing dependencies in shutdown order. To prevent this from happening, we manage the relevant resources (CQs, QPs) that are used by the device. Upon a fatal error, we now generate simulated completions for outstanding WQEs that were not completed at the time the HW was reset. It includes invoking the completion event handler for all involved CQs so that the ULPs will poll those CQs. When polled we return simulated CQEs with IB_WC_WR_FLUSH_ERR return code enabling ULPs to clean up their resources and not wait forever for completions upon receiving remove_one. The above change requires an extra check in the data path to make sure that when device is in error state, the simulated CQEs will be returned and no further WQEs will be posted. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. 
Miller --- drivers/infiniband/hw/mlx4/cq.c | 57 ++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx4/main.c | 64 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx4/mlx4_ib.h | 9 +++++ drivers/infiniband/hw/mlx4/qp.c | 59 +++++++++++++++++++++++++++++---- drivers/infiniband/hw/mlx4/srq.c | 8 +++++ include/linux/mlx4/device.h | 2 ++ 6 files changed, 193 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index a3b70f6c4035..543ecdd8667b 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -188,6 +188,8 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector spin_lock_init(&cq->lock); cq->resize_buf = NULL; cq->resize_umem = NULL; + INIT_LIST_HEAD(&cq->send_qp_list); + INIT_LIST_HEAD(&cq->recv_qp_list); if (context) { struct mlx4_ib_create_cq ucmd; @@ -594,6 +596,55 @@ static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct return 0; } +static void mlx4_ib_qp_sw_comp(struct mlx4_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled, int is_send) +{ + struct mlx4_ib_wq *wq; + unsigned cur; + int i; + + wq = is_send ? &qp->sq : &qp->rq; + cur = wq->head - wq->tail; + + if (cur == 0) + return; + + for (i = 0; i < cur && *npolled < num_entries; i++) { + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX4_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + (*npolled)++; + wc->qp = &qp->ibqp; + wc++; + } +} + +static void mlx4_ib_poll_sw_comp(struct mlx4_ib_cq *cq, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx4_ib_qp *qp; + + *npolled = 0; + /* Find uncompleted WQEs belonging to that cq and retrun + * simulated FLUSH_ERR completions + */ + list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) { + mlx4_ib_qp_sw_comp(qp, num_entries, wc, npolled, 1); + if (*npolled >= num_entries) + goto out; + } + + list_for_each_entry(qp, &cq->recv_qp_list, cq_recv_list) { + mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 0); + if (*npolled >= num_entries) + goto out; + } + +out: + return; +} + static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, struct mlx4_ib_qp **cur_qp, struct ib_wc *wc) @@ -836,8 +887,13 @@ int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) unsigned long flags; int npolled; int err = 0; + struct mlx4_ib_dev *mdev = to_mdev(cq->ibcq.device); spin_lock_irqsave(&cq->lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + mlx4_ib_poll_sw_comp(cq, num_entries, wc, &npolled); + goto out; + } for (npolled = 0; npolled < num_entries; ++npolled) { err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled); @@ -847,6 +903,7 @@ int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) mlx4_cq_set_ci(&cq->mcq); +out: spin_unlock_irqrestore(&cq->lock, flags); if (err == 0 || err == -EAGAIN) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 3140da518a07..eb8e215f1613 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2308,6 +2308,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); + INIT_LIST_HEAD(&ibdev->qp_list); + spin_lock_init(&ibdev->reset_flow_resource_lock); if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED && ib_num_ports) { @@ -2622,6 +2624,67 @@ out: return; } +static void 
mlx4_ib_handle_catas_error(struct mlx4_ib_dev *ibdev) +{ + struct mlx4_ib_qp *mqp; + unsigned long flags_qp; + unsigned long flags_cq; + struct mlx4_ib_cq *send_mcq, *recv_mcq; + struct list_head cq_notify_list; + struct mlx4_cq *mcq; + unsigned long flags; + + pr_warn("mlx4_ib_handle_catas_error was started\n"); + INIT_LIST_HEAD(&cq_notify_list); + + /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ + spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); + + list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { + spin_lock_irqsave(&mqp->sq.lock, flags_qp); + if (mqp->sq.tail != mqp->sq.head) { + send_mcq = to_mcq(mqp->ibqp.send_cq); + spin_lock_irqsave(&send_mcq->lock, flags_cq); + if (send_mcq->mcq.comp && + mqp->ibqp.send_cq->comp_handler) { + if (!send_mcq->mcq.reset_notify_added) { + send_mcq->mcq.reset_notify_added = 1; + list_add_tail(&send_mcq->mcq.reset_notify, + &cq_notify_list); + } + } + spin_unlock_irqrestore(&send_mcq->lock, flags_cq); + } + spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); + /* Now, handle the QP's receive queue */ + spin_lock_irqsave(&mqp->rq.lock, flags_qp); + /* no handling is needed for SRQ */ + if (!mqp->ibqp.srq) { + if (mqp->rq.tail != mqp->rq.head) { + recv_mcq = to_mcq(mqp->ibqp.recv_cq); + spin_lock_irqsave(&recv_mcq->lock, flags_cq); + if (recv_mcq->mcq.comp && + mqp->ibqp.recv_cq->comp_handler) { + if (!recv_mcq->mcq.reset_notify_added) { + recv_mcq->mcq.reset_notify_added = 1; + list_add_tail(&recv_mcq->mcq.reset_notify, + &cq_notify_list); + } + } + spin_unlock_irqrestore(&recv_mcq->lock, + flags_cq); + } + } + spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); + } + + list_for_each_entry(mcq, &cq_notify_list, reset_notify) { + mcq->comp(mcq); + } + spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); + pr_warn("mlx4_ib_handle_catas_error ended\n"); +} + static void handle_bonded_port_state_event(struct work_struct *work) { struct ib_event_work *ew = @@ -2701,6 +2764,7 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; + mlx4_ib_handle_catas_error(ibdev); break; case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 721540c9163d..f829fd935b79 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -110,6 +110,9 @@ struct mlx4_ib_cq { struct mutex resize_mutex; struct ib_umem *umem; struct ib_umem *resize_umem; + /* List of qps that it serves.*/ + struct list_head send_qp_list; + struct list_head recv_qp_list; }; struct mlx4_ib_mr { @@ -300,6 +303,9 @@ struct mlx4_ib_qp { struct mlx4_roce_smac_vlan_info pri; struct mlx4_roce_smac_vlan_info alt; u64 reg_id; + struct list_head qps_list; + struct list_head cq_recv_list; + struct list_head cq_send_list; }; struct mlx4_ib_srq { @@ -535,6 +541,9 @@ struct mlx4_ib_dev { /* lock when destroying qp1_proxy and getting netdev events */ struct mutex qp1_proxy_lock[MLX4_MAX_PORTS]; u8 bond_next_port; + /* protect resources needed as part of reset flow */ + spinlock_t reset_flow_resource_lock; + struct list_head qp_list; }; struct ib_event_work { diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 792f9dc86ada..dfc6ca128a7e 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -46,6 +46,11 @@ #include "mlx4_ib.h" #include "user.h" +static void mlx4_ib_lock_cqs(struct 
mlx4_ib_cq *send_cq, + struct mlx4_ib_cq *recv_cq); +static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, + struct mlx4_ib_cq *recv_cq); + enum { MLX4_IB_ACK_REQ_FREQ = 8, }; @@ -618,6 +623,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct mlx4_ib_sqp *sqp; struct mlx4_ib_qp *qp; enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; + struct mlx4_ib_cq *mcq; + unsigned long flags; /* When tunneling special qps, we use a plain UD qp */ if (sqpn) { @@ -828,6 +835,24 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, qp->mqp.event = mlx4_ib_qp_event; if (!*caller_qp) *caller_qp = qp; + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq)); + /* Maintain device to QPs access, needed for further handling + * via reset flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling + * via reset flow + */ + mcq = to_mcq(init_attr->send_cq); + list_add_tail(&qp->cq_send_list, &mcq->send_qp_list); + mcq = to_mcq(init_attr->recv_cq); + list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list); + mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq)); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); return 0; err_qpn: @@ -886,13 +911,13 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv __acquires(&send_cq->lock) __acquires(&recv_cq->lock) { if (send_cq == recv_cq) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { - spin_lock_irq(&recv_cq->lock); + spin_lock(&recv_cq->lock); spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); } } @@ -902,13 +927,13 @@ static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *re { if (send_cq == recv_cq) { __release(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_unlock(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else { spin_unlock(&send_cq->lock); - spin_unlock_irq(&recv_cq->lock); + spin_unlock(&recv_cq->lock); } } @@ -953,6 +978,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, int is_user) { struct mlx4_ib_cq *send_cq, *recv_cq; + unsigned long flags; if (qp->state != IB_QPS_RESET) { if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), @@ -984,8 +1010,13 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, get_cqs(qp, &send_cq, &recv_cq); + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); mlx4_ib_lock_cqs(send_cq, recv_cq); + /* del from lists under both locks above to protect reset flow paths */ + list_del(&qp->qps_list); + list_del(&qp->cq_send_list); + list_del(&qp->cq_recv_list); if (!is_user) { __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, qp->ibqp.srq ? 
to_msrq(qp->ibqp.srq): NULL); @@ -996,6 +1027,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, mlx4_qp_remove(dev->dev, &qp->mqp); mlx4_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); mlx4_qp_free(dev->dev, &qp->mqp); @@ -2618,8 +2650,15 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, __be32 uninitialized_var(lso_hdr_sz); __be32 blh; int i; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); spin_lock_irqsave(&qp->sq.lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } ind = qp->sq_next_wqe; @@ -2917,10 +2956,18 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, int ind; int max_gs; int i; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); max_gs = qp->rq.max_gs; spin_lock_irqsave(&qp->rq.lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; ++nreq, wr = wr->next) { diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 62d9285300af..dce5dfe3a70e 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -316,8 +316,15 @@ int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, int err = 0; int nreq; int i; + struct mlx4_ib_dev *mdev = to_mdev(ibsrq->device); spin_lock_irqsave(&srq->lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } for (nreq = 0; wr; ++nreq, wr = wr->next) { if (unlikely(wr->num_sge > srq->msrq.max_gs)) { @@ -362,6 +369,7 @@ int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, *srq->db.db = cpu_to_be32(srq->wqe_ctr); } +out: spin_unlock_irqrestore(&srq->lock, flags); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index c116cb02475c..e4ebff7e9d02 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -689,6 +689,8 @@ struct mlx4_cq { void (*comp)(struct mlx4_cq *); void *priv; } tasklet_ctx; + int reset_notify_added; + struct list_head reset_notify; }; struct mlx4_qp { -- cgit v1.2.3 From 3cf385713460eb2bb4cb7ceb8ed89833b00b594b Mon Sep 17 00:00:00 2001 From: Antonios Motakis Date: Tue, 6 Jan 2015 11:15:11 +0100 Subject: ARM: 8256/1: driver coamba: add device binding path 'driver_override' As already demonstrated with PCI [1] and the platform bus [2], a driver_override property in sysfs can be used to bypass the id matching of a device to an AMBA driver. This can be used by VFIO to bind to any AMBA device requested by the user.
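As a standalone illustration (made-up types, not the AMBA API), the matching policy this patch implements reduces to: when an override string is present, only a name match binds; otherwise the id table decides. A minimal sketch:

#include <stdio.h>
#include <string.h>

struct dev { unsigned int periphid; const char *driver_override; };
struct drv { const char *name; unsigned int id, mask; };

static int match(const struct dev *d, const struct drv *v)
{
	/* When driver_override is set, only bind to the matching driver */
	if (d->driver_override)
		return strcmp(d->driver_override, v->name) == 0;
	/* Otherwise, id-table matching; mask == 0 models "no id entry" */
	return v->mask && (d->periphid & v->mask) == v->id;
}

int main(void)
{
	struct drv uart = { "pl011", 0x00041011, 0x000fffff };
	struct drv vfio = { "vfio-amba", 0, 0 };
	struct dev d = { 0x00041011, NULL };

	printf("no override:   pl011=%d vfio=%d\n",
	       match(&d, &uart), match(&d, &vfio));
	d.driver_override = "vfio-amba";
	printf("with override: pl011=%d vfio=%d\n",
	       match(&d, &uart), match(&d, &vfio));
	return 0;
}

Userspace then drives binding exactly as the ABI document below describes, e.g. echo vfio-amba > /sys/bus/amba/devices/.../driver_override.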
[1] http://lists-archives.com/linux-kernel/28030441-pci-introduce-new-device-binding-path-using-pci_dev-driver_override.html [2] https://www.redhat.com/archives/libvir-list/2014-April/msg00382.html Signed-off-by: Antonios Motakis Reviewed-by: Kim Phillips Signed-off-by: Russell King --- Documentation/ABI/testing/sysfs-bus-amba | 20 ++++++++++++++ drivers/amba/bus.c | 47 ++++++++++++++++++++++++++++++++ include/linux/amba/bus.h | 1 + 3 files changed, 68 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-amba (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-amba b/Documentation/ABI/testing/sysfs-bus-amba new file mode 100644 index 000000000000..e7b54677cfbe --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-amba @@ -0,0 +1,20 @@ +What: /sys/bus/amba/devices/.../driver_override +Date: September 2014 +Contact: Antonios Motakis +Description: + This file allows the driver for a device to be specified which + will override standard OF, ACPI, ID table, and name matching. + When specified, only a driver with a name matching the value + written to driver_override will have an opportunity to bind to + the device. The override is specified by writing a string to the + driver_override file (echo vfio-amba > driver_override) and may + be cleared with an empty string (echo > driver_override). + This returns the device to standard matching rules binding. + Writing to driver_override does not automatically unbind the + device from its current driver or make any attempt to + automatically load the specified driver. If no driver with a + matching name is currently loaded in the kernel, the device will + not bind to any driver. This also allows devices to opt-out of + driver binding using a driver_override name such as "none". + Only a single driver may be specified in the override, there is + no support for parsing delimiters. diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index 52ddd9fbb55e..f0099360039e 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -43,6 +44,10 @@ static int amba_match(struct device *dev, struct device_driver *drv) struct amba_device *pcdev = to_amba_device(dev); struct amba_driver *pcdrv = to_amba_driver(drv); + /* When driver_override is set, only bind to the matching driver */ + if (pcdev->driver_override) + return !strcmp(pcdev->driver_override, drv->name); + return amba_lookup(pcdrv->id_table, pcdev) != NULL; } @@ -59,6 +64,47 @@ static int amba_uevent(struct device *dev, struct kobj_uevent_env *env) return retval; } +static ssize_t driver_override_show(struct device *_dev, + struct device_attribute *attr, char *buf) +{ + struct amba_device *dev = to_amba_device(_dev); + + if (!dev->driver_override) + return 0; + + return sprintf(buf, "%s\n", dev->driver_override); +} + +static ssize_t driver_override_store(struct device *_dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct amba_device *dev = to_amba_device(_dev); + char *driver_override, *old = dev->driver_override, *cp; + + if (count > PATH_MAX) + return -EINVAL; + + driver_override = kstrndup(buf, count, GFP_KERNEL); + if (!driver_override) + return -ENOMEM; + + cp = strchr(driver_override, '\n'); + if (cp) + *cp = '\0'; + + if (strlen(driver_override)) { + dev->driver_override = driver_override; + } else { + kfree(driver_override); + dev->driver_override = NULL; + } + + kfree(old); + + return count; +} + #define amba_attr_func(name,fmt,arg...) 
\ static ssize_t name##_show(struct device *_dev, \ struct device_attribute *attr, char *buf) \ @@ -81,6 +127,7 @@ amba_attr_func(resource, "\t%016llx\t%016llx\t%016lx\n", static struct device_attribute amba_dev_attrs[] = { __ATTR_RO(id), __ATTR_RO(resource), + __ATTR_RW(driver_override), __ATTR_NULL, }; diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h index 0ab5f8e0dea2..50fc66868402 100644 --- a/include/linux/amba/bus.h +++ b/include/linux/amba/bus.h @@ -33,6 +33,7 @@ struct amba_device { struct clk *pclk; unsigned int periphid; unsigned int irq[AMBA_NR_IRQS]; + char *driver_override; }; struct amba_driver { -- cgit v1.2.3 From 6ee8e25fc3e916193bce4ebb43d5439e1e2144ab Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 Feb 2015 14:08:32 -0800 Subject: fsnotify: fix handling of renames in audit Commit e9fd702a58c4 ("audit: convert audit watches to use fsnotify instead of inotify") broke handling of renames in audit. The audit code wants to update the inode number of the inode corresponding to a watched name in a directory. When something gets renamed into a directory to a watched name, inotify previously passed the moved inode to the audit code; the new fsnotify code, however, passes the directory inode where the change happened. That confuses audit and it starts watching the parent directory instead of the file in the directory. This can be observed for example by doing: cd /tmp touch foo bar auditctl -w /tmp/foo touch foo mv bar foo touch foo In the audit log we see events like: type=CONFIG_CHANGE msg=audit(1423563584.155:90): auid=1000 ses=2 op="updated rules" path="/tmp/foo" key=(null) list=4 res=1 ... type=PATH msg=audit(1423563584.155:91): item=2 name="bar" inode=1046884 dev=08:02 mode=0100644 ouid=0 ogid=0 rdev=00:00 nametype=DELETE type=PATH msg=audit(1423563584.155:91): item=3 name="foo" inode=1046842 dev=08:02 mode=0100644 ouid=0 ogid=0 rdev=00:00 nametype=DELETE type=PATH msg=audit(1423563584.155:91): item=4 name="foo" inode=1046884 dev=08:02 mode=0100644 ouid=0 ogid=0 rdev=00:00 nametype=CREATE ... and that's it - we see an event for the first touch after creating the audit rule and events for the rename, but we don't see any event for the last touch. However, we start seeing events for unrelated stuff happening in /tmp. Fix the problem by passing the moved inode as data in the FS_MOVED_FROM and FS_MOVED_TO events instead of the directory where the change happens. This doesn't introduce any new problems because no one besides audit_watch.c cares about the passed value: fs/notify/fanotify/fanotify.c cares only about FSNOTIFY_EVENT_PATH events. fs/notify/dnotify/dnotify.c doesn't care about the passed 'data' value at all. fs/notify/inotify/inotify_fsnotify.c uses 'data' only for FSNOTIFY_EVENT_PATH. kernel/audit_tree.c doesn't care about the passed 'data' at all. kernel/audit_watch.c expects the moved inode as 'data'.
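To make the dependency concrete, here is a standalone sketch (illustrative types and names, not the fsnotify API) of an audit-style watch update: if the event producer hands the handler the directory inode instead of the moved inode, the watch silently retargets the directory:

#include <stdio.h>
#include <string.h>

struct inode { unsigned long ino; };
struct watch { const char *name; unsigned long ino; };

/* Handler for a "renamed into watched dir" event. 'data' is whatever the
 * producer passed: the moved inode after the fix, the directory before. */
static void update_watch(struct watch *w, const char *name, struct inode *data)
{
	if (strcmp(w->name, name) == 0)
		w->ino = data->ino;	/* must be the moved inode's number */
}

int main(void)
{
	struct inode dir = { .ino = 100 }, bar = { .ino = 1046884 };
	struct watch w = { .name = "foo", .ino = 1046842 };

	update_watch(&w, "foo", &dir);	/* buggy producer: watch -> directory */
	printf("after buggy event: watching ino=%lu\n", w.ino);

	w.ino = 1046842;
	update_watch(&w, "foo", &bar);	/* fixed producer: watch -> moved file */
	printf("after fixed event: watching ino=%lu\n", w.ino);
	return 0;
}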
Fixes: e9fd702a58c49db ("audit: convert audit watches to use fsnotify instead of inotify") Signed-off-by: Jan Kara Cc: Paul Moore Cc: Eric Paris Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fsnotify.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index 1c804b057fb1..7ee1774edee5 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -101,8 +101,10 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, new_dir_mask |= FS_ISDIR; } - fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name, fs_cookie); - fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name, fs_cookie); + fsnotify(old_dir, old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_name, + fs_cookie); + fsnotify(new_dir, new_dir_mask, source, FSNOTIFY_EVENT_INODE, new_name, + fs_cookie); if (target) fsnotify_link_count(target); -- cgit v1.2.3 From ccaafd7fd039aebc9359a9799f8558b01f1c2adc Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 10 Feb 2015 14:09:35 -0800 Subject: mm: don't use compound_head() in virt_to_head_page() compound_head() is implemented with the assumption that there could be a race condition when checking the tail flag. This assumption is only true when we try to access an arbitrarily positioned struct page. The situation in which virt_to_head_page() is called is a different case. We call virt_to_head_page() only in the range of allocated pages, so there is no race condition on the tail flag. In this case, we don't need to handle the race condition and we can reduce overhead slightly. This patch implements compound_head_fast() which is similar to compound_head() except for the tail flag race handling. And then, virt_to_head_page() uses this optimized function to improve performance. I saw a 1.8% win in a fast-path loop over kmem_cache_alloc/free (14.063 ns -> 13.810 ns) if the target object is on a tail page. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index dd5ea3016fc4..2c6fd3c5424a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -446,6 +446,12 @@ static inline struct page *compound_head_by_tail(struct page *tail) return tail; } +/* + * Since either compound page could be dismantled asynchronously in THP + * or we access asynchronously arbitrary positioned struct page, there + * would be tail flag race. To handle this race, we should call + * smp_rmb() before checking tail flag. compound_head_by_tail() did it. + */ static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) @@ -453,6 +459,18 @@ static inline struct page *compound_head(struct page *page) return page; } +/* + * If we access compound page synchronously such as access to + * allocated page, there is no need to handle tail flag race, so we can + * check tail flag directly without any synchronization primitive.
+ */ +static inline struct page *compound_head_fast(struct page *page) +{ + if (unlikely(PageTail(page))) + return page->first_page; + return page; +} + /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test @@ -531,7 +549,14 @@ static inline void get_page(struct page *page) static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); - return compound_head(page); + + /* + * We don't need to worry about synchronization of tail flag + * when we call virt_to_head_page() since it is only called for + * already allocated page and this page won't be freed until + * this virt_to_head_page() is finished. So use _fast variant. + */ + return compound_head_fast(page); } /* -- cgit v1.2.3 From c8d78c1823f46519473949d33f0d1d33fe21ea16 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:46 -0800 Subject: mm: replace remap_file_pages() syscall with emulation remap_file_pages(2) was invented to be able to efficiently map parts of a huge file into a limited 32-bit virtual address space, such as in database workloads. Nonlinear mappings are a pain to support and it seems there are no legitimate use-cases nowadays since 64-bit systems are widely available. Let's drop it and get rid of all this special-cased code. The patch replaces the syscall with an emulation which creates a new VMA on each remap_file_pages(), unless it can be merged with an adjacent one. I didn't find *any* real code that uses remap_file_pages(2) to test emulation impact on. I've checked Debian code search and source of all packages in ALT Linux. No real users: libc wrappers, mentions in strace, gdb, valgrind and this kind of stuff. There are a few basic tests in LTP for the syscall. They work just fine with emulation. To test the performance impact, I've written a small test case which demonstrates pretty much the worst-case scenario: map a 4G shmfs file, write the pgoff of the page to the beginning of every page, remap the pages in reverse order, read every page. The test creates 1 million VMAs if emulation is in use, so I had to set vm.max_map_count to 1100000 to avoid -ENOMEM. Before: 23.3 ( +- 4.31% ) seconds After: 43.9 ( +- 0.85% ) seconds Slowdown: 1.88x I believe we can live with that. Test case: #define _GNU_SOURCE #include #include #include #include #define MB (1024UL * 1024) #define SIZE (4096 * MB) int main(int argc, char **argv) { unsigned long *p; long i, pass; for (pass = 0; pass < 10; pass++) { p = mmap(NULL, SIZE, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (p == MAP_FAILED) { perror("mmap"); return -1; } for (i = 0; i < SIZE / 4096; i++) p[i * 4096 / sizeof(*p)] = i; for (i = 0; i < SIZE / 4096; i++) { if (remap_file_pages(p + i * 4096 / sizeof(*p), 4096, 0, (SIZE - 4096 * (i + 1)) >> 12, 0)) { perror("remap_file_pages"); return -1; } } for (i = SIZE / 4096 - 1; i >= 0; i--) assert(p[i * 4096 / sizeof(*p)] == SIZE / 4096 - i - 1); munmap(p, SIZE); } return 0; } [akpm@linux-foundation.org: fix spello] [sasha.levin@oracle.com: initialize populate before usage] [sasha.levin@oracle.com: grab file ref to prevent race while mmaping] Signed-off-by: "Kirill A.
Shutemov" Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Dave Jones Cc: Linus Torvalds Cc: Armin Rigo Signed-off-by: Sasha Levin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/remap_file_pages.txt | 7 +- include/linux/fs.h | 8 +- mm/Makefile | 2 +- mm/fremap.c | 283 ---------------------------------- mm/mmap.c | 69 +++++++++ mm/nommu.c | 8 - 6 files changed, 79 insertions(+), 298 deletions(-) delete mode 100644 mm/fremap.c (limited to 'include/linux') diff --git a/Documentation/vm/remap_file_pages.txt b/Documentation/vm/remap_file_pages.txt index 560e4363a55d..f609142f406a 100644 --- a/Documentation/vm/remap_file_pages.txt +++ b/Documentation/vm/remap_file_pages.txt @@ -18,10 +18,9 @@ on 32-bit systems to map files bigger than can linearly fit into 32-bit virtual address space. This use-case is not critical anymore since 64-bit systems are widely available. -The plan is to deprecate the syscall and replace it with an emulation. -The emulation will create new VMAs instead of nonlinear mappings. It's -going to work slower for rare users of remap_file_pages() but ABI is -preserved. +The syscall is deprecated and replaced it with an emulation now. The +emulation creates new VMAs instead of nonlinear mappings. It's going to +work slower for rare users of remap_file_pages() but ABI is preserved. One side effect of emulation (apart from performance) is that user can hit vm.max_map_count limit more easily due to additional VMAs. See comment for diff --git a/include/linux/fs.h b/include/linux/fs.h index 42efe13077b6..60c4996df7f3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2481,8 +2481,12 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); -extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, - unsigned long size, pgoff_t pgoff); +static inline int generic_file_remap_pages(struct vm_area_struct *vma, + unsigned long addr, unsigned long size, pgoff_t pgoff) +{ + BUG(); + return 0; +} int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/mm/Makefile b/mm/Makefile index 4bf586e66378..3548460ab7b6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o diff --git a/mm/fremap.c b/mm/fremap.c deleted file mode 100644 index 2805d71cf476..000000000000 --- a/mm/fremap.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - * linux/mm/fremap.c - * - * Explicit pagetable population and nonlinear (random) mappings support. - * - * started by Ingo Molnar, Copyright (C) 2002, 2003 - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "internal.h" - -static int mm_counter(struct page *page) -{ - return PageAnon(page) ? 
MM_ANONPAGES : MM_FILEPAGES; -} - -static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - pte_t pte = *ptep; - struct page *page; - swp_entry_t entry; - - if (pte_present(pte)) { - flush_cache_page(vma, addr, pte_pfn(pte)); - pte = ptep_clear_flush_notify(vma, addr, ptep); - page = vm_normal_page(vma, addr, pte); - if (page) { - if (pte_dirty(pte)) - set_page_dirty(page); - update_hiwater_rss(mm); - dec_mm_counter(mm, mm_counter(page)); - page_remove_rmap(page); - page_cache_release(page); - } - } else { /* zap_pte() is not called when pte_none() */ - if (!pte_file(pte)) { - update_hiwater_rss(mm); - entry = pte_to_swp_entry(pte); - if (non_swap_entry(entry)) { - if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); - dec_mm_counter(mm, mm_counter(page)); - } - } else { - free_swap_and_cache(entry); - dec_mm_counter(mm, MM_SWAPENTS); - } - } - pte_clear_not_present_full(mm, addr, ptep, 0); - } -} - -/* - * Install a file pte to a given virtual memory address, release any - * previously existing mapping. - */ -static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, unsigned long pgoff, pgprot_t prot) -{ - int err = -ENOMEM; - pte_t *pte, ptfile; - spinlock_t *ptl; - - pte = get_locked_pte(mm, addr, &ptl); - if (!pte) - goto out; - - ptfile = pgoff_to_pte(pgoff); - - if (!pte_none(*pte)) - zap_pte(mm, vma, addr, pte); - - set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); - /* - * We don't need to run update_mmu_cache() here because the "file pte" - * being installed by install_file_pte() is not a real pte - it's a - * non-present entry (like a swap entry), noting what file offset should - * be mapped there when there's a fault (in a non-linear vma where - * that's not obvious). - */ - pte_unmap_unlock(pte, ptl); - err = 0; -out: - return err; -} - -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - struct mm_struct *mm = vma->vm_mm; - int err; - - do { - err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); - if (err) - return err; - - size -= PAGE_SIZE; - addr += PAGE_SIZE; - pgoff++; - } while (size); - - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - -/** - * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma - * @start: start of the remapped virtual memory range - * @size: size of the remapped virtual memory range - * @prot: new protection bits of the range (see NOTE) - * @pgoff: to-be-mapped page of the backing store file - * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. - * - * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma - * (shared backing store file). - * - * This syscall works purely via pagetables, so it's the most efficient - * way to map the same (large) file into a given virtual window. Unlike - * mmap()/mremap() it does not create any new vmas. The new mappings are - * also safe across swapout. - * - * NOTE: the @prot parameter right now is ignored (but must be zero), - * and the vma's default protection is used. Arbitrary protections - * might be implemented in the future. 
- */ -SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, - unsigned long, prot, unsigned long, pgoff, unsigned long, flags) -{ - struct mm_struct *mm = current->mm; - struct address_space *mapping; - struct vm_area_struct *vma; - int err = -EINVAL; - int has_write_lock = 0; - vm_flags_t vm_flags = 0; - - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " - "See Documentation/vm/remap_file_pages.txt.\n", - current->comm, current->pid); - - if (prot) - return err; - /* - * Sanitize the syscall parameters: - */ - start = start & PAGE_MASK; - size = size & PAGE_MASK; - - /* Does the address range wrap, or is the span zero-sized? */ - if (start + size <= start) - return err; - - /* Does pgoff wrap? */ - if (pgoff + (size >> PAGE_SHIFT) < pgoff) - return err; - - /* Can we represent this offset inside this architecture's pte's? */ -#if PTE_FILE_MAX_BITS < BITS_PER_LONG - if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) - return err; -#endif - - /* We need down_write() to change vma->vm_flags. */ - down_read(&mm->mmap_sem); - retry: - vma = find_vma(mm, start); - - /* - * Make sure the vma is shared, that it supports prefaulting, - * and that the remapped range is valid and fully within - * the single existing vma. - */ - if (!vma || !(vma->vm_flags & VM_SHARED)) - goto out; - - if (!vma->vm_ops || !vma->vm_ops->remap_pages) - goto out; - - if (start < vma->vm_start || start + size > vma->vm_end) - goto out; - - /* Must set VM_NONLINEAR before any pages are populated. */ - if (!(vma->vm_flags & VM_NONLINEAR)) { - /* - * vm_private_data is used as a swapout cursor - * in a VM_NONLINEAR vma. - */ - if (vma->vm_private_data) - goto out; - - /* Don't need a nonlinear mapping, exit success */ - if (pgoff == linear_page_index(vma, start)) { - err = 0; - goto out; - } - - if (!has_write_lock) { -get_write_lock: - up_read(&mm->mmap_sem); - down_write(&mm->mmap_sem); - has_write_lock = 1; - goto retry; - } - mapping = vma->vm_file->f_mapping; - /* - * page_mkclean doesn't work on nonlinear vmas, so if - * dirty pages need to be accounted, emulate with linear - * vmas. - */ - if (mapping_cap_account_dirty(mapping)) { - unsigned long addr; - struct file *file = get_file(vma->vm_file); - /* mmap_region may free vma; grab the info now */ - vm_flags = vma->vm_flags; - - addr = mmap_region(file, start, size, vm_flags, pgoff); - fput(file); - if (IS_ERR_VALUE(addr)) { - err = addr; - } else { - BUG_ON(addr != start); - err = 0; - } - goto out_freed; - } - i_mmap_lock_write(mapping); - flush_dcache_mmap_lock(mapping); - vma->vm_flags |= VM_NONLINEAR; - vma_interval_tree_remove(vma, &mapping->i_mmap); - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - if (vma->vm_flags & VM_LOCKED) { - /* - * drop PG_Mlocked flag for over-mapped range - */ - if (!has_write_lock) - goto get_write_lock; - vm_flags = vma->vm_flags; - munlock_vma_pages_range(vma, start, start + size); - vma->vm_flags = vm_flags; - } - - mmu_notifier_invalidate_range_start(mm, start, start + size); - err = vma->vm_ops->remap_pages(vma, start, size, pgoff); - mmu_notifier_invalidate_range_end(mm, start, start + size); - - /* - * We can't clear VM_NONLINEAR because we'd have to do - * it after ->populate completes, and that would prevent - * downgrading the lock. (Locks can't be upgraded). 
- */ - -out: - if (vma) - vm_flags = vma->vm_flags; -out_freed: - if (likely(!has_write_lock)) - up_read(&mm->mmap_sem); - else - up_write(&mm->mmap_sem); - if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) - mm_populate(start, size); - - return err; -} diff --git a/mm/mmap.c b/mm/mmap.c index 7f684d5a8087..e023dc5e59a8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2634,6 +2634,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) return vm_munmap(addr, len); } + +/* + * Emulation of deprecated remap_file_pages() syscall. + */ +SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + unsigned long, prot, unsigned long, pgoff, unsigned long, flags) +{ + + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long populate = 0; + unsigned long ret = -EINVAL; + struct file *file; + + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " + "See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); + + if (prot) + return ret; + start = start & PAGE_MASK; + size = size & PAGE_MASK; + + if (start + size <= start) + return ret; + + /* Does pgoff wrap? */ + if (pgoff + (size >> PAGE_SHIFT) < pgoff) + return ret; + + down_write(&mm->mmap_sem); + vma = find_vma(mm, start); + + if (!vma || !(vma->vm_flags & VM_SHARED)) + goto out; + + if (start < vma->vm_start || start + size > vma->vm_end) + goto out; + + if (pgoff == linear_page_index(vma, start)) { + ret = 0; + goto out; + } + + prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; + prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; + prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; + + flags &= MAP_NONBLOCK; + flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; + if (vma->vm_flags & VM_LOCKED) { + flags |= MAP_LOCKED; + /* drop PG_Mlocked flag for over-mapped range */ + munlock_vma_pages_range(vma, start, start + size); + } + + file = get_file(vma->vm_file); + ret = do_mmap_pgoff(vma->vm_file, start, size, + prot, flags, pgoff, &populate); + fput(file); +out: + up_write(&mm->mmap_sem); + if (populate) + mm_populate(ret, populate); + if (!IS_ERR_VALUE(ret)) + ret = 0; + return ret; +} + static inline void verify_mm_writelocked(struct mm_struct *mm) { #ifdef CONFIG_DEBUG_VM diff --git a/mm/nommu.c b/mm/nommu.c index 28bd8c4dff6f..541bed64e348 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1984,14 +1984,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_map_pages); -int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff) -{ - BUG(); - return 0; -} -EXPORT_SYMBOL(generic_file_remap_pages); - static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, int write) { -- cgit v1.2.3
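For reference, a minimal userspace exercise of the now-emulated syscall. This is a hedged sketch, not part of the patch; it only illustrates the behavioural contract the emulation preserves:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/*
 * Remap file page 1 over the first page of a shared window. Under the
 * emulation this is serviced by do_mmap_pgoff() with MAP_FIXED, so an
 * extra VMA may show up in /proc/self/maps instead of the old in-place
 * pte rewrite; the data visible through the window is the same either way.
 */
int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 2 * pg, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 'a';
	p[pg] = 'b';

	/* prot must be 0; pgoff is in pages */
	if (remap_file_pages(p, pg, 0, 1, 0)) {
		perror("remap_file_pages");
		return 1;
	}
	printf("%c\n", p[0]);	/* prints 'b' */
	return 0;
}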
From 8a5f14a23177061ec11daeaa3d09d0765d785c47 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:49 -0800 Subject: mm: drop support of non-linear mapping from unmap/zap codepath We have had the remap_file_pages(2) emulation in the -mm tree for a few release cycles, and we plan to have it mainline in v3.20. This patchset removes the rest of the VM_NONLINEAR infrastructure. Patches 1-8 take care of the generic code. They are pretty straightforward and can be applied independently of the other patches. The remaining patches remove pte_file()-related stuff from architecture-specific code; this usually frees up one bit in the non-present pte. Where I was able to figure out how, I've tried to reuse that bit for the swap offset. For obvious reasons I cannot test all that arch-specific code, and would like to see acks from maintainers. In total, remap_file_pages(2) required about 1.4K lines of not-so-trivial kernel code. That's too much for functionality nobody uses. Tested-by: Felipe Balbi This patch (of 38): We don't create non-linear mappings anymore. Let's drop the code which handles them on unmap/zap. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/madvise.c | 9 +----- mm/memory.c | 82 ++++++++++++++---------------------------------- 3 files changed, 22 insertions(+), 70 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2c6fd3c5424a..600ef5ed4698 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1146,7 +1146,6 @@ extern void user_shm_unlock(size_t, struct user_struct *); * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { - struct vm_area_struct *nonlinear_vma; /* Check page->index if set */ struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ diff --git a/mm/madvise.c b/mm/madvise.c index a271adc93289..917754d26c17 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -278,14 +278,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; - if (unlikely(vma->vm_flags & VM_NONLINEAR)) { - struct zap_details details = { - .nonlinear_vma = vma, - .last_index = ULONG_MAX, - }; - zap_page_range(vma, start, end - start, &details); - } else - zap_page_range(vma, start, end - start, NULL); + zap_page_range(vma, start, end - start, NULL); return 0; } diff --git a/mm/memory.c b/mm/memory.c index 2c3536cc6c63..9a3e73b69dad 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1082,6 +1082,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; + swp_entry_t entry; again: init_rss_vec(rss); @@ -1107,28 +1108,12 @@ again: if (details->check_mapping && details->check_mapping != page->mapping) continue; - /* - * Each page->index must be checked when - * invalidating or truncating nonlinear. - */ - if (details->nonlinear_vma && - (page->index < details->first_index || - page->index > details->last_index)) - continue; } ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; - if (unlikely(details) && details->nonlinear_vma - && linear_page_index(details->nonlinear_vma, - addr) != page->index) { - pte_t ptfile = pgoff_to_pte(page->index); - if (pte_soft_dirty(ptent)) - ptfile = pte_file_mksoft_dirty(ptfile); - set_pte_at(mm, addr, pte, ptfile); - } if (PageAnon(page)) rss[MM_ANONPAGES]--; else { @@ -1151,33 +1136,25 @@ again: } continue; } - /* - * If details->check_mapping, we leave swap entries; - * if details->nonlinear_vma, we leave file entries. - */ + /* If details->check_mapping, we leave swap entries.
*/ if (unlikely(details)) continue; - if (pte_file(ptent)) { - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) - print_bad_pte(vma, addr, ptent, NULL); - } else { - swp_entry_t entry = pte_to_swp_entry(ptent); - if (!non_swap_entry(entry)) - rss[MM_SWAPENTS]--; - else if (is_migration_entry(entry)) { - struct page *page; + entry = pte_to_swp_entry(ptent); + if (!non_swap_entry(entry)) + rss[MM_SWAPENTS]--; + else if (is_migration_entry(entry)) { + struct page *page; - page = migration_entry_to_page(entry); + page = migration_entry_to_page(entry); - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else - rss[MM_FILEPAGES]--; - } - if (unlikely(!free_swap_and_cache(entry))) - print_bad_pte(vma, addr, ptent, NULL); + if (PageAnon(page)) + rss[MM_ANONPAGES]--; + else + rss[MM_FILEPAGES]--; } + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -1277,7 +1254,7 @@ static void unmap_page_range(struct mmu_gather *tlb, pgd_t *pgd; unsigned long next; - if (details && !details->check_mapping && !details->nonlinear_vma) + if (details && !details->check_mapping) details = NULL; BUG_ON(addr >= end); @@ -1371,7 +1348,7 @@ void unmap_vmas(struct mmu_gather *tlb, * @vma: vm_area_struct holding the applicable pages * @start: starting address of pages to zap * @size: number of bytes to zap - * @details: details of nonlinear truncation or shared cache invalidation + * @details: details of shared cache invalidation * * Caller must protect the VMA list */ @@ -1397,7 +1374,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap - * @details: details of nonlinear truncation or shared cache invalidation + * @details: details of shared cache invalidation * * The range must fit into one VMA. */ @@ -2331,25 +2308,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, } } -static inline void unmap_mapping_range_list(struct list_head *head, - struct zap_details *details) -{ - struct vm_area_struct *vma; - - /* - * In nonlinear VMAs there is no correspondence between virtual address - * offset and file offset. So we must perform an exhaustive search - * across *all* the pages in each nonlinear VMA, not just the pages - * whose virtual address lies outside the file truncation point. - */ - list_for_each_entry(vma, head, shared.nonlinear) { - details->nonlinear_vma = vma; - unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); - } -} - /** - * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. + * unmap_mapping_range - unmap the portion of all mmaps in the specified + * address_space corresponding to the specified page range in the underlying + * file. + * * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE @@ -2378,7 +2341,6 @@ void unmap_mapping_range(struct address_space *mapping, } details.check_mapping = even_cows? 
NULL: mapping; - details.nonlinear_vma = NULL; details.first_index = hba; details.last_index = hba + hlen - 1; if (details.last_index < details.first_index) @@ -2388,8 +2350,6 @@ void unmap_mapping_range(struct address_space *mapping, i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); - if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) - unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); i_mmap_unlock_write(mapping); } EXPORT_SYMBOL(unmap_mapping_range); -- cgit v1.2.3 From 9b4bdd2ffab9557ac43af7dff02e7dab1c8c58bd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:51 -0800 Subject: mm: drop support of non-linear mapping from fault codepath We don't create non-linear mappings anymore. Let's drop code which handles them on page fault. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 16 ++++++-------- mm/memory.c | 65 ++++++++---------------------------------------------- 2 files changed, 16 insertions(+), 65 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 600ef5ed4698..376e5c325dee 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -206,21 +206,19 @@ extern unsigned int kobjsize(const void *objp); extern pgprot_t protection_map[16]; #define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */ -#define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ -#define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ -#define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ -#define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ -#define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */ -#define FAULT_FLAG_TRIED 0x40 /* second try */ -#define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */ +#define FAULT_FLAG_MKWRITE 0x02 /* Fault was mkwrite of existing pte */ +#define FAULT_FLAG_ALLOW_RETRY 0x04 /* Retry fault if blocking */ +#define FAULT_FLAG_RETRY_NOWAIT 0x08 /* Don't drop mmap_sem and wait when retrying */ +#define FAULT_FLAG_KILLABLE 0x10 /* The fault task is in SIGKILL killable region */ +#define FAULT_FLAG_TRIED 0x20 /* Second try */ +#define FAULT_FLAG_USER 0x40 /* The fault originated in userspace */ /* * vm_fault is filled by the the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * - * pgoff should be used in favour of virtual_address, if possible. If pgoff - * is used, one may implement ->remap_pages to get nonlinear mapping support. + * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { unsigned int flags; /* FAULT_FLAG_xxx flags */ diff --git a/mm/memory.c b/mm/memory.c index 9a3e73b69dad..43a53743cbb4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1899,12 +1899,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, EXPORT_SYMBOL_GPL(apply_to_page_range); /* - * handle_pte_fault chooses page fault handler according to an entry - * which was read non-atomically. Before making any commitment, on - * those architectures or configurations (e.g. 
i386 with PAE) which - * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault - * must check under lock before unmapping the pte and proceeding - * (but do_wp_page is only called after already making such a check; + * handle_pte_fault chooses page fault handler according to an entry which was + * read non-atomically. Before making any commitment, on those architectures + * or configurations (e.g. i386 with PAE) which might give a mix of unmatched + * parts, do_swap_page must check under lock before unmapping the pte and + * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). */ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, @@ -2710,8 +2709,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) - entry = pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); @@ -2846,8 +2843,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * if page by the offset is not ready to be mapped (cold cache or * something). */ - if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && - fault_around_bytes >> PAGE_SHIFT > 1) { + if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) @@ -2992,7 +2988,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) { @@ -3009,46 +3005,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } -/* - * Fault of a previously existing named mapping. Repopulate the pte - * from the encoded file_pte if possible. This enables swappable - * nonlinear vmas. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with pte unmapped and unlocked. - * The mmap_sem may have been released depending on flags and our - * return value. See filemap_fault() and __lock_page_or_retry(). - */ -static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - unsigned int flags, pte_t orig_pte) -{ - pgoff_t pgoff; - - flags |= FAULT_FLAG_NONLINEAR; - - if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) - return 0; - - if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { - /* - * Page table corrupted: show pte and kill process. 
- */ - print_bad_pte(vma, address, orig_pte, NULL); - return VM_FAULT_SIGBUS; - } - - pgoff = pte_to_pgoff(orig_pte); - if (!(flags & FAULT_FLAG_WRITE)) - return do_read_fault(mm, vma, address, pmd, pgoff, flags, - orig_pte); - if (!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(mm, vma, address, pmd, pgoff, flags, - orig_pte); - return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); -} - static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags) @@ -3176,15 +3132,12 @@ static int handle_pte_fault(struct mm_struct *mm, if (pte_none(entry)) { if (vma->vm_ops) { if (likely(vma->vm_ops->fault)) - return do_linear_fault(mm, vma, address, - pte, pmd, flags, entry); + return do_fault(mm, vma, address, pte, + pmd, flags, entry); } return do_anonymous_page(mm, vma, address, pte, pmd, flags); } - if (pte_file(entry)) - return do_nonlinear_fault(mm, vma, address, - pte, pmd, flags, entry); return do_swap_page(mm, vma, address, pte, pmd, flags, entry); } -- cgit v1.2.3 From d83a08db5ba6072caa658745881f4baa9bad6a08 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:54 -0800 Subject: mm: drop vm_ops->remap_pages and generic_file_remap_pages() stub Nobody uses it anymore. [akpm@linux-foundation.org: fix filemap_xip.c] Signed-off-by: Kirill A. Shutemov Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_file.c | 2 -- fs/btrfs/file.c | 1 - fs/ceph/addr.c | 1 - fs/cifs/file.c | 1 - fs/ext4/file.c | 1 - fs/f2fs/file.c | 1 - fs/fuse/file.c | 1 - fs/gfs2/file.c | 1 - fs/nfs/file.c | 1 - fs/nilfs2/file.c | 1 - fs/ocfs2/mmap.c | 1 - fs/ubifs/file.c | 1 - fs/xfs/xfs_file.c | 1 - include/linux/fs.h | 6 ------ include/linux/mm.h | 3 --- mm/filemap.c | 1 - mm/filemap_xip.c | 1 - mm/shmem.c | 1 - 18 files changed, 26 deletions(-) (limited to 'include/linux') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 5594505e6e73..b40133796b87 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -831,7 +831,6 @@ static const struct vm_operations_struct v9fs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { @@ -839,7 +838,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = v9fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index e4090259569b..a606ab551296 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2081,7 +2081,6 @@ static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c81c0e004588..24be059fd1f8 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1569,7 +1569,6 @@ out: static struct vm_operations_struct ceph_vmops = { .fault = ceph_filemap_fault, .page_mkwrite = ceph_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int ceph_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 74f12877493a..294ff302a237 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3248,7 +3248,6 @@ static struct vm_operations_struct 
cifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = cifs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8131be8c0af3..7cb592386121 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -195,7 +195,6 @@ static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 3c27e0ecb3bc..5674ba13102b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -92,7 +92,6 @@ static const struct vm_operations_struct f2fs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int get_parent_ino(struct inode *inode, nid_t *pino) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 760b2c552197..d769e594855b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2062,7 +2062,6 @@ static const struct vm_operations_struct fuse_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = fuse_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6e600abf694a..ec9c2d33477a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -498,7 +498,6 @@ static const struct vm_operations_struct gfs2_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = gfs2_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; /** diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2ab6f00dba5b..94712fc781fa 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -646,7 +646,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = nfs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int nfs_need_sync_write(struct file *filp, struct inode *inode) diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 3a03e0aea1fb..a8c728acb7a8 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -128,7 +128,6 @@ static const struct vm_operations_struct nilfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = nilfs_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 10d66c75cecb..9581d190f6e1 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -173,7 +173,6 @@ out: static const struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, .page_mkwrite = ocfs2_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 538519ee37d9..035e51011444 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1536,7 +1536,6 @@ static const struct vm_operations_struct ubifs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ubifs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 
13e974e6a889..ac7f1e8f92b3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1384,5 +1384,4 @@ static const struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_vm_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 60c4996df7f3..47f557c7ef7e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2481,12 +2481,6 @@ extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); -static inline int generic_file_remap_pages(struct vm_area_struct *vma, - unsigned long addr, unsigned long size, pgoff_t pgoff) -{ - BUG(); - return 0; -} int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 376e5c325dee..2ddd9d1d6268 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -285,9 +285,6 @@ struct vm_operations_struct { struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr); #endif - /* called by sys_remap_file_pages() to populate non-linear mapping */ - int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr, - unsigned long size, pgoff_t pgoff); }; struct mmu_gather; diff --git a/mm/filemap.c b/mm/filemap.c index 673e4581a2e5..bf7a27142704 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2087,7 +2087,6 @@ const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = filemap_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; /* This is used for a general mmap of a disk file */ diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 0d105aeff82f..70c09da1a419 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -301,7 +301,6 @@ out: static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, .page_mkwrite = filemap_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; int xip_file_mmap(struct file * file, struct vm_area_struct * vma) diff --git a/mm/shmem.c b/mm/shmem.c index 993e6ba689cc..b3e403181981 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3201,7 +3201,6 @@ static const struct vm_operations_struct shmem_vm_ops = { .set_policy = shmem_set_policy, .get_policy = shmem_get_policy, #endif - .remap_pages = generic_file_remap_pages, }; static struct dentry *shmem_mount(struct file_system_type *fs_type, -- cgit v1.2.3 From 27ba0644ea9dfe6e7693abc85837b60e40583b96 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:09:59 -0800 Subject: rmap: drop support of non-linear mappings We don't create non-linear mappings anymore. Let's drop code which handles them in rmap. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cachetlb.txt | 8 +- fs/inode.c | 1 - include/linux/fs.h | 4 +- include/linux/mm.h | 6 -- include/linux/mm_types.h | 4 +- include/linux/rmap.h | 2 - kernel/fork.c | 8 +- mm/migrate.c | 32 ------- mm/mmap.c | 24 ++--- mm/rmap.c | 225 +-------------------------------------------- mm/swap.c | 4 +- 11 files changed, 18 insertions(+), 300 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index d79b008e4a32..3f9f808b5119 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -317,10 +317,10 @@ maps this page at its virtual address. about doing this. The idea is, first at flush_dcache_page() time, if - page->mapping->i_mmap is an empty tree and ->i_mmap_nonlinear - an empty list, just mark the architecture private page flag bit. - Later, in update_mmu_cache(), a check is made of this flag bit, - and if set the flush is done and the flag bit is cleared. + page->mapping->i_mmap is an empty tree, just mark the architecture + private page flag bit. Later, in update_mmu_cache(), a check is + made of this flag bit, and if set the flush is done and the flag + bit is cleared. IMPORTANT NOTE: It is often important, if you defer the flush, that the actual flush occurs on the same CPU diff --git a/fs/inode.c b/fs/inode.c index aa149e7262ac..c760fac33c92 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -355,7 +355,6 @@ void address_space_init_once(struct address_space *mapping) INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); mapping->i_mmap = RB_ROOT; - INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); } EXPORT_SYMBOL(address_space_init_once); diff --git a/include/linux/fs.h b/include/linux/fs.h index 47f557c7ef7e..60acab209701 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -401,7 +401,6 @@ struct address_space { spinlock_t tree_lock; /* and lock protecting it */ atomic_t i_mmap_writable;/* count VM_SHARED mappings */ struct rb_root i_mmap; /* tree of private and shared mappings */ - struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ @@ -493,8 +492,7 @@ static inline void i_mmap_unlock_read(struct address_space *mapping) */ static inline int mapping_mapped(struct address_space *mapping) { - return !RB_EMPTY_ROOT(&mapping->i_mmap) || - !list_empty(&mapping->i_mmap_nonlinear); + return !RB_EMPTY_ROOT(&mapping->i_mmap); } /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ddd9d1d6268..18391eec4864 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1796,12 +1796,6 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, for (vma = vma_interval_tree_iter_first(root, start, last); \ vma; vma = vma_interval_tree_iter_next(vma, start, last)) -static inline void vma_nonlinear_insert(struct vm_area_struct *vma, - struct list_head *list) -{ - list_add_tail(&vma->shared.nonlinear, list); -} - void anon_vma_interval_tree_insert(struct anon_vma_chain *node, struct rb_root *root); void anon_vma_interval_tree_remove(struct anon_vma_chain *node, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6d34aa266a8c..3b1d20fb0848 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -273,15 +273,13 @@ struct vm_area_struct { /* * For areas with an address space and 
backing store, - * linkage into the address_space->i_mmap interval tree, or - * linkage of vma in the address_space->i_mmap_nonlinear list. + * linkage into the address_space->i_mmap interval tree. */ union { struct { struct rb_node rb; unsigned long rb_subtree_last; } linear; - struct list_head nonlinear; } shared; /* diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d9d7e7e56352..b38f559130d5 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -246,7 +246,6 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); * arg: passed to rmap_one() and invalid_vma() * rmap_one: executed on each vma where page is mapped * done: for checking traversing termination condition - * file_nonlinear: for handling file nonlinear mapping * anon_lock: for getting anon_lock by optimized way rather than default * invalid_vma: for skipping uninterested vma */ @@ -255,7 +254,6 @@ struct rmap_walk_control { int (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); - int (*file_nonlinear)(struct page *, struct address_space *, void *arg); struct anon_vma *(*anon_lock)(struct page *page); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; diff --git a/kernel/fork.c b/kernel/fork.c index 4dc2ddade9f1..b379d9abddc7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -438,12 +438,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ - if (unlikely(tmp->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(tmp, - &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } diff --git a/mm/migrate.c b/mm/migrate.c index 344cdf692fc8..6e284bcca8bb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -178,37 +178,6 @@ out: return SWAP_AGAIN; } -/* - * Congratulations to trinity for discovering this bug. - * mm/fremap.c's remap_file_pages() accepts any range within a single vma to - * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then - * replace the specified range by file ptes throughout (maybe populated after). - * If page migration finds a page within that range, while it's still located - * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: - * zap_pte() clears the temporary migration entry before mmap_sem is dropped. - * But if the migrating page is in a part of the vma outside the range to be - * remapped, then it will not be cleared, and remove_migration_ptes() needs to - * deal with it. Fortunately, this part of the vma is of course still linear, - * so we just need to use linear location on the nonlinear list. 
- */ -static int remove_linear_migration_ptes_from_nonlinear(struct page *page, - struct address_space *mapping, void *arg) -{ - struct vm_area_struct *vma; - /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - unsigned long addr; - - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (addr >= vma->vm_start && addr < vma->vm_end) - remove_migration_pte(page, vma, addr, arg); - } - return SWAP_AGAIN; -} - /* * Get rid of all migration entries and replace them by * references to the indicated page. @@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, - .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, }; rmap_walk(new, &rwc); diff --git a/mm/mmap.c b/mm/mmap.c index e023dc5e59a8..14d84666e8ba 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - list_del_init(&vma->shared.nonlinear); - else - vma_interval_tree_remove(vma, &mapping->i_mmap); + vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } @@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma) atomic_inc(&mapping->i_mmap_writable); flush_dcache_mmap_lock(mapping); - if (unlikely(vma->vm_flags & VM_NONLINEAR)) - vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); - else - vma_interval_tree_insert(vma, &mapping->i_mmap); + vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } } @@ -789,14 +783,11 @@ again: remove_next = 1 + (end > next->vm_end); if (file) { mapping = file->f_mapping; - if (!(vma->vm_flags & VM_NONLINEAR)) { - root = &mapping->i_mmap; - uprobe_munmap(vma, vma->vm_start, vma->vm_end); + root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); - if (adjust_next) - uprobe_munmap(next, next->vm_start, - next->vm_end); - } + if (adjust_next) + uprobe_munmap(next, next->vm_start, next->vm_end); i_mmap_lock_write(mapping); if (insert) { @@ -3177,8 +3168,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * * mmap_sem in write mode is required in order to block all operations * that could modify pagetables and free pages without need of - * altering the vma layout (for example populate_range() with - * nonlinear vmas). It's also needed in write mode to avoid new + * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. 
* * A single task can't take more than one mm_take_all_locks() in a row diff --git a/mm/rmap.c b/mm/rmap.c index 71cd5bd0c17d..70b32498d4f2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -590,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) if (!vma->anon_vma || !page__anon_vma || vma->anon_vma->root != page__anon_vma->root) return -EFAULT; - } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { - if (!vma->vm_file || - vma->vm_file->f_mapping != page->mapping) + } else if (page->mapping) { + if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) return -EFAULT; } else return -EFAULT; @@ -1274,7 +1273,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); - BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { /* Establish migration entry for a file page */ @@ -1316,211 +1314,6 @@ out_mlock: return ret; } -/* - * objrmap doesn't work for nonlinear VMAs because the assumption that - * offset-into-file correlates with offset-into-virtual-addresses does not hold. - * Consequently, given a particular page and its ->index, we cannot locate the - * ptes which are mapping that page without an exhaustive linear search. - * - * So what this code does is a mini "virtual scan" of each nonlinear VMA which - * maps the file to which the target page belongs. The ->vm_private_data field - * holds the current cursor into that scan. Successive searches will circulate - * around the vma's virtual address space. - * - * So as more replacement pressure is applied to the pages in a nonlinear VMA, - * more scanning pressure is placed against them as well. Eventually pages - * will become fully unmapped and are eligible for eviction. - * - * For very sparsely populated VMAs this is a little inefficient - chances are - * there there won't be many ptes located within the scan cluster. In this case - * maybe we could scan further - to the end of the pte page, perhaps. - * - * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can - * acquire it without blocking. If vma locked, mlock the pages in the cluster, - * rather than unmapping them. If we encounter the "check_page" that vmscan is - * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. - */ -#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) -#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) - -static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, - struct vm_area_struct *vma, struct page *check_page) -{ - struct mm_struct *mm = vma->vm_mm; - pmd_t *pmd; - pte_t *pte; - pte_t pteval; - spinlock_t *ptl; - struct page *page; - unsigned long address; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - unsigned long end; - int ret = SWAP_AGAIN; - int locked_vma = 0; - - address = (vma->vm_start + cursor) & CLUSTER_MASK; - end = address + CLUSTER_SIZE; - if (address < vma->vm_start) - address = vma->vm_start; - if (end > vma->vm_end) - end = vma->vm_end; - - pmd = mm_find_pmd(mm, address); - if (!pmd) - return ret; - - mmun_start = address; - mmun_end = end; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - - /* - * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, - * keep the sem while scanning the cluster for mlocking pages. 
- */ - if (down_read_trylock(&vma->vm_mm->mmap_sem)) { - locked_vma = (vma->vm_flags & VM_LOCKED); - if (!locked_vma) - up_read(&vma->vm_mm->mmap_sem); /* don't need it */ - } - - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - - /* Update high watermark before we lower rss */ - update_hiwater_rss(mm); - - for (; address < end; pte++, address += PAGE_SIZE) { - if (!pte_present(*pte)) - continue; - page = vm_normal_page(vma, address, *pte); - BUG_ON(!page || PageAnon(page)); - - if (locked_vma) { - if (page == check_page) { - /* we know we have check_page locked */ - mlock_vma_page(page); - ret = SWAP_MLOCK; - } else if (trylock_page(page)) { - /* - * If we can lock the page, perform mlock. - * Otherwise leave the page alone, it will be - * eventually encountered again later. - */ - mlock_vma_page(page); - unlock_page(page); - } - continue; /* don't unmap */ - } - - /* - * No need for _notify because we're within an - * mmu_notifier_invalidate_range_ {start|end} scope. - */ - if (ptep_clear_flush_young(vma, address, pte)) - continue; - - /* Nuke the page table entry. */ - flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush_notify(vma, address, pte); - - /* If nonlinear, store the file page offset in the pte. */ - if (page->index != linear_page_index(vma, address)) { - pte_t ptfile = pgoff_to_pte(page->index); - if (pte_soft_dirty(pteval)) - ptfile = pte_file_mksoft_dirty(ptfile); - set_pte_at(mm, address, pte, ptfile); - } - - /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pteval)) - set_page_dirty(page); - - page_remove_rmap(page); - page_cache_release(page); - dec_mm_counter(mm, MM_FILEPAGES); - (*mapcount)--; - } - pte_unmap_unlock(pte - 1, ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - if (locked_vma) - up_read(&vma->vm_mm->mmap_sem); - return ret; -} - -static int try_to_unmap_nonlinear(struct page *page, - struct address_space *mapping, void *arg) -{ - struct vm_area_struct *vma; - int ret = SWAP_AGAIN; - unsigned long cursor; - unsigned long max_nl_cursor = 0; - unsigned long max_nl_size = 0; - unsigned int mapcount; - - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - cursor = (unsigned long) vma->vm_private_data; - if (cursor > max_nl_cursor) - max_nl_cursor = cursor; - cursor = vma->vm_end - vma->vm_start; - if (cursor > max_nl_size) - max_nl_size = cursor; - } - - if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ - return SWAP_FAIL; - } - - /* - * We don't try to search for this page in the nonlinear vmas, - * and page_referenced wouldn't have found it anyway. Instead - * just walk the nonlinear vmas trying to age and unmap some. - * The mapcount of the page we came in with is irrelevant, - * but even so use it as a guide to how hard we should try? 
- */ - mapcount = page_mapcount(page); - if (!mapcount) - return ret; - - cond_resched(); - - max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; - if (max_nl_cursor == 0) - max_nl_cursor = CLUSTER_SIZE; - - do { - list_for_each_entry(vma, - &mapping->i_mmap_nonlinear, shared.nonlinear) { - - cursor = (unsigned long) vma->vm_private_data; - while (cursor < max_nl_cursor && - cursor < vma->vm_end - vma->vm_start) { - if (try_to_unmap_cluster(cursor, &mapcount, - vma, page) == SWAP_MLOCK) - ret = SWAP_MLOCK; - cursor += CLUSTER_SIZE; - vma->vm_private_data = (void *) cursor; - if ((int)mapcount <= 0) - return ret; - } - vma->vm_private_data = (void *) max_nl_cursor; - } - cond_resched(); - max_nl_cursor += CLUSTER_SIZE; - } while (max_nl_cursor <= max_nl_size); - - /* - * Don't loop forever (perhaps all the remaining pages are - * in locked vmas). Reset cursor on all unreserved nonlinear - * vmas, now forgetting on which ones it had fallen behind. - */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) - vma->vm_private_data = NULL; - - return ret; -} - bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -1566,7 +1359,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) .rmap_one = try_to_unmap_one, .arg = (void *)flags, .done = page_not_mapped, - .file_nonlinear = try_to_unmap_nonlinear, .anon_lock = page_lock_anon_vma_read, }; @@ -1612,12 +1404,6 @@ int try_to_munlock(struct page *page) .rmap_one = try_to_unmap_one, .arg = (void *)TTU_MUNLOCK, .done = page_not_mapped, - /* - * We don't bother to try to find the munlocked page in - * nonlinears. It's costly. Instead, later, page reclaim logic - * may call try_to_unmap() and recover PG_mlocked lazily. - */ - .file_nonlinear = NULL, .anon_lock = page_lock_anon_vma_read, }; @@ -1748,13 +1534,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) goto done; } - if (!rwc->file_nonlinear) - goto done; - - if (list_empty(&mapping->i_mmap_nonlinear)) - goto done; - - ret = rwc->file_nonlinear(page, mapping, rwc->arg); done: i_mmap_unlock_read(mapping); return ret; diff --git a/mm/swap.c b/mm/swap.c index 8a12b33936b4..5b3087228b99 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1140,10 +1140,8 @@ void __init swap_setup(void) if (bdi_init(swapper_spaces[0].backing_dev_info)) panic("Failed to init swap bdi"); - for (i = 0; i < MAX_SWAPFILES; i++) { + for (i = 0; i < MAX_SWAPFILES; i++) spin_lock_init(&swapper_spaces[i].tree_lock); - INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); - } #endif /* Use a smaller cluster for small-memory machines */ -- cgit v1.2.3 From ac51b934f3912582d3c897c6c4d09b32ea57b2c7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:02 -0800 Subject: mm: replace vma->sharead.linear with vma->shared After removing vma->shared.nonlinear we have only one member of vma->shared union, which doesn't make much sense. This patch drops the union and move struct vma->shared.linear to vma->shared. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 8 +++----- mm/interval_tree.c | 34 +++++++++++++++++----------------- 2 files changed, 20 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3b1d20fb0848..07c8bd3f7b48 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -275,11 +275,9 @@ struct vm_area_struct { * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. */ - union { - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } linear; + struct { + struct rb_node rb; + unsigned long rb_subtree_last; } shared; /* diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 8da581fa9060..f2c2492681bf 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; } -INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, - unsigned long, shared.linear.rb_subtree_last, +INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, + unsigned long, shared.rb_subtree_last, vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) /* Insert node immediately after prev in the interval tree */ @@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); - if (!prev->shared.linear.rb.rb_right) { + if (!prev->shared.rb.rb_right) { parent = prev; - link = &prev->shared.linear.rb.rb_right; + link = &prev->shared.rb.rb_right; } else { - parent = rb_entry(prev->shared.linear.rb.rb_right, - struct vm_area_struct, shared.linear.rb); - if (parent->shared.linear.rb_subtree_last < last) - parent->shared.linear.rb_subtree_last = last; - while (parent->shared.linear.rb.rb_left) { - parent = rb_entry(parent->shared.linear.rb.rb_left, - struct vm_area_struct, shared.linear.rb); - if (parent->shared.linear.rb_subtree_last < last) - parent->shared.linear.rb_subtree_last = last; + parent = rb_entry(prev->shared.rb.rb_right, + struct vm_area_struct, shared.rb); + if (parent->shared.rb_subtree_last < last) + parent->shared.rb_subtree_last = last; + while (parent->shared.rb.rb_left) { + parent = rb_entry(parent->shared.rb.rb_left, + struct vm_area_struct, shared.rb); + if (parent->shared.rb_subtree_last < last) + parent->shared.rb_subtree_last = last; } - link = &parent->shared.linear.rb.rb_left; + link = &parent->shared.rb.rb_left; } - node->shared.linear.rb_subtree_last = last; - rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); - rb_insert_augmented(&node->shared.linear.rb, root, + node->shared.rb_subtree_last = last; + rb_link_node(&node->shared.rb, &parent->shared.rb, link); + rb_insert_augmented(&node->shared.rb, root, &vma_interval_tree_augment); } -- cgit v1.2.3 From 0661a33611fca12570cba48d9344ce68834ee86c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:10:04 -0800 Subject: mm: remove rest usage of VM_NONLINEAR and pte_file() One bit in ->vm_flags is unused now! Signed-off-by: Kirill A. 
Shutemov Cc: Dan Carpenter Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_vma_manager.c | 3 +- include/linux/mm.h | 1 - include/linux/swapops.h | 4 +- mm/debug.c | 1 - mm/gup.c | 2 +- mm/ksm.c | 2 +- mm/madvise.c | 4 +- mm/memcontrol.c | 7 +--- mm/memory.c | 78 +++++++++++++++++++-------------------- mm/mincore.c | 9 +---- mm/mprotect.c | 2 +- mm/mremap.c | 2 - mm/msync.c | 5 +-- 13 files changed, 49 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/drm_vma_manager.c b/drivers/gpu/drm/drm_vma_manager.c index 63b471205072..68c1f32fb086 100644 --- a/drivers/gpu/drm/drm_vma_manager.c +++ b/drivers/gpu/drm/drm_vma_manager.c @@ -50,8 +50,7 @@ * * You must not use multiple offset managers on a single address_space. * Otherwise, mm-core will be unable to tear down memory mappings as the VM will - * no longer be linear. Please use VM_NONLINEAR in that case and implement your - * own offset managers. + * no longer be linear. * * This offset manager works on page-based addresses. That is, every argument * and return code (with the exception of drm_vma_node_offset_addr()) is given diff --git a/include/linux/mm.h b/include/linux/mm.h index 18391eec4864..a0da685bdb82 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -138,7 +138,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ -#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_ARCH_2 0x02000000 #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 6adfb7bfbf44..50cbc876be56 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { - return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte); + return !pte_none(pte) && !pte_present_nonuma(pte); } #endif @@ -66,7 +66,6 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte) { swp_entry_t arch_entry; - BUG_ON(pte_file(pte)); if (pte_swp_soft_dirty(pte)) pte = pte_swp_clear_soft_dirty(pte); arch_entry = __pte_to_swp_entry(pte); @@ -82,7 +81,6 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry) swp_entry_t arch_entry; arch_entry = __swp_entry(swp_type(entry), swp_offset(entry)); - BUG_ON(pte_file(__swp_entry_to_pte(arch_entry))); return __swp_entry_to_pte(arch_entry); } diff --git a/mm/debug.c b/mm/debug.c index 0e58f3211f89..d69cb5a7ba9a 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -130,7 +130,6 @@ static const struct trace_print_flags vmaflags_names[] = { {VM_ACCOUNT, "account" }, {VM_NORESERVE, "noreserve" }, {VM_HUGETLB, "hugetlb" }, - {VM_NONLINEAR, "nonlinear" }, #if defined(CONFIG_X86) {VM_PAT, "pat" }, #elif defined(CONFIG_PPC) diff --git a/mm/gup.c b/mm/gup.c index 8dd50ce6326f..12bc2bc33da7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -55,7 +55,7 @@ retry: */ if (likely(!(flags & FOLL_MIGRATION))) goto no_page; - if (pte_none(pte) || pte_file(pte)) + if (pte_none(pte)) goto no_page; entry = pte_to_swp_entry(pte); if (!is_migration_entry(entry)) diff --git a/mm/ksm.c b/mm/ksm.c index 15647fb0394f..4162dce2eb44 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1748,7 
+1748,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, */ if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) + VM_HUGETLB | VM_MIXEDMAP)) return 0; /* just ignore the advice */ #ifdef VM_SAO diff --git a/mm/madvise.c b/mm/madvise.c index 917754d26c17..d79fb5e8f80a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -155,7 +155,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, pte = *(orig_pte + ((index - start) / PAGE_SIZE)); pte_unmap_unlock(orig_pte, ptl); - if (pte_present(pte) || pte_none(pte) || pte_file(pte)) + if (pte_present(pte) || pte_none(pte)) continue; entry = pte_to_swp_entry(pte); if (unlikely(non_swap_entry(entry))) @@ -296,7 +296,7 @@ static long madvise_remove(struct vm_area_struct *vma, *prev = NULL; /* tell sys_madvise we drop mmap_sem */ - if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) + if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) return -EINVAL; f = vma->vm_file; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2f6893c2f01b..8b58701b9647 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4926,10 +4926,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, return NULL; mapping = vma->vm_file->f_mapping; - if (pte_none(ptent)) - pgoff = linear_page_index(vma, addr); - else /* pte_file(ptent) is true */ - pgoff = pte_to_pgoff(ptent); + pgoff = linear_page_index(vma, addr); /* page is moved even if it's not RSS of this task(page-faulted). */ #ifdef CONFIG_SWAP @@ -4961,7 +4958,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, page = mc_handle_present_pte(vma, addr, ptent); else if (is_swap_pte(ptent)) page = mc_handle_swap_pte(vma, addr, ptent, &ent); - else if (pte_none(ptent) || pte_file(ptent)) + else if (pte_none(ptent)) page = mc_handle_file_pte(vma, addr, ptent, &ent); if (!page && !ent.val) diff --git a/mm/memory.c b/mm/memory.c index 43a53743cbb4..9aa09217fe20 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -811,42 +811,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { - if (!pte_file(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - - if (likely(!non_swap_entry(entry))) { - if (swap_duplicate(entry) < 0) - return entry.val; - - /* make sure dst_mm is on swapoff's mmlist. */ - if (unlikely(list_empty(&dst_mm->mmlist))) { - spin_lock(&mmlist_lock); - if (list_empty(&dst_mm->mmlist)) - list_add(&dst_mm->mmlist, - &src_mm->mmlist); - spin_unlock(&mmlist_lock); - } - rss[MM_SWAPENTS]++; - } else if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); - - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; - - if (is_write_migration_entry(entry) && - is_cow_mapping(vm_flags)) { - /* - * COW mappings require pages in both - * parent and child to be set to read. - */ - make_migration_entry_read(&entry); - pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*src_pte)) - pte = pte_swp_mksoft_dirty(pte); - set_pte_at(src_mm, addr, src_pte, pte); - } + swp_entry_t entry = pte_to_swp_entry(pte); + + if (likely(!non_swap_entry(entry))) { + if (swap_duplicate(entry) < 0) + return entry.val; + + /* make sure dst_mm is on swapoff's mmlist. 
*/ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } + rss[MM_SWAPENTS]++; + } else if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); + + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; + + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + /* + * COW mappings require pages in both + * parent and child to be set to read. + */ + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*src_pte)) + pte = pte_swp_mksoft_dirty(pte); + set_pte_at(src_mm, addr, src_pte, pte); } } goto out_set_pte; @@ -1020,11 +1018,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | - VM_PFNMAP | VM_MIXEDMAP))) { - if (!vma->anon_vma) - return 0; - } + if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && + !vma->anon_vma) + return 0; if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); diff --git a/mm/mincore.c b/mm/mincore.c index c8c528b36641..46527c023e0c 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -124,17 +124,13 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { pte_t pte = *ptep; - pgoff_t pgoff; next = addr + PAGE_SIZE; if (pte_none(pte)) mincore_unmapped_range(vma, addr, next, vec); else if (pte_present(pte)) *vec = 1; - else if (pte_file(pte)) { - pgoff = pte_to_pgoff(pte); - *vec = mincore_page(vma->vm_file->f_mapping, pgoff); - } else { /* pte is a swap entry */ + else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); if (non_swap_entry(entry)) { @@ -145,9 +141,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, *vec = 1; } else { #ifdef CONFIG_SWAP - pgoff = entry.val; *vec = mincore_page(swap_address_space(entry), - pgoff); + entry.val); #else WARN_ON(1); *vec = 1; diff --git a/mm/mprotect.c b/mm/mprotect.c index ace93454ce8e..33121662f08b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -105,7 +105,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } if (updated) pages++; - } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { + } else if (IS_ENABLED(CONFIG_MIGRATION)) { swp_entry_t entry = pte_to_swp_entry(oldpte); if (is_write_migration_entry(entry)) { diff --git a/mm/mremap.c b/mm/mremap.c index 17fa018f5f39..57dadc025c64 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -81,8 +81,6 @@ static pte_t move_soft_dirty_pte(pte_t pte) pte = pte_mksoft_dirty(pte); else if (is_swap_pte(pte)) pte = pte_swp_mksoft_dirty(pte); - else if (pte_file(pte)) - pte = pte_file_mksoft_dirty(pte); #endif return pte; } diff --git a/mm/msync.c b/mm/msync.c index 992a1673d488..bb04d53ae852 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -86,10 +86,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) (vma->vm_flags & VM_SHARED)) { get_file(file); up_read(&mm->mmap_sem); - if (vma->vm_flags & VM_NONLINEAR) - error = vfs_fsync(file, 1); - else - error = vfs_fsync_range(file, fstart, fend, 1); + error = vfs_fsync_range(file, fstart, fend, 1); fput(file); if (error || start >= end) goto out; -- cgit v1.2.3 From 753162cd849c45580fb5aaa7f3597c81e74e391c Mon Sep 17 00:00:00 2001 From: 
Andrey Ryabinin Date: Tue, 10 Feb 2015 14:11:36 -0800 Subject: mm: hugetlb: fix type of hugetlb_treat_as_movable variable hugetlb_treat_as_movable is declared as unsigned long, but proc_dointvec() is used for parsing it: static struct ctl_table vm_table[] = { ... { .procname = "hugepages_treat_as_movable", .data = &hugepages_treat_as_movable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, This seems harmless, but it's better to use the int type here. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Manfred Spraul Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 431b7fc605c9..7d7856359920 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -86,7 +86,7 @@ void free_huge_page(struct page *page); pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); #endif -extern unsigned long hugepages_treat_as_movable; +extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 85032de5e20f..be0e5d0db5ec 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,7 +35,7 @@ #include #include "internal.h" -unsigned long hugepages_treat_as_movable; +int hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; -- cgit v1.2.3 From dbf22eb6d8675fc173154d9f1bd1bd0fda53a001 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 10 Feb 2015 14:11:41 -0800 Subject: memcg: zap __memcg_{charge,uncharge}_slab They are simple wrappers around memcg_{charge,uncharge}_kmem, so let's zap them and call these functions directly.
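For illustration, the only work the zapped wrappers did was convert the allocation order into a page count before forwarding. A condensed sketch of the resulting call site (simplified from the mm/slab.h hunk below; the !memcg_kmem_enabled() fast-path guard is the helper's pre-existing one, elided by the hunk context):

  static __always_inline int memcg_charge_slab(struct kmem_cache *s,
                                               gfp_t gfp, int order)
  {
          if (!memcg_kmem_enabled())
                  return 0;
          if (is_root_cache(s))
                  return 0;
          /* an order-n slab spans 1 << n pages, e.g. order 3 -> 8 pages */
          return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order);
  }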
Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++-- mm/memcontrol.c | 21 +++------------------ mm/slab.h | 4 ++-- 3 files changed, 8 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7c95af8d552c..18ccb2988979 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -403,8 +403,9 @@ void memcg_update_array_size(int num_groups); struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); void __memcg_kmem_put_cache(struct kmem_cache *cachep); -int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); -void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, + unsigned long nr_pages); +void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); int __memcg_cleanup_cache_params(struct kmem_cache *s); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8b58701b9647..e229e3ad615c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2495,8 +2495,8 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); } -static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, - unsigned long nr_pages) +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, + unsigned long nr_pages) { struct page_counter *counter; int ret = 0; @@ -2533,8 +2533,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, return ret; } -static void memcg_uncharge_kmem(struct mem_cgroup *memcg, - unsigned long nr_pages) +void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) { page_counter_uncharge(&memcg->memory, nr_pages); if (do_swap_account) @@ -2767,20 +2766,6 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, current->memcg_kmem_skip_account = 0; } -int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) -{ - unsigned int nr_pages = 1 << order; - - return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); -} - -void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) -{ - unsigned int nr_pages = 1 << order; - - memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); -} - /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. 
diff --git a/mm/slab.h b/mm/slab.h index 1cf4005482dd..90430d6f665e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -235,7 +235,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, return 0; if (is_root_cache(s)) return 0; - return __memcg_charge_slab(s, gfp, order); + return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order); } static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) @@ -244,7 +244,7 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) return; if (is_root_cache(s)) return; - __memcg_uncharge_slab(s, order); + memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order); } #else static inline bool is_root_cache(struct kmem_cache *s) -- cgit v1.2.3 From 3e0350a36414a73c5c2d1e354f8c0ab4ace1296d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 10 Feb 2015 14:11:44 -0800 Subject: memcg: zap memcg_name argument of memcg_create_kmem_cache Instead of passing the name of the memory cgroup that the cache is created for via the memcg_name argument, let's obtain it directly in memcg_create_kmem_cache. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 3 +-- mm/memcontrol.c | 5 +---- mm/slab_common.c | 9 +++++---- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 9a139b637069..eca9ed303a1b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -117,8 +117,7 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *, - struct kmem_cache *, - const char *); + struct kmem_cache *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e229e3ad615c..baf7eb27e3ae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2607,8 +2607,6 @@ void memcg_update_array_size(int num) static void memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { - static char memcg_name_buf[NAME_MAX + 1]; /* protected by - memcg_slab_mutex */ struct kmem_cache *cachep; int id; @@ -2624,8 +2622,7 @@ static void memcg_register_cache(struct mem_cgroup *memcg, if (cache_from_memcg_idx(root_cache, id)) return; - cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); - cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); + cachep = memcg_create_kmem_cache(memcg, root_cache); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root diff --git a/mm/slab_common.c b/mm/slab_common.c index 67f182c10f24..1b782a2d3b3d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -430,16 +430,15 @@ EXPORT_SYMBOL(kmem_cache_create); * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. * @root_cache: The parent of the new cache. - * @memcg_name: The name of the memory cgroup (used for naming the new cache). * * This function attempts to create a kmem cache that will serve allocation * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent.
*/ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache, - const char *memcg_name) + struct kmem_cache *root_cache) { + static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ struct kmem_cache *s = NULL; char *cache_name; @@ -448,8 +447,10 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); + cgroup_name(mem_cgroup_css(memcg)->cgroup, + memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), memcg_name); + memcg_cache_id(memcg), memcg_name_buf); if (!cache_name) goto out_unlock; -- cgit v1.2.3 From d5b3cf7139b8770af4ed8bb36a1ab9d290ac39e9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Tue, 10 Feb 2015 14:11:47 -0800 Subject: memcg: zap memcg_slab_caches and memcg_slab_mutex mem_cgroup->memcg_slab_caches is a list of kmem caches corresponding to the given cgroup. Currently, it is only used on css free in order to destroy all caches corresponding to the memory cgroup being freed. The list is protected by memcg_slab_mutex. The mutex is also used to protect kmem_cache->memcg_params->memcg_caches arrays and to synchronize kmem_cache_destroy vs memcg_unregister_all_caches. However, we can perfectly get on without these two. To destroy all caches corresponding to a memory cgroup, we can walk over the global list of kmem caches, slab_caches, and we can do all the synchronization using the slab_mutex instead of the memcg_slab_mutex. This patch therefore gets rid of the memcg_slab_caches and memcg_slab_mutex. Apart from this nice cleanup, it also: - ensures that rcu_barrier() is called at most once when a root cache is destroyed or a memory cgroup is freed, no matter how many caches have the SLAB_DESTROY_BY_RCU flag set; - fixes a race between kmem_cache_destroy and kmem_cache_create: memcg_cleanup_cache_params, called from kmem_cache_destroy after checking that kmem_cache->refcount == 0, releases the slab_mutex, which gives kmem_cache_create a chance to create an alias to a cache that is doomed to be destroyed. Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 - include/linux/slab.h | 6 +- mm/memcontrol.c | 156 +++++----------------------------------- mm/slab_common.c | 142 +++++++++++++++++++++++++++++------------ 4 files changed, 120 insertions(+), 186 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 18ccb2988979..fb212e1d700d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -407,8 +407,6 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages); void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages); -int __memcg_cleanup_cache_params(struct kmem_cache *s); - /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. * @gfp: the gfp allocation flags.
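To make the new scheme concrete before the diffs: destroying a dying memcg's caches becomes a walk over the global slab_caches list under slab_mutex. A condensed sketch (the full version is in the mm/slab_common.c hunks below; s, s2, release and need_rcu_barrier are locals of memcg_destroy_kmem_caches):

  mutex_lock(&slab_mutex);
  list_for_each_entry_safe(s, s2, &slab_caches, list) {
          if (is_root_cache(s) || s->memcg_params->memcg != memcg)
                  continue;
          /* the dying cgroup holds no charges, so its caches are empty */
          BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier));
  }
  mutex_unlock(&slab_mutex);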
diff --git a/include/linux/slab.h b/include/linux/slab.h index eca9ed303a1b..2e3b448cfa2d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,8 +116,8 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM -struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *, - struct kmem_cache *); +void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); +void memcg_destroy_kmem_caches(struct mem_cgroup *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); @@ -490,7 +490,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * Child caches will hold extra metadata needed for its operation. Fields are: * * @memcg: pointer to the memcg this cache belongs to - * @list: list_head for the list of all caches in this memcg * @root_cache: pointer to the global, root cache, this cache was derived from */ struct memcg_cache_params { @@ -502,7 +501,6 @@ struct memcg_cache_params { }; struct { struct mem_cgroup *memcg; - struct list_head list; struct kmem_cache *root_cache; }; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index baf7eb27e3ae..f3f8a4f52a0c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -343,9 +343,6 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* analogous to slab_common's slab_caches list, but per-memcg; - * protected by memcg_slab_mutex */ - struct list_head memcg_slab_caches; /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif @@ -2476,25 +2473,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, } #ifdef CONFIG_MEMCG_KMEM -/* - * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or - * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. - */ -static DEFINE_MUTEX(memcg_slab_mutex); - -/* - * This is a bit cumbersome, but it is rarely used and avoids a backpointer - * in the memcg_cache_params struct. - */ -static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) -{ - struct kmem_cache *cachep; - - VM_BUG_ON(p->is_root_cache); - cachep = p->root_cache; - return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); -} - int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages) { @@ -2578,10 +2556,7 @@ static int memcg_alloc_cache_id(void) else if (size > MEMCG_CACHES_MAX_SIZE) size = MEMCG_CACHES_MAX_SIZE; - mutex_lock(&memcg_slab_mutex); err = memcg_update_all_caches(size); - mutex_unlock(&memcg_slab_mutex); - if (err) { ida_simple_remove(&kmem_limited_groups, id); return err; @@ -2604,120 +2579,20 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = num; } -static void memcg_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) -{ - struct kmem_cache *cachep; - int id; - - lockdep_assert_held(&memcg_slab_mutex); - - id = memcg_cache_id(memcg); - - /* - * Since per-memcg caches are created asynchronously on first - * allocation (see memcg_kmem_get_cache()), several threads can try to - * create the same cache, but only one of them may succeed. - */ - if (cache_from_memcg_idx(root_cache, id)) - return; - - cachep = memcg_create_kmem_cache(memcg, root_cache); - /* - * If we could not create a memcg cache, do not complain, because - * that's not critical at all as we can always proceed with the root - * cache. 
- */ - if (!cachep) - return; - - list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); - - /* - * Since readers won't lock (see cache_from_memcg_idx()), we need a - * barrier here to ensure nobody will see the kmem_cache partially - * initialized. - */ - smp_wmb(); - - BUG_ON(root_cache->memcg_params->memcg_caches[id]); - root_cache->memcg_params->memcg_caches[id] = cachep; -} - -static void memcg_unregister_cache(struct kmem_cache *cachep) -{ - struct kmem_cache *root_cache; - struct mem_cgroup *memcg; - int id; - - lockdep_assert_held(&memcg_slab_mutex); - - BUG_ON(is_root_cache(cachep)); - - root_cache = cachep->memcg_params->root_cache; - memcg = cachep->memcg_params->memcg; - id = memcg_cache_id(memcg); - - BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); - root_cache->memcg_params->memcg_caches[id] = NULL; - - list_del(&cachep->memcg_params->list); - - kmem_cache_destroy(cachep); -} - -int __memcg_cleanup_cache_params(struct kmem_cache *s) -{ - struct kmem_cache *c; - int i, failed = 0; - - mutex_lock(&memcg_slab_mutex); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); - if (!c) - continue; - - memcg_unregister_cache(c); - - if (cache_from_memcg_idx(s, i)) - failed++; - } - mutex_unlock(&memcg_slab_mutex); - return failed; -} - -static void memcg_unregister_all_caches(struct mem_cgroup *memcg) -{ - struct kmem_cache *cachep; - struct memcg_cache_params *params, *tmp; - - if (!memcg_kmem_is_active(memcg)) - return; - - mutex_lock(&memcg_slab_mutex); - list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { - cachep = memcg_params_to_cache(params); - memcg_unregister_cache(cachep); - } - mutex_unlock(&memcg_slab_mutex); -} - -struct memcg_register_cache_work { +struct memcg_kmem_cache_create_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; struct work_struct work; }; -static void memcg_register_cache_func(struct work_struct *w) +static void memcg_kmem_cache_create_func(struct work_struct *w) { - struct memcg_register_cache_work *cw = - container_of(w, struct memcg_register_cache_work, work); + struct memcg_kmem_cache_create_work *cw = + container_of(w, struct memcg_kmem_cache_create_work, work); struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; - mutex_lock(&memcg_slab_mutex); - memcg_register_cache(memcg, cachep); - mutex_unlock(&memcg_slab_mutex); + memcg_create_kmem_cache(memcg, cachep); css_put(&memcg->css); kfree(cw); @@ -2726,10 +2601,10 @@ static void memcg_register_cache_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. 
*/ -static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { - struct memcg_register_cache_work *cw; + struct memcg_kmem_cache_create_work *cw; cw = kmalloc(sizeof(*cw), GFP_NOWAIT); if (!cw) @@ -2739,18 +2614,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, cw->memcg = memcg; cw->cachep = cachep; + INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - INIT_WORK(&cw->work, memcg_register_cache_func); schedule_work(&cw->work); } -static void memcg_schedule_register_cache(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { /* * We need to stop accounting when we kmalloc, because if the * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_schedule_register_cache will recurse. + * in __memcg_schedule_kmem_cache_create will recurse. * * However, it is better to enclose the whole function. Depending on * the debugging options enabled, INIT_WORK(), for instance, can @@ -2759,7 +2634,7 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, * the safest choice is to do it like this, wrapping the whole function. */ current->memcg_kmem_skip_account = 1; - __memcg_schedule_register_cache(memcg, cachep); + __memcg_schedule_kmem_cache_create(memcg, cachep); current->memcg_kmem_skip_account = 0; } @@ -2807,7 +2682,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) * could happen with the slab_mutex held. So it's better to * defer everything. */ - memcg_schedule_register_cache(memcg, cachep); + memcg_schedule_kmem_cache_create(memcg, cachep); out: css_put(&memcg->css); return cachep; @@ -4136,7 +4011,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) static void memcg_destroy_kmem(struct mem_cgroup *memcg) { - memcg_unregister_all_caches(memcg); + memcg_destroy_kmem_caches(memcg); mem_cgroup_sockets_destroy(memcg); } #else @@ -4664,7 +4539,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) spin_lock_init(&memcg->event_list_lock); #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; - INIT_LIST_HEAD(&memcg->memcg_slab_caches); #endif return &memcg->css; diff --git a/mm/slab_common.c b/mm/slab_common.c index 1b782a2d3b3d..6e1e4cf65836 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -425,6 +425,49 @@ out_unlock: } EXPORT_SYMBOL(kmem_cache_create); +static int do_kmem_cache_shutdown(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + if (__kmem_cache_shutdown(s) != 0) { + printk(KERN_ERR "kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + return -EBUSY; + } + + if (s->flags & SLAB_DESTROY_BY_RCU) + *need_rcu_barrier = true; + +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) { + struct kmem_cache *root_cache = s->memcg_params->root_cache; + int memcg_id = memcg_cache_id(s->memcg_params->memcg); + + BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s); + root_cache->memcg_params->memcg_caches[memcg_id] = NULL; + } +#endif + list_move(&s->list, release); + return 0; +} + +static void do_kmem_cache_release(struct list_head *release, + bool need_rcu_barrier) +{ + struct kmem_cache *s, *s2; + + if (need_rcu_barrier) + rcu_barrier(); + + list_for_each_entry_safe(s, s2, release, list) { +#ifdef SLAB_SUPPORTS_SYSFS + sysfs_slab_remove(s); +#else + 
slab_kmem_cache_release(s); +#endif + } +} + #ifdef CONFIG_MEMCG_KMEM /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. @@ -435,10 +478,11 @@ EXPORT_SYMBOL(kmem_cache_create); * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ -struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) +void memcg_create_kmem_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ + int memcg_id = memcg_cache_id(memcg); struct kmem_cache *s = NULL; char *cache_name; @@ -447,6 +491,14 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. + */ + if (cache_from_memcg_idx(root_cache, memcg_id)) + goto out_unlock; + cgroup_name(mem_cgroup_css(memcg)->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, @@ -458,49 +510,73 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, root_cache->size, root_cache->align, root_cache->flags, root_cache->ctor, memcg, root_cache); + /* + * If we could not create a memcg cache, do not complain, because + * that's not critical at all as we can always proceed with the root + * cache. + */ if (IS_ERR(s)) { kfree(cache_name); - s = NULL; + goto out_unlock; } + /* + * Since readers won't lock (see cache_from_memcg_idx()), we need a + * barrier here to ensure nobody will see the kmem_cache partially + * initialized. + */ + smp_wmb(); + root_cache->memcg_params->memcg_caches[memcg_id] = s; + out_unlock: mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - - return s; } -static int memcg_cleanup_cache_params(struct kmem_cache *s) +void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { - int rc; + LIST_HEAD(release); + bool need_rcu_barrier = false; + struct kmem_cache *s, *s2; - if (!s->memcg_params || - !s->memcg_params->is_root_cache) - return 0; + get_online_cpus(); + get_online_mems(); - mutex_unlock(&slab_mutex); - rc = __memcg_cleanup_cache_params(s); mutex_lock(&slab_mutex); + list_for_each_entry_safe(s, s2, &slab_caches, list) { + if (is_root_cache(s) || s->memcg_params->memcg != memcg) + continue; + /* + * The cgroup is about to be freed and therefore has no charges + * left. Hence, all its caches must be empty by now. 
+ */ + BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); } mutex_unlock(&slab_mutex); - return rc; -} -#else -static int memcg_cleanup_cache_params(struct kmem_cache *s) -{ - return 0; + put_online_mems(); + put_online_cpus(); + + do_kmem_cache_release(&release, need_rcu_barrier); } #endif /* CONFIG_MEMCG_KMEM */ void slab_kmem_cache_release(struct kmem_cache *s) { + memcg_free_cache_params(s); kfree(s->name); kmem_cache_free(kmem_cache, s); } void kmem_cache_destroy(struct kmem_cache *s) { + int i; + LIST_HEAD(release); + bool need_rcu_barrier = false; + bool busy = false; + get_online_cpus(); get_online_mems(); @@ -510,35 +586,23 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - if (memcg_cleanup_cache_params(s) != 0) - goto out_unlock; + for_each_memcg_cache_index(i) { + struct kmem_cache *c = cache_from_memcg_idx(s, i); - if (__kmem_cache_shutdown(s) != 0) { - printk(KERN_ERR "kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); - dump_stack(); - goto out_unlock; + if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) + busy = true; } - list_del(&s->list); - - mutex_unlock(&slab_mutex); - if (s->flags & SLAB_DESTROY_BY_RCU) - rcu_barrier(); - - memcg_free_cache_params(s); -#ifdef SLAB_SUPPORTS_SYSFS - sysfs_slab_remove(s); -#else - slab_kmem_cache_release(s); -#endif - goto out; + if (!busy) + do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); out_unlock: mutex_unlock(&slab_mutex); -out: + put_online_mems(); put_online_cpus(); + + do_kmem_cache_release(&release, need_rcu_barrier); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit v1.2.3 From e4b294c2d8f73af4cd41ff30638ad0e4769dc56a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:24:46 -0800 Subject: mm: add fields for compound destructor and order into struct page Currently, we use lru.next/lru.prev plus a cast to access or set the destructor and order of a compound page. Let's replace them with explicit fields in struct page. Signed-off-by: Kirill A. Shutemov Acked-by: Jerome Marchand Acked-by: Christoph Lameter Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 ++++----- include/linux/mm_types.h | 8 ++++++++ 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 65db4aee738a..8dd4fde9d2e5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -627,29 +627,28 @@ int split_free_page(struct page *page); * prototype for that function and accessor functions. * These are _only_ valid on the head of a PG_compound page.
*/ -typedef void compound_page_dtor(struct page *); static inline void set_compound_page_dtor(struct page *page, compound_page_dtor *dtor) { - page[1].lru.next = (void *)dtor; + page[1].compound_dtor = dtor; } static inline compound_page_dtor *get_compound_page_dtor(struct page *page) { - return (compound_page_dtor *)page[1].lru.next; + return page[1].compound_dtor; } static inline int compound_order(struct page *page) { if (!PageHead(page)) return 0; - return (unsigned long)page[1].lru.prev; + return page[1].compound_order; } static inline void set_compound_order(struct page *page, unsigned long order) { - page[1].lru.prev = (void *)order; + page[1].compound_order = order; } #ifdef CONFIG_MMU diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 07c8bd3f7b48..20ff2105b564 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -28,6 +28,8 @@ struct mem_cgroup; IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) +typedef void compound_page_dtor(struct page *); + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -142,6 +144,12 @@ struct page { struct rcu_head rcu_head; /* Used by SLAB * when destroying via RCU */ + /* First tail page of compound page */ + struct { + compound_page_dtor *compound_dtor; + unsigned long compound_order; + }; + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS pgtable_t pmd_huge_pte; /* protected by page->ptl */ #endif -- cgit v1.2.3 From 1d148e218a0d0566b1c06f2f45f1436d53b049b2 Mon Sep 17 00:00:00 2001 From: "Wang, Yalin" Date: Wed, 11 Feb 2015 15:24:48 -0800 Subject: mm: add VM_BUG_ON_PAGE() to page_mapcount() Add VM_BUG_ON_PAGE() for slab pages. _mapcount shares a union with the slab fields in struct page, so we must avoid accessing _mapcount if the page is a slab page. Also remove the unneeded parentheses. Signed-off-by: Yalin Wang Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8dd4fde9d2e5..c6bf813a6b3d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -484,7 +484,8 @@ static inline void page_mapcount_reset(struct page *page) static inline int page_mapcount(struct page *page) { - return atomic_read(&(page)->_mapcount) + 1; + VM_BUG_ON_PAGE(PageSlab(page), page); + return atomic_read(&page->_mapcount) + 1; } static inline int page_count(struct page *page) -- cgit v1.2.3 From 56873f43abdcd574b25105867a990f067747b2f4 Mon Sep 17 00:00:00 2001 From: "Wang, Yalin" Date: Wed, 11 Feb 2015 15:24:51 -0800 Subject: mm: add KPF_ZERO_PAGE flag for /proc/kpageflags Add a KPF_ZERO_PAGE flag for the zero page, so that userspace processes can detect the zero page in /proc/kpageflags and then do memory analysis more accurately. Signed-off-by: Yalin Wang Acked-by: Kirill A.
Shutemov Cc: Konstantin Khlebnikov Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/pagemap.txt | 8 ++++++++ fs/proc/page.c | 16 +++++++++++++--- include/linux/huge_mm.h | 12 ++++++++++++ include/uapi/linux/kernel-page-flags.h | 1 + mm/huge_memory.c | 7 +------ tools/vm/page-types.c | 1 + 6 files changed, 36 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/Documentation/vm/pagemap.txt b/Documentation/vm/pagemap.txt index 5948e455c4d2..6fbd55ef6b45 100644 --- a/Documentation/vm/pagemap.txt +++ b/Documentation/vm/pagemap.txt @@ -62,6 +62,8 @@ There are three components to pagemap: 20. NOPAGE 21. KSM 22. THP + 23. BALLOON + 24. ZERO_PAGE Short descriptions to the page flags: @@ -102,6 +104,12 @@ Short descriptions to the page flags: 22. THP contiguous pages which construct transparent hugepages +23. BALLOON + balloon compaction page + +24. ZERO_PAGE + zero page for pfn_zero or huge_zero page + [IO related page flags] 1. ERROR IO error occurred 3. UPTODATE page has up-to-date data diff --git a/fs/proc/page.c b/fs/proc/page.c index 1e3187da1fed..7eee2d8b97d9 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -121,9 +122,18 @@ u64 stable_page_flags(struct page *page) * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon * to make sure a given page is a thp, not a non-huge compound page. */ - else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || - PageAnon(compound_head(page)))) - u |= 1 << KPF_THP; + else if (PageTransCompound(page)) { + struct page *head = compound_head(page); + + if (PageLRU(head) || PageAnon(head)) + u |= 1 << KPF_THP; + else if (is_huge_zero_page(head)) { + u |= 1 << KPF_ZERO_PAGE; + u |= 1 << KPF_THP; + } + } else if (is_zero_pfn(page_to_pfn(page))) + u |= 1 << KPF_ZERO_PAGE; + /* * Caveats on high order pages: page->_count will only be set diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ad9051bab267..f10b20f05159 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -157,6 +157,13 @@ static inline int hpage_nr_pages(struct page *page) extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp); +extern struct page *huge_zero_page; + +static inline bool is_huge_zero_page(struct page *page) +{ + return ACCESS_ONCE(huge_zero_page) == page; +} + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -206,6 +213,11 @@ static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_str return 0; } +static inline bool is_huge_zero_page(struct page *page) +{ + return false; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 2f96d233c980..a6c4962e5d46 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -32,6 +32,7 @@ #define KPF_KSM 21 #define KPF_THP 22 #define KPF_BALLOON 23 +#define KPF_ZERO_PAGE 24 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 817a875f2b8c..889713180980 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -171,12 +171,7 @@ static int start_khugepaged(void) } static atomic_t huge_zero_refcount; -static struct page *huge_zero_page __read_mostly; - -static inline bool 
is_huge_zero_page(struct page *page) -{ - return ACCESS_ONCE(huge_zero_page) == page; -} +struct page *huge_zero_page __read_mostly; static inline bool is_huge_zero_pmd(pmd_t pmd) { diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 264fbc297e0b..8bdf16b8ba60 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -133,6 +133,7 @@ static const char * const page_flag_names[] = { [KPF_KSM] = "x:ksm", [KPF_THP] = "t:thp", [KPF_BALLOON] = "o:balloon", + [KPF_ZERO_PAGE] = "z:zero_page", [KPF_RESERVED] = "r:reserved", [KPF_MLOCKED] = "m:mlocked", -- cgit v1.2.3 From 93aa7d95248d04b934eb8e89717c7b8d6400bf2b Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 11 Feb 2015 15:24:59 -0800 Subject: swap: remove unused mem_cgroup_uncharge_swapcache declaration The body of this function was removed by commit 0a31bc97c80c ("mm: memcontrol: rewrite uncharge API"). Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 15 --------------- mm/shmem.c | 2 +- 2 files changed, 1 insertion(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 34e8b60ab973..7067eca501e2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -437,16 +437,6 @@ extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; -#ifdef CONFIG_MEMCG -extern void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); -#else -static inline void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) -{ -} -#endif - #else /* CONFIG_SWAP */ #define swap_address_space(entry) (NULL) @@ -547,11 +537,6 @@ static inline swp_entry_t get_swap_page(void) return entry; } -static inline void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) -{ -} - #endif /* CONFIG_SWAP */ #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/shmem.c b/mm/shmem.c index b3e403181981..864c878401e6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1131,7 +1131,7 @@ repeat: * truncated or holepunched since swap was confirmed. * shmem_undo_range() will have done some of the * unaccounting, now delete_from_swap_cache() will do - * the rest (including mem_cgroup_uncharge_swapcache). + * the rest. * Reset swap.val? No, leave it so "failed" goes back to * "repeat": reading a hole and writing should succeed. */ -- cgit v1.2.3 From 6de226191d12fce30331ebf024ca3ed24834f0ee Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Feb 2015 15:25:01 -0800 Subject: mm: memcontrol: track move_lock state internally The complexity of memcg page stat synchronization is currently leaking into the callsites, forcing them to keep track of the move_lock state and the IRQ flags. Simplify the API by tracking it in the memcg. 
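The callsite simplification is easiest to see in the mm/rmap.c hunks below; in outline (a sketch with the surrounding statistics updates trimmed, not a verbatim excerpt):

  /* Before: every caller threads the lock state through by hand. */
  unsigned long flags;
  bool locked;
  struct mem_cgroup *memcg;

  memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
  mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
  mem_cgroup_end_page_stat(memcg, &locked, &flags);

  /* After: the memcg records move_lock_task/move_lock_flags itself, and
   * end_page_stat() unlocks only if current is the task that locked. */
  memcg = mem_cgroup_begin_page_stat(page);
  mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
  mem_cgroup_end_page_stat(memcg);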
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Vladimir Davydov Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 12 +++----- mm/memcontrol.c | 68 ++++++++++++++++++++++++++-------------------- mm/page-writeback.c | 12 +++----- mm/rmap.c | 12 +++----- 4 files changed, 51 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fb212e1d700d..76b4084b8d08 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -138,12 +138,10 @@ static inline bool mem_cgroup_disabled(void) return false; } -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked, - unsigned long *flags); -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, - unsigned long *flags); +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page); void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx, int val); +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg); static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) @@ -285,14 +283,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, - bool *locked, unsigned long *flags) +static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) { return NULL; } -static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, - bool *locked, unsigned long *flags) +static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3f8a4f52a0c..028d07c79104 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -325,9 +325,11 @@ struct mem_cgroup { /* * set > 0 if pages under this cgroup are moving to other cgroup. */ - atomic_t moving_account; + atomic_t moving_account; /* taken only while moving_account > 0 */ - spinlock_t move_lock; + spinlock_t move_lock; + struct task_struct *move_lock_task; + unsigned long move_lock_flags; /* * percpu counter. */ @@ -1977,34 +1979,33 @@ cleanup: /** * mem_cgroup_begin_page_stat - begin a page state statistics transaction * @page: page that is going to change accounted state - * @locked: &memcg->move_lock slowpath was taken - * @flags: IRQ-state flags for &memcg->move_lock * * This function must mark the beginning of an accounted page state * change to prevent double accounting when the page is concurrently * being moved to another memcg: * - * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + * memcg = mem_cgroup_begin_page_stat(page); * if (TestClearPageState(page)) * mem_cgroup_update_page_stat(memcg, state, -1); - * mem_cgroup_end_page_stat(memcg, locked, flags); - * - * The RCU lock is held throughout the transaction. The fast path can - * get away without acquiring the memcg->move_lock (@locked is false) - * because page moving starts with an RCU grace period. - * - * The RCU lock also protects the memcg from being freed when the page - * state that is going to change is the only thing preventing the page - * from being uncharged. E.g. end-writeback clearing PageWriteback(), - * which allows migration to go ahead and uncharge the page before the - * account transaction might be complete. 
+ * mem_cgroup_end_page_stat(memcg); */ -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, - bool *locked, - unsigned long *flags) +struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) { struct mem_cgroup *memcg; + unsigned long flags; + /* + * The RCU lock is held throughout the transaction. The fast + * path can get away without acquiring the memcg->move_lock + * because page moving starts with an RCU grace period. + * + * The RCU lock also protects the memcg from being freed when + * the page state that is going to change is the only thing + * preventing the page from being uncharged. + * E.g. end-writeback clearing PageWriteback(), which allows + * migration to go ahead and uncharge the page before the + * account transaction might be complete. + */ rcu_read_lock(); if (mem_cgroup_disabled()) @@ -2014,16 +2015,22 @@ again: if (unlikely(!memcg)) return NULL; - *locked = false; if (atomic_read(&memcg->moving_account) <= 0) return memcg; - spin_lock_irqsave(&memcg->move_lock, *flags); + spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { - spin_unlock_irqrestore(&memcg->move_lock, *flags); + spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } - *locked = true; + + /* + * When charge migration first begins, we can have locked and + * unlocked page stat updates happening concurrently. Track + * the task who has the lock for mem_cgroup_end_page_stat(). + */ + memcg->move_lock_task = current; + memcg->move_lock_flags = flags; return memcg; } @@ -2031,14 +2038,17 @@ again: /** * mem_cgroup_end_page_stat - finish a page state statistics transaction * @memcg: the memcg that was accounted against - * @locked: value received from mem_cgroup_begin_page_stat() - * @flags: value received from mem_cgroup_begin_page_stat() */ -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, - unsigned long *flags) +void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) { - if (memcg && *locked) - spin_unlock_irqrestore(&memcg->move_lock, *flags); + if (memcg && memcg->move_lock_task == current) { + unsigned long flags = memcg->move_lock_flags; + + memcg->move_lock_task = NULL; + memcg->move_lock_flags = 0; + + spin_unlock_irqrestore(&memcg->move_lock, flags); + } rcu_read_unlock(); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6f4335238e33..fb71e9deca85 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2308,12 +2308,10 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - unsigned long memcg_flags; struct mem_cgroup *memcg; - bool locked; int ret; - memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ -2338,19 +2336,17 @@ int test_clear_page_writeback(struct page *page) dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - unsigned long memcg_flags; struct mem_cgroup *memcg; - bool locked; int ret; - memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); + memcg = mem_cgroup_begin_page_stat(page); if (mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; @@ 
-2380,7 +2376,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); + mem_cgroup_end_page_stat(memcg); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index 70b32498d4f2..5e3e09081164 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1085,24 +1085,20 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page) { struct mem_cgroup *memcg; - unsigned long flags; - bool locked; - memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_page_stat(memcg, &locked, &flags); + mem_cgroup_end_page_stat(memcg); } static void page_remove_file_rmap(struct page *page) { struct mem_cgroup *memcg; - unsigned long flags; - bool locked; - memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); + memcg = mem_cgroup_begin_page_stat(page); /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) @@ -1123,7 +1119,7 @@ static void page_remove_file_rmap(struct page *page) if (unlikely(PageMlocked(page))) clear_page_mlock(page); out: - mem_cgroup_end_page_stat(memcg, &locked, &flags); + mem_cgroup_end_page_stat(memcg); } /** -- cgit v1.2.3 From 44628d9755e249aab9a6e1a17407d2f4278047ee Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 11 Feb 2015 15:25:10 -0800 Subject: mm: fix typo of MIGRATE_RESERVE in comment Found this while trying to jump to the definition of MIGRATE_RESERVE with ctags. Signed-off-by: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2f0856d14b21..b41829701334 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -426,7 +426,7 @@ struct zone { const char *name; /* - * Number of MIGRATE_RESEVE page block. To maintain for just + * Number of MIGRATE_RESERVE page block. To maintain for just * optimization. Protected by zone->lock. */ int nr_migrate_reserve_block; -- cgit v1.2.3 From e66f17ff71772b209eed39de35aaa99ba819c93d Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:25:22 -0800 Subject: mm/hugetlb: take page table lock in follow_huge_pmd() We have a race condition between move_pages() and freeing hugepages: move_pages() calls follow_page(FOLL_GET) for hugepages internally and tries to take their refcount without preventing concurrent freeing. This race crashes the kernel, so this patch fixes it by moving the FOLL_GET code for hugepages into follow_huge_pmd() and taking the page table lock there. This patch intentionally removes the page==NULL check after pte_page(). This is justified because pte_page() never returns NULL for any architecture or configuration. This patch changes the behavior of follow_huge_pmd() for tail pages, so tail pages can now be pinned and returned; the caller must be changed to handle the returned tail pages properly. We could add similar locking to follow_huge_(addr|pud) for consistency, but it's not necessary because these functions don't currently support the FOLL_GET flag, so let's leave that for future development.
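In outline, the fix makes the page lookup and the refcount acquisition a single critical section under the pmd lock, so the hugepage can no longer be freed between pte_page() and get_page(). A condensed sketch of the locked path (the full version, including the migration-entry retry via __migration_entry_wait(), is in the mm/hugetlb.c hunk below):

  spinlock_t *ptl = pmd_lockptr(mm, pmd);

  spin_lock(ptl);
  if (pmd_huge(*pmd) && pmd_present(*pmd)) {
          page = pte_page(*(pte_t *)pmd) +
                 ((address & ~PMD_MASK) >> PAGE_SHIFT);
          if (flags & FOLL_GET)
                  get_page(page);  /* safe: freeing is excluded by ptl */
  }
  spin_unlock(ptl);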
Here is the reproducer (header names restored to the usual ones for the calls used -- strtol/malloc, numa_move_pages, mmap/memset -- where the extraction had stripped them):

$ cat movepages.c
#include <stdio.h>
#include <stdlib.h>
#include <numaif.h>

#define ADDR_INPUT 0x700000000000UL
#define HPS 0x200000
#define PS 0x1000

int main(int argc, char *argv[]) {
        int i;
        int nr_hp = strtol(argv[1], NULL, 0);
        int nr_p = nr_hp * HPS / PS;
        int ret;
        void **addrs;
        int *status;
        int *nodes;
        pid_t pid;

        pid = strtol(argv[2], NULL, 0);
        addrs = malloc(sizeof(char *) * nr_p + 1);
        status = malloc(sizeof(char *) * nr_p + 1);
        nodes = malloc(sizeof(char *) * nr_p + 1);

        while (1) {
                for (i = 0; i < nr_p; i++) {
                        addrs[i] = (void *)ADDR_INPUT + i * PS;
                        nodes[i] = 1;
                        status[i] = 0;
                }
                ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
                                      MPOL_MF_MOVE_ALL);
                if (ret == -1)
                        err("move_pages");

                for (i = 0; i < nr_p; i++) {
                        addrs[i] = (void *)ADDR_INPUT + i * PS;
                        nodes[i] = 0;
                        status[i] = 0;
                }
                ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
                                      MPOL_MF_MOVE_ALL);
                if (ret == -1)
                        err("move_pages");
        }
        return 0;
}

$ cat hugepage.c
#include <stdio.h>
#include <sys/mman.h>
#include <string.h>

#define ADDR_INPUT 0x700000000000UL
#define HPS 0x200000

int main(int argc, char *argv[]) {
        int nr_hp = strtol(argv[1], NULL, 0);
        char *p;

        while (1) {
                p = mmap((void *)ADDR_INPUT, nr_hp * HPS,
                         PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
                if (p != (void *)ADDR_INPUT) {
                        perror("mmap");
                        break;
                }
                memset(p, 0, nr_hp * HPS);
                munmap(p, nr_hp * HPS);
        }
}

$ sysctl vm.nr_hugepages=40
$ ./hugepage 10 &
$ ./movepages 10 $(pgrep -f hugepage)

Fixes: e632a938d914 ("mm: migrate: add hugepage migration code to move_pages()") Signed-off-by: Naoya Horiguchi Reported-by: Hugh Dickins Cc: James Hogan Cc: David Rientjes Cc: Mel Gorman Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Luiz Capitulino Cc: Nishanth Aravamudan Cc: Lee Schermerhorn Cc: Steve Capper Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 8 ++++---- include/linux/swapops.h | 4 ++++ mm/gup.c | 25 ++++++++----------------- mm/hugetlb.c | 48 ++++++++++++++++++++++++++++++++++-------------- mm/migrate.c | 5 +++-- 5 files changed, 53 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7d7856359920..7b5785032049 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -99,9 +99,9 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write); + pmd_t *pmd, int flags); struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write); + pud_t *pud, int flags); int pmd_huge(pmd_t pmd); int pud_huge(pud_t pmd); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, @@ -133,8 +133,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) static inline void hugetlb_show_meminfo(void) { } -#define follow_huge_pmd(mm, addr, pmd, write) NULL -#define follow_huge_pud(mm, addr, pud, write) NULL +#define follow_huge_pmd(mm, addr, pmd, flags) NULL +#define follow_huge_pud(mm, addr, pud, flags) NULL #define prepare_hugepage_range(file, addr, len) (-EINVAL) #define pmd_huge(x) 0 #define pud_huge(x) 0 diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 50cbc876be56..831a3168ab35 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -135,6 +135,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry) *entry =
swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); } +extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl); extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); extern void migration_entry_wait_huge(struct vm_area_struct *vma, @@ -148,6 +150,8 @@ static inline int is_migration_entry(swp_entry_t swp) } #define migration_entry_to_page(swp) NULL static inline void make_migration_entry_read(swp_entry_t *entryp) { } +static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, diff --git a/mm/gup.c b/mm/gup.c index 12bc2bc33da7..1a8ab05918e0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pud_none(*pud)) return no_page_table(vma, flags); if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - if (flags & FOLL_GET) - return NULL; - page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); - return page; + page = follow_huge_pud(mm, address, pud, flags); + if (page) + return page; + return no_page_table(vma, flags); } if (unlikely(pud_bad(*pud))) return no_page_table(vma, flags); @@ -179,19 +179,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pmd_none(*pmd)) return no_page_table(vma, flags); if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); - if (flags & FOLL_GET) { - /* - * Refcount on tail pages are not well-defined and - * shouldn't be taken. The caller should handle a NULL - * return when trying to follow tail pages. - */ - if (PageHead(page)) - get_page(page); - else - page = NULL; - } - return page; + page = follow_huge_pmd(mm, address, pmd, flags); + if (page) + return page; + return no_page_table(vma, flags); } if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) return no_page_table(vma, flags); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d96b8bfa748f..5aca3707450f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3675,28 +3675,48 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, struct page * __weak follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) + pmd_t *pmd, int flags) { - struct page *page; - - if (!pmd_present(*pmd)) - return NULL; - page = pte_page(*(pte_t *)pmd); - if (page) - page += ((address & ~PMD_MASK) >> PAGE_SHIFT); + struct page *page = NULL; + spinlock_t *ptl; +retry: + ptl = pmd_lockptr(mm, pmd); + spin_lock(ptl); + /* + * make sure that the address range covered by this pmd is not + * unmapped from other threads. + */ + if (!pmd_huge(*pmd)) + goto out; + if (pmd_present(*pmd)) { + page = pte_page(*(pte_t *)pmd) + + ((address & ~PMD_MASK) >> PAGE_SHIFT); + if (flags & FOLL_GET) + get_page(page); + } else { + if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { + spin_unlock(ptl); + __migration_entry_wait(mm, (pte_t *)pmd, ptl); + goto retry; + } + /* + * hwpoisoned entry is treated as no_page_table in + * follow_page_mask(). 
+ */ + } +out: + spin_unlock(ptl); return page; } struct page * __weak follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) + pud_t *pud, int flags) { - struct page *page; + if (flags & FOLL_GET) + return NULL; - page = pte_page(*(pte_t *)pud); - if (page) - page += ((address & ~PUD_MASK) >> PAGE_SHIFT); - return page; + return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); } #ifdef CONFIG_MEMORY_FAILURE diff --git a/mm/migrate.c b/mm/migrate.c index 6e284bcca8bb..f98067e5d353 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -197,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. */ -static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, +void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, spinlock_t *ptl) { pte_t pte; @@ -1236,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto put_and_set; if (PageHuge(page)) { - isolate_huge_page(page, &pagelist); + if (PageHead(page)) + isolate_huge_page(page, &pagelist); goto put_and_set; } -- cgit v1.2.3 From 1a6d53a105406d97396c87511afd6f09b4dc8ad2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Feb 2015 15:25:44 -0800 Subject: mm: reduce try_to_compact_pages parameters Expand the use of struct alloc_context, introduced in the previous patch, to try_to_compact_pages() as well, reducing the number of its parameters. Since that function is in a different compilation unit, we need to move the alloc_context definition into the shared mm/internal.h header. With this change we get simpler code and small savings of code size and stack usage: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-27 (-27) function old new delta __alloc_pages_direct_compact 283 256 -27 add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-13 (-13) function old new delta try_to_compact_pages 582 569 -13 Stack usage of __alloc_pages_direct_compact goes from 24 to none (per scripts/checkstack.pl). Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Mel Gorman Cc: Zhang Yanfei Cc: Minchan Kim Cc: David Rientjes Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Cc: "Kirill A.
Shutemov" Cc: Johannes Weiner Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 17 +++++++++-------- mm/compaction.c | 23 +++++++++++------------ mm/internal.h | 22 ++++++++++++++++++++++ mm/page_alloc.c | 27 ++------------------------- 4 files changed, 44 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 3238ffa33f68..f2efda2e6ac6 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -21,6 +21,8 @@ /* Zone lock or lru_lock was contended in async compaction */ #define COMPACT_CONTENDED_LOCK 2 +struct alloc_context; /* in mm/internal.h */ + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, @@ -30,10 +32,9 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); extern int fragmentation_index(struct zone *zone, unsigned int order); -extern unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *mask, - enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx); +extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order, @@ -101,10 +102,10 @@ static inline bool compaction_restarting(struct zone *zone, int order) } #else -static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) +static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, + unsigned int order, int alloc_flags, + const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { return COMPACT_CONTINUE; } diff --git a/mm/compaction.c b/mm/compaction.c index 546e571e9d60..9c7e6909dd29 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1335,22 +1335,20 @@ int sysctl_extfrag_threshold = 500; /** * try_to_compact_pages - Direct compact to satisfy a high-order allocation - * @zonelist: The zonelist used for the current allocation - * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation - * @nodemask: The allowed nodes to allocate from + * @order: The order of the current allocation + * @alloc_flags: The allocation flags of the current allocation + * @ac: The context of current allocation * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that determines if compaction was aborted due to * need_resched() or lock contention * * This is the main entry point for direct page compaction. 
*/ -unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) +unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { - enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; @@ -1365,8 +1363,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, return COMPACT_SKIPPED; /* Compact each zone in the list */ - for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, - nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { int status; int zone_contended; @@ -1374,7 +1372,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, continue; status = compact_zone_order(zone, order, gfp_mask, mode, - &zone_contended, alloc_flags, classzone_idx); + &zone_contended, alloc_flags, + ac->classzone_idx); rc = max(status, rc); /* * It takes at least one zone that wasn't lock contended @@ -1384,7 +1383,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), - classzone_idx, alloc_flags)) { + ac->classzone_idx, alloc_flags)) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller diff --git a/mm/internal.h b/mm/internal.h index efad241f7014..c4d6c9b43491 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -109,6 +109,28 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); * in mm/page_alloc.c */ +/* + * Structure for holding the mostly immutable allocation parameters passed + * between functions involved in allocations, including the alloc_pages* + * family of functions. + * + * nodemask, migratetype and high_zoneidx are initialized only once in + * __alloc_pages_nodemask() and then never change. + * + * zonelist, preferred_zone and classzone_idx are set first in + * __alloc_pages_nodemask() for the fast path, and might be later changed + * in __alloc_pages_slowpath(). All other functions pass the whole strucure + * by a const pointer. + */ +struct alloc_context { + struct zonelist *zonelist; + nodemask_t *nodemask; + struct zone *preferred_zone; + int classzone_idx; + int migratetype; + enum zone_type high_zoneidx; +}; + /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4aead0bd8d44..d664eb922a7d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -232,27 +232,6 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif -/* - * Structure for holding the mostly immutable allocation parameters passed - * between alloc_pages* family of functions. - * - * nodemask, migratetype and high_zoneidx are initialized only once in - * __alloc_pages_nodemask() and then never change. - * - * zonelist, preferred_zone and classzone_idx are set first in - * __alloc_pages_nodemask() for the fast path, and might be later changed - * in __alloc_pages_slowpath(). All other functions pass the whole strucure - * by a const pointer. 
- */ -struct alloc_context { - struct zonelist *zonelist; - nodemask_t *nodemask; - struct zone *preferred_zone; - int classzone_idx; - int migratetype; - enum zone_type high_zoneidx; -}; - int page_group_by_mobility_disabled __read_mostly; void set_pageblock_migratetype(struct page *page, int migratetype) @@ -2429,10 +2408,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; current->flags |= PF_MEMALLOC; - compact_result = try_to_compact_pages(ac->zonelist, order, gfp_mask, - ac->nodemask, mode, - contended_compaction, - alloc_flags, ac->classzone_idx); + compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, + mode, contended_compaction); current->flags &= ~PF_MEMALLOC; switch (compact_result) { -- cgit v1.2.3 From 05891fb06517d19ae5357c9dc44e96bbe0300a3c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Feb 2015 15:25:47 -0800 Subject: mm: microoptimize zonelist operations next_zones_zonelist() returns a zoneref pointer, as well as a zone pointer via extra parameter. Since the latter can be trivially obtained by dereferencing the former, the overhead of the extra parameter is unjustified. This patch thus removes the zone parameter from next_zones_zonelist(). Both callers happen to be in the same header file, so it's simple to add the zoneref dereference inline. We save some bytes of code size. add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-105 (-105) function old new delta nr_free_zone_pages 129 115 -14 __alloc_pages_nodemask 2300 2285 -15 get_page_from_freelist 2652 2576 -76 add/remove: 0/0 grow/shrink: 1/0 up/down: 10/0 (10) function old new delta try_to_compact_pages 569 579 +10 Signed-off-by: Vlastimil Babka Cc: Mel Gorman Cc: Zhang Yanfei Cc: Minchan Kim Cc: David Rientjes Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Cc: "Kirill A. 
Shutemov" Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 13 +++++++------ mm/mmzone.c | 4 +--- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b41829701334..f279d9c158cd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -970,7 +970,6 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) * @z - The cursor used as a starting point for the search * @highest_zoneidx - The zone index of the highest zone to return * @nodes - An optional nodemask to filter the zonelist with - * @zone - The first suitable zone found is returned via this parameter * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the @@ -980,8 +979,7 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) */ struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, - nodemask_t *nodes, - struct zone **zone); + nodemask_t *nodes); /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist @@ -1000,8 +998,10 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, nodemask_t *nodes, struct zone **zone) { - return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes, - zone); + struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs, + highest_zoneidx, nodes); + *zone = zonelist_zone(z); + return z; } /** @@ -1018,7 +1018,8 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ zone; \ - z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \ + z = next_zones_zonelist(++z, highidx, nodemask), \ + zone = zonelist_zone(z)) \ /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index diff --git a/mm/mmzone.c b/mm/mmzone.c index bf34fb8556db..7d87ebb0d632 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -54,8 +54,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) /* Returns the next zone at or below highest_zoneidx in a zonelist */ struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, - nodemask_t *nodes, - struct zone **zone) + nodemask_t *nodes) { /* * Find the next suitable zone to use for the allocation. @@ -69,7 +68,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, (z->zone && !zref_in_nodemask(z, nodes))) z++; - *zone = zonelist_zone(z); return z; } -- cgit v1.2.3 From 90cbc2508827e1e15dca23361c33cc26dd2b9e99 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 11 Feb 2015 15:25:55 -0800 Subject: vmscan: force scan offline memory cgroups Since commit b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined groups") pages charged to a memory cgroup are not reparented when the cgroup is removed. Instead, they are supposed to be reclaimed in a regular way, along with pages accounted to online memory cgroups. However, an lruvec of an offline memory cgroup will sooner or later get so small that it will be scanned only at low scan priorities (see get_scan_count()). 
Therefore, if there are enough reclaimable pages in big lruvecs, pages accounted to offline memory cgroups will never be scanned at all, wasting memory. Fix this by unconditionally forcing scanning dead lruvecs from kswapd. [akpm@linux-foundation.org: fix build] Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ mm/memcontrol.c | 14 ++++++++++++++ mm/vmscan.c | 8 ++++++-- 3 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 76b4084b8d08..353537a5981a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -102,6 +102,7 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); * For memory reclaim. */ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); +bool mem_cgroup_lruvec_online(struct lruvec *lruvec); int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); @@ -266,6 +267,11 @@ mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return 1; } +static inline bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +{ + return true; +} + static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 028d07c79104..6187ca4d5dc2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1367,6 +1367,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) return inactive * inactive_ratio < active; } +bool mem_cgroup_lruvec_online(struct lruvec *lruvec) +{ + struct mem_cgroup_per_zone *mz; + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return true; + + mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); + memcg = mz->memcg; + + return !!(memcg->css.flags & CSS_ONLINE); +} + #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) diff --git a/mm/vmscan.c b/mm/vmscan.c index f756a202d5d5..b6dfa0081a8e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1903,8 +1903,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, * latencies, so it's better to scan a minimum amount there as * well. */ - if (current_is_kswapd() && !zone_reclaimable(zone)) - force_scan = true; + if (current_is_kswapd()) { + if (!zone_reclaimable(zone)) + force_scan = true; + if (!mem_cgroup_lruvec_online(lruvec)) + force_scan = true; + } if (!global_reclaim(sc)) force_scan = true; -- cgit v1.2.3 From 650c5e565492f9092552bfe4d65935196c7d9567 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Feb 2015 15:26:03 -0800 Subject: mm: page_counter: pull "-1" handling out of page_counter_memparse() The unified hierarchy interface for memory cgroups will no longer use "-1" to mean maximum possible resource value. In preparation for this, make the string an argument and let the caller supply it. 
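At the call sites this turns into the following pattern (a sketch: "-1" is the token the existing legacy files keep passing in the hunks below, while "infinity" is the string passed by the follow-up default-hierarchy patch):

	unsigned long nr_pages;
	int err;

	/* legacy cgroup files keep "-1" as their "no limit" token */
	err = page_counter_memparse(buf, "-1", &nr_pages);

	/* the unified hierarchy can supply a more descriptive token */
	err = page_counter_memparse(buf, "infinity", &nr_pages);
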
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_counter.h | 3 ++- mm/hugetlb_cgroup.c | 2 +- mm/memcontrol.c | 4 ++-- mm/page_counter.c | 7 ++++--- net/ipv4/tcp_memcontrol.c | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 955421575d16..17fa4f8de3a6 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -41,7 +41,8 @@ int page_counter_try_charge(struct page_counter *counter, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); int page_counter_limit(struct page_counter *counter, unsigned long limit); -int page_counter_memparse(const char *buf, unsigned long *nr_pages); +int page_counter_memparse(const char *buf, const char *max, + unsigned long *nr_pages); static inline void page_counter_reset_watermark(struct page_counter *counter) { diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 037e1c00a5b7..6e0057439a46 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -279,7 +279,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return -EINVAL; buf = strstrip(buf); - ret = page_counter_memparse(buf, &nr_pages); + ret = page_counter_memparse(buf, "-1", &nr_pages); if (ret) return ret; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dc5c4cd0afdf..6453ea5a27aa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3414,7 +3414,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, int ret; buf = strstrip(buf); - ret = page_counter_memparse(buf, &nr_pages); + ret = page_counter_memparse(buf, "-1", &nr_pages); if (ret) return ret; @@ -3786,7 +3786,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, unsigned long usage; int i, size, ret; - ret = page_counter_memparse(args, &threshold); + ret = page_counter_memparse(args, "-1", &threshold); if (ret) return ret; diff --git a/mm/page_counter.c b/mm/page_counter.c index a009574fbba9..11b4beda14ba 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -166,18 +166,19 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit) /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse + * @max: string meaning maximum possible value * @nr_pages: returns the result in number of pages * * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be * limited to %PAGE_COUNTER_MAX. 
*/ -int page_counter_memparse(const char *buf, unsigned long *nr_pages) +int page_counter_memparse(const char *buf, const char *max, + unsigned long *nr_pages) { - char unlimited[] = "-1"; char *end; u64 bytes; - if (!strncmp(buf, unlimited, sizeof(unlimited))) { + if (!strcmp(buf, max)) { *nr_pages = PAGE_COUNTER_MAX; return 0; } diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 272327134a1b..c2a75c6957a1 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -120,7 +120,7 @@ static ssize_t tcp_cgroup_write(struct kernfs_open_file *of, switch (of_cft(of)->private) { case RES_LIMIT: /* see memcontrol.c */ - ret = page_counter_memparse(buf, &nr_pages); + ret = page_counter_memparse(buf, "-1", &nr_pages); if (ret) break; mutex_lock(&tcp_limit_mutex); -- cgit v1.2.3 From 241994ed8649f7300667be8b13a9e04ae04e05a1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Feb 2015 15:26:06 -0800 Subject: mm: memcontrol: default hierarchy interface for memory Introduce the basic control files to account, partition, and limit memory using cgroups in default hierarchy mode. This interface versioning allows us to address fundamental design issues in the existing memory cgroup interface, further explained below. The old interface will be maintained indefinitely, but a clearer model and improved workload performance should encourage existing users to switch over to the new one eventually. The control files are thus: - memory.current shows the current consumption of the cgroup and its descendants, in bytes. - memory.low configures the lower end of the cgroup's expected memory consumption range. The kernel considers memory below that boundary to be a reserve - the minimum that the workload needs in order to make forward progress - and generally avoids reclaiming it, unless there is an imminent risk of entering an OOM situation. - memory.high configures the upper end of the cgroup's expected memory consumption range. A cgroup whose consumption grows beyond this threshold is forced into direct reclaim, to work off the excess and to throttle new allocations heavily, but is generally allowed to continue and the OOM killer is not invoked. - memory.max configures the hard maximum amount of memory that the cgroup is allowed to consume before the OOM killer is invoked. - memory.events shows event counters that indicate how often the cgroup was reclaimed while below memory.low, how often it was forced to reclaim excess beyond memory.high, how often it hit memory.max, and how often it entered OOM due to memory.max. This allows users to identify configuration problems when observing a degradation in workload performance. An overcommitted system will have an increased rate of low boundary breaches, whereas increased rates of high limit breaches, maximum hits, or even OOM situations will indicate internally overcommitted cgroups. For existing users of memory cgroups, the following deviations from the current interface are worth pointing out and explaining: - The original lower boundary, the soft limit, is defined as a limit that is per default unset. As a result, the set of cgroups that global reclaim prefers is opt-in, rather than opt-out. The costs for optimizing these mostly negative lookups are so high that the implementation, despite its enormous size, does not even provide the basic desirable behavior. First off, the soft limit has no hierarchical meaning. 
All configured groups are organized in a global rbtree and treated like equal peers, regardless where they are located in the hierarchy. This makes subtree delegation impossible. Second, the soft limit reclaim pass is so aggressive that it not just introduces high allocation latencies into the system, but also impacts system performance due to overreclaim, to the point where the feature becomes self-defeating. The memory.low boundary on the other hand is a top-down allocated reserve. A cgroup enjoys reclaim protection when it and all its ancestors are below their low boundaries, which makes delegation of subtrees possible. Secondly, new cgroups have no reserve per default and in the common case most cgroups are eligible for the preferred reclaim pass. This allows the new low boundary to be efficiently implemented with just a minor addition to the generic reclaim code, without the need for out-of-band data structures and reclaim passes. Because the generic reclaim code considers all cgroups except for the ones running low in the preferred first reclaim pass, overreclaim of individual groups is eliminated as well, resulting in much better overall workload performance. - The original high boundary, the hard limit, is defined as a strict limit that can not budge, even if the OOM killer has to be called. But this generally goes against the goal of making the most out of the available memory. The memory consumption of workloads varies during runtime, and that requires users to overcommit. But doing that with a strict upper limit requires either a fairly accurate prediction of the working set size or adding slack to the limit. Since working set size estimation is hard and error prone, and getting it wrong results in OOM kills, most users tend to err on the side of a looser limit and end up wasting precious resources. The memory.high boundary on the other hand can be set much more conservatively. When hit, it throttles allocations by forcing them into direct reclaim to work off the excess, but it never invokes the OOM killer. As a result, a high boundary that is chosen too aggressively will not terminate the processes, but instead it will lead to gradual performance degradation. The user can monitor this and make corrections until the minimal memory footprint that still gives acceptable performance is found. In extreme cases, with many concurrent allocations and a complete breakdown of reclaim progress within the group, the high boundary can be exceeded. But even then it's mostly better to satisfy the allocation from the slack available in other groups or the rest of the system than killing the group. Otherwise, memory.max is there to limit this type of spillover and ultimately contain buggy or even malicious applications. - The original control file names are unwieldy and inconsistent in many different ways. For example, the upper boundary hit count is exported in the memory.failcnt file, but an OOM event count has to be manually counted by listening to memory.oom_control events, and lower boundary / soft limit events have to be counted by first setting a threshold for that value and then counting those events. Also, usage and limit files encode their units in the filename. That makes the filenames very long, even though this is not information that a user needs to be reminded of every time they type out those names. 
To address these naming issues, as well as to signal clearly that the new interface carries a new configuration model, the naming conventions in it necessarily differ from the old interface. - The original limit files indicate the state of an unset limit with a very high number, and a configured limit can be unset by echoing -1 into those files. But that very high number is implementation and architecture dependent and not very descriptive. And while -1 can be understood as an underflow into the highest possible value, -2 or -10M etc. do not work, so it's not consistent. memory.low, memory.high, and memory.max will use the string "infinity" to indicate and set the highest possible value. [akpm@linux-foundation.org: use seq_puts() for basic strings] Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/unified-hierarchy.txt | 79 ++++++++++ include/linux/memcontrol.h | 32 ++++ mm/memcontrol.c | 229 ++++++++++++++++++++++++++-- mm/vmscan.c | 22 ++- 4 files changed, 348 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt index 4f4563277864..71daa35ec2d9 100644 --- a/Documentation/cgroups/unified-hierarchy.txt +++ b/Documentation/cgroups/unified-hierarchy.txt @@ -327,6 +327,85 @@ supported and the interface files "release_agent" and - use_hierarchy is on by default and the cgroup file for the flag is not created. +- The original lower boundary, the soft limit, is defined as a limit + that is per default unset. As a result, the set of cgroups that + global reclaim prefers is opt-in, rather than opt-out. The costs + for optimizing these mostly negative lookups are so high that the + implementation, despite its enormous size, does not even provide the + basic desirable behavior. First off, the soft limit has no + hierarchical meaning. All configured groups are organized in a + global rbtree and treated like equal peers, regardless where they + are located in the hierarchy. This makes subtree delegation + impossible. Second, the soft limit reclaim pass is so aggressive + that it not just introduces high allocation latencies into the + system, but also impacts system performance due to overreclaim, to + the point where the feature becomes self-defeating. + + The memory.low boundary on the other hand is a top-down allocated + reserve. A cgroup enjoys reclaim protection when it and all its + ancestors are below their low boundaries, which makes delegation of + subtrees possible. Secondly, new cgroups have no reserve per + default and in the common case most cgroups are eligible for the + preferred reclaim pass. This allows the new low boundary to be + efficiently implemented with just a minor addition to the generic + reclaim code, without the need for out-of-band data structures and + reclaim passes. Because the generic reclaim code considers all + cgroups except for the ones running low in the preferred first + reclaim pass, overreclaim of individual groups is eliminated as + well, resulting in much better overall workload performance. + +- The original high boundary, the hard limit, is defined as a strict + limit that can not budge, even if the OOM killer has to be called. + But this generally goes against the goal of making the most out of + the available memory. The memory consumption of workloads varies + during runtime, and that requires users to overcommit.
But doing + that with a strict upper limit requires either a fairly accurate + prediction of the working set size or adding slack to the limit. + Since working set size estimation is hard and error prone, and + getting it wrong results in OOM kills, most users tend to err on the + side of a looser limit and end up wasting precious resources. + + The memory.high boundary on the other hand can be set much more + conservatively. When hit, it throttles allocations by forcing them + into direct reclaim to work off the excess, but it never invokes the + OOM killer. As a result, a high boundary that is chosen too + aggressively will not terminate the processes, but instead it will + lead to gradual performance degradation. The user can monitor this + and make corrections until the minimal memory footprint that still + gives acceptable performance is found. + + In extreme cases, with many concurrent allocations and a complete + breakdown of reclaim progress within the group, the high boundary + can be exceeded. But even then it's mostly better to satisfy the + allocation from the slack available in other groups or the rest of + the system than killing the group. Otherwise, memory.max is there + to limit this type of spillover and ultimately contain buggy or even + malicious applications. + +- The original control file names are unwieldy and inconsistent in + many different ways. For example, the upper boundary hit count is + exported in the memory.failcnt file, but an OOM event count has to + be manually counted by listening to memory.oom_control events, and + lower boundary / soft limit events have to be counted by first + setting a threshold for that value and then counting those events. + Also, usage and limit files encode their units in the filename. + That makes the filenames very long, even though this is not + information that a user needs to be reminded of every time they type + out those names. + + To address these naming issues, as well as to signal clearly that + the new interface carries a new configuration model, the naming + conventions in it necessarily differ from the old interface. + +- The original limit files indicate the state of an unset limit with a + Very High Number, and a configured limit can be unset by echoing -1 + into those files. But that very high number is implementation and + architecture dependent and not very descriptive. And while -1 can + be understood as an underflow into the highest possible value, -2 or + -10M etc. do not work, so it's not consistent. + + memory.low, memory.high, and memory.max will use the string + "infinity" to indicate and set the highest possible value. 5. 
Planned Changes diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 353537a5981a..6cfd934c7c9b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,7 +52,27 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; +enum mem_cgroup_events_index { + MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ + MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ + MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ + MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ + MEM_CGROUP_EVENTS_NSTATS, + /* default hierarchy events */ + MEMCG_LOW = MEM_CGROUP_EVENTS_NSTATS, + MEMCG_HIGH, + MEMCG_MAX, + MEMCG_OOM, + MEMCG_NR_EVENTS, +}; + #ifdef CONFIG_MEMCG +void mem_cgroup_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx, + unsigned int nr); + +bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); + int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp); void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, @@ -175,6 +195,18 @@ void mem_cgroup_split_huge_fixup(struct page *head); #else /* CONFIG_MEMCG */ struct mem_cgroup; +static inline void mem_cgroup_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx, + unsigned int nr) +{ +} + +static inline bool mem_cgroup_low(struct mem_cgroup *root, + struct mem_cgroup *memcg) +{ + return false; +} + static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6453ea5a27aa..ee97c9ac62c0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -97,14 +97,6 @@ static const char * const mem_cgroup_stat_names[] = { "swap", }; -enum mem_cgroup_events_index { - MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ - MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ - MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ - MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ - MEM_CGROUP_EVENTS_NSTATS, -}; - static const char * const mem_cgroup_events_names[] = { "pgpgin", "pgpgout", @@ -138,7 +130,7 @@ enum mem_cgroup_events_target { struct mem_cgroup_stat_cpu { long count[MEM_CGROUP_STAT_NSTATS]; - unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; + unsigned long events[MEMCG_NR_EVENTS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; }; @@ -284,6 +276,10 @@ struct mem_cgroup { struct page_counter memsw; struct page_counter kmem; + /* Normal memory consumption range */ + unsigned long low; + unsigned long high; + unsigned long soft_limit; /* vmpressure notifications */ @@ -2315,6 +2311,8 @@ retry: if (!(gfp_mask & __GFP_WAIT)) goto nomem; + mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); + nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); @@ -2356,6 +2354,8 @@ retry: if (fatal_signal_pending(current)) goto bypass; + mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) @@ -2367,6 +2367,16 @@ done_restock: css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); + /* + * If the hierarchy is above the normal consumption range, + * make the charging task trim their excess contribution. 
+ */ + do { + if (page_counter_read(&memcg->memory) <= memcg->high) + continue; + mem_cgroup_events(memcg, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + } while ((memcg = parent_mem_cgroup(memcg))); done: return ret; } @@ -4276,7 +4286,7 @@ out_kfree: return ret; } -static struct cftype mem_cgroup_files[] = { +static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), @@ -4552,6 +4562,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent_css == NULL) { root_mem_cgroup = memcg; page_counter_init(&memcg->memory, NULL); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); @@ -4597,6 +4608,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (parent->use_hierarchy) { page_counter_init(&memcg->memory, &parent->memory); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); @@ -4607,6 +4619,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) */ } else { page_counter_init(&memcg->memory, NULL); + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); @@ -4682,6 +4695,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); + memcg->low = 0; + memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; } @@ -5267,6 +5282,147 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) mem_cgroup_from_css(root_css)->use_hierarchy = true; } +static u64 memory_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return mem_cgroup_usage(mem_cgroup_from_css(css), false); +} + +static int memory_low_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long low = ACCESS_ONCE(memcg->low); + + if (low == PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_low_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long low; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &low); + if (err) + return err; + + memcg->low = low; + + return nbytes; +} + +static int memory_high_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long high = ACCESS_ONCE(memcg->high); + + if (high == PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &high); + if (err) + return err; + + memcg->high = high; + + return nbytes; +} + +static int memory_max_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long max = ACCESS_ONCE(memcg->memory.limit); + + if (max 
== PAGE_COUNTER_MAX) + seq_puts(m, "infinity\n"); + else + seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "infinity", &max); + if (err) + return err; + + err = mem_cgroup_resize_limit(memcg, max); + if (err) + return err; + + return nbytes; +} + +static int memory_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); + seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); + seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); + seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); + + return 0; +} + +static struct cftype memory_files[] = { + { + .name = "current", + .read_u64 = memory_current_read, + }, + { + .name = "low", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_low_show, + .write = memory_low_write, + }, + { + .name = "high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_high_show, + .write = memory_high_write, + }, + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_max_show, + .write = memory_max_write, + }, + { + .name = "events", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_events_show, + }, + { } /* terminate */ +}; + struct cgroup_subsys memory_cgrp_subsys = { .css_alloc = mem_cgroup_css_alloc, .css_online = mem_cgroup_css_online, @@ -5277,7 +5433,8 @@ struct cgroup_subsys memory_cgrp_subsys = { .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, - .legacy_cftypes = mem_cgroup_files, + .dfl_cftypes = memory_files, + .legacy_cftypes = mem_cgroup_legacy_files, .early_init = 0, }; @@ -5312,6 +5469,56 @@ static void __init enable_swap_cgroup(void) } #endif +/** + * mem_cgroup_events - count memory events against a cgroup + * @memcg: the memory cgroup + * @idx: the event index + * @nr: the number of events to account for + */ +void mem_cgroup_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx, + unsigned int nr) +{ + this_cpu_add(memcg->stat->events[idx], nr); +} + +/** + * mem_cgroup_low - check if memory consumption is below the normal range + * @root: the highest ancestor to consider + * @memcg: the memory cgroup to check + * + * Returns %true if memory consumption of @memcg, and that of all + * configurable ancestors up to @root, is below the normal range. + */ +bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return false; + + /* + * The toplevel group doesn't have a configurable range, so + * it's never low when looked at directly, and it is not + * considered an ancestor when assessing the hierarchy. 
+ */ + + if (memcg == root_mem_cgroup) + return false; + + if (page_counter_read(&memcg->memory) > memcg->low) + return false; + + while (memcg != root) { + memcg = parent_mem_cgroup(memcg); + + if (memcg == root_mem_cgroup) + break; + + if (page_counter_read(&memcg->memory) > memcg->low) + return false; + } + return true; +} + #ifdef CONFIG_MEMCG_SWAP /** * mem_cgroup_swapout - transfer a memsw charge to swap diff --git a/mm/vmscan.c b/mm/vmscan.c index b6dfa0081a8e..8e645ee52045 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -91,6 +91,9 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; + /* Can cgroups be reclaimed below their normal consumption range? */ + unsigned int may_thrash:1; + unsigned int hibernation_mode:1; /* One of the zones is ready for compaction */ @@ -2294,6 +2297,12 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, struct lruvec *lruvec; int swappiness; + if (mem_cgroup_low(root, memcg)) { + if (!sc->may_thrash) + continue; + mem_cgroup_events(memcg, MEMCG_LOW, 1); + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); swappiness = mem_cgroup_swappiness(memcg); @@ -2315,8 +2324,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, mem_cgroup_iter_break(root, memcg); break; } - memcg = mem_cgroup_iter(root, memcg, &reclaim); - } while (memcg); + } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); /* * Shrink the slab caches in the same proportion that @@ -2519,10 +2527,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { + int initial_priority = sc->priority; unsigned long total_scanned = 0; unsigned long writeback_threshold; bool zones_reclaimable; - +retry: delayacct_freepages_start(); if (global_reclaim(sc)) @@ -2572,6 +2581,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (sc->compaction_ready) return 1; + /* Untapped cgroup reserves? Don't OOM, retry. */ + if (!sc->may_thrash) { + sc->priority = initial_priority; + sc->may_thrash = 1; + goto retry; + } + /* Any of the zones still reclaimable? Don't OOM. */ if (zones_reclaimable) return 1; -- cgit v1.2.3 From 49550b605587924b3336386caae53200c68969d3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 11 Feb 2015 15:26:12 -0800 Subject: oom: add helpers for setting and clearing TIF_MEMDIE This patchset addresses a race which was described in the changelog for 5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM suspend"): : PM freezer relies on having all tasks frozen by the time devices are : getting frozen so that no task will touch them while they are getting : frozen. But OOM killer is allowed to kill an already frozen task in order : to handle OOM situtation. In order to protect from late wake ups OOM : killer is disabled after all tasks are frozen. This, however, still keeps : a window open when a killed task didn't manage to die by the time : freeze_processes finishes. The original patch hasn't closed the race window completely because that would require a more complex solution, as can be seen from this patchset. The primary motivation was to close the race condition between OOM killer and PM freezer _completely_. As Tejun pointed out, even though the race condition is unlikely, weird bugs deep in the PM freezer would be that much harder to debug when the debugging options are reduced considerably.
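To make the rest of the series easier to follow: the helper pair introduced by this preparatory patch reduces the TIF_MEMDIE call sites to a simple mark/unmark pattern (a sketch; both fragments appear in the diff hunks below):

	/* kill path (lowmemorykiller.c hunk below): mark, then kill */
	mark_tsk_oom_victim(selected);
	send_sig(SIGKILL, selected, 0);

	/* exit path (kernel/exit.c hunk below): flag dropped along with the mm */
	mmput(mm);
	unmark_oom_victim();
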
I can only speculate what might happen when a task is still runnable unexpectedly. On a plus side and as a side effect the oom enable/disable has a better (full barrier) semantic without polluting hot paths. I have tested the series in KVM with 100M RAM: - many small tasks (20M anon mmap) which are triggering OOM continually - s2ram which resumes automatically is triggered in a loop echo processors > /sys/power/pm_test while true do echo mem > /sys/power/state sleep 1s done - simple module which allocates and frees 20M in 8K chunks. If it sees freezing(current) then it tries another round of allocation before calling try_to_freeze - debugging messages of PM stages and OOM killer enable/disable/fail added and unmark_oom_victim is delayed by 1s after it clears TIF_MEMDIE and before it wakes up waiters. - rebased on top of the current mmotm which means some necessary updates in mm/oom_kill.c. mark_tsk_oom_victim is now called under task_lock but I think this should be OK because __thaw_task shouldn't interfere with any locking down wake_up_process. Oleg? As expected there are no OOM killed tasks after oom is disabled and allocations requested by the kernel thread are failing after all the tasks are frozen and OOM disabled. I wasn't able to catch a race where oom_killer_disable would really have to wait but I kinda expected the race is really unlikely. [ 242.609330] Killed process 2992 (mem_eater) total-vm:24412kB, anon-rss:2164kB, file-rss:4kB [ 243.628071] Unmarking 2992 OOM victim. oom_victims: 1 [ 243.636072] (elapsed 2.837 seconds) done. [ 243.641985] Trying to disable OOM killer [ 243.643032] Waiting for concurent OOM victims [ 243.644342] OOM killer disabled [ 243.645447] Freezing remaining freezable tasks ... (elapsed 0.005 seconds) done. [ 243.652983] Suspending console(s) (use no_console_suspend to debug) [ 243.903299] kmem_eater: page allocation failure: order:1, mode:0x204010 [...] [ 243.992600] PM: suspend of devices complete after 336.667 msecs [ 243.993264] PM: late suspend of devices complete after 0.660 msecs [ 243.994713] PM: noirq suspend of devices complete after 1.446 msecs [ 243.994717] ACPI: Preparing to enter system sleep state S3 [ 243.994795] PM: Saving platform NVS memory [ 243.994796] Disabling non-boot CPUs ... The first 2 patches are simple cleanups for OOM. They should go in regardless the rest IMO. Patches 3 and 4 are trivial printk -> pr_info conversion and they should go in ditto. The main patch is the last one and I would appreciate acks from Tejun and Rafael. I think the OOM part should be OK (except for __thaw_task vs. task_lock where a look from Oleg would appreciated) but I am not so sure I haven't screwed anything in the freezer code. I have found several surprises there. This patch (of 5): This patch is just a preparatory and it doesn't introduce any functional change. Note: I am utterly unhappy about lowmemory killer abusing TIF_MEMDIE just to wait for the oom victim and to prevent from new killing. This is just a side effect of the flag. The primary meaning is to give the oom victim access to the memory reserves and that shouldn't be necessary here. Signed-off-by: Michal Hocko Cc: Tejun Heo Cc: David Rientjes Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Cong Wang Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/staging/android/lowmemorykiller.c | 7 ++++++- include/linux/oom.h | 4 ++++ kernel/exit.c | 2 +- mm/memcontrol.c | 2 +- mm/oom_kill.c | 23 ++++++++++++++++++++--- 5 files changed, 32 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/staging/android/lowmemorykiller.c b/drivers/staging/android/lowmemorykiller.c index b545d3d1da3e..feafa172b155 100644 --- a/drivers/staging/android/lowmemorykiller.c +++ b/drivers/staging/android/lowmemorykiller.c @@ -160,7 +160,12 @@ static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc) selected->pid, selected->comm, selected_oom_score_adj, selected_tasksize); lowmem_deathpending_timeout = jiffies + HZ; - set_tsk_thread_flag(selected, TIF_MEMDIE); + /* + * FIXME: lowmemorykiller shouldn't abuse global OOM killer + * infrastructure. There is no real reason why the selected + * task should have access to the memory reserves. + */ + mark_tsk_oom_victim(selected); send_sig(SIGKILL, selected, 0); rem += selected_tasksize; } diff --git a/include/linux/oom.h b/include/linux/oom.h index 76200984d1e2..b42b80f88c3a 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -47,6 +47,10 @@ static inline bool oom_task_origin(const struct task_struct *p) return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN); } +extern void mark_tsk_oom_victim(struct task_struct *tsk); + +extern void unmark_oom_victim(void); + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/kernel/exit.c b/kernel/exit.c index 6806c55475ee..02b3d1ab2ec0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -435,7 +435,7 @@ static void exit_mm(struct task_struct *tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); - clear_thread_flag(TIF_MEMDIE); + unmark_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 11c9e6a1dad5..fe4d258ef32b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1556,7 +1556,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * quickly exit and free its memory. */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { - set_thread_flag(TIF_MEMDIE); + mark_tsk_oom_victim(current); return; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 294493a7ae4b..80b34e285f96 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -416,6 +416,23 @@ void note_oom_kill(void) atomic_inc(&oom_kills); } +/** + * mark_tsk_oom_victim - marks the given taks as OOM victim. + * @tsk: task to mark + */ +void mark_tsk_oom_victim(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk, TIF_MEMDIE); +} + +/** + * unmark_oom_victim - unmarks the current task as OOM victim. 
+ */ +void unmark_oom_victim(void) +{ + clear_thread_flag(TIF_MEMDIE); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon @@ -440,7 +457,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, */ task_lock(p); if (p->mm && task_will_free_mem(p)) { - set_tsk_thread_flag(p, TIF_MEMDIE); + mark_tsk_oom_victim(p); task_unlock(p); put_task_struct(p); return; @@ -495,7 +512,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* mm cannot safely be dereferenced after task_unlock(victim) */ mm = victim->mm; - set_tsk_thread_flag(victim, TIF_MEMDIE); + mark_tsk_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), @@ -652,7 +669,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { - set_thread_flag(TIF_MEMDIE); + mark_tsk_oom_victim(current); return; } -- cgit v1.2.3 From c32b3cbe0d067a9cfae85aa70ba1e97ceba0ced7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 11 Feb 2015 15:26:24 -0800 Subject: oom, PM: make OOM detection in the freezer path raceless Commit 5695be142e20 ("OOM, PM: OOM killed task shouldn't escape PM suspend") left a race window in which the OOM killer manages to call note_oom_kill() after freeze_processes() checks the counter. The race window is quite small and really unlikely to be hit, and a partial solution was deemed sufficient at the time of submission. Tejun wasn't happy about this partial solution though and insisted on a full solution. That requires full mutual exclusion between the OOM killer and the freezer's task freezing, though. This is done by this patch, which introduces the oom_sem RW lock and turns oom_killer_disable() into a full OOM barrier. The oom_killer_disabled check is moved from the allocation path to the OOM level and we take oom_sem for reading for both the check and the whole OOM invocation. oom_killer_disable() takes oom_sem for writing so it waits for all currently running OOM killer invocations. Then it disables all further OOMs by setting oom_killer_disabled and checks for any oom victims. Victims are counted via mark_tsk_oom_victim and uncounted via unmark_oom_victim. The last victim wakes up all waiters enqueued by oom_killer_disable(). Therefore this function acts as the full OOM barrier. The page fault path is covered now as well although it was assumed to be safe before. As per Tejun, "We used to have freezing points deep in file system code which may be reacheable from page fault." so it would be better and more robust to not rely on freezing points here. The same applies to the memcg OOM killer. out_of_memory() tells the caller whether the OOM was allowed to trigger and the callers are supposed to handle the situation. The page allocation path simply fails the allocation, the same as before. The page fault path will retry the fault (more on that later) and the SysRq OOM trigger will simply complain to the log. Normally there wouldn't be any unfrozen user tasks after try_to_freeze_tasks so the function will not block. But if there was an OOM killer racing with try_to_freeze_tasks and the OOM victim didn't finish yet then we have to wait for it.
This should complete in a finite time, though, because - the victim cannot loop in the page fault handler (it would die on the way out from the exception) - it cannot loop in the page allocator because all the further allocation would fail and __GFP_NOFAIL allocations are not acceptable at this stage - it shouldn't be blocked on any locks held by frozen tasks (try_to_freeze expects lockless context) and kernel threads and work queues are not frozen yet Signed-off-by: Michal Hocko Suggested-by: Tejun Heo Cc: David Rientjes Cc: Johannes Weiner Cc: Oleg Nesterov Cc: Cong Wang Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/tty/sysrq.c | 5 +- include/linux/oom.h | 14 ++---- kernel/exit.c | 3 +- kernel/power/process.c | 50 ++++--------------- mm/memcontrol.c | 2 +- mm/oom_kill.c | 132 +++++++++++++++++++++++++++++++++++++++++-------- mm/page_alloc.c | 17 +------ 7 files changed, 132 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 0071469ecbf1..259a4d5a4e8f 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -355,8 +355,9 @@ static struct sysrq_key_op sysrq_term_op = { static void moom_callback(struct work_struct *ignored) { - out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL, - 0, NULL, true); + if (!out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), + GFP_KERNEL, 0, NULL, true)) + pr_info("OOM request ignored because killer is disabled\n"); } static DECLARE_WORK(moom_work, moom_callback); diff --git a/include/linux/oom.h b/include/linux/oom.h index b42b80f88c3a..d5771bed59c9 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -72,22 +72,14 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill); -extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, +extern bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *mask, bool force_kill); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); extern bool oom_killer_disabled; - -static inline void oom_killer_disable(void) -{ - oom_killer_disabled = true; -} - -static inline void oom_killer_enable(void) -{ - oom_killer_disabled = false; -} +extern bool oom_killer_disable(void); +extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); diff --git a/kernel/exit.c b/kernel/exit.c index 02b3d1ab2ec0..feff10bbb307 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -435,7 +435,8 @@ static void exit_mm(struct task_struct *tsk) task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); - unmark_oom_victim(); + if (test_thread_flag(TIF_MEMDIE)) + unmark_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/kernel/power/process.c b/kernel/power/process.c index 3ac45f192e9f..564f786df470 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,30 +108,6 @@ static int try_to_freeze_tasks(bool user_only) return todo ? 
-EBUSY : 0; } -static bool __check_frozen_processes(void) -{ - struct task_struct *g, *p; - - for_each_process_thread(g, p) - if (p != current && !freezer_should_skip(p) && !frozen(p)) - return false; - - return true; -} - -/* - * Returns true if all freezable tasks (except for current) are frozen already - */ -static bool check_frozen_processes(void) -{ - bool ret; - - read_lock(&tasklist_lock); - ret = __check_frozen_processes(); - read_unlock(&tasklist_lock); - return ret; -} - /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -142,7 +118,6 @@ static bool check_frozen_processes(void) int freeze_processes(void) { int error; - int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -157,29 +132,22 @@ int freeze_processes(void) pm_wakeup_clear(); pr_info("Freezing user space processes ... "); pm_freezing = true; - oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { __usermodehelper_set_disable_depth(UMH_DISABLED); - oom_killer_disable(); - - /* - * There might have been an OOM kill while we were - * freezing tasks and the killed task might be still - * on the way out so we have to double check for race. - */ - if (oom_kills_count() != oom_kills_saved && - !check_frozen_processes()) { - __usermodehelper_set_disable_depth(UMH_ENABLED); - pr_cont("OOM in progress."); - error = -EBUSY; - } else { - pr_cont("done."); - } + pr_cont("done."); } pr_cont("\n"); BUG_ON(in_atomic()); + /* + * Now that the whole userspace is frozen we need to disable + * the OOM killer to disallow any further interference with + * killable tasks. + */ + if (!error && !oom_killer_disable()) + error = -EBUSY; + if (error) thaw_processes(); return error; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe4d258ef32b..fbf64e6f64e4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1930,7 +1930,7 @@ bool mem_cgroup_oom_synchronize(bool handle) if (!memcg) return false; - if (!handle) + if (!handle || oom_killer_disabled) goto cleanup; owait.memcg = memcg; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3cbd76b8c13b..b8df76ee2be3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -398,30 +398,27 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, } /* - * Number of OOM killer invocations (including memcg OOM killer). - * Primarily used by PM freezer to check for potential races with - * OOM killed frozen task. + * Number of OOM victims in flight */ -static atomic_t oom_kills = ATOMIC_INIT(0); +static atomic_t oom_victims = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); -int oom_kills_count(void) -{ - return atomic_read(&oom_kills); -} - -void note_oom_kill(void) -{ - atomic_inc(&oom_kills); -} +bool oom_killer_disabled __read_mostly; +static DECLARE_RWSEM(oom_sem); /** * mark_tsk_oom_victim - marks the given task as an OOM victim. * @tsk: task to mark + * + * Has to be called with oom_sem taken for read and never after + * oom has been disabled already.
*/ void mark_tsk_oom_victim(struct task_struct *tsk) { - set_tsk_thread_flag(tsk, TIF_MEMDIE); - + WARN_ON(oom_killer_disabled); + /* OOM killer might race with memcg OOM */ + if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) + return; /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free @@ -429,14 +426,70 @@ void mark_tsk_oom_victim(struct task_struct *tsk) * that TIF_MEMDIE tasks should be ignored. */ __thaw_task(tsk); + atomic_inc(&oom_victims); } /** * unmark_oom_victim - unmarks the current task as an OOM victim. + * + * Wakes up all waiters in oom_killer_disable() */ void unmark_oom_victim(void) { - clear_thread_flag(TIF_MEMDIE); + if (!test_and_clear_thread_flag(TIF_MEMDIE)) + return; + + down_read(&oom_sem); + /* + * There is no need to signal the last oom_victim if there + * is nobody who cares. + */ + if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) + wake_up_all(&oom_victims_wait); + up_read(&oom_sem); +} + +/** + * oom_killer_disable - disable OOM killer + * + * Forces all page allocations to fail rather than trigger OOM killer. + * Will block and wait until all OOM victims are killed. + * + * The function cannot be called when there are runnable user tasks because + * the userspace would see unexpected allocation failures as a result. Any + * new usage of this function should be consulted with MM people. + * + * Returns true if successful and false if the OOM killer cannot be + * disabled. + */ +bool oom_killer_disable(void) +{ + /* + * Make sure to not race with an ongoing OOM killer + * and that the current is not the victim. + */ + down_write(&oom_sem); + if (test_thread_flag(TIF_MEMDIE)) { + up_write(&oom_sem); + return false; + } + + oom_killer_disabled = true; + up_write(&oom_sem); + + wait_event(oom_victims_wait, !atomic_read(&oom_victims)); + + return true; +} + +/** + * oom_killer_enable - enable OOM killer + */ +void oom_killer_enable(void) +{ + down_write(&oom_sem); + oom_killer_disabled = false; + up_write(&oom_sem); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -637,7 +690,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) } /** - * out_of_memory - kill the "best" process when we run out of memory + * __out_of_memory - kill the "best" process when we run out of memory * @zonelist: zonelist pointer * @gfp_mask: memory allocation flags * @order: amount of memory being requested as a power of 2 @@ -649,7 +702,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, +static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *nodemask, bool force_kill) { const nodemask_t *mpol_mask; @@ -718,6 +771,32 @@ out: schedule_timeout_killable(1); } +/** + * out_of_memory - tries to invoke OOM killer. + * @zonelist: zonelist pointer + * @gfp_mask: memory allocation flags + * @order: amount of memory being requested as a power of 2 + * @nodemask: nodemask passed to page allocator + * @force_kill: true if a task must be killed, even if others are exiting + * + * Invokes __out_of_memory if the OOM killer has not been disabled by + * oom_killer_disable() and returns true in that case. Otherwise returns false.
+ */ +bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, + int order, nodemask_t *nodemask, bool force_kill) +{ + bool ret = false; + + down_read(&oom_sem); + if (!oom_killer_disabled) { + __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill); + ret = true; + } + up_read(&oom_sem); + + return ret; +} + /* * The pagefault handler calls here because it is out of memory, so kill a * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a @@ -727,12 +806,25 @@ void pagefault_out_of_memory(void) { struct zonelist *zonelist; + down_read(&oom_sem); if (mem_cgroup_oom_synchronize(true)) - return; + goto unlock; zonelist = node_zonelist(first_memory_node, GFP_KERNEL); if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { - out_of_memory(NULL, 0, 0, NULL, false); + if (!oom_killer_disabled) + __out_of_memory(NULL, 0, 0, NULL, false); + else + /* + * There shouldn't be any user tasks runnable while the + * OOM killer is disabled so the current task has to + * be a racing OOM victim which oom_killer_disable() + * is waiting for. + */ + WARN_ON(test_thread_flag(TIF_MEMDIE)); + oom_zonelist_unlock(zonelist, GFP_KERNEL); } +unlock: + up_read(&oom_sem); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 641d5a9a8617..134e25525044 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -244,8 +244,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } -bool oom_killer_disabled __read_mostly; - #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -2317,9 +2315,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, *did_some_progress = 0; - if (oom_killer_disabled) - return NULL; - /* * Acquire the per-zone oom lock for each zone. If that * fails, somebody else is making progress for us. @@ -2330,14 +2325,6 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, return NULL; } - /* - * PM-freezer should be notified that there might be an OOM killer on - * its way to kill and wake somebody up. This is too early and we might - * end up not killing anything but false positives are acceptable. - * See freeze_processes. - */ - note_oom_kill(); - /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if @@ -2372,8 +2359,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false); - *did_some_progress = 1; + if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) + *did_some_progress = 1; out: oom_zonelist_unlock(ac->zonelist, gfp_mask); return page; -- cgit v1.2.3 From dc6c9a35b66b520cf67e05d8ca60ebecad3b0479 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:50 -0800 Subject: mm: account pmd page tables to the process Dave noticed that an unprivileged process can allocate a significant amount of memory -- >500 MiB on x86_64 -- and stay unnoticed by the oom-killer and memory cgroup. The trick is to allocate a lot of PMD page tables. The Linux kernel doesn't account PMD tables to the process, only PTE tables. The test case below uses a few tricks to allocate a lot of PMD page tables while keeping VmRSS and VmPTE low. oom_score for the process will be 0.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#define PUD_SIZE (1UL << 30)
#define PMD_SIZE (1UL << 21)

#define NR_PUD 130000

int main(void)
{
	char *addr = NULL;
	unsigned long i;

	prctl(PR_SET_THP_DISABLE);
	for (i = 0; i < NR_PUD; i++) {
		addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ,
				MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
		if (addr == MAP_FAILED) {
			perror("mmap");
			break;
		}
		*addr = 'x';
		munmap(addr, PMD_SIZE);
		addr = mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ,
				MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
		if (addr == MAP_FAILED)
			perror("re-mmap"), exit(1);
	}
	printf("PID %d consumed %lu KiB in PMD page tables\n",
			getpid(), i * 4096 >> 10);
	return pause();
}

The patch addresses the issue by accounting PMD tables to the process the same way we account PTE tables. The main places where PMD tables are accounted are __pmd_alloc() and free_pmd_range(). But there are a few corner cases: - HugeTLB can share PMD page tables. The patch handles this by accounting the table to all processes who share it. - x86 PAE pre-allocates a few PMD tables on fork. - Architectures with FIRST_USER_ADDRESS > 0. We need to adjust the sanity check on exit(2). Accounting only happens on configurations where the PMD page table level is present (PMD is not folded). As with nr_ptes we use a per-mm counter. The counter value is used to calculate the baseline for the badness score by the oom-killer. Signed-off-by: Kirill A. Shutemov Reported-by: Dave Hansen Cc: Hugh Dickins Reviewed-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: David Rientjes Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 12 ++++++------ arch/x86/mm/pgtable.c | 14 +++++++++----- fs/proc/task_mmu.c | 9 ++++++--- include/linux/mm.h | 24 ++++++++++++++++++++++++ include/linux/mm_types.h | 3 ++- kernel/fork.c | 3 +++ mm/debug.c | 3 ++- mm/hugetlb.c | 8 ++++++-- mm/memory.c | 15 +++++++++------ mm/mmap.c | 4 +++- mm/oom_kill.c | 9 +++++---- 11 files changed, 75 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 4415aa915681..e9c706e4627a 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -555,12 +555,12 @@ this is causing problems for your system/application. oom_dump_tasks -Enables a system-wide task dump (excluding kernel threads) to be -produced when the kernel performs an OOM-killing and includes such -information as pid, uid, tgid, vm size, rss, nr_ptes, swapents, -oom_score_adj score, and name. This is helpful to determine why the -OOM killer was invoked, to identify the rogue task that caused it, -and to determine why the OOM killer chose the task it did to kill. +Enables a system-wide task dump (excluding kernel threads) to be produced +when the kernel performs an OOM-killing and includes such information as +pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj +score, and name. This is helpful to determine why the OOM killer was +invoked, to identify the rogue task that caused it, and to determine why +the OOM killer chose the task it did to kill. If this is set to zero, this information is suppressed.
On very large systems with thousands of tasks it may not be feasible to dump diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6fb6927f9e76..7b22adaad4f1 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -190,7 +190,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) #endif /* CONFIG_X86_PAE */ -static void free_pmds(pmd_t *pmds[]) +static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; @@ -198,10 +198,11 @@ static void free_pmds(pmd_t *pmds[]) if (pmds[i]) { pgtable_pmd_page_dtor(virt_to_page(pmds[i])); free_page((unsigned long)pmds[i]); + mm_dec_nr_pmds(mm); } } -static int preallocate_pmds(pmd_t *pmds[]) +static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; bool failed = false; @@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[]) pmd = NULL; failed = true; } + if (pmd) + mm_inc_nr_pmds(mm); pmds[i] = pmd; } if (failed) { - free_pmds(pmds); + free_pmds(mm, pmds); return -ENOMEM; } @@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); } } } @@ -283,7 +287,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) mm->pgd = pgd; - if (preallocate_pmds(pmds) != 0) + if (preallocate_pmds(mm, pmds) != 0) goto out_free_pgd; if (paravirt_pgd_alloc(mm) != 0) @@ -304,7 +308,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) return pgd; out_free_pmds: - free_pmds(pmds); + free_pmds(mm, pmds); out_free_pgd: free_page((unsigned long)pgd); out: diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6396f88c6687..e6e0abeb5d12 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -21,7 +21,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap; + unsigned long data, text, lib, swap, ptes, pmds; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; /* @@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); + ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); + pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" + "VmPMD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE * sizeof(pte_t) * - atomic_long_read(&mm->nr_ptes)) >> 10, + ptes >> 10, + pmds >> 10, swap << (PAGE_SHIFT-10)); } diff --git a/include/linux/mm.h b/include/linux/mm.h index c6bf813a6b3d..644990b83cda 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1438,8 +1438,32 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, { return 0; } + +static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} +static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} + #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); + +static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_pmds); +} + +static inline void 
mm_inc_nr_pmds(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_pmds); +} + +static inline void mm_dec_nr_pmds(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_pmds); +} #endif int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 20ff2105b564..199a03aab8dc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -363,7 +363,8 @@ struct mm_struct { pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ - atomic_long_t nr_ptes; /* Page table pages */ + atomic_long_t nr_ptes; /* PTE page table pages */ + atomic_long_t nr_pmds; /* PMD page table pages */ int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some counters */ diff --git a/kernel/fork.c b/kernel/fork.c index b379d9abddc7..c99098c52641 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -555,6 +555,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); +#ifndef __PAGETABLE_PMD_FOLDED + atomic_long_set(&mm->nr_pmds, 0); +#endif mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; diff --git a/mm/debug.c b/mm/debug.c index d69cb5a7ba9a..3eb3ac2fcee7 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -173,7 +173,7 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %p\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" + "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -206,6 +206,7 @@ void dump_mm(const struct mm_struct *mm) mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), atomic_long_read((atomic_long_t *)&mm->nr_ptes), + mm_nr_pmds((struct mm_struct *)mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fd28d6ba5e5d..0a9ac6c26832 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3598,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (saddr) { spte = huge_pte_offset(svma->vm_mm, saddr); if (spte) { + mm_inc_nr_pmds(mm); get_page(virt_to_page(spte)); break; } @@ -3609,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); spin_lock(ptl); - if (pud_none(*pud)) + if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); - else + } else { put_page(virt_to_page(spte)); + mm_inc_nr_pmds(mm); + } spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); @@ -3644,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) pud_clear(pud); put_page(virt_to_page(ptep)); + mm_dec_nr_pmds(mm); *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; return 1; } diff --git a/mm/memory.c b/mm/memory.c index d63849b5188f..bbe6a73a899d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); + mm_dec_nr_pmds(tlb->mm); } 
static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -3322,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK - if (pud_present(*pud)) /* Another has populated it */ - pmd_free(mm, new); - else + if (!pud_present(*pud)) { + mm_inc_nr_pmds(mm); pud_populate(mm, pud, new); -#else - if (pgd_present(*pud)) /* Another has populated it */ + } else /* Another has populated it */ pmd_free(mm, new); - else +#else + if (!pgd_present(*pud)) { + mm_inc_nr_pmds(mm); pgd_populate(mm, pud, new); + } else /* Another has populated it */ + pmd_free(mm, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; diff --git a/mm/mmap.c b/mm/mmap.c index 14d84666e8ba..6a7d36d133fb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2853,7 +2853,9 @@ void exit_mmap(struct mm_struct *mm) vm_unacct_memory(nr_accounted); WARN_ON(atomic_long_read(&mm->nr_ptes) > - (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + round_up(FIRST_USER_ADDRESS, PMD_SIZE) >> PMD_SHIFT); + WARN_ON(mm_nr_pmds(mm) > + round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT); } /* Insert vm structure into process list sorted by address diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b8df76ee2be3..642f38cb175a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + - get_mm_counter(p->mm, MM_SWAPENTS); + points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + + atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); task_unlock(p); /* @@ -351,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), atomic_long_read(&task->mm->nr_ptes), + mm_nr_pmds(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); -- cgit v1.2.3 From 16c4a097a035c01809aa0c0abd458ca1fe4ff3d0 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Feb 2015 15:27:01 -0800 Subject: mm/compaction: enhance tracepoint output for compaction begin/end We now have a tracepoint for the begin event of compaction that prints the start position of both scanners, but the tracepoint for the end event doesn't print the finish position of the scanners. It would also be useful to know the finish positions, so this patch adds them; this will help in finding odd behavior or problems in compaction's internal logic. The mode is also added to both begin/end tracepoint output, since compaction behaves quite differently depending on the mode. And lastly, the status format is changed to a string rather than a status number for readability.
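With both events extended, a begin/end pair in the trace buffer would render along the following lines (the field values here are invented purely for illustration, and the usual task/CPU/timestamp prefix of ftrace output is omitted; the formats follow the TP_printk() definitions in the hunks below):

	mm_compaction_begin: zone_start=0x40000 migrate_pfn=0x41000 free_pfn=0x7f000 zone_end=0x80000, mode=sync
	mm_compaction_end: zone_start=0x40000 migrate_pfn=0x52000 free_pfn=0x6e000 zone_end=0x80000, mode=sync status=complete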
[akpm@linux-foundation.org: fix sparse warning] Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 1 + include/trace/events/compaction.h | 49 ++++++++++++++++++++++++++++----------- mm/compaction.c | 15 ++++++++++-- 3 files changed, 49 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index f2efda2e6ac6..db64cae06530 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -12,6 +12,7 @@ #define COMPACT_PARTIAL 3 /* The full zone was compacted */ #define COMPACT_COMPLETE 4 +/* When adding new state, please change compaction_status_string, too */ /* Used to signal whether compaction detected need_sched() or lock contention */ /* No contention detected */ diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 1337d9e01e3d..839f6fac921a 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -85,46 +85,67 @@ TRACE_EVENT(mm_compaction_migratepages, ); TRACE_EVENT(mm_compaction_begin, - TP_PROTO(unsigned long zone_start, unsigned long migrate_start, - unsigned long free_start, unsigned long zone_end), + TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, + unsigned long free_pfn, unsigned long zone_end, bool sync), - TP_ARGS(zone_start, migrate_start, free_start, zone_end), + TP_ARGS(zone_start, migrate_pfn, free_pfn, zone_end, sync), TP_STRUCT__entry( __field(unsigned long, zone_start) - __field(unsigned long, migrate_start) - __field(unsigned long, free_start) + __field(unsigned long, migrate_pfn) + __field(unsigned long, free_pfn) __field(unsigned long, zone_end) + __field(bool, sync) ), TP_fast_assign( __entry->zone_start = zone_start; - __entry->migrate_start = migrate_start; - __entry->free_start = free_start; + __entry->migrate_pfn = migrate_pfn; + __entry->free_pfn = free_pfn; __entry->zone_end = zone_end; + __entry->sync = sync; ), - TP_printk("zone_start=0x%lx migrate_start=0x%lx free_start=0x%lx zone_end=0x%lx", + TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s", __entry->zone_start, - __entry->migrate_start, - __entry->free_start, - __entry->zone_end) + __entry->migrate_pfn, + __entry->free_pfn, + __entry->zone_end, + __entry->sync ? "sync" : "async") ); TRACE_EVENT(mm_compaction_end, - TP_PROTO(int status), + TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, + unsigned long free_pfn, unsigned long zone_end, bool sync, + int status), - TP_ARGS(status), + TP_ARGS(zone_start, migrate_pfn, free_pfn, zone_end, sync, status), TP_STRUCT__entry( + __field(unsigned long, zone_start) + __field(unsigned long, migrate_pfn) + __field(unsigned long, free_pfn) + __field(unsigned long, zone_end) + __field(bool, sync) __field(int, status) ), TP_fast_assign( + __entry->zone_start = zone_start; + __entry->migrate_pfn = migrate_pfn; + __entry->free_pfn = free_pfn; + __entry->zone_end = zone_end; + __entry->sync = sync; __entry->status = status; ), - TP_printk("status=%d", __entry->status) + TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s status=%s", + __entry->zone_start, + __entry->migrate_pfn, + __entry->free_pfn, + __entry->zone_end, + __entry->sync ? "sync" : "async", + compaction_status_string[__entry->status]) ); #endif /* _TRACE_COMPACTION_H */ diff --git a/mm/compaction.c b/mm/compaction.c index 9c7e6909dd29..66f7c365e888 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -34,6 +34,15 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA +#ifdef CONFIG_TRACEPOINTS +static const char *const compaction_status_string[] = { + "deferred", + "skipped", + "continue", + "partial", + "complete", +}; +#endif #define CREATE_TRACE_POINTS #include <trace/events/compaction.h> @@ -1197,7 +1206,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } - trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); + trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, + cc->free_pfn, end_pfn, sync); migrate_prep_local(); @@ -1299,7 +1309,8 @@ out: zone->compact_cached_free_pfn = free_pfn; } - trace_mm_compaction_end(ret); + trace_mm_compaction_end(start_pfn, cc->migrate_pfn, + cc->free_pfn, end_pfn, sync, ret); return ret; } -- cgit v1.2.3 From 837d026d560c5ef26abeca0441713d82e4e82cad Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Feb 2015 15:27:06 -0800 Subject: mm/compaction: more trace to understand when/why compaction start/finish It has not been easy to analyze when and why compaction starts or finishes. With these new tracepoints, we can learn much more about the reasons compaction starts and finishes. I found the following bug with these tracepoints. http://www.spinics.net/lists/linux-mm/msg81582.html Signed-off-by: Joonsoo Kim Cc: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 3 ++ include/trace/events/compaction.h | 74 +++++++++++++++++++++++++++++++++++++++ mm/compaction.c | 38 +++++++++++++++++--- 3 files changed, 111 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index db64cae06530..501d7513aac1 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -12,6 +12,9 @@ #define COMPACT_PARTIAL 3 /* The full zone was compacted */ #define COMPACT_COMPLETE 4 +/* For more detailed tracepoint output */ +#define COMPACT_NO_SUITABLE_PAGE 5 +#define COMPACT_NOT_SUITABLE_ZONE 6 /* When adding new state, please change compaction_status_string, too */ /* Used to signal whether compaction detected need_sched() or lock contention */ diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 139020b55612..d46535801f63 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -164,6 +164,80 @@ TRACE_EVENT(mm_compaction_end, compaction_status_string[__entry->status]) ); +TRACE_EVENT(mm_compaction_try_to_compact_pages, + + TP_PROTO( + int order, + gfp_t gfp_mask, + enum migrate_mode mode), + + TP_ARGS(order, gfp_mask, mode), + + TP_STRUCT__entry( + __field(int, order) + __field(gfp_t, gfp_mask) + __field(enum migrate_mode, mode) + ), + + TP_fast_assign( + __entry->order = order; + __entry->gfp_mask = gfp_mask; + __entry->mode = mode; + ), + + TP_printk("order=%d gfp_mask=0x%x mode=%d", + __entry->order, + __entry->gfp_mask, + (int)__entry->mode) +); + +DECLARE_EVENT_CLASS(mm_compaction_suitable_template, + + TP_PROTO(struct zone *zone, + int order, + int ret), + + TP_ARGS(zone, order, ret), + + TP_STRUCT__entry( + __field(int, nid) + __field(char *, name) + 
__field(int, order) + __field(int, ret) + ), + + TP_fast_assign( + __entry->nid = zone_to_nid(zone); + __entry->name = (char *)zone->name; + __entry->order = order; + __entry->ret = ret; + ), + + TP_printk("node=%d zone=%-8s order=%d ret=%s", + __entry->nid, + __entry->name, + __entry->order, + compaction_status_string[__entry->ret]) +); + +DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished, + + TP_PROTO(struct zone *zone, + int order, + int ret), + + TP_ARGS(zone, order, ret) +); + +DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, + + TP_PROTO(struct zone *zone, + int order, + int ret), + + TP_ARGS(zone, order, ret) +); + #endif /* _TRACE_COMPACTION_H */ /* This part must be outside protection */ diff --git a/mm/compaction.c b/mm/compaction.c index b12df9fe10b4..b6ede459c1bb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -41,6 +41,8 @@ static const char *const compaction_status_string[] = { "continue", "partial", "complete", + "no_suitable_page", + "not_suitable_zone", }; #endif @@ -1049,7 +1051,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } -static int compact_finished(struct zone *zone, struct compact_control *cc, +static int __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { unsigned int order; @@ -1104,7 +1106,20 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_PARTIAL; } - return COMPACT_CONTINUE; + return COMPACT_NO_SUITABLE_PAGE; +} + +static int compact_finished(struct zone *zone, struct compact_control *cc, + const int migratetype) +{ + int ret; + + ret = __compact_finished(zone, cc, migratetype); + trace_mm_compaction_finished(zone, cc->order, ret); + if (ret == COMPACT_NO_SUITABLE_PAGE) + ret = COMPACT_CONTINUE; + + return ret; } /* @@ -1114,7 +1129,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_PARTIAL - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ -unsigned long compaction_suitable(struct zone *zone, int order, +static unsigned long __compaction_suitable(struct zone *zone, int order, int alloc_flags, int classzone_idx) { int fragindex; @@ -1158,11 +1173,24 @@ unsigned long compaction_suitable(struct zone *zone, int order, */ fragindex = fragmentation_index(zone, order); if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - return COMPACT_SKIPPED; + return COMPACT_NOT_SUITABLE_ZONE; return COMPACT_CONTINUE; } +unsigned long compaction_suitable(struct zone *zone, int order, + int alloc_flags, int classzone_idx) +{ + unsigned long ret; + + ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); + trace_mm_compaction_suitable(zone, order, ret); + if (ret == COMPACT_NOT_SUITABLE_ZONE) + ret = COMPACT_SKIPPED; + + return ret; +} + static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; @@ -1376,6 +1404,8 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, if (!order || !may_enter_fs || !may_perform_io) return COMPACT_SKIPPED; + trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); + /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { -- cgit v1.2.3 From 24e2716f63e613cf15d3beba3faa0711bcacc427 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Feb 2015 15:27:09 -0800 Subject: mm/compaction: add tracepoint to 
observe behaviour of compaction defer The compaction deferring logic is a heavy hammer that blocks the way to compaction. It doesn't consider overall system state, so it can falsely prevent a user from performing compaction. In other words, even if the system has enough memory to compact, compaction can be skipped due to the deferring logic. This patch adds a new tracepoint to understand the workings of the deferring logic. It will also help in checking compaction successes and failures. Signed-off-by: Joonsoo Kim Cc: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 65 +++-------------------------- include/trace/events/compaction.h | 56 ++++++++++++++++++++++++++++++ mm/compaction.c | 71 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 501d7513aac1..a014559e4a49 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -44,66 +44,11 @@ extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order, int alloc_flags, int classzone_idx); -/* Do not skip compaction more than 64 times */ -#define COMPACT_MAX_DEFER_SHIFT 6 - -/* - * Compaction is deferred when compaction fails to result in a page - * allocation success. 1 << compact_defer_limit compactions are skipped up - * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT - */ -static inline void defer_compaction(struct zone *zone, int order) -{ - zone->compact_considered = 0; - zone->compact_defer_shift++; - - if (order < zone->compact_order_failed) - zone->compact_order_failed = order; - - if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) - zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; -} - -/* Returns true if compaction should be skipped this time */ -static inline bool compaction_deferred(struct zone *zone, int order) -{ - unsigned long defer_limit = 1UL << zone->compact_defer_shift; - - if (order < zone->compact_order_failed) - return false; - - /* Avoid possible overflow */ - if (++zone->compact_considered > defer_limit) - zone->compact_considered = defer_limit; - - return zone->compact_considered < defer_limit; -} - -/* - * Update defer tracking counters after successful compaction of given order, - * which means an allocation either succeeded (alloc_success == true) or is - * expected to succeed.
- */ -static inline void compaction_defer_reset(struct zone *zone, int order, - bool alloc_success) -{ - if (alloc_success) { - zone->compact_considered = 0; - zone->compact_defer_shift = 0; - } - if (order >= zone->compact_order_failed) - zone->compact_order_failed = order + 1; -} - -/* Returns true if restarting compaction after many failures */ -static inline bool compaction_restarting(struct zone *zone, int order) -{ - if (order < zone->compact_order_failed) - return false; - - return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && - zone->compact_considered >= 1UL << zone->compact_defer_shift; -} +extern void defer_compaction(struct zone *zone, int order); +extern bool compaction_deferred(struct zone *zone, int order); +extern void compaction_defer_reset(struct zone *zone, int order, + bool alloc_success); +extern bool compaction_restarting(struct zone *zone, int order); #else static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index d46535801f63..9a6a3fe0fb51 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -238,6 +238,62 @@ DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, TP_ARGS(zone, order, ret) ); +#ifdef CONFIG_COMPACTION +DECLARE_EVENT_CLASS(mm_compaction_defer_template, + + TP_PROTO(struct zone *zone, int order), + + TP_ARGS(zone, order), + + TP_STRUCT__entry( + __field(int, nid) + __field(char *, name) + __field(int, order) + __field(unsigned int, considered) + __field(unsigned int, defer_shift) + __field(int, order_failed) + ), + + TP_fast_assign( + __entry->nid = zone_to_nid(zone); + __entry->name = (char *)zone->name; + __entry->order = order; + __entry->considered = zone->compact_considered; + __entry->defer_shift = zone->compact_defer_shift; + __entry->order_failed = zone->compact_order_failed; + ), + + TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu", + __entry->nid, + __entry->name, + __entry->order, + __entry->order_failed, + __entry->considered, + 1UL << __entry->defer_shift) +); + +DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_deferred, + + TP_PROTO(struct zone *zone, int order), + + TP_ARGS(zone, order) +); + +DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_compaction, + + TP_PROTO(struct zone *zone, int order), + + TP_ARGS(zone, order) +); + +DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset, + + TP_PROTO(struct zone *zone, int order), + + TP_ARGS(zone, order) +); +#endif + #endif /* _TRACE_COMPACTION_H */ /* This part must be outside protection */ diff --git a/mm/compaction.c b/mm/compaction.c index b6ede459c1bb..b68736c8a1ce 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -124,6 +124,77 @@ static struct page *pageblock_pfn_to_page(unsigned long start_pfn, } #ifdef CONFIG_COMPACTION + +/* Do not skip compaction more than 64 times */ +#define COMPACT_MAX_DEFER_SHIFT 6 + +/* + * Compaction is deferred when compaction fails to result in a page + * allocation success. 
1 << compact_defer_limit compactions are skipped up + * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT + */ +void defer_compaction(struct zone *zone, int order) +{ + zone->compact_considered = 0; + zone->compact_defer_shift++; + + if (order < zone->compact_order_failed) + zone->compact_order_failed = order; + + if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) + zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; + + trace_mm_compaction_defer_compaction(zone, order); +} + +/* Returns true if compaction should be skipped this time */ +bool compaction_deferred(struct zone *zone, int order) +{ + unsigned long defer_limit = 1UL << zone->compact_defer_shift; + + if (order < zone->compact_order_failed) + return false; + + /* Avoid possible overflow */ + if (++zone->compact_considered > defer_limit) + zone->compact_considered = defer_limit; + + if (zone->compact_considered >= defer_limit) + return false; + + trace_mm_compaction_deferred(zone, order); + + return true; +} + +/* + * Update defer tracking counters after successful compaction of given order, + * which means an allocation either succeeded (alloc_success == true) or is + * expected to succeed. + */ +void compaction_defer_reset(struct zone *zone, int order, + bool alloc_success) +{ + if (alloc_success) { + zone->compact_considered = 0; + zone->compact_defer_shift = 0; + } + if (order >= zone->compact_order_failed) + zone->compact_order_failed = order + 1; + + trace_mm_compaction_defer_reset(zone, order); +} + +/* Returns true if restarting compaction after many failures */ +bool compaction_restarting(struct zone *zone, int order) +{ + if (order < zone->compact_order_failed) + return false; + + return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && + zone->compact_considered >= 1UL << zone->compact_defer_shift; +} + /* Returns true if the pageblock should be scanned for pages to isolate. */ static inline bool isolation_suitable(struct compact_control *cc, struct page *page) -- cgit v1.2.3 From 077fcf116c8c2bd7ee9487b645aa3b50368db7e1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 11 Feb 2015 15:27:12 -0800 Subject: mm/thp: allocate transparent hugepages on local node This makes sure that we try to allocate hugepages from the local node if allowed by the mempolicy. If we can't, we fall back to small page allocation based on the mempolicy. This is based on the observation that allocating pages on the local node is more beneficial than allocating hugepages on a remote node. With this patch applied we may see transparent huge page allocation failures if the current node doesn't have enough free hugepages. Before this patch such failures resulted in us retrying the allocation on other nodes in the numa node mask. [akpm@linux-foundation.org: fix comment, add CONFIG_TRANSPARENT_HUGEPAGE dependency] Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A.
Shutemov Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 +++ mm/huge_memory.c | 24 +++++++----------- mm/mempolicy.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b840e3b2770d..60110e06419d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -335,11 +335,15 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, int node); +extern struct page *alloc_hugepage_vma(gfp_t gfp, struct vm_area_struct *vma, + unsigned long addr, int order); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) #define alloc_pages_vma(gfp_mask, order, vma, addr, node) \ alloc_pages(gfp_mask, order) +#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ + alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 889713180980..0531ea7dd7cf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -761,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; } -static inline struct page *alloc_hugepage_vma(int defrag, - struct vm_area_struct *vma, - unsigned long haddr, int nd, - gfp_t extra_gfp) -{ - return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), - HPAGE_PMD_ORDER, vma, haddr, nd); -} - /* Caller must hold page table lock. */ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, @@ -790,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { + gfp_t gfp; struct page *page; unsigned long haddr = address & HPAGE_PMD_MASK; @@ -824,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } return 0; } - page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); + gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1113,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && - !transparent_hugepage_debug_cow()) - new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr, numa_node_id(), 0); - else + !transparent_hugepage_debug_cow()) { + gfp_t gfp; + + gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); + } else new_page = NULL; if (unlikely(!new_page)) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0e0961b8c39c..8a32873fdbf7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2030,6 +2030,78 @@ retry_cpuset: return page; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/** + * alloc_hugepage_vma: Allocate a hugepage for a VMA + * @gfp: + * %GFP_USER user allocation. 
+ * %GFP_KERNEL kernel allocations, + * %GFP_HIGHMEM highmem/user allocations, + * %GFP_FS allocation should not call back into a file system. + * %GFP_ATOMIC don't sleep. + * + * @vma: Pointer to VMA or NULL if not available. + * @addr: Virtual Address of the allocation. Must be inside the VMA. + * @order: Order of the hugepage for gfp allocation. + * + * This function allocates a huge page from the kernel page pool and applies + * a NUMA policy associated with the VMA or the current process. + * For policy other than %MPOL_INTERLEAVE, we make sure we allocate hugepage + * only from the current node if the current node is part of the node mask. + * If we can't allocate a hugepage we fail the allocation and don't try to fallback + * to other nodes in the node mask. If the current node is not part of node mask + * or if the NUMA policy is MPOL_INTERLEAVE we use the allocator that can + * fallback to nodes in the policy node mask. + * + * When VMA is not NULL caller must hold down_read on the mmap_sem of the + * mm_struct of the VMA to prevent it from going away. Should be used for + * all allocations for pages that will be mapped into + * user space. Returns NULL when no page can be allocated. + * + * Should be called with vma->vm_mm->mmap_sem held. + * + */ +struct page *alloc_hugepage_vma(gfp_t gfp, struct vm_area_struct *vma, + unsigned long addr, int order) +{ + struct page *page; + nodemask_t *nmask; + struct mempolicy *pol; + int node = numa_node_id(); + unsigned int cpuset_mems_cookie; + +retry_cpuset: + pol = get_vma_policy(vma, addr); + cpuset_mems_cookie = read_mems_allowed_begin(); + /* + * For interleave policy, we don't worry about + * current node. Otherwise if current node is + * in nodemask, try to allocate hugepage from + * the current node. Don't fall back to other nodes + * for THP. + */ + if (unlikely(pol->mode == MPOL_INTERLEAVE)) + goto alloc_with_fallback; + nmask = policy_nodemask(gfp, pol); + if (!nmask || node_isset(node, *nmask)) { + mpol_cond_put(pol); + page = alloc_pages_exact_node(node, gfp, order); + if (unlikely(!page && + read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return page; + } +alloc_with_fallback: + mpol_cond_put(pol); + /* + * if current node is not part of node mask, try + * the allocation from any node, and we can do retry + * in that case. + */ + return alloc_pages_vma(gfp, order, vma, addr, node); +} +#endif + /** * alloc_pages_current - Allocate pages. * -- cgit v1.2.3 From be97a41b291e495d6cb767b3ee0f84ed05804892 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Feb 2015 15:27:15 -0800 Subject: mm/mempolicy.c: merge alloc_hugepage_vma to alloc_pages_vma The previous commit ("mm/thp: Allocate transparent hugepages on local node") introduced alloc_hugepage_vma() to mm/mempolicy.c to perform a special policy for THP allocations. The function has the same interface as alloc_pages_vma(), shares a lot of boilerplate code and a long comment. This patch merges the hugepage special case into alloc_pages_vma. The extra if condition should be a cheap enough price to pay. We also prevent a (however unlikely) race with parallel mems_allowed update, which could make hugepage allocation restart only within the fallback call to alloc_hugepage_vma() and not reconsider the special rule in alloc_hugepage_vma(). Also by making sure mpol_cond_put(pol) is always called before the actual allocation attempt, we can use a single exit path within the function.
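Condensed, the special case being folded in amounts to the following (a simplified sketch; the full mempolicy.c hunk appears further below):

	/* Inside the merged alloc_pages_vma(), before the normal policy paths: */
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage &&
	    pol->mode != MPOL_INTERLEAVE) {
		nodemask_t *nmask = policy_nodemask(gfp, pol);

		/* THP: try only the preferred node, never fall back */
		if (!nmask || node_isset(node, *nmask)) {
			mpol_cond_put(pol);
			page = alloc_pages_exact_node(node, gfp, order);
			goto out;	/* shared exit rechecks mems_allowed */
		}
	}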
Also update the comment for missing node parameter and obsolete reference to mm_sem. Signed-off-by: Vlastimil Babka Cc: Aneesh Kumar K.V Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: David Rientjes Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 12 +++--- mm/mempolicy.c | 118 +++++++++++++++------------------------------------- 2 files changed, 39 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 60110e06419d..51bd1e72a917 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -334,22 +334,22 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node); -extern struct page *alloc_hugepage_vma(gfp_t gfp, struct vm_area_struct *vma, - unsigned long addr, int order); + int node, bool hugepage); +#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ + alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_pages_vma(gfp_mask, order, vma, addr, node) \ +#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ alloc_pages(gfp_mask, order) #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, node) + alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8a32873fdbf7..acbbf4c821e2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1988,120 +1988,68 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. + * @node: Which node to prefer for allocation (modulo policy). + * @hugepage: for hugepages try only the preferred node if possible * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. * When VMA is not NULL caller must hold down_read on the mmap_sem of the * mm_struct of the VMA to prevent it from going away. Should be used for - * all allocations for pages that will be mapped into - * user space. Returns NULL when no page can be allocated. - * - * Should be called with the mm_sem of the vma hold. + * all allocations for pages that will be mapped into user space. Returns + * NULL when no page can be allocated. 
*/ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node) + unsigned long addr, int node, bool hugepage) { struct mempolicy *pol; struct page *page; unsigned int cpuset_mems_cookie; + struct zonelist *zl; + nodemask_t *nmask; retry_cpuset: pol = get_vma_policy(vma, addr); cpuset_mems_cookie = read_mems_allowed_begin(); - if (unlikely(pol->mode == MPOL_INTERLEAVE)) { + if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage && + pol->mode != MPOL_INTERLEAVE)) { + /* + * For hugepage allocation and non-interleave policy which + * allows the current node, we only try to allocate from the + * current node and don't fall back to other nodes, as the + * cost of remote accesses would likely offset THP benefits. + * + * If the policy is interleave, or does not allow the current + * node in its nodemask, we allocate the standard way. + */ + nmask = policy_nodemask(gfp, pol); + if (!nmask || node_isset(node, *nmask)) { + mpol_cond_put(pol); + page = alloc_pages_exact_node(node, gfp, order); + goto out; + } + } + + if (pol->mode == MPOL_INTERLEAVE) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; - - return page; + goto out; } - page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol, node), - policy_nodemask(gfp, pol)); + + nmask = policy_nodemask(gfp, pol); + zl = policy_zonelist(gfp, pol, node); mpol_cond_put(pol); + page = __alloc_pages_nodemask(gfp, order, zl, nmask); +out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -/** - * alloc_hugepage_vma: Allocate a hugepage for a VMA - * @gfp: - * %GFP_USER user allocation. - * %GFP_KERNEL kernel allocations, - * %GFP_HIGHMEM highmem/user allocations, - * %GFP_FS allocation should not call back into a file system. - * %GFP_ATOMIC don't sleep. - * - * @vma: Pointer to VMA or NULL if not available. - * @addr: Virtual Address of the allocation. Must be inside the VMA. - * @order: Order of the hugepage for gfp allocation. - * - * This function allocates a huge page from the kernel page pool and applies - * a NUMA policy associated with the VMA or the current process. - * For policy other than %MPOL_INTERLEAVE, we make sure we allocate hugepage - * only from the current node if the current node is part of the node mask. - * If we can't allocate a hugepage we fail the allocation and don't try to fallback - * to other nodes in the node mask. If the current node is not part of node mask - * or if the NUMA policy is MPOL_INTERLEAVE we use the allocator that can - * fallback to nodes in the policy node mask. - * - * When VMA is not NULL caller must hold down_read on the mmap_sem of the - * mm_struct of the VMA to prevent it from going away. Should be used for - * all allocations for pages that will be mapped into - * user space. Returns NULL when no page can be allocated. - * - * Should be called with vma->vm_mm->mmap_sem held.
- * - */ -struct page *alloc_hugepage_vma(gfp_t gfp, struct vm_area_struct *vma, - unsigned long addr, int order) -{ - struct page *page; - nodemask_t *nmask; - struct mempolicy *pol; - int node = numa_node_id(); - unsigned int cpuset_mems_cookie; - -retry_cpuset: - pol = get_vma_policy(vma, addr); - cpuset_mems_cookie = read_mems_allowed_begin(); - /* - * For interleave policy, we don't worry about - * current node. Otherwise if current node is - * in nodemask, try to allocate hugepage from - * the current node. Don't fall back to other nodes - * for THP. - */ - if (unlikely(pol->mode == MPOL_INTERLEAVE)) - goto alloc_with_fallback; - nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(node, *nmask)) { - mpol_cond_put(pol); - page = alloc_pages_exact_node(node, gfp, order); - if (unlikely(!page && - read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; - return page; - } -alloc_with_fallback: - mpol_cond_put(pol); - /* - * if current node is not part of node mask, try - * the allocation from any node, and we can do retry - * in that case. - */ - return alloc_pages_vma(gfp, order, vma, addr, node); -} -#endif - /** * alloc_pages_current - Allocate pages. * -- cgit v1.2.3 From f0818f472d8d527a96ec9cc2c3a56223497f9dd3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 11 Feb 2015 15:27:17 -0800 Subject: mm: gup: add get_user_pages_locked and get_user_pages_unlocked FAULT_FLAG_ALLOW_RETRY allows the page fault to drop the mmap_sem for reading to reduce the mmap_sem contention (for writing), for example while waiting for I/O completion. The problem is that right now practically no get_user_pages call uses FAULT_FLAG_ALLOW_RETRY, so we're not leveraging that nifty feature. Andres fixed it for the KVM page fault. However get_user_pages_fast remains uncovered, and 99% of other get_user_pages callers aren't using it either (the only exception being FOLL_NOWAIT in KVM, which is really nonblocking and in fact doesn't even release the mmap_sem). So this patch set extends the optimization Andres did in the KVM page fault to the whole kernel. It makes the most important places (including gup_fast) use FAULT_FLAG_ALLOW_RETRY to reduce the mmap_sem hold times during I/O. The few places that remain uncovered are drivers like v4l and other exceptions that tend to work on their own memory rather than on random user memory (unlike, for example, O_DIRECT, which uses gup_fast and is fully covered by this patch). A follow-up patch should probably also add a printk_once warning to get_user_pages, which should become obsolete and be phased out eventually. The "vmas" parameter of get_user_pages makes it fundamentally incompatible with FAULT_FLAG_ALLOW_RETRY (the vmas array becomes meaningless the moment the mmap_sem is released; the sketch below makes this concrete). While this is just an optimization, it becomes an absolute requirement for the userfaultfd feature http://lwn.net/Articles/615086/ . Userfaultfd allows blocking the page fault, and in order to do so I need to drop the mmap_sem first. So this patch also ensures that for all memory where userfaultfd could be registered by KVM, the very first fault (no matter whether it is a regular page fault or a get_user_pages) always has FAULT_FLAG_ALLOW_RETRY set. Then the userfaultfd blocks, and it is woken only when the pagetable is already mapped. The second fault attempt after the wakeup doesn't need FAULT_FLAG_ALLOW_RETRY, so it's ok to retry without it. 
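To see why, here is a minimal sketch of the vmas problem (illustrative only: the surrounding caller and the tsk/mm/start variables it supplies are hypothetical; the get_user_pages() signature is the one this series builds on):

	struct page *pages[1];
	struct vm_area_struct *vmas[1];

	down_read(&mm->mmap_sem);
	get_user_pages(tsk, mm, start, 1, 1 /* write */, 0 /* force */,
		       pages, vmas);
	/*
	 * If the fault path were allowed to drop and re-take mmap_sem
	 * (FAULT_FLAG_ALLOW_RETRY), vmas[0] could by now point to a vma
	 * that has been unmapped or replaced, so the retry behaviour can
	 * only be offered to callers that pass vmas == NULL.
	 */
	up_read(&mm->mmap_sem);
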
This patch (of 5): We can leverage the VM_FAULT_RETRY functionality in the page fault paths better by using either get_user_pages_locked or get_user_pages_unlocked. The former allows conversion of get_user_pages invocations that will have to pass a "&locked" parameter to know if the mmap_sem was dropped during the call. Example from: down_read(&mm->mmap_sem); do_something() get_user_pages(tsk, mm, ..., pages, NULL); up_read(&mm->mmap_sem); to: int locked = 1; down_read(&mm->mmap_sem); do_something() get_user_pages_locked(tsk, mm, ..., pages, &locked); if (locked) up_read(&mm->mmap_sem); The latter is suitable only as a drop-in replacement of the form: down_read(&mm->mmap_sem); get_user_pages(tsk, mm, ..., pages, NULL); up_read(&mm->mmap_sem); into: get_user_pages_unlocked(tsk, mm, ..., pages); Where tsk, mm, the intermediate "..." parameters and "pages" can be any value as before. Just the last parameter of get_user_pages (vmas) must be NULL for get_user_pages_locked|unlocked to be usable (the latter original form wouldn't have been safe anyway if vmas wasn't NULL; for the former we just make it explicit by dropping the parameter). If vmas is not NULL these two methods cannot be used. Signed-off-by: Andrea Arcangeli Reviewed-by: Andres Lagar-Cavilla Reviewed-by: Peter Feiner Reviewed-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++ mm/gup.c | 177 +++++++++++++++++++++++++++++++++++++++++++++++++---- mm/nommu.c | 23 +++++++ 3 files changed, 196 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 644990b83cda..fc499e675475 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1261,6 +1261,13 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas); +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked); +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages); int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); struct kvec; diff --git a/mm/gup.c b/mm/gup.c index 1a8ab05918e0..71a37738a326 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -575,6 +575,165 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return 0; } +static __always_inline long __get_user_pages_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + int write, int force, + struct page **pages, + struct vm_area_struct **vmas, + int *locked, bool notify_drop) +{ + int flags = FOLL_TOUCH; + long ret, pages_done; + bool lock_dropped; + + if (locked) { + /* if VM_FAULT_RETRY can be returned, vmas become invalid */ + BUG_ON(vmas); + /* check caller initialized locked */ + BUG_ON(*locked != 1); + } + + if (pages) + flags |= FOLL_GET; + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + + pages_done = 0; + lock_dropped = false; + for (;;) { + ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages, + vmas, locked); + if (!locked) + /* VM_FAULT_RETRY couldn't trigger, bypass */ + return ret; + + /* VM_FAULT_RETRY cannot return errors */ + if (!*locked) { + BUG_ON(ret < 0); + BUG_ON(ret >= nr_pages); + } + + if 
(!pages) + /* If it's a prefault don't insist harder */ + return ret; + + if (ret > 0) { + nr_pages -= ret; + pages_done += ret; + if (!nr_pages) + break; + } + if (*locked) { + /* VM_FAULT_RETRY didn't trigger */ + if (!pages_done) + pages_done = ret; + break; + } + /* VM_FAULT_RETRY triggered, so seek to the faulting offset */ + pages += ret; + start += ret << PAGE_SHIFT; + + /* + * Repeat on the address that fired VM_FAULT_RETRY + * without FAULT_FLAG_ALLOW_RETRY but with + * FAULT_FLAG_TRIED. + */ + *locked = 1; + lock_dropped = true; + down_read(&mm->mmap_sem); + ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, + pages, NULL, NULL); + if (ret != 1) { + BUG_ON(ret > 1); + if (!pages_done) + pages_done = ret; + break; + } + nr_pages--; + pages_done++; + if (!nr_pages) + break; + pages++; + start += PAGE_SIZE; + } + if (notify_drop && lock_dropped && *locked) { + /* + * We must let the caller know we temporarily dropped the lock + * and so the critical section protected by it was lost. + */ + up_read(&mm->mmap_sem); + *locked = 0; + } + return pages_done; +} + +/* + * We can leverage the VM_FAULT_RETRY functionality in the page fault + * paths better by using either get_user_pages_locked() or + * get_user_pages_unlocked(). + * + * get_user_pages_locked() is suitable to replace the form: + * + * down_read(&mm->mmap_sem); + * do_something() + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); + * + * to: + * + * int locked = 1; + * down_read(&mm->mmap_sem); + * do_something() + * get_user_pages_locked(tsk, mm, ..., pages, &locked); + * if (locked) + * up_read(&mm->mmap_sem); + */ +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, NULL, locked, true); +} +EXPORT_SYMBOL(get_user_pages_locked); + +/* + * get_user_pages_unlocked() is suitable to replace the form: + * + * down_read(&mm->mmap_sem); + * get_user_pages(tsk, mm, ..., pages, NULL); + * up_read(&mm->mmap_sem); + * + * with: + * + * get_user_pages_unlocked(tsk, mm, ..., pages); + * + * It is functionally equivalent to get_user_pages_fast so + * get_user_pages_fast should be used instead, if the two parameters + * "tsk" and "mm" are respectively equal to current and current->mm, + * or if "force" shall be set to 1 (get_user_pages_fast misses the + * "force" parameter). + */ +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + long ret; + int locked = 1; + down_read(&mm->mmap_sem); + ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, NULL, &locked, false); + if (locked) + up_read(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(get_user_pages_unlocked); + /* * get_user_pages() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or @@ -624,22 +783,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, * use the correct cache flushing APIs. * * See also get_user_pages_fast, for performance critical applications. + * + * get_user_pages should be phased out in favor of + * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing + * should use get_user_pages because it cannot pass + * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. 
*/ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { - int flags = FOLL_TOUCH; - - if (pages) - flags |= FOLL_GET; - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, - NULL); + return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, vmas, NULL, false); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/nommu.c b/mm/nommu.c index 541bed64e348..bfb690b0f986 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -214,6 +214,29 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages); +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) +{ + return get_user_pages(tsk, mm, start, nr_pages, write, force, + pages, NULL); +} +EXPORT_SYMBOL(get_user_pages_locked); + +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + long ret; + down_read(&mm->mmap_sem); + ret = get_user_pages(tsk, mm, start, nr_pages, write, force, + pages, NULL); + up_read(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(get_user_pages_unlocked); + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping -- cgit v1.2.3 From 0fd71a56f41d4ffabeda1dae9ff5ed4f34d4e935 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 11 Feb 2015 15:27:20 -0800 Subject: mm: gup: add __get_user_pages_unlocked to customize gup_flags Some callers (like KVM) may want to set the gup_flags like FOLL_HWPOISON to get a proper -EHWPOISON retval instead of -EFAULT to take a more appropriate action if get_user_pages runs into a memory failure. Signed-off-by: Andrea Arcangeli Reviewed-by: Kirill A. 
Shutemov Cc: Andres Lagar-Cavilla Cc: Peter Feiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ mm/gup.c | 44 ++++++++++++++++++++++++++++++++------------ mm/nommu.c | 16 +++++++++++++--- 3 files changed, 49 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index fc499e675475..3696b3bd1d7e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1265,6 +1265,10 @@ long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, int *locked); +long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + unsigned int gup_flags); long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages); diff --git a/mm/gup.c b/mm/gup.c index 71a37738a326..dad5875fb766 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -582,9 +582,9 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, int write, int force, struct page **pages, struct vm_area_struct **vmas, - int *locked, bool notify_drop) + int *locked, bool notify_drop, + unsigned int flags) { - int flags = FOLL_TOUCH; long ret, pages_done; bool lock_dropped; @@ -698,10 +698,36 @@ long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, int *locked) { return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, NULL, locked, true); + pages, NULL, locked, true, FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_locked); +/* + * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to + * pass additional gup_flags as last parameter (like FOLL_HWPOISON). + * + * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the + * caller if required (just like with __get_user_pages). "FOLL_GET", + * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed + * according to the parameters "pages", "write", "force" + * respectively. 
+ */ +__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + unsigned int gup_flags) +{ + long ret; + int locked = 1; + down_read(&mm->mmap_sem); + ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, NULL, &locked, false, gup_flags); + if (locked) + up_read(&mm->mmap_sem); + return ret; +} +EXPORT_SYMBOL(__get_user_pages_unlocked); + /* * get_user_pages_unlocked() is suitable to replace the form: * @@ -723,14 +749,8 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { - long ret; - int locked = 1; - down_read(&mm->mmap_sem); - ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, NULL, &locked, false); - if (locked) - up_read(&mm->mmap_sem); - return ret; + return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, + force, pages, FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages_unlocked); @@ -794,7 +814,7 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int force, struct page **pages, struct vm_area_struct **vmas) { return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, vmas, NULL, false); + pages, vmas, NULL, false, FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/nommu.c b/mm/nommu.c index bfb690b0f986..4d1b8a199867 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -224,9 +224,10 @@ long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages_locked); -long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages) +long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + unsigned int gup_flags) { long ret; down_read(&mm->mmap_sem); @@ -235,6 +236,15 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, up_read(&mm->mmap_sem); return ret; } +EXPORT_SYMBOL(__get_user_pages_unlocked); + +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, + force, pages, 0); +} EXPORT_SYMBOL(get_user_pages_unlocked); /** -- cgit v1.2.3 From 0664e57ff0c68cbca012a45a38288fa277eb6795 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 11 Feb 2015 15:27:28 -0800 Subject: mm: gup: kvm use get_user_pages_unlocked Use the more generic get_user_pages_unlocked which has the additional benefit of passing FAULT_FLAG_ALLOW_RETRY at the very first page fault (which allows the first page fault in an unmapped area to be always able to block indefinitely by being allowed to release the mmap_sem). Signed-off-by: Andrea Arcangeli Reviewed-by: Andres Lagar-Cavilla Reviewed-by: Kirill A. 
Shutemov Cc: Peter Feiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kvm_host.h | 11 ----------- virt/kvm/async_pf.c | 2 +- virt/kvm/kvm_main.c | 50 ++++-------------------------------------------- 3 files changed, 5 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 26f106022c88..d189ee098aa2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -200,17 +200,6 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif -/* - * Carry out a gup that requires IO. Allow the mm to relinquish the mmap - * semaphore if the filemap/swap has to wait on a page lock. pagep == NULL - * controls whether we retry the gup one more time to completion in that case. - * Typically this is called after a FAULT_FLAG_RETRY_NOWAIT in the main tdp - * handler. - */ -int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, bool write_fault, - struct page **pagep); - enum { OUTSIDE_GUEST_MODE, IN_GUEST_MODE, diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 5ff7f7f2689a..44660aee335f 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -80,7 +80,7 @@ static void async_pf_execute(struct work_struct *work) might_sleep(); - kvm_get_user_page_io(NULL, mm, addr, 1, NULL); + get_user_pages_unlocked(NULL, mm, addr, 1, 1, 0, NULL); kvm_async_page_present_sync(vcpu, apf); spin_lock(&vcpu->async_pf.lock); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1cc6e2e19982..458b9b14b15c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1128,43 +1128,6 @@ static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); } -int kvm_get_user_page_io(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, bool write_fault, - struct page **pagep) -{ - int npages; - int locked = 1; - int flags = FOLL_TOUCH | FOLL_HWPOISON | - (pagep ? FOLL_GET : 0) | - (write_fault ? FOLL_WRITE : 0); - - /* - * If retrying the fault, we get here *not* having allowed the filemap - * to wait on the page lock. We should now allow waiting on the IO with - * the mmap semaphore released. - */ - down_read(&mm->mmap_sem); - npages = __get_user_pages(tsk, mm, addr, 1, flags, pagep, NULL, - &locked); - if (!locked) { - VM_BUG_ON(npages); - - if (!pagep) - return 0; - - /* - * The previous call has now waited on the IO. Now we can - * retry and complete. Pass TRIED to ensure we do not re - * schedule async IO (see e.g. filemap_fault). - */ - down_read(&mm->mmap_sem); - npages = __get_user_pages(tsk, mm, addr, 1, flags | FOLL_TRIED, - pagep, NULL, NULL); - } - up_read(&mm->mmap_sem); - return npages; -} - static inline int check_user_page_hwpoison(unsigned long addr) { int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; @@ -1227,15 +1190,10 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, npages = get_user_page_nowait(current, current->mm, addr, write_fault, page); up_read(¤t->mm->mmap_sem); - } else { - /* - * By now we have tried gup_fast, and possibly async_pf, and we - * are certainly not atomic. Time to retry the gup, allowing - * mmap semaphore to be relinquished in the case of IO. 
- */ - npages = kvm_get_user_page_io(current, current->mm, addr, - write_fault, page); - } + } else + npages = __get_user_pages_unlocked(current, current->mm, addr, 1, + write_fault, 0, page, + FOLL_TOUCH|FOLL_HWPOISON); if (npages != 1) return npages; -- cgit v1.2.3 From 0b1fbfe50006c41014cc25660c0e735d21c34939 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:27:34 -0800 Subject: mm/pagewalk: remove pgd_entry() and pud_entry() Currently no user of the page table walker sets ->pgd_entry() or ->pud_entry(), so checking their existence in each loop is just wasting CPU cycles. So let's remove them to reduce overhead. Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: Kirill A. Shutemov Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ------ mm/pagewalk.c | 9 ++------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3696b3bd1d7e..f6106d3f3dab 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1164,8 +1164,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, /** * mm_walk - callbacks for walk_page_range - * @pgd_entry: if set, called for each non-empty PGD (top-level) entry - * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to @@ -1179,10 +1177,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, * (see walk_page_range for more details) */ struct mm_walk { - int (*pgd_entry)(pgd_t *pgd, unsigned long addr, - unsigned long next, struct mm_walk *walk); - int (*pud_entry)(pud_t *pud, unsigned long addr, - unsigned long next, struct mm_walk *walk); int (*pmd_entry)(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk); int (*pte_entry)(pte_t *pte, unsigned long addr, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b264bda46e1b..b793ef149da2 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, break; continue; } - if (walk->pud_entry) - err = walk->pud_entry(pud, addr, next, walk); - if (!err && (walk->pmd_entry || walk->pte_entry)) + if (walk->pmd_entry || walk->pte_entry) err = walk_pmd_range(pud, addr, next, walk); if (err) break; @@ -237,10 +235,7 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd++; continue; } - if (walk->pgd_entry) - err = walk->pgd_entry(pgd, addr, next, walk); - if (!err && - (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) + if (walk->pmd_entry || walk->pte_entry) err = walk_pud_range(pgd, addr, next, walk); if (err) break; -- cgit v1.2.3 From fafaa4264eba49fd10695c193a82760558d093f4 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:27:37 -0800 Subject: pagewalk: improve vma handling The current implementation of the page table walker has a fundamental problem in vma handling, which started when we tried to handle vma(VM_HUGETLB). Because it's done in the pgd loop, considering vma boundaries makes the code complicated and bug-prone. From the user's viewpoint, a user typically checks some vma-related condition to determine whether it really wants to walk over that vma (see the sketch below). 
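For illustration, a minimal sketch of such a per-vma filter, written against the ->test_walk() callback this patch introduces (the callback name and the policy it implements are hypothetical; the return-value convention is the one used by walk_page_test() below):

	/* Hypothetical filter: walk private mappings, skip shared ones. */
	static int private_only_test_walk(unsigned long start, unsigned long end,
					  struct mm_walk *walk)
	{
		if (walk->vma->vm_flags & VM_SHARED)
			return 1;	/* skip this vma, continue the walk */
		return 0;		/* do walk over this vma */
	}

A caller would set .test_walk = private_only_test_walk in its struct mm_walk alongside .pmd_entry or .pte_entry.
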
In order to solve these, this patch moves the vma check outside the pgd loop and introduces a new callback, ->test_walk(). Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +++- mm/pagewalk.c | 206 ++++++++++++++++++++++++++++++----------------------- 2 files changed, 129 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f6106d3f3dab..3891a368e5e0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1171,10 +1171,16 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, * @pte_entry: if set, called for each non-empty PTE (4th-level) entry * @pte_hole: if set, called for each hole at all levels * @hugetlb_entry: if set, called for each hugetlb entry - * *Caution*: The caller must hold mmap_sem() if @hugetlb_entry - * is used. + * @test_walk: caller specific callback function to determine whether + * we walk over the current vma or not. Returning 0 means + * "do page table walk over the current vma", a positive + * value means "skip the current vma", and a negative value + * means "abort the current page table walk right now with + * that error." + * @mm: mm_struct representing the target process of page table walk + * @vma: vma currently walked (NULL if walking outside vmas) + * @private: private data for callbacks' usage * - * (see walk_page_range for more details) + * (see the comment on walk_page_range() for more details) */ struct mm_walk { int (*pmd_entry)(pmd_t *pmd, unsigned long addr, @@ -1186,7 +1192,10 @@ struct mm_walk { int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long next, struct mm_walk *walk); + int (*test_walk)(unsigned long addr, unsigned long next, + struct mm_walk *walk); struct mm_struct *mm; + struct vm_area_struct *vma; void *private; }; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index b793ef149da2..d9cc3caae802 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -59,7 +59,7 @@ again: continue; split_huge_page_pmd_mm(walk->mm, addr, pmd); - if (pmd_none_or_trans_huge_or_clear_bad(pmd)) + if (pmd_trans_unstable(pmd)) goto again; err = walk_pte_range(pmd, addr, next, walk); if (err) @@ -95,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, return err; } +static int walk_pgd_range(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + pgd_t *pgd; + unsigned long next; + int err = 0; + + pgd = pgd_offset(walk->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) { + if (walk->pte_hole) + err = walk->pte_hole(addr, next, walk); + if (err) + break; + continue; + } + if (walk->pmd_entry || walk->pte_entry) + err = walk_pud_range(pgd, addr, next, walk); + if (err) + break; + } while (pgd++, addr = next, addr != end); + + return err; +} + #ifdef CONFIG_HUGETLB_PAGE static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, unsigned long end) @@ -103,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, return boundary < end ? 
boundary : end; } -static int walk_hugetlb_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, +static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { + struct vm_area_struct *vma = walk->vma; struct hstate *h = hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); @@ -119,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, if (pte && walk->hugetlb_entry) err = walk->hugetlb_entry(pte, hmask, addr, next, walk); if (err) - return err; + break; } while (addr = next, addr != end); - return 0; + return err; } #else /* CONFIG_HUGETLB_PAGE */ -static int walk_hugetlb_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long end, +static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { return 0; @@ -135,112 +160,115 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, #endif /* CONFIG_HUGETLB_PAGE */ +/* + * Decide whether we really walk over the current vma on [@start, @end) + * or skip it via the returned value. Return 0 if we do walk over the + * current vma, and return 1 if we skip the vma. Negative values means + * error, where we abort the current walk. + * + * Default check (only VM_PFNMAP check for now) is used when the caller + * doesn't define test_walk() callback. + */ +static int walk_page_test(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + if (walk->test_walk) + return walk->test_walk(start, end, walk); + + /* + * Do not walk over vma(VM_PFNMAP), because we have no valid struct + * page backing a VM_PFNMAP range. See also commit a9ff785e4437. + */ + if (vma->vm_flags & VM_PFNMAP) + return 1; + return 0; +} + +static int __walk_page_range(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + int err = 0; + struct vm_area_struct *vma = walk->vma; + + if (vma && is_vm_hugetlb_page(vma)) { + if (walk->hugetlb_entry) + err = walk_hugetlb_range(start, end, walk); + } else + err = walk_pgd_range(start, end, walk); + + return err; +} /** - * walk_page_range - walk a memory map's page tables with a callback - * @addr: starting address - * @end: ending address - * @walk: set of callbacks to invoke for each level of the tree - * - * Recursively walk the page table for the memory area in a VMA, - * calling supplied callbacks. Callbacks are called in-order (first - * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, - * etc.). If lower-level callbacks are omitted, walking depth is reduced. + * walk_page_range - walk page table with caller specific callbacks * - * Each callback receives an entry pointer and the start and end of the - * associated range, and a copy of the original mm_walk for access to - * the ->private or ->mm fields. + * Recursively walk the page table tree of the process represented by @walk->mm + * within the virtual address range [@start, @end). During walking, we can do + * some caller-specific works for each entry, by setting up pmd_entry(), + * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these + * callbacks, the associated entries/pages are just ignored. + * The return values of these callbacks are commonly defined like below: + * - 0 : succeeded to handle the current entry, and if you don't reach the + * end address yet, continue to walk. + * - >0 : succeeded to handle the current entry, and return to the caller + * with caller specific value. 
+ * - <0 : failed to handle the current entry, and return to the caller + * with error code. * - * Usually no locks are taken, but splitting transparent huge page may - * take page table lock. And the bottom level iterator will map PTE - * directories from highmem if necessary. + * Before starting to walk page table, some callers want to check whether + * they really want to walk over the current vma, typically by checking + * its vm_flags. walk_page_test() and @walk->test_walk() are used for this + * purpose. * - * If any callback returns a non-zero value, the walk is aborted and - * the return value is propagated back to the caller. Otherwise 0 is returned. + * struct mm_walk keeps current values of some common data like vma and pmd, + * which are useful for the access from callbacks. If you want to pass some + * caller-specific data to callbacks, @walk->private should be helpful. * - * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry - * is !NULL. + * Locking: + * Callers of walk_page_range() and walk_page_vma() should hold + * @walk->mm->mmap_sem, because these function traverse vma list and/or + * access to vma's data. */ -int walk_page_range(unsigned long addr, unsigned long end, +int walk_page_range(unsigned long start, unsigned long end, struct mm_walk *walk) { - pgd_t *pgd; - unsigned long next; int err = 0; + unsigned long next; + struct vm_area_struct *vma; - if (addr >= end) - return err; + if (start >= end) + return -EINVAL; if (!walk->mm) return -EINVAL; VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); - pgd = pgd_offset(walk->mm, addr); + vma = find_vma(walk->mm, start); do { - struct vm_area_struct *vma = NULL; + if (!vma) { /* after the last vma */ + walk->vma = NULL; + next = end; + } else if (start < vma->vm_start) { /* outside vma */ + walk->vma = NULL; + next = min(end, vma->vm_start); + } else { /* inside vma */ + walk->vma = vma; + next = min(end, vma->vm_end); + vma = vma->vm_next; - next = pgd_addr_end(addr, end); - - /* - * This function was not intended to be vma based. - * But there are vma special cases to be handled: - * - hugetlb vma's - * - VM_PFNMAP vma's - */ - vma = find_vma(walk->mm, addr); - if (vma) { - /* - * There are no page structures backing a VM_PFNMAP - * range, so do not allow split_huge_page_pmd(). - */ - if ((vma->vm_start <= addr) && - (vma->vm_flags & VM_PFNMAP)) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); + err = walk_page_test(start, next, walk); + if (err > 0) continue; - } - /* - * Handle hugetlb vma individually because pagetable - * walk for the hugetlb page is dependent on the - * architecture and we can't handled it in the same - * manner as non-huge pages. - */ - if (walk->hugetlb_entry && (vma->vm_start <= addr) && - is_vm_hugetlb_page(vma)) { - if (vma->vm_end < next) - next = vma->vm_end; - /* - * Hugepage is very tightly coupled with vma, - * so walk through hugetlb entries within a - * given vma. 
- */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); - continue; - } - } - - if (pgd_none_or_clear_bad(pgd)) { - if (walk->pte_hole) - err = walk->pte_hole(addr, next, walk); - if (err) + if (err < 0) break; - pgd++; - continue; } - if (walk->pmd_entry || walk->pte_entry) - err = walk_pud_range(pgd, addr, next, walk); + if (walk->vma || walk->pte_hole) + err = __walk_page_range(start, next, walk); if (err) break; - pgd++; - } while (addr = next, addr < end); - + } while (start = next, start < end); return err; } -- cgit v1.2.3 From 900fc5f197b05253ae9433fb9a066c3f37d08f69 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:27:40 -0800 Subject: pagewalk: add walk_page_vma() Introduce walk_page_vma(), which is useful for callers that want to walk over a given vma. It's used by later patches. Signed-off-by: Naoya Horiguchi Acked-by: Kirill A. Shutemov Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/pagewalk.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3891a368e5e0..a4d24f3c5430 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1201,6 +1201,7 @@ struct mm_walk { int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk); +int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d9cc3caae802..4c9a653ba563 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -272,3 +272,21 @@ int walk_page_range(unsigned long start, unsigned long end, } while (start = next, start < end); return err; } + +int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) +{ + int err; + + if (!walk->mm) + return -EINVAL; + + VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + VM_BUG_ON(!vma); + walk->vma = vma; + err = walk_page_test(vma->vm_start, vma->vm_end, walk); + if (err > 0) + return 0; + if (err < 0) + return err; + return __walk_page_range(vma->vm_start, vma->vm_end, walk); +} -- cgit v1.2.3 From 94f759d62b2c6a9d124b0622077b1ddcfac43fb5 Mon Sep 17 00:00:00 2001 From: Sergei Rogachev Date: Wed, 11 Feb 2015 15:28:34 -0800 Subject: mm/page_owner.c: remove unnecessary stack_trace field Page owner uses the page_ext structure to keep meta-information for every page in the system. The structure also contains a field of type 'struct stack_trace'; page owner uses this field when invoking the function save_stack_trace. It is easy to notice that keeping a copy of this structure for every page in the system is very inefficient in terms of memory. The patch removes this unnecessary field of page_ext and forces page owner to use a stack_trace structure allocated on the stack. 
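For a rough sense of the saving (a back-of-the-envelope estimate, assuming the usual 64-bit layout of struct stack_trace from <linux/stacktrace.h>):

	/* Kept per page before this patch, roughly 24 bytes with padding: */
	struct stack_trace {
		unsigned int nr_entries, max_entries;
		unsigned long *entries;	/* points into trace_entries[] */
		int skip;		/* frames to skip at capture time */
	};

Keeping only an unsigned int nr_entries instead saves on the order of 20 bytes per page_ext entry, i.e. roughly 0.5% of RAM on a system with 4 KiB pages when page owner is enabled.
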
[akpm@linux-foundation.org: use struct initializers] Signed-off-by: Sergei Rogachev Acked-by: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_ext.h | 2 +- mm/page_owner.c | 26 ++++++++++++++------------ 2 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index d2a2c84c72d0..c42981cd99aa 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -40,7 +40,7 @@ struct page_ext { #ifdef CONFIG_PAGE_OWNER unsigned int order; gfp_t gfp_mask; - struct stack_trace trace; + unsigned int nr_entries; unsigned long trace_entries[8]; #endif }; diff --git a/mm/page_owner.c b/mm/page_owner.c index 9ab4a9b5bc09..0993f5f36b01 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -59,20 +59,19 @@ void __reset_page_owner(struct page *page, unsigned int order) void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { - struct page_ext *page_ext; - struct stack_trace *trace; - - page_ext = lookup_page_ext(page); + struct page_ext *page_ext = lookup_page_ext(page); + struct stack_trace trace = { + .nr_entries = 0, + .max_entries = ARRAY_SIZE(page_ext->trace_entries), + .entries = &page_ext->trace_entries[0], + .skip = 3, + }; - trace = &page_ext->trace; - trace->nr_entries = 0; - trace->max_entries = ARRAY_SIZE(page_ext->trace_entries); - trace->entries = &page_ext->trace_entries[0]; - trace->skip = 3; - save_stack_trace(&page_ext->trace); + save_stack_trace(&trace); page_ext->order = order; page_ext->gfp_mask = gfp_mask; + page_ext->nr_entries = trace.nr_entries; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } @@ -84,6 +83,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, int ret; int pageblock_mt, page_mt; char *kbuf; + struct stack_trace trace = { + .nr_entries = page_ext->nr_entries, + .entries = &page_ext->trace_entries[0], + }; kbuf = kmalloc(count, GFP_KERNEL); if (!kbuf) @@ -121,8 +124,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; - ret += snprint_stack_trace(kbuf + ret, count - ret, - &page_ext->trace, 0); + ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); if (ret >= count) goto err; -- cgit v1.2.3
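As a usage sketch (not part of the patch; it assumes CONFIG_PAGE_OWNER=y, the page_owner=on boot parameter, and a mounted debugfs), the records formatted by print_page_owner() above are consumed by reading the page_owner debugfs file, typically one allocation record per read:

	/* Hypothetical userspace reader: dump the first recorded allocation. */
	#include <stdio.h>

	int main(void)
	{
		char buf[4096];
		size_t n;
		FILE *f = fopen("/sys/kernel/debug/page_owner", "r");

		if (!f)
			return 1;
		n = fread(buf, 1, sizeof(buf) - 1, f);	/* usually one record */
		buf[n] = '\0';
		fputs(buf, stdout);
		fclose(f);
		return 0;
	}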